# ******************************************
#
# Code for generating and tuning the random forest models
#
# Data (labelled dat) is the extracted feature dataset as will be made available on the Rothamsted Research Repository (https://repository.rothamsted.ac.uk/) under a Creative Commons Attribution 4.0 Licence.
#
# *****************************************

library(randomForest)
library(ggplot2)
library(ggpubr)


rm(list=ls())

# read in data from doi
head(dat)

# create sommon name for each species
dat$species_common <- as.character(dat$species)
dat$species_common[dat$species == "Brassicogethes aeneus"] <- "Pollen beetle"
dat$species_common[dat$species == "Psylliodes chrysocephala"] <- "Cabbage stem flea beetle"
dat$species_common[dat$species == "Aphis fabae"] <- "Black bean aphid"
dat$species_common[dat$species == "Drepanosiphum platanoidis"] <- "Sycamore aphid"
dat$species_common[dat$species == "Myzus persicae"] <- "Peach-potato aphid"
dat$species_common[dat$species == "Periphyllus testudinaceus"] <- "Maple aphid"
dat$species_common[dat$species == "Rhopalosiphum padi"] <- "Bird cherry-oat aphid"
dat$species_common[dat$species == "Sitobion avenae"] <- "English grain aphid"
dat$species_common <- factor(dat$species_common)

# select 52 extracted features
model1 <- c("log_maxAmp","log_rangeAmp","log_iqrAmp","log_crestFactor","log_power","log_rms",
            "sqrt_domfreq","bioacousticIndex","bioacousticIndex_2","bioacousticIndex_3","bioacousticIndex_4",
            "log_amplitudeIndex","spectralEntropy","temporalEntropy","acousticEntropy",
            "harmonics.1","harmonics.2","harmonics.3","harmonics.4","harmonics.5", "harmonics.6","harmonics.7","harmonics.8","harmonics.9","harmonics.10",
            "log_gamAmp","log_maxAmp_g", "log_rangeAmp_g", "log_iqrAmp_g", "log_crestFactor_g", "log_power_g", "log_rms_g",
            "log_fundFreq","sqrt_domfreq_g","bioacousticIndex_g","bioacousticIndex_g_2","bioacousticIndex_g_3","bioacousticIndex_g_4",
            "log_amplitudeIndex_g","spectralEntropy_g","temporalEntropy_g", "acousticEntropy_g",
            "harmonics_g.1", "harmonics_g.2","harmonics_g.3", "harmonics_g.4","harmonics_g.5", "harmonics_g.6","harmonics_g.7","harmonics_g.8","harmonics_g.9","harmonics_g.10")

dim(dat[, model1])


# split the data into training and validation
set.seed(6546) # for reproducibility
train <- sample(nrow(dat), 0.7*nrow(dat), replace=FALSE)

dat$set <- "validation"
dat$set[train] <- "training"
# remove the two species with too few observations
dat$set[dat$species %in%c("Myzus persicae", "Rhopalosiphum padi")] <- "null"
table(dat$set)
table(dat$set, dat$species_common)

trainTree <- dat[dat$set=="training",model1]
trainTree <- droplevels(trainTree)
trainSp <- dat[dat$set=="training","species_common"]
trainSp <- droplevels(trainSp)

validTree <- dat[dat$set=="validation",model1]
validTree <- droplevels(validTree)
validSp <- dat[dat$set=="validation","species_common"]
validSp <- droplevels(validSp)



# Tuning ---------------------

# functions to extract error rates from a RF
getOOB <- function(rf){
  n <- dim(rf$confusion)[1]
  oob <- 100 - (sum(diag(rf$confusion[,1:n])) / sum(rf$confusion[,1:n])) * 100
  return(oob)
}

# true positive
getTP <- function(rf){
  n <- dim(rf$confusion)[1]
  TP <- (diag(rf$confusion[,1:n]))
  return(c(TP))
}

# true negative
getTN <- function(rf){
  n <- dim(rf$confusion)[1]
  TN <- rep(NA, n)
  conf <- rf$confusion[1:n, 1:n]
  for (i in 1:n){
    TN[i] <- sum(conf[-i,-i])
    
  }
  return(TN)
}

# false negative
getFN <- function(rf){
  n <- dim(rf$confusion)[1]
  conf <- rf$confusion[1:n, 1:n]
  FN <- rep(NA, n)
  for (i in 1:n){
    FN[i] <- sum(conf[i,-i])
  }
  return(FN)
}

# false positive
getFP <- function(rf){
  n <- dim(rf$confusion)[1]
  conf <- rf$confusion[1:n, 1:n]
  FP <- rep(NA, n)
  for (i in 1:n){
    FP[i] <- sum(conf[-i,i])
  }
  return(FP)
}

# Step 1. Tuning the class sample sizes
# With the imbalance in species, a standard RF will be biased to the majority classes

# by default, the class sample sizes will be
.632*table(trainSp)
# out of a total of
table(trainSp)

# to tune this, explore different weight options.
# b0 - b1, all classes equally weighted, sampling small numbers of observations
b0 <- c(30,30,30,30,30,30) 
b1 <- c(50,50,50,50,50,50)
# b2 - b7, maintaing equal weighting where possible, but gradually increasing sample size. i.e. maximal number of Periphyllus to be sampled is 50.
b2 <- c(75,75,75,50,75,75)
b3 <- c(75,75,120,50,120,120)
b4 <- c(75,75,120,50,350,350)
b5 <- c(75,75,120,50,350,500)
b6 <- c(75,75,120,50,120,750)
b7 <- c(75,75,120,50,120,1000)

# search over these 7 weightings
b <- rbind(b0,b1,b2,b3,b4,b5,b6,b7)
# trying 3 different tree growths
allB <- data.frame(ntree = rep(c(100,500,1000), times=8), rbind(b,b,b))

# storing error rates
TNb <- TPb <- FPb <- FNb <- matrix(NA, nrow=dim(allB)[1], ncol=6)
OOBb <- rep(NA, dim(allB)[1])


{

  for (i in 1:dim(allB)[1]){
    smp <- unlist(allB[i,2:7])
    names(smp) <- NULL
    rfB <- randomForest(trainSp ~ ., trainTree, importance=TRUE, na.action=na.roughfix, mtry=20, ntree = allB$ntree[i], sampsize=smp)
    
    TNb[i,] <- getTN(rfB)
    FNb[i,] <- getFN(rfB)
    TPb[i,] <- getTP(rfB)
    FPb[i,] <- getFP(rfB)
    OOBb[i] <- getOOB(rfB)
    
    print(i)
    
  }
  
  
  # get class error rates
  classErrb <- 1 - TPb / (TPb + FNb)
  # get true positive rate
  TPRb <- TPb / (TPb + FNb)
  # get true negative rate
  TNRb <- TNb/(TNb + FPb)
  # weighted accuracy
  wAccb <- 0.5* TNRb + 0.5* TPRb
  
  # ROC curves
  plot(TPRb[,1], TNRb[,1], xlab="True Positive rate", ylab="True Negative rate", xlim=c(0,1), ylim=c(0,1))
  points(TPRb[,2], TNRb[,2], col=2)
  points(TPRb[,3], TNRb[,3], col=3)
  points(TPRb[,4], TNRb[,4], col=4)
  points(TPRb[,5], TNRb[,5], col=5)
  points(TPRb[,6], TNRb[,6], col=6)
  
  maxClassErrb <- apply(classErrb, 1, max)
  minTNRb <- apply(TNRb, 1, min)
  
  df <- data.frame(allB, OOBb, maxClassErrb, minTNRb, classErrb, TNRb)
  
  ggplot(df, aes(x=interaction(X1,X2,X3,X4,X5,X6), y=OOBb, colour=factor(ntree))) + geom_point()
  ggplot(df, aes(x=interaction(X1,X2,X3,X4,X5,X6), y=maxClassErrb, colour=factor(ntree))) + geom_point()
  ggplot(df, aes(x=interaction(X1,X2,X3,X4,X5,X6), y=X1.1, colour=factor(ntree))) + geom_point()
 
  df2 <- df[df$ntree == 500,c(2:7,11:16)]
  df2_long <- reshape(df2, direction="long", varying=list(7:12), timevar="species")
  ggplot(df2_long, aes(x=interaction(X1,X2,X3,X4,X5,X6), y=X1.1, colour=factor(species))) + geom_point()  
}

# # the best balanced 
# b3 <- c(75,75,120,50,120,120)
# # though the oob changes dramatically from this one to 
# b4 <- c(75,75,120,50,350,350)
# # the best oob is from
# b5 <- c(75,75,120,50,350,500)

# a finer search through the balanced sampling
{
 
  allB <- expand.grid(b1=75,b2=75,b3=120,b4=50,b5=c(120,200,250,300,350),b6=c(120,200,300,400,500,600))
  names(allB) <- c("b1","b2","b3","b4","b5","b6")
  allB <- allB[allB$b6 >= allB$b5, ]
  
  TNb <- TPb <- FPb <- FNb <- matrix(NA, nrow=dim(allB)[1], ncol=6)
  OOBb <- rep(NA, dim(allB)[1])
  for (i in 1:dim(allB)[1]){
    smp <- unlist(allB[i,])
    names(smp) <- NULL
    rfB <- randomForest(trainSp ~ ., trainTree, importance=TRUE, na.action=na.roughfix, mtry=20, ntree = 500, sampsize=smp)
    
    TNb[i,] <- getTN(rfB)
    FNb[i,] <- getFN(rfB)
    TPb[i,] <- getTP(rfB)
    FPb[i,] <- getFP(rfB)
    OOBb[i] <- getOOB(rfB)
    
    print(i)
    
  }
  
  
  # get class error rates
  classErrb <- 1 - TPb / (TPb + FNb)
  # get true positive rate
  TPRb <- TPb / (TPb + FNb)
  # get true negative rate
  TNRb <- TNb/(TNb + FPb)
  # weighted accuracy
  wAccb <- 0.5* TNRb + 0.5* TPRb
  
  # ROC curves
  plot(TPRb[,1], TNRb[,1], xlab="True Positive rate", ylab="True Negative rate", xlim=c(0,1), ylim=c(0,1))
  points(TPRb[,2], TNRb[,2], col=2)
  points(TPRb[,3], TNRb[,3], col=3)
  points(TPRb[,4], TNRb[,4], col=4)
  points(TPRb[,5], TNRb[,5], col=5)
  points(TPRb[,6], TNRb[,6], col=6)
  
  maxClassErrb <- apply(classErrb, 1, max)
  minTNRb <- apply(TNRb, 1, min)
  
  df <- data.frame(allB, OOBb, maxClassErrb, minTNRb, classErrb, TNRb, wAccb)
  
  g1 <- ggplot(df, aes(x=interaction(b1,b2,b3,b4,b5,b6), y=OOBb)) + geom_point()
  ggplot(df, aes(x=interaction(b1,b2,b3,b4,b5,b6), y=wAccb)) + geom_point()
  ggplot(df, aes(x=interaction(b1,b2,b3,b4,b5,b6), y=maxClassErrb)) + geom_point()

  df2 <- df[,c(1:6,10:15)]
  df2_long <- reshape(df2, direction="long", varying=list(7:12), timevar="species")
  g2 <- ggplot(df2_long, aes(x=interaction(b1,b2,b3,b4,b5,b6), y=X1, colour=factor(species))) + geom_point()  
  
  ggarrange(g1,g2, nrow=2,ncol=1)
}
# best compromise looks to be 
# 75 75 120 50 200 200
# or possibly
# 75 75 120 50 120 300
rfB1 <- randomForest(trainSp ~ ., trainTree, importance=TRUE, na.action=na.roughfix, mtry=20, ntree = 500, sampsize=c(75, 75, 120, 50, 200, 200))
rfB2 <- randomForest(trainSp ~ ., trainTree, importance=TRUE, na.action=na.roughfix, mtry=20, ntree = 500, sampsize=c(75, 75, 120, 50, 120, 300))
# let's go with rfB2


# Step 2 - tune the RF parameters, ntree, mtry

# Tuning
{

  options <- expand.grid(mtry=c(6,10,20, 25, 30),ntree=c(100,500,1000,2000), miss=c("omit", "roughfix"))

  oob <- at <- av <- rep(NA, dim(options)[1])

  for (opt in 1: dim(options)[1]){
    print(opt)

    # All species
    if(options[opt, 3] == "omit"){

      rf <- randomForest(trainSp ~ ., trainTree, importance=TRUE, na.action=na.omit, mtry=options[opt, 1], ntree = options[opt, 2], sampsize=c(60, 60, 96, 40, 96, 240))

    } else if (options[opt, 3] == "roughfix"){

      rf <- randomForest(trainSp ~ ., trainTree, importance=TRUE, na.action=na.roughfix, mtry=options[opt, 1], ntree = options[opt, 2], sampsize=c(75, 75, 120, 50, 120, 300))

    
    }

    oob[opt] <- rf$err.rate[options[opt, 2],1]
    predTrain <- predict(rf, data.frame(trainTree, trainSp), type = "class")
    at[opt] <- mean(predTrain == trainSp, na.rm=TRUE)
    predValid <- predict(rf, data.frame(validTree, validSp), type = "class")
    av[opt] <- mean(predValid == validSp, na.rm=TRUE)

  }

  df <- data.frame(options, oob, at, av)
  df
  ggplot(df, aes(x=interaction(mtry,ntree), y=oob, colour=miss)) + geom_point()
  ggplot(df, aes(x=interaction(mtry,ntree), y=av, colour=miss)) + geom_point()
}
# 10 variables, 1000 trees


rfB <- randomForest(trainSp ~ ., trainTree, importance=TRUE, na.action=na.roughfix, mtry=10, ntree = 1000, sampsize=c(75, 75, 120, 50, 120, 300))
# randomForest(formula = trainSp ~ ., data = trainTree, importance = TRUE,      mtry = 10, ntree = 1000, sampsize = c(75, 75, 120, 50, 120,          300), na.action = na.roughfix) 
# Type of random forest: classification
# Number of trees: 1000
# No. of variables tried at each split: 10
# 
# OOB estimate of  error rate: 20.62%

rfB_omit <- randomForest(trainSp ~ ., trainTree, importance=TRUE, na.action=na.omit, mtry=10, ntree = 1000, sampsize=c(60, 60, 96, 40, 96, 240))
# randomForest(formula = trainSp ~ ., data = trainTree, importance = TRUE,      mtry = 10, ntree = 1000, sampsize = c(60, 60, 96, 40, 96,          240), na.action = na.omit) 
# Type of random forest: classification
# Number of trees: 1000
# No. of variables tried at each split: 10
# 
# OOB estimate of  error rate: 17.88%


