# ******************************************
#
# Code for generating the selected random forest models of different data subsets
#
# Data (labelled dat) is the extracted feature dataset as will be made available on the Rothamsted Research Repository (https://repository.rothamsted.ac.uk/) under a Creative Commons Attribution 4.0 Licence.
#
# *****************************************


library(randomForest)
library(ggplot2)

rm(list=ls())


# Functions ----------
# functions to extract error rates from a RF
getOOB <- function(rf){
  n <- dim(rf$confusion)[1]
  oob <- 100 - (sum(diag(rf$confusion[,1:n])) / sum(rf$confusion[,1:n])) * 100
  return(oob)
}

# true positive
getTP <- function(rf){
  n <- dim(rf$confusion)[1]
  TP <- (diag(rf$confusion[,1:n]))
  return(c(TP))
}

# true negative
getTN <- function(rf){
  n <- dim(rf$confusion)[1]
  TN <- rep(NA, n)
  conf <- rf$confusion[1:n, 1:n]
  for (i in 1:n){
    TN[i] <- sum(conf[-i,-i])
    
  }
  return(TN)
}

# false negative
getFN <- function(rf){
  n <- dim(rf$confusion)[1]
  conf <- rf$confusion[1:n, 1:n]
  FN <- rep(NA, n)
  for (i in 1:n){
    FN[i] <- sum(conf[i,-i])
  }
  return(FN)
}

# false positive
getFP <- function(rf){
  n <- dim(rf$confusion)[1]
  conf <- rf$confusion[1:n, 1:n]
  FP <- rep(NA, n)
  for (i in 1:n){
    FP[i] <- sum(conf[-i,i])
  }
  return(FP)
}






# Data ------------


# read in data from doi

head(dat)

# create sommon name for each species
dat$species_common <- as.character(dat$species)
dat$species_common[dat$species == "Brassicogethes aeneus"] <- "Pollen beetle"
dat$species_common[dat$species == "Psylliodes chrysocephala"] <- "Cabbage stem flea beetle"
dat$species_common[dat$species == "Aphis fabae"] <- "Black bean aphid"
dat$species_common[dat$species == "Drepanosiphum platanoidis"] <- "Sycamore aphid"
dat$species_common[dat$species == "Myzus persicae"] <- "Peach-potato aphid"
dat$species_common[dat$species == "Periphyllus testudinaceus"] <- "Maple aphid"
dat$species_common[dat$species == "Rhopalosiphum padi"] <- "Bird cherry-oat aphid"
dat$species_common[dat$species == "Sitobion avenae"] <- "English grain aphid"
dat$species_common <- factor(dat$species_common)


# select 52 extracted features
model1 <- c("log_maxAmp","log_rangeAmp","log_iqrAmp","log_crestFactor","log_power","log_rms",
            "sqrt_domfreq","bioacousticIndex","bioacousticIndex_2","bioacousticIndex_3","bioacousticIndex_4",
            "log_amplitudeIndex","spectralEntropy","temporalEntropy","acousticEntropy",
            "harmonics.1","harmonics.2","harmonics.3","harmonics.4","harmonics.5", "harmonics.6","harmonics.7","harmonics.8","harmonics.9","harmonics.10",
            "log_gamAmp","log_maxAmp_g", "log_rangeAmp_g", "log_iqrAmp_g", "log_crestFactor_g", "log_power_g", "log_rms_g",
            "log_fundFreq","sqrt_domfreq_g","bioacousticIndex_g","bioacousticIndex_g_2","bioacousticIndex_g_3","bioacousticIndex_g_4",
            "log_amplitudeIndex_g","spectralEntropy_g","temporalEntropy_g", "acousticEntropy_g",
            "harmonics_g.1", "harmonics_g.2","harmonics_g.3", "harmonics_g.4","harmonics_g.5", "harmonics_g.6","harmonics_g.7","harmonics_g.8","harmonics_g.9","harmonics_g.10")

dim(dat[, model1])


# split the data into training and validation
set.seed(6546) # for reproducibility
train <- sample(nrow(dat), 0.7*nrow(dat), replace=FALSE)

dat$set <- "validation"
dat$set[train] <- "training"
# remove the two species with too few observations
dat$set[dat$species %in%c("Myzus persicae", "Rhopalosiphum padi")] <- "null"
table(dat$set)
table(dat$set, dat$species_common)








# Feature selection ------------

# select variables
model1 <- c("log_maxAmp","log_rangeAmp","log_iqrAmp","log_crestFactor","log_power","log_rms",
            "sqrt_domfreq","bioacousticIndex","bioacousticIndex_2","bioacousticIndex_3","bioacousticIndex_4",
            "log_amplitudeIndex","spectralEntropy","temporalEntropy","acousticEntropy",
            "harmonics.1","harmonics.2","harmonics.3","harmonics.4","harmonics.5", "harmonics.6","harmonics.7","harmonics.8","harmonics.9","harmonics.10",
            "log_gamAmp","log_maxAmp_g", "log_rangeAmp_g", "log_iqrAmp_g", "log_crestFactor_g", "log_power_g", "log_rms_g",
            "log_fundFreq","sqrt_domfreq_g","bioacousticIndex_g","bioacousticIndex_g_2","bioacousticIndex_g_3","bioacousticIndex_g_4",
            "log_amplitudeIndex_g","spectralEntropy_g","temporalEntropy_g", "acousticEntropy_g",
            "harmonics_g.1", "harmonics_g.2","harmonics_g.3", "harmonics_g.4","harmonics_g.5", "harmonics_g.6","harmonics_g.7","harmonics_g.8","harmonics_g.9","harmonics_g.10")

model2 <- c("log_maxAmp","log_rangeAmp","log_iqrAmp","log_crestFactor","log_power","log_rms","sqrt_domfreq","bioacousticIndex","bioacousticIndex_2","bioacousticIndex_3","bioacousticIndex_4","log_amplitudeIndex","spectralEntropy","temporalEntropy","acousticEntropy","harmonics.1","harmonics.2","harmonics.3","harmonics.4","harmonics.5", "harmonics.6","harmonics.7","harmonics.8","harmonics.9","harmonics.10")
model3 <- c("log_gamAmp","log_maxAmp_g", "log_rangeAmp_g", "log_iqrAmp_g", "log_crestFactor_g", "log_power_g", "log_rms_g",
            "log_fundFreq","sqrt_domfreq_g","bioacousticIndex_g","bioacousticIndex_g_2","bioacousticIndex_g_3","bioacousticIndex_g_4",
            "log_amplitudeIndex_g","spectralEntropy_g","temporalEntropy_g", "acousticEntropy_g",
            "harmonics_g.1", "harmonics_g.2","harmonics_g.3", "harmonics_g.4","harmonics_g.5", "harmonics_g.6","harmonics_g.7","harmonics_g.8","harmonics_g.9","harmonics_g.10")
model4 <- c("harmonics.1","harmonics.2","harmonics.3","harmonics.4","harmonics.5", "harmonics.6","harmonics.7","harmonics.8","harmonics.9","harmonics.10","log_fundFreq","harmonics_g.1", "harmonics_g.2","harmonics_g.3", "harmonics_g.4","harmonics_g.5", "harmonics_g.6","harmonics_g.7","harmonics_g.8","harmonics_g.9","harmonics_g.10")
model5 <- c("sqrt_domfreq","bioacousticIndex","bioacousticIndex_2","bioacousticIndex_3","bioacousticIndex_4","spectralEntropy","harmonics.1","harmonics.2","harmonics.3","harmonics.4","harmonics.5", "harmonics.6","harmonics.7","harmonics.8","harmonics.9","harmonics.10","log_fundFreq","sqrt_domfreq_g","bioacousticIndex_g","bioacousticIndex_g_2","bioacousticIndex_g_3","bioacousticIndex_g_4","spectralEntropy_g","harmonics_g.1", "harmonics_g.2","harmonics_g.3", "harmonics_g.4","harmonics_g.5", "harmonics_g.6","harmonics_g.7","harmonics_g.8","harmonics_g.9","harmonics_g.10")

model6 <- c("log_power", "acousticEntropy_g", "harmonics_g.6")
model7 <- c("log_power", "bioacousticIndex_3", "spectralEntropy", "temporalEntropy_g","harmonics_g.6")
model8 <- c("log_rangeAmp","log_iqrAmp","log_crestFactor", "bioacousticIndex_3", "spectralEntropy","sqrt_domfreq", "harmonics.1","temporalEntropy_g","bioacousticIndex_g","sqrt_domfreq_g","harmonics_g.2", "harmonics_g.4","harmonics_g.8")

# select training and validation data
trainSet <- dat[dat$set=="training",]
trainSet <- droplevels(trainSet)
trainSp <- dat[dat$set=="training","species_common"]
trainSp <- droplevels(trainSp)

validSet <- dat[dat$set=="validation",]
validSet <- droplevels(validSet)
validSp <- dat[dat$set=="validation","species_common"]
validSp <- droplevels(validSp)

# impute missing values by median of the non-missing
validSet_med <- validSet
for (i in model1){
  ind <- which(is.na(validSet[,i]))
  validSet_med[ind, i] <- median(trainSet[,i], na.rm=TRUE)
}


for (model in c(1:8)){
  print(model)
  
  if (model == 1){
    trainTree <- trainSet[, model1]
    validTree <- validSet[, model1]
    validTree_med <- validSet_med[, model1]
    
    
  } else if (model == 2){
    trainTree <- trainSet[, model2]
    validTree <- validSet[, model2]
    validTree_med <- validSet_med[, model2]
    
  } else if (model == 3){
    trainTree <- trainSet[, model3]
    validTree <- validSet[, model3]
    validTree_med <- validSet_med[, model3]
    
  } else if (model == 4){
    trainTree <- trainSet[, model4]
    validTree <- validSet[, model4]
    validTree_med <- validSet_med[, model4]
    
  } else if (model == 5) {
    trainTree <- trainSet[, model5]
    validTree <- validSet[, model5]
    validTree_med <- validSet_med[, model5]
    
  } else if (model == 8) {
    trainTree <- trainSet[, model8]
    validTree <- validSet[, model8]
    validTree_med <- validSet_med[, model8]
    
  } else if (model == 6) {
    trainTree <- trainSet[, model6]
    validTree <- validSet[, model6]
    validTree_med <- validSet_med[, model6]
    
  }else if (model == 7) {
    trainTree <- trainSet[, model7]
    validTree <- validSet[, model7]
    validTree_med <- validSet_med[, model7]
    
  }
  
  
  
  
  # run the random forests, and extract error rates
  
  options <- expand.grid(mtry=c(10),ntree=c(1000), miss=c("omit", "roughfix"))
  
  oob <- at <- av <- rep(NA, dim(options)[1])
  classErr <- TPR <- TNR <- wAcc <- matrix(NA, nrow=dim(options)[1], ncol=6)
  
  for (opt in 1: dim(options)[1]){
    print(opt)
    
    # All species
    if(options[opt, 3] == "omit"){
      
      if (model %in% c(6,7)){
        rf <- randomForest(trainSp ~ ., trainTree, importance=TRUE, na.action=na.omit, mtry=dim(trainTree)[2], ntree = 200, sampsize=c(60, 60, 96, 40, 96, 240))
        
      } else {
        rf <- randomForest(trainSp ~ ., trainTree, importance=TRUE, na.action=na.omit, mtry=10, ntree = 1000, sampsize=c(60, 60, 96, 40, 96, 240))
        
      }
      
      oob[opt] <- rf$err.rate[rf$ntree,1]
      predTrain <- rf$predicted
      at[opt] <- mean(predTrain == trainSet$species_common[rowSums(is.na(trainTree)) == 0], na.rm=TRUE) 
      predValid <- predict(rf, data.frame(validTree, validSet$species_common), type = "class")
      av[opt] <- mean(predValid == validSet$species_common, na.rm=TRUE) 
      
    } else if (options[opt, 3] == "roughfix"){
      
      if (model %in% c(6,7)){
        rf <-  randomForest(trainSp ~ ., trainTree, importance=TRUE, na.action=na.roughfix, mtry=dim(trainTree)[2], ntree = 200, sampsize=c(75, 75, 120, 50, 120, 300))
      } else {
        rf <-  randomForest(trainSp ~ ., trainTree, importance=TRUE, na.action=na.roughfix, mtry=10, ntree = 1000, sampsize=c(75, 75, 120, 50, 120, 300))
      }
      
      
      oob[opt] <- rf$err.rate[rf$ntree,1]
      predTrain <- rf$predicted
      at[opt] <- mean(predTrain == trainSp, na.rm=TRUE) 
      predValid <- predict(rf, data.frame(validTree_med, validSp), type = "class")
      av[opt] <- mean(predValid == validSp, na.rm=TRUE) 
      
    } 
    
    # class specific error rates
    TN <- getTN(rf)
    TP <- getTP(rf)
    FN <- getFN(rf)
    FP <- getFP(rf)
    
    # get class error rates
    classErr[opt, ] <- 1 - TP / (TP + FN)
    # get true positive rate
    TPR[opt, ] <- TP / (TP + FN)
    # get true negative rate
    TNR[opt, ] <- TN/(TN + FP)
    # weighted accuracy
    wAcc[opt, ] <- 0.5* TNR[opt,] + 0.5* TPR[opt,]
    
    
  }
  names(classErr) <- paste("classErr.",levels(trainSp),sep="")
  names(TPR) <- paste("TPR.",levels(trainSp),sep="")
  names(TNR) <- paste("TNR.",levels(trainSp),sep="")
  names(wAcc) <- paste("wAcc.",levels(trainSp),sep="")
  
  if (model == 1){
    df <- data.frame(model = 1, options, oob, at, av, classErr, TPR, TNR, wAcc)
  } else {
    temp <- data.frame(model = model, options, oob, at, av, classErr, TPR, TNR, wAcc)
    df <- rbind(df, temp)
  }
  
}


df$model <- factor(df$model)
names(df) <- c("model","mtry","ntree","miss","oob","at","av", paste("classErr.",levels(trainSp),sep=""), paste("TPR.",levels(trainSp),sep=""), paste("TNR.",levels(trainSp),sep=""), paste("wAcc.",levels(trainSp),sep=""))


ggplot(df, aes(y=oob, x=model, col=miss)) + geom_point(size=4) + facet_wrap(~miss) + labs(y="Out-of-bag error rate")
ggplot(df, aes(y=at, x=model, col=miss)) + geom_point(size=4) + facet_wrap(~miss) + labs(y="Predicted accuracy rate on out-bag training data")
ggplot(df, aes(y=av, x=model, col=miss)) + geom_point(size=4) + facet_wrap(~miss) + labs(y="Accuracy on Validation set")

df_long <- reshape(df, varying=list(6:7), timevar="Dataset", times=c("Training", "Validation"), direction="long", drop=c("oob", "mtry", "ntree","classErr.Black bean aphid","classErr.Cabbage stem flea beetle","classErr.English grain aphid","classErr.Maple aphid","classErr.Pollen beetle" ,"classErr.Sycamore aphid", "TPR.Black bean aphid","TPR.Cabbage stem flea beetle","TPR.English grain aphid","TPR.Maple aphid","TPR.Pollen beetle","TPR.Sycamore aphid","TNR.Black bean aphid","TNR.Cabbage stem flea beetle" , "TNR.English grain aphid","TNR.Maple aphid","TNR.Pollen beetle" ,"TNR.Sycamore aphid","wAcc.Black bean aphid","wAcc.Cabbage stem flea beetle", "wAcc.English grain aphid","wAcc.Maple aphid","wAcc.Pollen beetle","wAcc.Sycamore aphid"   ))
df_long$Dataset <- factor(df_long$Dataset)

df_long$missing <- as.character(df_long$miss)
df_long$missing[df_long$miss == "omit"] <- "Missing values omitted"
df_long$missing[df_long$miss == "roughfix"] <- "Missing values imputed"






# Within and Between insect order classification ------------

rm(list=ls()[which(ls()!="dat")])


# select variables
model1 <- c("log_maxAmp","log_rangeAmp","log_iqrAmp","log_crestFactor","log_power","log_rms",
            "sqrt_domfreq","bioacousticIndex","bioacousticIndex_2","bioacousticIndex_3","bioacousticIndex_4",
            "log_amplitudeIndex","spectralEntropy","temporalEntropy","acousticEntropy",
            "harmonics.1","harmonics.2","harmonics.3","harmonics.4","harmonics.5", "harmonics.6","harmonics.7","harmonics.8","harmonics.9","harmonics.10",
            "log_gamAmp","log_maxAmp_g", "log_rangeAmp_g", "log_iqrAmp_g", "log_crestFactor_g", "log_power_g", "log_rms_g",
            "log_fundFreq","sqrt_domfreq_g","bioacousticIndex_g","bioacousticIndex_g_2","bioacousticIndex_g_3","bioacousticIndex_g_4",
            "log_amplitudeIndex_g","spectralEntropy_g","temporalEntropy_g", "acousticEntropy_g",
            "harmonics_g.1", "harmonics_g.2","harmonics_g.3", "harmonics_g.4","harmonics_g.5", "harmonics_g.6","harmonics_g.7","harmonics_g.8","harmonics_g.9","harmonics_g.10")


dat$insect <- factor(ifelse(dat$species_common %in% c("Pollen beetle", "Cabbage stem flea beetle"), "Beetle", "Aphid"))

dim(dat[, model1])


# select training and validation data
trainSet <- dat[dat$set=="training",]
trainSet <- droplevels(trainSet)
trainSp <- dat[dat$set=="training","species_common"]
trainSp <- droplevels(trainSp)

validSet <- dat[dat$set=="validation",]
validSet <- droplevels(validSet)
validSp <- dat[dat$set=="validation","species_common"]
validSp <- droplevels(validSp)

validSet_med <- validSet
for (i in model1){
  ind <- which(is.na(validSet[,i]))
  validSet_med[ind, i] <- median(trainSet[,i], na.rm=TRUE)
}


trainTree <- trainSet[, model1]
validTree <- validSet_med[, model1]

# Aphid species only
trainSetA <- droplevels(trainSet[trainSet$insect == "Aphid", ])
validSetA <- droplevels(validSet[validSet$insect == "Aphid", ])
trainTreeA <- droplevels(trainTree[trainSet$insect == "Aphid", ])
validTreeA <- droplevels(validTree[validSet$insect == "Aphid", ])

# Beetle species only
trainSetB <- droplevels(trainSet[trainSet$insect == "Beetle", ])
validSetB <- droplevels(validSet[validSet$insect == "Beetle", ])
trainTreeB <- droplevels(trainTree[trainSet$insect == "Beetle", ])
validTreeB <- droplevels(validTree[validSet$insect == "Beetle", ])


rf <- randomForest(trainSp ~ ., trainTree, importance=TRUE, na.action=na.roughfix, mtry=10, ntree = 1000, sampsize=c(75, 75, 120, 50, 120, 300))

varImpPlot(rf, main="Model 1: All species")

# Aphid species only --------------

# tune the balanced random sampling based on current best weights
rf_A1 <- randomForest(trainSetA$species_common ~ ., trainTreeA, importance=TRUE, na.action=na.roughfix, mtry=10, ntree = 1000, sampsize=c(75, 120, 50, 300))
rf_A2 <- randomForest(trainSetA$species_common ~ ., trainTreeA, importance=TRUE, na.action=na.roughfix, mtry=10, ntree = 1000, sampsize=c(75, 75, 50, 75))
rf_A3 <- randomForest(trainSetA$species_common ~ ., trainTreeA, importance=TRUE, na.action=na.roughfix, mtry=10, ntree = 2000, sampsize=c(75, 120, 50, 120))
rf_A4 <- randomForest(trainSetA$species_common ~ ., trainTreeA, importance=TRUE, na.action=na.roughfix, mtry=10, ntree = 1000, sampsize=c(120, 120, 50, 120))

# best one
rf_A <- rf_A3

varImpPlot(rf_A, main="Model 1: Aphid species")


# Beetle species only ----------------

# tune the balanced random sampling based on current best weights
rf_B1 <- randomForest(trainSetB$species_common ~ ., trainTreeB, importance=TRUE, na.action=na.roughfix, mtry=10, ntree = 1000, sampsize=c(75, 120))
rf_B2 <- randomForest(trainSetB$species_common ~ ., trainTreeB, importance=TRUE, na.action=na.roughfix, mtry=10, ntree = 1000, sampsize=c(75, 75))

# best one
rf_B <- rf_B2

varImpPlot(rf_B, main="Model 1: Beetle species")


# Aphid vs Beetle -------------------------

rf_AB1 <- randomForest(trainSet$insect ~ ., trainTree, importance=TRUE, na.action=na.roughfix, mtry=10, ntree = 1000, sampsize=c(1000, 400))
rf_AB2 <- randomForest(trainSet$insect ~ ., trainTree, importance=TRUE, na.action=na.roughfix, mtry=10, ntree = 1000, sampsize=c(800, 400))
rf_AB3 <- randomForest(trainSet$insect ~ ., trainTree, importance=TRUE, na.action=na.roughfix, mtry=10, ntree = 1000, sampsize=c(600, 400))
rf_AB4 <- randomForest(trainSet$insect ~ ., trainTree, importance=TRUE, na.action=na.roughfix, mtry=10, ntree = 1000, sampsize=c(400, 400))

rf_AB <- rf_AB4

varImpPlot(rf_AB, main="Model 1: Aphid vs Beetle")







# Unknown Species classification ---------------



rm(list=ls()[which(ls()!="dat")])


model1 <- c("log_maxAmp","log_rangeAmp","log_iqrAmp","log_crestFactor","log_power","log_rms",
            "sqrt_domfreq","bioacousticIndex","bioacousticIndex_2","bioacousticIndex_3","bioacousticIndex_4",
            "log_amplitudeIndex","spectralEntropy","temporalEntropy","acousticEntropy",
            "harmonics.1","harmonics.2","harmonics.3","harmonics.4","harmonics.5", "harmonics.6","harmonics.7","harmonics.8","harmonics.9","harmonics.10",
            "log_gamAmp","log_maxAmp_g", "log_rangeAmp_g", "log_iqrAmp_g", "log_crestFactor_g", "log_power_g", "log_rms_g",
            "log_fundFreq","sqrt_domfreq_g","bioacousticIndex_g","bioacousticIndex_g_2","bioacousticIndex_g_3","bioacousticIndex_g_4",
            "log_amplitudeIndex_g","spectralEntropy_g","temporalEntropy_g", "acousticEntropy_g",
            "harmonics_g.1", "harmonics_g.2","harmonics_g.3", "harmonics_g.4","harmonics_g.5", "harmonics_g.6","harmonics_g.7","harmonics_g.8","harmonics_g.9","harmonics_g.10")

dim(dat[, model1])


dataTree <- dat[, model1]
dim(dataTree)

trainTree <- dat[dat$set=="training",model1]
trainTree <- droplevels(trainTree)
trainSp <- dat[dat$set=="training","species_common"]
trainSp <- droplevels(trainSp)

validTree <- dat[dat$set=="validation",model1]
validTree <- droplevels(validTree)
validSp <- dat[dat$set=="validation","species_common"]
validSp <- droplevels(validSp)

validTree_med <- validTree
for (i in 1:52){
  ind <- which(is.na(validTree[,i]))
  validTree_med[ind, i] <- median(trainTree[,i], na.rm=TRUE)
}




# loop over each species and run the random forest models excluding each species in turn. Obtain predictions of the excluded species to see how an "unknown" species would be allocated
for (sp in c("Black bean aphid", "Cabbage stem flea beetle", "English grain aphid", "Maple aphid","Pollen beetle","Sycamore aphid")){
  print(sp)
  trainSet <- dat[!(dat$species_common %in% sp) & dat$set != "null", ]
  trainTree <- trainSet[, model1]
  validSet <- dat[dat$species_common %in% sp & dat$set != "null", ]
  validTree <- validSet[, model1]
  trainSet <- droplevels(trainSet)
  validSet <- droplevels(validSet)
  
  validTree_med <- validTree
  for (i in 1:52){
    med <- median(trainTree[, i], na.rm=TRUE)
    validTree_med[is.na(validTree_med[, i]), i]  <- med
  }
  
  smp <- c(75, 75, 120, 50, 120, 300)
  indsp <- which(!(levels(droplevels(dat$species_common[dat$set != "null"]))%in%sp))
  
  # fit random forest model
  assign(paste("rf_", sp, sep=""), randomForest(trainSet$species_common ~ ., trainTree, importance=TRUE, na.action=na.roughfix, mtry=10, ntree = 1000, sampsize=smp[indsp]))
  
  # extract class and probabilistic predictions on the complete set of data
  # max probs contains the maximal predicted probability. We would expect this to be low if the species hasn't been observed before as it should be "randomly" assigned to one of the observed classes.
  assign(paste("predValid_", sp, sep=""), predict(get(paste("rf_", sp, sep="")), data.frame(validTree, validSet$species_common), type = "class"))
  assign(paste("predValid_prob_", sp, sep=""), predict(get(paste("rf_", sp, sep="")), data.frame(validTree, validSet$species_common), type = "prob"))
  assign(paste("maxProbs_", sp, sep=""),  apply(get(paste("predValid_prob_", sp, sep="")), 1, max))
  
  # extract class and probabilistic predictions on the imputed set of data
  assign(paste("predValidi_", sp, sep=""), predict(get(paste("rf_", sp, sep="")), data.frame(validTree_med, validSet$species_common), type = "class"))
  assign(paste("predValidi_prob_", sp, sep=""), predict(get(paste("rf_", sp, sep="")), data.frame(validTree_med, validSet$species_common), type = "prob"))
  assign(paste("maxProbsi_", sp, sep=""),  apply(get(paste("predValidi_prob_", sp, sep="")), 1, max))
}


# combine all predicted allocations
alloc <- rbind(
  # data.frame(true = "Bird cherry-oat aphid", table(`predValid_Bird cherry-oat aphid`, dnn="species")/sum(table(`predValid_Bird cherry-oat aphid`, dnn="species"))),
  data.frame(true = "Black bean aphid", table(`predValid_Black bean aphid`, dnn="species")/sum(table(`predValid_Black bean aphid`, dnn="species"))),
  data.frame(true = "Cabbage stem flea beetle", table(`predValid_Cabbage stem flea beetle`, dnn="species")/sum(table(`predValid_Cabbage stem flea beetle`, dnn="species"))),
  data.frame(true = "English grain aphid", table(`predValid_English grain aphid`, dnn="species")/sum(table(`predValid_English grain aphid`, dnn="species"))),
  data.frame(true = "Maple aphid", table(`predValid_Maple aphid`, dnn="species")/sum(table(`predValid_Maple aphid`, dnn="species"))),
  # data.frame(true = "Peach-potato aphid", table(`predValid_Peach-potato aphid`, dnn="species")/sum(table(`predValid_Peach-potato aphid`, dnn="species"))),
  data.frame(true = "Pollen beetle", table(`predValid_Pollen beetle`, dnn="species")/sum(table(`predValid_Pollen beetle`, dnn="species"))),
  data.frame(true = "Sycamore aphid", table(`predValid_Sycamore aphid`, dnn="species")/sum(table(`predValid_Sycamore aphid`, dnn="species")))
)

# combine all predicted allocations (from imputed data)
alloc_i <- rbind(
  # data.frame(true = "Bird cherry-oat aphid", table(`predValidi_Bird cherry-oat aphid`, dnn="species")/sum(table(`predValidi_Bird cherry-oat aphid`, dnn="species"))),
  data.frame(true = "Black bean aphid", table(`predValidi_Black bean aphid`, dnn="species")/sum(table(`predValidi_Black bean aphid`, dnn="species"))),
  data.frame(true = "Cabbage stem flea beetle", table(`predValidi_Cabbage stem flea beetle`, dnn="species")/sum(table(`predValidi_Cabbage stem flea beetle`, dnn="species"))),
  data.frame(true = "English grain aphid", table(`predValidi_English grain aphid`, dnn="species")/sum(table(`predValidi_English grain aphid`, dnn="species"))),
  data.frame(true = "Maple aphid", table(`predValidi_Maple aphid`, dnn="species")/sum(table(`predValidi_Maple aphid`, dnn="species"))),
  # data.frame(true = "Peach-potato aphid", table(`predValidi_Peach-potato aphid`, dnn="species")/sum(table(`predValidi_Peach-potato aphid`, dnn="species"))),
  data.frame(true = "Pollen beetle", table(`predValidi_Pollen beetle`, dnn="species")/sum(table(`predValidi_Pollen beetle`, dnn="species"))),
  data.frame(true = "Sycamore aphid", table(`predValidi_Sycamore aphid`, dnn="species")/sum(table(`predValidi_Sycamore aphid`, dnn="species")))
)

alloc$species <- factor(as.character(alloc$species))
ggplot(alloc, aes(x=species, y=true, fill=Freq, colour=Freq)) + geom_point(shape=22, size=10) + labs(x="Predicted Class", y="True Class") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + scale_y_discrete(limits = rev(levels(alloc$true)))

alloc_i$species <- factor(as.character(alloc_i$species))
ggplot(alloc_i, aes(x=species, y=true, fill=Freq, colour=Freq)) + geom_point(shape=22, size=10) + labs(x="Predicted Class", y="True Class") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + scale_y_discrete(limits = rev(levels(alloc_i$true)))



# look at how the maximal probabilities vary over species
maxProbs <- rbind(
  # data.frame(true="Bird cherry-oat aphid", maxProbs = `maxProbs_Bird cherry-oat aphid`),
  data.frame(true="Black bean aphid", maxProbs = `maxProbs_Black bean aphid`),
  data.frame(true="Cabbage stem flea beetle", maxProbs = `maxProbs_Cabbage stem flea beetle`),
  data.frame(true="English grain aphid", maxProbs = `maxProbs_English grain aphid`),
  data.frame(true="Maple aphid", maxProbs = `maxProbs_Maple aphid`),
  # data.frame(true="Peach-potato aphid", maxProbs = `maxProbs_Peach-potato aphid`),
  data.frame(true="Pollen beetle", maxProbs = `maxProbs_Pollen beetle`),
  data.frame(true="Sycamore aphid", maxProbs = `maxProbs_Sycamore aphid`)
)

ggplot(maxProbs, aes(y=maxProbs, x=true, colour=true)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1),panel.background = element_rect(fill="white", colour="black")) + labs(x="", y="Maximum Class Probability") + guides(colour="none")


maxProbs_i <- rbind(
  # data.frame(true="Bird cherry-oat aphid", maxProbs = `maxProbsi_Bird cherry-oat aphid`),
  data.frame(true="Black bean aphid", maxProbs = `maxProbsi_Black bean aphid`),
  data.frame(true="Cabbage stem flea beetle", maxProbs = `maxProbsi_Cabbage stem flea beetle`),
  data.frame(true="English grain aphid", maxProbs = `maxProbsi_English grain aphid`),
  data.frame(true="Maple aphid", maxProbs = `maxProbsi_Maple aphid`),
  # data.frame(true="Peach-potato aphid", maxProbs = `maxProbsi_Peach-potato aphid`),
  data.frame(true="Pollen beetle", maxProbs = `maxProbsi_Pollen beetle`),
  data.frame(true="Sycamore aphid", maxProbs = `maxProbsi_Sycamore aphid`)
)

ggplot(maxProbs_i, aes(y=maxProbs, x=true, colour=true)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1),panel.background = element_rect(fill="white", colour="black")) + labs(x="", y="Maximum Class Probability") + guides(colour="none")





