# Script to look at modelled versus sampled data library(ggplot2) library(ggcorrplot) library(mgcv) library(data.table) library(car) library(predictmeans) rm(list=ls()) # get aligned data source('X:/Kirsty Hassall/Anna Szyniszewska/Cassava/Stats/1_alignData.R') # read in extracted data (produced by Had) source('X:/Kirsty Hassall/Anna Szyniszewska/Cassava/Stats/2_alignRasterOutputs.R') dat_civ$Centre_Lon <- round(dat_civ$Centre_Lon, 7) dat_civ$Centre_Lat <- round(dat_civ$Centre_Lat, 7) CIV_buff_pointstats$Centre_Lon <- round(CIV_buff_pointstats$Centre_Lon, 7) CIV_buff_pointstats$Centre_Lat <- round(CIV_buff_pointstats$Centre_Lat, 7) CIVdata <- merge(dat_civ, CIV_buff_pointstats, by=c("Centre_Lon", "Centre_Lat"), all.x=TRUE, all.y=TRUE) dat_uga$Centre_Lon <- round(dat_uga$Centre_Lon, 7) dat_uga$Centre_Lat <- round(dat_uga$Centre_Lat, 7) UGA_buff_pointstats$Centre_Lon <- round(UGA_buff_pointstats$Centre_Lon, 7) UGA_buff_pointstats$Centre_Lat <- round(UGA_buff_pointstats$Centre_Lat, 7) UGAdata <- merge(dat_uga, UGA_buff_pointstats, by=c("Centre_Lon", "Centre_Lat"), all.x=TRUE, all.y=TRUE) # create different subsets # remove zeros CIVdata_pres <- CIVdata[CIVdata$tot_cassava_area >0, ] UGAdata_pres <- UGAdata[UGAdata$tot_cassava_area >0, ] # response variables res_names <- c("tot_cassava_area", "tot_monoculture_area", "tot_intercrop_area", "tot_area_ind_plants", "tot_cassava_area_w","tot_monoculture_area_w", "tot_intercrop_area_w")#, "cass_area_presence") # explanatory variables from survey xsurvey_names <- c("tot_buildings") # explanatory variables for topography xtopo_names <- c("Centre_Lon","Centre_Lat")#, "admin_name") # explanatory variables for modelled cassava xmodel_names <- c("Cass_Prod",#"Cass_Prod_5k","Cass_Prod_10k", "mean_Prod.2000","mean_Prod.5000","mean_Prod.10000", "sd_Prod.2000","sd_Prod.5000","sd_Prod.10000", # "sum_Prod.2000","sum_Prod.5000","sum_Prod.10000", "min_Prod.2000" ,"lowerQ_Prod.2000","median_Prod.2000","upperQ_Prod.2000","max_Prod.2000", "min_Prod.5000","lowerQ_Prod.5000","median_Prod.5000","upperQ_Prod.5000","max_Prod.5000", "lowerQ_Prod.10000","median_Prod.10000","upperQ_Prod.10000","max_Prod.10000", "Cass_HA",#"Cass_HA_5k","Cass_HA_10k", "mean_Harv.2000","mean_Harv.5000","mean_Harv.10000", "sd_Harv.2000","sd_Harv.5000","sd_Harv.10000", # "sum_Harv.2000","sum_Harv.5000","sum_Harv.10000", "min_Harv.2000","lowerQ_Harv.2000","median_Harv.2000","upperQ_Harv.2000","max_Harv.2000", "min_Harv.5000","lowerQ_Harv.5000","median_Harv.5000","upperQ_Harv.5000","max_Harv.5000", "lowerQ_Harv.10000","median_Harv.10000","upperQ_Harv.10000","max_Harv.10000") # explanatory variables for populaion layer xpop_names <- c("Population", "mean_lspop.2000","mean_lspop.5000","mean_lspop.10000", "sd_lspop.2000","sd_lspop.5000","sd_lspop.10000", # "sum_lspop.2000","sum_lspop.5000","sum_lspop.10000", "min_lspop.2000" ,"lowerQ_lspop.2000","median_lspop.2000","upperQ_lspop.2000","max_lspop.2000", "lowerQ_lspop.5000","median_lspop.5000","upperQ_lspop.5000","max_lspop.5000", "lowerQ_lspop.10000","median_lspop.10000","upperQ_lspop.10000","max_lspop.10000") # explanatory variables for settlement layer xsettle_names <- c("mean_settle.2000","mean_settle.5000","mean_settle.10000")#, # "sd_settle.2000","sd_settle.5000","sd_settle.10000", # "sum_settle.2000","sum_settle.5000","sum_settle.10000")#, # "min_settle.2000" ,"lowerQ_settle.2000","median_settle.2000","upperQ_settle.2000","max_settle.2000", # "min_settle.5000","lowerQ_settle.5000","median_settle.5000","upperQ_settle.5000","max_settle.5000", # "min_settle.10000","lowerQ_settle.10000","median_settle.10000","upperQ_settle.10000","max_settle.10000") # Correlations ---------- # using robust measures of correlation to get ranks relationships may well not be linear # survey data ggcorrplot(cor(CIVdata[, c(res_names)], use="pairwise", method="spearman"), type="upper", title="Ivory Coast") ggcorrplot(cor(UGAdata[, c(res_names)], use="pairwise", method="spearman"), type="upper", title="Uganda") ggcorrplot(cor(CIVdata_pres[, c(res_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c("tot_cassava_area", xsurvey_names, xtopo_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata_pres[, c("tot_cassava_area", xsurvey_names, xtopo_names)], use="pairwise", method="spearman")) # obsevred data to model and covariate layers ggcorrplot(cor(CIVdata[, c("tot_cassava_area", xmodel_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c("tot_cassava_area", xpop_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c("tot_cassava_area", xsettle_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata_pres[, c("tot_cassava_area", xmodel_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata_pres[, c("tot_cassava_area", xpop_names)], use="pairwise", method="spearman")) # gets stronger having removed zeros ggcorrplot(cor(CIVdata_pres[, c("tot_cassava_area", xsettle_names)], use="pairwise", method="spearman")) # gets stronger having removed zeros # are population and settlement layers correlated? ggcorrplot(cor(CIVdata_pres[, c(xsettle_names, xpop_names)], use="pairwise", method="spearman")) # modelled data to covariate layers ggcorrplot(cor(CIVdata[, c("Cass_Prod","Cass_Prod_5k","Cass_Prod_10k", xpop_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c("mean_Prod.2000","mean_Prod.5000","mean_Prod.10000", xpop_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c("max_Prod.2000","max_Prod.5000","max_Prod.10000", xpop_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c("min_Prod.2000","min_Prod.5000","min_Prod.10000", xpop_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c("lowerQ_Prod.2000","lowerQ_Prod.5000","lowerQ_Prod.10000", xpop_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c("upperQ_Prod.2000","upperQ_Prod.5000","upperQ_Prod.10000", xpop_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c("Cass_Prod","Cass_Prod_5k","Cass_Prod_10k", xsettle_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c("mean_Prod.2000","mean_Prod.5000","mean_Prod.10000", xsettle_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c("max_Prod.2000","max_Prod.5000","max_Prod.10000", xsettle_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c("min_Prod.2000","min_Prod.5000","min_Prod.10000", xsettle_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c("lowerQ_Prod.2000","lowerQ_Prod.5000","lowerQ_Prod.10000", xsettle_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c("upperQ_Prod.2000","upperQ_Prod.5000","upperQ_Prod.10000", xsettle_names)], use="pairwise", method="spearman")) # the different data layers ggcorrplot(cor(CIVdata[, c(xmodel_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c(xpop_names)], use="pairwise", method="spearman")) ggcorrplot(cor(CIVdata[, c(xsettle_names)], use="pairwise", method="spearman")) ggcorrplot(cor(UGAdata[, c(xmodel_names)], use="pairwise", method="spearman")) ggcorrplot(cor(UGAdata[, c(xpop_names)], use="pairwise", method="spearman")) ggcorrplot(cor(UGAdata[, c(xsettle_names)], use="pairwise", method="spearman")) # Modelling -------- # # want to link observed cassava production to modelled cassava production # ggplot(CIVdata, aes(x=Cass_Prod, y=tot_cassava_area)) + geom_point() # ggplot(CIVdata, aes(x=log(Cass_Prod), y=log(tot_cassava_area))) + geom_point() # # # Cass_Prod needs to be on log scale to remove influence of skew # cprod <- min(CIVdata$Cass_Prod[CIVdata$Cass_Prod > 0]) / 2 # m1 <- lm(tot_cassava_area ~ log(Cass_Prod + cprod), data=CIVdata) # par(mfrow=c(2,2)) # plot(m1) # # zero inflated # m1 <- lm(tot_cassava_area ~ log(Cass_Prod + cprod), data=CIVdata_pres) # par(mfrow=c(2,2)) # plot(m1) # # # add in covariates # m2 <- lm(tot_cassava_area ~tot_buildings + log(Cass_Prod + cprod), data=CIVdata_pres) # summary(m2) # # mall <- lm(tot_cassava_area ~ mean_lspop.2000 + mean_settle.2000 + Population + log(Cass_Prod + cprod), data=CIVdata_pres) # summary(mall) # mall2 <- step(mall) # summary(mall2) # # # what about harvest area rather than cassava production? # # # now move to aggregated model predictions rather than point locations. # ggplot(CIVdata, aes(x=log(mean_Prod.2000), y=tot_cassava_area)) + geom_point() # ggplot(CIVdata, aes(x=log(mean_Prod.5000), y=tot_cassava_area)) + geom_point() # ggplot(CIVdata, aes(x=log(mean_Prod.10000), y=tot_cassava_area)) + geom_point() # # mall <- lm(tot_cassava_area ~ mean_lspop.2000 + mean_settle.2000 + Population + log(mean_Prod.2000), data=CIVdata_pres) # summary(mall) # mall2 <- step(mall) # summary(mall2) # plot(mall2) # # mall <- lm(tot_cassava_area ~ mean_lspop.5000 + mean_settle.5000 + Population + log(mean_Prod.5000), data=CIVdata_pres) # summary(mall) # mall2 <- step(mall) # summary(mall2) # plot(mall2) # # # what about other summaries # # # what about multiplicative effects or curvey relationships # # # need a strategy for finding the best model... # Model fitting strategy ------------ # Step 1: response ~ diff modelled values fitModel <- function(x_name, y_name, dat){ dat$x <- dat[, x_name] dat$y <- dat[, y_name] if (min(dat$x, na.rm=TRUE) <=0 ){ c <- min(dat$x[dat$x > 0], na.rm=TRUE) / 2 } else { c <- 0 } m1 <- lm(y ~ x, data=dat) m2 <- lm(y ~ log(x + c), data=dat) m3 <- lm(y ~ x + I(x^2), data=dat) m4 <- gam(y ~ s(x), data=dat) formula <- c(formula(m1), formula(m2), formula(m3), formula(m4)) AIC <- c(AIC(m1), AIC(m2), AIC(m3), AIC(m4)) BIC <- c(BIC(m1), BIC(m2), BIC(m3), BIC(m4)) adjR <- c(summary(m1)$adj.r.squared, summary(m2)$adj.r.squared, summary(m3)$adj.r.squared, summary(m4)$r.sq) response <- y_name explanatory <- c(x_name, paste("log(", x_name, ")", sep=""), paste(x_name, "+", x_name, "^2", sep=""), paste("s(", x_name, ")", sep="")) # include more details of explanatry model cass_model <- x_name population <- "not included" settlement <- "not included" modeltype <- c(rep("lm", 3), rep("gam", 1)) out <- data.frame(response, explanatory, AIC, BIC, adjR, cass_model, population, settlement, modeltype) return(out) } # y_name="tot_cassava_area" # x_name=xmodel_names[49] # dat=CIVdata # fitModel(x_name, y_name, dat) # # loop over all possible model variables # step1 <- lapply(xmodel_names, fitModel, y_name="tot_cassava_area", dat=CIVdata) # # unlist # step1_df <- rbindlist(step1) # head(step1_df) # Step 2: Response ~ Model x Population # Step 3: Response ~ Model x Settlement fitModelZ <- function(x_name, z_name, y_name, dat, type){ dat$x <- dat[, x_name] dat$z <- dat[, z_name] dat$y <- dat[, y_name] if (min(dat$x, na.rm=TRUE) <=0 ){ c <- min(dat$x[dat$x > 0], na.rm=TRUE) / 2 } else { c <- 0 } if (min(dat$z, na.rm=TRUE) <=0 ){ cz <- min(dat$z[dat$z > 0], na.rm=TRUE) / 2 } else { cz <- 0 } m1 <- lm(y ~ x + z, data=dat) m2 <- lm(y ~ log(x + c) + z, data=dat) m3 <- lm(y ~ x + log(z + cz), data=dat) m4 <- lm(y ~ log(x + c) + log(z + cz), data=dat) m5 <- gam(y ~ s(x) + s(z), data=dat) m6 <- gam(y ~ s(x,z), data=dat) AIC <- c(AIC(m1), AIC(m2), AIC(m3), AIC(m4), AIC(m5), AIC(m6)) BIC <- c(BIC(m1), BIC(m2), BIC(m3), BIC(m4), BIC(m5), BIC(m6)) adjR <- c(summary(m1)$adj.r.squared, summary(m2)$adj.r.squared, summary(m3)$adj.r.squared, summary(m4)$adj.r.squared, summary(m5)$r.sq, summary(m6)$r.sq) response <- y_name explanatory <- c(paste(x_name, "+", z_name, sep=""), paste("log(",x_name, ")+", z_name, sep=""),paste(x_name, "+log(", z_name,")", sep=""), paste("log(",x_name, ")+log(", z_name,")", sep=""),paste("s(",x_name, ")+s(", z_name,")", sep=""),paste("s(",x_name, ":", z_name,")", sep="")) # include more details of explanatry model cass_model <- x_name if (type == "population"){ population <- z_name settlement <- "not included" } else if (type == "settlement"){ population <- "not included" settlement <- z_name } modeltype <- c(rep("lm", 4), rep("gam", 2)) out <- data.frame(response, explanatory, AIC, BIC, adjR, cass_model, population, settlement, modeltype) return(out) } # y_name="tot_cassava_area" # x_name=xmodel_names[49] # z_name="mean_lspop.2000" # dat=CIVdata # fitModelZ(x_name, z_name, y_name, dat) fitModelZ_outer <- function(z_name, y_name, dat, type){ # loop over all possible model variables temp <- lapply(xmodel_names, fitModelZ, z_name=z_name, y_name=y_name, dat=dat, type=type) # unlist temp_df <- rbindlist(temp) return(temp_df) } # # loop over all possible model variables and population summaries # step2 <- lapply(xpop_names, fitModelZ_outer, y_name="tot_cassava_area", dat=CIVdata) # # unlist # step2_df <- rbindlist(step2) # head(step2_df) # # # loop over all possible model variables and settlement summaries # step3 <- lapply(xsettle_names, fitModelZ_outer, y_name="tot_cassava_area", dat=CIVdata) # # unlist # step3_df <- rbindlist(step3) # head(step3_df) # Step 4: Response ~ Model x Population x Settlement fitModelZZ <- function(x_name, z_name1, z_name2, y_name, dat){ dat$x <- dat[, x_name] dat$z1 <- dat[, z_name1] dat$z2 <- dat[, z_name2] dat$y <- dat[, y_name] if (min(dat$x, na.rm=TRUE) <=0 ){ c <- min(dat$x[dat$x > 0], na.rm=TRUE) / 2 } else { c <- 0 } if (min(dat$z1, na.rm=TRUE) <=0 ){ cz1 <- min(dat$z1[dat$z1 > 0], na.rm=TRUE) / 2 } else { cz1 <- 0 } if (min(dat$z2, na.rm=TRUE) <=0 ){ cz2 <- min(dat$z2[dat$z2 > 0], na.rm=TRUE) / 2 } else { cz2 <- 0 } m1 <- lm(y ~ x + z1 + z2, data=dat) m2 <- lm(y ~ log(x + c) + z1 + z2, data=dat) m3 <- lm(y ~ x + log(z1 + cz1) + z2, data=dat) m4 <- lm(y ~ x + z1 + log(z2 + cz2), data=dat) m5 <- lm(y ~ x + log(z1 + cz1) + log(z2 + cz2), data=dat) m6 <- lm(y ~ log(x + c) + log(z1 + cz1) + log(z2 + cz2), data=dat) m7 <- gam(y ~ s(x) + s(z1) + s(z2), data=dat) m8 <- gam(y ~ s(x,z1) + s(z2), data=dat) m9 <- gam(y ~ s(x,z2) + s(z1), data=dat) m10 <- gam(y ~ s(x) + s(z1,z2), data=dat) # m11 <- gam(y ~ s(x,z1,z2), data=dat) # fails to fit AIC <- c(AIC(m1), AIC(m2), AIC(m3), AIC(m4), AIC(m5), AIC(m6), AIC(m7), AIC(m8), AIC(m9), AIC(m10)) BIC <- c(BIC(m1), BIC(m2), BIC(m3), BIC(m4), BIC(m5), BIC(m6), BIC(m7), BIC(m8), BIC(m9), BIC(m10)) adjR <- c(summary(m1)$adj.r.squared, summary(m2)$adj.r.squared, summary(m3)$adj.r.squared, summary(m4)$adj.r.squared, summary(m5)$adj.r.squared, summary(m6)$adj.r.squared, summary(m7)$r.sq, summary(m8)$r.sq, summary(m9)$r.sq, summary(m10)$r.sq) response <- y_name explanatory <- c(paste(x_name, "+", z_name1, "+", z_name2, sep=""), paste("log(",x_name, ")+", z_name1, "+", z_name2, sep=""), paste(x_name, "+log(", z_name1,")", "+", z_name2, sep=""), paste(x_name, "+", z_name1, "+log(", z_name2,")", sep=""), paste(x_name, "+log(", z_name1,")", "+log(", z_name2,")", sep=""), paste("log(",x_name, ")+log(", z_name1,")", "+log(", z_name2,")", sep=""), paste("s(",x_name, ")+s(", z_name1,")+s(", z_name2,")", sep=""), paste("s(",x_name, ":", z_name1,")+s(", z_name2,")", sep=""), paste("s(",x_name, ":", z_name2,")+s(", z_name1,")", sep=""), paste("s(",x_name, ")+s(", z_name1,":", z_name2,")", sep="")) # include more details of explanatry model cass_model <- x_name population <- z_name1 settlement <- z_name2 modeltype <- c(rep("lm", 6), rep("gam", 4)) out <- data.frame(response, explanatory, AIC, BIC, adjR, cass_model, population, settlement, modeltype) return(out) } # y_name="tot_cassava_area" # x_name=xmodel_names[49] # z_name1="mean_lspop.2000" # z_name2="mean_settle.2000" # dat=CIVdata # fitModelZZ(x_name, z_name1, z_name2, y_name, dat) fitModelZZ_outer1 <- function(z_name1, z_name2, y_name, dat){ # loop over all possible model variables temp <- lapply(xmodel_names, fitModelZZ, z_name1=z_name1, z_name2=z_name2, y_name=y_name, dat=dat) # unlist temp_df <- rbindlist(temp) return(temp_df) } fitModelZZ_outer2 <- function(z_name2, y_name, dat){ # loop over all possible pop variables (and model variables) temp <- lapply(xpop_names, fitModelZZ_outer1, z_name2=z_name2, y_name=y_name, dat=dat) # unlist temp_df <- rbindlist(temp) return(temp_df) } # # loop over all possible settlement summaries (and pop and model variables) # step4 <- lapply(xsettle_names, fitModelZZ_outer2, y_name="tot_cassava_area", dat=CIVdata) # # unlist # step4_df <- rbindlist(step4) # head(step4_df) # Step 6: loop over response variables getAllModels <- function(y_name, dat){ # loop over all possible model variables step1 <- lapply(xmodel_names, fitModel, y_name=y_name, dat=dat) # unlist step1_df <- rbindlist(step1) print("Finished Step 1") # head(step1_df) # loop over all possible model variables and population summaries step2 <- lapply(xpop_names, fitModelZ_outer, y_name=y_name, dat=dat, type="population") # unlist step2_df <- rbindlist(step2) print("Finished Step 2") # head(step2_df) # loop over all possible model variables and settlement summaries step3 <- lapply(xsettle_names, fitModelZ_outer, y_name=y_name, dat=dat, type="settlement") # unlist step3_df <- rbindlist(step3) print("Finished Step 3") # head(step3_df) # loop over all possible settlement summaries (and pop and model variables) step4 <- lapply(xsettle_names, fitModelZZ_outer2, y_name=y_name, dat=dat) # unlist step4_df <- rbindlist(step4) print("Finished Step 4") # head(step4_df) all_models <- rbind(step1_df, step2_df, step3_df, step4_df) } # takes a little time to complete step 4 r1_CIV <- getAllModels("tot_cassava_area", dat=CIVdata) r2_CIV <- getAllModels("tot_monoculture_area", dat=CIVdata) r3_CIV <- getAllModels("tot_intercrop_area", dat=CIVdata) r4_CIV <- getAllModels("tot_area_ind_plants", dat=CIVdata) r5_CIV <- getAllModels("tot_cassava_area_w", dat=CIVdata) r6_CIV <- getAllModels("tot_monoculture_area_w", dat=CIVdata) r7_CIV <- getAllModels("tot_intercrop_area_w", dat=CIVdata) # presence only r1_CIV_pres <- getAllModels("tot_cassava_area", dat=CIVdata_pres) r2_CIV_pres <- getAllModels("tot_monoculture_area", dat=CIVdata_pres) r3_CIV_pres <- getAllModels("tot_intercrop_area", dat=CIVdata_pres) r4_CIV_pres <- getAllModels("tot_area_ind_plants", dat=CIVdata_pres) r5_CIV_pres <- getAllModels("tot_cassava_area_w", dat=CIVdata_pres) r6_CIV_pres <- getAllModels("tot_monoculture_area_w", dat=CIVdata_pres) r7_CIV_pres <- getAllModels("tot_intercrop_area_w", dat=CIVdata_pres) # Step 7: repeat for uganda r1_UGA <- getAllModels("tot_cassava_area", dat=UGAdata) r2_UGA <- getAllModels("tot_monoculture_area", dat=UGAdata) r3_UGA <- getAllModels("tot_intercrop_area", dat=UGAdata) r4_UGA <- getAllModels("tot_area_ind_plants", dat=UGAdata) r5_UGA <- getAllModels("tot_cassava_area_w", dat=UGAdata) r6_UGA <- getAllModels("tot_monoculture_area_w", dat=UGAdata) r7_UGA <- getAllModels("tot_intercrop_area_w", dat=UGAdata) # presence only r1_UGA_pres <- getAllModels("tot_cassava_area", dat=UGAdata_pres) r2_UGA_pres <- getAllModels("tot_monoculture_area", dat=UGAdata_pres) r3_UGA_pres <- getAllModels("tot_intercrop_area", dat=UGAdata_pres) r4_UGA_pres <- getAllModels("tot_area_ind_plants", dat=UGAdata_pres) r5_UGA_pres <- getAllModels("tot_cassava_area_w", dat=UGAdata_pres) r6_UGA_pres <- getAllModels("tot_monoculture_area_w", dat=UGAdata_pres) r7_UGA_pres <- getAllModels("tot_intercrop_area_w", dat=UGAdata_pres) save(r1_CIV, r2_CIV, r3_CIV, r4_CIV, r5_CIV, r6_CIV, r7_CIV, r1_CIV_pres, r2_CIV_pres, r3_CIV_pres, r4_CIV_pres, r5_CIV_pres, r6_CIV_pres, r7_CIV_pres, r1_UGA, r2_UGA, r3_UGA, r4_UGA, r5_UGA, r6_UGA, r7_UGA, r1_UGA_pres, r2_UGA_pres, r3_UGA_pres, r4_UGA_pres, r5_UGA_pres, r6_UGA_pres, r7_UGA_pres, file="3_modelVsSurvey_output.rdata") load("3_modelVsSurvey_output.rdata") # step 8: analyse, interpret (and refine) # ggplot(r1_CIV, aes(x=AIC)) + geom_histogram() # smallish range in AIC but some models come out better # ggplot(r2_CIV, aes(x=AIC)) + geom_histogram() # substantial range in AIC # ggplot(r3_CIV, aes(x=AIC)) + geom_histogram() # moderate AIC range # ggplot(r4_CIV, aes(x=AIC)) + geom_histogram() # substantial range # ggplot(r5_CIV, aes(x=AIC)) + geom_histogram() # moderate range # ggplot(r6_CIV, aes(x=AIC)) + geom_histogram() # substantial range # ggplot(r7_CIV, aes(x=AIC)) + geom_histogram() # small range ord_1 <- order(r1_CIV$AIC) r1_CIV[ord_1[1:10], ] # top models are all gams and inlcude all layers :-) # do we see something similar in presence only? ord_1_pres <- order(r1_CIV_pres$AIC) r1_CIV_pres[ord_1_pres[1:10], ] # ish # and is there any difference in looking at weighted cassava production ord_5 <- order(r5_CIV$AIC) r5_CIV[ord_5[1:10], ] # fit the top model and interpret # seeing pretty decent improvments in adjusted R2 - hurrah, the effort seems worth it # look into the top models... mod_CIV <- gam(tot_cassava_area ~ s(mean_Prod.10000,min_lspop.2000)+s(mean_settle.2000), data=CIVdata) gam.check(mod_CIV) plot(mod_CIV, scheme=2) summary(mod_CIV) par(mfrow=c(1,2)) plot(mod_CIV, select=1, scheme=2, main="", ylim=c(0, 30), xlim=c(0, 200)) plot(mod_CIV, select=2, main="", xlim=c(0, 0.15)) # and how about in Uganda? ord_1 <- order(r1_UGA$AIC) r1_UGA[ord_1[1:10], ] # top models are all gams and inlcude all layers. Much more consistent in choice of variables # do we see something similar in presence only? ord_1_pres <- order(r1_UGA_pres$AIC) r1_UGA_pres[ord_1_pres[1:10], ] # ish mod_UGA <- gam(tot_cassava_area ~ s(Cass_HA,median_lspop.5000)+s(mean_settle.2000), data=UGAdata) gam.check(mod_UGA) plot(mod_UGA, scheme=2) summary(mod_UGA) par(mfrow=c(1,2)) plot(mod_UGA, select=1, scheme=2, main="", ylim=c(0, 250), xlim=c(0, 40)) plot(mod_UGA, select=2, main="", xlim=c(0, 0.2)) # model surface is not straightforward. clearly these are (or are proxies) for things missing in the model to match to observed data. # some summary figures of the models fitted getSummaries <- function(r1){ r1$cass_type <- NA r1$cass_type[grep("Harv", r1$cass_model)] <- "Harv" r1$cass_type[grep("HA", r1$cass_model)] <- "Harv" r1$cass_type[grep("Prod", r1$cass_model)] <- "Prod" table(r1$cass_type, useNA="ifany") r1$cass_summary <- sapply(r1$cass_model, FUN=function(x)strsplit(as.character(x), "_", fixed=TRUE)[[1]][1]) r1$cass_summary[r1$cass_summary == "Cass"] <- "mean" # mean of a single point = point estimate table(r1$cass_summary, useNA="ifany") r1$cass_dist <- NA r1$cass_dist[grep("5", r1$cass_model)] <- "5km" r1$cass_dist[grep("2", r1$cass_model)] <- "2km" r1$cass_dist[grep("10", r1$cass_model)] <- "10km" r1$cass_dist[is.na(r1$cass_dist)] <- "0km" table(r1$cass_dist, useNA="ifany") # table(r1$cass_dist,r1$cass_model, useNA="ifany") r1$population_type <- NA r1$population_type[grep("not included", r1$population)] <- "not included" r1$population_type[grep("Population", r1$population)] <- "population" r1$population_type[grep("lspop", r1$population)] <- "population" table(r1$population_type, useNA="ifany") r1$population_summary <- sapply(r1$population, FUN=function(x)strsplit(as.character(x), "_", fixed=TRUE)[[1]][1]) r1$population_summary[r1$population_summary %in% c("Population")] <- "mean" # mean of a single point = point estimate table(r1$population_summary, useNA="ifany") r1$population_dist <- NA r1$population_dist[grep("5", r1$population)] <- "5km" r1$population_dist[grep("2", r1$population)] <- "2km" r1$population_dist[grep("10", r1$population)] <- "10km" r1$population_dist[is.na(r1$population_dist)] <- "0km" r1$population_dist[r1$population == "not included"] <- "not included" table(r1$population_dist, useNA="ifany") r1$settlement_type <- NA r1$settlement_type[grep("not included", r1$settlement)] <- "not included" r1$settlement_type[grep("settle", r1$settlement)] <- "settlement" table(r1$settlement_type, useNA="ifany") r1$settlement_summary <- sapply(r1$settlement, FUN=function(x)strsplit(as.character(x), "_", fixed=TRUE)[[1]][1]) table(r1$settlement_summary, useNA="ifany") r1$settlement_dist <- NA r1$settlement_dist[grep("5", r1$settlement)] <- "5km" r1$settlement_dist[grep("2", r1$settlement)] <- "2km" r1$settlement_dist[grep("10", r1$settlement)] <- "10km" r1$settlement_dist[is.na(r1$settlement_dist)] <- "0km" r1$settlement_dist[r1$settlement == "not included"] <- "not included" table(r1$settlement_dist, useNA="ifany") r1$modeltype <- factor(r1$modeltype) r1$cass_type <- factor(r1$cass_type) r1$cass_summary <- factor(r1$cass_summary) r1$cass_dist <- factor(r1$cass_dist) r1$population_type <- factor(r1$population_type) r1$population_summary <- factor(r1$population_summary) r1$population_dist <- factor(r1$population_dist) r1$settlement_type <- factor(r1$settlement_type) r1$settlement_summary <- factor(r1$settlement_summary) r1$settlement_dist <- factor(r1$settlement_dist) return(r1) } r1_CIV <- getSummaries(r1_CIV) r2_CIV <- getSummaries(r2_CIV) r3_CIV <- getSummaries(r3_CIV) r4_CIV <- getSummaries(r4_CIV) r5_CIV <- getSummaries(r5_CIV) r6_CIV <- getSummaries(r6_CIV) r7_CIV <- getSummaries(r7_CIV) r1_CIV_pres <- getSummaries(r1_CIV_pres) r2_CIV_pres <- getSummaries(r2_CIV_pres) r3_CIV_pres <- getSummaries(r3_CIV_pres) r4_CIV_pres <- getSummaries(r4_CIV_pres) r5_CIV_pres <- getSummaries(r5_CIV_pres) r6_CIV_pres <- getSummaries(r6_CIV_pres) r7_CIV_pres <- getSummaries(r7_CIV_pres) r1_UGA <- getSummaries(r1_UGA) r2_UGA <- getSummaries(r2_UGA) r3_UGA <- getSummaries(r3_UGA) r4_UGA <- getSummaries(r4_UGA) r5_UGA <- getSummaries(r5_UGA) r6_UGA <- getSummaries(r6_UGA) r7_UGA <- getSummaries(r7_UGA) r1_UGA_pres <- getSummaries(r1_UGA_pres) r2_UGA_pres <- getSummaries(r2_UGA_pres) r3_UGA_pres <- getSummaries(r3_UGA_pres) r4_UGA_pres <- getSummaries(r4_UGA_pres) r5_UGA_pres <- getSummaries(r5_UGA_pres) r6_UGA_pres <- getSummaries(r6_UGA_pres) r7_UGA_pres <- getSummaries(r7_UGA_pres) # with so many factors, fit a linear model to the AICs to see the effect of different things? # thing we're missing is whether we have a log term or not... mod_AIC_CIV <- lm(AIC ~ modeltype* (cass_type*(cass_dist * cass_summary) * population_type/(population_dist * population_summary) * settlement_type/(settlement_dist)), data=r1_CIV) # above too many interactions to fit. let's retsrict number of higher order terms... mod_AIC_CIV <- lm(AIC ~ modeltype* (cass_type * population_type * settlement_type + cass_type*(cass_dist * cass_summary) + population_type/(population_dist * population_summary) + settlement_type/(settlement_dist)), data=r1_CIV) mod_BIC_CIV <- lm(BIC ~ modeltype* (cass_type * population_type * settlement_type + cass_type*(cass_dist * cass_summary) + population_type/(population_dist * population_summary) + settlement_type/(settlement_dist)), data=r1_CIV) mod_R2_CIV <- lm(log(adjR + 0.06) ~ modeltype* (cass_type * population_type * settlement_type + cass_type*(cass_dist * cass_summary) + population_type/(population_dist * population_summary) + settlement_type/(settlement_dist)), data=r1_CIV) par(mfrow=c(2,2)) plot(mod_AIC_CIV) plot(mod_BIC_CIV) # bit dodgy, let's leave it out. plot(mod_R2_CIV) # confounding - order of fitting matters. either fit the different orders and look at sequential F's, or export to genstat to get the marginal and conditional F's # how many orders to do we have... # cass, prod, sett # cass, sett, prod # prod, cass, sett # prod, sett, cass # sett, cass, prod # sett, prod, cass # 6 ways of choosing main covariate x 2 ways (dist, summ or summ, dist) # 12 orders of fitting that make sense to look at. # we could look at type II F's from Anova instead, but this seems to be doing something slightly different. I would expect this to correspond to genstat's conditional F's, and it does ;-). # qualitatively checking there's no difference in two approaches so let's just stick with the type II F's from Anova. # write.csv(r1_CIV, file="r1_CIV.csv", row.names=FALSE) Anova(mod_AIC_CIV) # set up function to extract the F's for each response and each model summary getF <- function(r1){ mod_AIC_CIV <- lm(AIC ~ modeltype* (cass_type * population_type * settlement_type + cass_type*(cass_dist * cass_summary) + population_type/(population_dist * population_summary) + settlement_type/(settlement_dist)), data=r1) offset <- abs(min(r1$adjR) * 1.5 * (min(r1$adj) < 0)) mod_R2_CIV <- lm(log(adjR + offset) ~ modeltype* (cass_type * population_type * settlement_type + cass_type*(cass_dist * cass_summary) + population_type/(population_dist * population_summary) + settlement_type/(settlement_dist)), data=r1) par(mfrow=c(2,2)) plot(mod_AIC_CIV, main="AIC") plot(mod_R2_CIV, main="adjR2") anova_AIC <- Anova(mod_AIC_CIV) anova_R2 <- Anova(mod_R2_CIV) return(list(anova_AIC=anova_AIC, anova_R2=anova_R2)) } # these take a bit of time to run. Bit frustrating, for some reason it's much much faster in genstat! anova_r1_CIV <- getF(r1_CIV) anova_r2_CIV <- getF(r2_CIV) anova_r3_CIV <- getF(r3_CIV) anova_r4_CIV <- getF(r4_CIV) anova_r5_CIV <- getF(r5_CIV) anova_r6_CIV <- getF(r6_CIV) anova_r7_CIV <- getF(r7_CIV) anova_r1_CIV_pres <- getF(r1_CIV_pres) anova_r2_CIV_pres <- getF(r2_CIV_pres) anova_r3_CIV_pres <- getF(r3_CIV_pres) anova_r4_CIV_pres <- getF(r4_CIV_pres) anova_r5_CIV_pres <- getF(r5_CIV_pres) anova_r6_CIV_pres <- getF(r6_CIV_pres) anova_r7_CIV_pres <- getF(r7_CIV_pres) anova_r1_UGA <- getF(r1_UGA) anova_r2_UGA <- getF(r2_UGA) anova_r3_UGA <- getF(r3_UGA) anova_r4_UGA <- getF(r4_UGA) anova_r5_UGA <- getF(r5_UGA) anova_r6_UGA <- getF(r6_UGA) anova_r7_UGA <- getF(r7_UGA) anova_r1_UGA_pres <- getF(r1_UGA_pres) anova_r2_UGA_pres <- getF(r2_UGA_pres) anova_r3_UGA_pres <- getF(r3_UGA_pres) anova_r4_UGA_pres <- getF(r4_UGA_pres) anova_r5_UGA_pres <- getF(r5_UGA_pres) anova_r6_UGA_pres <- getF(r6_UGA_pres) anova_r7_UGA_pres <- getF(r7_UGA_pres) save(anova_r1_CIV,anova_r2_CIV,anova_r3_CIV,anova_r4_CIV,anova_r5_CIV,anova_r6_CIV,anova_r7_CIV, anova_r1_CIV_pres,anova_r2_CIV_pres,anova_r3_CIV_pres,anova_r4_CIV_pres,anova_r5_CIV_pres,anova_r6_CIV_pres,anova_r7_CIV_pres, anova_r1_UGA,anova_r2_UGA,anova_r3_UGA,anova_r4_UGA,anova_r5_UGA,anova_r6_UGA,anova_r7_UGA, anova_r1_UGA_pres,anova_r2_UGA_pres,anova_r3_UGA_pres,anova_r4_UGA_pres,anova_r5_UGA_pres,anova_r6_UGA_pres,anova_r7_UGA_pres,file="3_modelVsSurvey_modelANOVAs.rdata") load("3_modelVsSurvey_modelANOVAs.rdata") # export to tables CIV_table <- data.frame(DF = anova_r1_CIV$anova_AIC$Df, F_r1_AIC=anova_r1_CIV$anova_AIC$`F value`,F_r1_R2=anova_r1_CIV$anova_R2$`F value`, F_r2_AIC=anova_r2_CIV$anova_AIC$`F value`,F_r2_R2=anova_r2_CIV$anova_R2$`F value`, F_r3_AIC=anova_r3_CIV$anova_AIC$`F value`,F_r3_R2=anova_r3_CIV$anova_R2$`F value`, F_r4_AIC=anova_r4_CIV$anova_AIC$`F value`,F_r4_R2=anova_r4_CIV$anova_R2$`F value`, F_r5_AIC=anova_r5_CIV$anova_AIC$`F value`,F_r5_R2=anova_r5_CIV$anova_R2$`F value`, F_r6_AIC=anova_r6_CIV$anova_AIC$`F value`,F_r6_R2=anova_r6_CIV$anova_R2$`F value`, F_r7_AIC=anova_r7_CIV$anova_AIC$`F value`,F_r7_R2=anova_r7_CIV$anova_R2$`F value`) rownames(CIV_table) <- rownames(anova_r1_CIV$anova_AIC) CIV_pres_table <- data.frame(DF = anova_r1_CIV_pres$anova_AIC$Df, F_r1_AIC=anova_r1_CIV_pres$anova_AIC$`F value`,F_r1_R2=anova_r1_CIV_pres$anova_R2$`F value`, F_r2_AIC=anova_r2_CIV_pres$anova_AIC$`F value`,F_r2_R2=anova_r2_CIV_pres$anova_R2$`F value`, F_r3_AIC=anova_r3_CIV_pres$anova_AIC$`F value`,F_r3_R2=anova_r3_CIV_pres$anova_R2$`F value`, F_r4_AIC=anova_r4_CIV_pres$anova_AIC$`F value`,F_r4_R2=anova_r4_CIV_pres$anova_R2$`F value`, F_r5_AIC=anova_r5_CIV_pres$anova_AIC$`F value`,F_r5_R2=anova_r5_CIV_pres$anova_R2$`F value`, F_r6_AIC=anova_r6_CIV_pres$anova_AIC$`F value`,F_r6_R2=anova_r6_CIV_pres$anova_R2$`F value`, F_r7_AIC=anova_r7_CIV_pres$anova_AIC$`F value`,F_r7_R2=anova_r7_CIV_pres$anova_R2$`F value`) rownames(CIV_pres_table) <- rownames(anova_r1_CIV_pres$anova_AIC) UGA_table <- data.frame(DF = anova_r1_UGA$anova_AIC$Df, F_r1_AIC=anova_r1_UGA$anova_AIC$`F value`,F_r1_R2=anova_r1_UGA$anova_R2$`F value`, F_r2_AIC=anova_r2_UGA$anova_AIC$`F value`,F_r2_R2=anova_r2_UGA$anova_R2$`F value`, F_r3_AIC=anova_r3_UGA$anova_AIC$`F value`,F_r3_R2=anova_r3_UGA$anova_R2$`F value`, F_r4_AIC=anova_r4_UGA$anova_AIC$`F value`,F_r4_R2=anova_r4_UGA$anova_R2$`F value`, F_r5_AIC=anova_r5_UGA$anova_AIC$`F value`,F_r5_R2=anova_r5_UGA$anova_R2$`F value`, F_r6_AIC=anova_r6_UGA$anova_AIC$`F value`,F_r6_R2=anova_r6_UGA$anova_R2$`F value`, F_r7_AIC=anova_r7_UGA$anova_AIC$`F value`,F_r7_R2=anova_r7_UGA$anova_R2$`F value`) rownames(UGA_table) <- rownames(anova_r1_UGA$anova_AIC) UGA_pres_table <- data.frame(DF = anova_r1_UGA_pres$anova_AIC$Df, F_r1_AIC=anova_r1_UGA_pres$anova_AIC$`F value`,F_r1_R2=anova_r1_UGA_pres$anova_R2$`F value`, F_r2_AIC=anova_r2_UGA_pres$anova_AIC$`F value`,F_r2_R2=anova_r2_UGA_pres$anova_R2$`F value`, F_r3_AIC=anova_r3_UGA_pres$anova_AIC$`F value`,F_r3_R2=anova_r3_UGA_pres$anova_R2$`F value`, F_r4_AIC=anova_r4_UGA_pres$anova_AIC$`F value`,F_r4_R2=anova_r4_UGA_pres$anova_R2$`F value`, F_r5_AIC=anova_r5_UGA_pres$anova_AIC$`F value`,F_r5_R2=anova_r5_UGA_pres$anova_R2$`F value`, F_r6_AIC=anova_r6_UGA_pres$anova_AIC$`F value`,F_r6_R2=anova_r6_UGA_pres$anova_R2$`F value`, F_r7_AIC=anova_r7_UGA_pres$anova_AIC$`F value`,F_r7_R2=anova_r7_UGA_pres$anova_R2$`F value`) rownames(UGA_pres_table) <- rownames(anova_r1_UGA_pres$anova_AIC) write.csv(CIV_table, file="ModelSummaries_CIV.csv") write.csv(CIV_pres_table, file="ModelSummaries_CIV_pres.csv") write.csv(UGA_table, file="ModelSummaries_UGA.csv") write.csv(UGA_pres_table, file="ModelSummaries_UGA_pres.csv") # key summary figures. # using raw data rather than model means. Things would be clearer if we used predictedmeans for each group ggplot(r1_CIV, aes(x=modeltype, y=AIC, colour=modeltype)) + geom_boxplot() ggplot(r1_CIV, aes(x=settlement_type, y=AIC, colour=population_type)) + geom_boxplot() r1_CIV$cass_dist <- factor(r1_CIV$cass_dist, levels=c("0km", "2km", "5km", "10km")) ggplot(r1_CIV, aes(x=cass_dist, y=AIC, colour=cass_summary)) + geom_boxplot() + facet_wrap( ~ cass_summary) ggplot(r1_CIV, aes(x=population_summary, y=AIC, colour=population_dist)) + geom_boxplot() ggplot(r1_CIV, aes(x=settlement_summary, y=AIC, colour=settlement_dist)) + geom_boxplot() ggplot(r1_UGA, aes(x=modeltype, y=AIC, colour=modeltype)) + geom_boxplot() ggplot(r1_UGA, aes(x=cass_type, y=AIC, colour=modeltype)) + geom_boxplot() ggplot(r1_UGA, aes(x=settlement_type, y=AIC, colour=population_type)) + geom_boxplot() r1_UGA$cass_dist <- factor(r1_UGA$cass_dist, levels=c("0km", "2km", "5km", "10km")) ggplot(r1_UGA, aes(x=cass_dist, y=AIC, colour=cass_summary)) + geom_boxplot() + facet_wrap( ~ cass_summary) ggplot(r1_UGA, aes(x=population_summary, y=AIC, colour=population_dist)) + geom_boxplot() ggplot(r1_UGA, aes(x=settlement_summary, y=AIC, colour=settlement_dist)) + geom_boxplot() # given main effect is gam, let's only plot the gam models library(tidyr) library(dplyr) r1_CIV$cass_dist <- factor(r1_CIV$cass_dist, levels=c("0km", "2km", "5km", "10km")) r1_UGA$cass_dist <- factor(r1_UGA$cass_dist, levels=c("0km", "2km", "5km", "10km")) r1_CIV$settlement_dist <- factor(r1_CIV$settlement_dist, levels=c("not included", "2km", "5km", "10km")) r1_UGA$settlement_dist <- factor(r1_UGA$settlement_dist, levels=c("not included", "2km", "5km", "10km")) r1_CIV$population_dist <- factor(r1_CIV$population_dist, levels=c("not included","0km", "2km", "5km", "10km")) r1_UGA$population_dist <- factor(r1_UGA$population_dist, levels=c("not included","0km", "2km", "5km", "10km")) ggplot(r1_CIV %>% filter(modeltype == "gam"), aes(x=cass_type, y=AIC, colour=cass_dist)) + geom_boxplot() + ggtitle("Ivory Coast") ggplot(r1_UGA %>% filter(modeltype == "gam"), aes(x=cass_type, y=AIC, colour=cass_dist)) + geom_boxplot() + ggtitle("Uganda") ggplot(r1_CIV %>% filter(modeltype == "gam"), aes(x=cass_summary, y=AIC, colour=cass_summary)) + geom_boxplot() + ggtitle("Ivory Coast") ggplot(r1_UGA %>% filter(modeltype == "gam"), aes(x=cass_summary, y=AIC, colour=cass_summary)) + geom_boxplot() + ggtitle("Uganda") ggplot(r1_CIV %>% filter(modeltype == "gam"), aes(x=population_summary, y=AIC, colour=population_summary)) + geom_boxplot() + ggtitle("Ivory Coast") ggplot(r1_UGA %>% filter(modeltype == "gam"), aes(x=population_summary, y=AIC, colour=population_summary)) + geom_boxplot() + ggtitle("Uganda") ggplot(r1_CIV %>% filter(modeltype == "gam"), aes(x=settlement_type, y=AIC, colour=settlement_dist)) + geom_boxplot() + ggtitle("Ivory Coast") ggplot(r1_UGA %>% filter(modeltype == "gam"), aes(x=settlement_type, y=AIC, colour=settlement_dist)) + geom_boxplot() + ggtitle("Uganda") ggplot(r1_CIV %>% filter(modeltype == "gam"), aes(x=population_type, y=AIC, colour=population_dist)) + geom_boxplot() + ggtitle("Ivory Coast") ggplot(r1_UGA %>% filter(modeltype == "gam"), aes(x=population_type, y=AIC, colour=population_dist)) + geom_boxplot() + ggtitle("Uganda") # settlement more important than population... ggplot(r1_CIV %>% filter(modeltype == "gam"), aes(y=AIC, x=interaction(settlement_type, population_type), colour=interaction(settlement_type, population_type))) + geom_boxplot() ggplot(r1_CIV %>% filter(modeltype == "gam"), aes(x=settlement_type, y=AIC, colour=settlement_summary)) + geom_boxplot() ggplot(r1_CIV %>% filter(modeltype == "gam"), aes(x=population_type, y=AIC, colour=population_summary)) + geom_boxplot()