# Moth declines are most severe in broadleaf woodlands despite a net gain in habitat availability - Blumgart et al., 2021

# Script_5_species_specific_analysis

# March 2021

# This script models the interaction between each trait in each region, habitat and a habitat*region 
# interaction.

# Libraries and functions ####

library(dplyr)
library(ggplot2)
library(gridExtra)
library(lme4)
library(DHARMa)
library(emmeans)
library(mgcv)
library(poptrend)


# Ben Bolker's code to get 95% CIs from mixed effects models. It excludes the random component of the 
# uncertainty.

# Got from here: https://bbolker.github.io/mixedmodels-misc/ecostats_chap.html
  #   - March 2021

easyPredCI <- function(model,newdata=NULL,alpha=0.05) {
  ## baseline prediction, on the linear predictor (logit) scale:
  pred0 <- predict(model,re.form=NA,newdata=newdata)
  ## fixed-effects model matrix for new data
  X <- model.matrix(formula(model,fixed.only=TRUE)[-2],newdata)
  beta <- fixef(model) ## fixed-effects coefficients
  V <- vcov(model)     ## variance-covariance matrix of beta
  pred.se <- sqrt(diag(X %*% V %*% t(X))) ## std errors of predictions
  ## inverse-link function
  linkinv <- family(model)$linkinv
  ## construct 95% Normal CIs on the link scale and
  ##  transform back to the response (probability) scale:
  crit <- -qnorm(alpha/2)
  linkinv(cbind(conf.low=pred0-crit*pred.se,
                conf.high=pred0+crit*pred.se))
}


# The anaylis will go as follows:

# 1. Species-specific trends - no traits
#       a) Habitat
#       b) Region
#       c) Hab*Region

# 2. Feeding_guild
#       a) Feeding_guild 
#       b) Feeding_guild*Habitat
#       c) Feeding_guild*Region
#       d) Feeding_guild*Region*Habitat

# 3. Host_plant_light (Ellenberg value for light)
#       a) Host_plant_light 
#       b) Host_plant_light*Habitat
#       c) Host_plant_light*Region
#       d) Host_plant_light*Region*Habitat

# Due to sparsity of sites, Region*Habitat will only include broadleaf woodland. So this is 
# essentially testing whether there is an interaction between traits and region within broadleaf woodland.

# # # 

# Data import ####

# For species-specific trends I need the moth trends modeled separately (this is in the same 
# df as for habitat-specific trends as Habitat == "Overall)
Habitat_species_trends <- read.csv("Data/Hab_trends.csv")

# For region specific trends I need each species trend modeled in each region
Region_species_trends <- read.csv("Data/Region_trends.csv")

# For region*habitat interaction I need each species trend modeled in each habitat (only Broadleaf
# woodland) in each region
Hab_region_species_trends <- read.csv("Data/Hab_region_trends.csv")

# Note: in the three data frames imported above, the outliers are already removed. The code
# showing how I dealt with the outliers is below. To generate the data complete with outliers
# you can run Script 4 to produce these three data frames. 

# I also need info on species traits
Traits <- read.csv("Data/Moth_traits.csv")

# Site_info
Site_info <- read.csv("Data/Site_info.csv")

# # # 

# Data prep ####

# Prepare Habitat_species_trends and Traits for merging
names(Habitat_species_trends)
names(Traits)

Habitat_species_trends$Common_name <- NULL
Habitat_species_trends$Binomial <- NULL

# Merge on traits
Habitat_species_trends_2 <- merge(Habitat_species_trends, Traits, by = "RIS_code", all.x = TRUE)

#  Prepare Region_species_trends and Traits for merging
names(Region_species_trends)
Region_species_trends$Common_name <- NULL
Region_species_trends$Binomial <- NULL
Region_species_trends$X <- NULL

# Merge on traits
Region_species_trends_2 <- merge(Region_species_trends, Traits, by = "RIS_code", all.x = TRUE)

#  Prepare Hab_region_species_trends and Traits for merging
names(Hab_region_species_trends)
Hab_region_species_trends$Common_name <- NULL
Hab_region_species_trends$Binomial <- NULL
Hab_region_species_trends$X <- NULL

# Merge on traits
Hab_region_species_trends_2 <- merge(Hab_region_species_trends, Traits, by = "RIS_code", all.x = TRUE)

# Swap out -100% trends for -99.99% in all three so that they can be transformed
Habitat_species_trends_2$Poptrend_perc_change[Habitat_species_trends_2$Poptrend_perc_change == -100] <- -99.99
Region_species_trends_2$Poptrend_perc_change[Region_species_trends_2$Poptrend_perc_change == -100] <- -99.99
Hab_region_species_trends_2$Poptrend_perc_change[Hab_region_species_trends_2$Poptrend_perc_change == -100] <- -99.99


# Cut out all species*habitat, species*region or species*Hab_region combination with a total sample 
# size of less than 200. This is an arbitrary cutoff point, but those with very small sample sizes have 
# very unreliable trends. 

# Habitat
Habitat_species_trends_2 <- 
  Habitat_species_trends_2 %>%
  dplyr::filter(Sample_size >= 200)

# Region
Region_species_trends_2 <- 
  Region_species_trends_2 %>%
  dplyr::filter(Sample_size >= 200)

# Hab_region
Hab_region_species_trends_2 <- 
  Hab_region_species_trends_2 %>%
  dplyr::filter(Sample_size >= 200)

# Solving outliers ####

# Some of the species models resulted in very extreme percentage changes that have an overly strong
# influence on the subsequent models. These come about when a species is not recorded at all
# in the first part of the time series and they increase later in the time series. As an increase
# from zero to any positive number equates to an infinite percentage growth, the resulting percentage
# change estimated by the model gets silly. Where this has happened, I reran the model starting at half-way
# through the time-series - i.e., 1993. 

# **YOU cAN SKIP THIS BIT AND GO STARIGHT TO LINE 751 ("Re-reading in data") 
# THE DATAFRAMES INCLUDED ALREADY HAVE THE OUTLIERS DEALT WITH**

# Solving outliers for Habitat ####

# What outliers are there?
ggplot(Habitat_species_trends_2, aes(x = log(Poptrend_perc_change+100))) +
  geom_histogram(bins =200) +
  facet_wrap(~ Habitat) +
  geom_vline(xintercept = log(100), col = "red")

# There is one noticable outlier:
# 540 (Blair's Shoulder−knot) Arable

# I will take the mid-point of the time-series (1992) and rerun models for Blair's Shoulder-knot
# and substitute them back in. 


# Read in the raw data for Blair's Shoulder Knot
BSK <- readRDS("Data/Sindex_files/540.rds")

# Rename year
BSK <- 
  BSK %>%
  rename(Year = YEAR)

# Merge on site-info
BSK_plus_sites <- merge(BSK, Site_info, by = "SITE", all.x = TRUE)

# Get rid of Channel Isles sites first and cut out anything pre-1993
BSK_plus_sites_B <- 
  BSK_plus_sites %>%
  dplyr::filter(Region != "Channel Islands") %>%
  dplyr::filter(Year >=1993)

# Cut out any site-years with NM less than 0.5
BSK_plus_sites_C <-
  BSK_plus_sites_B %>%
  dplyr::filter(NM >= 0.5)

# Make an actual_years variable to merge on 
Actual_years_ran <-
  BSK_plus_sites_C %>%
  dplyr::group_by(SITE) %>%
  dplyr::summarise(Actual_years = length(unique(Year)))

# Merge it on
BSK_plus_sites_D <- merge(BSK_plus_sites_C, Actual_years_ran, by = "SITE")

# Cut out anything with Actual_years of less than 3
BSK_plus_sites_2 <- 
  BSK_plus_sites_D %>% 
  dplyr::filter(Actual_years >= 3)

# Make Site_name a factor
BSK_plus_sites_2$Site_name <- as.factor(BSK_plus_sites_2$Site_name)

# Get a vector of habitats 
Hab_vec <- unique(BSK_plus_sites_2$Habitat_500m)

# Run the model and impute
BSK_all_sites <- ptrend(My_raw_sindex ~ trend(Year, tempRE = TRUE, type = "smooth", k = 16) + s(Site_name, bs = "re"),
                                family = nb, data = BSK_plus_sites_2)

# Extract the percentage change and round it to 2 decimal places
BSK_all_sites_perc_cng <- as.numeric(change(BSK_all_sites, start = 1968, end = 2016, alpha = 0.05)[1])
BSK_all_sites_perc_cng <- round(BSK_all_sites_perc_cng, digits = 2) # -34.34. Curious

# Now impute this into main df
Habitat_species_trends_2$Poptrend_perc_change[Habitat_species_trends_2$RIS_code == "540" &
                                                         Habitat_species_trends_2$Habitat == "Overall"] <- 
  BSK_all_sites_perc_cng

# Add back in the new sample size
Habitat_species_trends_2$Sample_size[Habitat_species_trends_2$RIS_code == "540" 
                                     & Habitat_species_trends_2$Habitat == "Overall"] <- sum(BSK_plus_sites_2$Raw_total) 


# and do the same for the other habitats so it is consistent

for(i_hab in 1:length(Hab_vec)){
  
  # Only run the model if the trend in this habitat could previous be modeled. 
  # Else it stays as NA
  
  # Make an obect called Rerun which is the percentage change for this species in this habitat. Only rerun 
  # the model if rerun both exists and is not equal to NA.
  Rerun <- Habitat_species_trends_2$Poptrend_perc_change[Habitat_species_trends_2$RIS_code == "540" &
                                                           Habitat_species_trends_2$Habitat == Hab_vec[i_hab]]
  if(length(Rerun) != 0){
    if(!is.na(Rerun)){

  BSK_mod <- ptrend(My_raw_sindex ~ trend(Year, tempRE = TRUE, type = "smooth", k = 16) + s(Site_name, bs = "re"),
                            family = nb, data = subset(BSK_plus_sites_2, Habitat_500m == Hab_vec[i_hab]))
  
  # Extract the percentage change and round it to 2 decimal places
  BSK_perc_cng <- as.numeric(change(BSK_mod, start = 1968, end = 2016, alpha = 0.05)[1])
  BSK_perc_cng <- round(BSK_perc_cng, digits = 2)
  
  print(paste0("Perc change for ", Hab_vec[i_hab], " = ", BSK_perc_cng))
  
  # Add back in the new sample size
  Habitat_species_trends_2$Sample_size[Habitat_species_trends_2$RIS_code == "540" 
                                       & Habitat_species_trends_2$Habitat == Hab_vec[i_hab]] <- sum(subset(BSK_plus_sites_2, Habitat_500m == Hab_vec[i_hab])$Raw_total) 
  
  # Now impute this into main df
  Habitat_species_trends_2$Poptrend_perc_change[Habitat_species_trends_2$RIS_code == "540" &
                                                           Habitat_species_trends_2$Habitat == Hab_vec[i_hab]] <- 
    BSK_perc_cng
    }
  }
}


# # # 

# Solving outliers for Region ####

# What outliers are there?
ggplot(Region_species_trends_2, aes(x = log(Poptrend_perc_change+100))) +
  geom_histogram(bins =200) +
  facet_wrap(~ N_S) +
  geom_vline(xintercept = log(100), col = "red")

# Two outliers 
# 178 (Buff Footman) North
# 940 (Satin Beauty) North

# Rerun the model for these two
# Buff_footman
B_foot <- readRDS("Data/Sindex_files/178.rds")

# Rename year
B_foot <- 
  B_foot %>%
  rename(Year = YEAR)

# Merge on site-info
B_foot_plus_sites <- merge(B_foot, Site_info, by = "SITE", all.x = TRUE)

# Get rid of Channel Isles sites first and cut out anything pre-1993
B_foot_plus_sites_B <- 
  B_foot_plus_sites %>%
  dplyr::filter(Region != "Channel Islands") %>%
  dplyr::filter(Year >=1993)

# Cut out any site-years with NM less than 0.5
B_foot_plus_sites_C <-
  B_foot_plus_sites_B %>%
  dplyr::filter(NM >= 0.5)

# Make an actual_years variable to merge on 
Actual_years_ran <-
  B_foot_plus_sites_C %>%
  dplyr::group_by(SITE) %>%
  dplyr::summarise(Actual_years = length(unique(Year)))

# Merge it on
B_foot_plus_sites_D <- merge(BSK_plus_sites_C, Actual_years_ran, by = "SITE")

# Cut out anything with Actual_years of less than 3
B_foot_plus_sites_2 <- 
  B_foot_plus_sites_D %>% 
  dplyr::filter(Actual_years >= 3)

# Make Site_name a factor
B_foot_plus_sites_2$Site_name <- as.factor(B_foot_plus_sites_2$Site_name)

# Get a vector of N_S 
Reg_vec <- as.vector(unique(B_foot_plus_sites_2$N_S))


for(i_reg in 1:length(Reg_vec)){
  # Run the model
  B_foot_mod <- ptrend(My_raw_sindex ~ trend(Year, tempRE = TRUE, type = "smooth", k = 16) + s(Site_name, bs = "re"),
                    family = nb, data = subset(B_foot_plus_sites_2, N_S == Reg_vec[i_reg]))
  
  # Extract the percentage change and round it to 2 decimal places
  B_foot_perc_cng <- as.numeric(change(B_foot_mod, start = 1968, end = 2016, alpha = 0.05)[1])
  B_foot_perc_cng <- round(B_foot_perc_cng, digits = 2)
  
  print(paste0("Perc change for ", Reg_vec[i_reg], " = ", B_foot_perc_cng))
  
  # Now impute this into main df
  Region_species_trends_2$Poptrend_perc_change[Region_species_trends_2$RIS_code == "178" &
                                                          Region_species_trends_2$N_S == Reg_vec[i_reg]] <- 
    B_foot_perc_cng
  
  # Add back in the new sample size
  Region_species_trends_2$Sample_size[Region_species_trends_2$RIS_code == "178" 
                                       & Region_species_trends_2$N_S == Reg_vec[i_reg]] <- sum(subset(B_foot_plus_sites_2, N_S == Reg_vec[i_reg])$Raw_total) 
  
  
} 


# Do the same for Satin Beauty (no. 940)
S_beaut <- readRDS("Data/Sindex_files/940.rds")

# Rename year
S_beaut <- 
  S_beaut %>%
  rename(Year = YEAR)

# Merge on site-info
S_beaut_plus_sites <- merge(S_beaut, Site_info, by = "SITE", all.x = TRUE)

# Get rid of Channel Isles sites first and cut out anything pre-1993
S_beaut_plus_sites_B <- 
  S_beaut_plus_sites %>%
  dplyr::filter(Region != "Channel Islands") %>%
  dplyr::filter(Year >=1993)

# Cut out any site-years with NM less than 0.5
S_beaut_plus_sites_C <-
  S_beaut_plus_sites_B %>%
  dplyr::filter(NM >= 0.5)

# Make an actual_years variable to merge on 
Actual_years_ran <-
  S_beaut_plus_sites_C %>%
  dplyr::group_by(SITE) %>%
  dplyr::summarise(Actual_years = length(unique(Year)))

# Merge it on
S_beaut_plus_sites_D <- merge(BSK_plus_sites_C, Actual_years_ran, by = "SITE")

# Cut out anything with Actual_years of less than 3
S_beaut_plus_sites_2 <- 
  S_beaut_plus_sites_D %>% 
  dplyr::filter(Actual_years >= 3)

# Make Site_name a factor
S_beaut_plus_sites_2$Site_name <- as.factor(S_beaut_plus_sites_2$Site_name)

# Get a vector of N_S 
Reg_vec <- as.vector(unique(S_beaut_plus_sites_2$N_S))


for(i_reg in 1:length(Reg_vec)){
  # Run the model
  S_beaut_mod <- ptrend(My_raw_sindex ~ trend(Year, tempRE = TRUE, type = "smooth", k = 16) + s(Site_name, bs = "re"),
                       family = nb, data = subset(S_beaut_plus_sites_2, N_S == Reg_vec[i_reg]))
  
  # Extract the percentage change and round it to 2 decimal places
  S_beaut_perc_cng <- as.numeric(change(S_beaut_mod, start = 1968, end = 2016, alpha = 0.05)[1])
  S_beaut_perc_cng <- round(S_beaut_perc_cng, digits = 2)
  
  print(paste0("Perc change for ", Reg_vec[i_reg], " = ", S_beaut_perc_cng))
  
  # Now impute this into main df
  Region_species_trends_2$Poptrend_perc_change[Region_species_trends_2$RIS_code == "940" &
                                                 Region_species_trends_2$N_S == Reg_vec[i_reg]] <- 
    S_beaut_perc_cng
  
  # Add back in the new sample size
  Region_species_trends_2$Sample_size[Region_species_trends_2$RIS_code == "940" 
                                      & Region_species_trends_2$N_S == Reg_vec[i_reg]] <- sum(subset(S_beaut_plus_sites_2, N_S == Reg_vec[i_reg])$Raw_total) 
  
  
} 


# # # 

# Solving outliers for Hab*region ####
ggplot(Hab_region_species_trends_2, aes(x = log(Poptrend_perc_change+100))) +
  geom_histogram(bins =200) +
  facet_wrap(~ Hab_region) +
  geom_vline(xintercept = log(100), col = "red")

# Another three outliers
# 178 (Buff Footman) North_Deciduous_woodland
# 455 (Slender Brindle) North_Deciduous_woodland
# 940 (Satin Beauty) North_Deciduous_woodland


# 178 (Buff Footman)
B_foot <- readRDS("Data/Sindex_files/178.rds")

# Rename year
B_foot <- 
  B_foot %>%
  rename(Year = YEAR)

# Merge on site-info
B_foot_plus_sites <- merge(B_foot, Site_info, by = "SITE", all.x = TRUE)

# Get rid of Channel Isles sites first and cut out anything pre-1993
B_foot_plus_sites_B <- 
  B_foot_plus_sites %>%
  dplyr::filter(Region != "Channel Islands") %>%
  dplyr::filter(Year >=1993)

# Cut out any site-years with NM less than 0.5
B_foot_plus_sites_C <-
  B_foot_plus_sites_B %>%
  dplyr::filter(NM >= 0.5)

# Make an actual_years variable to merge on 
Actual_years_ran <-
  B_foot_plus_sites_C %>%
  dplyr::group_by(SITE) %>%
  dplyr::summarise(Actual_years = length(unique(Year)))

# Merge it on
B_foot_plus_sites_D <- merge(BSK_plus_sites_C, Actual_years_ran, by = "SITE")

# Cut out anything with Actual_years of less than 3
B_foot_plus_sites_2 <- 
  B_foot_plus_sites_D %>% 
  dplyr::filter(Actual_years >= 3)


# Make Site_name a factor
B_foot_plus_sites_2$Site_name <- as.factor(B_foot_plus_sites_2$Site_name)

# Actualy make a Hab_region variable
B_foot_plus_sites_2$Hab_region <- paste0(B_foot_plus_sites_2$N_S,"_",B_foot_plus_sites_2$Habitat_500m)

# And cut out only the two I need
B_foot_plus_sites_2 <-
  B_foot_plus_sites_2 %>%
  dplyr::filter(Hab_region == "North_Deciduous_woodland" |
                  Hab_region == "South_Deciduous_woodland")

# Get a vector of Hab_region 
Habreg_vec <- as.vector(unique(B_foot_plus_sites_2$Hab_region))


for(i_habreg in 1:length(Habreg_vec)){
  
  # Run the model
  B_foot_mod <- ptrend(My_raw_sindex ~ trend(Year, tempRE = TRUE, type = "smooth", k = 16) + s(Site_name, bs = "re"),
                        family = nb, data = subset(B_foot_plus_sites_2, Hab_region == Habreg_vec[i_habreg]))
  
  # Extract the percentage change and round it to 2 decimal places
  B_foot_perc_cng <- as.numeric(change(B_foot_mod, start = 1968, end = 2016, alpha = 0.05)[1])
  B_foot_perc_cng <- round(B_foot_perc_cng, digits = 2)
  
  print(paste0("Perc change for ", Habreg_vec[i_habreg], " = ", B_foot_perc_cng))
  
  # Now impute this into main df
  Hab_region_species_trends_2$Poptrend_perc_change[Hab_region_species_trends_2$RIS_code == "178" &
                                                          Hab_region_species_trends_2$Hab_region == Habreg_vec[i_habreg]] <- 
    B_foot_perc_cng
  
  # Add back in the new sample size
  Hab_region_species_trends_2$Sample_size[Hab_region_species_trends_2$RIS_code == "178" 
                                      & Hab_region_species_trends_2$Hab_region == Habreg_vec[i_habreg]] <- 
    sum(subset(B_foot_plus_sites_2, Hab_region == Habreg_vec[i_habreg])$Raw_total) 
  
 } 

# Buff footman could not be modeled in the north. Remove this one
Hab_region_species_trends_2 <- 
  Hab_region_species_trends_2 %>%
  dplyr::filter(!(RIS_code == "178" & Hab_region == "North_Deciduous_woodland"))

# # # 

# Same  for no. 455 (Slender Brindle) in North_Deciduous_woodland

S_brin <- readRDS("Data/Sindex_files/455.rds")

# Rename year
S_brin <- 
  S_brin %>%
  rename(Year = YEAR)

# Merge on site-info
S_brin_plus_sites <- merge(S_brin, Site_info, by = "SITE", all.x = TRUE)

# Get rid of Channel Isles sites first and cut out anything pre-1993
S_brin_plus_sites_B <- 
  S_brin_plus_sites %>%
  dplyr::filter(Region != "Channel Islands") %>%
  dplyr::filter(Year >=1993)

# Cut out any site-years with NM less than 0.5
S_brin_plus_sites_C <-
  S_brin_plus_sites_B %>%
  dplyr::filter(NM >= 0.5)

# Make an actual_years variable to merge on 
Actual_years_ran <-
  S_brin_plus_sites_C %>%
  dplyr::group_by(SITE) %>%
  dplyr::summarise(Actual_years = length(unique(Year)))

# Merge it on
S_brin_plus_sites_D <- merge(BSK_plus_sites_C, Actual_years_ran, by = "SITE")

# Cut out anything with Actual_years of less than 3
S_brin_plus_sites_2 <- 
  S_brin_plus_sites_D %>% 
  dplyr::filter(Actual_years >= 3)


# Make Site_name a factor
S_brin_plus_sites_2$Site_name <- as.factor(S_brin_plus_sites_2$Site_name)

# Actualy make a Hab_region variable
S_brin_plus_sites_2$Hab_region <- paste0(S_brin_plus_sites_2$N_S,"_",S_brin_plus_sites_2$Habitat_500m)

# And cut out only the two I need
S_brin_plus_sites_2 <-
  S_brin_plus_sites_2 %>%
  dplyr::filter(Hab_region == "North_Deciduous_woodland" |
                  Hab_region == "South_Deciduous_woodland")

# Get a vector of Hab_region 
Habreg_vec <- as.vector(unique(S_brin_plus_sites_2$Hab_region))


for(i_habreg in 1:length(Habreg_vec)){
  
  # Run the model
  S_brin_mod <- ptrend(My_raw_sindex ~ trend(Year, tempRE = TRUE, type = "smooth", k = 16) + s(Site_name, bs = "re"),
                       family = nb, data = subset(S_brin_plus_sites_2, Hab_region == Habreg_vec[i_habreg]))
  
  # Extract the percentage change and round it to 2 decimal places
  S_brin_perc_cng <- as.numeric(change(S_brin_mod, start = 1968, end = 2016, alpha = 0.05)[1])
  S_brin_perc_cng <- round(S_brin_perc_cng, digits = 2)
  
  print(paste0("Perc change for ", Habreg_vec[i_habreg], " = ", S_brin_perc_cng))
  
  # Now impute this into main df
  Hab_region_species_trends_2$Poptrend_perc_change[Hab_region_species_trends_2$RIS_code == "455" &
                                                     Hab_region_species_trends_2$Hab_region == Habreg_vec[i_habreg]] <- 
    S_brin_perc_cng
  
  # Add back in the new sample size
  Hab_region_species_trends_2$Sample_size[Hab_region_species_trends_2$RIS_code == "455" 
                                          & Hab_region_species_trends_2$Hab_region == Habreg_vec[i_habreg]] <- 
    sum(subset(S_brin_plus_sites_2, Hab_region == Habreg_vec[i_habreg])$Raw_total) 
  
} 

# Slender Brindle could not be modeled in the north. Remove this one
Hab_region_species_trends_2 <- 
  Hab_region_species_trends_2 %>%
  dplyr::filter(!(RIS_code == "455" & Hab_region == "North_Deciduous_woodland"))

# # 

# Same for no. 940 (Satin Beauty) in North_Deciduous_woodland
S_beaut <- readRDS("Data/Sindex_files/940.rds")

# Rename year
S_beaut <- 
  S_beaut %>%
  rename(Year = YEAR)

# Merge on site-info
S_beaut_plus_sites <- merge(S_beaut, Site_info, by = "SITE", all.x = TRUE)

# Get rid of Channel Isles sites first and cut out anything pre-1993
S_beaut_plus_sites_B <- 
  S_beaut_plus_sites %>%
  dplyr::filter(Region != "Channel Islands") %>%
  dplyr::filter(Year >=1993)

# Cut out any site-years with NM less than 0.5
S_beaut_plus_sites_C <-
  S_beaut_plus_sites_B %>%
  dplyr::filter(NM >= 0.5)

# Make an actual_years variable to merge on 
Actual_years_ran <-
  S_beaut_plus_sites_C %>%
  dplyr::group_by(SITE) %>%
  dplyr::summarise(Actual_years = length(unique(Year)))

# Merge it on
S_beaut_plus_sites_D <- merge(BSK_plus_sites_C, Actual_years_ran, by = "SITE")

# Cut out anything with Actual_years of less than 3
S_beaut_plus_sites_2 <- 
  S_beaut_plus_sites_D %>% 
  dplyr::filter(Actual_years >= 3)


# Make Site_name a factor
S_beaut_plus_sites_2$Site_name <- as.factor(S_beaut_plus_sites_2$Site_name)

# Actualy make a Hab_region variable
S_beaut_plus_sites_2$Hab_region <- paste0(S_beaut_plus_sites_2$N_S,"_",S_beaut_plus_sites_2$Habitat_500m)

# And cut out only the two I need
S_beaut_plus_sites_2 <-
  S_beaut_plus_sites_2 %>%
  dplyr::filter(Hab_region == "North_Deciduous_woodland" |
                  Hab_region == "South_Deciduous_woodland")

# Get a vector of Hab_region 
Habreg_vec <- as.vector(unique(S_beaut_plus_sites_2$Hab_region))


for(i_habreg in 1:length(Habreg_vec)){
  
  # Run the model
  S_beaut_mod <- ptrend(My_raw_sindex ~ trend(Year, tempRE = TRUE, type = "smooth", k = 16) + s(Site_name, bs = "re"),
                       family = nb, data = subset(S_beaut_plus_sites_2, Hab_region == Habreg_vec[i_habreg]))
  
  # Extract the percentage change and round it to 2 decimal places
  S_beaut_perc_cng <- as.numeric(change(S_beaut_mod, start = 1968, end = 2016, alpha = 0.05)[1])
  S_beaut_perc_cng <- round(S_beaut_perc_cng, digits = 2)
  
  print(paste0("Perc change for ", Habreg_vec[i_habreg], " = ", S_beaut_perc_cng))
  
  # Now impute this into main df
  Hab_region_species_trends_2$Poptrend_perc_change[Hab_region_species_trends_2$RIS_code == "940" &
                                                     Hab_region_species_trends_2$Hab_region == Habreg_vec[i_habreg]] <- 
    S_beaut_perc_cng
  
  # Add back in the new sample size
  Hab_region_species_trends_2$Sample_size[Hab_region_species_trends_2$RIS_code == "940" 
                                          & Hab_region_species_trends_2$Hab_region == Habreg_vec[i_habreg]] <- 
    sum(subset(S_beaut_plus_sites_2, Hab_region == Habreg_vec[i_habreg])$Raw_total) 
  
}  

# Satin Beauty could not be modeled in the north either! Remove this one
Hab_region_species_trends_2 <- 
  Hab_region_species_trends_2 %>%
  dplyr::filter(!(RIS_code == "940" & Hab_region == "North_Deciduous_woodland"))

# # # 

# Just check the histograms again to make sure there are no leftover outliers

# Habitat
ggplot(Habitat_species_trends_2, aes(x = log(Poptrend_perc_change+100))) +
  geom_histogram(bins =100) +
  facet_wrap(~ Habitat) +
  geom_vline(xintercept = log(100), col = "red") 

# Region
ggplot(Region_species_trends_2, aes(x = log(Poptrend_perc_change+100))) +
  geom_histogram(bins =100) +
  facet_wrap(~ N_S) +
  geom_vline(xintercept = log(100), col = "red") 

# Hab_region
ggplot(Hab_region_species_trends_2, aes(x = log(Poptrend_perc_change+100))) +
  geom_histogram(bins =100) +
  facet_wrap(~ Hab_region) +
  geom_vline(xintercept = log(100), col = "red")


# Make a log(Sample_size) var that can be used as weights in the models
Habitat_species_trends_2$Log_sample_size = log(Habitat_species_trends_2$Sample_size)
Region_species_trends_2$Log_sample_size = log(Region_species_trends_2$Sample_size)
Hab_region_species_trends_2$Log_sample_size = log(Hab_region_species_trends_2$Sample_size)

# And make a transformed trend variable that will be the response
Habitat_species_trends_2$Trend_trans <- log(Habitat_species_trends_2$Poptrend_perc_change + 100)
Region_species_trends_2$Trend_trans <- log(Region_species_trends_2$Poptrend_perc_change + 100)
Hab_region_species_trends_2$Trend_trans <- log(Hab_region_species_trends_2$Poptrend_perc_change + 100)

# Make a new df for renaming and reordering
Habitat_species_trends_3 <- Habitat_species_trends_2
Region_species_trends_3 <- Region_species_trends_2
Hab_region_species_trends_3 <- Hab_region_species_trends_2

# Rename habitats so I don't need to faff around in ggplot
Habitat_species_trends_3$Habitat[Habitat_species_trends_3$Habitat == "Lowland_semi_natural"] <- "'Other semi-natural'"
Habitat_species_trends_3$Habitat[Habitat_species_trends_3$Habitat == "Improved_grassland"] <- "Improved grassland"
Habitat_species_trends_3$Habitat[Habitat_species_trends_3$Habitat == "Conifer_plantation"] <- "Conifer plantation"
Habitat_species_trends_3$Habitat[Habitat_species_trends_3$Habitat == "Deciduous_woodland"] <- "Broadleaf woodland"


# Rename Hab_regions so no faffing in ggplot
Hab_region_species_trends_3$Hab_region[Hab_region_species_trends_3$Hab_region == "North_Deciduous_woodland"] <- "Broadleaf woodland (North)"
Hab_region_species_trends_3$Hab_region[Hab_region_species_trends_3$Hab_region == "South_Deciduous_woodland"] <- "Broadleaf woodland (South)"

# Get the N_S and Habitat variables back
Hab_region_species_trends_3$Habitat[Hab_region_species_trends_3$Hab_region == "Broadleaf woodland (North)"] <- "Broadleaf woodland"
Hab_region_species_trends_3$Habitat[Hab_region_species_trends_3$Hab_region == "Broadleaf woodland (South)"] <- "Broadleaf woodland"
Hab_region_species_trends_3$N_S[Hab_region_species_trends_3$Hab_region == "Broadleaf woodland (North)"] <- "North"
Hab_region_species_trends_3$N_S[Hab_region_species_trends_3$Hab_region == "Broadleaf woodland (South)"] <- "South"


# Write these dfs to a csv 

# Note you will need to change the address and un-hash the code

##write.csv(Habitat_species_trends_3, "Data/Hab_trends_ready.csv", row.names = FALSE)
##write.csv(Region_species_trends_3, "Data/Region_trends_ready.csv", row.names = FALSE)
##write.csv(Hab_region_species_trends_3, "Data/Hab_region_trends_ready.csv", row.names = FALSE)


# ~~~Re-reading in data ####

# Only necessary to reread in data if you re-generated the data anew using Script 4

Habitat_species_trends_3 <- read.csv("Data/Hab_trends_ready.csv")
Region_species_trends_3 <- read.csv("Data/Region_trends_ready.csv")
Hab_region_species_trends_3 <- read.csv("Data/Hab_region_trends_ready.csv")

# How many moths used overall?
sum(Habitat_species_trends_3$Sample_size[Habitat_species_trends_3$Habitat == "Overall"]) # 10,963,959 moths

# How many species?
length(unique(Habitat_species_trends_3$Common_name)) # 427


# Reorder habitat levels so they come out in right order
levels(as.factor(Habitat_species_trends_3$Habitat))
Habitat_species_trends_3$Habitat <- factor(Habitat_species_trends_3$Habitat, 
                                           levels = c("Overall", "Urban", "Upland", "'Other semi-natural'", 
                                                      "Improved grassland", "Conifer plantation",
                                                      "Broadleaf woodland", "Arable"))

# Reorder region
levels(as.factor(Region_species_trends_3$N_S))
Region_species_trends_3$N_S <- factor(Region_species_trends_3$N_S, levels = c("South", "North"))

# Relevel Hab_region
levels(as.factor(Hab_region_species_trends_3$Hab_region))
Hab_region_species_trends_3$Hab_region <- 
  factor(Hab_region_species_trends_3$Hab_region, levels = c("Broadleaf woodland (South)",
                                                            "Broadleaf woodland (North)"))


# # #

# ~~~ ####

# i. Species-specific traits - overall - to compare to all others ####

# Run a linear model with the transformed percentage change with an intercept only
Species_spec_only <- lm(Trend_trans ~ 1, 
                       weights = Log_sample_size, data = subset(Habitat_species_trends_3, 
                                                                Habitat == "Overall"))


# A simple t-test testing whether the mean trend differs from zero 
temp <- Habitat_species_trends_3 %>% dplyr::filter(Habitat == "Overall")
t.test(temp$Trend_trans)
#One Sample t-test
## data:  temp$Trend_trans
 #t = 58.604, df = 426, p-value < 2.2e-16
#alternative hypothesis: true mean is not equal to 0
# 95 percent confidence interval:
#  3.939682 4.213121


# Run the emmeans "comparisons"
Species_spec_only_emmeans <- emmeans(Species_spec_only, pairwise ~ NULL)


# Extract the dfs of contratsts and emmeans

# Emmeans
Species_spec_only_emmeans_df <- data.frame(Species_spec_only_emmeans$emmeans)

# Get estimated percentage change for each group
Species_spec_only_emmeans_df %>%
  dplyr::select(emmean) %>%
  dplyr::mutate(Perc_change = (exp(emmean)-100))

# the average moth has declined by 40%
# emmean Perc_change
# 1 4.089591   -40.28454

# How many species altogether?
temp <- Habitat_species_trends_3 %>% dplyr::filter(Habitat == "Overall")
length(unique(temp$Common_name)) # 427


# # # 

# Some summary stats ####

# How many species?
length(unique(Habitat_species_trends_3$Common_name)) # 427
# Check all the species look sensible
Species_list <- 
  Habitat_species_trends_3 %>%
  dplyr::select(RIS_code, Common_name, Binomial, Feeding_guild) %>%
  distinct()
# It's ok, everything dodgy like 'unidentified macros' etc has been removed before this stage


# ~~~ #### 

# 1. Species-specific trends - no traits ####

# 

#       ~~~a) Habitat ####

# Run model
Species_spec_1 <- lmer(Trend_trans ~ Habitat + (1|Common_name),
                       weights = Log_sample_size, 
                       data = subset(Habitat_species_trends_3, Habitat != "Overall"))

# check residuals
plot(Species_spec_1) # Pretty good

# Is the effect of habitat significant?
drop1(Species_spec_1, test = "Chisq")
#         npar    AIC    LRT   Pr(Chi)    
# <none>       6916.8                     
# Habitat    6 7027.2 122.37 < 2.2e-16 ***

# Yes, the effect of habitat is significant. Get the delta-AIC

7027.2 - 6916.8 # delta-AIC = 110.4

# Run the emmeans comparisons
Species_spec_emmeans <- emmeans(Species_spec_1, pairwise ~ Habitat)

# Extract the dfs of contratsts and emmeans
# Contrasts if needed
Species_spec_contrasts_df <- data.frame(Species_spec_emmeans$contrasts)

# Emmeans
Species_spec_emmeans_df <- data.frame(Species_spec_emmeans$emmeans)

# Get estimated percentage change for each group
Species_spec_emmeans_df %>%
  dplyr::select(Habitat, emmean) %>%
  dplyr::mutate(Perc_change = (exp(emmean)-100)) %>%
  dplyr::arrange(desc(Habitat))
#               Habitat   emmean Perc_change
#1               Arable 4.635456    3.074915
#2   Broadleaf woodland 3.737133  -58.022543
#3   Conifer plantation 4.141887  -37.078566
#4   Improved grassland 4.070710  -41.401463
#5 'Other semi-natural' 3.875914  -51.773230
#6               Upland 4.540672   -6.246184
#7                Urban 3.646115  -61.674526

# Get number of species modelled in each group
Habitat_species_trends_3 %>%
  group_by(Habitat) %>%
  summarise(Total_species = length(unique(Common_name))) %>%
  dplyr::arrange(desc(Habitat))
#Habitat              Total_species
#<chr>                        <int>
#1 Arable                         297
#2 Broadleaf woodland             365
#3 Conifer plantation             211
#4 Improved grassland             380
#5 'Other semi-natural'           202
#6 Upland                         109
#7 Urban                          285
#8 Overall                        427

# How many species altogether?
temp <- Habitat_species_trends_3 %>% dplyr::filter(Habitat != "Overall")
length(unique(temp$Common_name)) # 405

# Relevel for plotting
Species_spec_emmeans_df$Habitat <- factor(Species_spec_emmeans_df$Habitat, 
                                           levels = c("Urban", "Upland", "'Other semi-natural'", 
                                                      "Improved grassland", "Conifer plantation",
                                                      "Broadleaf woodland", "Arable"))

# Plot (Fig. S8b)
ggplot(data = Species_spec_emmeans_df, aes(x = Habitat, y = emmean)) +
  geom_point() + #size = 2
  geom_errorbar(aes(ymax = upper.CL, ymin = lower.CL), width = 0.1) + #, size = 1
  geom_hline(yintercept = log(100), linetype = 2) + #, colour = "red"
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.line = element_line(colour = "black")) +
  ylab("Percentage change 1968 - 2016") +
  ggtitle("Single-species trends") +
  xlab("Habitat type") +
  scale_y_continuous(breaks = c(log(-99+100), log(-75+100), log(-50+100), log(-25+100), 
                              log(33+100), log(100+100), log(300+100), log(9000+100)), 
                     labels = c(" -99%", "- 75%", " -50%", " -25%", " 33%", " 100%", " 300%", " 9000%"), 
                     limits = c(3.2, 6)) +
  theme(axis.text.x = element_text(angle = 35, hjust = 1))

# Saved as 5 x 6 inch pdf 

# # # 

# ~~~b) Region ####

# Run model
Species_region_1 <- lmer(Trend_trans ~ N_S + (1|Common_name), 
                       weights = Log_sample_size, data = Region_species_trends_3)

# Check residuals
plot(Species_region_1) # A positive trend in the residuals. I've tried everything and this is the best I 
# can do. This seems to be because the distribution of the data is almost a normal distribution but the 
# peak is too steep. Interpret p-values with caution, especially when they are near to the 0.05 cut-off
# for significance.

# Is the effect of region significant?
drop1(Species_region_1, test = "Chisq")
#         npar    AIC    LRT   Pr(Chi)    
#  <none>      2657.9                     
#  N_S       1 2688.4 32.502 1.191e-08 ***
#
2688.4-2657.9 # delta-AIC = 30.5

# Run the emmeans comparisons
Species_spec_reg_emmeans <- emmeans(Species_region_1, pairwise ~ N_S)

# Extract the dfs of contratsts and emmeans
# Contrasts if needed
Species_spec_reg_contrasts_df <- data.frame(Species_spec_reg_emmeans$contrasts)

# Emmeans
Species_spec_reg_emmeans_df <- data.frame(Species_spec_reg_emmeans$emmeans)

# Get estimated percentage change for each group
Species_spec_reg_emmeans_df %>%
  dplyr::select(N_S, emmean) %>%
  dplyr::mutate(Perc_change = (exp(emmean)-100))
#     N_S   emmean Perc_change
#1 South 3.914436   -49.87919
#2 North 4.448264   -14.52159

# Get number of species modelled in each group
Region_species_trends_3 %>%
  group_by(N_S) %>%
  summarise(Total_species = length(unique(Common_name)))
#   N_S   Total_species
#1 South           417
#2 North           309

# How many species altogether?
length(unique(Region_species_trends_3$Common_name)) # 426

# Relevel for plotting
Species_spec_reg_emmeans_df$N_S <- factor(Species_spec_reg_emmeans_df$N_S, levels = c("South", "North"))

# Plot (Fig. S8a)
ggplot(data = Species_spec_reg_emmeans_df, aes(x = N_S, y = emmean)) +
  geom_point() + #size = 2
  geom_errorbar(aes(ymax = upper.CL, ymin = lower.CL), width = 0.1) + #, size = 1
  geom_hline(yintercept = log(100), linetype = 2) + #, colour = "red"
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.line = element_line(colour = "black")) +
  ylab("Percentage change 1968 - 2016") +
  ggtitle("Single-species trends") +
  xlab("Region") +
  scale_y_continuous(breaks = c(log(-99+100), log(-75+100), log(-50+100), log(-25+100), 
                                log(33+100), log(100+100), log(300+100), log(9000+100)), 
                     labels = c(" -99%", "- 75%", " -50%", " -25%", " 33%", " 100%", " 300%", " 9000%"), 
                     limits = c(3.7, 5.3)) +
  theme(axis.text.x = element_text(angle = 35, hjust = 1))

# Saved as 5 x 5 inch pdf

# # # 

# ~~~c) Hab*Region ####

# Run the model
Species_hab_region_1 <- lmer(Trend_trans ~ N_S + (1|Common_name), 
                         weights = Log_sample_size, data = Hab_region_species_trends_3)

# Check residuals
plot(Species_hab_region_1) # Not great. See line 944. I don't know why some of the residuals are 
# correlated so neatly with the fitted values

# Is the effect of region significant for woodlands?
drop1(Species_hab_region_1, test = "Chisq")
 #npar    AIC    LRT   Pr(Chi)    
# <none>      2201.6                     
# N_S       1 2237.7 38.091 6.752e-10 ***

# There is a significant difference in mean abundance trends between north and south in broadleaf woodland

##
2237.7 - 2201.6 # delta-AIC = 36.1

# Run the emmeans comparisons
Species_spec_hab_reg_emmeans <- emmeans(Species_hab_region_1, pairwise ~ N_S)

# Extract the dfs of contratsts and emmeans
# Contrasts if needed
Species_spec_hab_reg_contrasts_df <- data.frame(Species_spec_hab_reg_emmeans$contrasts)

# Emmeans
Species_spec_hab_reg_emmeans_df <- data.frame(Species_spec_hab_reg_emmeans$emmeans)

# Get estimated percentage change for each group
Species_spec_hab_reg_emmeans_df %>%
  dplyr::select(N_S, emmean) %>%
  dplyr::mutate(Perc_change = (exp(emmean)-100))
#     N_S   emmean Perc_change
# 1 North 4.295322   -26.64416
# 2 South 3.445955   -68.62676

# Get number of species modelled in each group
Hab_region_species_trends_3 %>%
  group_by(N_S) %>%
  summarise(Total_species = length(unique(Common_name)))
#   N_S       Total_species
#1 North           227
#2 South           335

# How many species altogether?
length(unique(Region_species_trends_3$Common_name)) # 426

# Reorder North and South
levels(as.factor(Species_spec_hab_reg_emmeans_df$N_S))

Species_spec_hab_reg_emmeans_df$N_S <- factor(Species_spec_hab_reg_emmeans_df$N_S, 
                                              levels = c("South", "North"))


# Plot (Fig. S8c)
ggplot(data = Species_spec_hab_reg_emmeans_df, aes(x = N_S, y = emmean)) +
  geom_point() + #size = 2
  geom_errorbar(aes(ymax = upper.CL, ymin = lower.CL), width = 0.1) + #, size = 1
  geom_hline(yintercept = log(100), linetype = 2) + #, colour = "red"
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.line = element_line(colour = "black")) +
  ylab("lnPercentage change 1968 - 2016") +
  ggtitle("Single-species trends in Broadleaf woodland") +
  xlab("Region (broadleaf woodland only)") +
  scale_y_continuous(breaks = c(log(-70+100), log(-50+100), log(-25+100), 
                                log(33+100), log(100+100)), 
                     labels = c("-70%", " -50%", " -25%", " 33%", " 100%"), 
                     limits = c(3.2, 5.3)) +
  theme(axis.text.x = element_text(angle = 35, hjust = 1))

# Saved as 5 x 5 inch pdf

# ~~~ ####

# 2. Feeding_guild ####

# Data prep

# Get rid of the unused feeding guild categories (these have too few species in them)
Habitat_species_trends_x_guild <- 
  Habitat_species_trends_3 %>%
  dplyr::filter(Feeding_guild != "Other" &  Feeding_guild != "Plants_low" & 
                  Feeding_guild != "Unknown" & Feeding_guild != "Woody" )

levels(as.factor(Habitat_species_trends_x_guild$Feeding_guild))

# Rename the feeding guilds to their final names
Habitat_species_trends_x_guild$Feeding_guild[Habitat_species_trends_x_guild$Feeding_guild == "Coniferous"] <- "Conifers"
Habitat_species_trends_x_guild$Feeding_guild[Habitat_species_trends_x_guild$Feeding_guild == "Deciduous"] <- "Broadleaf polyphagous"
Habitat_species_trends_x_guild$Feeding_guild[Habitat_species_trends_x_guild$Feeding_guild == "Shrubs_decid"] <- "Broadleaf shrubs"
Habitat_species_trends_x_guild$Feeding_guild[Habitat_species_trends_x_guild$Feeding_guild == "Trees_decid"] <- "Broadleaf trees"
Habitat_species_trends_x_guild$Feeding_guild[Habitat_species_trends_x_guild$Feeding_guild == "Highly_polyphagous"] <- "Highly polyphagous"


# Reorder them in the order I'd like them to appear in the plots
Habitat_species_trends_x_guild$Feeding_guild <- factor(Habitat_species_trends_x_guild$Feeding_guild,
                                                               levels = c("Forbs","Grasses","Broadleaf shrubs",
                                                                          "Broadleaf trees","Broadleaf polyphagous",
                                                                          "Conifers","Lichen","Highly polyphagous"))


# ~~~a) Feeding_guild ####

# Run the model
Feeding_guild_mod <- lm(Trend_trans ~ Feeding_guild,
                       weights = Log_sample_size, 
                       data = subset(Habitat_species_trends_x_guild, Habitat == "Overall"))

# Check residuals
plot(Feeding_guild_mod) # Pretty good

# Is the effect of feeding guild significant?
drop1(Feeding_guild_mod, test = "Chisq")
#               Df Sum of Sq    RSS    AIC  Pr(>Chi)    
# <none>                     6840.0 1171.7              
# Feeding_guild  7    592.53 7432.5 1191.9 1.618e-05 ***
#
1191.9-1171.7 # delta-AIC = 20.2


# Run the emmeans comparisons
Feeding_guild_emmeans <- emmeans(Feeding_guild_mod, pairwise ~ Feeding_guild)

# Extract the dfs of contratsts and emmeans
# Contrasts if needed
Feeding_guild_contrasts_df <- data.frame(Feeding_guild_emmeans$contrasts)

# Emmeans
Feeding_guild_emmeans_df <- data.frame(Feeding_guild_emmeans$emmeans)

# Get estimated percentage change for each group
Feeding_guild_emmeans_df %>%
  dplyr::select(Feeding_guild, emmean) %>%
  dplyr::mutate(Perc_change = (exp(emmean)-100))
#          Feeding_guild   emmean Perc_change
#1                 Forbs 3.995358   -45.65470
#2               Grasses 4.136792   -37.39834
#3      Broadleaf shrubs 3.689909   -59.95881
#4       Broadleaf trees 4.156344   -36.16231
#5 Broadleaf polyphagous 4.058506   -42.11224
#6              Conifers 4.275910   -28.05441
#7                Lichen 6.055170   326.31121
#8    Highly polyphagous 3.885407   -51.31327

# Get number of species modelled in each group
Habitat_species_trends_x_guild %>%
  group_by(Feeding_guild) %>%
  summarise(Total_species = length(unique(Common_name)))
#   Feeding_guild    Total_species
#1 Forbs                           105
#2 Grasses                          42
#3 Broadleaf shrubs                 65
#4 Broadleaf trees                  61
#5 Broadleaf polyphagous            58
#6 Conifers                         15
#7 Lichen                           13
#8 Highly polyphagous               52

# How many species altogether?
length(unique(Habitat_species_trends_x_guild$Common_name)) # 411

# Flip order of Feeding guild
levels(as.factor(Feeding_guild_emmeans_df$Feeding_guild))
Feeding_guild_emmeans_df$Feeding_guild <- factor(Feeding_guild_emmeans_df$Feeding_guild, 
                                          levels = c("Highly polyphagous", "Lichen", "Conifers", "Broadleaf polyphagous",
                                                     "Broadleaf trees", "Broadleaf shrubs", "Grasses", "Forbs"))

# Plot (Fig. S9a)
ggplot(data = Feeding_guild_emmeans_df, aes(x = Feeding_guild, y = emmean)) +
  geom_point() + #size = 2
  geom_errorbar(aes(ymax = upper.CL, ymin = lower.CL), width = 0.1) + #, size = 1
  geom_hline(yintercept = log(100), linetype = 2) + #, colour = "red"
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.line = element_line(colour = "black")) +
  ylab("Percentage change 1968 - 2016") +
  xlab("Feeding guild") +
  scale_y_continuous(breaks = c(log(-75+100), log(-50+100), log(-25+100), 
                                log(25+100), log(100+100), log(300+100), log(900+100)), 
                     labels = c("-75%", "-50%", "-25%", "25%", "100%", "300%", "900%"), 
                     limits = c(3, 7)) +
  theme(axis.text.x = element_text(angle = 35, hjust = 1))

# Saved as 5 x 5 inch pdf


# # # 

# ~~~b) Feeding_guild*Habitat ####
 
# Run the model
Hab_guild_mod <- lmer(Trend_trans ~ Habitat*Feeding_guild + (1|Common_name),
                    weights = Log_sample_size, 
                    data = subset(Habitat_species_trends_x_guild, Habitat != "Overall"))

# Residual checks.
plot(Hab_guild_mod) # Good enough

# Is there a significant interaction between feeding guild and habitat?
drop1(Hab_guild_mod, test = "Chisq")
#                           npar    AIC    LRT   Pr(Chi)    
#<none>                     6581.6                     
#Habitat:Feeding_guild   42 6587.9 90.308 2.187e-05 ***
#
6587.9-6581.6 # delta-AIC = 6.3

# Run the emmeans comparisons
Guild_emmeans <- emmeans(Hab_guild_mod, pairwise ~ Habitat|Feeding_guild) # Note that running this 
# as Feeding_guild|Habitat gives exactly the same results in a different order

# Try a contrasts test for everything, subset by Habitat
joint_tests(Hab_guild_mod, by = "Habitat")

# There's no effect of feeding guild in Improved grassland

# Extract the dfs of contratsts and emmeans
# Contrasts
Guild_contrasts_df <- data.frame(Guild_emmeans$contrasts)

# Emmeans
Guild_emmeans_df <- data.frame(Guild_emmeans$emmeans)

# Get estimated percentage change for each group
Guild_emmeans_df %>%
  dplyr::select(Habitat, Feeding_guild, emmean) %>%
  dplyr::mutate(Perc_change = (exp(emmean)-100))
# First four rows:
#                Habitat         Feeding_guild   emmean Perc_change
#1                 Urban                 Forbs 3.502534  -66.800514
#2                Upland                 Forbs 4.477639  -11.973421
#3  'Other semi-natural'                 Forbs 3.317124  -72.419095
#4    Improved grassland                 Forbs 3.975250  -46.736557

# Flip the order of the Habitats so Arable comes first
Guild_emmeans_df$Habitat <- factor(Guild_emmeans_df$Habitat, 
                                           levels = c("Arable","Broadleaf woodland", 
                                                      "Conifer plantation", "Improved grassland",
                                                      "'Other semi-natural'", "Upland", 
                                                      "Urban"))

# Get the feeding guilds in the order in which I want them to appear
Guild_emmeans_df$Feeding_guild <- factor(Guild_emmeans_df$Feeding_guild, 
                                                      levels = c("Highly polyphagous", "Lichen", "Conifers", "Broadleaf polyphagous",
                                                                 "Broadleaf trees", "Broadleaf shrubs", "Grasses", "Forbs"))


# Run a model with no interaction effect to plot the results in the background of the main plot
Hab_guild_mod_2 <- lmer(Trend_trans ~ Habitat + Feeding_guild + (1|Common_name),
                        weights = Log_sample_size, 
                        data = subset(Habitat_species_trends_x_guild, Habitat != "Overall"))
# Run the emmeans comparisons
Guild_emmeans_2 <- emmeans(Hab_guild_mod_2, pairwise ~ Habitat|Feeding_guild)
# Emmeans
Guild_emmeans_df_2 <- data.frame(Guild_emmeans_2$emmeans)
# Flip the order of the variables to match
Guild_emmeans_df_2$Habitat <- factor(Guild_emmeans_df_2$Habitat, 
                                     levels = c("Arable","Broadleaf woodland", 
                                                "Conifer plantation", "Improved grassland",
                                                "'Other semi-natural'", "Upland", 
                                                "Urban"))
Guild_emmeans_df_2$Feeding_guild <- factor(Guild_emmeans_df_2$Feeding_guild, 
                                                levels = c("Highly polyphagous", "Lichen", "Conifers", "Broadleaf polyphagous",
                                                           "Broadleaf trees", "Broadleaf shrubs", "Grasses", "Forbs"))

# Plot (Fig. 4)
ggplot(data = Guild_emmeans_df_2, aes(x = Feeding_guild, y = emmean)) +
  geom_errorbar(aes(ymax = upper.CL, ymin = lower.CL), width = 0, color = "grey80", size = 3) +
  
  geom_point(data = Guild_emmeans_df, aes(x = Feeding_guild, y = emmean)) + 
  geom_errorbar(data = Guild_emmeans_df, aes(ymax = upper.CL, ymin = lower.CL), width = 0.1) +
  
  geom_hline(yintercept = log(100), linetype = 2) +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.line = element_line(colour = "black")) +
  xlab("Feeding guild") +
  ylab("Percentage change 1968 - 2016") +
  facet_wrap(~Habitat, nrow = 3) +
  scale_y_continuous(breaks = c(log(-90+100), log(-50+100), 
                                log(100+100), log(900+100), log(9900+100)), 
                     labels = c("- 90%", " -50%", " 100%", " 900%", " 9900%"),
                     limits = c(1.5, 12)) +
  theme(axis.text.x = element_text(angle = 35, hjust = 1))

# Saved as 8 x 8 inch pdf

# How many species in each subset?
Species_numbers <-
  Habitat_species_trends_x_guild %>%
  group_by(Feeding_guild, Habitat) %>%
  summarise(Total_species = length(unique(Common_name)))


# # #

# ~~~c) Feeding_guild*Region ####

# Data prep

# Get rid of the unused feeding guild categories (these have too few species in them)
Region_species_trends_x_guild <- 
  Region_species_trends_3 %>%
  dplyr::filter(Feeding_guild != "Other" &  Feeding_guild != "Plants_low" & 
                  Feeding_guild != "Unknown" & Feeding_guild != "Woody" )
levels(as.factor(Region_species_trends_x_guild$Feeding_guild))

# Rename them to their final names
Region_species_trends_x_guild$Feeding_guild[Region_species_trends_x_guild$Feeding_guild == "Coniferous"] <- "Conifers"
Region_species_trends_x_guild$Feeding_guild[Region_species_trends_x_guild$Feeding_guild == "Deciduous"] <- "Broadleaf polyphagous"
Region_species_trends_x_guild$Feeding_guild[Region_species_trends_x_guild$Feeding_guild == "Shrubs_decid"] <- "Broadleaf shrubs"
Region_species_trends_x_guild$Feeding_guild[Region_species_trends_x_guild$Feeding_guild == "Trees_decid"] <- "Broadleaf trees"
Region_species_trends_x_guild$Feeding_guild[Region_species_trends_x_guild$Feeding_guild == "Highly_polyphagous"] <- "Highly polyphagous"


# Reorder them in the order I'd like them to appear
Region_species_trends_x_guild$Feeding_guild <- factor(Region_species_trends_x_guild$Feeding_guild,
                                                            levels = c("Forbs","Grasses","Broadleaf shrubs",
                                                                       "Broadleaf trees","Broadleaf polyphagous",
                                                                       "Conifers","Lichen","Highly polyphagous"))
# Check
levels(as.factor(Region_species_trends_x_guild$Feeding_guild))

# Run the model
Region_guild_mod <- lmer(Trend_trans ~ N_S*Feeding_guild + (1|Common_name),
                    weights = Log_sample_size, 
                    data = subset(Region_species_trends_x_guild))

# Residual checks.
plot(Region_guild_mod) # Positive trend. See line 951

# Is there a significant interaction between feeding guild and region?
drop1(Region_guild_mod, test = "Chisq")
#                  npar    AIC    LRT Pr(Chi)  
#<none>                 2544.6                   
#N_S:Feeding_guild    7 2550.2 19.607 0.006484 **
#
2550.2-2544.6 # delta-AIC = 5.6


# Run the emmeans comparisons
Region_guild_emmeans <- emmeans(Region_guild_mod, pairwise ~ N_S|Feeding_guild)

# Try a contrasts test for everything, subset by N_S
joint_tests(Region_guild_mod, by = "N_S")

# Extract the dfs of contratsts and emmeans
# Contrasts
Region_guild_contrasts_df <- data.frame(Region_guild_emmeans$contrasts)

# Emmeans
Region_guild_emmeans_df <- data.frame(Region_guild_emmeans$emmeans)

# Get estimated percentage change for each group
Region_guild_emmeans_df %>%
  dplyr::select(N_S, Feeding_guild, emmean) %>%
  dplyr::mutate(Perc_change = (exp(emmean)-100))
# First four rows:
#     N_S    Feeding_guild   emmean Perc_change
#1  South                 Forbs 3.838718   -53.53411
#2  North                 Forbs 4.227462   -31.45698
#3  South               Grasses 4.234938   -30.94257
#4  North               Grasses 4.055832   -42.26680

# Change order of feeding guild for plotting
Region_guild_emmeans_df$Feeding_guild <- factor(Region_guild_emmeans_df$Feeding_guild, 
                                              levels = c("Highly polyphagous", "Lichen", "Conifers", "Broadleaf polyphagous",
                                                         "Broadleaf trees", "Broadleaf shrubs", "Grasses", "Forbs"))
# change order of Region
Region_guild_emmeans_df$N_S <- factor(Region_guild_emmeans_df$N_S, 
                                      levels = c("North", "South"))


# Run a model with no interaction effect to plot the results in the background of the main plot
Region_guild_mod_2 <- lmer(Trend_trans ~ N_S + Feeding_guild + (1|Common_name),
                         weights = Log_sample_size, 
                         data = subset(Region_species_trends_x_guild))
# Run the emmeans comparisons
Region_guild_emmeans_2 <- emmeans(Region_guild_mod_2, pairwise ~ N_S|Feeding_guild)
# Emmeans
Region_guild_emmeans_df_2 <- data.frame(Region_guild_emmeans_2$emmeans)
#
Region_guild_emmeans_df_2$Feeding_guild <- factor(Region_guild_emmeans_df_2$Feeding_guild, 
                                                levels = c("Highly polyphagous", "Lichen", "Conifers", "Broadleaf polyphagous",
                                                           "Broadleaf trees", "Broadleaf shrubs", "Grasses", "Forbs"))
# change order of Region
Region_guild_emmeans_df_2$N_S <- factor(Region_guild_emmeans_df_2$N_S, 
                                      levels = c("North", "South"))

# Plot (Fig. S10a)
ggplot(data = Region_guild_emmeans_df_2, aes(x = Feeding_guild, y = emmean)) +
  geom_errorbar(aes(ymax = upper.CL, ymin = lower.CL), width = 0, color = "grey80", size = 3) +
  
  geom_point(data = Region_guild_emmeans_df, aes(x = Feeding_guild, y = emmean)) + 
  geom_errorbar(data = Region_guild_emmeans_df, aes(ymax = upper.CL, ymin = lower.CL), width = 0.1) +
  
  geom_hline(yintercept = log(100), linetype = 2) +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.line = element_line(colour = "black")) +
  xlab("") +
  ylab("Percentage change 1968 - 2016") +
  facet_wrap(~N_S, nrow = 1) +
  scale_y_continuous(breaks = c(log(-85+100), log(-50+100), 
                                log(100+100), log(566+100)), 
                     labels = c("- 85%", " -50%", " 100%", " 566%"),
                     limits = c(2.7, 7.5)) +
  theme(axis.text.x = element_text(angle = 35, hjust = 1))
# Saved as 3 x 6 inch PDF

# # #

# ~~~d) Feeding_guild*Region*Habitat ####

# Data prep

# Get rid of the unused feeding guild categories (these have too few species in them)
Hab_region_species_trends_x_guild <- 
  Hab_region_species_trends_3 %>%
  dplyr::filter(Feeding_guild != "Other" &  Feeding_guild != "Plants_low" & 
                  Feeding_guild != "Unknown" & Feeding_guild != "Woody" )
# Check
levels(as.factor(Hab_region_species_trends_x_guild$Feeding_guild))

# Rename them to their final names
Hab_region_species_trends_x_guild$Feeding_guild[Hab_region_species_trends_x_guild$Feeding_guild == "Coniferous"] <- "Conifers"
Hab_region_species_trends_x_guild$Feeding_guild[Hab_region_species_trends_x_guild$Feeding_guild == "Deciduous"] <- "Broadleaf polyphagous"
Hab_region_species_trends_x_guild$Feeding_guild[Hab_region_species_trends_x_guild$Feeding_guild == "Shrubs_decid"] <- "Broadleaf shrubs"
Hab_region_species_trends_x_guild$Feeding_guild[Hab_region_species_trends_x_guild$Feeding_guild == "Trees_decid"] <- "Broadleaf trees"
Hab_region_species_trends_x_guild$Feeding_guild[Hab_region_species_trends_x_guild$Feeding_guild == "Highly_polyphagous"] <- "Highly polyphagous"


# Reorder them in the order I'd like them to appear
Hab_region_species_trends_x_guild$Feeding_guild <- factor(Hab_region_species_trends_x_guild$Feeding_guild,
                                                           levels = c("Forbs","Grasses","Broadleaf shrubs",
                                                                      "Broadleaf trees","Broadleaf polyphagous",
                                                                      "Conifers","Lichen","Highly polyphagous"))
levels(as.factor(Hab_region_species_trends_x_guild$Feeding_guild))

# Run the model
Hab_region_guild_mod <- lmer(Trend_trans ~ N_S*Feeding_guild + (1|Common_name),
                         weights = Log_sample_size, 
                         data = subset(Hab_region_species_trends_x_guild))

# Residual checks.
plot(Hab_region_guild_mod) # Not ideal. See line 982

# Is there a significant interaction between feeding guild and region within woodland?
drop1(Hab_region_guild_mod, test = "Chisq")
#                  npar    AIC    LRT   Pr(Chi)    
#<none>                 2086.1                     
#N_S:Feeding_guild    7 2110.3 38.265 2.698e-06 ***
#
2110.3 - 2086.1 # delta-AIC = 24.2

# Run the emmeans comparisons
Hab_region_guild_emmeans <- emmeans(Hab_region_guild_mod, pairwise ~ N_S|Feeding_guild)

# Extract the dfs of contratsts and emmeans
# Contrasts
Hab_region_guild_contrasts_df <- data.frame(Hab_region_guild_emmeans$contrasts)

# Emmeans
Hab_region_guild_emmeans_df <- data.frame(Hab_region_guild_emmeans$emmeans)

# Get estimated percentage change for each group
Hab_region_guild_emmeans_df %>%
  dplyr::select(N_S, Feeding_guild, emmean) %>%
  dplyr::mutate(Perc_change = (exp(emmean)-100))
# First four rows:
#      N_S    Feeding_guild   emmean Perc_change
#1  North                 Forbs 3.908007   -50.20041
#2  South                 Forbs 3.653447   -61.39247
#3  North               Grasses 3.472542   -67.78145
#4  South               Grasses 3.661471   -61.08147


# Run a model with no interaction effect to plot the results in the background of the main plot
Hab_region_guild_mod_2 <- lmer(Trend_trans ~ N_S + Feeding_guild + (1|Common_name),
                             weights = Log_sample_size, 
                             data = subset(Hab_region_species_trends_x_guild))
# Get the emmeans
Hab_region_guild_emmeans_2 <- emmeans(Hab_region_guild_mod_2, pairwise ~ N_S|Feeding_guild)

Hab_region_guild_emmeans_df_2 <- data.frame(Hab_region_guild_emmeans_2$emmeans)

# Reorder feeding guild
levels(as.factor(Hab_region_guild_emmeans_df_2$Feeding_guild))
Hab_region_guild_emmeans_df_2$Feeding_guild <- factor(Hab_region_guild_emmeans_df_2$Feeding_guild, 
                                                      levels = c("Highly polyphagous", "Lichen", "Conifers", "Broadleaf polyphagous",
                                                                 "Broadleaf trees", "Broadleaf shrubs", "Grasses", "Forbs"))

# Plot (Fig. S10b)
ggplot(data = Hab_region_guild_emmeans_df_2, aes(x = Feeding_guild, y = emmean)) +
  geom_errorbar(aes(ymax = upper.CL, ymin = lower.CL ), width = 0, color = "grey80", size = 3) +
  
  geom_point(data = Hab_region_guild_emmeans_df, aes(x = Feeding_guild, y = emmean)) + 
  geom_errorbar(data = Hab_region_guild_emmeans_df, aes(ymax = upper.CL, ymin = lower.CL ), width = 0.1) +
  
  geom_hline(yintercept = log(100), linetype = 2) +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.line = element_line(colour = "black")) +
  xlab("Feeding guild") +
  ylab("Percentage change 1968 - 2016") +
  facet_wrap(~ N_S, nrow = 1) +
  scale_y_continuous(breaks = c(log(-85+100), log(-50+100), 
                              log(100+100), log(566+100)), 
                   labels = c("- 85%", " -50%", " 100%", " 566%"),
                   limits = c(2, 8.2)) +
  theme(axis.text.x = element_text(angle = 35, hjust = 1)) +
  ggtitle("Broadleaf woodland sites only")

# Save as an 3 x 6 inch pdf 

# # #

# ~~~ ####

# 3. Host_plant_light ####

# Data prep 

# Cut all species that are too polyphagous (with more than three hostplants)
Habitat_species_trends_x_Ellenberg <- 
  Habitat_species_trends_3 %>%
  dplyr::filter(Use_Ellenbergs == "y")

# # #

# ~~~a) Host_plant_light  ####

# Run model
Light_mod <- lm(Trend_trans ~ Ellenberg_mean_Light,
                weights = Log_sample_size, 
                data = subset(Habitat_species_trends_x_Ellenberg, Habitat == "Overall"))

# Check residuals
plot(Light_mod) # Okish

# Is there a significant effect of light-affinity?
drop1(Light_mod, test = "F")
#                     Df Sum of Sq    RSS    AIC F value Pr(>F)
#<none>                            1559.8 287.33               
#Ellenberg_mean_Light  1     36.65 1596.5 287.77  2.4201 0.1229
#
287.77-287.33 # delta-AIC = 0.44

# Get model predictions
New_X <- expand.grid(Ellenberg_mean_Light = seq(from = min(Habitat_species_trends_x_Ellenberg$Ellenberg_mean_Light, na.rm = NA),
                                            to = max(Habitat_species_trends_x_Ellenberg$Ellenberg_mean_Light, na.rm = NA)), 
                     Log_sample_size = mean(Habitat_species_trends_x_Ellenberg$Log_sample_size))

# Predict means from data frame
New_Y <- predict(Light_mod, newdata = New_X, type = "response", re.form = NA, se = TRUE)

# Add together into a single data frame. 
New_df <- data.frame(New_X, New_Y)

# Plot (Fig. S9b)
ggplot(data = New_df, aes(x = Ellenberg_mean_Light, y = fit)) +
  geom_line() +
  geom_ribbon(aes(ymin = (fit - 1.96*se.fit), ymax = (fit + 1.96*se.fit)), alpha = 0.2) +
  theme_bw() +
  theme(panel.border = element_blank()) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.line = element_line(colour = "black")) +
  geom_point(data = subset(Habitat_species_trends_x_Ellenberg, Habitat == "Overall"), 
             aes(x = Ellenberg_mean_Light, y = Trend_trans, 
                 size = Sample_size), alpha = 0.5) +
  xlab("Mean Ellenberg light") +
  ylab("Percentage change 1968 - 2016") +
  geom_hline(yintercept = log(100), linetype = "dashed") +
  scale_y_continuous(breaks = c(log(-99.9+100), log(-95+100), log(-50+100), 
                                log(100+100), log(900+100)), 
                     labels = c("-99.9%", "-95%", "-50%", "100%", "900%"), 
                     limits = c(-4.65, 8)) 

# Saved as 4 x 5 inch pdf

# # #

# ~~~b) Host_plant_light*Habitat ####

# Run model
Hab_light_mod <- lmer(Trend_trans ~ Habitat*Ellenberg_mean_Light + (1|Common_name),
                      weights = Log_sample_size, 
                      data = subset(Habitat_species_trends_x_Ellenberg, Habitat != "Overall"))

# Check residuals
plot(Hab_light_mod) # Okish

# Is there a significant interaction between light-affinity and habitat?
drop1(Hab_light_mod, test = "Chisq")
#                             npar    AIC    LRT Pr(Chi)
#<none>                            1276.9               
#Habitat:Ellenberg_mean_Light    6 1272.0 7.1839  0.3042
#
1272.0-1276.9 # = -4.9


# Get model predictions
temp <- Habitat_species_trends_x_Ellenberg %>% dplyr::filter(Habitat != "Overall")
temp <- droplevels(temp)
New_X <- expand.grid(Ellenberg_mean_Light = seq(from = min(Habitat_species_trends_x_Ellenberg$Ellenberg_mean_Light, na.rm = TRUE),
                                            to = max(Habitat_species_trends_x_Ellenberg$Ellenberg_mean_Light, na.rm = TRUE)), 
                     Habitat = levels(as.factor(temp$Habitat)), 
                     Log_sample_size = mean(Habitat_species_trends_x_Ellenberg$Log_sample_size))

# Predict means from data frame
New_Y <- predict(Hab_light_mod, newdata = New_X, type = "response", re.form = NA)

# Add together into a single data frame. 
New_df <- data.frame(New_X, New_Y)

# Predict CIs from data frame
cpred1.CI <- easyPredCI(model = Hab_light_mod, newdata = New_X)

# Add it to the new data frame:
New_df <- data.frame(New_df, cpred1.CI)

# See which ones are sig
Emtrends_df <- emtrends(Hab_light_mod, pairwise ~ Habitat, var = "Ellenberg_mean_Light")
Emtrends_df_2 <- data.frame(Emtrends_df$emtrends)
Emtrends_df_2
#               Habitat Ellenberg_mean_Light.trend        SE       df   lower.CL    upper.CL
#1               Arable                 -0.2146090 0.2879065 1989.260 -0.7792389 0.35002090
#2   Broadleaf woodland                 -0.3081562 0.2219184 1214.389 -0.7435423 0.12722987
#3   Conifer plantation                 -0.6157636 0.3916241 4442.556 -1.3835420 0.15201476
#4   Improved grassland                 -0.4111878 0.2162384 1066.438 -0.8354889 0.01311333
#5 'Other semi-natural'                 -0.3710155 0.4010770 4699.516 -1.1573144 0.41528342
#6               Upland                  0.5147669 0.5960284 9734.781 -0.6535726 1.68310629
#7                Urban                  0.1369274 0.2917668 2088.639 -0.4352566 0.70911137

# Only Improved grassland are sig diff from zero

# Change order of habitats 
levels(New_df$Habitat)
New_df$Habitat <- factor(New_df$Habitat, levels = c("Arable","Broadleaf woodland", 
                                                    "Conifer plantation","Improved grassland",
                                                    "'Other semi-natural'","Upland",
                                                    "Urban"))

levels(as.factor(Habitat_species_trends_x_Ellenberg$Habitat)) 
Habitat_species_trends_x_Ellenberg$Habitat <- factor(Habitat_species_trends_x_Ellenberg$Habitat, 
                                                     levels = c("Arable","Broadleaf woodland", 
                                                                "Conifer plantation","Improved grassland",
                                                                "'Other semi-natural'","Upland",
                                                                "Urban", "Overall"))
# Plot (not in paper)
ggplot(data = New_df, aes(x = Ellenberg_mean_Light, y = New_Y)) +
  geom_line() +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) +
  ylab("ln(Trend + 100") +
  theme_bw() +
  theme(panel.border = element_blank()) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.line = element_line(colour = "black")) +
  geom_hline(yintercept = log(100), col = "red") +
  facet_wrap(~Habitat) +
  geom_point(data = subset(Habitat_species_trends_x_Ellenberg, 
                           Habitat != "Overall"), aes(x = Ellenberg_mean_Light, 
                                                      y = Trend_trans, 
                                                      size = Sample_size), alpha = 0.5) 

# # #

# ~~~c) Host_plant_light*Region ####

# Data prep

# Cut all species that are too polyphagous (more than three hostplants)
Region_species_trends_x_Ellenberg <- 
  Region_species_trends_3 %>%
  dplyr::filter(Use_Ellenbergs == "y")

# Run model
Region_light_mod <- lmer(Trend_trans ~ N_S*Ellenberg_mean_Light + (1|Common_name),
                         weights = Log_sample_size, 
                         data = Region_species_trends_x_Ellenberg)

# Check residuals
plot(Region_light_mod) # Okish

# Is there as ignificant interaction between light-affinity and region?
drop1(Region_light_mod, test = "Chisq")
#                         npar    AIC    LRT Pr(Chi)
#<none>                        592.09               
#N_S:Ellenberg_mean_Light    1 591.13 1.0367  0.3086
#
591.13-592.09 # = -0.96


# Get model predictions
New_X <- expand.grid(Ellenberg_mean_Light = seq(from = min(Region_species_trends_x_Ellenberg$Ellenberg_mean_Light, na.rm = TRUE),
                                                to = max(Region_species_trends_x_Ellenberg$Ellenberg_mean_Light, na.rm = TRUE)), 
                     N_S = levels(as.factor(Region_species_trends_x_Ellenberg$N_S)), 
                     Log_sample_size = mean(Region_species_trends_x_Ellenberg$Log_sample_size))

# Predict means from data frame
New_Y <- predict(Region_light_mod, newdata = New_X, type = "response", re.form = NA)

# Add together into a single data frame. 
New_df <- data.frame(New_X, New_Y)

# Predict CIs from data frame
cpred1.CI <- easyPredCI(model = Region_light_mod, newdata = New_X)

# Add it to the new data frame:
New_df <- data.frame(New_df, cpred1.CI)

# Plot (not in paper)
ggplot(data = New_df, aes(x = Ellenberg_mean_Light, y = New_Y)) +
  geom_line() +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) +
  ylab("ln(Trend + 100") +
  theme_bw() +
  theme(panel.border = element_blank()) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.line = element_line(colour = "black")) +
  geom_hline(yintercept = log(100), col = "red") +
  facet_wrap(~N_S) +
  geom_point(data = Region_species_trends_x_Ellenberg, aes(x = Ellenberg_mean_Light, 
                                                           y = Trend_trans, 
                                                           size = Sample_size), alpha = 0.5) 


# ~~~d) Host_plant_light*Region*Habitat ####

# Data prep 

# Cut all species that are too polyphagous (more than three hostplants)
Hab_region_species_trends_x_Ellenberg <- 
  Hab_region_species_trends_3 %>%
  dplyr::filter(Use_Ellenbergs == "y")

#  Run model
Hab_region_light_mod <- lmer(Trend_trans ~ N_S*Ellenberg_mean_Light + (1|Common_name),
                             weights = Log_sample_size, 
                             data = Hab_region_species_trends_x_Ellenberg)

# check residuals
plot(Hab_region_light_mod) # Not great

# Is there a significant interaction between light-affinity and region within woodlands?
drop1(Hab_region_light_mod, test = "Chisq")
#                           npar    AIC      LRT Pr(Chi)
#<none>                        453.07                 
#N_S:Ellenberg_mean_Light    1 451.12 0.048141  0.8263
#
451.12-453.07 # delta-AIC = 1.95

# Get model predictions
New_X <- expand.grid(Ellenberg_mean_Light = seq(from = min(Hab_region_species_trends_x_Ellenberg$Ellenberg_mean_Light, na.rm = TRUE),
                                                to = max(Hab_region_species_trends_3$Ellenberg_mean_Light, na.rm = TRUE)), 
                     N_S = levels(as.factor(Hab_region_species_trends_x_Ellenberg$N_S)), 
                     Habitat = levels(as.factor(Hab_region_species_trends_x_Ellenberg$Habitat)), 
                     Log_sample_size = mean(Hab_region_species_trends_x_Ellenberg$Log_sample_size))

# Predict means from data frame
New_Y <- predict(Hab_region_light_mod, newdata = New_X, type = "response", re.form = NA)

# Add together into a single data frame. 
New_df <- data.frame(New_X, New_Y)

# Predict CIs from data frame
cpred1.CI <- easyPredCI(model = Hab_region_light_mod, newdata = New_X)

# Add it to the new data frame:
New_df <- data.frame(New_df, cpred1.CI)

# Plot (not in paper)
ggplot(data = New_df, aes(x = Ellenberg_mean_Light, y = New_Y)) +
  geom_line() +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) +
  ylab("ln(Trend + 100") +
  theme_bw() +
  theme(panel.border = element_blank()) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.line = element_line(colour = "black")) +
  geom_hline(yintercept = log(100), col = "red") +
  facet_wrap(~N_S) +
  geom_point(data = Hab_region_species_trends_x_Ellenberg, aes(x = Ellenberg_mean_Light, y = Trend_trans, 
                                                               size = Sample_size), alpha = 0.5) 


# # # 

# ~~~ ####

# # #



# Bonus bit - hostplant*habitat ####

# Make a new hostplant variable. I will create new factor levels for this to include species such as
# thise that feed only on willos and poplars, etc. 
Habitat_species_trends_3$Hostplant <- NA

# In every case where Foodplant_2 equals 'NA', I will make Hostplant equal to Foodplant_1, 
# as these are either strictly monophagous or are already grouped at the genus level. 
Habitat_species_trends_3$Hostplant[is.na(Habitat_species_trends_3$Foodplant_2)] <- 
  Habitat_species_trends_3$Foodplant_1[is.na(Habitat_species_trends_3$Foodplant_2)] 

# Check it worked
##View(Habitat_species_trends_3[, c(35, 64:65, 101)]) # good

# Now make a special case for poplars/willow
Habitat_species_trends_3$Hostplant[Habitat_species_trends_3$Foodplant_1 == "Salix" &
                                     Habitat_species_trends_3$Foodplant_2 == "Populus"] <- "Salix/Populus"
# And the other option
Habitat_species_trends_3$Hostplant[Habitat_species_trends_3$Foodplant_1 == "Populus" &
                                     Habitat_species_trends_3$Foodplant_2 == "Salix"] <- "Salix/Populus"
# Willowherbs
Habitat_species_trends_3$Hostplant[Habitat_species_trends_3$Foodplant_1 == "Chamerion_angustifolium" &
                                     Habitat_species_trends_3$Foodplant_2 == "Epilobium_hirsutum"] <- "Willowherbs"

# Heathers
Habitat_species_trends_3$Hostplant[Habitat_species_trends_3$Foodplant_1 == "Calluna_vulgaris" &
                                     Habitat_species_trends_3$Foodplant_2 == "Erica"] <- "Heathers (calluna and Erica sp.)"

# Silene
Habitat_species_trends_3$Hostplant[Habitat_species_trends_3$Foodplant_1 == "Silene_dioica" &
                                     Habitat_species_trends_3$Foodplant_2 == "Silene_latifolia"] <- "Silene"

# Deschampsia (wavy hair grass)
Habitat_species_trends_3$Hostplant[Habitat_species_trends_3$Foodplant_1 == "Deschampsia_flexuosa" &
                                     Habitat_species_trends_3$Foodplant_2 == "Deschampsia_cespitosa"] <- "Deschampsia"

# Small Angle-shades feeds mainly on bracken 
Habitat_species_trends_3$Hostplant[Habitat_species_trends_3$Foodplant_1 == "Pteridium_aquilinum" &
                                     Habitat_species_trends_3$Foodplant_2 == "Dryopteris_filix-mas"] <- "Pteridium_aquilinum (mainly)"

# Rubus sp
Habitat_species_trends_3$Hostplant[Habitat_species_trends_3$Foodplant_1 == "Rubus_fruticosus" &
                                     Habitat_species_trends_3$Foodplant_2 == "Rubus_idaea"] <- "Rubus"
# Rubus sp
Habitat_species_trends_3$Hostplant[Habitat_species_trends_3$Foodplant_1 == "Rubus_fruticosus"] <- "Rubus"

# Gallium
Habitat_species_trends_3$Hostplant[Habitat_species_trends_3$Foodplant_1 == "Galium_verum" &
                                     Habitat_species_trends_3$Foodplant_2 == "Galium_mollugo"] <- "Galium"
Habitat_species_trends_3$Hostplant[Habitat_species_trends_3$Foodplant_1 == "Galium_palustre" &
                                     Habitat_species_trends_3$Foodplant_2 == "Galium_uliginosum"] <- "Galium"

# Achillea
Habitat_species_trends_3$Hostplant[Habitat_species_trends_3$Foodplant_1 == "Achillea_millefolium" &
                                     Habitat_species_trends_3$Foodplant_2 == "Achillea_ptarmica"] <- "Achillea"

# Cut the data frame down to only include hostplants that I am interested in, and also to remove the
# ones with very small smaple sizes. Get rid of the follwing:

Habitat_species_trends_x_hostplant <- 
  Habitat_species_trends_3 %>%
  dplyr::filter(Hostplant != "Achillea" & Hostplant != "Alchemilla_mollis" &
                  Hostplant != "Brassicaceae" & Hostplant != "Cupressus" &
                  Hostplant != "Eriophorum_angustifolium" & Hostplant != "Larix_decidua" &
                  Hostplant != "Linaria_vulgaris" & Hostplant != "Odontites_verna" &
                  Hostplant != "Phragmites_australis" & Hostplant != "Rhamnus_cathartica" &
                  Hostplant != "Stellaria" & Hostplant != "Tilia" &
                  Hostplant != "Vicia_cracca" & Hostplant != "Willowherbs" &
                  Hostplant != "Silene_dioica" & Hostplant != "Acer_campestre" &
                  Hostplant != "Ulmus" & Hostplant != "Ulmus_glabra" &
                  Hostplant != "Deschampsia_cespitosa" & Hostplant != "Cytisus_scoparius" &
                  Hostplant != "Epilobium" & Hostplant != "Rhinanthus_minor" &
                  Hostplant != "Populus" & Hostplant != "Pinus_sylvestris" &
                  Hostplant != "Jacobaea_vulgaris" & Hostplant != "Deschampsia" &
                  Hostplant != "Juniperus_communis" & Hostplant != "Hypericum" &
                  Hostplant != "Euonymus_europaeus" & Hostplant != "Digitalis_purpurea")

levels(as.factor(Habitat_species_trends_x_hostplant$Hostplant)) # 20 hostplants.


# Now do the models and plot the interaction plots
Hostplant_mod <- lmer(Trend_trans ~ Habitat*Hostplant + (1|Common_name),
                      weights = Log_sample_size, 
                      data = subset(Habitat_species_trends_x_hostplant, Habitat != "Overall"))

# Residual checks.
plot(Hostplant_mod) # ok

# Is there a significant interaction between hostplant and habitat?
drop1(Hostplant_mod, test = "Chisq")
# Trend_trans ~ Habitat * Hostplant + (1 | Common_name)
#                 npar    AIC    LRT   Pr(Chi)    
#<none>                 1313.2                     
#Habitat:Hostplant   90 1280.6 147.42 0.0001291 ***
#
# Sig interaction

# Run the emmeans comparisons
Hostplant_emmeans <- emmeans(Hostplant_mod, pairwise ~ Habitat|Hostplant)

# Extract the dfs of contratsts and emmeans

# Emmeans
Hostplant_emmeans_df <- data.frame(Hostplant_emmeans$emmeans)

# Get estimated percentage change for each group
Hostplant_emmeans_df %>%
  dplyr::select(Habitat, Hostplant, emmean) %>%
  dplyr::mutate(Perc_change = (exp(emmean)-100))

# First four rows
#Habitat                Hostplant   emmean Perc_change
#1                 Arable                  Alnus_glutinosa 4.676708    7.415876
#2     Broadleaf woodland                  Alnus_glutinosa 4.685057    8.316398
#3     Conifer plantation                  Alnus_glutinosa 4.352856  -22.299920
#4     Improved grassland                  Alnus_glutinosa 5.183141   78.241857

# Flip the order of the Habitats so Arable comes first
Hostplant_emmeans_df$Habitat <- factor(Hostplant_emmeans_df$Habitat, 
                                       levels = c("Arable","Broadleaf woodland", 
                                                  "Conifer plantation", "Improved grassland",
                                                  "'Other semi-natural'", "Upland", 
                                                  "Urban"))


# Run a model with no interaction to plot in the background of the main plot
Hostplant_mod_3 <- lmer(Trend_trans ~ Habitat + Hostplant + (1|Common_name),
                        weights = Log_sample_size, 
                        data = subset(Habitat_species_trends_x_hostplant, Habitat != "Overall"))
# Run the emmeans comparisons
Hostplant_emmeans_2 <- emmeans(Hostplant_mod_3, pairwise ~ Habitat|Hostplant)
# Emmeans
Hostplant_emmeans_df_2 <- data.frame(Hostplant_emmeans_2$emmeans)
# Flip the order of the Habitats so Arable comes first
Hostplant_emmeans_df_2$Habitat <- factor(Hostplant_emmeans_df_2$Habitat, 
                                         levels = c("Arable","Broadleaf woodland", 
                                                    "Conifer plantation", "Improved grassland",
                                                    "'Other semi-natural'", "Upland", 
                                                    "Urban"))

# The plants for which no interaction could be made are not showing up... Fix this with some merging
Hostplant_emmeans_df_B <- na.omit(Hostplant_emmeans_df)

Hostplant_emmeans_df_B <- merge(Hostplant_emmeans_df_B, Hostplant_emmeans_df_2, by = c("Habitat", "Hostplant"), all.x = TRUE)

# Get the sample size for each habitat-hostplant interaction
Sample_sizes <- 
  Habitat_species_trends_x_hostplant %>%
  dplyr::select(RIS_code, Habitat, Hostplant) %>%
  dplyr::distinct() %>%
  dplyr::group_by(Habitat, Hostplant) %>%
  dplyr::summarise(Sample_size = length(unique(RIS_code)))

# Merge this on to df for figure
Hostplant_emmeans_df_B <- merge(Hostplant_emmeans_df_B, Sample_sizes, by = c("Hostplant", "Habitat"), all.x = TRUE)

# Plot (Fig. S13)
ggplot(data = Hostplant_emmeans_df_B, aes(x = Hostplant, y = emmean.y)) +
  geom_errorbar(aes(ymax = upper.CL.y, ymin = lower.CL.y), width = 0, color = "grey80", size = 3) +
  
  geom_point(data = Hostplant_emmeans_df_B, aes(x = Hostplant, y = emmean.x)) + 
  geom_errorbar(data = Hostplant_emmeans_df_B, aes(ymax = upper.CL.x, ymin = lower.CL.x), width = 0.1) +
  
  geom_hline(yintercept = log(100), linetype = 2) +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) +
  #theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.line = element_line(colour = "black")) +
  xlab("") +
  ylab("ln(trend + 100)") +
  ylab("Percentage change 1968 - 2016") +
  facet_wrap(~Habitat, nrow = 3) +
  # geom_text(aes(label = Sample_size)) + # Unhash ths line to print sample sizes on top of bars
  scale_y_continuous(breaks = c(log(-99+100), log(-90+100), log(-50+100), 
                                log(100+100), log(900+100), log(9900+100)), 
                     labels = c("-99%", "-90%", " -50%", " 100%", " 900%", " 9900%")) +
  theme(axis.text.x = element_text(angle = 35, hjust = 1))

# Saved as 8 x 8 inch pdf

# # # 

# ~~~ ####

# # # 

# Making a map of sites and summarising site sample sizes #### 

# Calculating how many sites were used in this analysis and previous (general) analysis is not
# simple as each species used a different set of sites and the completeness of each site-year differs
# between species. The general analysis (script 2) used only 'complete' site-years. As the headline 
# figures from my paper used the analysis with 'complete' site-years only, it is more honest to present
# these sites as the main number of sites (although the number is substantially smaller). 

# I will map to the 266 sites that contributed to the general trends (total abundance, biomass, richness 
# and diversity). I will also tabulate the sample sizes of traps with the complete years as the main number 
# and the maximum number of sites used in species-specific trends in brackets. 

# Read in the complete dataset
Annual_species_totals <- read.csv("Data/All_years_all_species_all_sites.csv")

# and summarise it down to one value per site-year
Annual_aggregate_totals <- 
  Annual_species_totals %>%
  dplyr::group_by(CalYear, RIS.TrapCode) %>%
  dplyr::summarise(Annual_count = sum(Annual_count))

# And site info
Site_info <- read.csv("Data/Site_info.csv")

# And Site_year_completeness
Site_year_completeness <- read.csv("Data/Site_year_info.csv")

# All I need from this is Paul's complete codes (these are determined manually by the Rothamsted Insect 
# Survey) 
Site_year_completeness <- 
  Site_year_completeness %>%
  dplyr::select(SITE, Year, Complete_type_Paul)

# Prepare to merge on Site_info
Site_info_2 <- Site_info
Site_info_2$Site_name <- NULL

Site_year_completeness_2 <- Site_year_completeness
Site_year_completeness_2$Site_name_db <- NULL
Site_year_completeness_2$Site_name_db_nogaps <- NULL

# Some renaming
Annual_aggregate_totals <-
  Annual_aggregate_totals %>%
  dplyr::rename(Year = CalYear, SITE = RIS.TrapCode)

# Merge
Annual_aggregate_totals_B <- merge(Annual_aggregate_totals, Site_info_2, by = c("SITE"))
Annual_aggregate_totals_C <- merge(Annual_aggregate_totals_B, Site_year_completeness, by = c("SITE", "Year"))


# Now cut down data to the site-years that were used in the final general analysis
# Get rid of anything outside the 1968 - 2016 range and also the Ireland and Channel Isles sites
# and keep only "Complete" site-years
Annual_aggregate_totals_D <- 
  Annual_aggregate_totals_C %>%
  dplyr::filter(Year >= 1968 & Year <= 2016) %>%
  dplyr::filter(Region != "Channel Islands") %>%
  dplyr::filter(Region != "Ireland") %>%
  dplyr::filter(Complete_type_Paul == "Complete")

# Make a new variables for Actual_years
Actual_years_ran <-
  Annual_aggregate_totals_D %>%
  dplyr::group_by(SITE) %>%
  dplyr::summarise(Actual_years = length(unique(Year)))

# Merge it on
Annual_aggregate_totals_E <- merge(Annual_aggregate_totals_D, Actual_years_ran, by = "SITE")

# And cut it down to sites with at least three Actual_years running 
Annual_aggregate_totals_F <- 
  Annual_aggregate_totals_E %>%
  dplyr::filter(Actual_years >= 3)

# How many sites?
length(unique(Annual_aggregate_totals_F$SITE))
# 266

# How many sites in the north/south
Annual_aggregate_totals_F %>%
  dplyr::group_by(N_S) %>%
  dplyr::summarise(No_of_sites = length(unique(SITE)))
#   N_S   No_of_sites
#  North          89
#  South         177

# How many sites in each habitat?
Annual_aggregate_totals_F %>%
  dplyr::group_by(Habitat_500m) %>%
  dplyr::summarise(No_of_sites = length(unique(SITE)))
# Habitat_500m         No_of_sites
# Arable                        60
# Conifer_plantation            11
# Deciduous_woodland            42
# Improved_grassland            76
# Lowland_semi_natural          12
# Upland                        12
# Urban                         53

# How many sites in each Region/Habitat?
Annual_aggregate_totals_F %>%
  dplyr::group_by(Habitat_500m, N_S) %>%
  dplyr::summarise(No_of_sites = length(unique(SITE)))
# Habitat_500m         N_S   No_of_sites
# Arable               North          10
# Arable               South          50
# Conifer_plantation   North           7
# Conifer_plantation   South           4
# Deciduous_woodland   North          18
# Deciduous_woodland   South          24
# Improved_grassland   North          29
# Improved_grassland   South          46
# Lowland_semi_natural North           6
# Lowland_semi_natural South           6
# Upland               North          10
# Upland               South           2
# Urban                North           9
# Urban                South          44

# How many sites with each dominant land-use type (LCM2015)?
print(Annual_aggregate_totals_F %>%
  dplyr::group_by(Dom_land_use_500m, Up_lowland, N_S) %>%
  dplyr::summarise(No_of_sites = length(unique(SITE))), n = 28)
#  Dom_land_use_500m              Up_lowland N_S   No_of_sites
# 1 Acid_grassland_2015_500m       Lowland    North           1
# 2 Acid_grassland_2015_500m       Upland     North           5
# 3 Arable_2015_500m               Lowland    North          10
# 4 Arable_2015_500m               Lowland    South          50
# 5 Bog_2015_500m                  Upland     North           2
# 6 Calcareous_grassland_2015_500m Lowland    South           1
# 7 Conifer_plantation_2015_500m   Lowland    North           7
# 8 Conifer_plantation_2015_500m   Lowland    South           4
# 9 Conifer_plantation_2015_500m   Upland     North           1
# 10 Conifer_plantation_2015_500m   Upland     South           1
# 11 Deciduous_woodland_2015_500m   Lowland    North          18
# 12 Deciduous_woodland_2015_500m   Lowland    South          24
# 13 Heather_2015_500m              Lowland    North           1
# 14 Heather_2015_500m              Lowland    South           1
# 15 Heather_grassland_2015_500m    Lowland    North           4
# 16 Improved_grassland_2000_500m   Lowland    North           2 (*These are on Isle of man where 2015 data was not available)
# 17 Improved_grassland_2015_500m   Lowland    North          27
# 18 Improved_grassland_2015_500m   Lowland    South          47
# 19 Improved_grassland_2015_500m   Upland     North           2
# 20 Improved_grassland_2015_500m   Upland     South           1
# 21 Littoral_sediment_2015_500m    Lowland    South           1
# 22 Neutral_grassland_2015_500m    Lowland    South           1
# 23 Saltmarsh_2015_500m            Lowland    South           1
# 24 Suburban_2015_500m             Lowland    North           8
# 25 Suburban_2015_500m             Lowland    South          39
# 26 Supra_littoral_rock_2015_500m  Lowland    South           1
# 27 Urban_2015_500m                Lowland    North           1
# 28 Urban_2015_500m                Lowland    South           5

# Get one row per site ready for plotting
Annual_aggregate_totals_G <-
  Annual_aggregate_totals_F %>%
  dplyr::select(Improved_longitude, Improved_latitude, Actual_years, Habitat_500m) %>%
  distinct()

# Get coastline
UK_coast <- read.csv("Data/UK_coastline_R.csv")

# Make better names
Hab_names <- c("Arable", "Conifer plantation", "Broadleaf woodland", 
               "Improved grassland", "'Other semi-natural'", "Upland", "Urban")
names(Hab_names) <- c("Arable", "Conifer_plantation", "Deciduous_woodland", 
                      "Improved_grassland", "Lowland_semi_natural", "Upland", "Urban")

# Plot (Fig S1)
ggplot(data = Annual_aggregate_totals_G, aes(x = Improved_longitude, y = Improved_latitude)) +
  geom_path(data = UK_coast, aes(x = Location.longitude, y = Location.latitude), size = 0.2) +
  geom_point(aes(size = Actual_years, colour = Habitat_500m), alpha = 0.5) +
  facet_wrap(~Habitat_500m, labeller = labeller(Habitat_500m = Hab_names)) +
  theme_bw() +
  theme(panel.border = element_blank()) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.line = element_line(colour = "black")) +
  xlab("Longitude") +
  ylab("Latitude") +
  geom_hline(yintercept = 53.9, linetype = 2) +
  scale_size_continuous(breaks = c(3, 10, 20, 30, 40))

# Saved as 8 x 7.5 inch pdf 

# To get the maximum number of sites per individual species, I just need to rerun the above code without
# cutting down to only complete site-years

Annual_aggregate_totals_D_2 <- 
  Annual_aggregate_totals_C %>%
  dplyr::filter(Year >= 1968 & Year <= 2016) %>%
  dplyr::filter(Region != "Channel Islands") %>%
  dplyr::filter(Region != "Ireland")

# Make a new variables for Actual_years
Actual_years_ran <-
  Annual_aggregate_totals_D_2 %>%
  dplyr::group_by(SITE) %>%
  dplyr::summarise(Actual_years = length(unique(Year)))

# Merge it on
Annual_aggregate_totals_E_2 <- merge(Annual_aggregate_totals_D_2, Actual_years_ran, by = "SITE")

# And cut it down to sites with at least three Actual_years running 
Annual_aggregate_totals_F_2 <- 
  Annual_aggregate_totals_E_2 %>%
  dplyr::filter(Actual_years >= 3)

# How many sites?
length(unique(Annual_aggregate_totals_F_2$SITE))
# 384

# How many sites in the north/south
Annual_aggregate_totals_F_2 %>%
  dplyr::group_by(N_S) %>%
  dplyr::summarise(No_of_sites = length(unique(SITE)))
#   N_S   No_of_sites
#1 North         119
#2 South         265

# How many sites in each habitat?
Annual_aggregate_totals_F_2 %>%
  dplyr::group_by(Habitat_500m) %>%
  dplyr::summarise(No_of_sites = length(unique(SITE)))
# Habitat_500m         No_of_sites
#1 Arable                        76
#2 Conifer_plantation            12
#3 Deciduous_woodland            51
#4 Improved_grassland           129
#5 Lowland_semi_natural          19
#6 Upland                        15
#7 Urban                         82

# How many sites in each Region/Habitat?
Annual_aggregate_totals_F_2 %>%
  dplyr::group_by(Habitat_500m, N_S) %>%
  dplyr::summarise(No_of_sites = length(unique(SITE)))
# Habitat_500m         N_S   No_of_sites
#1 Arable               North          11
#2 Arable               South          65
#3 Conifer_plantation   North           8
#4 Conifer_plantation   South           4
#5 Deciduous_woodland   North          22
#6 Deciduous_woodland   South          29
#7 Improved_grassland   North          44
#8 Improved_grassland   South          85
#9 Lowland_semi_natural North           8
#10 Lowland_semi_natural South          11
#11 Upland               North          11
#12 Upland               South           4
#13 Urban                North          15
#14 Urban                South          67

# How many sites with each dominant land-use type (LCM2015)?
print(Annual_aggregate_totals_F_2 %>%
  dplyr::group_by(Dom_land_use_500m, Up_lowland, N_S) %>%
  dplyr::summarise(No_of_sites = length(unique(SITE))), n = 31)
# Dom_land_use_500m                 Up_lowland N_S   No_of_sites
# 1 Acid_grassland_2015_500m          Lowland    North           1
# 2 Acid_grassland_2015_500m          Lowland    South           1
# 3 Acid_grassland_2015_500m          Upland     North           5
# 4 Acid_grassland_2015_500m          Upland     South           1
# 5 Arable_2015_500m                  Lowland    North          11
# 6 Arable_2015_500m                  Lowland    South          65
# 7 Bog_2015_500m                     Upland     North           2
# 8 Calcareous_grassland_2015_500m    Lowland    South           1
# 9 Conifer_plantation_2015_500m      Lowland    North           8
# 10 Conifer_plantation_2015_500m      Lowland    South           4
# 11 Conifer_plantation_2015_500m      Upland     North           1
# 12 Conifer_plantation_2015_500m      Upland     South           1
# 13 Deciduous_woodland_2015_500m      Lowland    North          22
# 14 Deciduous_woodland_2015_500m      Lowland    South          29
# 15 Heather_2015_500m                 Lowland    North           2
# 16 Heather_2015_500m                 Lowland    South           2
# 17 Heather_grassland_2015_500m       Lowland    North           5
# 18 Improved_grassland_2000_500m      Lowland    North           2
# 19 Improved_grassland_2015_500m      Lowland    North          42
# 20 Improved_grassland_2015_500m      Lowland    South          85
# 21 Improved_grassland_2015_500m      Upland     North           3
# 22 Improved_grassland_2015_500m      Upland     South           2
# 23 Littoral_sediment_2015_500m       Lowland    South           1
# 24 Neutral_grassland_2015_500m       Lowland    South           1
# 25 Saltmarsh_2015_500m               Lowland    South           3
# 26 Suburban_2015_500m                Lowland    North          14
# 27 Suburban_2015_500m                Lowland    South          58
# 28 Supra_littoral_rock_2015_500m     Lowland    South           1
# 29 Supra_littoral_sediment_2015_500m Lowland    South           1
# 30 Urban_2015_500m                   Lowland    North           1
# 31 Urban_2015_500m                   Lowland    South           9


# End of script