# Student R script for Project EDDIE module: # Exploring diatom biodiversity in the Everglades and three tropical karstic # wetlands # Authors: Katherine M. Johnson and Gabriel E. Kamener # Last Update: 2022-07-27 #Install needed packages in next 3 lines if not already installed #install.packages("vegan") #install.packages("tidyverse") #install.packages("iNEXT") # Load needed packages library(vegan) library(tidyverse) library(iNEXT) # If you did not start RStudio from the RStudio project file found in the module # Zip file, set your working directory to the location where you have placed the # data files # setwd("my_directory_path_here") # Load data everglades_data <-read_csv("everglades_2008_rel_diatom_abund.csv") carib_data <-read_csv("ST_PP_Lahee_001.csv") %>% rename(Site = Region) # View raw data view(everglades_data) view(carib_data) #### Activity A - Percent Relative Abundances #### # Take the mean of the percent relative abundance for each species across each # site in the Caribbean before pivoting into long form and removing # taxa that don't appear in any sample for a site carib_sites_mean_abund <- carib_data %>% # Subset columns select(Site, ACEXSP:SYACAN) %>% # Group data by Site group_by(Site) %>% # Calculate means for each species per site summarise(across(everything(), mean)) carib_sites_mean_abund_long <- carib_sites_mean_abund %>% # Pivot into long form pivot_longer(-Site, names_to = "Taxon", values_to = "Pct_rel_abund") %>% # Filter out taxa not appearing at a site filter(Pct_rel_abund > 0) # View means view(carib_sites_mean_abund) # View pivoted means view(carib_sites_mean_abund_long) # Optional challenge - write code below to check that mean percent abundance # values for each site sum to roughly 100%. # Take the mean of the percent relative abundance for each species across # the Caribbean as if all samples came from a single composite site carib_composite_site_mean_abund <- carib_data %>% select(ACEXSP:SYACAN) %>% summarise(across(everything(), mean)) %>% # Create "Site" variable and set the value to "C" for Caribbean composite site mutate(Site = "C") carib_composite_site_mean_abund_long <- carib_composite_site_mean_abund %>% pivot_longer(-Site, names_to = "Taxon", values_to = "Pct_rel_abund") # Take the mean of the percent relative abundance for each species across # the Everglades everglades_mean_abund <- everglades_data %>% select(ACHFTSP01:ULNULNULN) %>% summarise(across(everything(), mean)) %>% mutate(Site = "E") %>% select(Site, everything()) # Pivot Everglades data to long form everglades_mean_abund_long <- everglades_mean_abund %>% pivot_longer(-Site, names_to = "Taxon", values_to = "Pct_rel_abund") # Store mean values for all Caribbean sites and Everglades # in one data frame. The rbind function does this by combining the three # data frames we already created mean_abund_all_carib_and_everglades_sites <- rbind(carib_composite_site_mean_abund_long, carib_sites_mean_abund_long, everglades_mean_abund_long) # Optional challenge - write code below to sum the number of taxa (richness) # for each site # Plot abundances below. Plots may be too wide to display properly in RStudio. # Plots below are exported as png image files at a preset size for # viewing at the correct scale # As a class, plot relative percent abundance for the Caribbean composite site plot_carib_abund <- ggplot( subset(mean_abund_all_carib_and_everglades_sites, Site == "C"), aes(x = reorder(Taxon, -Pct_rel_abund), y = Pct_rel_abund)) + geom_bar(stat="identity", fill = "red") + labs(x = "Taxon", y = "Percent Relative abundance", title = "Caribbean percent relative abundance") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.title=element_text(size=16), plot.title = element_text(hjust = 0.5, size = 22)) + scale_y_continuous(limits =c(0, 40), breaks = seq(0, 40, len = 5)) # Save Caribbean plot as png image file ggsave("Caribbean_pct_rel_abundance.png", plot = plot_carib_abund, width = 70, height = 36, units = "cm", limitsize = FALSE) ##!!! # When you see "##!!!", make sure to carefully read and edit necessary parts # in the code after the comment. The output will otherwise come out blank # or incorrect. ##!!! # Now choose one of the available sites to plot on your own. # Choices: B = Belize, J = Jamaica, Y = Yucatan and E = Everglades ##!!! # Insert the letter for the site of your choice into the quotations for the # Site line below, and also write the name of the site in the title line ##!!! plot_my_site_abund <- ggplot( subset(mean_abund_all_carib_and_everglades_sites, Site == "Insert chosen letter for site here inside the quotation marks"), aes(x = reorder(Taxon, -Pct_rel_abund), y = Pct_rel_abund)) + geom_bar(stat="identity", fill = "blue") + labs(x = "Taxon", y = "Percent Relative abundance", title = "[Name of chosen site] percent relative abundance") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.title=element_text(size=16), plot.title = element_text(hjust = 0.5, size = 22)) + scale_y_continuous(limits =c(0, 40), breaks = seq(0, 40, len = 5)) # Save chosen site as png image file ggsave("chosen_site_pct_rel_abundance.png", plot = plot_my_site_abund, width = 70, height = 36, units = "cm", limitsize = FALSE) #### Activity B Part 1 - Biodiversity Indices #### ## Calculate Shannon-Weiner Index (H') values # Select just the species abundance columns because those are what we must # input into the diversity function carib_abund <- carib_data %>% select(ACEXSP:SYACAN) view(carib_abund) # Select only species columns from Everglades data everglades_abund <- everglades_data %>% select(ACHFTSP01:ULNULNULN) # Calculate local H' values for all Caribbean samples carib_shannon <- as.data.frame(diversity(carib_abund,index="shannon")) view(carib_shannon) # Calculate the mean of the local H' values for the Caribbean composite site mean_carib_shannon <- carib_shannon %>% summarise(Mean_local_shannon = mean(carib_shannon$`diversity(carib_abund, index = "shannon")`)) %>% mutate(Site = "C") %>% select(Site, Mean_local_shannon) view(mean_carib_shannon) # Calculate the mean local H' values for each Caribbean site mean_shannon_for_carib_sites <- aggregate(diversity(carib_abund, "shannon"), list(carib_data$Site), mean) %>% select(Site = Group.1, Mean_local_shannon = x) # Calculate local H' values for each everglades sample everglades_shannon <- as.data.frame(diversity(everglades_abund,index="shannon")) # Calculate mean of the local H' values for Everglades mean_everglades_shannon <- everglades_shannon %>% summarise(Mean_local_shannon = mean(everglades_shannon$`diversity(everglades_abund, index = "shannon")`)) %>% mutate(Site = "E") %>% select(Site, Mean_local_shannon) # Place mean local H' values for all sites into one data frame mean_shannon_all_sites <- rbind(mean_carib_shannon, mean_shannon_for_carib_sites, mean_everglades_shannon) view(mean_shannon_all_sites) # Filter the mean local H' value for the Caribbean composite site, # and print it carib_composite_shannon <- mean_shannon_all_sites %>% filter(Site == "C") carib_composite_shannon # Next we need to calculate the mean local species richness per # site # Filter Caribbean site abundances into their own data frames bel_diatoms <- carib_data %>% filter(Site == "B") %>% select(Site, ACEXSP:SYACAN) view(bel_diatoms) jam_diatoms <- carib_data %>% filter(Site == "J") %>% select(Site, ACEXSP:SYACAN) yuc_diatoms <- carib_data %>% filter(Site == "Y") %>% select(Site, ACEXSP:SYACAN) crb_diatoms <- carib_data %>% select(ACEXSP:SYACAN) # Calculate mean local species richness for each Caribbean Site. # "as.data.frame" is used in the first line because the specnumber # function generates a list of numbers, but we need them in a # a data frame to work with the data in the lines after that. bel_species_richness <- as.data.frame(specnumber(bel_diatoms)) %>% # Selecting and renaming the species number output variable to "Species" select(Species = `specnumber(bel_diatoms)`) %>% summarise(Mean_species = mean(Species)) %>% mutate(Site = "B") view(bel_species_richness) jam_species_richness <- as.data.frame(specnumber(jam_diatoms)) %>% select(Species = `specnumber(jam_diatoms)`) %>% summarise(Mean_species = mean(Species)) %>% mutate(Site = "J") yuc_species_richness <- as.data.frame(specnumber(yuc_diatoms)) %>% select(Species = `specnumber(yuc_diatoms)`) %>% summarise(Mean_species = mean(Species)) %>% mutate(Site = "Y") crb_species_richness <- as.data.frame(specnumber(crb_diatoms)) %>% select(Species = `specnumber(crb_diatoms)`) %>% summarise(Mean_species = mean(Species)) %>% mutate(Site = "C") # Calculate mean local species richness for Everglades site evr_species_richness <- as.data.frame(specnumber(everglades_abund)) %>% select(Species = `specnumber(everglades_abund)`) %>% summarise(Mean_species = mean(Species)) %>% mutate(Site = "E") # Place all mean local species richness into one data frame all_sites_species_richnesss <- rbind(bel_species_richness, jam_species_richness, yuc_species_richness, crb_species_richness, evr_species_richness) %>% select(Site, Mean_species) view(all_sites_species_richnesss) # Join the species data frame with the Shannon data frame we created # earlier by using the inner_join function. Rows from the two data frames # are joined together when they share the same Site. species_richness_with_shannon <- inner_join(mean_shannon_all_sites, all_sites_species_richnesss, by = "Site") view(species_richness_with_shannon) # Calculate the mean local evenness value for the Caribbean composite site carib_pielous_evenness <- species_richness_with_shannon %>% filter(Site == "C") %>% mutate(Evenness = Mean_local_shannon/log(Mean_species)) %>% select(Site, Evenness) # Print evenness value for the Caribbean composite site carib_pielous_evenness # Generate Sorensen-Dice similarity coefficient CC = 2C/(A+B) where C is the # number of species common to both samples have in common, A is the number of # species in sample A, and B is the number of species found in sample B # Create data frames containing mean abundance values for each Caribbean site bel_mean_abund <- carib_sites_mean_abund %>% filter(Site == "B") %>% column_to_rownames("Site") view(bel_mean_abund) jam_mean_abund <- carib_sites_mean_abund %>% filter(Site == "J") %>% column_to_rownames("Site") yuc_mean_abund <- carib_sites_mean_abund %>% filter(Site == "Y") %>% column_to_rownames("Site") crb_composite_mean_abund <- carib_composite_site_mean_abund %>% column_to_rownames("Site") # Bind the Caribbean composite site mean abundance and belize mean abundance # data frames into one we can use to generate the Sorensen-Dice similarity # coefficient carib_and_bel_mean_abund <- rbind(crb_composite_mean_abund, bel_mean_abund) view(carib_and_bel_mean_abund) # Calculate the Sorensen-Dice similarity coefficient carib_vs_bel_sorensen <- betadiver(carib_and_bel_mean_abund, method = "sor") # Print the Sorensen-Dice similarity coefficient carib_vs_bel_sorensen #### Activity B Part 2 - Rarefaction Curves #### # Prepare data to feed into ggiNEXT, which requires an input list of # presence/absence data for a site carib_presence_absence_long <- carib_data %>% pivot_longer(-c(Site:Date), names_to = "Taxon", values_to = "rel_abund") %>% # Sum the number of times each taxon was present across the samples mutate(present = if_else(rel_abund > 0, 1, 0)) %>% group_by(Taxon) %>% summarise(presence_summed = sum(present)) %>% select(presence_summed) %>% # Add number of samples to top of column. This is required because the # ggiNEXT function will require the number of samples to precede the # presence/absence values rbind(c(length(carib_data$Replicate)),.) view(carib_presence_absence_long) jam_presence_absence_long <- jam_diatoms %>% pivot_longer(-Site, names_to = "Taxon", values_to = "rel_abund") %>% mutate(present = if_else(rel_abund > 0, 1, 0)) %>% group_by(Taxon) %>% summarise(presence_summed = sum(present)) %>% select(presence_summed) %>% rbind(c(length(jam_diatoms$Site)),.) bel_presence_absence_long <- bel_diatoms %>% pivot_longer(-Site, names_to = "Taxon", values_to = "rel_abund") %>% mutate(present = if_else(rel_abund > 0, 1, 0)) %>% group_by(Taxon) %>% summarise(presence_summed = sum(present)) %>% select(presence_summed) %>% rbind(c(length(bel_diatoms$Site)),.) yuc_presence_absence_long <- yuc_diatoms %>% pivot_longer(-Site, names_to = "Taxon", values_to = "rel_abund") %>% mutate(present = if_else(rel_abund > 0, 1, 0)) %>% group_by(Taxon) %>% summarise(presence_summed = sum(present)) %>% select(presence_summed) %>% rbind(c(length(yuc_diatoms$Site)),.) # Subset 40 Everglades samples (out of the original 155 Everglades samples) # to simulate a sampling effort similar to that in the Caribbean sites. # the set.seed function is used here to ensure we all subset # the same 40 samples. set.seed(1234) # Setup the Everglades data for feeding into ggiNEXT evr_diatoms_filtered <- everglades_data %>% sample_n(., 40) %>% select(TAG_ID, ACHFTSP01:ULNULNULN) evr_presence_absence_long <- evr_diatoms_filtered %>% pivot_longer(-TAG_ID, names_to = "Taxon", values_to = "rel_abund") %>% mutate(present = if_else(rel_abund > 0, 1, 0)) %>% group_by(Taxon) %>% summarise(presence_summed = sum(present)) %>% select(presence_summed) %>% rbind(c(length(evr_diatoms_filtered$TAG_ID)),.) # Now that our data are mostly prepped, we will create a list # of the Caribbean composite site data and then plot a rarefaction curve for the # site # Create the list we need to feed into iNext to get the rarefaction curve crb_list <- list(pull(carib_presence_absence_long, presence_summed)) view(crb_list) # Store the name we want to include when we create a curve for the site. names(crb_list) <- c("CRB") view(crb_list) # Establish how many samples we want to extrapolate to and create the output # we want to plot t <- seq(1, 145, by=1) crb_rarefaction_out <- iNEXT(crb_list, q=0, datatype="incidence_freq", size=t) # Create the rarefaction curve. It should appear in the "Plots" tab. ggiNEXT(crb_rarefaction_out, type=1, color.var="site", se= TRUE) + theme_bw(base_size = 18) + scale_x_continuous(breaks = seq(0, 160, by = 20)) + scale_y_continuous(breaks = seq(0, 280, by = 20)) #### Activity C - Answering questions with our new tools #### # Based on what we learned in part B, choose your own site(s) # and write code to explore and analyze the data. # Code we used to analyze the Caribbean composite site is provided below. # Adapt it for your own chosen site(s). Make sure to rename output dataframes and # the corresponding inputs (e.g. change "plot_carib_abund" to something like # "plot_[chosen_sitename]_abund"). ## Rarefaction curves # Choose your site from the list below and enter it into the first function to # turn the data into a list we can feed into iNext. Make sure to also change # the name of the output data frame (as well as anything else referencing the # the Caribbean site name or data frames). # Choices: evr_presence_absence_long, jam_presence_absence_long, # bel_presence_absence_long, yuc_presence_absence_long ##!!! # Create the list we need to feed into iNext to get the rarefaction curve ##!!! crb_list <- list(pull(carib_presence_absence_long, presence_summed)) ##!!! # Store the name you want to include when creating a curve for the site ##!!! names(crb_list) <- c("CRB") # Establish how many samples we want to extrapolate to and create the output # we want to plot t <- seq(1, 145, by=1) crb_rarefaction_out <- iNEXT(crb_list, q=0, datatype="incidence_freq", size=t) # Create the rarefaction curve. It should appear in the "Plots" tab. ggiNEXT(crb_rarefaction_out, type=1, color.var="site", se= TRUE) + theme_bw(base_size = 18) + scale_x_continuous(breaks = seq(0, 160, by = 20)) + scale_y_continuous(breaks = seq(0, 280, by = 20)) # In order to answer overarching questions, you may need to apply other # techniques from previous activities. The Caribbean indices code provided below # can be adapted for the site(s) of your choice. # # Hint: Remember the other sites are B (Belize), J (Jamaica), Y (Yucatan), # and E (Everglades). ##!!! # Print the mean local H' value for your chosen site ##!!! carib_composite_shannon <- mean_shannon_all_sites %>% filter(Site == "C") carib_composite_shannon ##!!! # Calculate the mean local evenness value for your chosen site ##!!! carib_pielous_evenness <- species_richness_with_shannon %>% filter(Site == "C") %>% mutate(Evenness = Mean_local_shannon/log(Mean_species)) %>% select(Site, Evenness) # Print evenness value for your chosen site carib_pielous_evenness ##!!! # Sorensen-Dice similarity coefficient (CC) # Choose two of the data frames from the following list and enter them into the # rbind function below in order to create a data frame with the two sites you # want to compare. # Choices: bel_mean_abund, jam_mean_abund, yuc_mean_abund, crb_composite_mean_abund ##!!! carib_and_bel_mean_abund <- rbind(crb_composite_mean_abund, bel_mean_abund) # Calculate the Sorensen-Dice similarity coefficient for your chosen sites d <- betadiver(carib_and_bel_mean_abund, method = "sor") # Print the Sorensen-Dice similarity coefficient d