## ----rmd_setup, include = FALSE-----------------------------------------------
#library(knitr)
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----Pkg_Install, eval=FALSE--------------------------------------------------
# # Installing the BioMonTools package (with the vignette) from GitHub
# library(remotes)
# install_github("leppott/BioMonTools", force=TRUE, build_vignettes=TRUE)

## ----Pkg_Help, eval=FALSE-----------------------------------------------------
# help(package="BioMonTools")

## ----MetricValues_Keep2-------------------------------------------------------
# Packages
# library(readxl)
# library(knitr)
# library(BioMonTools)
# library(dplyr)

# Load Data
df_data <- readxl::read_excel(system.file("./extdata/Data_Benthos.xlsx"
                                          , package = "BioMonTools")
                              , guess_max = 10^6)
# Columns to keep
myCols <- c("Area_mi2", "SurfaceArea", "Density_m2", "Density_ft2")
# Run Function
df_metval <- BioMonTools::metric.values(df_data, "bugs", fun.cols2keep = myCols)

# Metrics of Interest
## thermal indicator (_ti_)
#names(df_metval)[grepl("_ti_", names(df_metval))]
col_met2keep <- c("ni_total"
                  , "nt_total"
                  , "nt_ti_stenocold" # renamed from corecold
                  , "nt_ti_cold"
                  , "nt_ti_cool"
                  , "pi_ti_stenocold" # renamed from corecold
                  , "pi_ti_cold"
                  , "pi_ti_cool")
col_ID <- c("SAMPLEID"
            , toupper(myCols)
            , "INDEX_NAME"
            , "INDEX_CLASS")
# Output
df_metval_ci <- df_metval[, c(col_ID, col_met2keep)]
# RMD table
knitr::kable(head(df_metval_ci), caption = "Metric Calculation, select metrics")

## ----Excl01-------------------------------------------------------------------
# Packages
#library(readxl)
#library(dplyr)
#library(lazyeval)
#library(knitr)

# Define pipe
`%>%` <- dplyr::`%>%`

# Data
df_samps_bugs <- readxl::read_excel(system.file("./extdata/Data_Benthos.xlsx"
                                                , package = "BioMonTools")
                                    , guess_max = 10^6)

# Variables
SampID <- "SampleID"
TaxaID <- "TaxaID"
TaxaCount <- "N_Taxa"
Exclude <- "Exclude_New"
TaxaLevels <- c("Kingdom"
                , "Phylum"
                , "SubPhylum"
                , "Class"
                , "SubClass"
                , "Order"
                , "SubOrder"
                , "SuperFamily"
                , "Family"
                , "SubFamily"
                , "Tribe"
                , "Genus"
                , "SubGenus"
                , "Species"
                , "Variety")
# Taxa that should be treated as equivalent
Exceptions <- data.frame("TaxaID" = "Sphaeriidae", "PhyloID" = "Pisidiidae")

# Filter Data
# df_samptax <- filter(df_samps_bugs, !!as.name(SampID) ==
#                        "08BEA3478__2013-08-21_0")
# df_tst_small <- markExcluded(df_samptax, SampID, TaxaID, TaxaCount, TaxaLevels
#                              , Exceptions, Exclude)

# EXAMPLE 1
df_tst <- BioMonTools::markExcluded(df_samps_bugs
                                    , SampID = "SampleID"
                                    , TaxaID = "TaxaID"
                                    , TaxaCount = "N_Taxa"
                                    , Exclude = "Exclude_New"
                                    , TaxaLevels = TaxaLevels
                                    , Exceptions = Exceptions)

# Compare
df_compare <- dplyr::summarise(dplyr::group_by(df_tst, SampleID)
                               , Exclude_Import = sum(Exclude)
                               , Exclude_R = sum(Exclude_New))
df_compare$Diff <- df_compare$Exclude_Import - df_compare$Exclude_R
#
tbl_diff <- table(df_compare$Diff)
#kable(tbl_diff)

# sort
df_compare <- df_compare %>% dplyr::arrange(desc(Diff))

# Number with issues
#sum(abs(df_compare$Diff))
# total samples
#nrow(df_compare)

# confusion matrix
tbl_results <- table(df_tst$Exclude, df_tst$Exclude_New, useNA = "ifany")
#
# Show differences
knitr::kable(tbl_results, caption = "Confusion Matrix")

# samples with differences
samp_diff <- as.data.frame(df_compare[df_compare[,"Diff"] != 0, "SampleID"])
# results for only those with differences
df_tst_diff <- df_tst[df_tst[,"SampleID"] %in% samp_diff$SampleID, ]
# add diff field
df_tst_diff$Exclude_Diff <- df_tst_diff$Exclude - df_tst_diff$Exclude_New
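
# Added sketch (not part of the original vignette code): preview the records
# where the imported Exclude flag and the recalculated Exclude_New flag
# disagree. Column names assume the example data set loaded above.
knitr::kable(head(df_tst_diff[, c("SampleID", "TaxaID", "N_Taxa"
                                  , "Exclude", "Exclude_New", "Exclude_Diff")])
             , caption = "Example records with differing exclude flags")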

# Classification Performance Metrics
class_TP <- tbl_results[2,2] # True Positive
class_FN <- tbl_results[2,1] # False Negative
class_FP <- tbl_results[1,2] # False Positive
class_TN <- tbl_results[1,1] # True Negative
class_n <- sum(tbl_results)  # total
#
# sensitivity (recall); TP / (TP+FN); ability of the model to identify true positives
class_sens <- class_TP / (class_TP + class_FN)
# precision; TP / (TP+FP); accuracy of model positives
class_prec <- class_TP / (class_TP + class_FP)
# specificity; TN / (TN+FP); ability of the model to identify true negatives
class_spec <- class_TN / (class_TN + class_FP)
# overall accuracy; (TP+TN) / all cases; accuracy of all classifications
class_acc <- (class_TP + class_TN) / class_n
# F1; 2 * (class_prec*class_sens) / (class_prec+class_sens)
## balance of precision and recall
class_F1 <- 2 * (class_prec * class_sens) / (class_prec + class_sens)
#
results_names <- c("Sensitivity (Recall)"
                   , "Precision"
                   , "Specificity"
                   , "Overall Accuracy"
                   , "F1")
results_values <- c(class_sens
                    , class_prec
                    , class_spec
                    , class_acc
                    , class_F1)
#
tbl_class <- data.frame(results_names, results_values)
names(tbl_class) <- c("Performance Metrics", "Percent")
tbl_class$Percent <- round(tbl_class$Percent * 100, 2)
knitr::kable(tbl_class, caption = "Classification Performance Metrics")

## ----Excl02-------------------------------------------------------------------
# Packages
#library(readxl)
#library(dplyr)
#library(lazyeval)
#library(knitr)

# Define pipe
`%>%` <- dplyr::`%>%`

# Data
df_samps_bugs <- readxl::read_excel(system.file("./extdata/Data_Benthos.xlsx"
                                                , package = "BioMonTools")
                                    , guess_max = 10^6)

# Variables
SampID <- "SampleID"
TaxaID <- "TaxaID"
TaxaCount <- "N_Taxa"
Exclude <- "Exclude_New"
TaxaLevels <- c("Kingdom"
                , "Phylum"
                , "SubPhylum"
                , "Class"
                , "SubClass"
                , "Order"
                , "SubOrder"
                , "SuperFamily"
                , "Family"
                , "SubFamily"
                , "Tribe"
                , "Genus"
                , "SubGenus"
                , "Species"
                , "Variety")
# Taxa that should be treated as equivalent
Exceptions <- NA

# EXAMPLE 2
## No Exceptions
df_tst2 <- BioMonTools::markExcluded(df_samps_bugs
                                     , SampID = "SampleID"
                                     , TaxaID = "TaxaID"
                                     , TaxaCount = "N_Taxa"
                                     , Exclude = "Exclude_New"
                                     , TaxaLevels = TaxaLevels
                                     , Exceptions = NA)

# Compare
df_compare2 <- dplyr::summarise(dplyr::group_by(df_tst2, SampleID)
                                , Exclude_Import = sum(Exclude)
                                , Exclude_R = sum(Exclude_New))
df_compare2$Diff <- df_compare2$Exclude_Import - df_compare2$Exclude_R
#
tbl_diff2 <- table(df_compare2$Diff)
#kable(tbl_diff2)

# sort
df_compare2 <- df_compare2 %>% dplyr::arrange(desc(Diff))

# Number with issues
#sum(abs(df_compare2$Diff))
# total samples
#nrow(df_compare2)

# confusion matrix
tbl_results2 <- table(df_tst2$Exclude, df_tst2$Exclude_New, useNA = "ifany")
#
# Show differences
knitr::kable(tbl_results2, caption = "Confusion Matrix")
knitr::kable(df_compare2[1:10, ])
knitr::kable(tail(df_compare2))

# samples with differences
(samp_diff2 <- as.data.frame(df_compare2[df_compare2[,"Diff"] != 0, "SampleID"]))
# results for only those with differences
df_tst_diff2 <- dplyr::filter(df_tst2, SampleID %in% samp_diff2$SampleID)
# add diff field
df_tst_diff2$Exclude_Diff <- df_tst_diff2$Exclude - df_tst_diff2$Exclude_New
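
# Added sketch (not in the original vignette): quick check of how supplying the
# Exceptions data frame in Example 1 versus Exceptions = NA in Example 2 changes
# the total number of excluded records.
sum(df_tst$Exclude_New, na.rm = TRUE) - sum(df_tst2$Exclude_New, na.rm = TRUE)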

# Classification Performance Metrics
class_TP2 <- tbl_results2[2,2] # True Positive
class_FN2 <- tbl_results2[2,1] # False Negative
class_FP2 <- tbl_results2[1,2] # False Positive
class_TN2 <- tbl_results2[1,1] # True Negative
class_n2 <- sum(tbl_results2)  # total
#
# sensitivity (recall); TP / (TP+FN); ability of the model to identify true positives
class_sens2 <- class_TP2 / (class_TP2 + class_FN2)
# precision; TP / (TP+FP); accuracy of model positives
class_prec2 <- class_TP2 / (class_TP2 + class_FP2)
# specificity; TN / (TN+FP); ability of the model to identify true negatives
class_spec2 <- class_TN2 / (class_TN2 + class_FP2)
# overall accuracy; (TP+TN) / all cases; accuracy of all classifications
class_acc2 <- (class_TP2 + class_TN2) / class_n2
# F1; 2 * (class_prec*class_sens) / (class_prec+class_sens)
## balance of precision and recall
class_F12 <- 2 * (class_prec2 * class_sens2) / (class_prec2 + class_sens2)
#
results_names2 <- c("Sensitivity (Recall)"
                    , "Precision"
                    , "Specificity"
                    , "Overall Accuracy"
                    , "F1")
results_values2 <- c(class_sens2
                     , class_prec2
                     , class_spec2
                     , class_acc2
                     , class_F12)
#
tbl_class2 <- data.frame(results_names2, results_values2)
names(tbl_class2) <- c("Performance Metrics", "Percent")
tbl_class2$Percent <- round(tbl_class2$Percent * 100, 2)
knitr::kable(tbl_class2, caption = "Classification Performance Metrics")

## ----rarify-------------------------------------------------------------------
# Subsample to 500 organisms (from over 500 organisms) for 12 samples.

# Packages
#library(BioMonTools)
#library(knitr)

# load bio data
df_biodata <- BioMonTools::data_bio2rarify
#dim(df_biodata)
#kable(head(df_biodata))

# subsample
mySize <- 500
Seed_OR <- 18590214
Seed_WA <- 18891111
Seed_US <- 17760704
bugs_mysize <- BioMonTools::rarify(inbug = df_biodata
                                   , sample.ID = "SampleID"
                                   , abund = "N_Taxa"
                                   , subsiz = mySize
                                   , mySeed = Seed_US)

# view results
#dim(bugs_mysize)
#kable(head(bugs_mysize))

# Compare pre- and post-subsample counts
df_compare <- merge(df_biodata
                    , bugs_mysize
                    , by = c("SampleID", "TaxaID")
                    , suffixes = c("_Orig", "_500"))
df_compare <- df_compare[, c("SampleID", "TaxaID", "N_Taxa_Orig", "N_Taxa_500")]
knitr::kable(head(df_compare), caption = "Comparison, by Sample")

# compare totals
tbl_totals <- aggregate(cbind(N_Taxa_Orig, N_Taxa_500) ~ SampleID
                        , df_compare
                        , sum)
knitr::kable(head(tbl_totals), caption = "Comparison, sample totals")

## ----Flags, echo=TRUE, eval=TRUE----------------------------------------------
# Packages
#library(readxl)
#library(reshape2)
#library(knitr)
#library(BioMonTools)

# Import
df.samps.bugs <- readxl::read_excel(system.file("extdata/Data_Benthos.xlsx"
                                                , package = "BioMonTools")
                                    , guess_max = 10^6)

# Calculate Metrics
# Extra columns to keep in results
keep.cols <- c("Area_mi2", "SurfaceArea", "Density_m2", "Density_ft2")
# Run Function
df.metrics <- BioMonTools::metric.values(df.samps.bugs
                                         , "bugs"
                                         , fun.cols2keep = keep.cols)

# Flags
# Import QC Checks
df.checks <- readxl::read_excel(system.file("extdata/MetricFlags.xlsx"
                                            , package = "BioMonTools")
                                , sheet = "Flags")
# Run Function
df.flags <- BioMonTools::qc.checks(df.metrics, df.checks)
# Change terminology; PASS/FAIL to NA/flag
df.flags[, "FLAG"][df.flags[, "FLAG"] == "FAIL"] <- "flag"
df.flags[, "FLAG"][df.flags[, "FLAG"] == "PASS"] <- NA

# long to wide format
df.flags.wide <- reshape2::dcast(df.flags
                                 , SAMPLEID ~ CHECKNAME
                                 , value.var = "FLAG")
# Calc number of "flag"s by row
df.flags.wide$NumFlags <- rowSums(df.flags.wide == "flag", na.rm = TRUE)
# Rearrange columns
NumCols <- ncol(df.flags.wide)
df.flags.wide <- df.flags.wide[, c(1, NumCols, 2:(NumCols - 1))]
# View(df.flags.wide)

# Summarize Results
knitr::kable(table(df.flags[,"CHECKNAME"], df.flags[,"FLAG"], useNA = "ifany"))
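
# Added sketch (not part of the original vignette): list the samples with one or
# more flags and, if desired, save the wide-format flag table; the file name
# "Flags_Wide.csv" is only an example.
knitr::kable(df.flags.wide[df.flags.wide$NumFlags > 0, c("SAMPLEID", "NumFlags")]
             , caption = "Samples with at least one flagged metric")
# write.csv(df.flags.wide, "Flags_Wide.csv", row.names = FALSE)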