Mercurial > repos > greg > multilocus_genotype
comparison multilocus_genotype.R @ 9:8f2f346a5e1c draft
Uploaded
| author | greg |
|---|---|
| date | Tue, 04 Dec 2018 13:44:12 -0500 |
| parents | d2057e183772 |
| children | 6c93244a36e2 |
comparison
equal
deleted
inserted
replaced
| 8:d2057e183772 | 9:8f2f346a5e1c |
|---|---|
| 1 #!/usr/bin/env Rscript | 1 #!/usr/bin/env Rscript |
| 2 | 2 |
| 3 suppressPackageStartupMessages(library("adegenet")) | 3 suppressPackageStartupMessages(library("adegenet")) |
| 4 suppressPackageStartupMessages(library("ape")) | 4 suppressPackageStartupMessages(library("ape")) |
| 5 suppressPackageStartupMessages(library("data.table")) | 5 suppressPackageStartupMessages(library("data.table")) |
| 6 #suppressPackageStartupMessages(library("dbplyr")) | |
| 7 suppressPackageStartupMessages(library("dplyr")) | 6 suppressPackageStartupMessages(library("dplyr")) |
| 8 suppressPackageStartupMessages(library("ggplot2")) | 7 suppressPackageStartupMessages(library("ggplot2")) |
| 9 suppressPackageStartupMessages(library("knitr")) | 8 suppressPackageStartupMessages(library("knitr")) |
| 10 suppressPackageStartupMessages(library("optparse")) | 9 suppressPackageStartupMessages(library("optparse")) |
| 11 suppressPackageStartupMessages(library("poppr")) | 10 suppressPackageStartupMessages(library("poppr")) |
| 12 suppressPackageStartupMessages(library("RColorBrewer")) | 11 suppressPackageStartupMessages(library("RColorBrewer")) |
| 13 suppressPackageStartupMessages(library("RPostgres")) | 12 suppressPackageStartupMessages(library("RPostgres")) |
| 14 #suppressPackageStartupMessages(library("tidyr")) | |
| 15 suppressPackageStartupMessages(library("vcfR")) | 13 suppressPackageStartupMessages(library("vcfR")) |
| 16 suppressPackageStartupMessages(library("vegan")) | 14 suppressPackageStartupMessages(library("vegan")) |
| 17 | 15 |
| 18 option_list <- list( | 16 option_list <- list( |
| 19 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), | 17 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), |
| 20 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), | 18 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), |
| 21 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), | 19 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), |
| 22 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), | 20 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), |
| 23 make_option(c("--output_mlg_id"), action="store", dest="output_mlg_id", help="Mlg Id data output file"), | |
| 24 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file") | 21 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file") |
| 25 ) | 22 ) |
| 26 | 23 |
| 27 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); | 24 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); |
| 28 args <- parse_args(parser, positional_arguments=TRUE); | 25 args <- parse_args(parser, positional_arguments=TRUE); |
| 78 # TODO: probably should not hard-code 2 cores. | 75 # TODO: probably should not hard-code 2 cores. |
| 79 gl <- vcfR2genlight(vcf, n.cores=2); | 76 gl <- vcfR2genlight(vcf, n.cores=2); |
| 80 gind <- new("genind", (as.matrix(gl))); | 77 gind <- new("genind", (as.matrix(gl))); |
| 81 | 78 |
| 82 # Add population information to the genind object. | 79 # Add population information to the genind object. |
| 83 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings=c("", "NA")); | 80 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors = FALSE); |
| 81 colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region"); | |
| 84 gind@pop <- as.factor(poptab$region); | 82 gind@pop <- as.factor(poptab$region); |
| 85 | 83 |
| 86 # Convert genind object to a genclone object. | 84 # Convert genind object to a genclone object. |
| 87 obj2 <- as.genclone(gind); | 85 obj2 <- as.genclone(gind); |
| 88 | 86 |
| 89 # Calculate the bitwise distance between individuals. | 87 # Calculate the bitwise distance between individuals. |
| 90 xdis <- bitwise.dist(obj2); | 88 xdis <- bitwise.dist(obj2); |
| 91 | 89 |
| 92 # Multilocus genotypes (threshold of 1%). | 90 # Multilocus genotypes (threshold of 16%). |
| 93 mlg.filter(obj2, distance=xdis) <- 0.01; | 91 mlg.filter(obj2, distance=xdis) <- 0.016; |
| 94 m <- mlg.table(obj2, background=TRUE, color=TRUE); | 92 m <- mlg.table(obj2, background=TRUE, color=TRUE); |
| 95 | 93 |
| 96 # Create table of MLGs. | 94 # Create table of MLGs. |
| 97 id <- mlg.id(obj2); | 95 id <- mlg.id(obj2); |
| 98 dt <- data.table(id, keep.rownames=TRUE); | 96 dt <- data.table(id, keep.rownames=TRUE); |
| 99 setnames(dt, c("id"), c("user_specimen_id")); | 97 setnames(dt, c("id"), c("affy_id")); |
| 100 | 98 |
| 101 # Read user's Affymetrix 96 well plate csv file. | 99 # Read user's Affymetrix 96 well plate csv file. |
| 102 pinfo <- read.csv(opt$input_affy_metadata, stringsAsFactors=FALSE); | 100 pinfo <- read.table(opt$input_affy_metadata, header=TRUE, stringsAsFactors=FALSE, sep="\t"); |
| 103 pinfo <- pinfo$user_specimen_id; | 101 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); |
| 104 pi <- data.table(pinfo); | 102 pinfo2 <- as.character(pinfo$user_specimen_id); |
| 105 setnames(pi, c("pinfo"), c("user_specimen_id")); | 103 pi <- data.table(pinfo2); |
| 104 setnames(pi, c("pinfo2"), c("user_specimen_id")); | |
| 106 | 105 |
| 107 # Connect to database. | 106 # Connect to database. |
| 108 conn <- get_database_connection(opt$database_connection_string); | 107 conn <- get_database_connection(opt$database_connection_string); |
| 109 | 108 |
| 110 # Import the sample table. | 109 # Import the sample table. |
| 111 mD <- tbl(conn, "sample"); | 110 mD <- tbl(conn, "sample"); |
| 112 | 111 |
| 113 # Select user_specimen_id and mlg columns. | 112 # Select user_specimen_id and mlg columns. |
| 114 smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id); | 113 smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id); |
| 115 | 114 |
| 116 # Convert to dataframe. | 115 # Convert to dataframe. |
| 117 sm <- data.frame(smlg); | 116 sm <- data.frame(smlg); |
| 118 sm[sm==""] <- NA; | 117 sm[sm==""] <- NA; |
| 119 | 118 |
| 120 # Convert missing data into data table. | 119 # Convert missing data into data table. |
| 121 mi <-setDT(miss, keep.rownames=TRUE)[]; | 120 mi <-setDT(miss, keep.rownames=TRUE)[]; |
| 122 # Change names to match db. | 121 setnames(mi, c("rn"), c("affy_id")); |
| 123 setnames(mi, c("rn"), c("user_specimen_id")); | |
| 124 setnames(mi, c("myMiss"), c("percent_missing_data_coral")); | 122 setnames(mi, c("myMiss"), c("percent_missing_data_coral")); |
| 125 # Round missing data to two digits. | 123 # Round missing data to two digits. |
| 126 mi$percent_missing <- round(mi$percent_missing, digits=2); | 124 mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2); |
| 127 | 125 |
| 128 # Convert heterozygosity data into data table. | 126 # Convert heterozygosity data into data table. |
| 129 ht <-setDT(ht, keep.rownames=TRUE)[]; | 127 ht <-setDT(ht, keep.rownames=TRUE)[]; |
| 130 # Change names to match db. | 128 setnames(ht, c("rn"), c("affy_id")); |
| 131 setnames(ht, c("rn"), c("user_specimen_id")); | |
| 132 setnames(ht, c("hets"), c("percent_mixed_coral")); | 129 setnames(ht, c("hets"), c("percent_mixed_coral")); |
| 133 # Round missing data to two digits. | 130 # Round missing data to two digits. |
| 134 ht$percent_mixed<-round(ht$percent_mixed, digits=2); | 131 ht$percent_mixed<-round(ht$percent_mixed, digits=2); |
| 135 | 132 |
| 136 # Convert refA data into data.table. | 133 # Convert refA data into data.table. |
| 137 rA <-setDT(rA, keep.rownames=TRUE)[]; | 134 rA <-setDT(rA, keep.rownames=TRUE)[]; |
| 138 # Change names to match db. | 135 setnames(rA, c("rn"), c("affy_id")); |
| 139 setnames(rA, c("rn"), c("user_specimen_id")); | |
| 140 setnames(rA, c("refA"), c("percent_reference_coral")); | 136 setnames(rA, c("refA"), c("percent_reference_coral")); |
| 141 # round missing data to two digits. | 137 # round missing data to two digits. |
| 142 rA$percent_reference<-round(rA$percent_reference, digits=2); | 138 rA$percent_reference<-round(rA$percent_reference, digits=2); |
| 143 | 139 |
| 144 # Convert altB data into data table. | 140 # Convert altB data into data table. |
| 145 aB <-setDT(aB, keep.rownames=TRUE)[]; | 141 aB <-setDT(aB, keep.rownames=TRUE)[]; |
| 146 # Change names to match db. | 142 setnames(aB, c("rn"), c("affy_id")); |
| 147 setnames(aB, c("rn"), c("user_specimen_id")); | |
| 148 setnames(aB, c("altB"), c("percent_alternative_coral")); | 143 setnames(aB, c("altB"), c("percent_alternative_coral")); |
| 149 # Round missing data to two digits. | 144 # Round missing data to two digits. |
| 150 aB$percent_alternative<-round(aB$percent_alternative, digits=2); | 145 aB$percent_alternative<-round(aB$percent_alternative, digits=2); |
| 151 | 146 |
| 152 #convert mlg id to data.table format | 147 #convert mlg id to data.table format |
| 153 dt <- data.table(id, keep.rownames=TRUE); | 148 dt <- data.table(id, keep.rownames=TRUE); |
| 154 # Change name to match db. | 149 setnames(dt, c("id"), c("affy_id")); |
| 155 setnames(dt, c("id"), c("user_specimen_id")); | |
| 156 | 150 |
| 157 # Transform. | 151 # Transform. |
| 158 df3 <- dt %>% | 152 df3 <- dt %>% |
| 159 group_by(row_number()) %>% | 153 group_by(row_number()) %>% |
| 160 dplyr::rename(group='row_number()') %>% | 154 dplyr::rename(group='row_number()') %>% |
| 161 unnest (user_specimen_id) %>% | 155 unnest (user_specimen_id) %>% |
| 162 # Join with mlg table. | 156 # Join with mlg table. |
| 163 left_join(sm %>% | 157 left_join(sm %>% |
| 164 select("user_specimen_id","coral_mlg_clonal_id"), by='user_specimen_id'); | 158 select("affy_id","coral_mlg_clonal_id"), |
| 159 by='affy_id'); | |
| 165 | 160 |
| 166 # If found in database, group members on previous mlg id. | 161 # If found in database, group members on previous mlg id. |
| 167 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]); | 162 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]); |
| 168 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; | 163 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; |
| 169 na.mlg <- which(is.na(df3$coral_mlg_clonal_id)); | 164 na.mlg <- which(is.na(df3$coral_mlg_clonal_id)); |
| 192 # this for loop assigns the new id iteratively for all that have NA. | 187 # this for loop assigns the new id iteratively for all that have NA. |
| 193 for (i in 1:length(na.mlg2)) { | 188 for (i in 1:length(na.mlg2)) { |
| 194 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; | 189 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; |
| 195 } | 190 } |
| 196 | 191 |
| 192 # subset poptab for all samples. | |
| 193 subpop <- poptab[c(2, 3)]; | |
| 194 | |
| 197 # Merge data frames for final table. | 195 # Merge data frames for final table. |
| 198 report_user <- pi %>% | 196 report_user <- pi %>% |
| 199 # Join with the second file (only the first and third column). | 197 left_join(subpop %>% |
| 198 select("affy_id", "user_specimen_id"), | |
| 199 by='user_specimen_id') %>% | |
| 200 left_join(df4 %>% | 200 left_join(df4 %>% |
| 201 select("user_specimen_id","coral_mlg_clonal_id","DB_match"), | 201 select("affy_id", "coral_mlg_clonal_id", "DB_match"), |
| 202 by='user_specimen_id') %>% | 202 by='affy_id') %>% |
| 203 # Join with the second file (only the first and third column). | |
| 204 left_join(mi %>% | 203 left_join(mi %>% |
| 205 select("user_specimen_id","percent_missing_coral"), | 204 select("affy_id", "percent_missing_data_coral"), |
| 206 by='user_specimen_id') %>% | 205 by='affy_id') %>% |
| 207 # Join with the second file (only the first and third column). | |
| 208 left_join(ht %>% | 206 left_join(ht %>% |
| 209 select("user_specimen_id","percent_mixed_coral"), | 207 select("affy_id", "percent_mixed_coral"), |
| 210 by='user_specimen_id') %>% | 208 by='affy_id') %>% |
| 211 # Join with the second file (only the first and third column); | |
| 212 left_join(rA %>% | 209 left_join(rA %>% |
| 213 select("user_specimen_id","percent_reference_coral"), | 210 select("affy_id", "percent_reference_coral"), |
| 214 by='user_specimen_id') %>% | 211 by='affy_id') %>% |
| 215 # Join with the second file (only the first and third column). | |
| 216 left_join(aB %>% | 212 left_join(aB %>% |
| 217 select("user_specimen_id","percent_alternative_coral"), | 213 select("affy_id", "percent_alternative_coral"), |
| 218 by='user_specimen_id') %>% | 214 by='affy_id') %>% |
| 219 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% | 215 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% |
| 220 mutate(coral_mlg_clonal_id=ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id))%>% | 216 mutate(coral_mlg_clonal_id=ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id))%>% |
| 221 ungroup() %>% | 217 ungroup() %>% |
| 222 select(-group); | 218 select(-group); |
| 223 | 219 |
| 224 write.csv(report_user, file=paste(opt$output_stag_db_report), quote=FALSE); | 220 write.csv(report_user, file=paste(opt$output_stag_db_report), quote=FALSE); |
| 225 | 221 |
| 226 # Rarifaction curve. | 222 # Combine sample information for database. |
| 227 # Start PDF device driver. | 223 report_db <- pinfo %>% |
| 228 dev.new(width=10, height=7); | 224 left_join(report_user %>% |
| 229 file_path = get_file_path("geno_rarifaction_curve.pdf"); | 225 select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match", |
| 230 pdf(file=file_path, width=10, height=7); | 226 "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral", |
| 231 rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue"); | 227 "percent_alternative_coral"), |
| 232 dev.off(); | 228 by='user_specimen_id') |
| 233 | 229 |
| 234 # Genotype accumulation curve, sample is number of | 230 # Create vector indicating number of individuals desired |
| 235 # loci randomly selected for to make the curve. | 231 # made from affy_id collumn of report_user data table. |
| 236 dev.new(width=10, height=7); | 232 i <- report_user[[2]]; |
| 237 file_path = get_file_path("geno_accumulation_curve.pdf"); | 233 sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE]; |
| 238 pdf(file=file_path, width=10, height=7); | |
| 239 genotype_curve(gind, sample=5, quiet=TRUE); | |
| 240 dev.off(); | |
| 241 | 234 |
| 242 # Create a phylogeny of samples based on distance matrices. | 235 # Create a phylogeny of samples based on distance matrices. |
| 243 cols <- palette(brewer.pal(n=12, name='Set3')); | 236 cols <- palette(brewer.pal(n=12, name='Set3')); |
| 244 set.seed(999); | 237 set.seed(999); |
| 245 # Start PDF device driver. | 238 # Start PDF device driver. |
| 246 dev.new(width=10, height=7); | 239 dev.new(width=10, height=7); |
| 247 file_path = get_file_path("nj_phylogeny.pdf"); | 240 file_path = get_file_path("nj_phylogeny.pdf"); |
| 248 pdf(file=file_path, width=10, height=7); | 241 pdf(file=file_path, width=10, height=7); |
| 249 # Organize branches by clade. | 242 # Organize branches by clade. |
| 250 tree <- obj2 %>% | 243 theTree <- sub96 %>% |
| 251 aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% | 244 aboot(dist=provesti.dist, sample=1, tree="nj", cutoff=50, quiet=TRUE) %>% |
| 252 ladderize(); | 245 ladderize(); |
| 253 plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4); | 246 theTree$tip.label <- report_user$user_specimen_id[match(theTree$tip.label, report_user$affy_id)]; |
| 247 plot.phylo(theTree, tip.color=cols[sub96$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T); | |
| 254 # Add a scale bar showing 5% difference.. | 248 # Add a scale bar showing 5% difference.. |
| 255 add.scale.bar(length=0.05, cex=0.65); | 249 add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3); |
| 256 nodelabels(tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); | 250 nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); |
| 251 legend("topright", legend=c("Antigua", "Bahamas", "Belize", "Cuba", "Curacao", "Florida", "PuertoRico", "USVI"), text.col=cols, xpd=T, cex=0.8); | |
| 257 dev.off(); | 252 dev.off(); |
| 258 | 253 |
| 254 # Missing data barplot. | |
| 255 poptab$miss <- report_user$percent_missing_data_coral[match(miss$affy_id, report_user$affy_id)]; | |
| 256 test2 <- which(!is.na(poptab$miss)); | |
| 257 miss96 <- poptab$miss[test2]; | |
| 258 name96 <- poptab$user_specimen_id[test2]; | |
| 259 dev.new(width=10, height=7); | |
| 260 file_path = get_file_path("missing_data.pdf"); | |
| 261 pdf (file=file_path, width=10, height=7); | |
| 262 par(mar = c(8, 4, 4, 2)); | |
| 263 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n"); | |
| 264 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); | |
| 265 dev.off() | |
| 266 | |
| 267 # Rarifaction curve. | |
| 268 # Start PDF device driver. | |
| 269 #dev.new(width=10, height=7); | |
| 270 #file_path = get_file_path("geno_rarifaction_curve.pdf"); | |
| 271 #pdf(file=file_path, width=10, height=7); | |
| 272 #rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue"); | |
| 273 #dev.off(); | |
| 274 | |
| 275 # Genotype accumulation curve, sample is number of | |
| 276 # loci randomly selected for to make the curve. | |
| 277 #dev.new(width=10, height=7); | |
| 278 #file_path = get_file_path("geno_accumulation_curve.pdf"); | |
| 279 #pdf(file=file_path, width=10, height=7); | |
| 280 #genotype_curve(gind, sample=5, quiet=TRUE); | |
| 281 #dev.off(); | |
| 282 |
