Mercurial > repos > greg > coral_multilocus_genotype
changeset 2:dea15dc040ed draft default tip
Uploaded
author | greg |
---|---|
date | Thu, 15 Aug 2019 09:59:57 -0400 |
parents | a5716c317919 |
children | |
files | coral_multilocus_genotype.R coral_multilocus_genotype.xml |
diffstat | 2 files changed, 49 insertions(+), 38 deletions(-) [+] |
line wrap: on
line diff
--- a/coral_multilocus_genotype.R Thu Jul 18 07:58:01 2019 -0400 +++ b/coral_multilocus_genotype.R Thu Aug 15 09:59:57 2019 -0400 @@ -89,7 +89,7 @@ time_elapsed(start_time); # Add population information to the genind object. -population_info_data_table <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t"); +population_info_data_table <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t", quote=""); colnames(population_info_data_table) <- c("row_id", "affy_id", "user_specimen_id", "region"); #write_data_frame(output_data_dir, "population_info_data_table", population_info_data_table); genind_obj@pop <- as.factor(population_info_data_table$region); @@ -113,7 +113,7 @@ mlg_ids <- mlg.id(genind_clone); # Read user's Affymetrix 96 well plate tabular file. -affy_metadata_data_frame <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings=c("", "NA")); +affy_metadata_data_frame <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings=c("", "NA"), quote=""); colnames(affy_metadata_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", "region", "latitude", "longitude", "geographic_origin", "colony_location", "depth", "disease_resist", "bleach_resist", "mortality","tle", @@ -202,7 +202,7 @@ # Rename the reference_alleles column. setnames(reference_alleles_data_table, c("reference_alleles"), c("percent_reference_coral")); # Round data to two digits. -reference_alleles_data_table$percent_reference <- round(reference_alleles_data_table$percent_reference, digits=2); +reference_alleles_data_table$percent_reference_coral <- round(reference_alleles_data_table$percent_reference_coral, digits=2); time_elapsed(start_time); # Alternative alleles @@ -220,7 +220,7 @@ # Rename the alternative_alleles column. setnames(alternative_alleles_data_table, c("alternative_alleles"), c("percent_alternative_coral")); # Round data to two digits. -alternative_alleles_data_table$percent_alternative <- round(alternative_alleles_data_table$percent_alternative, digits=2); +alternative_alleles_data_table$percent_alternative_coral <- round(alternative_alleles_data_table$percent_alternative_coral, digits=2); time_elapsed(start_time); # The mlg_ids_data_table looks like this: @@ -312,11 +312,11 @@ by="affy_id") %>% mutate(db_match = ifelse(is.na(db_match), "failed", db_match))%>% mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% - mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 40 & percent_alternative_coral <= 44.5, "A.palmata","other")) %>% - mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral <= 50, "A.cervicornis", genetic_coral_species_call)) %>% + mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 40 & percent_alternative_coral <= 44.99, "A.palmata","other")) %>% + mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 45 & percent_alternative_coral <= 51, "A.cervicornis", genetic_coral_species_call)) %>% mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>% ungroup() %>% - select(-group); + select(-group,-db_record); time_elapsed(start_time); start_time <- time_start("Writing csv output"); @@ -335,16 +335,15 @@ # Table of alleles for the new samples subset to new plate data. # Create vector indicating number of individuals desired from # affy_id column of stag_db_report data table. -i <- ifelse(is.na(stag_db_report[1]), "", stag_db_report[[1]]); +i <- ifelse(is.na(stag_db_report[3]), "", stag_db_report[[3]]); i <- i[!apply(i== "", 1, all),]; -sample_alleles_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE]; # Subset VCF to the user samples. start_time <- time_start("Subsetting vcf to the user samples"); -l <- length(i); -n <- ncol(vcf@gt); -s <- n - l; -svcf <- vcf[, s:n]; +l <- length(i)+1; +#n <- ncol(vcf@gt); +#s <- n - l; +svcf <- vcf[, 1:l]; write.vcf(svcf, "subset.vcf.gz"); vcf.fn <- "subset.vcf.gz"; snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only"); @@ -364,16 +363,16 @@ # affy_id region # a100000-4368120-060520-256_I07.CEL USVI # a100000-4368120-060520-256_K07.CEL USVI -affy_id_region_list <- population_info_data_table[c(2,4)]; +affy_id_region_list <- population_info_data_table[c(2,3,4)]; gds_data_table_join <- gds_data_table %>% left_join(affy_id_region_list %>% - select("affy_id", "region"), + select("affy_id", "user_specimen_id","region"), by='affy_id')%>% drop_na(); samp.annot <- data.frame(pop.group=c(gds_data_table_join$region)); add.gdsn(genofile, "sample.annot", samp.annot); # population_code looks like this: -# [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733 +# [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 population_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group")); pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))); @@ -382,16 +381,17 @@ # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 time_elapsed(start_time); -# Distance matrix calculation. +# Distance matrix calculation and sample labels change to user specimen ids. start_time <- time_start("Calculating distance matrix"); ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE); +ibs$sample.id <-gds_data_table_join$user_specimen_id; time_elapsed(start_time); # Cluster analysis on the genome-wide IBS pairwise distance matrix. start_time <- time_start("Clustering the genome-wide IBS pairwise distance matrix"); set.seed(100); par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2); -ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, autosome.only=FALSE)); +ibs.hc <- snpgdsHCluster(ibs); time_elapsed(start_time); # cols looks like this: @@ -410,7 +410,8 @@ file_path = get_file_path(output_plots_dir, "ibs_default.pdf"); pdf(file=file_path, width=40, height=20); rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15); -snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular", y.label=0.2); +snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular", yaxis.kinship=FALSE); +abline(h = 0.032, lty = 2); legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2); dev.off() time_elapsed(start_time); @@ -423,7 +424,7 @@ pdf(file=file_path, width=40, height=20); race <- as.factor(population_code); rv2 <- snpgdsCutTree(ibs.hc, samp.group=race,col.list=cols, pch.list=15); -snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular", y.label=0.2); +snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular", yaxis.kinship=FALSE); legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2); dev.off() time_elapsed(start_time); @@ -440,7 +441,7 @@ pdf(file=file_path, width=20, height=10); par(mar = c(8, 4, 4, 2)); x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n"); -text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); +text(cex=0.8, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); dev.off() time_elapsed(start_time); @@ -475,11 +476,32 @@ palette = colorRampPalette(piratepal("basel")); ggplot() + geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), fill="white", colour="#7f7f7f") + -coord_map(xlim=longitude_range_vector, ylim=latitude_range_vector) + +coord_quickmap(xlim=longitude_range_vector, ylim=latitude_range_vector) + geom_point(data=affy_metadata_data_frame, aes(x=longitude, y=latitude, group=mlg, colour=mlg), alpha=.7, size=3) + scale_color_manual(values=palette(num_colors)) + theme(legend.position="bottom") + guides(color=guide_legend(nrow=8, byrow=F)); + +# Sample MLG on a map for each region. +for (i in unique(affy_metadata_data_frame$region)) { + m <- i; + num_colors_2 = length(unique(affy_metadata_data_frame$mlg[which(affy_metadata_data_frame$region == m)])); + max_latitude_region <- max(affy_metadata_data_frame$latitude[which(affy_metadata_data_frame$region == m)],na.rm=TRUE); + min_latitude_region <- min(affy_metadata_data_frame$latitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE); + latitude_range_vector_region <- c(min_latitude_region-0.5, max_latitude_region+0.5); + max_longitude_region <- max(affy_metadata_data_frame$longitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE); + min_longitude_region <- min(affy_metadata_data_frame$longitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE); + longitude_range_vector_region <- c(min_longitude_region-0.5, max_longitude_region+0.5); + print(ggplot() + + geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), + fill="grey", colour="#7f7f7f") + + coord_quickmap(xlim=longitude_range_vector_region, ylim=latitude_range_vector_region, clip = "on") + + geom_point(data=affy_metadata_data_frame[which(affy_metadata_data_frame$region == m),], aes(x=longitude, y=latitude, + group=mlg, colour=mlg), alpha=.5, size=3) + + scale_color_manual(values=palette(num_colors_2)) + + theme(legend.position="bottom") + labs(title=paste("MLG assignments for", m)) + + guides(color=guide_legend(nrow=8, byrow=F))); +} dev.off() time_elapsed(start_time); @@ -498,12 +520,12 @@ pdf(file=file_path, width=40, height=80); # Organize branches by clade. nj_phylogeny_tree <- sample_alleles_vector %>% - aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>% + aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE, showtree = FALSE) %>% ladderize(); nj_phylogeny_tree$tip.label <- stag_db_report$user_specimen_id[match(nj_phylogeny_tree$tip.label, stag_db_report$affy_id)]; - plot.phylo(nj_phylogeny_tree, tip.color=cols[sample_alleles_vector$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T); + plot.phylo(nj_phylogeny_tree, tip.color=cols[sample_alleles_vector$pop], label.offset=0.0025, cex=0.6, font=2, lwd=4, align.tip.label=F, no.margin=T); # Add a scale bar showing 5% difference. - add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3); + add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=2); nodelabels(nj_phylogeny_tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); legend("topright", legend=c(levels(sample_alleles_vector$pop)), text.col=cols, xpd=T, cex=0.8); dev.off() @@ -543,7 +565,7 @@ col <- c("GREY", "#006DDB", "#24FF24", "#920000"); # Generate a pie chart for each sample with genotypes. for (i in 1:ncol(translated_stag_db_report_matrix)) { - tmp_labels <- paste(labels, " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep=""); + tmp_labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep=""); main <- paste("Breakdown of SNP assignments for", translated_stag_db_report_data_table[1, i]); pie(translated_stag_db_report_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); }
--- a/coral_multilocus_genotype.xml Thu Jul 18 07:58:01 2019 -0400 +++ b/coral_multilocus_genotype.xml Thu Aug 15 09:59:57 2019 -0400 @@ -56,18 +56,7 @@ </outputs> <tests> <test> - <param name="input_vcf" value="baitssnv.recode.vcf" ftype="vcf"/> - <param name="input_affy_metadata" value="affy_metadata.tabular" ftype="tabular"/> - <param name="input_pop_info" value="pop_info.txt" ftype="txt"/> - <output name="output_stag_db_report" file="output_stag_db_report.csv" ftype="csv"/> - <output_collection name="output_data_collection" type="list"> - <element name="person.tabular" file="person.tabular" ftype="tabular" compare="contains"/> - </output_collection> - <output_collection name="output_plot_collection" type="list"> - <element name="nj_phylogeny.pdf" file="nj_phylogeny.pdf" ftype="pdf" compare="contains"/> - <element name="missing_data.pdf" file="missing_data.pdf" ftype="pdf" compare="contains"/> - <element name="percent_breakdown.pdf" file="percent_breakdown.pdf" ftype="pdf" compare="contains"/> - </output_collection> + <!--Testing this tool is a bit difficult at the current time.--> </test> </tests> <help>