comparison multilocus_genotype.R @ 7:18001e7cb199 draft

Uploaded
author greg
date Wed, 28 Nov 2018 13:49:18 -0500
parents a7cce4091e80
children d2057e183772
comparison
equal deleted inserted replaced
6:a71901fd5325 7:18001e7cb199
1 #!/usr/bin/env Rscript 1 #!/usr/bin/env Rscript
2 2
3 suppressPackageStartupMessages(library("adegenet")) 3 suppressPackageStartupMessages(library("adegenet"))
4 suppressPackageStartupMessages(library("ape")) 4 suppressPackageStartupMessages(library("ape"))
5 suppressPackageStartupMessages(library("data.table"))
6 #suppressPackageStartupMessages(library("dbplyr"))
7 suppressPackageStartupMessages(library("dplyr"))
5 suppressPackageStartupMessages(library("ggplot2")) 8 suppressPackageStartupMessages(library("ggplot2"))
6 suppressPackageStartupMessages(library("knitr")) 9 suppressPackageStartupMessages(library("knitr"))
7 suppressPackageStartupMessages(library("optparse")) 10 suppressPackageStartupMessages(library("optparse"))
8 suppressPackageStartupMessages(library("poppr")) 11 suppressPackageStartupMessages(library("poppr"))
9 suppressPackageStartupMessages(library("RColorBrewer")) 12 suppressPackageStartupMessages(library("RColorBrewer"))
13 suppressPackageStartupMessages(library("RPostgres"))
14 #suppressPackageStartupMessages(library("tidyr"))
10 suppressPackageStartupMessages(library("vcfR")) 15 suppressPackageStartupMessages(library("vcfR"))
11 suppressPackageStartupMessages(library("vegan")) 16 suppressPackageStartupMessages(library("vegan"))
12 17
13 option_list <- list( 18 option_list <- list(
19 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"),
20 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"),
21 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"),
14 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), 22 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"),
15 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), 23 make_option(c("--output_mlg_id"), action="store", dest="output_mlg_id", help="Mlg Id data output file"),
16 make_option(c("--output_missing_data"), action="store", dest="output_missing_data", help="Missing data outputfile"), 24 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file")
17 make_option(c("--output_mlg_id"), action="store", dest="output_mlg_id", help="Mlg Id data outputfile")
18 ) 25 )
19 26
20 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); 27 parser <- OptionParser(usage="%prog [options] file", option_list=option_list);
21 args <- parse_args(parser, positional_arguments=TRUE); 28 args <- parse_args(parser, positional_arguments=TRUE);
22 opt <- args$options; 29 opt <- args$options;
24 get_file_path = function(file_name) { 31 get_file_path = function(file_name) {
25 file_path = paste("output_plots_dir", file_name, sep="/"); 32 file_path = paste("output_plots_dir", file_name, sep="/");
26 return(file_path); 33 return(file_path);
27 } 34 }
28 35
29 # Extract Provesti's distance from the distance matrix.
30 provesti_distance <- function(distance, selection) {
31 eval(parse(text=paste("as.matrix(distance)[", selection, "]")));
32 }
33
34 # Read in VCF input file. 36 # Read in VCF input file.
35 vcf <- read.vcfR(opt$input_vcf); 37 vcf <- read.vcfR(opt$input_vcf);
36 38
37 #Missing GT in samples submitted 39 #Missing GT in samples submitted
38 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); 40 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
39 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); 41 missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
40 myMiss <- (myMiss / nrow(vcf)) * 100; 42 missing_gt <- (missing_gt / nrow(vcf)) * 100;
41 miss <- data.frame(myMiss); 43 missing_gt_data_frame <- data.frame(missing_gt);
42 write.table(miss, file=opt$output_missing_data, quote=FALSE); 44
43 45 hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} );
44 # Convert VCF file into formats compatiable with the Poppr package. 46 hets <- (hets / nrow(vcf)) * 100;
45 genind <- vcfR2genind(vcf); 47 ht <- data.frame(hets);
48
49 refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} );
50 refA <- (refA / nrow(vcf)) * 100;
51 rA <- data.frame(refA);
52
53 altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} );
54 altB <- (altB / nrow(vcf)) * 100;
55 aB <- data.frame(altB);
56
57 # Convert VCF file into a genind for the Poppr package.
58 # TODO: probably should not hard-code 2 cores.
59 gl <- vcfR2genlight(vcf, n.cores=2);
60 genind <- new("genind", (as.matrix(gl)));
61
46 # Add population information to the genind object. 62 # Add population information to the genind object.
47 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings = c("", "NA")); 63 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings=c("", "NA"));
48 genind@pop <- as.factor(poptab$region); 64 genind@pop <- as.factor(poptab$region);
49 # Convert genind to genclone object 65
66 # Convert genind object to a genclone object.
50 gclo <- as.genclone(genind); 67 gclo <- as.genclone(genind);
51 # Calculate the bitwise distance between individuals, 68
52 # the following is similar to Provesti's distance. 69 # Calculate the bitwise distance between individuals.
53 xdis <- bitwise.dist(gclo, missing_match=FALSE); 70 xdis <- bitwise.dist(gclo);
54 71
55 # Multilocus genotypes (threshold of 1.6%). 72 # Multilocus genotypes (threshold of 1%).
56 mlg.filter(gclo, distance=xdis) <- 0.016; 73 mlg.filter(gclo, distance=xdis) <- 0.01;
57 # Start PDF device driver.
58 dev.new(width=10, height=7);
59 file_path = get_file_path("mlg_table.pdf");
60 pdf(file=file_path, width=10, height=7);
61 m <- mlg.table(gclo, background=TRUE, color=TRUE); 74 m <- mlg.table(gclo, background=TRUE, color=TRUE);
62 dev.off();
63 75
64 # Create table of MLGs. 76 # Create table of MLGs.
65 id <- mlg.id(gclo); 77 id <- mlg.id(gclo);
66 df <- data.frame(matrix((id), byrow=T)); 78 dt <- data.table(id, keep.rownames=TRUE);
67 write.table(df, file=opt$output_mlg_id); 79 setnames(dt, c("id"), c("user_specimen_id"));
80
81 # Read user's Affymetrix 96 well plate csv file.
82 pinfo <- read.csv(opt$input_affy_metadata, stringsAsFactors=FALSE);
83 pinfo <- pinfo$user_specimen_id;
84 pi <- data.table(pinfo);
85 setnames(pi, c("pinfo"), c("user_specimen_id"));
86
87 # Instantiate database connection.
88 # The connection string has this format:
89 # postgresql://user:password@host/dbname
90 conn_string <- opt$database_connection_string;
91 conn_items <- strsplit(conn_string, "://")[[1]];
92 string_needed <- conn_items[1];
93 items_needed <- strsplit(string_needed, "@")[[1]];
94 user_pass_string <- items_needed[1];
95 host_dbname_string <- items_needed[2];
96 user_pass_items <- strsplit(user_pass_string, ":")[[1]];
97 host_dbname_items <- strsplit(host_dbname_string, "/")[[1]];
98 user <- user_pass_items[1];
99 pass <- user_pass_items[2];
100 host <- host_dbname_items[1];
101 dbname <- host_dbname_items[2];
102 # FIXME: is there a way to not hard-code the port?
103 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port='5432', dbname=dbname, user=user, password=pass);
104
105 # Import the sample table.
106 sample_table <- tbl(conn, "sample");
107
108 # Select user_specimen_id and mlg columns.
109 smlg <- sample_table %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id);
110
111 # Convert to dataframe.
112 sm <- data.frame(smlg);
113 sm[sm==""] <- NA;
114
115 # Convert missing data into data table.
116 mi <-setDT(missing_gt_data_frame, keep.rownames=TRUE)[];
117 # Change names to match db.
118 setnames(mi, c("rn"), c("user_specimen_id"));
119 setnames(mi, c("myMiss"), c("percent_missing_data_coral"));
120 # Round missing data to two digits.
121 mi$percent_missing <- round(mi$percent_missing, digits=2);
122
123 # Convert heterozygosity data into data table.
124 ht <-setDT(ht, keep.rownames=TRUE)[];
125 # Change names to match db.
126 setnames(ht, c("rn"), c("user_specimen_id"));
127 setnames(ht, c("hets"), c("percent_mixed_coral"));
128 # Round missing data to two digits.
129 ht$percent_mixed<-round(ht$percent_mixed, digits=2);
130
131 # Convert refA data into data.table.
132 rA <-setDT(rA, keep.rownames=TRUE)[];
133 # Change names to match db.
134 setnames(rA, c("rn"), c("user_specimen_id"));
135 setnames(rA, c("refA"), c("percent_reference_coral"));
136 # round missing data to two digits.
137 rA$percent_reference<-round(rA$percent_reference, digits=2);
138
139 # Convert altB data into data table.
140 aB <-setDT(aB, keep.rownames=TRUE)[];
141 # Change names to match db.
142 setnames(aB, c("rn"), c("user_specimen_id"));
143 setnames(aB, c("altB"), c("percent_alternative_coral"));
144 # Round missing data to two digits.
145 aB$percent_alternative<-round(aB$percent_alternative, digits=2);
146
147 #convert mlg id to data.table format
148 dt <- data.table(id, keep.rownames=TRUE);
149 # Change name to match db.
150 setnames(dt, c("id"), c("user_specimen_id"));
151
152 # Transform.
153 df3 <- dt %>%
154 group_by(row_number()) %>%
155 dplyr::rename(group='row_number()') %>%
156 unnest (user_specimen_id) %>%
157 # Join with mlg table.
158 left_join(sm %>%
159 select("user_specimen_id","coral_mlg_clonal_id"), by='user_specimen_id');
160
161 # If found in database, group members on previous mlg id.
162 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]);
163 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),];
164 na.mlg <- which(is.na(df3$coral_mlg_clonal_id));
165 na.group <- df3$group[na.mlg];
166 df3$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)];
167
168 # Determine if the sample mlg matched previous genotyped sample.
169 df4<- df3 %>%
170 group_by(group) %>%
171 mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match","match"));
172
173 # Create new mlg id for samples that did not match those in the database.
174 none <- unique(df4[c("group", "coral_mlg_clonal_id")]);
175 none <- none[is.na(none$coral_mlg_clonal_id),];
176 na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id));
177 n.g <- df4$group[na.mlg2];
178 ct <- length(unique(n.g));
179
180 # List of new group ids, the sequence starts at the number of
181 # ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if
182 # the df4 file contains all ids. If it doesn't then look below
183 # to change the seq() function.
184 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
185 # This is a key for pairing group with new ids.
186 rat <- cbind(unique(n.g), n.g_ids);
187 # this for loop assigns the new id iteratively for all that have NA.
188 for (i in 1:length(na.mlg2)) {
189 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))];
190 }
191
192 # Merge data frames for final table.
193 report_user <- pi %>%
194 # Join with the second file (only the first and third column).
195 left_join(df4 %>%
196 select("user_specimen_id","coral_mlg_clonal_id","DB_match"),
197 by='user_specimen_id') %>%
198 # Join with the second file (only the first and third column).
199 left_join(mi %>%
200 select("user_specimen_id","percent_missing_coral"),
201 by='user_specimen_id') %>%
202 # Join with the second file (only the first and third column).
203 left_join(ht %>%
204 select("user_specimen_id","percent_mixed_coral"),
205 by='user_specimen_id') %>%
206 # Join with the second file (only the first and third column);
207 left_join(rA %>%
208 select("user_specimen_id","percent_reference_coral"),
209 by='user_specimen_id') %>%
210 # Join with the second file (only the first and third column).
211 left_join(aB %>%
212 select("user_specimen_id","percent_alternative_coral"),
213 by='user_specimen_id') %>%
214 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>%
215 mutate(coral_mlg_clonal_id=ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id))%>%
216 ungroup() %>%
217 select(-group);
218
219 write.csv(report_user, file=paste(opt$output_stag_db_report), quote=FALSE);
68 220
69 # Rarifaction curve. 221 # Rarifaction curve.
70 H.obj <- mlg.table(gclo, plot=TRUE);
71 # Start PDF device driver. 222 # Start PDF device driver.
72 dev.new(width=10, height=7); 223 dev.new(width=10, height=7);
73 file_path = get_file_path("geno_rarifaction_curve.pdf"); 224 file_path = get_file_path("geno_rarifaction_curve.pdf");
74 pdf(file=file_path, width=10, height=7); 225 pdf(file=file_path, width=10, height=7);
75 rarecurve(H.obj, ylab="Number of expected MLGs", sample=min(rowSums(H.obj)), border=NA, fill=NA, font=2, cex=1, col="blue"); 226 rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue");
76 dev.off() 227 dev.off();
228
229 # Genotype accumulation curve, sample is number of
230 # loci randomly selected for to make the curve.
231 dev.new(width=10, height=7);
232 file_path = get_file_path("geno_accumulation_curve.pdf");
233 pdf(file=file_path, width=10, height=7);
234 genotype_curve(gind, sample=5, quiet=TRUE);
235 dev.off();
77 236
78 # Create a phylogeny of samples based on distance matrices. 237 # Create a phylogeny of samples based on distance matrices.
79 cols <- palette(brewer.pal(n=12, name='Set3')); 238 cols <- palette(brewer.pal(n=12, name='Set3'));
80 set.seed(999); 239 set.seed(999);
81 # Start PDF device driver. 240 # Start PDF device driver.
82 dev.new(width=10, height=7); 241 dev.new(width=10, height=7);
83 file_path = get_file_path("nj_phylogeny.pdf"); 242 file_path = get_file_path("nj_phylogeny.pdf");
84 pdf(file=file_path, width=10, height=7); 243 pdf(file=file_path, width=10, height=7);
85 # Organize branches by clade. 244 # Organize branches by clade.
86 tree <- gclo %>% aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% ladderize(); 245 tree <- gclo %>%
246 aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>%
247 ladderize();
87 plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4); 248 plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4);
88 # Add a scale bar showing 5% difference.. 249 # Add a scale bar showing 5% difference..
89 add.scale.bar(length=0.05, cex=0.65); 250 add.scale.bar(length=0.05, cex=0.65);
90 nodelabels(tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); 251 nodelabels(tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
91 dev.off() 252 dev.off();
92 253