data2.R @ 0:56d5be6c03b9 draft

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/topic/dada2/tools/dada2 commit d63c84012410608b3b5d23e130f0beff475ce1f8-dirty
author matthias
date Fri, 08 Mar 2019 06:30:11 -0500
library(dada2)

## DBI, ggplot2 and RSQLite are required below (dbConnect/dbWriteTable and ggsave)
library(DBI)
library(ggplot2)
library(optparse)
library(RSQLite)
# library(stringr)

## source required R functions
source('user_input_functions.R')

# print dada2 version
print(paste("dada2 version: ", packageVersion("dada2")))

# R function to create a fasta file from the dada2 output
# outdir is the directory to write the fasta file to
# seqtab.nochim is the chimera-free sequence table generated by dada2
# prefix is the string prepended to the output file names

dada2fasta <- function(outdir, seqtab.nochim, prefix){
  seq <- colnames(seqtab.nochim)
  n <- 0
  ASVs <- c()
  for(i in seq){
    n <- n + 1
    ASV <- paste('ASV', as.character(n), sep = '_')
    ASVs <- c(ASVs, ASV)
    line1 <- paste('>', ASV, sep = '')
    write(line1, file.path(outdir, sprintf('%s.fasta', prefix)), append = TRUE)
    write(i, file.path(outdir, sprintf('%s.fasta', prefix)), append = TRUE)
  }
  return(ASVs)
}
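
## Illustrative usage of dada2fasta (hypothetical directory and prefix; assumes a
## chimera-free sequence table seqtab.nochim is in scope): writes two-line records like
##   >ASV_1
##   ACGTACGT...
## to results/run1.fasta and returns the ASV identifiers in column order.
# ASVs <- dada2fasta('results', seqtab.nochim, 'run1')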


# R DADA2 workflow
# wd is the path to the fastq files
# r_path is the path to user_input_functions.R
# outdir is the path to the output directory
# prefix is the string prepended to the output file names

dada2run <- function(wd, r_path, outdir, prefix){

  # read in files -------------------------------------------------------
  ## obtain vectors of forward and reverse reads based on '_1' and '_2' in the file names,
  ## along with the corresponding sample names
  p1 <- c()
  p2 <- c()
  sample.names <- c()
  for(f in list.files(wd, full.names = TRUE)){
    if(grepl('_1\\.fq', f)){
      sample <- gsub('^.*[/](.*?)_1\\.fq\\.gz', '\\1', f)
      sample.names <- c(sample.names, sample)
      p1 <- c(p1, f)
    }
    if(grepl('_2\\.fq', f)){
      p2 <- c(p2, f)
    }
  }
  fnFs <- sort(p1)
  fnRs <- sort(p2)
  sample.names <- sort(sample.names)
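
  ## worked example of the extraction above on a hypothetical path:
  ## gsub('^.*[/](.*?)_1\\.fq\\.gz', '\\1', '/data/run1/sampleA_1.fq.gz') yields 'sampleA'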

  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_test.Rdata')))
  #load(file = file.path(outdir, paste0(prefix, 'state_test.Rdata')))

  ## print for review
  to_view <- data.frame(sample.names, fnFs, fnRs)
  cat("The following fastq files were found:\n")
  print(to_view)

  # Perform quality filtering and trimming ---------------------------------
  ## assign new names to the filtered samples
  filtFs <- file.path(outdir, paste0(sample.names, 'F_filt.fastq.gz'))
  filtRs <- file.path(outdir, paste0(sample.names, 'R_filt.fastq.gz'))

  ## plot forward and reverse quality so that the user can decide on filtering parameters
  cat('Plotting quality profile of forward reads...\n')
  Fqp1 <- plotQualityProfile(fnFs[1])
  #print(Fqp1)
  ggsave(sprintf('%s_forward_1_qualityprofile.pdf', prefix), Fqp1, path = outdir, width = 20, height = 15, units = "cm")
  #ggsave(sprintf('%s_forward_1_qualityprofile.emf', prefix), Fqp1, path = outdir, width = 20, height = 15, units = "cm")
  Fqp2 <- plotQualityProfile(fnFs[2])
  #print(Fqp2)
  ggsave(sprintf('%s_forward_2_qualityprofile.pdf', prefix), Fqp2, path = outdir, width = 20, height = 15, units = "cm")
  #ggsave(sprintf('%s_forward_2_qualityprofile.emf', prefix), Fqp2, path = outdir, width = 20, height = 15, units = "cm")
  #cat('Which position would you like to truncate the forward reads at?\nPlease use the red-dashed lines as a guide; where they stop appearing indicates good quality.\nNOTE: Do NOT over-trim! You still need overlap between your forward and reverse reads in order to merge them later!\n')
  len1 <- 240
  cat('Plotting quality profile of reverse reads...\n')
  Rqp1 <- plotQualityProfile(fnRs[1])
  #print(Rqp1)
  ggsave(sprintf('%s_reverse_1_qualityprofile.pdf', prefix), Rqp1, path = outdir, width = 20, height = 15, units = "cm")
  #ggsave(sprintf('%s_reverse_1_qualityprofile.emf', prefix), Rqp1, path = outdir, width = 20, height = 15, units = "cm")
  Rqp2 <- plotQualityProfile(fnRs[2])
  #print(Rqp2)
  ggsave(sprintf('%s_reverse_2_qualityprofile.pdf', prefix), Rqp2, path = outdir, width = 20, height = 15, units = "cm")
  #ggsave(sprintf('%s_reverse_2_qualityprofile.emf', prefix), Rqp2, path = outdir, width = 20, height = 15, units = "cm")
  #cat('Which position would you like to truncate the reverse reads at?\nPlease use the red-dashed lines as a guide; where they stop appearing indicates good quality.\nNOTE: Do NOT over-trim! You still need overlap between your forward and reverse reads in order to merge them later!\n')
  len2 <- 240

  ## filter and trim
  ## remaining parameters set to recommended defaults
  ## maxN must be set to 0 (DADA2 requires no Ns)
  ## maxEE sets the maximum number of "expected errors" allowed in a read,
  ## which is a better filter than simply averaging quality scores
  ## If not using Windows, you may set multithread to TRUE
  ## NOTE: do not use the truncLen parameter for ITS sequencing
  ## trimLeft must be adjusted to the length of your primers (it is used to trim the primer sequences)!
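
  ## For reference, a sketch of what maxEE measures (not part of the original
  ## script; q is a hypothetical vector of per-base quality scores of one read):
  ## expected errors EE = sum(10^(-q/10)), e.g. a 240 nt read at a uniform Q30
  ## carries 240 * 10^-3 = 0.24 expected errors, well under maxEE = 2.
  # ee <- sum(10^(-q / 10))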
  cat('Filtering and trimming sequences...\n')
  out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen = c(len1, len2), maxN = 0, maxEE = c(2, 2), truncQ = 2, rm.phix = TRUE, compress = TRUE, multithread = threads, trimLeft = 15)

  ## have the user review the read count changes, and relax the error rate if too many reads are lost
  ## for example, you may especially want to relax the number of expected errors on the reverse reads
  ## (i.e. maxEE = c(2, 5)), as the reverse reads are prone to error on the Illumina sequencing platform
  print(head(out))
  ## interactive prompts are disabled: check2 is preset to TRUE, so this loop is skipped
  check2 <- TRUE
  while(check2 == FALSE){
    maxF <- numeric_input("What would you like the maximum number of expected errors in the forward reads to be?\nDefault 2:", 2)
    maxR <- numeric_input("What would you like the maximum number of expected errors in the reverse reads to be?\nDefault 5:", 5)
    out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen = c(len1, len2), maxN = 0, maxEE = c(maxF, maxR), truncQ = 2, rm.phix = TRUE, compress = TRUE, multithread = threads, trimLeft = 15)
    print(head(out))
    check2 <- yn_input('Proceed? If you lost too many reads, you can choose not to proceed and you will have the option to relax the error rate. Default yes:', TRUE)
  }

  # Have DADA2 learn the error rates -----------------------------------------------
  ## If not using Windows, you may set multithread to TRUE
  read.subset <- 1e6
  cat('Learning error rate of forward reads...\n')
  errF <- learnErrors(filtFs, nreads = read.subset, multithread = threads)

  ## have the user check the estimated error rates
  ## note the calculations are done on a subset of the total reads, as they are computationally intensive
  ## if the fit is poor, the user has the option to try again with a larger subset of reads
  Error_f <- plotErrors(errF, nominalQ = TRUE)
  #print(Error_f)
  ggsave(sprintf('%s_forward_Error_plot.pdf', prefix), Error_f, path = outdir, width = 20, height = 15, units = "cm")
  #ggsave(sprintf('%s_forward_Error_plot.emf', prefix), Error_f, path = outdir, width = 20, height = 15, units = "cm")
  ## interactive prompts are disabled: check3a is preset to TRUE, so this loop is skipped
  check3a <- TRUE
  while(check3a == FALSE){
    read.subset <- numeric_input('Please specify the number of reads you would like dada2 to utilize to calculate the error rate.\nThe default previously used was 1e6.\nThe newly recommended default is 10-fold greater,\n1e7:', 1e7)
    errF <- learnErrors(filtFs, nreads = read.subset, multithread = threads)
    ## re-plot from the re-learned error rates before saving
    Error_f <- plotErrors(errF, nominalQ = TRUE)
    print(Error_f)
    ggsave(sprintf('%s_forward_Error_plot.pdf', prefix), Error_f, path = outdir, width = 20, height = 15, units = "cm")
    #ggsave(sprintf('%s_forward_Error_plot.emf', prefix), Error_f, path = outdir, width = 20, height = 15, units = "cm")
    check3a <- yn_input('Proceed?\nThe estimated error rate (black line) should fit the observed error rates for each consensus quality score (black points).\nAdditionally, the error rates expected under the nominal definition of the Q-value (quality score) should decrease as the quality score increases (or flat-line).\nIf you do not have a good fit, you may want dada2 to re-learn the error rates from a larger subset of reads.\nA subset of reads is always used, as the algorithm is computationally intensive.\nDefault yes:', TRUE)
  }


  ## same for the reverse reads
  cat('Learning error rate of reverse reads...\n')
  errR <- learnErrors(filtRs, nreads = read.subset, multithread = threads)
  Error_r <- plotErrors(errR, nominalQ = TRUE)
  #print(Error_r)
  ggsave(sprintf('%s_reverse_Error_plot.pdf', prefix), Error_r, path = outdir, width = 20, height = 15, units = "cm")
  #ggsave(sprintf('%s_reverse_Error_plot.emf', prefix), Error_r, path = outdir, width = 20, height = 15, units = "cm")
  ## interactive prompts are disabled: check3b is preset to TRUE, so this loop is skipped
  check3b <- TRUE
  while(check3b == FALSE){
    read.subset <- numeric_input('Please specify the number of reads you would like dada2 to utilize to calculate the error rate.\nThe default previously used was 1e6.\nThe newly recommended default is 10-fold greater,\n1e7:', 1e7)
    errR <- learnErrors(filtRs, nreads = read.subset, multithread = threads)
    ## re-plot from the re-learned error rates before saving
    Error_r <- plotErrors(errR, nominalQ = TRUE)
    print(Error_r)
    ggsave(sprintf('%s_reverse_Error_plot.pdf', prefix), Error_r, path = outdir, width = 20, height = 15, units = "cm")
    #ggsave(sprintf('%s_reverse_Error_plot.emf', prefix), Error_r, path = outdir, width = 20, height = 15, units = "cm")
    check3b <- yn_input('Proceed?\nThe estimated error rate (black line) should fit the observed error rates for each consensus quality score (black points).\nAdditionally, the error rates expected under the nominal definition of the Q-value (quality score) should decrease as the quality score increases (or flat-line).\nIf you do not have a good fit, you may want dada2 to re-learn the error rates from a larger subset of reads.\nA subset of reads is always used, as the algorithm is computationally intensive.\nDefault yes:', TRUE)
  }

  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_learning.Rdata')))
  #load(file = file.path(outdir, paste0(prefix, 'state_post_learning.Rdata')))

  # Dereplicate sequences to generate unique-sequence fastq files with corresponding count tables -------------------------
  ## NOTE: if your dataset is huge, you may run out of RAM. Please see https://benjjneb.github.io/dada2/bigdata.html for details.
  cat('Dereplicating forward reads...\n')
  derepFs <- derepFastq(filtFs, verbose = TRUE)
  cat('Dereplicating reverse reads...\n')
  derepRs <- derepFastq(filtRs, verbose = TRUE)

  ## name the derep-class objects by sample names
  names(derepFs) <- sample.names
  names(derepRs) <- sample.names

  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_derep.Rdata')))
  #load(file = file.path(outdir, paste0(prefix, 'state_post_derep.Rdata')))
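
  ## Minimal per-sample sketch for datasets too large to dereplicate in one go
  ## (adapted from the big-data workflow linked above; illustrative only, the
  ## per-sample results would still need to be collected into a sequence table):
  # for(i in seq_along(filtFs)){
  #   derep.i <- derepFastq(filtFs[i])
  #   dada.i <- dada(derep.i, err = errF, multithread = threads)
  # }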

  # Infer sequence variants using the learned error rates ---------------------------------
  ## If not using Windows, you may set multithread to TRUE
  ## NOTE: if your dataset is huge, you may run out of RAM. Please see https://benjjneb.github.io/dada2/bigdata.html for details.
  ## NOTE2: you can use DADA2 for 454 or IonTorrent data as well. Please see https://benjjneb.github.io/dada2/tutorial.html.
  s.pool <- FALSE
  cat('Inferring sequence variants of forward reads...\n')
  dadaFs <- dada(derepFs, err = errF, pool = s.pool, multithread = threads)

  ## have the user inspect the detected number of sequence variants, to ensure it is plausible in the biological context of the samples
  ## at low sampling depths you may not want to process each sample independently (the default) but set pool = TRUE;
  ## this gives better results at increased computation time, and the user gets that option if the number of variants makes no sense
  print(dadaFs[[1]])
  ## interactive prompts are disabled: check4 is preset to TRUE, so this branch is skipped
  check4 <- TRUE
  if(check4 == FALSE){
    s.pool <- TRUE
    dadaFs <- dada(derepFs, err = errF, pool = s.pool, multithread = threads)
    print(dadaFs[[1]])
    cat('Hopefully, these results make more sense.\nOtherwise, there is not much more you can do except start over!\n')
    check <- yn_input('Proceed? Default yes, no to quit:', TRUE)
    if(check == FALSE){
      stop()
    }
  }
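
  ## If per-sample inference looks implausible at low sampling depths, pooled or
  ## pseudo-pooled inference can be swapped in (slower, but shares evidence
  ## across samples); illustrative alternatives:
  # dadaFs <- dada(derepFs, err = errF, pool = TRUE, multithread = threads)
  # dadaFs <- dada(derepFs, err = errF, pool = "pseudo", multithread = threads)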

  ## same for the reverse reads; no need to re-check, as the pooling must stay consistent between forward and reverse
  cat('Inferring sequence variants of reverse reads...\n')
  dadaRs <- dada(derepRs, err = errR, pool = s.pool, multithread = threads)

  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_dada.Rdata')))
  #load(file = file.path(outdir, paste0(prefix, 'state_post_dada.Rdata')))


  # Merge forward and reverse reads -------------------------------------------------
  cat('Merging paired-end reads...\n')
  mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose = TRUE)
  #cat('Most of your reads should have been retained (i.e. were able to merge, see above).\nOtherwise, there is not much more you can do except start over (did you over-trim your sequences?)!\n')
  ## interactive prompts are disabled: check is preset to TRUE, so this branch is skipped
  check <- TRUE
  if(check == FALSE){
    stop()
  }

  # Construct sequence table -------------------------------------------------------
  cat('Constructing sequence table...\n')
  seqtab <- makeSequenceTable(mergers)

  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_merge.Rdata')))
  #load(file = file.path(outdir, paste0(prefix, 'state_post_merge.Rdata')))


  ## inspect the distribution of sequence lengths
  ## give the user the option to filter out overly long or short sequences
  cat('Sequence length distribution listed below:\n')
  print(table(nchar(getSequences(seqtab))))
  ## interactive prompts are disabled: check5 is preset to TRUE, so this branch is skipped
  check5 <- TRUE
  if(check5 == FALSE){
    min.cutoff <- numeric_input('Please input desired minimum length of sequences:', NULL)
    max.cutoff <- numeric_input('Please input desired maximum length of sequences:', NULL)
    seqtab <- seqtab[, nchar(colnames(seqtab)) %in% seq(min.cutoff, max.cutoff)]
  }

  # Remove chimeras ------------------------------------------------------------------
  ## If not using Windows, you may set multithread to TRUE
  cat('Removing chimeras...\n')
  seqtab.nochim <- removeBimeraDenovo(seqtab, method = 'consensus', multithread = threads, verbose = TRUE)

  ## display the percentage of reads retained after chimera removal
  ## the loss should be small (< 5%); otherwise some processing parameters need to be revisited
  percent.nochim <- (sum(seqtab.nochim) / sum(seqtab)) * 100
  cat(paste0(round(percent.nochim, 2), '% of reads retained after chimera removal.\n'))
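
  ## worked example of the figure above (hypothetical counts): with sum(seqtab) = 1e6
  ## and sum(seqtab.nochim) = 9.6e5, 96% of reads are retained; note that many chimeric
  ## variants can be removed while only a small fraction of reads is lost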
  #cat('Most of your reads should have been retained.\nOtherwise, there is not much more you can do except start over!\n')
  ## interactive prompts are disabled: check is preset to TRUE, so this branch is skipped
  check <- TRUE
  if(check == FALSE){
    stop()
  }

  # Final sanity check --------------------------------------------------------------
  ## track reads removed throughout the pipeline
  ## if processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
  getN <- function(x) sum(getUniques(x))
  track <- cbind(out, sapply(dadaFs, getN), sapply(mergers, getN), rowSums(seqtab), rowSums(seqtab.nochim))
  colnames(track) <- c("input", "filtered", "denoised", "merged", "tabled", "nonchim")
  rownames(track) <- sample.names
  print(head(track))
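
  ## optional per-sample retention summary (not in the original script): the
  ## percentage of input reads surviving the whole pipeline, per sample
  # track.pct <- round(100 * track[, 'nonchim'] / track[, 'input'], 1)
  # print(track.pct)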
  #cat('Most of your reads should have been retained.\nOtherwise, there is not much more you can do except start over!\n')
  ## interactive prompts are disabled: check is preset to TRUE, so this branch is skipped
  check <- TRUE
  if(check == FALSE){
    stop()
  }

  write.csv(track, file = file.path(outdir, sprintf('%s_read_count-quality_control.csv', prefix)))

  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_chimera.Rdata')))
  #load(file = file.path(outdir, paste0(prefix, 'state_post_chimera.Rdata')))

  # Assign taxonomy -----------------------------------------------------------------
  ## requires the SILVA database files
  ## If not using Windows, you may set multithread to TRUE
  ## the minimum bootstrap should be 80, but for sequence lengths <= 250 the minimum bootstrap is set to 50 (which is also the default)

  ## SILVA
  cat('Assigning taxonomy to genus level using SILVA...\n')
  taxa_silva <- assignTaxonomy(seqtab.nochim, file.path(wd, "silva_nr_v132_train_set.fa.gz"), multithread = threads, minBoot = 80, tryRC = TRUE)
  cat('Assigning taxonomy at species level using SILVA...\n')
  taxa_silva <- addSpecies(taxa_silva, file.path(wd, "silva_species_assignment_v132.fa.gz"), allowMultiple = TRUE, tryRC = TRUE)
  write.csv(taxa_silva, file = file.path(outdir, sprintf('%s_taxonomy_silva.csv', prefix)))
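
  ## per the bootstrap note above, for amplicons of <= 250 nt a lower cutoff is
  ## appropriate; illustrative alternative call:
  # taxa_silva <- assignTaxonomy(seqtab.nochim, file.path(wd, "silva_nr_v132_train_set.fa.gz"), multithread = threads, minBoot = 50, tryRC = TRUE)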

  ## RDP - used for copy-number correction
  cat('Assigning taxonomy to genus level using RDP...\n')
  taxa_rdp <- assignTaxonomy(seqtab.nochim, file.path(wd, "rdp_train_set_16.fa.gz"), multithread = threads, minBoot = 80, tryRC = TRUE)
  cat('Assigning taxonomy at species level using RDP...\n')
  taxa_rdp <- addSpecies(taxa_rdp, file.path(wd, "rdp_species_assignment_16.fa.gz"), allowMultiple = TRUE, tryRC = TRUE)
  write.csv(taxa_rdp, file = file.path(outdir, sprintf('%s_taxonomy_rdp.csv', prefix)))

  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_tax.Rdata')))
  #load(file = file.path(outdir, paste0(prefix, 'state_post_tax.Rdata')))


  # Return data ----------------------------------------------------------------------
  cat('Returning data...\n')
  ## create the fasta file
  ASVs <- dada2fasta(outdir, seqtab.nochim, prefix)
  ## create a master dataframe for each classification

  ## assign ASV names to the count table
  sequences <- colnames(seqtab.nochim)
  colnames(seqtab.nochim) <- ASVs
  seqtab.nochim <- t(seqtab.nochim)

  ## SILVA
  taxa_silva <- taxa_silva[match(sequences, rownames(taxa_silva)), ]
  rownames(taxa_silva) <- ASVs
  d <- merge(seqtab.nochim, taxa_silva, by = 'row.names')
  colnames(d)[1] <- 'ASV'
  ## create a database of all information
  db <- dbConnect(RSQLite::SQLite(), file.path(outdir, sprintf('%s.sqlite', prefix)))
  dbWriteTable(db, 'dada2_results_silva', d)
  ## write the master dataframe for the user
  write.table(d, file.path(outdir, sprintf('%s_dada2_results_silva.txt', prefix)), sep = '\t', quote = FALSE, row.names = FALSE)

  ## RDP
  taxa_rdp <- taxa_rdp[match(sequences, rownames(taxa_rdp)), ]
  rownames(taxa_rdp) <- ASVs
  d <- merge(seqtab.nochim, taxa_rdp, by = 'row.names')
  colnames(d)[1] <- 'ASV'
  ## add to the database of all information
  dbWriteTable(db, 'dada2_results_rdp', d)
  dbDisconnect(db)
  ## write the master dataframe for the user, and return it
  write.table(d, file.path(outdir, sprintf('%s_dada2_results_rdp.txt', prefix)), sep = '\t', quote = FALSE, row.names = FALSE)
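
  ## illustrative follow-up (not executed here): the SQLite file written above
  ## can be queried later with DBI, e.g.
  # db2 <- dbConnect(RSQLite::SQLite(), file.path(outdir, sprintf('%s.sqlite', prefix)))
  # head(dbGetQuery(db2, 'SELECT * FROM dada2_results_silva'))
  # dbDisconnect(db2)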

  cat('DADA2 processing completed!\n')
  return(d)
}


option_list <- list(
  make_option(c("-t", "--threads"), type = "integer", default = 1,
              help = "number of threads to use", metavar = "THREADS")
)

opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

print(opt)
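
## example invocation (assuming the script is saved as data2.R):
# Rscript data2.R --threads 4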

## use NSLOTS from the scheduler environment if set, otherwise fall back to the --threads option
threads <- as.integer(Sys.getenv("NSLOTS", as.character(opt$threads)))

## halt here for now; R has no exit(), so use quit() to stop before the hardcoded test invocation below
quit(status = 1)



dada2run(wd = '/work/haange/Leaky_gut/', r_path = '/work/haange/Leaky_gut', outdir = '/work/haange/Leaky_gut/results', prefix = 'Leaky_gut')