Mercurial > repos > matthias > dada2_learnerrors

Binary file dada2.pdf has changed
--- a/dada2_learnErrors.xml	Tue May 28 12:45:55 2019 -0400
+++ b/dada2_learnErrors.xml	Thu Aug 29 09:01:22 2019 -0400
@@ -1,9 +1,10 @@
-<tool id="dada2_learnErrors" name="dada2: learnErrors" version="@DADA2_VERSION@+galaxy@WRAPPER_VERSION@">
+<tool id="dada2_learnErrors" name="dada2: learnErrors" version="@DADA2_VERSION@+galaxy@WRAPPER_VERSION@" profile="19.09">
     <description>Learn Error rates</description>
     <macros>
         <import>macros.xml</import>
     </macros>
     <expand macro="requirements"/>
+    <expand macro="stdio"/>
     <expand macro="version_command"/>
     <command detect_errors="exit_code"><![CDATA[
     Rscript '$dada2_script' \${GALAXY_SLOTS:-1}
--- a/data2.R	Tue May 28 12:45:55 2019 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,352 +0,0 @@
-library(dada2)
-
-# library(DBI)
-# library(ggplot2)
-library(optparse)
-# library(RSQLite)
-# library(stringr)
-
-## source required R functions
-source('user_input_functions.R')
-
-# print dada2 version
-print(paste("dada2 version: ", packageVersion("dada2")))
-
-# # R function to create fasta file from dada2 output data
-# outdir is directory to output fasta file
-# taxa is taxonomy file generated by dada2
-# prefix is string for desired prefix attached to output file names
-
-dada2fasta <- function(outdir, seqtab.nochim, prefix){
-  seq <- colnames(seqtab.nochim)
-  n <- 0
-  ASVs <- c()
-  for(i in seq){
-    n <- n + 1
-    ASV <- paste('ASV', as.character(n), sep = '_')
-    ASVs <- c(ASVs, ASV)
-    line1 <- paste('>',ASV,sep='')
-    write(line1, file.path(outdir,sprintf('%s.fasta',prefix)), append=T)
-    write(i, file.path(outdir,sprintf('%s.fasta',prefix)), append=T)
-  }
-  return(ASVs)
-}
-
-
-# # R DADA2 workflow
-# wd is path to fastq files
-# r_path is path to user_input_functions.R
-# outdir is path to output directory
-# prefix is string for desired prefix attached to output file names
-
-dada2run <- function(wd, r_path, outdir, prefix){
-
-  # read-in files-------------------------------------------------------
-  ## obtain vectors of forward and reverse reads based on 'R1' and 'R2' in file names
-  ## additionally obtain the coressponding sample names for these files
-  p1 <- c()
-  p2 <- c()
-  sample.names <- c()
-  for(f in list.files(wd, full.names=T)){
-    if(grepl('_1.fq', f)){
-      sample <- gsub('^.*[/](.*?)_1\\.fq\\.gz', '\\1', f)
-      sample.names <- c(sample.names, sample)
-      p1 <- c(p1, f)
-    }
-    if(grepl('_2.fq', f)){
-      p2 <- c(p2, f)
-    }
-  }
-  fnFs <- sort(p1)
-  fnRs <- sort(p2)
-  sample.names <- sort(sample.names)
-
-  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_test.Rdata')))
-  #load(file = file.path(outdir, paste0(prefix, 'state_test.Rdata')))
-
-  ## print for review
-  to_view <- data.frame(sample.names, fnFs, fnRs)
-  cat("The following fastq files were found:\n")
-  print(to_view)
-
-  # Perform quality filtering and trimming---------------------------------
-  ## assign new names to samples
-  filtFs <- file.path(outdir, paste0(sample.names, 'F_filt.fastq.gz'))
-  filtRs <- file.path(outdir, paste0(sample.names, 'R_filt.fastq.gz'))
-
-  ## plot forward and reverse quality so that user can decide on filtering parameters
-  cat('Plotting quality profile of forward reads...\n')
-  Fqp1 <- plotQualityProfile(fnFs[1])
-  #print(Fqp1)
-  ggsave(sprintf('%s_forward_1_qualityprofile.pdf',prefix), Fqp1, path = outdir, width = 20,height = 15,units = c("cm"))
-  #ggsave(sprintf('%s_forward_1_qualityprofile.emf',prefix), Fqp1, path = outdir, width = 20,height = 15,units = c("cm"))
-  Fqp2 <- plotQualityProfile(fnFs[2])
-  #print(Fqp2)
-  ggsave(sprintf('%s_forward_2_qualityprofile.pdf',prefix),Fqp2, path = outdir, width = 20,height = 15,units = c("cm"))
-  #ggsave(sprintf('%s_forward_2_qualityprofile.emf',prefix), Fqp2, path = outdir, width = 20,height = 15,units = c("cm"))
-  #cat('Which position would you like to truncate the forward reads at?\nPlease use the red-dashed lines as a guide, where they stop appearing indicates good quality.\nNOTE: Do NOT over-trim! You still require overlap between your forward and reverse reads in order to merge them later!\n')
-  len1 <- 240
-  cat('Plotting quality profile of reverse reads...\n')
-  Rqp1 <- plotQualityProfile(fnRs[1])
-  #print(Rqp1)
-  ggsave(sprintf('%s_reverse_1_qualityprofile.pdf',prefix),Rqp1, path = outdir, width = 20,height = 15,units = c("cm"))
-  #ggsave(sprintf('%s_reverse_1_qualityprofile.emf',prefix), Rqp1, path = outdir, width = 20,height = 15,units = c("cm"))
-  Rqp2 <- plotQualityProfile(fnRs[2])
-  #print(Rqp2)
-  ggsave(sprintf('%s_reverse_2_qualityprofile.pdf',prefix), Rqp2, path = outdir, width = 20,height = 15,units = c("cm"))
-  #ggsave(sprintf('%s_reverse_2_qualityprofile.emf',prefix), Rqp2, path = outdir, width = 20,height = 15,units = c("cm"))
-  #cat('Which position would you like to truncate the forward reads at?\nPlease use the red-dashed lines as a guide, where they stop appearing indicates good quality.\nNOTE: Do NOT over-trim! You still require overlap between your forward and reverse reads in order to merge them later!\n')
-  len2 <- 240
-
-  ## filter and trim
-  ## remaining parameters set to recommended defaults
-  ## maxN must be set to 0 (DADA2 requries no Ns)
-  ## The maxEE parameter sets the maximum number of "expected errors" allowed in a read, which is a better filter than simply averaging quality scores.
-  ## If not using Windows, you may set multithread to TRUE
-  ## NOTE: Do not use the truncLen parameter for ITS sequencing
-  ## trimLeft needs to be varied based on the size of your primers (i.e. it is used to trim your primer sequences)!!
-  cat('Filtering and trimming sequences...\n')
-  out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(len1,len2), maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=T, compress=T, multithread=threads, trimLeft=15)
-
-  ## have user review read count changes, and relax error rate if too many reads are lost
-  ## for example, you may especially want to relax the number of expected errors on the reverse reads (i.e. maxEE=c(2,5)), as the reverse is prone to error on the Illumina sequencing platform
-  print(head(out))
-  check2 <- T
-  while(check2 == F){
-    maxF <- numeric_input("What would you like the maximum number of expected errors in the forward reads to be?\nDefault 2:", 2)
-    maxR <- numeric_input("What would you like the maximum number of expected errors in the reverse reads to be?\nDefault 5:", 5)
-    out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(len1,len2), maxN=0, maxEE=c(maxF,maxR), truncQ=2, rm.phix=T, compress=T, multithread=threads)
-    print(head(out))
-    check2 <- yn_input('Proceed? If you lost too many reads, you can choose to not proceed and you will have the option to relax the error rate. Default yes:',T)
-  }
-
-  # Have DADA2 learn the error rates-----------------------------------------------
-  ## If not using Windows, you may set multithread to TRUE
-  read.subset <- 1e6
-  cat('Learning error rate of forward reads...\n')
-  errF <- learnErrors(filtFs, nreads=read.subset, multithread=threads)
-
-  ## have user check estimated error rates
-  ## note the calculations are done with a subset of the total reads, as it is computationally intensive
-  ## if the fit is poor, the user has the option to try again with an increased subset number of reads
-  Error_f <- plotErrors(errF, nominalQ = T)
-  #print(Error_f)
-  ggsave(sprintf('%s_forward_Error_plot.pdf',prefix), Error_f, path = outdir, width = 20,height = 15,units = c("cm"))
-  #ggsave(sprintf('%s_forward_Error_plot.emf',prefix), Error_f, path = outdir, width = 20,height = 15,units = c("cm"))
-  check3a <- T
-  while(check3a == F){
-    read.subset <- numeric_input('Please specify the number of reads you would like dada2 to utilize to calculate the error rate.\nThe default previously used was 1e6.\nThe newly recommended default is 10-fold greater,\n1e7:',1e7)
-    errF <- learnErrors(filtFs, nreads=read.subset, multithread=threads)
-    print(Error_f)
-    ggsave(sprintf('%s_forward_Error_plot.pdf',prefix), path = outdir, width = 20,height = 15,units = c("cm"))
-    ggsave(sprintf('%s_forward_Error_plot.emf',prefix), path = outdir, width = 20,height = 15,units = c("cm"))
-    check3a <- yn_input('Proceed?\nThe estimated error rate (black line) should fit to the observed error rates for each consensus quality score (black points).\nAdditionally, the error rates expected under the nominal definition of the Q-value (quality score) should decrease as the quality score increases (or flat-line).\nIf you do not have a good fit, you may want dada2 to re-learn the error rates with a higher number of reads in the utilized subset.\nA subset of reads is always used as the algorithm is computationally intensive.\nDefault yes:',T)
-  }
-
-
-  ## also do for reverseL
-  cat('Learning error rate of reverse reads...\n')
-  errR <- learnErrors(filtRs, nreads=read.subset, multithread=threads)
-  Error_r <- plotErrors(errR, nominalQ=T)
-  #print(Error_r)
-  ggsave(sprintf('%s_reverse_Error_plot.pdf',prefix), Error_r, path = outdir, width = 20,height = 15,units = c("cm"))
-  #ggsave(sprintf('%s_reverse_Error_plot.emf',prefix), Error_r, path = outdir, width = 20,height = 15,units = c("cm"))
-  check3b <- T
-  while(check3b == F){
-    read.subset <- numeric_input('Please specify the number of reads you would like dada2 to utilize to calculate the error rate.\nThe default previously used was 1e6.\nThe newly recommended default is 10-fold greater,\n1e7:',1e7)
-    errR <- learnErrors(filtRs, nreads=read.subset, multithread=threads)
-    print(Error_r)
-    ggsave(sprintf('%s_reverse_Error_plot.pdf',prefix), path = outdir, width = 20,height = 15,units = c("cm"))
-    #ggsave(sprintf('%s_reverse_Error_plot.emf',prefix), path = outdir, width = 20,height = 15,units = c("cm"))
-    check3b <- yn_input('Proceed?\nThe estimated error rate (black line) should fit to the observed error rates for each consensus quality score (black points).\nAdditionally, the error rates expected under the nominal definition of the Q-value (quality score) should decrease as the quality score increases (or flat-line).\nIf you do not have a good fit, you may want dada2 to re-learn the error rates with a higher number of reads in the utilized subset.\nA subset of reads is always used as the algorithm is computationally intensive.\nDefault yes:',T)
-  }
-
-  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_learning.Rdata')))
-  #load(file = file.path(outdir, paste0(prefix, 'state_post_learning.Rdata')))
-
-  # Dereplicate sequences to generate unique sequence fastq files with corresponding count tables-------------------------
-  ## NOTE: if your dataset is huge, you may run out of RAM. Please see https://benjjneb.github.io/dada2/bigdata.html for details.
-  cat('Dereplicating forward reads...\n')
-  derepFs <- derepFastq(filtFs, verbose=T)
-  cat('Dereplicating reverse reads...\n')
-  derepRs <- derepFastq(filtRs, verbose=T)
-
-  ## name the derep-class objects by sample names
-  names(derepFs) <- sample.names
-  names(derepRs) <- sample.names
-
-  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_derep.Rdata')))
-  #load(file = file.path(outdir, paste0(prefix, 'state_post_derep.Rdata')))
-
-  # Infer sequence variants using learned error rates---------------------------------
-  ## If not using Windows, you may set multithread to TRUE
-  ## NOTE: if your dataset is huge, you may run out of RAM. Please see https://benjjneb.github.io/dada2/bigdata.html for details.
-  ## NOTE2: you can use DADA2 for 454 or IonTorrent data as well. Please see https://benjjneb.github.io/dada2/tutorial.html.
-  s.pool = F
-  cat('Inferring sequence variants of forward reads...\n')
-  dadaFs <- dada(derepFs, err=errF, pool=s.pool, multithread=threads)
-
-  ## have user inspect detected number of sequence variants, to ensure it is logical based on the biological context of their samples
-  ## if you have low sampling depths, you may not want to process each sample independently as per default, but set pool=T. It gives better results at increased computation time. The user will have the option to do this if the number of sequence variants doesn't make sense.
-  print(dadaFs[[1]])
-  check4 <- T
-  if(check4 == F){
-    s.pool = T
-    dadaFs <- dada(derepFs, err=errF, pool=s.pool, multithread=threads)
-    print(dadaFs[[1]])
-    cat('Hopefully, these results make more sense.\nOtherwise, there is not much more you can do except start over!\n')
-    check <- yn_input('Proceed? Default yes, no to quit:',T)
-    if(check == F){
-      stop()
-    }
-  }
-
-  ## also do for reverse, but don't need to re-check as you need to keep the pool consistent between the forward and reverse!
-  cat('Inferring sequence variants of reversed reads...\n')
-  dadaRs <- dada(derepRs, err=errR, pool=s.pool, multithread=threads)
-
-  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_dada.Rdata')))
-  #load(file = file.path(outdir, paste0(prefix, 'state_post_dada.Rdata')))
-
-
-  # Merge forward and reverse reads-------------------------------------------------
-  cat('Merging paired-end reads...\n')
-  mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=T)
-  #cat('Most of your reads should have been retained (i.e. were able to merge, see above).\nOtherwise, there is not much more you can do except start over (i.e. did you over-trim your sequences??)!\n')
-  check <- T
-  if(check == F){
-    stop()
-  }
-
-  # Construct sequences table-------------------------------------------------------
-  cat('Constructing sequence table...\n')
-  seqtab <- makeSequenceTable(mergers)
-
-  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_merge.Rdata')))
-  #load(file = file.path(outdir, paste0(prefix, 'state_post_merge.Rdata')))
-
-
-  ## inspect distribution of sequence lengths
-  ## give user the option to filter out overly long or short sequneces
-  cat('Sequence length distribution listed below:\n')
-  print(table(nchar(getSequences(seqtab))))
-  check5 <- T
-  if(check5 == F){
-    min.cutoff <- numeric_input('Please input desired minimum length of sequences:',NULL)
-    max.cutoff <- numeric_input('Please input desired maximum length of sequences:',NULL)
-    seqtab <- seqtab[,nchar(colnames(seqtab)) %in% seq(min.cutoff,max.cutoff)]
-  }
-
-  # Remove chimeras------------------------------------------------------------------
-  ## If not using Windows, you may set multithread to TRUE
-  cat('Removing chimeras...\n')
-  seqtab.nochim <- removeBimeraDenovo(seqtab, method='consensus', multithread=threads, verbose=T)
-
-  ## display percentage of chimeras removed
-  ## this number should be small (<5%), otherwise some processing parameters need to be revisited
-  percent.nochim <- (sum(seqtab.nochim)/sum(seqtab))*100
-  percent.nochim <- paste(as.character(percent.nochim),'of reads retained after chimera removal.\n',sep=' ')
-  cat(percent.nochim)
-  #cat('Most of your reads should have been retained.\nOtherwise, there is not much more you can do except start over!\n')
-  check <- T
-  if(check == F){
-    stop()
-  }
-
-  # Final sanity check--------------------------------------------------------------
-  ## track reads removed throughout the pipeline
-  ## If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
-  getN <- function(x) sum(getUniques(x))
-  track <- cbind(out, sapply(dadaFs, getN), sapply(mergers, getN), rowSums(seqtab), rowSums(seqtab.nochim))
-  colnames(track) <- c("input", "filtered", "denoised", "merged", "tabled", "nonchim")
-  rownames(track) <- sample.names
-  print(head(track))
-  #cat('Most of your reads should have been retained.\nOtherwise, there is not much more you can do except start over!\n')
-  check <- T
-  if(check == F){
-    stop()
-  }
-
-  write.csv(track,file=file.path(outdir, sprintf('%s_read_count-quality_control.csv',prefix)))
-
-  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_chimera.Rdata')))
-#   load(file = file.path(outdir, paste0(prefix, 'state_post_chimera.Rdata')))
-
-  # Assign taxonomy-----------------------------------------------------------------
-  ## require silva database files
-  ## If not using Windows, you may set multithread to TRUE
-  ## Minimum boot strap should be 80, but for sequnce length =< 250 Minimum bootstrap set to 50 (which is also the default)
-
-  ## SILVA
-  cat('Assigning taxonomy to genus level using SILVA...\n')
-  taxa_silva <- assignTaxonomy(seqtab.nochim, file.path(wd,"silva_nr_v132_train_set.fa.gz"), multithread=threads, minBoot=80, tryRC=T)
-  cat('Assigning taxonomy at species level using SILVA...\n')
-  taxa_silva <- addSpecies(taxa_silva, file.path(wd,"silva_species_assignment_v132.fa.gz"), allowMultiple=T, tryRC=T)
-  write.csv(taxa_silva,file=file.path(outdir, sprintf('%s_taxonomy_silva.csv',prefix)))
-
-  ## RDP - used for copy number correction
-  cat('Assigning taxonomy to genus level using RDP...\n')
-  taxa_rdp <- assignTaxonomy(seqtab.nochim, file.path(wd,"rdp_train_set_16.fa.gz"), multithread=threads, minBoot=80, tryRC=T)
-  cat('Assigning taxonomy at species level using RDP...\n')
-  taxa_rdp <- addSpecies(taxa_rdp, file.path(wd,"rdp_species_assignment_16.fa.gz"), allowMultiple=T, tryRC=T)
-  write.csv(taxa_rdp,file=file.path(outdir, sprintf('%s_taxonomy_rdp.csv',prefix)))
-  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_tax.Rdata')))
-  #load(file = file.path(outdir, paste0(prefix, 'state_post_tax.Rdata')))
-
-
-  # Return data----------------------------------------------------------------------
-  cat('Returning data...\n')
-  ## create fasta file
-  ASVs <- dada2fasta(outdir, seqtab.nochim, prefix)
-  ## create master dataframe for each classification
-
-  ## Assigning ASVs to count table
-  sequences <- colnames(seqtab.nochim)
-  colnames(seqtab.nochim) <- ASVs
-  seqtab.nochim <- t(seqtab.nochim)
-
-  ## silva
-  taxa_silva <- taxa_silva[match(sequences, rownames(taxa_silva)),]
-  rownames(taxa_silva) <- ASVs
-  d <- merge(seqtab.nochim, taxa_silva, by='row.names')
-  colnames(d)[1] <- 'ASV'
-  ## create database of all information
-  db <- dbConnect(RSQLite::SQLite(), file.path(outdir, sprintf('%s.sqlite',prefix)))
-  dbWriteTable(db, 'dada2_results_silva', d)
-  ## write master dataframe for user, and return it
-  write.table(d, file.path(outdir, sprintf('%s_dada2_results_silva.txt',prefix)), sep='\t', quote=F, row.names=F)
-
-  ## rdp
-  taxa_rdp <- taxa_rdp[match(sequences, rownames(taxa_rdp)),]
-  rownames(taxa_rdp) <- ASVs
-  d <- merge(seqtab.nochim, taxa_rdp, by='row.names')
-  colnames(d)[1] <- 'ASV'
-  ## create database of all information
-  dbWriteTable(db, 'dada2_results_rdp', d)
-  dbDisconnect(db)
-  ## write master dataframe for user, and return it
-  write.table(d, file.path(outdir, sprintf('%s_dada2_results_rdp.txt',prefix)), sep='\t', quote=F, row.names=F)
-  return(d)
-
-  cat('DADA2 processing completed!\n')
-}
-
-
-option_list = list(
-  make_option(c("-t", "--threads"), type = "integer", default = 1,
-              help = "number of threads to use", metavar = "THREADS")
-);
-
-opt_parser = OptionParser(option_list=option_list);
-opt = parse_args(opt_parser);
-
-print(opt)
-
-threads = as.integer(Sys.getenv("NSLOTS", "1"))
-
-exit(1)
-
-
-
-dada2run(wd='/work/haange/Leaky_gut/', r_path='/work/haange/Leaky_gut', outdir='/work/haange/Leaky_gut/results', prefix='Leaky_gut')
-
--- a/macros.xml	Tue May 28 12:45:55 2019 -0400
+++ b/macros.xml	Thu Aug 29 09:01:22 2019 -0400
@@ -16,6 +16,13 @@
         ]]></version_command>
     </xml>

+    <xml name="stdio">
+        <stdio>
+				<regex match="Error: cannot allocate" source="stderr" level="fatal_oom" description="Out of memory error occurred" />
+				<regex match="'Calloc' could not allocate memory" source="stderr" level="fatal_oom" description="Out of memory error occurred" />
+        </stdio>
+    </xml>
+
     <xml name="citations">
         <citations>
             <citation type="doi">10.1038/nmeth.3869</citation>
--- a/notes.txt	Tue May 28 12:45:55 2019 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,148 +0,0 @@
-TODO
-====
-
-
-
-If we make a monolithic tool:
-
-* implement sanity checks between important compute intensive steps (user definable criteria, abort if violated)
-
-If we keep separate tools:
-
-- make Rdata data types specific (like xmcs https://github.com/workflow4metabolomics/xcms/tree/dev/datatypes)
-* alternatively the data set types could be derived from tabular and the Rdata could be attached via
-  `.extra_files_path` this way the user could have some intermediate output that he could look at.
-
-
-In both cases:
-
-* allow input of single end data, single pair, single pair in separate data sets, ...
-* add mergePairsByID functionality to mergePairs tool
-
-
-Datatypes:
-==========
-
-**derep-class**: list w 3 members
-- uniques: Named integer vector. Named by the unique sequence, valued by abundance.
-• quals: Numeric matrix of average quality scores by position for each unique. Uniques are
-rows, positions are cols.
-* map: Integer vector of length the number of reads, and value the index (in uniques) of the
-unique to which that read was assigned.
-
-**learnErrorsOutput**: A named list with three entries
-- err_out: A numeric matrix with the learned error rates.
-- err_in: The initialization error rates (unimportant).
-- trans: A feature table of observed transitions for each type (eg. A->C) and quality score.
-
-**dada-class**: A multi-item List with the following named values...
-• denoised: Integer vector, named by sequence valued by abundance, of the denoised sequences.
-• clustering: An informative data.frame containing information on each cluster.
-• sequence: A character vector of each denoised sequence. Identical to names(denoised).
-• quality: The average quality scores for each cluster (row) by position (col).
-• map: Integer vector that maps the unique (index of derep.unique) to the denoised sequence (index of dada.denoised).
-• birth_subs: A data.frame containing the substitutions at the birth of each new cluster.
-• trans: The matrix of transitions by type (row), eg. A2A, A2C..., and quality score (col)
-observed in the final output of the dada algorithm.
-• err_in: The err matrix used for this invocation of dada.
-• err_out: The err matrix estimated from the output of dada. NULL if err_function not provided.
-• opts: A list of the dada_opts used for this invocation of dada.
-• call: The function call used for this invocation of dada.
-
-**uniques**: derep, dada, mergepairs(or data frame w sequenc and abundance columns)
-
-**mergepairs**:
-
-data.frame(s) has a row for each unique pairing of forward/reverse denoised sequences, and the following columns:
-• abundance: Number of reads corresponding to this forward/reverse combination.
-• sequence: The merged sequence.
-• forward: The index of the forward denoised sequence.
-• reverse: The index of the reverse denoised sequence.
-• nmatch: Number of matches nts in the overlap region.
-• nmismatch: Number of mismatches in the overlap region.
-• nindel: Number of indels in the overlap region.
-• prefer: The sequence used for the overlap region. 1=forward; 2=reverse.
-• accept: TRUE if overlap between forward and reverse denoised sequences was at least minOverlap and had at most maxMismatch differences. FALSE otherwise.
-• ...: Additional columns specified in propagateCol
-
-
-
-Tools:
-======
-
-• Quality filtering
-
-  filterAndTrim IO=(fastq -> fastq)
-
-• Dereplication
-
-  derepFastq (fastq -> derep-class object)
-
-• Learn error rates
-
-  learnErrors + plotErrors
-    - in: input list, or vector, of file names (or a list of derep-class objects WHY .. learning should be done on full data)
-    - out: named list w entries
-      - \$err\_out: A numeric matrix with the learned error rates.
-      - \$err\_in: The initialization error rates (unimportant).
-      - \$trans: A feature table of observed transitions for each type (eg. A->C) and quality score
-
-• Sample Inference (dada)
-   in: (list of) derep-class object
-   out: (list of) dada-class object
-
-• Chimera Removal
-
-  removeBimeraDenovo
-
-  in: A uniques-vector or any object that can be coerced into one with getUniques.
-  out: A uniques vector, or an object of matching class if a data.frame or sequence table is provided
-
-• Merging of Paired Reads
-
-  mergePairs
-   in: 2x dada-class object(s), 2x derep-class object(s)
-   out: A data.frame, or a list of data.frames.
-     - The return data.frame(s) has a row for each unique pairing of forward/reverse denoised sequences,
-     - cols
-       - \$abundance: Number of reads corresponding to this forward/reverse combination.
-       - \$sequence: The merged sequence.
-       - \$forward: The index of the forward denoised sequence.
-       - \$reverse: The index of the reverse denoised sequence.
-       - \$nmatch: Number of matches nts in the overlap region.
-       - \$nmismatch: Number of mismatches in the overlap region.
-       - \$nindel: Number of indels in the overlap region.
-       - \$prefer: The sequence used for the overlap region. 1=forward; 2=reverse.
-       - \$accept: TRUE if overlap between forward and reverse denoised sequences was at least minOverlap and had at most maxMismatch differences. FALSE otherwise.
-       - \$...: Additional columns specified in propagateCol.
-
-• Taxonomic Classification (assignTaxonomy, assignSpecies)
-
-* Other
-
-  makeSequenceTable
-   in A list of the samples to include in the sequence table. Samples can be provided in any format that can be processed by getUniques
-   out Named integer matrix (row for each sample, column for each unique sequence)
-
-  mergeSequenceTables
-
-  uniquesToFasta
-  in: A uniques-vector or any object that can be coerced into one with getUniques.
-
-  getSequences
-
-  extracts the sequences from several different data objects: including including dada-class
-  and derep-class objects, as well as data.frame objects that have both \$sequence and \$abun-
-  dance columns.
-
-  getUniques
-
-  extracts the uniques-vector from several different data objects, including dada-class
-  and derep-class objects, as well as data.frame objects that have both \$sequence and \$abundance
-  columns
-
-  plotQualityProfile
-
-  seqComplexity
-
-  setDadaOpt(...)
Binary file static/images/pairpipe.png has changed
--- a/static/images/pairpipe.svg	Tue May 28 12:45:55 2019 -0400
+++ b/static/images/pairpipe.svg	Thu Aug 29 09:01:22 2019 -0400
@@ -45,12 +45,12 @@
      inkscape:snap-bbox="false"
      inkscape:object-paths="true"
      inkscape:zoom="1"
-     inkscape:cx="685.79259"
-     inkscape:cy="205.1652"
+     inkscape:cx="654.80177"
+     inkscape:cy="201.1652"
      inkscape:window-x="0"
      inkscape:window-y="27"
      inkscape:window-maximized="1"
-     inkscape:current-layer="svg384"
+     inkscape:current-layer="layer1"
      fit-margin-top="0"
      fit-margin-left="0"
      fit-margin-right="0"
@@ -919,7 +919,15 @@
      inkscape:groupmode="layer"
      id="layer1"
      inkscape:label="Layer 1"
-     transform="translate(-178.97434,-516.80337)" />
+     transform="translate(-178.97434,-516.80337)">
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path1104"
+       d="m 547.55976,25.196631 41,34"
+       style="stroke:#000000;marker-end:url(#id2)"
+       transform="translate(178.97434,516.80337)" />
+  </g>
   <g
      inkscape:groupmode="layer"
      id="layer2"
@@ -978,5 +986,11 @@
        sodipodi:role="line"
        id="tspan1102"
        x="214"
-       y="-108.17315"></tspan></text>
+       y="-108.17315" /></text>
+  <path
+     style="stroke:#000000;marker-end:url(#id2)"
+     d="m 549.55976,123.19663 38,44"
+     id="path1255"
+     inkscape:connector-curvature="0"
+     sodipodi:nodetypes="cc" />
 </svg>
--- a/user_input_functions.R	Tue May 28 12:45:55 2019 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,197 +0,0 @@
-# defining functions for checking user inputs
-
-# requesting directory input----------------------------------------------------------
-dir_input <- function(prompt) {
-  check <- FALSE
-  while(check == FALSE) {
-    user_input <- readline(prompt)
-    check <- dir.exists(user_input)
-    if(check==FALSE) {
-      msg <- sprintf("The directory: %s not found.", user_input)
-      message(msg)
-    }
-  }
-  return(user_input)
-}
-
-# requesting file input
-file_input <- function(prompt, directory) {
-  check <- FALSE
-  while(check == FALSE) {
-    user_input <- readline(prompt)
-    check <- file.exists(file.path(directory, user_input))
-    if(check==FALSE) {
-      msg <- sprintf("File: %s not found.", user_input)
-      message(msg)
-    }
-  }
-  return(user_input)
-}
-
-# requesting string input------------------------------------------------------------
-string_input <- function(prompt) {
-  check <- FALSE
-  while(check == FALSE) {
-    user_input <- readline(prompt)
-    check <- user_input!=''
-    if(check == FALSE) {
-      message("Input can't be empty.")
-    }
-  }
-  return(user_input)
-}
-
-# requesting integer input----------------------------------------------------------
-numeric_input <- function(prompt, default) {
-  check <- FALSE
-  while(check == FALSE) {
-    user_input <- readline(prompt)
-
-    # if blank, set user_input to defalut
-    if(user_input == '') {
-      user_input <- default
-      msg <- sprintf("No input supplied, default of %s used.", default)
-      message(msg)
-    }
-    # coerce input to be numeric
-    else {
-      user_input <- as.numeric(user_input)
-    }
-    # check if number supplied
-    check <- !is.na(user_input)
-    if(check == FALSE) {
-      message("Input must be a number.")
-    }
-  }
-  return(user_input)
-}
-
-# request constrained string---------------------------------------------------------
-# default is numeric, referring to the index of desired option in 'choices'
-cons_string_input <- function(prompt, choices, default) {
-
-  if(missing(default)){
-    check <- FALSE
-    while(check == FALSE) {
-      user_input <- as.numeric(menu(choices, graphics=FALSE, title=prompt))
-      user_input <- choices[user_input]
-
-      check <- user_input!=''
-      if(check == FALSE) message("Input can't be empty.")
-      else message(sprintf("\nSelected: %s", user_input))
-    }
-  }
-
-  if(!missing(default)){
-    # setting up prompt with menu
-    num_choices <- str_c(1:length(choices), choices, sep='. ')
-    menu_prompt <- sprintf("%s\n\t%s\n", prompt, str_c(num_choices, collapse='\n\t'))
-    message(menu_prompt)
-
-    # requesting user input
-    user_input <- readline("Selection:")
-
-    # setting default
-    if(user_input == '') {
-      user_input <- as.numeric(default)
-      user_input <- choices[user_input]
-      msg <- sprintf("No input supplied, default of %s used.", user_input)
-      message(msg)
-    }
-    else {
-      user_input <- as.numeric(user_input)
-      user_input <- choices[user_input]
-      message(sprintf("\nSelected: %s", user_input))
-    }
-  }
-
-  return(user_input)
-}
-
-# request multiple inputs-----------------------------------------------------------
-## default is optional; can set default to NULL if want to allow blank entry
-cons_string_mult <- function(prompt, choices, default) {
-  if(missing(default)) {
-    check <- FALSE
-    while(check == FALSE) {
-      user_input <- select.list(choices=choices, multiple=TRUE, title=prompt,
-                                graphics = FALSE)
-
-      if(length(user_input)==0)  {
-        message("Input can't be empty.")
-        check <- FALSE
-      }
-      else {
-        message(sprintf("\nSelected: %s", user_input))
-        check <- TRUE
-      }
-    }
-  }
-
-  if(!missing(default)) {
-    user_input <- select.list(choices=choices, multiple=TRUE, title=prompt,
-                              graphics=FALSE)
-
-    if(length(user_input)==0) {
-      if(is.null(default)) user_input <- default
-      else user_input <- default
-      msg <- sprintf("No input supplied, using default:%s",
-                     paste(user_input, collapse=', '))
-      message(msg)
-    }
-
-  }
-
-  return(user_input)
-
-}
-
-# yesno input-------------------------------------------------------------------
-# default is TRUE or FALSE
-yn_input <- function(prompt, default) {
-
-  choices <- c("Yes", "No")
-
-  if(missing(default)) {
-    check <- FALSE
-    while(check == FALSE) {
-      user_input <- menu(choices, graphics=FALSE, title=prompt)
-
-      if(length(user_input)==0)  {
-        message("Input can't be empty.")
-        check <- FALSE
-      }
-      else {
-        message(sprintf("\nSelected: %s", user_input))
-        check <- TRUE
-      }
-    }
-    user_input <- ifelse(user_input == 1L, TRUE, FALSE)
-  }
-  if(!missing(default)) {
-
-    # setting up prompt with menu
-    num_choices <- str_c(1:length(choices), choices, sep='. ')
-    menu_prompt <- sprintf("%s\n\t%s\n", prompt, str_c(num_choices, collapse='\n\t'))
-    message(menu_prompt)
-
-    # requesting user input
-    user_input <- readline("Selection:")
-
-    # setting default
-    if(user_input == '') {
-      user_input <- as.numeric(default)
-      msg <- sprintf("No input supplied, default of %s used.", choices[user_input])
-      message(msg)
-      user_input <- ifelse(user_input == 1L, TRUE, FALSE)
-    }
-    else {
-      user_input <- as.numeric(user_input)
-      message(sprintf("\nSelected: %s", choices[user_input]))
-      user_input <- ifelse(user_input == 1L, TRUE, FALSE)
-    }
-  }
-
-
-  return(user_input)
-}
\ No newline at end of file