changeset 11:faa2c40569b0 draft

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/topic/dada2/tools/dada2 commit b59ed2476188176bb5b2973f59899b7d733e7641-dirty
author matthias
date Thu, 29 Aug 2019 09:03:48 -0400
parents 34e6f55234a6
children 000bebcfd0b1
files dada2.pdf dada2_removeBimeraDenovo.xml data2.R macros.xml notes.txt static/images/pairpipe.png static/images/pairpipe.svg user_input_functions.R
diffstat 8 files changed, 29 insertions(+), 703 deletions(-)
Binary file dada2.pdf has changed
--- a/dada2_removeBimeraDenovo.xml	Tue May 28 12:48:25 2019 -0400
+++ b/dada2_removeBimeraDenovo.xml	Thu Aug 29 09:03:48 2019 -0400
@@ -1,9 +1,10 @@
-<tool id="dada2_removeBimeraDenovo" name="dada2: removeBimeraDenovo" version="@DADA2_VERSION@+galaxy@WRAPPER_VERSION@">
+<tool id="dada2_removeBimeraDenovo" name="dada2: removeBimeraDenovo" version="@DADA2_VERSION@+galaxy@WRAPPER_VERSION@" profile="19.09">
     <description>Remove bimeras from collections of unique sequences</description>
     <macros>
         <import>macros.xml</import>
     </macros>
     <expand macro="requirements"/>
+    <expand macro="stdio"/>
     <expand macro="version_command"/>
     <command detect_errors="exit_code"><![CDATA[
 Rscript '$dada2_script'
@@ -113,6 +114,7 @@
 
 Considerations for your own data: Most of your reads should remain after chimera removal (it is not uncommon for a majority of sequence variants to be removed though). If most of your reads were removed as chimeric, upstream processing may need to be revisited. In almost all cases this is caused by primer sequences with ambiguous nucleotides that were not removed prior to beginning the DADA2 pipeline.
 
+@HELP_OVERVIEW@
     ]]></help>
     <expand macro="citations"/>
 </tool>
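The help text above warns that chimera removal should drop many sequence *variants* but few *reads*. A minimal sketch of this step in R (object names are illustrative; `seqtab` is a sequence table from makeSequenceTable, compare the percent-retained check in the deleted data2.R below):

    library(dada2)
    seqtab.nochim <- removeBimeraDenovo(seqtab, method='consensus', multithread=TRUE, verbose=TRUE)
    # most reads should remain, even when many sequence variants are removed
    cat(100 * sum(seqtab.nochim) / sum(seqtab), '% of reads retained\n')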
--- a/data2.R	Tue May 28 12:48:25 2019 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,352 +0,0 @@
-library(dada2)
-
-library(DBI)      # dbConnect/dbWriteTable are used below
-library(ggplot2)  # ggsave is used below
-library(optparse)
-library(RSQLite)
-library(stringr)  # str_c is used by the sourced user_input_functions.R
-
-## source required R functions
-source('user_input_functions.R')
-
-# print dada2 version
-print(paste("dada2 version: ", packageVersion("dada2")))
-
-# # R function to create a fasta file from dada2 output data
-# outdir is the directory to write the fasta file to
-# seqtab.nochim is the chimera-free sequence table generated by dada2
-# prefix is the string prefixed to all output file names
-
-dada2fasta <- function(outdir, seqtab.nochim, prefix){
-  seq <- colnames(seqtab.nochim)
-  n <- 0
-  ASVs <- c()
-  for(i in seq){
-    n <- n + 1
-    ASV <- paste('ASV', as.character(n), sep = '_')
-    ASVs <- c(ASVs, ASV)
-    line1 <- paste('>',ASV,sep='')
-    write(line1, file.path(outdir,sprintf('%s.fasta',prefix)), append=T)
-    write(i, file.path(outdir,sprintf('%s.fasta',prefix)), append=T)
-  }
-  return(ASVs)
-}
-
-
-# # R DADA2 workflow
-# wd is path to fastq files
-# r_path is path to user_input_functions.R
-# outdir is path to output directory 
-# prefix is string for desired prefix attached to output file names
-
-dada2run <- function(wd, r_path, outdir, prefix){
-  
-  # read-in files-------------------------------------------------------
-  ## obtain vectors of forward and reverse reads based on '_1.fq' and '_2.fq' in file names
-  ## additionally obtain the corresponding sample names for these files
-  p1 <- c()
-  p2 <- c()
-  sample.names <- c()
-  for(f in list.files(wd, full.names=T)){
-    if(grepl('_1.fq', f)){
-      sample <- gsub('^.*[/](.*?)_1\\.fq\\.gz', '\\1', f)
-      sample.names <- c(sample.names, sample)
-      p1 <- c(p1, f)
-    }
-    if(grepl('_2.fq', f)){
-      p2 <- c(p2, f)
-    }
-  }
-  fnFs <- sort(p1)
-  fnRs <- sort(p2)
-  sample.names <- sort(sample.names)
-
-  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_test.Rdata')))
-  #load(file = file.path(outdir, paste0(prefix, 'state_test.Rdata')))
-  
-  ## print for review
-  to_view <- data.frame(sample.names, fnFs, fnRs)
-  cat("The following fastq files were found:\n")
-  print(to_view)
-  
-  # Perform quality filtering and trimming---------------------------------
-  ## assign new names to samples
-  filtFs <- file.path(outdir, paste0(sample.names, 'F_filt.fastq.gz'))
-  filtRs <- file.path(outdir, paste0(sample.names, 'R_filt.fastq.gz'))
-  
-  ## plot forward and reverse quality so that user can decide on filtering parameters
-  cat('Plotting quality profile of forward reads...\n')
-  Fqp1 <- plotQualityProfile(fnFs[1])
-  #print(Fqp1)
-  ggsave(sprintf('%s_forward_1_qualityprofile.pdf',prefix), Fqp1, path = outdir, width = 20,height = 15,units = c("cm"))
-  #ggsave(sprintf('%s_forward_1_qualityprofile.emf',prefix), Fqp1, path = outdir, width = 20,height = 15,units = c("cm"))
-  Fqp2 <- plotQualityProfile(fnFs[2])
-  #print(Fqp2)
-  ggsave(sprintf('%s_forward_2_qualityprofile.pdf',prefix),Fqp2, path = outdir, width = 20,height = 15,units = c("cm"))
-  #ggsave(sprintf('%s_forward_2_qualityprofile.emf',prefix), Fqp2, path = outdir, width = 20,height = 15,units = c("cm"))
-  #cat('Which position would you like to truncate the forward reads at?\nPlease use the red-dashed lines as a guide, where they stop appearing indicates good quality.\nNOTE: Do NOT over-trim! You still require overlap between your forward and reverse reads in order to merge them later!\n')
-  len1 <- 240
-  cat('Plotting quality profile of reverse reads...\n')
-  Rqp1 <- plotQualityProfile(fnRs[1])
-  #print(Rqp1)
-  ggsave(sprintf('%s_reverse_1_qualityprofile.pdf',prefix),Rqp1, path = outdir, width = 20,height = 15,units = c("cm"))
-  #ggsave(sprintf('%s_reverse_1_qualityprofile.emf',prefix), Rqp1, path = outdir, width = 20,height = 15,units = c("cm"))
-  Rqp2 <- plotQualityProfile(fnRs[2])
-  #print(Rqp2)
-  ggsave(sprintf('%s_reverse_2_qualityprofile.pdf',prefix), Rqp2, path = outdir, width = 20,height = 15,units = c("cm"))
-  #ggsave(sprintf('%s_reverse_2_qualityprofile.emf',prefix), Rqp2, path = outdir, width = 20,height = 15,units = c("cm"))
-  #cat('Which position would you like to truncate the reverse reads at?\nPlease use the red-dashed lines as a guide, where they stop appearing indicates good quality.\nNOTE: Do NOT over-trim! You still require overlap between your forward and reverse reads in order to merge them later!\n')
-  len2 <- 240
-
-  ## filter and trim
-  ## remaining parameters set to recommended defaults
-  ## maxN must be set to 0 (DADA2 requires no Ns)
-  ## The maxEE parameter sets the maximum number of "expected errors" allowed in a read, which is a better filter than simply averaging quality scores.
-  ## If not using Windows, you may set multithread to TRUE
-  ## NOTE: Do not use the truncLen parameter for ITS sequencing
-  ## trimLeft needs to be varied based on the size of your primers (i.e. it is used to trim your primer sequences)!! 
-  cat('Filtering and trimming sequences...\n')
-  out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(len1,len2), maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=T, compress=T, multithread=threads, trimLeft=15)
-  
-  ## have user review read count changes, and relax error rate if too many reads are lost
-  ## for example, you may especially want to relax the number of expected errors on the reverse reads (i.e. maxEE=c(2,5)), as the reverse is prone to error on the Illumina sequencing platform
-  print(head(out))
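-  ## NOTE: interactive review is disabled here: check2 is preset to T, so the while
-  ## loop below never runs; set check2 <- F to re-enable the prompts from user_input_functions.R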
-  check2 <- T
-  while(check2 == F){
-    maxF <- numeric_input("What would you like the maximum number of expected errors in the forward reads to be?\nDefault 2:", 2)
-    maxR <- numeric_input("What would you like the maximum number of expected errors in the reverse reads to be?\nDefault 5:", 5)
-    out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(len1,len2), maxN=0, maxEE=c(maxF,maxR), truncQ=2, rm.phix=T, compress=T, multithread=threads)
-    print(head(out))
-    check2 <- yn_input('Proceed? If you lost too many reads, you can choose to not proceed and you will have the option to relax the error rate. Default yes:',T)
-  }
-  
-  # Have DADA2 learn the error rates-----------------------------------------------
-  ## If not using Windows, you may set multithread to TRUE
-  read.subset <- 1e6
-  cat('Learning error rate of forward reads...\n')
-  errF <- learnErrors(filtFs, nreads=read.subset, multithread=threads)
-  
-  ## have user check estimated error rates
-  ## note the calculations are done with a subset of the total reads, as it is computationally intensive
-  ## if the fit is poor, the user has the option to try again with an increased subset number of reads
-  Error_f <- plotErrors(errF, nominalQ = T)
-  #print(Error_f)
-  ggsave(sprintf('%s_forward_Error_plot.pdf',prefix), Error_f, path = outdir, width = 20,height = 15,units = c("cm"))
-  #ggsave(sprintf('%s_forward_Error_plot.emf',prefix), Error_f, path = outdir, width = 20,height = 15,units = c("cm"))
-  check3a <- T
-  while(check3a == F){
-    read.subset <- numeric_input('Please specify the number of reads you would like dada2 to utilize to calculate the error rate.\nThe default previously used was 1e6.\nThe newly recommended default is 10-fold greater,\n1e7:',1e7)
-    errF <- learnErrors(filtFs, nreads=read.subset, multithread=threads)
-    print(Error_f)
-    ggsave(sprintf('%s_forward_Error_plot.pdf',prefix), path = outdir, width = 20,height = 15,units = c("cm"))
-    ggsave(sprintf('%s_forward_Error_plot.emf',prefix), path = outdir, width = 20,height = 15,units = c("cm"))
-    check3a <- yn_input('Proceed?\nThe estimated error rate (black line) should fit to the observed error rates for each consensus quality score (black points).\nAdditionally, the error rates expected under the nominal definition of the Q-value (quality score) should decrease as the quality score increases (or flat-line).\nIf you do not have a good fit, you may want dada2 to re-learn the error rates with a higher number of reads in the utilized subset.\nA subset of reads is always used as the algorithm is computationally intensive.\nDefault yes:',T)
-  }
-  
-
-  ## also do for the reverse reads
-  cat('Learning error rate of reverse reads...\n')
-  errR <- learnErrors(filtRs, nreads=read.subset, multithread=threads)
-  Error_r <- plotErrors(errR, nominalQ=T)
-  #print(Error_r)
-  ggsave(sprintf('%s_reverse_Error_plot.pdf',prefix), Error_r, path = outdir, width = 20,height = 15,units = c("cm"))
-  #ggsave(sprintf('%s_reverse_Error_plot.emf',prefix), Error_r, path = outdir, width = 20,height = 15,units = c("cm"))
-  check3b <- T
-  while(check3b == F){
-    read.subset <- numeric_input('Please specify the number of reads you would like dada2 to utilize to calculate the error rate.\nThe default previously used was 1e6.\nThe newly recommended default is 10-fold greater,\n1e7:',1e7)
-    errR <- learnErrors(filtRs, nreads=read.subset, multithread=threads)
-    print(Error_r)
-    ggsave(sprintf('%s_reverse_Error_plot.pdf',prefix), path = outdir, width = 20,height = 15,units = c("cm"))
-    #ggsave(sprintf('%s_reverse_Error_plot.emf',prefix), path = outdir, width = 20,height = 15,units = c("cm"))
-    check3b <- yn_input('Proceed?\nThe estimated error rate (black line) should fit to the observed error rates for each consensus quality score (black points).\nAdditionally, the error rates expected under the nominal definition of the Q-value (quality score) should decrease as the quality score increases (or flat-line).\nIf you do not have a good fit, you may want dada2 to re-learn the error rates with a higher number of reads in the utilized subset.\nA subset of reads is always used as the algorithm is computationally intensive.\nDefault yes:',T)
-  }
-
-  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_learning.Rdata')))
-  #load(file = file.path(outdir, paste0(prefix, 'state_post_learning.Rdata')))
-  
-  # Dereplicate sequences to generate unique sequence fastq files with corresponding count tables-------------------------
-  ## NOTE: if your dataset is huge, you may run out of RAM. Please see https://benjjneb.github.io/dada2/bigdata.html for details.
-  cat('Dereplicating forward reads...\n')
-  derepFs <- derepFastq(filtFs, verbose=T)
-  cat('Dereplicating reverse reads...\n')
-  derepRs <- derepFastq(filtRs, verbose=T)
-  
-  ## name the derep-class objects by sample names
-  names(derepFs) <- sample.names
-  names(derepRs) <- sample.names
-
-  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_derep.Rdata')))
-  #load(file = file.path(outdir, paste0(prefix, 'state_post_derep.Rdata')))
-  
-  # Infer sequence variants using learned error rates---------------------------------
-  ## If not using Windows, you may set multithread to TRUE
-  ## NOTE: if your dataset is huge, you may run out of RAM. Please see https://benjjneb.github.io/dada2/bigdata.html for details.
-  ## NOTE2: you can use DADA2 for 454 or IonTorrent data as well. Please see https://benjjneb.github.io/dada2/tutorial.html.
-  s.pool = F
-  cat('Inferring sequence variants of forward reads...\n')
-  dadaFs <- dada(derepFs, err=errF, pool=s.pool, multithread=threads)
-  
-  ## have user inspect detected number of sequence variants, to ensure it is logical based on the biological context of their samples
-  ## if you have low sampling depths, you may not want to process each sample independently as per default, but set pool=T. It gives better results at increased computation time. The user will have the option to do this if the number of sequence variants doesn't make sense.
-  print(dadaFs[[1]])
-  check4 <- T
-  if(check4 == F){
-    s.pool = T
-    dadaFs <- dada(derepFs, err=errF, pool=s.pool, multithread=threads)
-    print(dadaFs[[1]])
-    cat('Hopefully, these results make more sense.\nOtherwise, there is not much more you can do except start over!\n')
-    check <- yn_input('Proceed? Default yes, no to quit:',T)
-    if(check == F){
-      stop()
-    }
-  }
-  
-  ## also do for reverse, but don't need to re-check as you need to keep the pool consistent between the forward and reverse!
-  cat('Inferring sequence variants of reversed reads...\n')
-  dadaRs <- dada(derepRs, err=errR, pool=s.pool, multithread=threads)
-
-  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_dada.Rdata')))
-  #load(file = file.path(outdir, paste0(prefix, 'state_post_dada.Rdata')))
-
-  
-  # Merge forward and reverse reads-------------------------------------------------
-  cat('Merging paired-end reads...\n')
-  mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=T)
-  #cat('Most of your reads should have been retained (i.e. were able to merge, see above).\nOtherwise, there is not much more you can do except start over (i.e. did you over-trim your sequences??)!\n')
-  check <- T
-  if(check == F){
-    stop()
-  }
-  
-  # Construct sequences table-------------------------------------------------------
-  cat('Constructing sequence table...\n')
-  seqtab <- makeSequenceTable(mergers)
-
-  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_merge.Rdata')))
-  #load(file = file.path(outdir, paste0(prefix, 'state_post_merge.Rdata')))
-
-
-  ## inspect distribution of sequence lengths
-  ## give user the option to filter out overly long or short sequences
-  cat('Sequence length distribution listed below:\n')
-  print(table(nchar(getSequences(seqtab))))
-  check5 <- T
-  if(check5 == F){
-    min.cutoff <- numeric_input('Please input desired minimum length of sequences:',NULL)
-    max.cutoff <- numeric_input('Please input desired maximum length of sequences:',NULL)
-    seqtab <- seqtab[,nchar(colnames(seqtab)) %in% seq(min.cutoff,max.cutoff)]
-  }
-  
-  # Remove chimeras------------------------------------------------------------------
-  ## If not using Windows, you may set multithread to TRUE
-  cat('Removing chimeras...\n')
-  seqtab.nochim <- removeBimeraDenovo(seqtab, method='consensus', multithread=threads, verbose=T)
-  
-  ## display percentage of reads retained after chimera removal
-  ## the removed fraction should be small (<5%), otherwise some processing parameters need to be revisited
-  percent.nochim <- (sum(seqtab.nochim)/sum(seqtab))*100
-  percent.nochim <- paste(as.character(percent.nochim),'% of reads retained after chimera removal.\n',sep='')
-  cat(percent.nochim)
-  #cat('Most of your reads should have been retained.\nOtherwise, there is not much more you can do except start over!\n')
-  check <- T
-  if(check == F){
-    stop()
-  }
-  
-  # Final sanity check--------------------------------------------------------------
-  ## track reads removed throughout the pipeline
-  ## If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
-  getN <- function(x) sum(getUniques(x))
-  track <- cbind(out, sapply(dadaFs, getN), sapply(mergers, getN), rowSums(seqtab), rowSums(seqtab.nochim))
-  colnames(track) <- c("input", "filtered", "denoised", "merged", "tabled", "nonchim")
-  rownames(track) <- sample.names
-  print(head(track))
-  #cat('Most of your reads should have been retained.\nOtherwise, there is not much more you can do except start over!\n')
-  check <- T
-  if(check == F){
-    stop()
-  }
-  
-  write.csv(track,file=file.path(outdir, sprintf('%s_read_count-quality_control.csv',prefix)))
-
-  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_chimera.Rdata')))
-#   load(file = file.path(outdir, paste0(prefix, 'state_post_chimera.Rdata')))
-  
-  # Assign taxonomy-----------------------------------------------------------------
-  ## require silva database files
-  ## If not using Windows, you may set multithread to TRUE
-  ## Minimum bootstrap should be 80, but for sequence lengths <= 250 the minimum bootstrap is set to 50 (which is also the default)
-  
-  ## SILVA
-  cat('Assigning taxonomy to genus level using SILVA...\n')
-  taxa_silva <- assignTaxonomy(seqtab.nochim, file.path(wd,"silva_nr_v132_train_set.fa.gz"), multithread=threads, minBoot=80, tryRC=T)
-  cat('Assigning taxonomy at species level using SILVA...\n')
-  taxa_silva <- addSpecies(taxa_silva, file.path(wd,"silva_species_assignment_v132.fa.gz"), allowMultiple=T, tryRC=T)
-  write.csv(taxa_silva,file=file.path(outdir, sprintf('%s_taxonomy_silva.csv',prefix)))
-  
-  ## RDP - used for copy number correction
-  cat('Assigning taxonomy to genus level using RDP...\n')
-  taxa_rdp <- assignTaxonomy(seqtab.nochim, file.path(wd,"rdp_train_set_16.fa.gz"), multithread=threads, minBoot=80, tryRC=T)
-  cat('Assigning taxonomy at species level using RDP...\n')
-  taxa_rdp <- addSpecies(taxa_rdp, file.path(wd,"rdp_species_assignment_16.fa.gz"), allowMultiple=T, tryRC=T)
-  write.csv(taxa_rdp,file=file.path(outdir, sprintf('%s_taxonomy_rdp.csv',prefix)))
-  save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_tax.Rdata')))
-  #load(file = file.path(outdir, paste0(prefix, 'state_post_tax.Rdata')))
-
-
-  # Return data----------------------------------------------------------------------
-  cat('Returning data...\n')
-  ## create fasta file
-  ASVs <- dada2fasta(outdir, seqtab.nochim, prefix)
-  ## create master dataframe for each classification
-  
-  ## Assigning ASVs to count table
-  sequences <- colnames(seqtab.nochim)
-  colnames(seqtab.nochim) <- ASVs
-  seqtab.nochim <- t(seqtab.nochim)
-  
-  ## silva
-  taxa_silva <- taxa_silva[match(sequences, rownames(taxa_silva)),]
-  rownames(taxa_silva) <- ASVs
-  d <- merge(seqtab.nochim, taxa_silva, by='row.names')
-  colnames(d)[1] <- 'ASV'
-  ## create database of all information
-  db <- dbConnect(RSQLite::SQLite(), file.path(outdir, sprintf('%s.sqlite',prefix)))
-  dbWriteTable(db, 'dada2_results_silva', d)
-  ## write master dataframe for user, and return it
-  write.table(d, file.path(outdir, sprintf('%s_dada2_results_silva.txt',prefix)), sep='\t', quote=F, row.names=F)
-  
-  ## rdp
-  taxa_rdp <- taxa_rdp[match(sequences, rownames(taxa_rdp)),]
-  rownames(taxa_rdp) <- ASVs
-  d <- merge(seqtab.nochim, taxa_rdp, by='row.names')
-  colnames(d)[1] <- 'ASV'
-  ## create database of all information
-  dbWriteTable(db, 'dada2_results_rdp', d)
-  dbDisconnect(db)
-  ## write master dataframe for user, and return it
-  write.table(d, file.path(outdir, sprintf('%s_dada2_results_rdp.txt',prefix)), sep='\t', quote=F, row.names=F)
-  cat('DADA2 processing completed!\n')
-  return(d)
-}
-
- 
-option_list = list(
-  make_option(c("-t", "--threads"), type = "integer", default = 1,
-              help = "number of threads to use", metavar = "THREADS")
-); 
-
-opt_parser = OptionParser(option_list=option_list);
-opt = parse_args(opt_parser);
-
-print(opt)
-
-# NOTE: this overrides the --threads value parsed above
-threads <- as.integer(Sys.getenv("NSLOTS", "1"))
-
-quit(status = 1)  # aborts here, so the dada2run call below is never reached
-
-
-
-dada2run(wd='/work/haange/Leaky_gut/', r_path='/work/haange/Leaky_gut', outdir='/work/haange/Leaky_gut/results', prefix='Leaky_gut')
-
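The deleted script's dada2fasta() reopens the output file once per sequence; dada2 itself ships uniquesToFasta() (listed in notes.txt below), which writes the fasta in one call. A sketch using the same object names as the deleted dada2run():

    library(dada2)
    # seqtab.nochim, outdir and prefix as in the deleted script
    unqs <- getUniques(seqtab.nochim)
    uniquesToFasta(unqs, fout = file.path(outdir, sprintf('%s.fasta', prefix)),
                   ids = paste0('ASV_', seq_along(unqs)))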
--- a/macros.xml	Tue May 28 12:48:25 2019 -0400
+++ b/macros.xml	Thu Aug 29 09:03:48 2019 -0400
@@ -16,6 +16,13 @@
         ]]></version_command>
     </xml>
 
+    <xml name="stdio">
+        <stdio>
+            <regex match="Error: cannot allocate" source="stderr" level="fatal_oom" description="Out of memory error occurred" />
+            <regex match="'Calloc' could not allocate memory" source="stderr" level="fatal_oom" description="Out of memory error occurred" />
+        </stdio>
+    </xml>
+
     <xml name="citations">
         <citations>
             <citation type="doi">10.1038/nmeth.3869</citation>
--- a/notes.txt	Tue May 28 12:48:25 2019 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,148 +0,0 @@
-TODO
-====
-
-
-
-If we make a monolithic tool: 
-
-* implement sanity checks between important compute-intensive steps (user-definable criteria, abort if violated)
-
-If we keep separate tools: 
-
-- make Rdata data types specific (like xcms https://github.com/workflow4metabolomics/xcms/tree/dev/datatypes)
-- alternatively, the data set types could be derived from tabular and the Rdata could be attached via
-  `.extra_files_path`; this way the user could have some intermediate output that they could look at.
-
-
-In both cases: 
-
-* allow input of single-end data, single pair, single pair in separate data sets, ...
-* add mergePairsByID functionality to mergePairs tool
-
-
-Datatypes:
-==========
-
-**derep-class**: list with 3 members
-- uniques: Named integer vector. Named by the unique sequence, valued by abundance.
-- quals: Numeric matrix of average quality scores by position for each unique. Uniques are
-  rows, positions are cols.
-- map: Integer vector with one entry per read, valued by the index (in uniques) of the
-  unique to which that read was assigned.
-
-**learnErrorsOutput**: A named list with three entries
-- err_out: A numeric matrix with the learned error rates. 
-- err_in: The initialization error rates (unimportant). 
-- trans: A feature table of observed transitions for each type (e.g. A->C) and quality score.
-
-**dada-class**: A multi-item List with the following named values...
-• denoised: Integer vector, named by sequence valued by abundance, of the denoised sequences.
-• clustering: An informative data.frame containing information on each cluster.
-• sequence: A character vector of each denoised sequence. Identical to names(denoised).
-• quality: The average quality scores for each cluster (row) by position (col).
-• map: Integer vector that maps the unique (index of derep.unique) to the denoised sequence (index of dada.denoised).
-• birth_subs: A data.frame containing the substitutions at the birth of each new cluster.
-• trans: The matrix of transitions by type (row), e.g. A2A, A2C..., and quality score (col)
-observed in the final output of the dada algorithm.
-• err_in: The err matrix used for this invocation of dada.
-• err_out: The err matrix estimated from the output of dada. NULL if err_function not provided.
-• opts: A list of the dada_opts used for this invocation of dada.
-• call: The function call used for this invocation of dada.
-
-**uniques**: derep, dada, mergepairs (or a data.frame with sequence and abundance columns)
-
-**mergepairs**:
-
-data.frame(s) has a row for each unique pairing of forward/reverse denoised sequences, and the following columns:
-• abundance: Number of reads corresponding to this forward/reverse combination.
-• sequence: The merged sequence.
-• forward: The index of the forward denoised sequence.
-• reverse: The index of the reverse denoised sequence.
-• nmatch: Number of matching nts in the overlap region.
-• nmismatch: Number of mismatches in the overlap region.
-• nindel: Number of indels in the overlap region.
-• prefer: The sequence used for the overlap region. 1=forward; 2=reverse.
-• accept: TRUE if overlap between forward and reverse denoised sequences was at least minOverlap and had at most maxMismatch differences. FALSE otherwise.
-• ...: Additional columns specified in propagateCol
-
-
-
-Tools: 
-======
-
-• Quality filtering 
-  
-  filterAndTrim IO=(fastq -> fastq) 
-
-• Dereplication 
-
-  derepFastq (fastq -> derep-class object)
-
-• Learn error rates 
-
-  learnErrors + plotErrors
-    - in: input list, or vector, of file names (or a list of derep-class objects; WHY? learning should be done on the full data)
-    - out: named list w entries 
-      - \$err\_out: A numeric matrix with the learned error rates. 
-      - \$err\_in: The initialization error rates (unimportant). 
-      - \$trans: A feature table of observed transitions for each type (e.g. A->C) and quality score
-
-• Sample Inference (dada)
-   in: (list of) derep-class object
-   out: (list of) dada-class object 
-
-• Chimera Removal 
-
-  removeBimeraDenovo
-
-  in: A uniques-vector or any object that can be coerced into one with getUniques.
-  out: A uniques vector, or an object of matching class if a data.frame or sequence table is provided
-
-• Merging of Paired Reads 
-
-  mergePairs
-   in: 2x dada-class object(s), 2x derep-class object(s)
-   out: A data.frame, or a list of data.frames.
-     - The return data.frame(s) has a row for each unique pairing of forward/reverse denoised sequences,
-     - cols
-       - \$abundance: Number of reads corresponding to this forward/reverse combination.
-       - \$sequence: The merged sequence.
-       - \$forward: The index of the forward denoised sequence.
-       - \$reverse: The index of the reverse denoised sequence.
-       - \$nmatch: Number of matching nts in the overlap region.
-       - \$nmismatch: Number of mismatches in the overlap region.
-       - \$nindel: Number of indels in the overlap region.
-       - \$prefer: The sequence used for the overlap region. 1=forward; 2=reverse.
-       - \$accept: TRUE if overlap between forward and reverse denoised sequences was at least minOverlap and had at most maxMismatch differences. FALSE otherwise.
-       - \$...: Additional columns specified in propagateCol.
-
-• Taxonomic Classification (assignTaxonomy, assignSpecies)
-
-* Other 
-
-  makeSequenceTable
-   in: A list of the samples to include in the sequence table. Samples can be provided in any format that can be processed by getUniques.
-   out: Named integer matrix (row for each sample, column for each unique sequence).
-
-  mergeSequenceTables
-
-  uniquesToFasta
-  in: A uniques-vector or any object that can be coerced into one with getUniques.
-  
-  getSequences
-
-  extracts the sequences from several different data objects, including dada-class
-  and derep-class objects, as well as data.frame objects that have both \$sequence and
-  \$abundance columns.
-
-  getUniques
-
-  extracts the uniques-vector from several different data objects, including dada-class
-  and derep-class objects, as well as data.frame objects that have both \$sequence and \$abundance
-  columns.
-
-  plotQualityProfile
-
-  seqComplexity
-
-  setDadaOpt(...)
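The "Learn error rates" tool sketched in the deleted notes above wraps learnErrors() and plotErrors(). A minimal example (the filtered-read paths are placeholders):

    library(dada2)
    filts <- list.files('filtered', pattern = '_filt\\.fastq\\.gz$', full.names = TRUE)
    err <- learnErrors(filts, multithread = TRUE)  # the named list (err_out, err_in, trans) described above
    plotErrors(err, nominalQ = TRUE)               # fitted vs. observed error rates per quality score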
Binary file static/images/pairpipe.png has changed
--- a/static/images/pairpipe.svg	Tue May 28 12:48:25 2019 -0400
+++ b/static/images/pairpipe.svg	Thu Aug 29 09:03:48 2019 -0400
@@ -45,12 +45,12 @@
      inkscape:snap-bbox="false"
      inkscape:object-paths="true"
      inkscape:zoom="1"
-     inkscape:cx="685.79259"
-     inkscape:cy="205.1652"
+     inkscape:cx="654.80177"
+     inkscape:cy="201.1652"
      inkscape:window-x="0"
      inkscape:window-y="27"
      inkscape:window-maximized="1"
-     inkscape:current-layer="svg384"
+     inkscape:current-layer="layer1"
      fit-margin-top="0"
      fit-margin-left="0"
      fit-margin-right="0"
@@ -919,7 +919,15 @@
      inkscape:groupmode="layer"
      id="layer1"
      inkscape:label="Layer 1"
-     transform="translate(-178.97434,-516.80337)" />
+     transform="translate(-178.97434,-516.80337)">
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path1104"
+       d="m 547.55976,25.196631 41,34"
+       style="stroke:#000000;marker-end:url(#id2)"
+       transform="translate(178.97434,516.80337)" />
+  </g>
   <g
      inkscape:groupmode="layer"
      id="layer2"
@@ -978,5 +986,11 @@
        sodipodi:role="line"
        id="tspan1102"
        x="214"
-       y="-108.17315"></tspan></text>
+       y="-108.17315" /></text>
+  <path
+     style="stroke:#000000;marker-end:url(#id2)"
+     d="m 549.55976,123.19663 38,44"
+     id="path1255"
+     inkscape:connector-curvature="0"
+     sodipodi:nodetypes="cc" />
 </svg>
--- a/user_input_functions.R	Tue May 28 12:48:25 2019 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,197 +0,0 @@
-# defining functions for checking user inputs
-
-# requesting directory input----------------------------------------------------------
-dir_input <- function(prompt) {
-  check <- FALSE
-  while(check == FALSE) {
-    user_input <- readline(prompt)
-    check <- dir.exists(user_input)
-    if(check==FALSE) {
-      msg <- sprintf("Directory not found: %s", user_input)
-      message(msg)
-    }
-  }
-  return(user_input)
-}
-
-# requesting file input
-file_input <- function(prompt, directory) {
-  check <- FALSE
-  while(check == FALSE) {
-    user_input <- readline(prompt)
-    check <- file.exists(file.path(directory, user_input))
-    if(check==FALSE) {
-      msg <- sprintf("File not found: %s", user_input)
-      message(msg)
-    }
-  }
-  return(user_input)
-}
-
-# requesting string input------------------------------------------------------------
-string_input <- function(prompt) {
-  check <- FALSE
-  while(check == FALSE) {
-    user_input <- readline(prompt)
-    check <- user_input!=''
-    if(check == FALSE) {
-      message("Input can't be empty.")
-    }
-  }
-  return(user_input)
-}
-
-# requesting integer input----------------------------------------------------------
-numeric_input <- function(prompt, default) {
-  check <- FALSE
-  while(check == FALSE) {
-    user_input <- readline(prompt)
-    
-    # if blank, set user_input to default
-    if(user_input == '') {
-      user_input <- default
-      msg <- sprintf("No input supplied, default of %s used.", default)
-      message(msg)
-    }
-    # coerce input to be numeric
-    else {
-      user_input <- as.numeric(user_input) 
-    }
-    # check if number supplied
-    check <- !is.na(user_input)
-    if(check == FALSE) {
-      message("Input must be a number.")
-    }
-  }
-  return(user_input)
-}
-
-# request constrained string---------------------------------------------------------
-# default is numeric, referring to the index of desired option in 'choices'
-cons_string_input <- function(prompt, choices, default) {
-  
-  if(missing(default)){
-    check <- FALSE
-    while(check == FALSE) {
-      user_input <- as.numeric(menu(choices, graphics=FALSE, title=prompt))
-      user_input <- choices[user_input]
-      
-      check <- user_input!=''
-      if(check == FALSE) message("Input can't be empty.")
-      else message(sprintf("\nSelected: %s", user_input))
-    }
-  }
-  
-  if(!missing(default)){
-    # setting up prompt with menu
-    num_choices <- str_c(1:length(choices), choices, sep='. ')
-    menu_prompt <- sprintf("%s\n\t%s\n", prompt, str_c(num_choices, collapse='\n\t'))
-    message(menu_prompt)
-    
-    # requesting user input
-    user_input <- readline("Selection:")
-    
-    # setting default
-    if(user_input == '') {
-      user_input <- as.numeric(default)
-      user_input <- choices[user_input]
-      msg <- sprintf("No input supplied, default of %s used.", user_input)
-      message(msg)
-    }
-    else {
-      user_input <- as.numeric(user_input)
-      user_input <- choices[user_input]
-      message(sprintf("\nSelected: %s", user_input))
-    }
-  }
-  
-  return(user_input)
-}
-
-# request multiple inputs-----------------------------------------------------------
-## default is optional; can set default to NULL if want to allow blank entry
-cons_string_mult <- function(prompt, choices, default) {
-  if(missing(default)) {
-    check <- FALSE
-    while(check == FALSE) {
-      user_input <- select.list(choices=choices, multiple=TRUE, title=prompt, 
-                                graphics = FALSE)
-      
-      if(length(user_input)==0)  {
-        message("Input can't be empty.")
-        check <- FALSE
-      }
-      else {
-        message(sprintf("\nSelected: %s", user_input))
-        check <- TRUE
-      }
-    }
-  }
-  
-  if(!missing(default)) {
-    user_input <- select.list(choices=choices, multiple=TRUE, title=prompt, 
-                              graphics=FALSE)
-    
-    if(length(user_input)==0) {
-      user_input <- default
-      msg <- sprintf("No input supplied, using default: %s",
-                     paste(user_input, collapse=', '))
-      message(msg)
-    }
-    
-  }
-  
-  return(user_input)
-  
-}
-
-# yesno input-------------------------------------------------------------------
-# default is TRUE or FALSE
-yn_input <- function(prompt, default) {
-  
-  choices <- c("Yes", "No")
-  
-  if(missing(default)) {
-    check <- FALSE
-    while(check == FALSE) {
-      user_input <- menu(choices, graphics=FALSE, title=prompt)
-      
-      if(length(user_input)==0)  {
-        message("Input can't be empty.")
-        check <- FALSE
-      }
-      else {
-        message(sprintf("\nSelected: %s", user_input))
-        check <- TRUE
-      }
-    }
-    user_input <- ifelse(user_input == 1L, TRUE, FALSE)
-  }
-  if(!missing(default)) {
-    
-    # setting up prompt with menu
-    num_choices <- str_c(1:length(choices), choices, sep='. ')
-    menu_prompt <- sprintf("%s\n\t%s\n", prompt, str_c(num_choices, collapse='\n\t'))
-    message(menu_prompt)
-    
-    # requesting user input
-    user_input <- readline("Selection:")
-    
-    # setting default
-    if(user_input == '') {
-      user_input <- as.numeric(default)
-      msg <- sprintf("No input supplied, default of %s used.", choices[user_input])
-      message(msg)
-      user_input <- ifelse(user_input == 1L, TRUE, FALSE)
-    }
-    else {
-      user_input <- as.numeric(user_input)
-      message(sprintf("\nSelected: %s", choices[user_input]))
-      user_input <- ifelse(user_input == 1L, TRUE, FALSE)
-    }
-  }
-  
-  
-  return(user_input)
-}
\ No newline at end of file
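These prompt helpers only work in an interactive R session, which is presumably why this changeset drops them along with the monolithic script. Typical calls, as made by the deleted data2.R (prompt text illustrative):

    source('user_input_functions.R')
    maxF <- numeric_input('Maximum expected errors in the forward reads? Default 2:', 2)
    ok   <- yn_input('Proceed? Default yes:', TRUE)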