changeset 2:d9601e1eb360 draft

v1.1.0
author mingchen0919
date Sun, 18 Mar 2018 11:07:16 -0400
parents e969699f49b6
children 1731a1cca226
files getopt_specification.csv helper.R skewer.Rmd skewer.sh skewer.xml skewer_render.R
diffstat 6 files changed, 271 insertions(+), 95 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getopt_specification.csv	Sun Mar 18 11:07:16 2018 -0400
@@ -0,0 +1,28 @@
+short flag,argument mask,data type,variable name,galaxy input type,description
+o,1,character,report,data,
+d,1,character,report.files_path,,
+s,1,character,sink_message,data,
+A,1,character,first_reads,data,
+B,1,character,second_reads,data,
+x,1,character,adapter_first_reads,data,
+y,1,character,adapter_second_reads,data,
+m,1,character,trimming_mode,select,
+r,1,character,maximum_allowed_error_rate,float,
+D,1,character,maximum_allowed_indel_error_rate,float,
+q,1,character,quality_trimming_3_end,integer,
+Q,1,character,mean_quality,integer,
+l,1,character,minimum_read_length,integer,
+j,1,character,advanced_options.junction_adapter,data,
+M,1,character,advanced_options.tab_adapter,data,
+b,1,character,advanced_options.barcode,boolean,
+c,1,character,advanced_options.cut,text,
+n,1,character,advanced_options.filter_degenerative_reads,boolean,
+u,1,character,advanced_options.filter_undetermined_mate_pair_reads,boolean,
+f,1,character,advanced_options.format,select,
+z,1,character,advanced_options.compress,boolean,
+E,1,character,advanced_options.qiime,boolean,
+F,1,character,advanced_options.quiet,boolean,
+i,1,character,advanced_options.intelligent,boolean,
+1,1,character,trimmed_r1,
+2,1,character,trimmed_r2,
+3,1,character,trimmed_s,
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/helper.R	Sun Mar 18 11:07:16 2018 -0400
@@ -0,0 +1,28 @@
+#' \code{getopt_specification_matrix} returns a getopt specification matrix.
+#'
+#' @param specification_file a csv file which stores getopt specification matrix data; it is pasted directly after \code{tool_dir}.
+#' The first column are short flags, the second column are argument masks, the third column
+#' is data types. These three columns are required. The fourth column (variable names used in the tool XML) is not read here.
+#' @param gtg_name the name of a running GTG (currently unused by this function).
+#' @param tool_dir path prefix for \code{specification_file}; no separator is inserted between the two.
+#' @return a matrix with the columns long_flags ('X_' + short flag), short_flags, argument_mask and data_type.
+getopt_specification_matrix = function(specification_file, gtg_name = 'gtg', tool_dir = Sys.getenv('TOOL_DIRECTORY')) {
+  df = read.csv(paste0(tool_dir, specification_file),
+                header = TRUE, stringsAsFactors = FALSE)
+  # check if there are duplicated short flags; name them in the error instead of printing to stdout
+  short_flags = df[, 1]
+  if (anyDuplicated(short_flags) > 0) {
+    stop('Duplicated short flags are not allowed: ',
+         paste(unique(short_flags[duplicated(short_flags)]), collapse = ', '))
+  }
+  
+  # use short flags to generate long flags
+  long_flags = paste0('X_', short_flags)
+  
+  # specification matrix: one row per option, columns in the order long flag, short flag, mask, type
+  df2 = data.frame(long_flags = long_flags,
+                   short_flags = short_flags,
+                   argument_mask = df[, 2],
+                   data_type = df[, 3])
+  as.matrix(df2)
+}
\ No newline at end of file
--- a/skewer.Rmd	Fri Mar 02 09:59:25 2018 -0500
+++ b/skewer.Rmd	Sun Mar 18 11:07:16 2018 -0400
@@ -1,11 +1,15 @@
 ---
-title: 'Skewer Trimming'
-output: html_document
+title: 'Skewer report'
+output:
+    html_document:
+      number_sections: true
+      highlight: tango
+      code_folding: hide
 ---
 
 ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
 knitr::opts_chunk$set(
-  echo = as.logical(opt$X_e), 
+  echo = TRUE, 
   error = TRUE
 )
 ```
@@ -13,44 +17,90 @@
 
 # Run Skewer
 
-```{bash}
+```{bash echo=FALSE}
+#--- build skewer job script ---
+## change directory to output dir
 cd ${X_d}
 
 cat >temp.sh <<EOL
 skewer \\
-  -q ${X_A} \\
-  -Q ${X_B} \\
-  -x ${X_x} \\
-  -y ${X_y} \\
-  ${X_X} \\
-  ${X_Y} \\
-  -o ${X_d}/trim > /dev/null 2>&1
+	${X_A} \\
+	${X_B} \\
+	-x ${X_x} \\ 
+	-y ${X_y} \\
+	-m ${X_m} \\
+	-r ${X_r} \\
+	-d ${X_D} \\
+	-q ${X_q} \\
+	-Q ${X_Q} \\
+	-l ${X_l} \\
+	-j ${X_j} \\
+	-M ${X_M} \\
+	-b ${X_b} \\
+	-c ${X_b} \\
+	-n ${X_n} \\
+	-u ${X_u} \\
+	-f ${X_f} \\
+	-z ${X_z} \\
+	-qiime ${X_E} \\
+	-quiet ${X_F} \\
+	-i ${X_i} \\
+	-o trim > /dev/null 2>&1
+
 EOL
 
-grep -v None temp.sh > skewer-job.sh
-
-# run skewer
-sh skewer-job.sh
-
-cp trim-trimmed-pair1.fastq ${X_f}
-cp trim-trimmed-pair2.fastq ${X_r}
-mv trim-trimmed.log trim-trimmed.txt
+# drop option lines without a usable value (empty -M, 'None', NO_ARGUMENT_NO) and strip the NO_ARGUMENT_YES placeholder
+grep -v '\-M  \\' temp.sh |\
+  grep -v 'None' |\
+  grep -v 'NO_ARGUMENT_NO' |\
+  sed 's/NO_ARGUMENT_YES//g' > skewer-job.sh
 ```
 
-
-```{r}
+```{r echo=FALSE}
 # display skewer job script
 skewer_sh = paste0(opt$X_d, '/skewer-job.sh')
 tags$code(tags$pre(readChar(skewer_sh, file.info(skewer_sh)$size )))
 ```
 
+```{bash echo=FALSE}
+## change directory to output dir
+cd ${X_d}
+
+sh skewer-job.sh
+
+if [ -e trim-trimmed-pair1.fastq ]; then
+  cp trim-trimmed-pair1.fastq ${X_1}
+fi
+
+if [ -e trim-trimmed-pair2.fastq ]; then
+  cp trim-trimmed-pair2.fastq ${X_2}
+fi
+
+if [ -e trim-trimmed.fastq ]; then
+  cp trim-trimmed.fastq ${X_3}
+fi
+
+if [ -e trim-trimmed.log ]; then
+  cp trim-trimmed.log trim-trimmed.txt
+fi
+```
+
+
+
 # Results
 
-```{r}
-tags$ul(
-  tags$li(tags$a(href = 'trim-trimmed.txt', 'trim-trimmed.log')),
-  tags$li(tags$a(href = 'trim-trimmed-pair1.fastq', 'trim-trimmed-pair1.fastq')),
-  tags$li(tags$a(href = 'trim-trimmed-pair2.fastq', 'trim-trimmed-pair2.fastq'))
-)
+```{r echo=FALSE}
+# The log is always linked; the single-end file is linked when it exists in the output
+# directory, otherwise the two paired-end files are linked.
+single_end_found = file.exists(paste0(opt$X_d, '/trim-trimmed.fastq'))
+trimmed_fastq = if (single_end_found) {
+  'trim-trimmed.fastq'
+} else {
+  c('trim-trimmed-pair1.fastq', 'trim-trimmed-pair2.fastq')
+}
+tags$ul(
+  tags$li(tags$a(href = 'trim-trimmed.txt', 'trim-trimmed.log')),
+  lapply(trimmed_fastq, function(fastq) tags$li(tags$a(href = fastq, fastq)))
+)
+
 ```
-
--- a/skewer.sh	Fri Mar 02 09:59:25 2018 -0500
+++ b/skewer.sh	Sun Mar 18 11:07:16 2018 -0400
@@ -1,17 +1,31 @@
-Rscript '${__tool_directory__}/skewer_render.R'
+export TOOL_DIR='${__tool_directory__}' &&
+
+Rscript '${__tool_directory__}/'skewer_render.R
 
-    -e $echo
-    -o $report
-    -d $report.files_path
-    -s $sink_message
-    -t '${__tool_directory__}'
-    
-    -X $first_reads
-    -Y $second_reads
-    -x $adapter_x
-    -y $adapter_y
-    -A $end_quality
-    -B $mean_quality
-    
-    -f $trimmed_r1
-    -r $trimmed_r2
+	-o '$report'
+	-d '$report.files_path'
+	-s '$sink_message'
+	-A '$first_reads'
+	-B '$second_reads'
+	-x '$adapter_first_reads'
+	-y '$adapter_second_reads'
+	-m '$trimming_mode'
+	-r '$maximum_allowed_error_rate'
+	-D '$maximum_allowed_indel_error_rate'
+	-q '$quality_trimming_3_end'
+	-Q '$mean_quality'
+	-l '$minimum_read_length'
+	-j '$advanced_options.junction_adapter'
+	-M '$advanced_options.tab_adapter'
+	-b '$advanced_options.barcode'
+	-c '$advanced_options.cut'
+	-n '$advanced_options.filter_degenerative_reads'
+	-u '$advanced_options.filter_undetermined_mate_pair_reads'
+	-f '$advanced_options.format'
+	-z '$advanced_options.compress'
+	-E '$advanced_options.qiime'
+	-F '$advanced_options.quiet'
+	-i '$advanced_options.intelligent'
+	-1 '$trimmed_r1'
+	-2 '$trimmed_r2'
+	-3 '$trimmed_s'
--- a/skewer.xml	Fri Mar 02 09:59:25 2018 -0500
+++ b/skewer.xml	Sun Mar 18 11:07:16 2018 -0400
@@ -1,4 +1,4 @@
-<tool id="aurora_skewer" name="Aurora Skewer" version="1.0.0">
+<tool id="tool_1" name="Aurora Skewer" version="1.1.0">
     <description>A fast and accurate adapter trimmer for next-generation sequencing paired-end reads&#xD;
     </description>
     <requirements>
@@ -11,47 +11,115 @@
         <regex match="XXX" source="stderr" level="warning"
                description="Check the warnings_and_errors.txt file for more details."/>
     </stdio>
-    <command><![CDATA[Rscript '${__tool_directory__}/skewer_render.R'
+    <command><![CDATA[export TOOL_DIR='${__tool_directory__}' &&
+
+Rscript '${__tool_directory__}/'skewer_render.R
 
-    -e $echo
-    -o $report
-    -d $report.files_path
-    -s $sink_message
-    -t '${__tool_directory__}'
-    
-    -X $first_reads
-    -Y $second_reads
-    -x $adapter_x
-    -y $adapter_y
-    -A $end_quality
-    -B $mean_quality
-    
-    -f $trimmed_r1
-    -r $trimmed_r2
+	-o '$report'
+	-d '$report.files_path'
+	-s '$sink_message'
+	-A '$first_reads'
+	-B '$second_reads'
+	-x '$adapter_first_reads'
+	-y '$adapter_second_reads'
+	-m '$trimming_mode'
+	-r '$maximum_allowed_error_rate'
+	-D '$maximum_allowed_indel_error_rate'
+	-q '$quality_trimming_3_end'
+	-Q '$mean_quality'
+	-l '$minimum_read_length'
+	-j '$advanced_options.junction_adapter'
+	-M '$advanced_options.tab_adapter'
+	-b '$advanced_options.barcode'
+	-c '$advanced_options.cut'
+	-n '$advanced_options.filter_degenerative_reads'
+	-u '$advanced_options.filter_undetermined_mate_pair_reads'
+	-f '$advanced_options.format'
+	-z '$advanced_options.compress'
+	-E '$advanced_options.qiime'
+	-F '$advanced_options.quiet'
+	-i '$advanced_options.intelligent'
+	-1 '$trimmed_r1'
+	-2 '$trimmed_r2'
+	-3 '$trimmed_s'
 ]]></command>
     <inputs>
-        <param type="data" name="first_reads" label="First reads file" optional="False" format="fastq,fastqsanger"/>
-        <param type="data" name="second_reads" label="Second reads" optional="False" format="fastq,fastqsanger"/>
-        <param type="data" name="adapter_x" argument="-x" label="Adapter sequence file for the first reads"
-               optional="True" format="fasta,fa"/>
-        <param type="data" name="adapter_y" argument="-y" label="Adapter sequence file for the second reads"
-               optional="True" format="fasta,fa"/>
-        <param type="integer" name="end_quality" argument="-q" label="3&#x2019; end quality trimming"
-               help="Trim 3&#x2019; end until specified or higher quality reached. The default value is 0."
+        <param type="data" name="first_reads" label="First reads" optional="False" format="fastq,fastqsanger"
+               multiple="True"/>
+        <param type="data" name="second_reads" label="Second reads"
+               help="If it is single end reads, ignore this input field and use the &quot;first reads&quot; field only."
+               optional="True" format="fastq,fastqsanger" multiple="True"/>
+        <param type="data" name="adapter_first_reads" argument="-x" label="Adapter sequence/file for the first reads."
+               optional="True" format="fasta,fa" multiple="False"/>
+        <param type="data" name="adapter_second_reads" argument="-y" label="Adapter sequence/file for the second reads"
+               optional="True" format="fasta,fa" multiple="False"/>
+        <param type="select" name="trimming_mode" argument="-m" label="trimming mode" optional="False" multiple="False">
+            <option value="None" selected="true">--select a mode--</option>
+            <option value="head" selected="false">5&#x2019; end trimming (single end reads)</option>
+            <option value="tail" selected="false">3&#x2019; end trimming (single end reads)</option>
+            <option value="any" selected="false">anywhere adapter detection and trimming (single end reads)</option>
+            <option value="pe" selected="false">paired-end trimming</option>
+            <option value="mp" selected="false">mate-pair trimming</option>
+            <option value="ap" selected="false">amplicon trimming</option>
+        </param>
+        <param type="float" name="maximum_allowed_error_rate" argument="-r" label="Maximum allowed error rate"
+               optional="False" value="0.1" min="0" max="0.5"/>
+        <param type="float" name="maximum_allowed_indel_error_rate" argument="-d"
+               label="Maximum allowed indel error rate"
+               help="The valid range of indel error rate is [0,  maximum allowed error rate]" optional="False"
+               value="0.03" min="0"/>
+        <param type="integer" name="quality_trimming_3_end" argument="-q" label="3&#x2019; end quality trimming"
                optional="False" value="0"/>
         <param type="integer" name="mean_quality" argument="-Q" label="Reads filtering by average quality"
-               help="Specifies the lowest mean quality value allowed before trimming. The default value is 0."
                optional="False" value="0"/>
-        <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false"
-               label="Display analysis code in report?"/>
+        <param type="integer" name="minimum_read_length" argument="-l"
+               label="Minimum read length allowed after trimming" optional="False" value="18"/>
+        <section name="advanced_options" title="Advanced options" expanded="False">
+            <param type="data" name="junction_adapter" argument="-j"
+                   label="Junction adapter sequence/file for Nextera Mate Pair reads" optional="True" format="fasta,fa"
+                   multiple="False"/>
+            <param type="text" name="tab_adapter" argument="-M"
+                   label="TAB delimited file indicates valid forward/reverse adapter pairing" optional="True"/>
+            <param type="boolean" name="barcode" argument="-b"
+                   label="Whether to demultiplex reads according to adapters/primers" optional="False" checked="False"
+                   truevalue="NO_ARGUMENT_YES" falsevalue="NO_ARGUMENT_NO"/>
+            <param type="text" name="cut" argument="-c"
+                   label="To hard clip off the 5&#x2019; leading bases of the forward primer and reverse primer respectively as the barcodes in amplicon mode"
+                   optional="False" value="0, 0"/>
+            <param type="boolean" name="filter_degenerative_reads" argument="-n"
+                   label="Whether to filter out highly degenerative reads" optional="False" checked="False"
+                   truevalue="NO_ARGUMENT_YES" falsevalue="NO_ARGUMENT_NO"/>
+            <param type="boolean" name="filter_undetermined_mate_pair_reads" argument="-u"
+                   label="Whether to filter out undetermined mate-pair reads" optional="False" checked="False"
+                   truevalue="NO_ARGUMENT_YES" falsevalue="NO_ARGUMENT_NO"/>
+            <param type="select" name="format" argument="-f" label="Format of FASTQ quality value" optional="False"
+                   multiple="False">
+                <option value="sanger" selected="false">sanger</option>
+                <option value="solexa" selected="false">solexa</option>
+                <option value="auto" selected="true">auto</option>
+            </param>
+            <param type="boolean" name="compress" argument="-z" label="Whether to compress output in GZIP format"
+                   optional="False" truevalue="NO_ARGUMENT_YES" falsevalue="NO_ARGUMENT_NO"/>
+            <param type="boolean" name="qiime" argument="--qiime" label="Whether to prepare files required by QIIME"
+                   optional="False" truevalue="NO_ARGUMENT_YES" falsevalue="NO_ARGUMENT_NO"/>
+            <param type="boolean" name="quiet" argument="--quiet" label="Whether in quiet mode" optional="False"
+                   truevalue="NO_ARGUMENT_YES" falsevalue="NO_ARGUMENT_NO"/>
+            <param type="boolean" name="intelligent" argument="-i" label="Whether to intelligently redistribute reads"
+                   optional="False" checked="False" truevalue="NO_ARGUMENT_YES" falsevalue="NO_ARGUMENT_NO"/>
+        </section>
     </inputs>
     <outputs>
-        <data name="report" format="html" label="Skewer trimmed" hidden="false"/>
-        <data format="txt" name="sink_message" label="Warnings and Errors" from_work_dir="warnings_and_errors.txt"/>
+        <data name="report" format="html" label="${tool.name} report on ${on_string}" hidden="false"/>
+        <data name="sink_message" format="txt" label="${tool.name} running log" from_work_dir="warnings_and_errors.txt"
+              hidden="false"/>
         <data name="trimmed_r1" format="fastq" label="${tool.name} trimmed first reads on ${on_string}" hidden="false"/>
         <data name="trimmed_r2" format="fastq" label="${tool.name} trimmed second reads on ${on_string}"
               hidden="false"/>
+        <data name="trimmed_s" format="fastq" label="${tool.name} trimmed single end reads on ${on_string}"
+              hidden="false"/>
     </outputs>
+    <help>
+        <![CDATA[Read the `tool manual <http://download2.nust.na/pub4/sourceforge/s/project/sk/skewer/skewer_manual.pdf>`_ if you are not sure what parameter values to use.]]></help>
     <citations>
         <citation type="bibtex"><![CDATA[
             @article{allaire2016rmarkdown,
--- a/skewer_render.R	Fri Mar 02 09:59:25 2018 -0500
+++ b/skewer_render.R	Sun Mar 18 11:07:16 2018 -0400
@@ -15,24 +15,13 @@
 
 
 #------------get arguments into R--------------------
-# getopt_specification_matrix(extract_short_flags('skewer.xml')) %>%
-#   write.table(file = 'spec.txt', sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE)
-
-
-spec_matrix = as.matrix(
-  data.frame(stringsAsFactors=FALSE,
-              long_flags = c("X_e", "X_o", "X_d", "X_s", "X_t", "X_X", "X_Y",
-                             "X_x", "X_y", "X_A", "X_B", "X_f", "X_r"),
-             short_flags = c("e", "o", "d", "s", "t", "X", "Y", "x", "y", "A",
-                             "B", "f", "r"),
-     argument_mask_flags = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
-         data_type_flags = c("character", "character", "character", "character",
-                             "character", "character", "character",
-                             "character", "character", "character", "character",
-                             "character", "character")
-  )
-)
-opt = getopt(spec_matrix)
+# load helper function from the directory named by the TOOL_DIR environment variable
+source(paste0(Sys.getenv('TOOL_DIR'), '/helper.R'))
+# import getopt specification matrix from a csv file; spec_csv is already a full path, so no prefix is added
+spec_csv = paste0(Sys.getenv('TOOL_DIR'), '/getopt_specification.csv')
+opt = getopt(getopt_specification_matrix(spec_csv, tool_dir = ''))
+opt$X_t = Sys.getenv('TOOL_DIR')
+print(opt)  # show the parsed options in the script output
 #----------------------------------------------------
 
 
@@ -46,14 +35,13 @@
 # TOOL_DIR: path to the tool installation directory
 OUTPUT_DIR = opt$X_d
 TOOL_DIR =   opt$X_t
+OUTPUT_REPORT = opt$X_o
 RMD_NAME = 'skewer.Rmd'
-OUTPUT_REPORT = opt$X_o
 
 # create the output associated directory to store all outputs
 dir.create(OUTPUT_DIR, recursive = TRUE)
-opt
+
 #-----------------render Rmd--------------
-paste0(TOOL_DIR, '/', RMD_NAME)
 render(paste0(TOOL_DIR, '/', RMD_NAME), output_file = OUTPUT_REPORT)
 #------------------------------------------