Mercurial > repos > mingchen0919 > aurora_skewer
changeset 2:d9601e1eb360 draft
v1.1.0
author | mingchen0919 |
---|---|
date | Sun, 18 Mar 2018 11:07:16 -0400 |
parents | e969699f49b6 |
children | 1731a1cca226 |
files | getopt_specification.csv helper.R skewer.Rmd skewer.sh skewer.xml skewer_render.R |
diffstat | 6 files changed, 271 insertions(+), 95 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getopt_specification.csv Sun Mar 18 11:07:16 2018 -0400 @@ -0,0 +1,28 @@ +short flag,argument mask,data type,variable name,galaxy input type,description +o,1,character,report,data, +d,1,character,report.files_path,, +s,1,character,sink_message,data, +A,1,character,first_reads,data, +B,1,character,second_reads,data, +x,1,character,adapter_first_reads,data, +y,1,character,adapter_second_reads,data, +m,1,character,trimming_mode,select, +r,1,character,maximum_allowed_error_rate,float, +D,1,character,maximum_allowed_indel_error_rate,float, +q,1,character,quality_trimming_3_end,integer, +Q,1,character,mean_quality,integer, +l,1,character,minimum_read_length,integer, +j,1,character,advanced_options.junction_adapter,data, +M,1,character,advanced_options.tab_adapter,data, +b,1,character,advanced_options.barcode,boolean, +c,1,character,advanced_options.cut,text, +n,1,character,advanced_options.filter_degenerative_reads,boolean, +u,1,character,advanced_options.filter_undetermined_mate_pair_reads,boolean, +f,1,character,advanced_options.format,select, +z,1,character,advanced_options.compress,boolean, +E,1,character,advanced_options.qiime,boolean, +F,1,character,advanced_options.quiet,boolean, +i,1,character,advanced_options.intelligent,boolean, +1,1,character,trimmed_r1, +2,1,character,trimmed_r2, +3,1,character,trimmed_s, \ No newline at end of file
# helper.R
#' Build a getopt specification matrix from a csv file.
#'
#' \code{getopt_specification_matrix} reads a csv specification and returns a
#' four-column character matrix (long flag, short flag, argument mask, data
#' type). Long flags are generated by prefixing each short flag with
#' \code{X_}.
#'
#' @param specification_file path to a csv file (with a header row) which
#'   stores the getopt specification data. The first column holds short flags,
#'   the second column argument masks and the third column data types; these
#'   three columns are required. The fourth column (variable names used in the
#'   tool XML) and any further columns are not used by this function. The path
#'   may be absolute, or relative to \code{tool_dir}.
#' @param gtg_name the name of a running GTG. Not used by this function; kept
#'   so that existing callers keep working.
#' @param tool_dir directory the specification file is looked up in. Defaults
#'   to the \code{TOOL_DIRECTORY} environment variable (an empty string when
#'   the variable is not set).
#' @return a character matrix with the columns \code{long_flags},
#'   \code{short_flags}, \code{argument_mask} and \code{data_type}, one row per
#'   row of the csv file.
getopt_specification_matrix = function(specification_file, gtg_name = 'gtg', tool_dir = Sys.getenv('TOOL_DIRECTORY')) {
  # Candidate locations, in order of preference:
  #   1. plain concatenation (the historical behaviour, expects a trailing
  #      slash on tool_dir or an empty tool_dir),
  #   2. a proper path join, so tool_dir does not need a trailing slash,
  #   3. the path exactly as given, so an absolute path still works when
  #      tool_dir happens to be set.
  candidates = paste0(tool_dir, specification_file)
  if (nzchar(tool_dir)) {
    candidates = c(candidates, file.path(tool_dir, specification_file))
  }
  candidates = unique(c(candidates, specification_file))
  usable = candidates[file.exists(candidates) & !dir.exists(candidates)]
  if (length(usable) == 0) {
    stop('Specification file not found. Tried: ',
         paste(candidates, collapse = ', '), call. = FALSE)
  }

  # Read every column as character so flags such as '1' and masks such as '1'
  # reach the matrix verbatim instead of being re-formatted as numbers.
  df = read.csv(usable[1], header = TRUE, stringsAsFactors = FALSE,
                colClasses = 'character')
  if (ncol(df) < 3) {
    stop('The specification file needs at least three columns: ',
         'short flag, argument mask and data type.', call. = FALSE)
  }
  if (nrow(df) == 0) {
    stop('The specification file contains no flag definitions.', call. = FALSE)
  }

  # check if there are duplicated short flags
  short_flags = df[[1]]
  duplicated_flags = unique(short_flags[duplicated(short_flags)])
  if (length(duplicated_flags) > 0) {
    stop('Duplicated short flags are not allowed. Duplicated short flags: ',
         paste(duplicated_flags, collapse = ' '), call. = FALSE)
  }

  # specification matrix; long flags are generated from the short flags
  cbind(long_flags = paste0('X_', short_flags),
        short_flags = short_flags,
        argument_mask = df[[2]],
        data_type = df[[3]])
}
--- a/skewer.Rmd Fri Mar 02 09:59:25 2018 -0500 +++ b/skewer.Rmd Sun Mar 18 11:07:16 2018 -0400 @@ -1,11 +1,15 @@ --- -title: 'Skewer Trimming' -output: html_document +title: 'Skewer report' +output: + html_document: + number_sections: true + highlight: tango + code_folding: hide --- ```{r setup, include=FALSE, warning=FALSE, message=FALSE} knitr::opts_chunk$set( - echo = as.logical(opt$X_e), + echo = TRUE, error = TRUE ) ``` @@ -13,44 +17,90 @@ # Run Skewer -```{bash} +```{bash echo=FALSE} +#--- build skewer job script --- +## change directory to output dir cd ${X_d} cat >temp.sh <<EOL skewer \\ - -q ${X_A} \\ - -Q ${X_B} \\ - -x ${X_x} \\ - -y ${X_y} \\ - ${X_X} \\ - ${X_Y} \\ - -o ${X_d}/trim > /dev/null 2>&1 + ${X_A} \\ + ${X_B} \\ + -x ${X_x} \\ + -y ${X_y} \\ + -m ${X_m} \\ + -r ${X_r} \\ + -d ${X_D} \\ + -q ${X_q} \\ + -Q ${X_Q} \\ + -l ${X_l} \\ + -j ${X_j} \\ + -M ${X_M} \\ + -b ${X_b} \\ + -c ${X_b} \\ + -n ${X_n} \\ + -u ${X_u} \\ + -f ${X_f} \\ + -z ${X_z} \\ + -qiime ${X_E} \\ + -quiet ${X_F} \\ + -i ${X_i} \\ + -o trim > /dev/null 2>&1 + EOL -grep -v None temp.sh > skewer-job.sh - -# run skewer -sh skewer-job.sh - -cp trim-trimmed-pair1.fastq ${X_f} -cp trim-trimmed-pair2.fastq ${X_r} -mv trim-trimmed.log trim-trimmed.txt +# remove empty input lines +grep -v '\-M \\' temp.sh |\ + grep -v 'None' |\ + grep -v 'NO_ARGUMENT_NO' |\ + sed 's/NO_ARGUMENT_YES//g' > skewer-job.sh ``` - -```{r} +```{r echo=FALSE} # display skewer job script skewer_sh = paste0(opt$X_d, '/skewer-job.sh') tags$code(tags$pre(readChar(skewer_sh, file.info(skewer_sh)$size ))) ``` +```{bash echo=FALSE} +## change directory to output dir +cd ${X_d} + +sh skewer-job.sh + +if [ -e trim-trimmed-pair1.fastq ]; then + cp trim-trimmed-pair1.fastq ${X_1} +fi + +if [ -e trim-trimmed-pair2.fastq ]; then + cp trim-trimmed-pair2.fastq ${X_2} +fi + +if [ -e trim-trimmed.fastq ]; then + cp trim-trimmed.fastq ${X_3} +fi + +if [ -e trim-trimmed.log ]; then + cp trim-trimmed.log trim-trimmed.txt +fi 
+``` + + + # Results -```{r} -tags$ul( - tags$li(tags$a(href = 'trim-trimmed.txt', 'trim-trimmed.log')), - tags$li(tags$a(href = 'trim-trimmed-pair1.fastq', 'trim-trimmed-pair1.fastq')), - tags$li(tags$a(href = 'trim-trimmed-pair2.fastq', 'trim-trimmed-pair2.fastq')) -) +```{r echo=FALSE} +if (file.exists(paste0(opt$X_d, '/trim-trimmed.fastq'))) { + tags$ul( + tags$li(tags$a(href = 'trim-trimmed.txt', 'trim-trimmed.log')), + tags$li(tags$a(href = 'trim-trimmed.fastq', 'trim-trimmed.fastq')) + ) +} else { + tags$ul( + tags$li(tags$a(href = 'trim-trimmed.txt', 'trim-trimmed.log')), + tags$li(tags$a(href = 'trim-trimmed-pair1.fastq', 'trim-trimmed-pair1.fastq')), + tags$li(tags$a(href = 'trim-trimmed-pair2.fastq', 'trim-trimmed-pair2.fastq')) + ) +} + ``` -
--- a/skewer.sh Fri Mar 02 09:59:25 2018 -0500 +++ b/skewer.sh Sun Mar 18 11:07:16 2018 -0400 @@ -1,17 +1,31 @@ -Rscript '${__tool_directory__}/skewer_render.R' +export TOOL_DIR='${__tool_directory__}' && + +Rscript '${__tool_directory__}/'skewer_render.R - -e $echo - -o $report - -d $report.files_path - -s $sink_message - -t '${__tool_directory__}' - - -X $first_reads - -Y $second_reads - -x $adapter_x - -y $adapter_y - -A $end_quality - -B $mean_quality - - -f $trimmed_r1 - -r $trimmed_r2 + -o '$report' + -d '$report.files_path' + -s '$sink_message' + -A '$first_reads' + -B '$second_reads' + -x '$adapter_first_reads' + -y '$adapter_second_reads' + -m '$trimming_mode' + -r '$maximum_allowed_error_rate' + -D '$maximum_allowed_indel_error_rate' + -q '$quality_trimming_3_end' + -Q '$mean_quality' + -l '$minimum_read_length' + -j '$advanced_options.junction_adapter' + -M '$advanced_options.tab_adapter' + -b '$advanced_options.barcode' + -c '$advanced_options.cut' + -n '$advanced_options.filter_degenerative_reads' + -u '$advanced_options.filter_undetermined_mate_pair_reads' + -f '$advanced_options.format' + -z '$advanced_options.compress' + -E '$advanced_options.qiime' + -F '$advanced_options.quiet' + -i '$advanced_options.intelligent' + -1 '$trimmed_r1' + -2 '$trimmed_r2' + -3 '$trimmed_s'
--- a/skewer.xml Fri Mar 02 09:59:25 2018 -0500 +++ b/skewer.xml Sun Mar 18 11:07:16 2018 -0400 @@ -1,4 +1,4 @@ -<tool id="aurora_skewer" name="Aurora Skewer" version="1.0.0"> +<tool id="tool_1" name="Aurora Skewer" version="1.1.0"> <description>A fast and accurate adapter trimmer for next-generation sequencing paired-end reads
 </description> <requirements> @@ -11,47 +11,115 @@ <regex match="XXX" source="stderr" level="warning" description="Check the warnings_and_errors.txt file for more details."/> </stdio> - <command><![CDATA[Rscript '${__tool_directory__}/skewer_render.R' + <command><![CDATA[export TOOL_DIR='${__tool_directory__}' && + +Rscript '${__tool_directory__}/'skewer_render.R - -e $echo - -o $report - -d $report.files_path - -s $sink_message - -t '${__tool_directory__}' - - -X $first_reads - -Y $second_reads - -x $adapter_x - -y $adapter_y - -A $end_quality - -B $mean_quality - - -f $trimmed_r1 - -r $trimmed_r2 + -o '$report' + -d '$report.files_path' + -s '$sink_message' + -A '$first_reads' + -B '$second_reads' + -x '$adapter_first_reads' + -y '$adapter_second_reads' + -m '$trimming_mode' + -r '$maximum_allowed_error_rate' + -D '$maximum_allowed_indel_error_rate' + -q '$quality_trimming_3_end' + -Q '$mean_quality' + -l '$minimum_read_length' + -j '$advanced_options.junction_adapter' + -M '$advanced_options.tab_adapter' + -b '$advanced_options.barcode' + -c '$advanced_options.cut' + -n '$advanced_options.filter_degenerative_reads' + -u '$advanced_options.filter_undetermined_mate_pair_reads' + -f '$advanced_options.format' + -z '$advanced_options.compress' + -E '$advanced_options.qiime' + -F '$advanced_options.quiet' + -i '$advanced_options.intelligent' + -1 '$trimmed_r1' + -2 '$trimmed_r2' + -3 '$trimmed_s' ]]></command> <inputs> - <param type="data" name="first_reads" label="First reads file" optional="False" format="fastq,fastqsanger"/> - <param type="data" name="second_reads" label="Second reads" optional="False" format="fastq,fastqsanger"/> - <param type="data" name="adapter_x" argument="-x" label="Adapter sequence file for the first reads" - optional="True" format="fasta,fa"/> - <param type="data" name="adapter_y" argument="-y" label="Adapter sequence file for the second reads" - optional="True" format="fasta,fa"/> - <param type="integer" name="end_quality" argument="-q" 
label="3’ end quality trimming" - help="Trim 3’ end until specified or higher quality reached. The default value is 0." + <param type="data" name="first_reads" label="First reads" optional="False" format="fastq,fastqsanger" + multiple="True"/> + <param type="data" name="second_reads" label="Second reads" + help="If it is single end reads, ignore this input field and use the "first reads" field only." + optional="True" format="fastq,fastqsanger" multiple="True"/> + <param type="data" name="adapter_first_reads" argument="-x" label="Adapter sequence/file for the first reads." + optional="True" format="fasta,fa" multiple="False"/> + <param type="data" name="adapter_second_reads" argument="-y" label="Adapter sequence/file for the second reads" + optional="True" format="fasta,fa" multiple="False"/> + <param type="select" name="trimming_mode" argument="-m" label="trimming mode" optional="False" multiple="False"> + <option value="None" selected="true">--select a model--</option> + <option value="head" selected="false">5’ end trimming (single end reads)</option> + <option value="tail" selected="false">3’ end trimming (single end reads)</option> + <option value="any" selected="false">anywhere adapter detection and trimming (single end reads)</option> + <option value="pe" selected="false">paired-end trimming</option> + <option value="mp" selected="false">mate-pair trimming</option> + <option value="ap" selected="false">amplicon trimming</option> + </param> + <param type="float" name="maximum_allowed_error_rate" argument="-r" label="Maximum allowed error rate" + optional="False" value="0.1" min="0" max="0.5"/> + <param type="float" name="maximum_allowed_indel_error_rate" argument="-d" + label="Maximum allowed indel error rate" + help="The valid range of indel error rate is [0, maximum allowed error rate]" optional="False" + value="0.03" min="0"/> + <param type="integer" name="quality_trimming_3_end" argument="-q" label="3’ end quality trimming" optional="False" value="0"/> 
<param type="integer" name="mean_quality" argument="-Q" label="Reads filtering by average quality" - help="Specifies the lowest mean quality value allowed before trimming. The default value is 0." optional="False" value="0"/> - <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false" - label="Display analysis code in report?"/> + <param type="integer" name="minimum_read_length" argument="-l" + label="Minimum read length allowed after trimming" optional="False" value="18"/> + <section name="advanced_options" title="Advanced options" expanded="False"> + <param type="data" name="junction_adapter" argument="-j" + label="Junction adapter sequence/file for Nextera Mate Pair reads" optional="True" format="fasta,fa" + multiple="False"/> + <param type="text" name="tab_adapter" argument="-M" + label="TAB delimited file indicates valid forward/reverse adapter pairing" optional="True"/> + <param type="boolean" name="barcode" argument="-b" + label="Whether to demultiplex reads according to adapters/primers" optional="False" checked="False" + truevalue="NO_ARGUMENT_YES" falsevalue="NO_ARGUMENT_NO"/> + <param type="text" name="cut" argument="-c" + label="To hard clip off the 5’ leading bases of the forward primer and reverse primer respectively as the barcodes in amplicon mode" + optional="False" value="0, 0"/> + <param type="boolean" name="filter_degenerative_reads" argument="-n" + label="Whether to filter out highly degenerative reads" optional="False" checked="False" + truevalue="NO_ARGUMENT_YES" falsevalue="NO_ARGUMENT_NO"/> + <param type="boolean" name="filter_undetermined_mate_pair_reads" argument="-u" + label="Whether to filter out undetermined mate-pair reads" optional="False" checked="False" + truevalue="NO_ARGUMENT_YES" falsevalue="NO_ARGUMENT_NO"/> + <param type="select" name="format" argument="-f" label="Format of FASTQ quality value" optional="False" + multiple="False"> + <option value="sanger" selected="false">sanger</option> + <option 
value="solexa" selected="false">solexa</option> + <option value="auto" selected="true">auto</option> + </param> + <param type="boolean" name="compress" argument="-z" label="Whether to compress output in GZIP format" + optional="False" truevalue="NO_ARGUMENT_YES" falsevalue="NO_ARGUMENT_NO"/> + <param type="boolean" name="qiime" argument="-qiime" label="Whether to prepare files required by QIIME" + optional="False" truevalue="NO_ARGUMENT_YES" falsevalue="NO_ARGUMENT_NO"/> + <param type="boolean" name="quiet" argument="--quiet" label="Whether in quiet mode" optional="False" + truevalue="NO_ARGUMENT_YES" falsevalue="NO_ARGUMENT_NO"/> + <param type="boolean" name="intelligent" argument="-i" label="Whether to intelligently redistribute reads" + optional="False" checked="False" truevalue="NO_ARGUMENT_YES" falsevalue="NO_ARGUMENT_NO"/> + </section> </inputs> <outputs> - <data name="report" format="html" label="Skewer trimmed" hidden="false"/> - <data format="txt" name="sink_message" label="Warnings and Errors" from_work_dir="warnings_and_errors.txt"/> + <data name="report" format="html" label="${tool.name} report on ${on_string}" hidden="false"/> + <data name="sink_message" format="txt" label="${tool.name} running log" from_work_dir="warnings_and_errors.txt" + hidden="false"/> <data name="trimmed_r1" format="fastq" label="${tool.name} trimmed first reads on ${on_string}" hidden="false"/> <data name="trimmed_r2" format="fastq" label="${tool.name} trimmed second reads on ${on_string}" hidden="false"/> + <data name="trimmed_s" format="fastq" label="${tool.name} trimmed single end reads on ${on_string}" + hidden="false"/> </outputs> + <help> + <![CDATA[Read the `tool manual <http://download2.nust.na/pub4/sourceforge/s/project/sk/skewer/skewer_manual.pdf>`_ if you are not sure what parameter values to use.]]></help> <citations> <citation type="bibtex"><![CDATA[ @article{allaire2016rmarkdown,
--- a/skewer_render.R Fri Mar 02 09:59:25 2018 -0500 +++ b/skewer_render.R Sun Mar 18 11:07:16 2018 -0400 @@ -15,24 +15,13 @@ #------------get arguments into R-------------------- -# getopt_specification_matrix(extract_short_flags('skewer.xml')) %>% -# write.table(file = 'spec.txt', sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE) - - -spec_matrix = as.matrix( - data.frame(stringsAsFactors=FALSE, - long_flags = c("X_e", "X_o", "X_d", "X_s", "X_t", "X_X", "X_Y", - "X_x", "X_y", "X_A", "X_B", "X_f", "X_r"), - short_flags = c("e", "o", "d", "s", "t", "X", "Y", "x", "y", "A", - "B", "f", "r"), - argument_mask_flags = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), - data_type_flags = c("character", "character", "character", "character", - "character", "character", "character", - "character", "character", "character", "character", - "character", "character") - ) -) -opt = getopt(spec_matrix) +# load helper function +source(paste0(Sys.getenv('TOOL_DIR'), '/helper.R')) +# import getopt specification matrix from a csv file +spec_csv = paste0(Sys.getenv('TOOL_DIR'), '/getopt_specification.csv') +opt = getopt(getopt_specification_matrix(spec_csv)) +opt$X_t = Sys.getenv('TOOL_DIR') +print(opt) #---------------------------------------------------- @@ -46,14 +35,13 @@ # TOOL_DIR: path to the tool installation directory OUTPUT_DIR = opt$X_d TOOL_DIR = opt$X_t +OUTPUT_REPORT = opt$X_o RMD_NAME = 'skewer.Rmd' -OUTPUT_REPORT = opt$X_o # create the output associated directory to store all outputs dir.create(OUTPUT_DIR, recursive = TRUE) -opt + #-----------------render Rmd-------------- -paste0(TOOL_DIR, '/', RMD_NAME) render(paste0(TOOL_DIR, '/', RMD_NAME), output_file = OUTPUT_REPORT) #------------------------------------------