changeset 0:0bfeaf18a871 draft

planemo upload
author mingchen0919
date Wed, 07 Mar 2018 08:45:02 -0500
parents
children 9be3a8754fb3
files htseq_count.Rmd htseq_count.sh htseq_count.xml htseq_count_index.Rmd htseq_count_render.R htseq_count_site.yml
diffstat 6 files changed, 311 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/htseq_count.Rmd	Wed Mar 07 08:45:02 2018 -0500
@@ -0,0 +1,65 @@
+---
+title: 'htseq-count analysis'
+output: html_document
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(
+  echo = as.logical(opt$X_e),  # opt is not defined in this document; it is the getopt result created in htseq_count_render.R
+  error = TRUE  # keep knitting when a chunk fails so the error text shows up in the report
+)
+```
+
+
+## Run htseq-count
+
+```{bash}
+cd ${X_d}
+# Generate the job script. EOF is unquoted, so every ${X_*} and the $(...) are expanded while writing, and each \\ lands in the file as a single \ (line continuation).
+cat >htseq-count.sh <<EOF
+  htseq-count \\
+    $(echo ${X_A} | sed 's/,/ /g') \\
+    ${X_G} \\
+    -f ${X_f} \\
+    -r ${X_r} \\
+    -s ${X_S} \\
+    -a ${X_a} \\
+    -t ${X_T} \\
+    -i ${X_i} \\
+    -m ${X_m} > counts.txt
+    
+  grep -v '__no_feature\|__ambiguous\|__too_low_aQual\|__not_aligned\|__alignment_not_unique' counts.txt > ${X_c}
+EOF
+```
+
+
+```{r}
+# show the generated job script verbatim; tags is namespace-qualified because htmltools is never attached
+htseq_count_sh = paste0(opt$X_d, '/htseq-count.sh')
+htmltools::tags$code(htmltools::tags$pre(readChar(htseq_count_sh, file.info(htseq_count_sh)$size )))
+```
+
+```{bash}
+cd ${X_d}  # the script was written to this directory and redirects its raw output to ./counts.txt
+sh htseq-count.sh  # also writes the table without the "__" summary rows to ${X_c}
+```
+
+## Counts
+
+```{r}
+count_data = read.table(paste0(opt$X_d, '/counts.txt'), row.names = 1)  # first column becomes the row names
+sample_names = trimws(strsplit(opt$X_B, ',')[[1]])  # comma-separated list; trimws() tolerates "a, b, c"
+colnames(count_data) = rep(sample_names, length.out = ncol(count_data))  # names are recycled or truncated to the number of count columns
+DT::datatable(count_data, caption = "htseq counts for each sample")
+```
+
+```{r}
+# store the labelled count table (object name: count_data) as counts.RData in the output directory
+save(count_data, file = paste0(opt$X_d, '/counts.RData'))
+```
+
+```{bash}
+cp ${X_d}/counts.RData ${X_O}  # X_O is the path given with -O
+```
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/htseq_count.sh	Wed Mar 07 08:45:02 2018 -0500
@@ -0,0 +1,20 @@
+Rscript '${__tool_directory__}/htseq_count_render.R'
+
+			-e $echo
+			-o $report
+			-d $report.files_path
+			-s $sink_message
+			-t '${__tool_directory__}'
+			
+			-A '$alignment_files'
+			-B '$sample_names'
+			-G $gff
+			-f $format
+			-r $order
+			-S $stranded
+			-a $minaqual
+			-T $feature_type
+			-i $idattr
+			-m $mode
+			-c $count
+			-O $count_rdata
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/htseq_count.xml	Wed Mar 07 08:45:02 2018 -0500
@@ -0,0 +1,112 @@
+<tool id="aurora_htseq_size" name="Aurora htseq site" version="1.0.0">
+    <description>Counting reads in features.</description>
+    <requirements>
+        <requirement type="package" version="1.15.0.6-0">pandoc</requirement>
+        <requirement type="package" version="1.20.0">r-getopt</requirement>
+        <requirement type="package" version="1.6">r-rmarkdown</requirement>
+        <requirement type="package" version="0.9.1">htseq</requirement>
+        <requirement type="package" version="0.2">r-dt</requirement>
+    </requirements>
+    <stdio>
+        <regex match="XXX" source="stderr" level="warning"
+               description="Check the warnings_and_errors.txt file for more details."/>
+    </stdio>
+    <command><![CDATA[Rscript '${__tool_directory__}/htseq_count_render.R'
+## The render script shipped with this tool is htseq_count_render.R; the options below are parsed there with getopt.
+			-e $echo
+			-o $report
+			-d $report.files_path
+			-s $sink_message
+			-t '${__tool_directory__}'
+			
+			-A '$alignment_files'
+			-B '$sample_names'
+			-G $gff
+			-f $format
+			-r $order
+			-S $stranded
+			-a $minaqual
+			-T $feature_type
+			-i $idattr
+			-m $mode
+			-c $count
+			-O $count_rdata]]></command>
+    <inputs>
+        <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false"
+               label="Display analysis code in report?"/>
+        <param type="data" name="alignment_files" label="Alignment file (SAM/BAM)" optional="False" format="sam,bam"
+               multiple="True"/>
+        <param type="text" name="sample_names" label="Sample names"
+               help="sample names for each input SAM/BAM file. Use comma to separate multiple names, for example, sample_1, sample_2, sample_3. The number of sample names should be the same as the number of input SAM/BAM files."
+               optional="False" size="5x25"/>
+        <param type="select" name="format" argument="-f" label="Input data format" optional="False" multiple="False">
+            <option value="sam" selected="false">SAM</option>
+            <option value="bam" selected="true">BAM</option>
+        </param>
+        <param type="data" name="gff" label="GFF file" optional="False" format="gff, gff3"/>
+        <param type="select" name="order" argument="-r" label="Order"
+               help="Use this option to indicate how the input data has been sorted. The default is name.&#13;&#10;&#13;&#10;"
+               optional="False" multiple="False">
+            <option value="name" selected="false">Name</option>
+            <option value="pos" selected="false">Position</option>
+        </param>
+        <param type="select" name="stranded" argument="-s" label="Stranded" optional="False" multiple="False">
+            <option value="yes" selected="true">Yes</option>
+            <option value="no" selected="false">No</option>
+            <option value="reverse" selected="false">Reverse</option>
+        </param>
+        <param type="integer" name="minaqual" argument="-a" label="Alignment quality "
+               help="Skip all reads with alignment quality lower than the given minimum value (default: 10)"
+               optional="False" value="10" min="0"/>
+        <param type="text" name="feature_type" argument="-t" label="Feature type"
+               help="Feature type (3rd column in GFF file) to be used, all features of other type are ignored (default, suitable for RNA-Seq analysis using an Ensembl GTF file: exon)"
+               optional="False" value="exon"/>
+        <param type="text" name="idattr" argument="-i" label="ID attribute"
+               help="GFF attribute to be used as feature ID. Several GFF lines with the same feature ID will be considered as parts of the same feature. The feature ID is used to identify the counts in the output table. The default, suitable for RNA-Seq analysis using an Ensembl GTF file, is gene_id."
+               optional="False" value="gene_id"/>
+        <param type="select" name="mode" argument="-m" label="Mode"
+               help="Mode to handle reads overlapping more than one feature. Possible values for &lt;mode&gt; are union, intersection-strict and intersection-nonempty (default: union)"
+               optional="False" multiple="False">
+            <option value="union" selected="true">Union</option>
+            <option value="intersection-strict" selected="false">Intersection (strict)</option>
+            <option value="intersection-nonempty" selected="false">Intersection (nonempty)</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="report" format="html" label="Aurora htseq-count site on ${on_string}" hidden="false"/>
+        <data format="txt" name="sink_message" label="Warnings and Errors" from_work_dir="warnings_and_errors.txt"/>
+        <data name="count" format="txt" label="Aurora htseq-count site on ${on_string}" hidden="false"/>
+        <data name="count_rdata" format="rdata" hidden="false"/>
+    </outputs>
+    <citations>
+        <citation type="bibtex"><![CDATA[
+            @article{allaire2016rmarkdown,
+            title={rmarkdown: Dynamic Documents for R, 2016},
+            author={Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff
+            and Wickham, Hadley and Atkins, Aron and Hyndman, Rob},
+            journal={R package version 0.9},
+            volume={6},
+            year={2016}
+            }
+        ]]></citation>
+        <citation type="bibtex"><![CDATA[
+            @book{xie2015dynamic,
+            title={Dynamic Documents with R and knitr},
+            author={Xie, Yihui},
+            volume={29},
+            year={2015},
+            publisher={CRC Press}
+            }
+        ]]></citation>
+        <citation type="bibtex"><![CDATA[@article{anders2015htseq,
+  title={HTSeq—a Python framework to work with high-throughput sequencing data},
+  author={Anders, Simon and Pyl, Paul Theodor and Huber, Wolfgang},
+  journal={Bioinformatics},
+  volume={31},
+  number={2},
+  pages={166--169},
+  year={2015},
+  publisher={Oxford University Press}
+}]]></citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/htseq_count_index.Rmd	Wed Mar 07 08:45:02 2018 -0500
@@ -0,0 +1,16 @@
+---
+title: "htseq-count Report"
+output: html_document
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(echo = TRUE, error = TRUE)
+```
+
+## References
+
+Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff and Wickham, Hadley and Atkins, Aron and Hyndman, Rob (2016). rmarkdown: Dynamic Documents for R, 2016. In R package version 0.9, 6.
+
+Xie, Yihui (2015). Dynamic Documents with R and knitr, CRC Press, Vol.29.
+
+Anders, Simon and Pyl, Paul Theodor and Huber, Wolfgang (2015). HTSeq—a Python framework to work with high-throughput sequencing data. In Bioinformatics, 31 (2), pp. 166--169.
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/htseq_count_render.R	Wed Mar 07 08:45:02 2018 -0500
@@ -0,0 +1,83 @@
+##============ Sink warnings and errors to a file ==============
+## everything printed or signalled from here on is diverted into this file.
+##==============================================================
+zz = file('warnings_and_errors.txt')  # relative path: created in the current working directory
+sink(zz)  # diverts regular output
+sink(zz, type = 'message')  # diverts messages, warnings and errors as well
+
+#------------import libraries--------------------
+options(stringsAsFactors = FALSE)
+
+library(getopt)
+library(rmarkdown)
+#------------------------------------------------
+
+
+#------------get arguments into R--------------------
+# library(dplyr)
+# getopt_specification_matrix(extract_short_flags('')) %>%
+#   write.table(file = 'spec.txt', sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE)
+
+
+# getopt specification, one row per command-line option; built inside
+# local() so the helper vector does not leak into the global environment
+spec_matrix = local({
+  flags = c("e", "o", "d", "s", "t", "A", "B", "G", "f", "r", "S",
+            "a", "T", "i", "m", "c", "O")
+  cbind(
+    # long names are the short flag prefixed with "X_" (X_e, X_o, ...)
+    long_flags = paste0("X_", flags),
+    short_flags = flags,
+    # cbind() coerces the integer mask to the string "1"
+    argument_mask_flags = rep(1L, length(flags)),
+    # every value is read as a character string
+    data_type_flags = rep("character", length(flags))
+  )
+})
+opt = getopt(spec_matrix)
+#----------------------------------------------------
+
+
+#-----------export the parsed options as environment variables
+#           (X_e, X_d, ...) so the bash chunks of the .Rmd can read them---
+do.call(Sys.setenv, opt[-1])  # opt[-1] skips the first list element; presumably getopt's ARGS entry - confirm
+#----------------------------------------------------
+
+#---------- often used variables ----------------
+# OUTPUT_DIR: value of -d, the output associated directory, which stores all outputs
+# TOOL_DIR: value of -t, the tool installation directory
+# OUTPUT_REPORT: value of -o, the path of the galaxy output report
+OUTPUT_DIR = opt$X_d
+TOOL_DIR =   opt$X_t
+OUTPUT_REPORT = opt$X_o
+
+
+# create the output associated directory (and any missing parents) to store all outputs
+dir.create(OUTPUT_DIR, recursive = TRUE)
+
+#-----------------render site--------------
+# stage the site sources in OUTPUT_DIR/site_generator; the page must keep its .Rmd extension to be rendered to htseq_count.html
+dir.create(paste0(OUTPUT_DIR, '/site_generator'), recursive = TRUE)
+system(paste0('cp -r ', TOOL_DIR, '/htseq_count.Rmd ', OUTPUT_DIR, '/site_generator/htseq_count.Rmd'))
+system(paste0('cp -r ', TOOL_DIR, '/htseq_count_site.yml ', OUTPUT_DIR, '/site_generator/_site.yml'))
+system(paste0('cp -r ', TOOL_DIR, '/htseq_count_index.Rmd ', OUTPUT_DIR, '/site_generator/index.Rmd'))
+# render site to OUTPUT_DIR/_site, this is configured in the "_site.yml" file
+render_site(input = paste0(OUTPUT_DIR, '/site_generator'))
+# remove site generating materials from output associated directory
+unlink(paste0(OUTPUT_DIR, '/site_generator'), recursive = TRUE)
+# move _site/* into output associated directory (needs the shell glob, hence system())
+move_cmd = paste0('mv ', OUTPUT_DIR, '/_site/* ', OUTPUT_DIR)
+system(move_cmd)
+#------------------------------------------
+
+#-----publish the rendered home page as the report-----
+# copies OUTPUT_DIR/index.html onto the path held in OUTPUT_REPORT
+system(paste('cp', paste0(OUTPUT_DIR, '/index.html'), OUTPUT_REPORT))
+#-----------------------------------
+
+#==============the end==============
+
+##--------end of code rendering .Rmd templates----------------
+sink(type = 'message')  # restore the message stream first: sink() below closes zz, which messages would otherwise still target
+sink()
+##=========== End of sinking output=============================
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/htseq_count_site.yml	Wed Mar 07 08:45:02 2018 -0500
@@ -0,0 +1,15 @@
+name: "Analysis Report"
+output_dir: "../_site"  # the render script expects the site in OUTPUT_DIR/_site
+navbar:
+    title: ""
+    type: inverse
+    left:
+        - text: "Home"
+          icon: fa-home
+          href: index.html
+        - text: "htseq-count"  # replaces the template placeholder "TITLE 1"
+          href: htseq_count.html  # page rendered from htseq_count.Rmd
+output:
+  html_document:
+    theme: cosmo
+    highlight: textmate
\ No newline at end of file