Mercurial > repos > mingchen0919 > aurora_htseq

--- a/htseq_count.Rmd	Tue Mar 06 17:38:58 2018 -0500
+++ b/htseq_count.Rmd	Wed Mar 07 00:12:55 2018 -0500
@@ -11,15 +11,47 @@
 ```


-# Code for computational analysis
+## Run htseq-count
+
+```{bash}
+cd ${X_d}

-```{r 'step 1'}
+cat >htseq-count.sh <<EOF
+  htseq-count \\
+    $(echo ${X_A} | sed 's/,/ /g') \\
+    ${X_G} \\
+    -f ${X_f} \\
+    -r ${X_r} \\
+    -s ${X_S} \\
+    -a ${X_a} \\
+    -t ${X_T} \\
+    -i ${X_i} \\
+    -m ${X_m} > counts.txt
+
+  grep -v '__no_feature\|__ambiguous\|__too_low_aQual\|__not_aligned\|__alignment_not_unique' counts.txt > ${X_c}
+EOF
+```
+
+
+```{r}
+# display htseq-count job script
+htseq_count_sh = paste0(opt$X_d, '/htseq-count.sh')
+tags$code(tags$pre(readChar(htseq_count_sh, file.info(htseq_count_sh)$size )))
 opt
 ```

+```{bash}
+cd ${X_d}
+sh htseq-count.sh
+```

-```{r 'ste[ 2'}
+## Counts

+```{r}
+# Load the table that htseq-count.sh wrote to counts.txt in the working directory;
+# the first column (feature IDs) becomes the row names.
+count_data = read.table(file.path(opt$X_d, 'counts.txt'), row.names = 1)
+# User-supplied labels: comma separated, surrounding whitespace ignored.
+sample_names = trimws(strsplit(opt$X_B, ',')[[1]])
+# Labels are still recycled/truncated to fit the table, but a mismatch is now
+# reported instead of silently producing duplicated or dropped sample names.
+if (length(sample_names) != ncol(count_data)) {
+  warning('Got ', length(sample_names), ' sample name(s) for ', ncol(count_data),
+          ' count column(s); names were recycled or truncated to fit.', call. = FALSE)
+}
+# `length.out` is spelled out rather than relying on partial argument matching.
+colnames(count_data) = rep(sample_names, length.out = ncol(count_data))
+DT::datatable(count_data, caption = "htseq counts for each sample")
 ```
--- a/htseq_count.sh	Tue Mar 06 17:38:58 2018 -0500
+++ b/htseq_count.sh	Wed Mar 07 00:12:55 2018 -0500
@@ -1,4 +1,4 @@
-Rscript '${__tool_directory__}/aurora_htseq_count.R'
+Rscript '${__tool_directory__}/htseq_count_render.R'

 			-e $echo
 			-o $report
@@ -6,11 +6,14 @@
 			-s $sink_message
 			-t '${__tool_directory__}'

+			-A '$alignment_files'
+			-B '$sample_names'
+			-G $gff
 			-f $format
 			-r $order
 			-S $stranded
 			-a $minaqual
 			-T $feature_type
 			-i $idattr
-			-I $additional_attr
-			-m $mode
\ No newline at end of file
+			-m $mode
+			-c $count
\ No newline at end of file
--- a/htseq_count.xml	Tue Mar 06 17:38:58 2018 -0500
+++ b/htseq_count.xml	Wed Mar 07 00:12:55 2018 -0500
@@ -10,7 +10,7 @@
         <regex match="XXX" source="stderr" level="warning"
                description="Check the warnings_and_errors.txt file for more details."/>
     </stdio>
-    <command><![CDATA[Rscript '${__tool_directory__}/aurora_htseq_count.R'
+    <command><![CDATA[Rscript '${__tool_directory__}/htseq_count_render.R'

 			-e $echo
 			-o $report
@@ -18,23 +18,30 @@
 			-s $sink_message
 			-t '${__tool_directory__}'

+			-A '$alignment_files'
+      -B '$sample_names'
+			-G $gff
 			-f $format
 			-r $order
 			-S $stranded
 			-a $minaqual
 			-T $feature_type
 			-i $idattr
-			-I $additional_attr
-			-m $mode]]></command>
+			-m $mode
+			-c $count]]></command>
     <inputs>
         <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false"
                label="Display analysis code in report?"/>
-        <param type="data" name="alignment_file" label="Alignment file (SAM/BAM)" optional="False" format="sam,bam"/>
-        <param type="data" name="gff" label="GFF file" optional="False" format="gff, gff3"/>
+        <param type="data" name="alignment_files" label="Alignment file (SAM/BAM)" optional="False" format="sam,bam"
+               multiple="True"/>
+        <param type="text" name="sample_names" label="Sample names"
+               help="sample names for each input SAM/BAM file. Use comma to separate multiple names, for example, sample_1, sample_2, sample_3. The number of sample names should be the same as the number of input SAM/BAM files."
+               optional="False" size="5x25"/>
         <param type="select" name="format" argument="-f" label="Input data format" optional="False" multiple="False">
             <option value="sam" selected="false">SAM</option>
             <option value="bam" selected="false">BAM</option>
         </param>
+        <param type="data" name="gff" label="GFF file" optional="False" format="gff, gff3"/>
         <param type="select" name="order" argument="-r" label="Order"
                help="Use this option to indicate how the input data has been sorted. The default is name.&#13;&#10;&#13;&#10;"
                optional="False" multiple="False">
@@ -55,9 +62,6 @@
         <param type="text" name="idattr" argument="-i" label="ID attribute"
                help="GFF attribute to be used as feature ID. Several GFF lines with the same feature ID will be considered as parts of the same feature. The feature ID is used to identity the counts in the output table. The default, suitable for RNA-Seq analysis using an Ensembl GTF file, is gene_id."
                optional="False" value="gene_id"/>
-        <param type="text" name="additional_attr" argument="--additional-attr" label="Additional attributes"
-               help="Additional feature attributes, which will be printed as an additional column after the primary attribute column but before the counts column(s). The default is none, a suitable value to get gene names using an Ensembl GTF file is gene_name."
-               optional="True"/>
         <param type="select" name="mode" argument="-m" label="Mode"
                help="Mode to handle reads overlapping more than one feature. Possible values for &lt;mode&gt; are union, intersection-strict and intersection-nonempty (default: union)"
                optional="False" multiple="False">
@@ -67,7 +71,7 @@
         </param>
     </inputs>
     <outputs>
-        <data name="Aurora htseq" format="html" label="Aurora htseq on ${on_string}" hidden="false"/>
+        <data name="report" format="html" label="Aurora htseq on ${on_string}" hidden="false"/>
         <data format="txt" name="sink_message" label="Warnings and Errors" from_work_dir="warnings_and_errors.txt"/>
         <data name="count" format="txt" label="Aurora htseq-count on ${on_string}" hidden="false"/>
     </outputs>
--- a/htseq_count_render.R	Tue Mar 06 17:38:58 2018 -0500
+++ b/htseq_count_render.R	Wed Mar 07 00:12:55 2018 -0500
@@ -10,6 +10,7 @@

 library(getopt)
 library(rmarkdown)
+library(htmltools)
 #------------------------------------------------


@@ -21,15 +22,16 @@

 spec_matrix = as.matrix(
   data.frame(stringsAsFactors=FALSE,
-              long_flags = c("X_e", "X_o", "X_d", "X_s", "X_t", "X_f", "X_r",
-                             "X_S", "X_a", "X_T", "X_i", "X_I", "X_m"),
-             short_flags = c("e", "o", "d", "s", "t", "f", "r", "S", "a", "T",
-                             "i", "I", "m"),
-     argument_mask_flags = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
+              long_flags = c("X_e", "X_o", "X_d", "X_s", "X_t", "X_A", "X_B", "X_G",
+                             "X_f", "X_r", "X_S", "X_a", "X_T", "X_i", "X_m", "X_c"),
+             short_flags = c("e", "o", "d", "s", "t", "A", "B", "G", "f", "r", "S",
+                             "a", "T", "i", "m", "c"),
+     argument_mask_flags = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
+                             1L, 1L, 1L),
          data_type_flags = c("character", "character", "character", "character",
-                             "character", "character", "character",
+                             "character", "character", "character", "character",
                              "character", "character", "character", "character",
-                             "character", "character")
+                             "character", "character", "character", "character")
   )
 )
 opt = getopt(spec_matrix)
--- a/spec.txt	Tue Mar 06 17:38:58 2018 -0500
+++ b/spec.txt	Wed Mar 07 00:12:55 2018 -0500
@@ -4,11 +4,14 @@
 X_d,d,1,character
 X_s,s,1,character
 X_t,t,1,character
+X_A,A,1,character
+X_B,B,1,character
+X_G,G,1,character
 X_f,f,1,character
 X_r,r,1,character
 X_S,S,1,character
 X_a,a,1,character
 X_T,T,1,character
 X_i,i,1,character
-X_I,I,1,character
 X_m,m,1,character
+X_c,c,1,character