Mercurial > repos > lparsons > htseq_count

diff htseq-count.xml @ 5:0a835934d792
Version 0.3
author: lparsons
date: Tue, 05 Mar 2013 12:26:28 -0500
parents: 359d40333595
children: 08a11d1eaec6
--- a/htseq-count.xml	Tue Sep 04 13:34:05 2012 -0400
+++ b/htseq-count.xml	Tue Mar 05 12:26:28 2013 -0500
@@ -1,9 +1,11 @@
-<tool id="htseq_count" name="htseq-count" version="0.1">
+<tool id="htseq_count" name="htseq-count" version="0.3">
     <description> - Count aligned reads in a BAM file that overlap features in a GFF file</description>
     <version_command>htseq-count -h | grep version | sed 's/^\(.*\)*\(version .*\)\./\2/'</version_command>
     <requirements>
+        <requirement type="package" version="1.6.2">numpy</requirement>
         <requirement type="package" version="0.5.3p9">htseq</requirement>
         <requirement type="package" version="0.1.18">samtools</requirement>
+        <requirement type="package" version="1.56.0">picard</requirement> 
     </requirements>
     <command>
     ##set up input files
@@ -16,32 +18,46 @@
             #set $reference_fasta_filename = str( $samout_conditional.reference_source.ref_file.fields.path )
         #end if
     #end if
-
-    #if $samfile.extension == "bam":
-        samtools view $samfile | 
+    #if str($singlepaired) == "paired":
+        ln -s $samfile local_input.sam &amp;&amp;
+        java -Xmx2G -jar "\$JAVA_JAR_PATH/SortSam.jar" VALIDATION_STRINGENCY=LENIENT SORT_ORDER=queryname O=prepared_input.sam I=local_input.sam TMP_DIR="${__new_file_path__}" 
+        || echo "Error running Picard SortSam" &gt;&amp;2 &amp;&amp;
+    #else:
+        #if $samfile.extension == "bam":
+            samtools view $samfile | 
+        #else
+            ln -s $samfile prepared_input.sam &amp;&amp;
+        #end if
     #end if
     htseq-count 
     --mode=$mode 
     --stranded=$stranded 
     --minaqual=$minaqual 
-    --type=$type 
+    --type=$featuretype 
     --idattr=$idattr 
     #if $samout_conditional.samout:
         --samout=$__new_file_path__/${samoutfile.id}_tmp
     #end if
-    #if $samfile.extension == "bam":
-        - 
-    #else
-        $samfile 
-    #end if
+    #if str($singlepaired) == "paired":
+        prepared_input.sam
+    #else:
+        #if $samfile.extension == "bam":
+            - 
+        #else:
+            prepared_input.sam
+        #end if
+    #end if    
     $gfffile 
-    &gt; $counts
+    | awk '{if ($1 ~ "no_feature|ambiguous|too_low_aQual|not_aligned|alignment_not_unique") print $0 | "cat 1>&amp;2"; else print $0}' &gt; $counts 2&gt;$othercounts
     #if $samout_conditional.samout:
         &amp;&amp; samtools view -Su -t ${reference_fasta_filename}.fai $__new_file_path__/${samoutfile.id}_tmp | samtools sort -o - sorted > $samoutfile
     #end if</command>
     <inputs>
-        <param format="sam, bam" name="samfile" type="data" label="Aligned SAM File">
-            <help>Paired-End data must be sorted by QUERY NAME, use Picard Read Mate Fixer and Query name sort order before using this tool on paired data</help>
+        <param format="sam, bam" name="samfile" type="data" label="Aligned SAM/BAM File"/>
+        <param name="singlepaired" type="select" label="Is this library mate-paired?">
+            <help>Paired libraries will be sorted by read name prior to counting.</help>
+            <option value="single" selected="true">single-end</option>
+            <option value="paired">paired-end</option>
         </param>
         <param format="gff" name="gfffile" type="data" label="GFF File"/>
         <param name="mode" type="select" label="Mode">
@@ -59,11 +75,11 @@
         <param name="minaqual" type="integer" value="0" label="Minimum alignment quality">
             <help>Skip all reads with alignment quality lower than the given minimum value</help>
         </param>
-        <param name="type" type="text" value="exon" label="Feature type">
+        <param name="featuretype" type="text" value="exon" label="Feature type">
             <help>Feature type (3rd column in GFF file) to be used. All features of other types are ignored. The default, suitable for RNA-Seq and Ensembl GTF files, is exon.</help>
         </param>
         <param name="idattr" type="text" value="gene_id" label="ID Attribute">
-            <help>GFF attribute to be used as feature ID. Several GFF lines with the same feature ID will be considered as parts of the same feature. The feature ID is used to identity the counts in the output table. The default, suitable for RNA-SEq and Ensembl GTF files, is gene_id.</help>
+            <help>GFF attribute to be used as feature ID. Several GFF lines with the same feature ID will be considered as parts of the same feature. The feature ID is used to identify the counts in the output table. All features of the specified type MUST have a value for this attribute. The default, suitable for RNA-Seq and Ensembl GTF files, is gene_id.</help>
         </param>
         <conditional name="samout_conditional">
             <param name="samout" type="boolean" value="False" truevalue="True" falsevalue="False" label="Additional BAM Output">
@@ -78,7 +94,7 @@
                     <when value="cached">
                         <param name="ref_file" type="select" label="Using reference genome">
                             <options from_data_table="sam_fa_indexes">
-                                <filter type="data_meta" key="dbkey" ref="samfile" column="3"/>
+                                <filter type="data_meta" key="dbkey" ref="samfile" column="1"/>
                             </options>
                             <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
                         </param>
@@ -92,14 +108,21 @@
     </inputs>
 
     <outputs>
-        <data format="tabular" name="counts" label="${tool.name} on ${on_string}"/>
-        <data format="bam" name="samoutfile" label="${tool.name} on ${on_string} (BAM)">
+        <data format="tabular" name="counts" metadata_source="samfile" label="${tool.name} on ${on_string}"/>
+        <data format="tabular" name="othercounts" metadata_source="samfile" label="${tool.name} on ${on_string} (no feature)"/>
+        <data format="bam" name="samoutfile" metadata_source="samfile" label="${tool.name} on ${on_string} (BAM)">
             <filter>samout_conditional['samout']</filter>
         </data>
     </outputs>
 
     <stdio>
         <exit_code range="1:" level="fatal" description="Unknown error occurred" />
+        <regex match="htseq-count: command not found" source="stderr" level="fatal" description="The HTSeq python package is not properly installed, contact Galaxy administrators" />
+        <regex match="samtools: command not found" source="stderr" level="fatal" description="The samtools package is not properly installed, contact Galaxy administrators" />
+        <regex match="Error: Feature (.+) does not contain a '(.+)' attribute" source="both" level="fatal" description="Error parsing the GFF file, at least one feature of the specified 'Feature type' does not have a value for the specified 'ID Attribute'" />
+        <regex match="Error occured in line (\d+) of file" source="stderr" level="fatal" description="Unknown error parsing the GFF file" />
+        <regex match="Error" source="stderr" level="fatal" description="Unknown error occurred" />
+        <regex match="Warning: Read (.+) claims to have an aligned mate which could not be found. \(Is the SAM file properly sorted\?\)" source="stderr" level="warning" description="PAIRED DATA MISSING OR NOT PROPERLY SORTED. Try rerunning and selecting the paired-end option. See stderr output of this dataset for more information." />
     </stdio>
 
     <tests>
@@ -108,12 +131,22 @@
             <param name="gfffile" value="htseq-test.gff" />
             <param name="samout" value="False" />
             <output name="counts" file="htseq-test_counts.tsv" />
+            <output name="othercounts" file="htseq-test_othercounts.tsv" />
         </test>
         <test>
             <param name="samfile" value="htseq-test.bam" />
             <param name="gfffile" value="htseq-test.gff" />
             <param name="samout" value="False" />
             <output name="counts" file="htseq-test_counts.tsv" />
+            <output name="othercounts" file="htseq-test_othercounts.tsv" />
+        </test>
+        <test>
+            <param name="samfile" value="htseq-test-paired.bam" />
+            <param name="singlepaired" value="paired" />
+            <param name="gfffile" value="htseq-test.gff" />
+            <param name="samout" value="False" />
+            <output name="counts" file="htseq-test-paired_counts.tsv" />
+            <output name="othercounts" file="htseq-test-paired_othercounts.tsv" />
         </test>
         <!-- Seems to be an issue setting the $reference_fasta_filename variable during test
         <test>
@@ -123,6 +156,7 @@
             <param name="reference_source_selector" value="history" />
             <param name="ref_file" value="htseq-test_reference.fasta" />
             <output name="counts" file="htseq-test_counts.tsv" />
+            <output name="othercounts" file="htseq-test_othercounts.tsv" />
             <output name="samoutfile" file="htseq-test_samout.bam" />
         </test>
         -->
author	lparsons
date	Tue, 05 Mar 2013 12:26:28 -0500
parents	359d40333595
children	08a11d1eaec6