Mercurial > repos > yhoogstrate > featurecounts_valid_gff

--- a/featurecounts_valid_gff.xml	Fri Feb 07 05:00:14 2014 -0500
+++ b/featurecounts_valid_gff.xml	Fri Feb 07 11:28:17 2014 -0500
@@ -9,59 +9,106 @@
 			The following script is written in the "Cheetah" language:
 			http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html
 		-->
-		featureCounts
-			-a $input_annotation
-			-o $output
-
-			#if $alignments[0].file.extension == "bam"
-				-b
+
+		<!-- Check 01: when the annotation is selected via the dbkey ("attribute"), all alignments must share exactly one dbkey -->
+		#if $reference_gene_sets_source.source_select == "attribute" and len({ alignment.metadata.dbkey:True for alignment in $alignments }.keys()) != 1
+			echo "Invalid number of dbkeys are found: ${ len({ alignment.metadata.dbkey:True for alignment in $alignments }.keys()) }, while only one should be used. Make sure that the alignments are done on the same reference genome and that 'tool-data/gene_sets.loc' is configured properly!" >&amp;2
+		#else
+			<!-- Check 02: are all alignments of the same type (all BAM or all SAM)? -->
+			#if len({ alignment.extension:True for alignment in $alignments }.keys()) != 1
+				echo "Either all files must be SAM or all files must be BAM, no mixture is allowed." >&amp;2
+			#else
+				featureCounts
+					-a
+					#if $reference_gene_sets_source.source_select == "indexed"
+						"$reference_gene_sets_source.reference_gene_sets"
+					#else if $reference_gene_sets_source.source_select == "history"
+						"$reference_gene_sets_source.reference_gene_sets"
+					#else
+						<!--
+							This is a workaround to obtain the GTF/GFF annotation file
+							that corresponds to the dbkey of the alignments (third field
+							of the matching row in the 'gene_sets' data table).
+							Because it is resolved at run-time, it can be used in a workflow.
+						-->
+						"${ filter( lambda x: str( x[0] ) == str( { alignment.metadata.dbkey:True for alignment in $alignments }.keys()[0] ), $__app__.tool_data_tables[ 'gene_sets' ].get_fields() )[0][2] }"
+					#end if
+
+					-o "$output"
+
+					#if { alignment.extension:True for alignment in $alignments }.keys()[0] == "bam"
+						-b
+					#end if
+
+					#if $extended_parameters.parameters == "extended"
+						-t $extended_parameters.gff_feature_type
+						-g $extended_parameters.gff_feature_attribute
+						$extended_parameters.summarization_level
+						$extended_parameters.contribute_to_multiple_features
+						$extended_parameters.protocol
+						-Q $extended_parameters.mapping_quality
+						-T $extended_parameters.threads
+						$extended_parameters.fragment_counting
+						$extended_parameters.check_distance
+						-d $extended_parameters.minimum_fragment_length
+						-D $extended_parameters.maximum_fragment_length
+						$extended_parameters.only_both_ends
+						$extended_parameters.exclude_chimerics
+						$extended_parameters.namesort
+					#end if
+
+					#for $alignment in $alignments
+						"${alignment}"
+					#end for
+
+					2>&amp;1
+
+				<!-- #if $format == "complex" or $format.value == "complex" -->
+				<!-- ; mv tmp.txt $output -->
+				#if $format == "tabdel_default" or $format.value == "tabdel_default"
+					; cp "$output" tmp.txt
+					; egrep -v "^#" tmp.txt > tmp2.txt
+					; cut -f 1,7 tmp2.txt > tmp_left.txt
+					; cut -f 6 tmp2.txt > tmp_right.txt
+					; paste tmp_left.txt tmp_right.txt > "$output"
+					<!-- ; rm tmp.txt tmp2.txt tmp_left.txt tmp_right.txt -->
+				#elif $format == "tabdel_short" or $format.value == "tabdel_short"
+					; cp "$output" tmp.txt
+					; egrep -v "^#" tmp.txt | cut -f 1,7 > "$output"
+					<!-- ; rm tmp.txt -->
+				#end if
 			#end if
-
-			#if $extended_parameters.parameters == "extended"
-				-t $extended_parameters.gff_feature_type
-				-g $extended_parameters.gff_feature_attribute
-				$extended_parameters.summarization_level
-				$extended_parameters.contribute_to_multiple_features
-				$extended_parameters.protocol
-				-Q $extended_parameters.mapping_quality
-				-T $extended_parameters.threads
-				$extended_parameters.fragment_counting
-				$extended_parameters.check_distance
-				-d $extended_parameters.minimum_fragment_length
-				-D $extended_parameters.maximum_fragment_length
-				$extended_parameters.only_both_ends
-				$extended_parameters.exclude_chimerics
-				$extended_parameters.namesort
-			#end if
-
-			#for $alignment in $alignments
-				${alignment.file}
-			#end for
-
-			2>&amp;1
-
-		<!-- #if $format == "complex" or $format.value == "complex" -->
-		<!-- ; mv tmp.txt $output -->
-		#if $format == "tabdel_default" or $format.value == "tabdel_default"
-			; cp $output tmp.txt
-			; egrep -v "^#" tmp.txt > tmp2.txt
-			; cut -f 1,7 tmp2.txt > tmp_left.txt
-			; cut -f 6 tmp2.txt > tmp_right.txt
-			; paste tmp_left.txt tmp_right.txt > $output
-			<!-- ; rm tmp.txt tmp2.txt tmp_left.txt tmp_right.txt -->
-		#elif $format == "tabdel_short" or $format.value == "tabdel_short"
-			; cp $output tmp.txt
-			; egrep -v "^#" tmp.txt | cut -f 1,7 > $output
-			<!-- ; rm tmp.txt -->
 		#end if
 	</command>

 	<inputs>
-		<param name="input_annotation" type="data" format="gff" label="Annotation file" help="The annotation file. The program assumes that the provided annotation file is in GTF format." />
+		<param name="alignments" type="data" format="bam,sam" label="Alignment file" help="The input alignment file(s) where the gene expression has to be counted. The file can have a SAM or BAM format; but ALL files in the series must be in THE SAME format." multiple="true" />

-		<repeat name="alignments" title="Alignment file" default="1" min="1">
-			<param name="file" type="data" format="bam,sam" label="Alignment file" help="The input alignment file(s) where the gene expression has to be counted. The file can have a SAM or BAM format; but ALL files in the series must be in THE SAME format." />
-		</repeat>
+		<!-- Find out how to access the GTF/GFF file(s) -->
+		<conditional name="reference_gene_sets_source">
+			<param name="source_select" type="select" label="Gene annotation (GTF/GFF) source">
+				<option value="indexed">Use a built-in index</option>
+				<option value="history">Use reference from the history</option>
+				<option value="attribute">Use a built-in index based on the 'metadata.dbkey' attribute of the input; select this if you design a workflow</option>
+			</param>
+			<when value="indexed">
+				<param name="reference_gene_sets" type="select" label="Gene annotation (GTF/GFF) for the reference genome used during alignment" >
+					<options from_file="gene_sets.loc">
+						<column name="name"  index="0"/>
+						<column name="dbkey" index="1"/>
+						<column name="value" index="2"/>
+						<filter type="data_meta" ref="alignments" multiple="false" key="dbkey" column="1" />
+						<validator type="no_options" message="No indexes are available for the selected input dataset" />
+					</options>
+				</param>
+			</when>
+			<when value="history">
+				<param name="reference_gene_sets" format="gff" type="data" label="Gene annotation file" help="The program assumes that the provided annotation file is in GTF format. Make sure that the gene annotation file corresponds to the same reference genome as used for the alignment." />
+			</when>
+			<when value="attribute">
+				<!-- No parameter here: the GTF/GFF file is looked up at runtime from the dbkey of the alignments -->
+			</when>
+		</conditional>

 		<param name="format" type="select" label="Output format">
 			<option value="complex">featureCounts 1.4.0+ default (extensive; complex)</option>
@@ -114,13 +161,14 @@
 	</inputs>

 	<outputs>
-		<data format="tabular" name="output" label="${tool.name} on ${alignments[0].file.hid}: ${alignments[0].file.name}" />
+		<data format="tabular" name="output" label="${tool.name} on ${', '.join([ str(a.hid)+': '+a.name for a in $alignments ])}" />
 	</outputs>

 	<help>
 featureCounts-valid-gff::

-**featureCounts Overview**
+**Overview**
+
 FeatureCounts is a light-weight read counting program written entirely in the C programming language. It can be used to count both gDNA-seq and RNA-seq reads for genomic features in in SAM/BAM files.
 It has a variety of advanced parameters but its major strength is its outstanding performance: analysis of a 10GB BAM file takes about 7 minutes on a single average CPU (Homo Sapiens genome)!
 Liao Y, Smyth GK and Shi W. featureCounts: an efficient general-purpose program for assigning sequence reads to genomic features. Bioinformatics, Advance Access, accepted on Nov 7, 2013
@@ -152,6 +200,8 @@
 Make sure you have proper GFF/GTF files (corresponding to your reference genome used for the aligment) uploaded to your history.
 The source of this file should not be important since this fork can handle both ENSEMBL and UCSC variants of the GTF/GFF format.

+**Examples**
+
 **License**

 * featureCounts / subread: GNU General Public License version 3.0 (GPLv3)
@@ -164,5 +214,6 @@

 More tools by the Translational Research IT (TraIT) project can be found in the following repository:
 http://toolshed.nbic.nl/
+
 </help>
-</tool>
+</tool>
\ No newline at end of file