Mercurial > repos > dpryan79 > featurecounts_test

--- a/.shed.yml	Wed Nov 09 16:29:18 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-categories:
-- RNA
-- Transcriptomics
-- Sequence Analysis
-description: featureCounts counts the number of reads aligned to defined masked regions in a reference genome
-long_description: |
-  Counts reads aligned to annotated genes in a reference genome from SAM or BAM files.
-name: featurecounts
-owner: iuc
-homepage_url: http://bioinf.wehi.edu.au/featureCounts
-remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/featurecounts
-type: unrestricted
--- a/README.rst	Wed Nov 09 16:29:18 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,25 +0,0 @@
-FeatureCounts wrapper for Galaxy
-================================
-
-* http://bioinf.wehi.edu.au/featureCounts/
-* http://subread.sourceforge.net/
-
-FeatureCounts as part of the SUBREAD package is "a highly efficient and
-accurate read summarization program".
-
-Installation
-------------
-
-This wrapper requires Galaxy 16.04 to be fully functional because
-of the following commits:
-
-* https://github.com/galaxyproject/galaxy/pull/961
-* https://github.com/galaxyproject/galaxy/pull/1714
-
-License
--------
-
-**featureCounts**:
-
-GPL (>=3)
-
--- a/featurecounts.xml	Wed Nov 09 16:29:18 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,469 +0,0 @@
-<tool id="featurecounts" name="featureCounts" version="1.4.6.p5" profile="16.04">
-    <description>Measure gene expression in RNA-Seq experiments from SAM or BAM files.</description>
-    <requirements>
-        <requirement type="package" version="1.4.6p5">subread</requirement>
-    </requirements>
-
-    <version_command>featureCounts -v 2&gt;&amp;1 | grep .</version_command>
-    <command><![CDATA[
-        ## Run featureCounts on the input alignment (SAM or BAM)
-        featureCounts
-            #if $gtf_source.ref_source=="history":
-                -a "$gtf_source.reference_gene_sets"
-            #else:
-                -a "$gtf_source.reference_gene_sets_builtin.fields.path"
-            #end if
-
-            -o "output"
-            -T \${GALAXY_SLOTS:-2}
-
-            -t "$extended_parameters.gff_feature_type"
-            -g "$extended_parameters.gff_feature_attribute"
-                $extended_parameters.summarization_level
-                $extended_parameters.contribute_to_multiple_features
-            -s  $extended_parameters.strand_specificity
-                $extended_parameters.multimapping_enabled.multimapping_counts
-
-                #if str($extended_parameters.multimapping_enabled.multimapping_counts) == " -M"
-                    $extended_parameters.multimapping_enabled.fraction
-                #end if
-
-            -Q  $extended_parameters.mapping_quality
-                $extended_parameters.largest_overlap
-            --minOverlap  $extended_parameters.min_overlap
-                $extended_parameters.read_reduction
-                $extended_parameters.primary
-                $extended_parameters.ignore_dup
-
-                #if str($extended_parameters.read_extension_5p) != "0"
-                    --readExtension5 $extended_parameters.read_extension_5p
-                #end if
-
-                #if str($extended_parameters.read_extension_3p) != "0"
-                    --readExtension3 $extended_parameters.read_extension_3p
-                #end if
-
-                $pe_parameters.fragment_counting_enabled.fragment_counting
-                #if str($pe_parameters.fragment_counting_enabled.fragment_counting) == " -p"
-                    $pe_parameters.fragment_counting_enabled.check_distance_enabled.check_distance
-                    #if str($pe_parameters.fragment_counting_enabled.check_distance_enabled.check_distance) == " -P"
-                        -d $pe_parameters.fragment_counting_enabled.check_distance_enabled.minimum_fragment_length
-                        -D $pe_parameters.fragment_counting_enabled.check_distance_enabled.maximum_fragment_length
-                    #end if
-                #end if
-
-                $pe_parameters.only_both_ends
-            -S  $pe_parameters.orientation
-                $pe_parameters.exclude_chimerics
-
-        "${alignment}"
-
-        ## Removal of comment and column-header line
-        && grep -v "^#" "output" | tail -n+2 > body.txt
-
-        ## Set the right columns for the tabular formats
-        #if $format.value == "tabdel_medium"
-            && cut -f 1,7 body.txt > expression_matrix.txt
-
-            ## cut cannot reorder columns: -f 1,7,6 still returns the columns in file order (1,6,7)
-            ## Thus the gene length column (last column) has to be added separately
-            && cut -f 6 body.txt > gene_lengths.txt
-            && paste expression_matrix.txt gene_lengths.txt > expression_matrix.txt.bak
-            && mv -f expression_matrix.txt.bak "${output_medium}"
-        #elif $format.value == "tabdel_short"
-            && cut -f 1,7 body.txt > "${output_short}"
-        #else
-            && cp body.txt "${output_full}"
-        #end if
-
-
-        #if str($include_feature_length_file) == "true"
-            && cut -f 1,6 body.txt > "${output_feature_lengths}"
-        #end if
-
-        && tail -n+2 "output.summary" > "${output_summary}"
-
-    ]]></command>
-    <inputs>
-        <param name="alignment"
-               type="data"
-               multiple="false"
-               format="bam,sam"
-               label="Alignment file"
-               help="The input alignment file(s) where the gene expression has to be counted. The file can have a SAM or BAM format; but ALL files must be in the same format" />
-
-        <conditional name="gtf_source">
-            <param name="ref_source" type="select" label="Gene annotation file">
-                <option value="cached">locally cached</option>
-                <option value="history">in your history</option>
-            </param>
-            <when value="cached">
-                <param name="reference_gene_sets_builtin" type="select" label="Using locally cached annotation" help="If the annotation file you require is not listed here, please contact the Galaxy administrator">
-                    <options from_data_table="gene_sets">
-                        <filter type="sort_by" column="1" />
-                        <validator type="no_options" message="No annotations are available." />
-                    </options>
-                </param>
-            </when>
-            <when value="history">
-                <param name="reference_gene_sets"
-                       format="gff,gtf,gff3"
-                       type="data"
-                       label="Gene annotation file"
-                       help="The program assumes that the provided annotation file is in GTF format. Make sure that the gene annotation file corresponds to the same reference genome as used for the alignment" />
-            </when>
-        </conditional>
-
-        <param name="format"
-               type="select"
-               label="Output format"
-               help="The output format will be tabular, select the preferred columns here">
-            <option value="tabdel_short" selected="true">Gene-ID "\t" read-count (DESeq2 IUC wrapper compatible)</option>
-            <option value="tabdel_medium">Gene-ID "\t" read-count "\t" gene-length</option>
-            <option value="tabdel_full">featureCounts 1.4.0+ default (includes regions provided by the GTF file)</option>
-        </param>
-
-        <param name="include_feature_length_file"
-               type="boolean"
-               truevalue="true"
-               falsevalue="false"
-               checked="false"
-               label="Create gene-length file"
-               help="Creates a tabular file that contains the effective (nucleotides used for counting reads) length of the feature; might be useful for estimating FPKM/RPKM" />
-
-
-        <section name="pe_parameters" title="Options for paired-end reads">
-            <conditional name="fragment_counting_enabled">
-
-                <param name="fragment_counting"
-                       type="select"
-                       argument="-p"
-                       checked="true"
-                       label="Count fragments instead of reads"
-                       help="If specified, fragments (or templates) will be counted instead of reads.">
-                    <option value="" selected="true">Disabled; all reads/mates will be counted individually</option>
-                    <option value=" -p">Enabled; fragments (or templates) will be counted instead of reads</option>
-                </param>
-
-                <when value=" -p">
-                    <conditional name="check_distance_enabled">
-                        <param name="check_distance"
-                            type="boolean"
-                            truevalue=" -P"
-                            falsevalue=""
-                            argument="-P"
-                            label="Check paired-end distance"
-                            help="If specified, paired-end distance will be checked when assigning fragments to meta-features or features. This option is only applicable when -p (Count fragments instead of reads) is specified. The distance thresholds should be specified using -d and -D (minimum and maximum fragment/template length) options." />
-                        <when value=" -P">
-                            <param name="minimum_fragment_length"
-                                   type="integer"
-                                   value="50"
-                                   argument="-d"
-                                   label="Minimum fragment/template length." />
-                            <param name="maximum_fragment_length"
-                                   type="integer"
-                                   value="600"
-                                   argument="-D"
-                                   label="Maximum fragment/template length." />
-                        </when>
-                        <when value="" />
-                    </conditional>
-                </when>
-                <when value="" />
-            </conditional>
-
-            <param name="only_both_ends"
-                   type="boolean"
-                   truevalue=" -B"
-                   falsevalue=""
-                   argument="-B"
-                   label="Only allow fragments with both reads aligned"
-                   help="If specified, only fragments that have both ends successfully aligned will be considered for summarization. This option is only applicable for paired-end reads." />
-
-            <param name="orientation"
-                   type="select"
-                   label="Orientation of the two reads from the same pair"
-                   argument="-S"
-                   help="Default is 'fr'">
-                <option value="fr" selected="true">Forward, Reverse (fr)</option>
-                <option value="ff">Forward, Forward (ff)</option>
-                <option value="rf">Reverse, Forward (rf)</option>
-            </param>
-
-            <param name="exclude_chimerics"
-                type="boolean"
-                truevalue=" -C"
-                falsevalue=""
-                argument="-C"
-                checked="true"
-                label="Exclude chimeric fragments"
-                help="If specified, the chimeric fragments (those fragments that have their two ends aligned to different chromosomes) will NOT be included for summarization. This option is only applicable for paired-end read data." />
-        </section>
-
-        <section name="extended_parameters" title="Advanced options">
-            <param name="gff_feature_type"
-                type="text"
-                value="exon"
-                argument="-t"
-                label="GFF feature type filter"
-                help="Specify the feature type. Only rows which have the matching feature type in the provided GTF annotation file will be included for read counting. `exon' by default." />
-
-            <param name="gff_feature_attribute"
-                type="text"
-                value="gene_id"
-                argument="-g"
-                label="GFF gene identifier"
-                help="Specify the attribute type used to group features (eg. exons) into meta-features (eg. genes), when GTF annotation is provided. `gene_id' by default. This attribute type is usually the gene identifier. This argument is useful for the meta-feature level summarization." />
-
-            <param name="summarization_level"
-                type="boolean"
-                truevalue=" -f"
-                falsevalue=""
-                argument="-f"
-                label="On feature level"
-                help="If specified, read summarization will be performed at the feature level. By default (-f is not specified), the read summarization is performed at the meta-feature level." />
-
-            <param name ="contribute_to_multiple_features"
-                type="boolean"
-                truevalue=" -O"
-                falsevalue=""
-                argument="-O"
-                label="Allow read to contribute to multiple features"
-                help="If specified, reads (or fragments if -p is specified) will be allowed to be assigned to more than one matched meta-feature (or matched feature if -f is specified)" />
-
-            <param name="strand_specificity"
-                   type="select"
-                   label="Strand specificity of the protocol"
-                   argument="-s"
-                   help="Indicate if strand-specific read counting should be performed.">
-                <option value="0" selected="true">Unstranded</option>
-                <option value="1">Stranded (forwards)</option>
-                <option value="2">Stranded (reverse)</option>
-            </param>
-
-            <conditional name="multimapping_enabled">
-                <param name="multimapping_counts"
-                       type="select"
-                       argument="-M"
-                       label="Count multi-mapping reads/fragments"
-                       help="If specified, multi-mapping reads/fragments will be counted (ie. a multi-mapping read will be counted up to N times if it has N reported mapping locations). The program uses the `NH' tag to find multi-mapping reads.">
-                    <option value="" selected="true">Disabled; multi-mapping reads are excluded (default)</option>
-                    <option value=" -M">Enabled; multi-mapping reads are included</option>
-                </param>
-                <when value=" -M">
-                    <param name="fraction"
-                           type="boolean"
-                           truevalue="--fraction"
-                           falsevalue=""
-                           argument="--fraction"
-                           label="Assign fractions to multimapping reads"
-                           help="If specified, a fractional count 1/n will be generated for each multi-mapping read, where n is the number of alignments (indicated by 'NH' tag) reported for the read. This option must be used together with the '-M' option." />
-                </when>
-                <when value="" />
-            </conditional>
-
-            <param name="mapping_quality"
-                   type="integer"
-                   value="12"
-                   argument="-Q"
-                   label="Minimum mapping quality per read"
-                   help="The minimum mapping quality score a read must satisfy in order to be counted. For paired-end reads, at least one end should satisfy this criterion. 12 by default." />
-
-            <param name="largest_overlap"
-                   type="boolean"
-                   truevalue=" --largestOverlap"
-                   falsevalue=""
-                   argument="--largestOverlap"
-                   label="Largest overlap"
-                   help="If specified, reads (or fragments) will be assigned to the target that has the largest number of overlapping bases" />
-
-            <param name="min_overlap"
-                   type="integer"
-                   value="1"
-                   argument="--minOverlap"
-                   label="Minimum overlap"
-                   help="Specify the minimum required number of overlapping bases between a read (or a fragment) and a feature. 1 by default. If a negative value is provided, the read will be extended from both ends." />
-
-            <param name="read_extension_5p"
-                   type="integer"
-                   value="0"
-                   argument="--readExtension5"
-                   label="Read 5' extension"
-                   help="Reads are extended upstream by ... bases from their 5' end" />
-
-            <param name="read_extension_3p"
-                   type="integer"
-                   value="0"
-                   argument="--readExtension3"
-                   label="Read 3' extension"
-                   help="Reads are extended downstream by ... bases from their 3' end" />
-
-            <param name="read_reduction"
-                   type="select"
-                   label="Reduce read to single position"
-                   argument="--read2pos"
-                   help="The read is reduced to its 5' most base or 3' most base. Read summarization is then performed based on the single base which the read is reduced to.">
-                <option value="" selected="true">Leave the read as it is</option>
-                <option value="--read2pos 5">Reduce it to the 5' end</option>
-                <option value="--read2pos 3">Reduce it to the 3' end</option>
-            </param>
-
-            <param name="primary"
-                   type="boolean"
-                   truevalue=" --primary"
-                   falsevalue=""
-                   argument="--primary"
-                   label="Only count primary alignments"
-                   help="If specified, only primary alignments will be counted. Primary and secondary alignments are identified using bit 0x100 in the Flag field of SAM/BAM files. All primary alignments in a dataset will be counted regardless of whether they are from multi-mapping reads or not ('-M' is ignored)." />
-
-            <param name="ignore_dup"
-                   type="boolean"
-                   truevalue=" --ignoreDup"
-                   falsevalue=""
-                   argument="--ignoreDup"
-                   label="Ignore reads marked as duplicate"
-                   help="If specified, reads that were marked as duplicates will be ignored. Bit 0x400 in FLAG field of SAM/BAM file is used for identifying duplicate reads. In paired end data, the entire read pair will be ignored if at least one end is found to be a duplicate read." />
-
-            <param name="count_split_alignments_only"
-                   type="boolean"
-                   truevalue=" --countSplitAlignmentsOnly"
-                   falsevalue=""
-                   argument="--countSplitAlignmentsOnly"
-                   label="Count split alignments only"
-                   help="If specified, only split alignments (CIGAR strings containing letter `N') will be counted. All the other alignments will be ignored. An example of split alignments is the exon-spanning reads in RNA-seq data." />
-        </section>
-    </inputs>
-    <outputs>
-        <data format="tabular"
-              name="output_medium"
-              label="${tool.name} on ${on_string}">
-            <filter>format == "tabdel_medium"</filter>
-            <actions>
-                <action name="column_names" type="metadata" default="Geneid,${alignment.name},Length" />
-            </actions>
-        </data>
-
-        <data format="tabular"
-              name="output_short"
-              label="${tool.name} on ${on_string}">
-            <filter>format == "tabdel_short"</filter>
-            <actions>
-                <action name="column_names" type="metadata" default="Geneid,${alignment.name}" />
-            </actions>
-        </data>
-
-        <data format="tabular"
-              name="output_full"
-              label="${tool.name} on ${on_string}: count table">
-            <filter>format == "tabdel_full"</filter>
-            <actions>
-                <action name="column_names" type="metadata" default="Geneid,Chr,Start,End,Strand,Length,${alignment.name}" />
-            </actions>
-        </data>
-
-        <data format="tabular"
-              name="output_summary"
-              hidden="true"
-              label="${tool.name} on ${on_string}: summary">
-            <actions>
-                <action name="column_names" type="metadata" default="Status,${alignment.name}" />
-            </actions>
-        </data>
-
-        <data format="tabular"
-              name="output_feature_lengths"
-              label="${tool.name} on ${on_string}: feature lengths">
-              <filter>include_feature_length_file</filter>
-            <actions>
-                <action name="column_names" type="metadata" default="Feature,Length" />
-            </actions>
-        </data>
-    </outputs>
-    <tests>
-        <test>
-            <param name="alignment" value="featureCounts_input1.bam" ftype="bam" />
-            <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" />
-            <param name="format" value="tabdel_short" />
-            <param name="include_feature_length_file" value="true"/>
-            <param name="ref_source" value="history" />
-            <output name="output" file="output_1_short.tab"/>
-            <output name="output_summary" file="output_1_summary.tab"/>
-        </test>
-        <test>
-            <param name="alignment" value="featureCounts_input1.bam" ftype="bam" />
-            <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" />
-            <param name="format" value="tabdel_medium" />
-            <param name="include_feature_length_file" value="true"/>
-            <param name="ref_source" value="history" />
-            <output name="output" file="output_1_medium.tab"/>
-            <output name="output_summary" file="output_1_summary.tab"/>
-        </test>
-        <test>
-            <param name="alignment" value="featureCounts_input1.bam" ftype="bam" />
-            <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" />
-            <param name="format" value="tabdel_full" />
-            <param name="include_feature_length_file" value="true"/>
-            <param name="ref_source" value="history" />
-            <output name="output" file="output_1_full.tab"/>
-            <output name="output_summary" file="output_1_summary.tab"/>
-            <output name="output_feature_lengths" file="output_feature_lengths.tab"/>
-        </test>
-
-        <test>
-            <param name="alignment" value="featureCounts_input1.bam" ftype="bam" />
-            <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" />
-            <param name="format" value="tabdel_short" />
-            <param name="include_feature_length_file" value="true"/>
-            <param name="ref_source" value="history" />
-            <output name="output" file="output_2_short.tab"/>
-            <output name="output_summary" file="output_2_summary.tab"/>
-        </test>
-        <test>
-            <param name="alignment" value="featureCounts_input1.bam" ftype="bam" />
-            <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" />
-            <param name="format" value="tabdel_medium" />
-            <param name="include_feature_length_file" value="true"/>
-            <param name="ref_source" value="history" />
-            <output name="output" file="output_2_medium.tab"/>
-            <output name="output_summary" file="output_2_summary.tab"/>
-        </test>
-        <test>
-            <param name="alignment" value="featureCounts_input1.bam" ftype="bam" />
-            <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" />
-            <param name="format" value="tabdel_full" />
-            <param name="include_feature_length_file" value="true"/>
-            <param name="ref_source" value="history" />
-            <output name="output" file="output_2_full.tab"/>
-            <output name="output_summary" file="output_2_summary.tab"/>
-            <output name="output_feature_lengths" file="output_feature_lengths.tab"/>
-        </test>
-    </tests>
-
-    <help><![CDATA[
-featureCounts
-#############
-
-Overview
---------
-FeatureCounts is a light-weight read counting program written entirely in the C programming language. It can be used to count both gDNA-seq and RNA-seq reads for genomic features in SAM/BAM files.
-
-Input formats
--------------
-Alignments should be provided in either:
-
- - SAM format, http://samtools.sourceforge.net/samtools.shtml#5
- - BAM format
-
-Gene regions should be provided in the GFF/GTF format:
-
- - http://genome.ucsc.edu/FAQ/FAQformat.html#format3
- - http://www.ensembl.org/info/website/upload/gff.html
-
-Output format
--------------
-FeatureCounts produces a table containing counted reads, per gene, per row. Optionally the last column can be set to be the effective gene-length. These tables are compatible with the DESeq2 Galaxy wrapper by IUC. Column names are added as metadata object.
-    ]]></help>
-    <citations>
-        <citation type="doi">10.1093/bioinformatics/btt656</citation>
-    </citations>
-</tool>
--- a/test-data/output_2_medium.tab	Wed Nov 09 16:29:18 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-left	92	170000
-right	66	170000
--- a/test-data/output_2_short.tab	Wed Nov 09 16:29:18 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-left	92
-right	66
--- a/test-data/output_2_summary.tab	Wed Nov 09 16:29:18 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,11 +0,0 @@
-Assigned	158
-Unassigned_Ambiguity	0
-Unassigned_MultiMapping	0
-Unassigned_NoFeatures	6078
-Unassigned_Unmapped	0
-Unassigned_MappingQuality	0
-Unassigned_FragmentLength	0
-Unassigned_Chimera	0
-Unassigned_Secondary	0
-Unassigned_Nonjunction	0
-Unassigned_Duplicate	0
--- a/test-data/output_feature_lengths.tab	Wed Nov 09 16:29:18 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-left	170000
-right	170000
--- a/tool-data/gene_sets.loc	Wed Nov 09 16:29:18 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,14 +0,0 @@
-# This is a sample file distributed with featureCounts that enables it and other tools to use gene/exon annotations in the GFF/GTF format.
-#
-# The gene_sets.loc file syntax is:
-#<unique_build_id>	<dbkey>	<display_name>	<path>
-#
-# Please ensure that the above fields are tab separated.
-#
-# In case you have TWO or MORE providers PER dbkey, the one mentioned
-# first in the file, should have the "default" priority.
-#
-#Example:
-#
-#Homo_sapiens.GRCh37.74	hg19	GRCh37 (hg19) annotation from Ensembl, release 74	/depot/data2/galaxy/hg19/gene_sets/Homo_sapiens.GRCh37.74.gtf
-#Homo_sapiens.NCBI36.54	hg18	hg18 annotation from Ensembl, release 54	/depot/data2/galaxy/hg18/gene_sets/Homo_sapiens.NCBI36.54.gtf
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gene_sets.loc.sample	Wed Nov 09 16:36:46 2016 -0500
@@ -0,0 +1,14 @@
+# This is a sample file distributed with featureCounts that enables it and other tools to use gene/exon annotations in the GFF/GTF format.
+#
+# The gene_sets.loc file syntax is:
+#<unique_build_id>	<dbkey>	<display_name>	<path>
+#
+# Please ensure that the above fields are tab separated.
+#
+# In case you have TWO or MORE providers PER dbkey, the one mentioned
+# first in the file, should have the "default" priority.
+#
+#Example:
+#
+#Homo_sapiens.GRCh37.74	hg19	GRCh37 (hg19) annotation from Ensembl, release 74	/depot/data2/galaxy/hg19/gene_sets/Homo_sapiens.GRCh37.74.gtf
+#Homo_sapiens.NCBI36.54	hg18	hg18 annotation from Ensembl, release 54	/depot/data2/galaxy/hg18/gene_sets/Homo_sapiens.NCBI36.54.gtf