Mercurial > repos > dpryan79 > featurecounts_test
changeset 1:60b43da9a265 draft
Uploaded
| author | dpryan79 | 
|---|---|
| date | Wed, 09 Nov 2016 16:36:46 -0500 | 
| parents | 3edb501ec957 | 
| children | 7c3041c778d0 | 
| files | .shed.yml README.rst featurecounts.xml test-data/output_2_medium.tab test-data/output_2_short.tab test-data/output_2_summary.tab test-data/output_feature_lengths.tab tool-data/gene_sets.loc tool-data/gene_sets.loc.sample | 
| diffstat | 9 files changed, 14 insertions(+), 537 deletions(-) [+] | 
line wrap: on
 line diff
--- a/.shed.yml Wed Nov 09 16:29:18 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,12 +0,0 @@ -categories: -- RNA -- Transcriptomics -- Sequence Analysis -description: featureCounts counts the number of reads aligned to defined masked regions in a reference genome -long_description: | - Counts reads aligned to annotated genes in a reference genome from SAM or BAM files. -name: featurecounts -owner: iuc -homepage_url: http://bioinf.wehi.edu.au/featureCounts -remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/featurecounts -type: unrestricted
--- a/README.rst Wed Nov 09 16:29:18 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -FeatureCounts wrapper for Galaxy -================================ - -* http://bioinf.wehi.edu.au/featureCounts/ -* http://subread.sourceforge.net/ - -FeatureCounts as part of the SUBREAD package is "a highly efficient and -accurate read summarization program". - -Installation ------------- - -This wrapper requires Galaxy 16.04 to be fully functional because -of the following commits: - -* https://github.com/galaxyproject/galaxy/pull/961 -* https://github.com/galaxyproject/galaxy/pull/1714 - -License -------- - -**featureCounts**: - -GPL (>=3) -
--- a/featurecounts.xml Wed Nov 09 16:29:18 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,469 +0,0 @@ -<tool id="featurecounts" name="featureCounts" version="1.4.6.p5" profile="16.04"> - <description>Measure gene expression in RNA-Seq experiments from SAM or BAM files.</description> - <requirements> - <requirement type="package" version="1.4.6p5">subread</requirement> - </requirements> - - <version_command>featureCounts -v 2>&1 | grep .</version_command> - <command><![CDATA[ - ## Check whether all alignments are from the same type (bam || sam) - featureCounts - #if $gtf_source.ref_source=="history": - -a "$gtf_source.reference_gene_sets" - #else: - -a "$gtf_source.reference_gene_sets_builtin.fields.path" - #end if - - -o "output" - -T \${GALAXY_SLOTS:-2} - - -t "$extended_parameters.gff_feature_type" - -g "$extended_parameters.gff_feature_attribute" - $extended_parameters.summarization_level - $extended_parameters.contribute_to_multiple_features - -s $extended_parameters.strand_specificity - $extended_parameters.multimapping_enabled.multimapping_counts - - #if str($extended_parameters.multimapping_enabled.multimapping_counts) == " -M" - $extended_parameters.multimapping_enabled.fraction - #end if - - -Q $extended_parameters.mapping_quality - $extended_parameters.largest_overlap - --minOverlap $extended_parameters.min_overlap - $extended_parameters.read_reduction - $extended_parameters.primary - $extended_parameters.ignore_dup - - #if str($extended_parameters.read_extension_5p) != "0" - --readExtension5 $extended_parameters.read_extension_5p - #end if - - #if str($extended_parameters.read_extension_3p) != "0" - --readExtension3 $extended_parameters.read_extension_3p - #end if - - $pe_parameters.fragment_counting_enabled.fragment_counting - #if str($pe_parameters.fragment_counting_enabled.fragment_counting) == " -p" - $pe_parameters.fragment_counting_enabled.check_distance_enabled.check_distance - #if 
str($pe_parameters.fragment_counting_enabled.check_distance_enabled.check_distance) == " -P" - -d $pe_parameters.fragment_counting_enabled.check_distance_enabled.minimum_fragment_length - -D $pe_parameters.fragment_counting_enabled.check_distance_enabled.maximum_fragment_length - #end if - #end if - - $pe_parameters.only_both_ends - -S $pe_parameters.orientation - $pe_parameters.exclude_chimerics - - "${alignment}" - - ## Removal of comment and column-header line - && grep -v "^#" "output" | tail -n+2 > body.txt - - ## Set the right columns for the tabular formats - #if $format.value == "tabdel_medium" - && cut -f 1,7 body.txt > expression_matrix.txt - - ## Paste doesn't allow a non ordered list of columns: -f 1,7,8,6 will only return columns 1,7 and 8 - ## Thus the gene length column (last column) has to be added separately - && cut -f 6 body.txt > gene_lengths.txt - && paste expression_matrix.txt gene_lengths.txt > expression_matrix.txt.bak - && mv -f expression_matrix.txt.bak "${output_medium}" - #elif $format.value == "tabdel_short" - && cut -f 1,7 body.txt > "${output_short}" - #else - && cp body.txt "${output_full}" - #end if - - - #if str($include_feature_length_file) == "true" - && cut -f 1,6 body.txt > "${output_feature_lengths}" - #end if - - && tail -n+2 "output.summary" > "${output_summary}" - - ]]></command> - <inputs> - <param name="alignment" - type="data" - multiple="false" - format="bam,sam" - label="Alignment file" - help="The input alignment file(s) where the gene expression has to be counted. 
The file can have a SAM or BAM format; but ALL files must be in the same format" /> - - <conditional name="gtf_source"> - <param name="ref_source" type="select" label="Gene annotation file"> - <option value="cached">locally cached</option> - <option value="history">in your history</option> - </param> - <when value="cached"> - <param name="reference_gene_sets_builtin" type="select" label="Using locally cached annotation" help="If the annotation file you require is not listed here, please contact the Galaxy administrator"> - <options from_data_table="gene_sets"> - <filter type="sort_by" column="1" /> - <validator type="no_options" message="No annotations are available." /> - </options> - </param> - </when> - <when value="history"> - <param name="reference_gene_sets" - format="gff,gtf,gff3" - type="data" - label="Gene annotation file" - help="The program assumes that the provided annotation file is in GTF format. Make sure that the gene annotation file corresponds to the same reference genome as used for the alignment" /> - </when> - </conditional> - - <param name="format" - type="select" - label="Output format" - help="The output format will be tabular, select the preferred columns here"> - <option value="tabdel_short" selected="true">Gene-ID "\t" read-count (DESeq2 IUC wrapper compatible)</option> - <option value="tabdel_medium">Gene-ID "\t" read-count "\t" gene-length</option> - <option value="tabdel_full">featureCounts 1.4.0+ default (includes regions provided by the GTF file)</option> - </param> - - <param name="include_feature_length_file" - type="boolean" - truevalue="true" - falsevalue="false" - checked="false" - label="Create gene-length file" - help="Creates a tabular file that contains the effective (nucleotides used for counting reads) length of the feature; might be useful for estimating FPKM/RPKM" /> - - - <section name="pe_parameters" title="Options for paired-end reads"> - <conditional name="fragment_counting_enabled"> - - <param 
name="fragment_counting" - type="select" - argument="-p" - checked="true" - label="Count fragments instead of reads" - help="If specified, fragments (or templates) will be counted instead of reads."> - <option value="" selected="true">Disabled; all reads/mates will be counted individually</option> - <option value=" -p">Enabled; fragments (or templates) will be counted instead of reads</option> - </param> - - <when value=" -p"> - <conditional name="check_distance_enabled"> - <param name="check_distance" - type="boolean" - truevalue=" -P" - falsevalue="" - argument="-P" - label="Check paired-end distance" - help="If specified, paired-end distance will be checked when assigning fragments to meta-features or features. This option is only applicable when -p (Count fragments instead of reads) is specified. The distance thresholds should be specified using -d and -D (minimum and maximum fragment/template length) options." /> - <when value=" -P"> - <param name="minimum_fragment_length" - type="integer" - value="50" - argument="-d" - label="Minimum fragment/template length." /> - <param name="maximum_fragment_length" - type="integer" - value="600" - argument="-D" - label="Maximum fragment/template length." /> - </when> - <when value="" /> - </conditional> - </when> - <when value="" /> - </conditional> - - <param name="only_both_ends" - type="boolean" - truevalue=" -B" - falsevalue="" - argument="-B" - label="Only allow fragments with both reads aligned" - help="If specified, only fragments that have both ends successfully aligned will be considered for summarization. This option is only applicable for paired-end reads." 
/> - - <param name="orientation" - type="select" - label="Orientation of the two reads from the same pair" - argument="-S" - help="Default is 'fr'"> - <option value="fr" selected="true">Forward, Reverse (fr)</option> - <option value="ff">Forward, Forward (ff)</option> - <option value="rf">Reverse, Forward (rf)</option> - </param> - - <param name="exclude_chimerics" - type="boolean" - truevalue=" -C" - falsevalue="" - argument="-C" - checked="true" - label="Exclude chimeric fragments" - help="If specified, the chimeric fragments (those fragments that have their two ends aligned to different chromosomes) will NOT be included for summarization. This option is only applicable for paired-end read data." /> - </section> - - <section name="extended_parameters" title="Advanced options"> - <param name="gff_feature_type" - type="text" - value="exon" - argument="-t" - label="GFF feature type filter" - help="Specify the feature type. Only rows which have the matched feature type in the provided GTF annotation file will be included for read counting. `exon' by default." /> - - <param name="gff_feature_attribute" - type="text" - value="gene_id" - argument="-g" - label="GFF gene identifier" - help="Specify the attribute type used to group features (eg. exons) into meta-features (eg. genes), when GTF annotation is provided. `gene_id' by default. This attribute type is usually the gene identifier. This argument is useful for the meta-feature level summarization." /> - - <param name="summarization_level" - type="boolean" - truevalue=" -f" - falsevalue="" - argument="-f" - label="On feature level" - help="If specified, read summarization will be performed at the feature level. By default (-f is not specified), the read summarization is performed at the meta-feature level." 
/> - - <param name ="contribute_to_multiple_features" - type="boolean" - truevalue=" -O" - falsevalue="" - argument="-O" - label="Allow read to contribute to multiple features" - help="If specified, reads (or fragments if -p is specified) will be allowed to be assigned to more than one matched meta-feature (or matched feature if -f is specified)" /> - - <param name="strand_specificity" - type="select" - label="Strand specificity of the protocol" - argument="-s" - help="Indicate if strand-specific read counting should be performed."> - <option value="0" selected="true">Unstranded</option> - <option value="1">Stranded (forwards)</option> - <option value="2">Stranded (reverse)</option> - </param> - - <conditional name="multimapping_enabled"> - <param name="multimapping_counts" - type="select" - argument="-M" - label="Count multi-mapping reads/fragments" - help="If specified, multi-mapping reads/fragments will be counted (ie. a multi-mapping read will be counted up to N times if it has N reported mapping locations). The program uses the `NH' tag to find multi-mapping reads."> - <option value="" selected="true">Disabled; multi-mapping reads are excluded (default)</option> - <option value=" -M">Enabled; multi-mapping reads are included</option> - </param> - <when value=" -M"> - <param name="fraction" - type="boolean" - truevalue="--fraction" - falsevalue="" - argument="--fraction" - label="Assign fractions to multimapping reads" - help="If specified, a fractional count 1/n will be generated for each multi-mapping read, where n is the number of alignments (indicated by 'NH' tag) reported for the read. This option must be used together with the '-M' option." /> - </when> - <when value="" /> - </conditional> - - <param name="mapping_quality" - type="integer" - value="12" - argument="-Q" - label="Minimum mapping quality per read" - help="The minimum mapping quality score a read must satisfy in order to be counted. 
For paired-end reads, at least one end should satisfy this criterion. 12 by default." /> - - <param name="largest_overlap" - type="boolean" - truevalue=" --largestOverlap" - falsevalue="" - argument="--largestOverlap" - label="Largest overlap" - help="If specified, reads (or fragments) will be assigned to the target that has the largest number of overlapping bases" /> - - <param name="min_overlap" - type="integer" - value="1" - argument="--minOverlap" - label="Minimum overlap" - help="Specify the minimum required number of overlapping bases between a read (or a fragment) and a feature. 1 by default. If a negative value is provided, the read will be extended from both ends." /> - - <param name="read_extension_5p" - type="integer" - value="0" - argument="--readExtension5" - label="Read 5' extension" - help="Reads are extended upstream by ... bases from their 5' end" /> - - <param name="read_extension_3p" - type="integer" - value="0" - argument="--readExtension3" - label="Read 3' extension" - help="Reads are extended downstream by ... bases from their 3' end" /> - - <param name="read_reduction" - type="select" - label="Reduce read to single position" - argument="--read2pos" - help="The read is reduced to its 5' most base or 3' most base. Read summarization is then performed based on the single base which the read is reduced to."> - <option value="" selected="true">Leave the read as it is</option> - <option value="--read2pos 5">Reduce it to the 5' end</option> - <option value="--read2pos 3">Reduce it to the 3' end</option> - </param> - - <param name="primary" - type="boolean" - truevalue=" --primary" - falsevalue="" - argument="--primary" - label="Only count primary alignments" - help="If specified, only primary alignments will be counted. Primary and secondary alignments are identified using bit 0x100 in the Flag field of SAM/BAM files. All primary alignments in a dataset will be counted no matter they are from multi-mapping reads or not ('-M' is ignored)." 
/> - - <param name="ignore_dup" - type="boolean" - truevalue=" --ignoreDup" - falsevalue="" - argument="--ignoreDup" - label="Ignore reads marked as duplicate" - help="If specified, reads that were marked as duplicates will be ignored. Bit 0x400 in FLAG field of SAM/BAM file is used for identifying duplicate reads. In paired end data, the entire read pair will be ignored if at least one end is found to be a duplicate read." /> - - <param name="count_split_alignments_only" - type="boolean" - truevalue=" --countSplitAlignmentsOnly" - falsevalue="" - argument="--countSplitAlignmentsOnly" - label="Count split alignments only" - help="If specified, only split alignments (CIGAR strings containing letter `N') will be counted. All the other alignments will be ignored. An example of split alignments is the exon-spanning reads in RNA-seq data." /> - </section> - </inputs> - <outputs> - <data format="tabular" - name="output_medium" - label="${tool.name} on ${on_string}"> - <filter>format == "tabdel_medium"</filter> - <actions> - <action name="column_names" type="metadata" default="Geneid,${alignment.name},Length" /> - </actions> - </data> - - <data format="tabular" - name="output_short" - label="${tool.name} on ${on_string}"> - <filter>format == "tabdel_short"</filter> - <actions> - <action name="column_names" type="metadata" default="Geneid,${alignment.name}" /> - </actions> - </data> - - <data format="tabular" - name="output_full" - label="${tool.name} on ${on_string}: count table"> - <filter>format == "tabdel_full"</filter> - <actions> - <action name="column_names" type="metadata" default="Geneid,Chr,Start,End,Strand,Length,${alignment.name}" /> - </actions> - </data> - - <data format="tabular" - name="output_summary" - hidden="true" - label="${tool.name} on ${on_string}: summary"> - <actions> - <action name="column_names" type="metadata" default="Status,${alignment.name}" /> - </actions> - </data> - - <data format="tabular" - name="output_feature_lengths" - 
label="${tool.name} on ${on_string}: feature lengths"> - <filter>include_feature_length_file</filter> - <actions> - <action name="column_names" type="metadata" default="Feature,Length" /> - </actions> - </data> - </outputs> - <tests> - <test> - <param name="alignment" value="featureCounts_input1.bam" ftype="bam" /> - <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" /> - <param name="format" value="tabdel_short" /> - <param name="include_feature_length_file" value="true"/> - <param name="ref_source" value="history" /> - <output name="output" file="output_1_short.tab"/> - <output name="output_summary" file="output_1_summary.tab"/> - </test> - <test> - <param name="alignment" value="featureCounts_input1.bam" ftype="bam" /> - <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" /> - <param name="format" value="tabdel_medium" /> - <param name="include_feature_length_file" value="true"/> - <param name="ref_source" value="history" /> - <output name="output" file="output_1_medium.tab"/> - <output name="output_summary" file="output_1_summary.tab"/> - </test> - <test> - <param name="alignment" value="featureCounts_input1.bam" ftype="bam" /> - <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" /> - <param name="format" value="tabdel_full" /> - <param name="include_feature_length_file" value="true"/> - <param name="ref_source" value="history" /> - <output name="output" file="output_1_full.tab"/> - <output name="output_summary" file="output_1_summary.tab"/> - <output name="output_feature_lengths" file="output_feature_lengths.tab"/> - </test> - - <test> - <param name="alignment" value="featureCounts_input1.bam" ftype="bam" /> - <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" /> - <param name="format" value="tabdel_short" /> - <param name="include_feature_length_file" value="true"/> - <param name="ref_source" value="history" /> - <output name="output" 
file="output_2_short.tab"/> - <output name="output_summary" file="output_2_summary.tab"/> - </test> - <test> - <param name="alignment" value="featureCounts_input1.bam" ftype="bam" /> - <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" /> - <param name="format" value="tabdel_medium" /> - <param name="include_feature_length_file" value="true"/> - <param name="ref_source" value="history" /> - <output name="output" file="output_2_medium.tab"/> - <output name="output_summary" file="output_2_summary.tab"/> - </test> - <test> - <param name="alignment" value="featureCounts_input1.bam" ftype="bam" /> - <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" /> - <param name="format" value="tabdel_full" /> - <param name="include_feature_length_file" value="true"/> - <param name="ref_source" value="history" /> - <output name="output" file="output_2_full.tab"/> - <output name="output_summary" file="output_2_summary.tab"/> - <output name="output_feature_lengths" file="output_feature_lengths.tab"/> - </test> - </tests> - - <help><![CDATA[ -featureCounts -############# - -Overview --------- -FeatureCounts is a light-weight read counting program written entirely in the C programming language. It can be used to count both gDNA-seq and RNA-seq reads for genomic features in SAM/BAM files. - -Input formats -------------- -Alignments should be provided in either: - - - SAM format, http://samtools.sourceforge.net/samtools.shtml#5 - - BAM format - -Gene regions should be provided in the GFF/GTF format: - - - http://genome.ucsc.edu/FAQ/FAQformat.html#format3 - - http://www.ensembl.org/info/website/upload/gff.html - -Output format -------------- -FeatureCounts produces a table containing counted reads, per gene, per row. Optionally the last column can be set to be the effective gene-length. These tables are compatible with the DESeq2 Galaxy wrapper by IUC. Column names are added as a metadata object. 
- ]]></help> - <citations> - <citation type="doi">10.1093/bioinformatics/btt656</citation> - </citations> -</tool>
--- a/test-data/output_2_medium.tab Wed Nov 09 16:29:18 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -left 92 170000 -right 66 170000
--- a/test-data/output_2_short.tab Wed Nov 09 16:29:18 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -left 92 -right 66
--- a/test-data/output_2_summary.tab Wed Nov 09 16:29:18 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -Assigned 158 -Unassigned_Ambiguity 0 -Unassigned_MultiMapping 0 -Unassigned_NoFeatures 6078 -Unassigned_Unmapped 0 -Unassigned_MappingQuality 0 -Unassigned_FragmentLength 0 -Unassigned_Chimera 0 -Unassigned_Secondary 0 -Unassigned_Nonjunction 0 -Unassigned_Duplicate 0
--- a/test-data/output_feature_lengths.tab Wed Nov 09 16:29:18 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -left 170000 -right 170000
--- a/tool-data/gene_sets.loc Wed Nov 09 16:29:18 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ -# This is a sample file distributed with featureCounts that enables it and other tools to use gene/exon annotations in the GFF/GTF format. -# -# The gene_sets.loc file syntax is: -#<unique_build_id> <dbkey> <display_name> <path> -# -# Please ensure that the above fields are tab separated. -# -# In case you have TWO or MORE providers PER dbkey, the one mentioned -# first in the file, should have the "default" priority. -# -#Example: -# -#Homo_sapiens.GRCh37.74 hg19 GRCh37 (hg19) annotation from Ensembl, release 74 /depot/data2/galaxy/hg19/gene_sets/Homo_sapiens.GRCh37.74.gtf -#Homo_sapiens.NCBI36.54 hg18 hg18 annotation from Ensembl, release 54 /depot/data2/galaxy/hg18/gene_sets/Homo_sapiens.NCBI36.54.gtf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/gene_sets.loc.sample Wed Nov 09 16:36:46 2016 -0500 @@ -0,0 +1,14 @@ +# This is a sample file distributed with featureCounts that enables it and other tools to use gene/exon annotations in the GFF/GTF format. +# +# The gene_sets.loc file syntax is: +#<unique_build_id> <dbkey> <display_name> <path> +# +# Please ensure that the above fields are tab separated. +# +# In case you have TWO or MORE providers PER dbkey, the one mentioned +# first in the file, should have the "default" priority. +# +#Example: +# +#Homo_sapiens.GRCh37.74 hg19 GRCh37 (hg19) annotation from Ensembl, release 74 /depot/data2/galaxy/hg19/gene_sets/Homo_sapiens.GRCh37.74.gtf +#Homo_sapiens.NCBI36.54 hg18 hg18 annotation from Ensembl, release 54 /depot/data2/galaxy/hg18/gene_sets/Homo_sapiens.NCBI36.54.gtf
