comparison htseqsams2mx.xml @ 56:9b59cd40f20d draft

Uploaded
author iuc
date Tue, 28 Apr 2015 22:56:39 -0400
parents
children 57841366f112
comparison
equal deleted inserted replaced
55:bf016b884c68 56:9b59cd40f20d
1 <tool id="htseqsams2mxlocal" name="SAM/BAM to count matrix" version="0.5">
2 <description>using HTSeq code</description>
3 <stdio>
<!-- Declare any non-zero exit status fatal so a crashed run is reported as a failed job;
     the catch-all regex below only ever raises warnings and would not flag it. -->
<exit_code range="1:" level="fatal" description="htseqsams2mx.py exited with an error"/>
<!-- Everything written to stdout or stderr is reported as a warning only, labelled as HTSeq chatter. -->
4 <regex match=".*" source="both" level="warning" description="chatter from HTSeq:"/>
5 </stdio>
6 <requirements>
<!-- Package dependencies of the wrapper script, each requested at an exact version. -->
7 <requirement version="0.7.6" type="package">pysam</requirement>
8 <requirement version="1.2.1" type="package">matplotlib</requirement>
9 <requirement version="0.5.4p3" type="package">htseq</requirement>
10 </requirements>
<!-- Cheetah command template for htseqsams2mx.py, run through the python interpreter.
     It passes the gene model file, the output path, the counting model, the id attribute,
     the feature type and the minimum mapq. For every selected input whose extension is not
     'data' it emits one samf argument carrying the dataset path, its name, its extension and
     its bam_index metadata value. The filter_extras argument is added only when that
     parameter evaluates as true.
     NOTE(review): the "stranded" input is not referenced anywhere in this template; confirm
     whether it is meant to be passed to the script.
     This comment sits outside the command element on purpose so the template text is untouched. -->
11 <command interpreter="python">
12 htseqsams2mx.py -g "$gfffile" -o "$outfile" -m "$model" --id_attribute "$id_attr" --feature_type "$feature_type"
13 --mapqMin $mapqMin
14 #for $s in $samfiles:
15 #if $s.ext != 'data':
16 --samf "'${s}','${s.name}','${s.ext}','${s.metadata.bam_index}'"
17 #end if
18 #end for
19 #if $filter_extras:
20 --filter_extras "$filter_extras"
21 #end if
22 </command>
23 <inputs>
<!-- User-facing parameters: the gene model, the read filters, the counting options and the
     sam/bam datasets to be counted. -->
24 <param format="gtf" name="gfffile" type="data" label="Gene model (GFF) file to count reads over from your current history" size="100" />
25 <param name="mapqMin" label="Filter reads with mapq below this value"
26 help="0 to count any mapping quality read. Otherwise only reads at or above specified mapq will be counted"
27 type="integer" value="5"/>
28 <param name="title" label="Name for this job's output file" type="text" size="80" value="bams to DGE count matrix"/>
29 <param name="stranded" value="false" type="boolean" label="Reads are stranded - use strand in counting" display="checkbox"
30 truevalue="yes" falsevalue="no" checked="no" help="Check this ONLY if you know your sequences are strand specific" />
31 <param name="model" type="select" label="Model for counting reads over the supplied gene model- see HTSeq docs"
32 help="If in doubt, union is a reasonable default but intersection-strict avoids double counting over overlapping exons">
33 <option value="union" selected="true">union</option>
34 <option value="intersection-strict">intersection-strict</option>
35 <option value="intersection-nonempty">intersection-nonempty</option>
36 </param>
37 <param name="id_attr" type="select" label="GTF attribute to output as the name for each contig - see HTSeq docs"
38 help="If in doubt, use gene name or if you need the id in your GTF, gene id">
39 <option value="gene_name" selected="true">gene name</option>
40 <option value="gene_id">gene id</option>
41 <option value="transcript_id">transcript id</option>
42 <option value="transcript_name">transcript name</option>
43 </param>
44 <param name="feature_type" type="select" label="GTF feature type for counting reads over the supplied gene model- see HTSeq docs"
45 help="GTF feature type to count over - exon is a good choice with gene name as the contig to count over">
46 <option value="exon" selected="true">exon</option>
47 <option value="CDS">CDS</option>
48 <option value="UTR">UTR</option>
49 <option value="transcript">transcript</option>
50 </param>
<!-- The label offers "one or more flags", so this select has to accept multiple choices. -->
51 <param name="filter_extras" type="select" label="Filter any read with one or more flags"
52 help="eg the XS tag created by bowtie for multiple reads" optional="true" multiple="true">
53 <option value="">None</option>
54 <option value="XS">XS:i > 0 - More than one mapping position Bowtie</option>
55 <option value="XS:A">Might be useful for tophat</option>
56 </param>
57
58 <param name="samfiles" type="data" label="bam/sam file from your history" format="sam,bam" size="100" multiple="true"/>
59 </inputs>
60 <outputs>
<!-- One tabular dataset; its history label is built from the user-supplied title. -->
61 <data name="outfile" format="tabular" label="${title}_htseqsams2mx.xls" />
62 </outputs>
63 <tests>
<!-- Functional test: two bam inputs counted over a small GTF and compared with the stored
     matrix, allowing one differing line. Params follow the order of the inputs section. -->
64 <test>
65 <param name="gfffile" value="rn4_chr20_100k.gtf" />
66 <param name="mapqMin" value="0" />
67 <param name="title" value="htseqtest" />
68 <param name="stranded" value="no" />
69 <param name="model" value="union" />
70 <param name="id_attr" value="gene_name" />
71 <param name="feature_type" value="exon" />
72 <param name="samfiles" value="rn4chr20test1.bam,rn4chr20test2.bam" ftype="bam"/>
73
74 <output name="outfile" file="htseqsams2mx_test1_out.xls" lines_diff="1"/>
75 </test>
76 </tests>
77 <help>
78
79 **What this tool does**
80
81 Counts reads in multiple sam/bam format mapped files and generates a matrix ideal for edgeR and other count based tools.
82 It uses HTSeq to count your sam reads over a gene model supplied as a GTF file.
83 The output is a tabular text (columnar - spreadsheet) file containing the
84 count matrix for downstream processing. Each row contains the counts from each sample for each
85 of the non-empty GTF input file contigs matching the GTF attribute choice above.
86 You probably want to use gene level GTF output attribute and count reads that overlap
87 GTF exons for RNA-seq. Or you can count over exons by using transcript level output names or ids. Etc.
88
89 ----
90
91 **Author's plea on replicates**
92
93 If you want to interpret the downstream p values in terms of rejecting or accepting the null hypothesis
94 under random sampling with replacement from the universe of possible biological/experimental replicates from which your data was derived,
95 which is what published p values are often assumed to do, then you need biological
96 (or for cell culture material experimental) replicates.
97
98 Using technical or no replicates means the downstream p values are not interpretable the way most people would assume
99 they are - ie as the probability of obtaining a result as extreme as, or more extreme than, your experimental data
100 in millions of experiments conducted using the same methods under the null hypothesis.
101
102 There is no way around this and it is scientific fraud to ignore this issue and publish bogus p values derived from
103 technical or no replicates without making the lack of biological or experimental error in the p value calculations
104 clear to your readers so they can adjust their expectations. However, the buck stops here at higher level inference.
105 If you have no replicates, you must not use this tool as the p values are uninterpretable. So there.
106
107 See your stats 101 notes on the central limit theorem and test statistics for a refresher or talk to a
108 statistician if this makes no sense please.
109
110 **Attribution**
111
112 This Galaxy tool relies on HTSeq_ from http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
113 for the tricky work of counting. That code includes the following attribution:
114
115 ## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology
116 ## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General
117 ## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3
118
119 It will be automatically installed if you use the toolshed as in general, you probably should.
120 HTSeq_ must be installed with this tool if you install manually.
121
122 Otherwise, all code and documentation comprising this tool including the requirement
123 for more than one sample bam
124 was written by Ross Lazarus and is
125 licensed to you under the LGPL_ like other rgenetics artefacts
126
127 Sorry, I don't use readgroups so had no reason to code read groups. Contributions welcome. Send code
128
129 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
130 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
131 </help>
132
133 </tool>