comparison htseqsams2mx.xml @ 43:390cb852aae7 draft

Uploaded
author fubar
date Thu, 21 Nov 2013 17:39:01 -0500
parents
children 19207379d4cf
comparison
equal deleted inserted replaced
42:5c783df4f31c 43:390cb852aae7
1 <tool id="htseqsams2mxlocal" name="SAM/BAM to count matrix" version="0.4">
2 <description>using HTSeq code</description>
3 <stdio> <!-- Catch-all rule: whatever the tool prints on either output stream is classed as a warning. -->
4 <regex source="both" match=".*" description="chatter from HTSeq:" level="warning"/>
5 </stdio>
6 <requirements> <!-- Package dependencies of the wrapper, each pinned to an exact version. -->
7 <requirement version="1.7" type="package">numpy</requirement>
8 <requirement version="0.7.6" type="package">pysam</requirement>
9 <requirement version="2.4" type="package">freetype</requirement>
10 <requirement version="1.2" type="package">matplotlib</requirement>
11 <requirement version="0.5.4p3" type="package">htseq</requirement>
12 </requirements>
13 <!-- Builds the htseqsams2mx.py call. Every alignment is handed over as one samf argument made of four quoted, comma-separated fields: dataset path, name, extension and bam_index metadata. The second and the repeated alignments are added only when their extension is not 'data' (presumably how an unset dataset appears; TODO confirm). The filter_extras argument is added only when a value was chosen. NOTE(review): the stranded input is declared in the inputs section but never referenced in this template; confirm whether it should be forwarded to the script. --> <command interpreter="python">
14 htseqsams2mx.py -g "$gfffile" -o "$outfile" -m "$model" --id_attribute "$id_attr" --feature_type "$feature_type"
15 --mapqMin $mapqMin --samf "'${firstsamf}','${firstsamf.name}','${firstsamf.ext}','${firstsamf.metadata.bam_index}'"
16 #if $secondsamf.ext != 'data':
17 --samf "'${secondsamf}','${secondsamf.name}','${secondsamf.ext}','${secondsamf.metadata.bam_index}'"
18 #end if
19 #for $s in $samfiles:
20 #if $s.samf.ext != 'data':
21 --samf "'${s.samf}','${s.samf.name}','${s.samf.ext}','${s.samf.metadata.bam_index}'"
22 #end if
23 #end for
24 #if $filter_extras:
25 --filter_extras "$filter_extras"
26 #end if
27 </command>
28 <inputs>
29 <param format="gtf" name="gfffile" type="data" label="Gene model (GFF) file to count reads over from your current history" size="100" /> <!-- gene model handed to the script with -g -->
30 <param name="mapqMin" label="Filter reads with mapq below this value"
31 help="0 to count any mapping quality read. Otherwise only reads at or above specified mapq will be counted"
32 type="integer" value="5"/> <!-- label grammar fixed: it previously read "below than this value" -->
33 <param name="title" label="Name for this job's output file" type="text" size="80" value="bams to DGE count matrix"/> <!-- used in the label of the output dataset -->
34 <param name="stranded" value="false" type="boolean" label="Reads are stranded - use strand in counting" display="checkbox"
35 truevalue="yes" falsevalue="no" checked="no" help="Check this ONLY if you know your sequences are strand specific" /> <!-- NOTE(review): not referenced by the command template in this file; confirm it reaches the script -->
36 <param type="select" name="model" label="Model for counting reads over the supplied gene model- see HTSeq docs"
37 help="If in doubt, union is a reasonable default but intersection-strict avoids double counting over overlapping exons"> <!-- counting mode, passed to the script with -m; union is preselected -->
38 <option selected="true" value="union">union</option>
39 <option value="intersection-strict">intersection-strict</option>
40 <option value="intersection-nonempty">intersection-nonempty</option>
41 </param>
42 <param type="select" name="id_attr" label="GTF attribute to output as the name for each contig - see HTSeq docs"
43 help="If in doubt, use gene name or if you need the id in your GTF, gene id"> <!-- passed to the script as its id_attribute option; gene_name is preselected -->
44 <option selected="true" value="gene_name">gene name</option>
45 <option value="gene_id">gene id</option>
46 <option value="transcript_id">transcript id</option>
47 <option value="transcript_name">transcript name</option>
48 </param>
49 <param type="select" name="feature_type" label="GTF feature type for counting reads over the supplied gene model- see HTSeq docs"
50 help="GTF feature type to count over - exon is a good choice with gene name as the contig to count over"> <!-- passed to the script as its feature_type option; exon is preselected -->
51 <option selected="true" value="exon">exon</option>
52 <option value="CDS">CDS</option>
53 <option value="UTR">UTR</option>
54 <option value="transcript">transcript</option>
55 </param>
56 <param name="filter_extras" type="select" label="Filter any read with one or more flags"
57 help="eg the XS tag created by bowtie for multiple reads" optional="true" multiple="true"> <!-- attribute was misspelled "mutliple", so multi-selection never took effect; the value is forwarded to the script only when something is chosen -->
58 <option value="">None</option>
59 <option value="XS">XS:i > 0 - More than one mapping position Bowtie</option>
60 <option value="XS:A">Might be useful for tophat</option>
61 </param>
62
63 <param name="firstsamf" type="data" format="sam,bam" optional="false" size="100"
64 label="bam/sam file from your history"
65 help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs"/> <!-- first mandatory alignment -->
66 <param name="secondsamf" type="data" format="sam,bam" optional="false" size="100"
67 label="Additional bam/sam file from your history"
68 help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs"/> <!-- second mandatory alignment -->
69 <repeat name="samfiles" title="Specify additional bam/sam file inputs" min="16"
70 help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs"> <!-- further optional alignments, one per repeat entry -->
71 <param name="samf" type="data" format="sam,bam" optional="true" size="100"
72 label="Additional bam/sam file from your history"/>
73 </repeat>
74 </inputs>
75 <outputs> <!-- One tabular dataset; its label is built from the title input. -->
76 <data name="outfile" format="tabular" label="${title}_htseqsams2mx.xls" />
77 </outputs>
78 <tests> <!-- One functional test: two BAM inputs counted over a GTF, compared with a stored matrix allowing one differing line. -->
79 <test>
80 <param name="gfffile" value="rn4_chr20_100k.gtf" />
81 <param name="mapqMin" value="0" />
82 <param name="title" value="htseqtest" />
83 <param name="stranded" value="no" />
84 <param name="model" value="union" />
85 <param name="id_attr" value="gene_name" />
86 <param name="feature_type" value="exon" />
87 <param name="firstsamf" ftype="bam" value="rn4chr20test1.bam"/>
88 <param name="secondsamf" ftype="bam" value="rn4chr20test2.bam"/>
89
90 <output name="outfile" file="htseqsams2mx_test1_out.xls" lines_diff="1"/>
91 </test>
92 </tests>
93 <help>
94
95 **What this tool does**
96
97 Counts reads in multiple sam/bam format mapped files and generates a matrix ideal for edgeR and other count based tools.
98 It uses HTSeq to count your sam reads over a gene model supplied as a GTF file.
99 The output is a tabular text (columnar - spreadsheet) file containing the
100 count matrix for downstream processing. Each row contains the counts from each sample for each
101 of the non-empty GTF input file contigs matching the GTF attribute choice above.
102 You probably want to use gene level GTF output attribute and count reads that overlap
103 GTF exons for RNA-seq. Or you can count over exons by using transcript level output names or ids. Etc.
104
105 ----
106
107 **Author's plea on replicates**
108
109 If you want to interpret the downstream p values in terms of rejecting or accepting the null hypothesis
110 under random sampling with replacement from the universe of possible biological/experimental replicates from which your data was derived,
111 which is what published p values are often assumed to do, then you need biological
112 (or for cell culture material experimental) replicates.
113
114 Using technical or no replicates means the downstream p values are not interpretable the way most people would assume
115 they are - ie as the probability of obtaining a result at least as extreme as your experimental data
116 in millions of experiments conducted using the same methods under the null hypothesis.
117
118 There is no way around this and it is scientific fraud to ignore this issue and publish bogus p values derived from
119 technical or no replicates without making the lack of biological or experimental error in the p value calculations
120 clear to your readers so they can adjust their expectations. However, the buck stops here at higher level inference.
121 If you have no replicates, you must not use this tool as the p values are uninterpretable. So there.
122
123 See your stats 101 notes on the central limit theorem and test statistics for a refresher or talk to a
124 statistician if this makes no sense please.
125
126 **Attribution**
127
128 This Galaxy tool relies on HTSeq_ from http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
129 for the tricky work of counting. That code includes the following attribution:
130
131 ## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology
132 ## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General
133 ## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3
134
135 It will be automatically installed if you use the toolshed, as in general you probably should.
136 HTSeq_ must be installed with this tool if you install manually.
137
138 Otherwise, all code and documentation comprising this tool including the requirement
139 for more than one sample bam
140 was written by Ross Lazarus and is
141 licensed to you under the LGPL_ like other rgenetics artefacts
142
143 Sorry, I don't use readgroups so had no reason to code read groups. Contributions welcome. Send code
144
145 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
146 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
147 </help>
148
149 </tool>