Mercurial > repos > fubar > htseq_bams_to_count_matrix
comparison htseqsams2mx.xml @ 43:390cb852aae7 draft
Uploaded
author | fubar |
---|---|
date | Thu, 21 Nov 2013 17:39:01 -0500 |
parents | |
children | 19207379d4cf |
comparison
equal
deleted
inserted
replaced
42:5c783df4f31c | 43:390cb852aae7 |
---|---|
1 <tool id="htseqsams2mxlocal" name="SAM/BAM to count matrix" version="0.4"> | |
2 <description>using HTSeq code</description> | |
3 <!-- Every line on stdout or stderr matches ".*" and is reported at warning level --> <stdio> | |
4 <regex level="warning" source="both" match=".*" description="chatter from HTSeq:"/> | |
5 </stdio> | |
6 <!-- Package dependencies declared for this tool; each entry names one exact version --> <requirements> | |
7 <requirement type="package" version="1.7">numpy</requirement> | |
8 <requirement type="package" version="0.7.6">pysam</requirement> | |
9 <requirement type="package" version="2.4">freetype</requirement> | |
10 <requirement type="package" version="1.2">matplotlib</requirement> | |
11 <requirement type="package" version="0.5.4p3">htseq</requirement> | |
12 </requirements> | |
13 <!-- Templated command line for htseqsams2mx.py. firstsamf is always passed as a samf argument; secondsamf and each samfiles repeat entry are passed only when their ext is not 'data'; the filter_extras option is appended only when a value is set. NOTE(review): the stranded input is never referenced here - confirm whether the script should receive it. --> <command interpreter="python"> | |
14 htseqsams2mx.py -g "$gfffile" -o "$outfile" -m "$model" --id_attribute "$id_attr" --feature_type "$feature_type" | |
15 --mapqMin $mapqMin --samf "'${firstsamf}','${firstsamf.name}','${firstsamf.ext}','${firstsamf.metadata.bam_index}'" | |
16 #if $secondsamf.ext != 'data': | |
17 --samf "'${secondsamf}','${secondsamf.name}','${secondsamf.ext}','${secondsamf.metadata.bam_index}'" | |
18 #end if | |
19 #for $s in $samfiles: | |
20 #if $s.samf.ext != 'data': | |
21 --samf "'${s.samf}','${s.samf.name}','${s.samf.ext}','${s.samf.metadata.bam_index}'" | |
22 #end if | |
23 #end for | |
24 #if $filter_extras: | |
25 --filter_extras "$filter_extras" | |
26 #end if | |
27 </command> | |
28 <!-- User-facing parameters: gene model, mapq floor, output title, counting options, read-tag filter, then the alignment files --> <inputs> | |
29 <param format="gtf" name="gfffile" type="data" label="Gene model (GFF) file to count reads over from your current history" size="100" /> | |
30 <param name="mapqMin" label="Filter reads with mapq below than this value" | |
31 help="0 to count any mapping quality read. Otherwise only reads at or above specified mapq will be counted" | |
32 type="integer" value="5"/> | |
33 <param name="title" label="Name for this job's output file" type="text" size="80" value="bams to DGE count matrix"/> | |
34 <param name="stranded" value="false" type="boolean" label="Reads are stranded - use strand in counting" display="checkbox" | |
35 truevalue="yes" falsevalue="no" checked="no" help="Check this ONLY if you know your sequences are strand specific" /> | |
36 <param name="model" type="select" label="Model for counting reads over the supplied gene model- see HTSeq docs" | |
37 help="If in doubt, union is a reasonable default but intersection-strict avoids double counting over overlapping exons"> | |
38 <option value="union" selected="true">union</option> | |
39 <option value="intersection-strict">intersection-strict</option> | |
40 <option value="intersection-nonempty">intersection-nonempty</option> | |
41 </param> | |
42 <param name="id_attr" type="select" label="GTF attribute to output as the name for each contig - see HTSeq docs" | |
43 help="If in doubt, use gene name or if you need the id in your GTF, gene id"> | |
44 <option value="gene_name" selected="true">gene name</option> | |
45 <option value="gene_id">gene id</option> | |
46 <option value="transcript_id">transcript id</option> | |
47 <option value="transcript_name">transcript name</option> | |
48 </param> | |
49 <param name="feature_type" type="select" label="GTF feature type for counting reads over the supplied gene model- see HTSeq docs" | |
50 help="GTF feature type to count over - exon is a good choice with gene name as the contig to count over"> | |
51 <option value="exon" selected="true">exon</option> | |
52 <option value="CDS">CDS</option> | |
53 <option value="UTR">UTR</option> | |
54 <option value="transcript">transcript</option> | |
55 </param> | |
56 <!-- Optional multi-select of read tags to filter on. NOTE(review): confirm htseqsams2mx.py accepts several comma-joined tags in filter_extras --> <param name="filter_extras" type="select" label="Filter any read with one or more flags" | |
57 help="eg the XS tag created by bowtie for multiple reads" optional="true" multiple="true"> | |
58 <option value="">None</option> | |
59 <option value="XS">XS:i > 0 - More than one mapping position Bowtie</option> | |
60 <option value="XS:A">Might be useful for tophat</option> | |
61 </param> | |
62 <!-- Alignment inputs: two non-optional files, then a repeat of optional extra files --> | |
63 <param name="firstsamf" type="data" label="bam/sam file from your history" format="sam,bam" size="100" | |
64 help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs" | |
65 optional="false"/> | |
66 <param name="secondsamf" type="data" label="Additional bam/sam file from your history" format="sam,bam" size="100" | |
67 help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs" | |
68 optional="false"/> | |
69 <repeat name="samfiles" min="16" | |
70 title="Specify additional bam/sam file inputs" help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs"> | |
71 <param name="samf" type="data" label="Additional bam/sam file from your history" format="sam,bam" size="100" | |
72 optional="true"/> | |
73 </repeat> | |
74 </inputs> | |
75 <!-- One tabular dataset; its history label is built from the title parameter --> <outputs> | |
76 <data name="outfile" label="${title}_htseqsams2mx.xls" format="tabular" /> | |
77 </outputs> | |
78 <!-- One functional test: two BAM inputs counted over a GTF (exon features, gene_name ids, union model, mapqMin 0); outfile is compared with the expected file, tolerating lines_diff="1" --> <tests> | |
79 <test> | |
80 <param name="feature_type" value="exon" /> | |
81 <param name="gfffile" value="rn4_chr20_100k.gtf" /> | |
82 <param name="firstsamf" value="rn4chr20test1.bam" ftype="bam"/> | |
83 <param name="secondsamf" value="rn4chr20test2.bam" ftype="bam"/> | |
84 <param name="id_attr" value="gene_name" /> | |
85 <param name="model" value="union" /> | |
86 <param name="stranded" value="no" /> | |
87 <param name="title" value="htseqtest" /> | |
88 <param name="mapqMin" value="0" /> | |
89 | |
90 <output name="outfile" file="htseqsams2mx_test1_out.xls" lines_diff="1"/> | |
91 </test> | |
92 </tests> | |
93 <help> | |
94 | |
95 **What this tool does** | |
96 | |
97 Counts reads in multiple sam/bam format mapped files and generates a matrix ideal for edgeR and other count-based tools. | |
98 It uses HTSeq to count your sam reads over a gene model supplied as a GTF file | |
99 The output is a tabular text (columnar - spreadsheet) file containing the | |
100 count matrix for downstream processing. Each row contains the counts from each sample for each | |
101 of the non-empty GTF input file contigs matching the GTF attribute choice above. | |
102 You probably want to use gene level GTF output attribute and count reads that overlap | |
103 GTF exons for RNA-seq. Or you can count over exons by using transcript level output names or ids. Etc. | |
104 | |
105 ---- | |
106 | |
107 **Author's plea on replicates** | |
108 | |
109 If you want to interpret the downstream p values in terms of rejecting or accepting the null hypothesis | |
110 under random sampling with replacement from the universe of possible biological/experimental replicates from which your data was derived, | |
111 which is what published p values are often assumed to do, then you need biological | |
112 (or, for cell culture material, experimental) replicates. | |
113 | |
114 Using technical or no replicates means the downstream p values are not interpretable the way most people would assume | |
115 they are - ie as the probability of obtaining a result as extreme as, or more extreme than, your experimental data | |
116 in millions of experiments conducted using the same methods under the null hypothesis. | |
117 | |
118 There is no way around this and it is scientific fraud to ignore this issue and publish bogus p values derived from | |
119 technical or no replicates without making the lack of biological or experimental error in the p value calculations | |
120 clear to your readers so they can adjust their expectations. However, the buck stops here at higher level inference. | |
121 If you have no replicates, you must not use this tool as the p values are uninterpretable. So there. | |
122 | |
123 See your stats 101 notes on the central limit theorem and test statistics for a refresher or talk to a | |
124 statistician if this makes no sense please. | |
125 | |
126 **Attribution** | |
127 | |
128 This Galaxy tool relies on HTSeq_ from http://www-huber.embl.de/users/anders/HTSeq/doc/index.html | |
129 for the tricky work of counting. That code includes the following attribution: | |
130 | |
131 ## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology | |
132 ## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General | |
133 ## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3 | |
134 | |
135 It will be automatically installed if you use the toolshed, as in general you probably should. | |
136 HTSeq_ must be installed with this tool if you install manually. | |
137 | |
138 Otherwise, all code and documentation comprising this tool including the requirement | |
139 for more than one sample bam | |
140 was written by Ross Lazarus and is | |
141 licensed to you under the LGPL_ like other rgenetics artefacts. | |
142 | |
143 Sorry, I don't use readgroups so had no reason to code read groups. Contributions welcome. Send code | |
144 | |
145 .. _LGPL: http://www.gnu.org/copyleft/lesser.html | |
146 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html | |
147 </help> | |
148 | |
149 </tool> |