htseq_bams_to_count_matrix: htseqsams2mx.xml annotate

annotate htseqsams2mx.xml @ 45:19207379d4cf draft

Uploaded

author	fubar
date	Thu, 21 Nov 2013 17:48:39 -0500
parents	390cb852aae7
children	0df2b662113e

rev	line source
43 390cb852aae7 Uploaded fubar parents: diff changeset	1 <tool id="htseqsams2mxlocal" name="SAM/BAM to count matrix" version="0.4">
390cb852aae7 Uploaded fubar parents: diff changeset	2 <description>using HTSeq code</description>
390cb852aae7 Uploaded fubar parents: diff changeset	3 <stdio>
390cb852aae7 Uploaded fubar parents: diff changeset	4 <regex match=".*" source="both" level="warning" description="chatter from HTSeq:"/>
390cb852aae7 Uploaded fubar parents: diff changeset	5 </stdio>
390cb852aae7 Uploaded fubar parents: diff changeset	6 <requirements>
390cb852aae7 Uploaded fubar parents: diff changeset	7 <requirement type="package" version="0.7.6">pysam</requirement>
390cb852aae7 Uploaded fubar parents: diff changeset	8 <requirement type="package" version="1.2">matplotlib</requirement>
390cb852aae7 Uploaded fubar parents: diff changeset	9 <requirement type="package" version="0.5.4p3">htseq</requirement>
390cb852aae7 Uploaded fubar parents: diff changeset	10 </requirements>
390cb852aae7 Uploaded fubar parents: diff changeset	11 <command interpreter="python">
390cb852aae7 Uploaded fubar parents: diff changeset	12 htseqsams2mx.py -g "$gfffile" -o "$outfile" -m "$model" --id_attribute "$id_attr" --feature_type "$feature_type"
390cb852aae7 Uploaded fubar parents: diff changeset	13 --mapqMin $mapqMin --samf "'${firstsamf}','${firstsamf.name}','${firstsamf.ext}','${firstsamf.metadata.bam_index}'"
390cb852aae7 Uploaded fubar parents: diff changeset	14 #if $secondsamf.ext != 'data':
390cb852aae7 Uploaded fubar parents: diff changeset	15 --samf "'${secondsamf}','${secondsamf.name}','${secondsamf.ext}','${secondsamf.metadata.bam_index}'"
390cb852aae7 Uploaded fubar parents: diff changeset	16 #end if
390cb852aae7 Uploaded fubar parents: diff changeset	17 #for $s in $samfiles:
390cb852aae7 Uploaded fubar parents: diff changeset	18 #if $s.samf.ext != 'data':
390cb852aae7 Uploaded fubar parents: diff changeset	19 --samf "'${s.samf}','${s.samf.name}','${s.samf.ext}','${s.samf.metadata.bam_index}'"
390cb852aae7 Uploaded fubar parents: diff changeset	20 #end if
390cb852aae7 Uploaded fubar parents: diff changeset	21 #end for
390cb852aae7 Uploaded fubar parents: diff changeset	22 #if $filter_extras:
390cb852aae7 Uploaded fubar parents: diff changeset	23 --filter_extras "$filter_extras"
390cb852aae7 Uploaded fubar parents: diff changeset	24 #end if
390cb852aae7 Uploaded fubar parents: diff changeset	25 </command>
390cb852aae7 Uploaded fubar parents: diff changeset	26 <inputs>
390cb852aae7 Uploaded fubar parents: diff changeset	27 <param format="gtf" name="gfffile" type="data" label="Gene model (GFF) file to count reads over from your current history" size="100" />
390cb852aae7 Uploaded fubar parents: diff changeset	28 <param name="mapqMin" label="Filter reads with mapq below this value"
390cb852aae7 Uploaded fubar parents: diff changeset	29 help="0 to count any mapping quality read. Otherwise only reads at or above specified mapq will be counted"
390cb852aae7 Uploaded fubar parents: diff changeset	30 type="integer" value="5"/>
390cb852aae7 Uploaded fubar parents: diff changeset	31 <param name="title" label="Name for this job's output file" type="text" size="80" value="bams to DGE count matrix"/>
390cb852aae7 Uploaded fubar parents: diff changeset	32 <param name="stranded" value="false" type="boolean" label="Reads are stranded - use strand in counting" display="checkbox"
390cb852aae7 Uploaded fubar parents: diff changeset	33 truevalue="yes" falsevalue="no" checked="no" help="Check this ONLY if you know your sequences are strand specific" />
390cb852aae7 Uploaded fubar parents: diff changeset	34 <param name="model" type="select" label="Model for counting reads over the supplied gene model- see HTSeq docs"
390cb852aae7 Uploaded fubar parents: diff changeset	35 help="If in doubt, union is a reasonable default but intersection-strict avoids double counting over overlapping exons">
390cb852aae7 Uploaded fubar parents: diff changeset	36 <option value="union" selected="true">union</option>
390cb852aae7 Uploaded fubar parents: diff changeset	37 <option value="intersection-strict">intersection-strict</option>
390cb852aae7 Uploaded fubar parents: diff changeset	38 <option value="intersection-nonempty">intersection-nonempty</option>
390cb852aae7 Uploaded fubar parents: diff changeset	39 </param>
390cb852aae7 Uploaded fubar parents: diff changeset	40 <param name="id_attr" type="select" label="GTF attribute to output as the name for each contig - see HTSeq docs"
390cb852aae7 Uploaded fubar parents: diff changeset	41 help="If in doubt, use gene name or if you need the id in your GTF, gene id">
390cb852aae7 Uploaded fubar parents: diff changeset	42 <option value="gene_name" selected="true">gene name</option>
390cb852aae7 Uploaded fubar parents: diff changeset	43 <option value="gene_id">gene id</option>
390cb852aae7 Uploaded fubar parents: diff changeset	44 <option value="transcript_id">transcript id</option>
390cb852aae7 Uploaded fubar parents: diff changeset	45 <option value="transcript_name">transcript name</option>
390cb852aae7 Uploaded fubar parents: diff changeset	46 </param>
390cb852aae7 Uploaded fubar parents: diff changeset	47 <param name="feature_type" type="select" label="GTF feature type for counting reads over the supplied gene model- see HTSeq docs"
390cb852aae7 Uploaded fubar parents: diff changeset	48 help="GTF feature type to count over - exon is a good choice with gene name as the contig to count over">
390cb852aae7 Uploaded fubar parents: diff changeset	49 <option value="exon" selected="true">exon</option>
390cb852aae7 Uploaded fubar parents: diff changeset	50 <option value="CDS">CDS</option>
390cb852aae7 Uploaded fubar parents: diff changeset	51 <option value="UTR">UTR</option>
390cb852aae7 Uploaded fubar parents: diff changeset	52 <option value="transcript">transcript</option>
390cb852aae7 Uploaded fubar parents: diff changeset	53 </param>
390cb852aae7 Uploaded fubar parents: diff changeset	54 <param name="filter_extras" type="select" label="Filter any read with one or more flags"
390cb852aae7 Uploaded fubar parents: diff changeset	55 help="eg the XS tag created by bowtie for multiple reads" optional="true" multiple="true">
390cb852aae7 Uploaded fubar parents: diff changeset	56 <option value="">None</option>
390cb852aae7 Uploaded fubar parents: diff changeset	57 <option value="XS">XS:i > 0 - More than one mapping position Bowtie</option>
390cb852aae7 Uploaded fubar parents: diff changeset	58 <option value="XS:A">Might be useful for tophat</option>
390cb852aae7 Uploaded fubar parents: diff changeset	59 </param>
390cb852aae7 Uploaded fubar parents: diff changeset	60
390cb852aae7 Uploaded fubar parents: diff changeset	61 <param name="firstsamf" type="data" label="bam/sam file from your history" format="sam,bam" size="100"
390cb852aae7 Uploaded fubar parents: diff changeset	62 help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs"
390cb852aae7 Uploaded fubar parents: diff changeset	63 optional="false"/>
390cb852aae7 Uploaded fubar parents: diff changeset	64 <param name="secondsamf" type="data" label="Additional bam/sam file from your history" format="sam,bam" size="100"
390cb852aae7 Uploaded fubar parents: diff changeset	65 help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs"
390cb852aae7 Uploaded fubar parents: diff changeset	66 optional="false"/>
390cb852aae7 Uploaded fubar parents: diff changeset	67 <repeat name="samfiles" min="16"
390cb852aae7 Uploaded fubar parents: diff changeset	68 title="Specify additional bam/sam file inputs" help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs">
390cb852aae7 Uploaded fubar parents: diff changeset	69 <param name="samf" type="data" label="Additional bam/sam file from your history" format="sam,bam" size="100"
390cb852aae7 Uploaded fubar parents: diff changeset	70 optional="true"/>
390cb852aae7 Uploaded fubar parents: diff changeset	71 </repeat>
390cb852aae7 Uploaded fubar parents: diff changeset	72 </inputs>
390cb852aae7 Uploaded fubar parents: diff changeset	73 <outputs>
390cb852aae7 Uploaded fubar parents: diff changeset	74 <data format="tabular" name="outfile" label="${title}_htseqsams2mx.xls" />
390cb852aae7 Uploaded fubar parents: diff changeset	75 </outputs>
390cb852aae7 Uploaded fubar parents: diff changeset	76 <tests>
390cb852aae7 Uploaded fubar parents: diff changeset	77 <test>
390cb852aae7 Uploaded fubar parents: diff changeset	78 <param name="feature_type" value="exon" />
390cb852aae7 Uploaded fubar parents: diff changeset	79 <param name="gfffile" value="rn4_chr20_100k.gtf" />
390cb852aae7 Uploaded fubar parents: diff changeset	80 <param name="firstsamf" value="rn4chr20test1.bam" ftype="bam"/>
390cb852aae7 Uploaded fubar parents: diff changeset	81 <param name="secondsamf" value="rn4chr20test2.bam" ftype="bam"/>
390cb852aae7 Uploaded fubar parents: diff changeset	82 <param name="id_attr" value="gene_name" />
390cb852aae7 Uploaded fubar parents: diff changeset	83 <param name="model" value="union" />
390cb852aae7 Uploaded fubar parents: diff changeset	84 <param name="stranded" value="no" />
390cb852aae7 Uploaded fubar parents: diff changeset	85 <param name="title" value="htseqtest" />
390cb852aae7 Uploaded fubar parents: diff changeset	86 <param name="mapqMin" value="0" />
390cb852aae7 Uploaded fubar parents: diff changeset	87
390cb852aae7 Uploaded fubar parents: diff changeset	88 <output name="outfile" file="htseqsams2mx_test1_out.xls" lines_diff="1"/>
390cb852aae7 Uploaded fubar parents: diff changeset	89 </test>
390cb852aae7 Uploaded fubar parents: diff changeset	90 </tests>
390cb852aae7 Uploaded fubar parents: diff changeset	91 <help>
390cb852aae7 Uploaded fubar parents: diff changeset	92
390cb852aae7 Uploaded fubar parents: diff changeset	93 What this tool does
390cb852aae7 Uploaded fubar parents: diff changeset	94
390cb852aae7 Uploaded fubar parents: diff changeset	95 Counts reads in multiple sam/bam format mapped files and generates a matrix ideal for edgeR and other count based tools
390cb852aae7 Uploaded fubar parents: diff changeset	96 It uses HTSeq to count your sam reads over a gene model supplied as a GTF file
390cb852aae7 Uploaded fubar parents: diff changeset	97 The output is a tabular text (columnar - spreadsheet) file containing the
390cb852aae7 Uploaded fubar parents: diff changeset	98 count matrix for downstream processing. Each row contains the counts from each sample for each
390cb852aae7 Uploaded fubar parents: diff changeset	99 of the non-empty GTF input file contigs matching the GTF attribute choice above.
390cb852aae7 Uploaded fubar parents: diff changeset	100 You probably want to use gene level GTF output attribute and count reads that overlap
390cb852aae7 Uploaded fubar parents: diff changeset	101 GTF exons for RNA-seq. Or you can count over exons by using transcript level output names or ids. Etc.
390cb852aae7 Uploaded fubar parents: diff changeset	102
390cb852aae7 Uploaded fubar parents: diff changeset	103 ----
390cb852aae7 Uploaded fubar parents: diff changeset	104
390cb852aae7 Uploaded fubar parents: diff changeset	105 Author's plea on replicates
390cb852aae7 Uploaded fubar parents: diff changeset	106
390cb852aae7 Uploaded fubar parents: diff changeset	107 If you want to interpret the downstream p values in terms of rejecting or accepting the null hypothesis
390cb852aae7 Uploaded fubar parents: diff changeset	108 under random sampling with replacement from the universe of possible biological/experimental replicates from which your data was derived,
390cb852aae7 Uploaded fubar parents: diff changeset	109 which is what published p values are often assumed to do, then you need biological
390cb852aae7 Uploaded fubar parents: diff changeset	110 (or for cell culture material experimental) replicates.
390cb852aae7 Uploaded fubar parents: diff changeset	111
390cb852aae7 Uploaded fubar parents: diff changeset	112 Using technical or no replicates means the downstream p values are not interpretable the way most people would assume
390cb852aae7 Uploaded fubar parents: diff changeset	113 they are - ie as the probability of obtaining a result as extreme as, or more extreme than, your experimental data
390cb852aae7 Uploaded fubar parents: diff changeset	114 in millions of experiments conducted using the same methods under the null hypothesis.
390cb852aae7 Uploaded fubar parents: diff changeset	115
390cb852aae7 Uploaded fubar parents: diff changeset	116 There is no way around this and it is scientific fraud to ignore this issue and publish bogus p values derived from
390cb852aae7 Uploaded fubar parents: diff changeset	117 technical or no replicates without making the lack of biological or experimental error in the p value calculations
390cb852aae7 Uploaded fubar parents: diff changeset	118 clear to your readers so they can adjust their expectations. However, the buck stops here at higher level inference.
390cb852aae7 Uploaded fubar parents: diff changeset	119 If you have no replicates, you must not use this tool as the p values are uninterpretable. So there.
390cb852aae7 Uploaded fubar parents: diff changeset	120
390cb852aae7 Uploaded fubar parents: diff changeset	121 See your stats 101 notes on the central limit theorem and test statistics for a refresher, or please talk to a
390cb852aae7 Uploaded fubar parents: diff changeset	122 statistician if this makes no sense.
390cb852aae7 Uploaded fubar parents: diff changeset	123
390cb852aae7 Uploaded fubar parents: diff changeset	124 Attribution
390cb852aae7 Uploaded fubar parents: diff changeset	125
390cb852aae7 Uploaded fubar parents: diff changeset	126 This Galaxy tool relies on HTSeq_ from http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
390cb852aae7 Uploaded fubar parents: diff changeset	127 for the tricky work of counting. That code includes the following attribution:
390cb852aae7 Uploaded fubar parents: diff changeset	128
390cb852aae7 Uploaded fubar parents: diff changeset	129 ## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology
390cb852aae7 Uploaded fubar parents: diff changeset	130 ## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General
390cb852aae7 Uploaded fubar parents: diff changeset	131 ## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3
390cb852aae7 Uploaded fubar parents: diff changeset	132
390cb852aae7 Uploaded fubar parents: diff changeset	133 It will be installed automatically if you use the toolshed, as you generally should.
390cb852aae7 Uploaded fubar parents: diff changeset	134 HTSeq_ must be installed with this tool if you install manually.
390cb852aae7 Uploaded fubar parents: diff changeset	135
390cb852aae7 Uploaded fubar parents: diff changeset	136 Otherwise, all code and documentation comprising this tool including the requirement
390cb852aae7 Uploaded fubar parents: diff changeset	137 for more than one sample bam
390cb852aae7 Uploaded fubar parents: diff changeset	138 was written by Ross Lazarus and is
390cb852aae7 Uploaded fubar parents: diff changeset	139 licensed to you under the LGPL_ like other rgenetics artefacts
390cb852aae7 Uploaded fubar parents: diff changeset	140
390cb852aae7 Uploaded fubar parents: diff changeset	141 Sorry, I don't use readgroups so had no reason to code read groups. Contributions welcome. Send code
390cb852aae7 Uploaded fubar parents: diff changeset	142
390cb852aae7 Uploaded fubar parents: diff changeset	143 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
390cb852aae7 Uploaded fubar parents: diff changeset	144 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
390cb852aae7 Uploaded fubar parents: diff changeset	145 </help>
390cb852aae7 Uploaded fubar parents: diff changeset	146
390cb852aae7 Uploaded fubar parents: diff changeset	147 </tool>

Mercurial > repos > fubar > htseq_bams_to_count_matrix

annotate htseqsams2mx.xml @ 45:19207379d4cf draft