Mercurial > repos > fubar > htseq_bams_to_count_matrix
diff htseqsams2mx.xml @ 56:9b59cd40f20d draft
Uploaded
author | iuc |
---|---|
date | Tue, 28 Apr 2015 22:56:39 -0400 |
parents | |
children | 57841366f112 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/htseqsams2mx.xml Tue Apr 28 22:56:39 2015 -0400 @@ -0,0 +1,133 @@ +<tool id="htseqsams2mxlocal" name="SAM/BAM to count matrix" version="0.5"> + <description>using HTSeq code</description> + <stdio> + <regex match=".*" source="both" level="warning" description="chatter from HTSeq:"/> + </stdio> + <requirements> + <requirement type="package" version="0.7.6">pysam</requirement> + <requirement type="package" version="1.2.1">matplotlib</requirement> + <requirement type="package" version="0.5.4p3">htseq</requirement> + </requirements> + <command interpreter="python"> + htseqsams2mx.py -g "$gfffile" -o "$outfile" -m "$model" --id_attribute "$id_attr" --feature_type "$feature_type" + --mapqMin $mapqMin + #for $s in $samfiles: + #if $s.ext != 'data': + --samf "'${s}','${s.name}','${s.ext}','${s.metadata.bam_index}'" + #end if + #end for + #if $filter_extras: + --filter_extras "$filter_extras" + #end if + </command> + <inputs> + <param format="gtf" name="gfffile" type="data" label="Gene model (GFF) file to count reads over from your current history" size="100" /> + <param name="mapqMin" label="Filter reads with mapq below than this value" + help="0 to count any mapping quality read. Otherwise only reads at or above specified mapq will be counted" + type="integer" value="5"/> + <param name="title" label="Name for this job's output file" type="text" size="80" value="bams to DGE count matrix"/> + <param name="stranded" value="false" type="boolean" label="Reads are stranded - use strand in counting" display="checkbox" + truevalue="yes" falsevalue="no" checked="no" help="Check this ONLY if you know your sequences are strand specific" /> + <param name="model" type="select" label="Model for counting reads over the supplied gene model- see HTSeq docs" + help="If in doubt, union is a reasonable default but intersection-strict avoids double counting over overlapping exons"> + <option value="union" selected="true">union</option> + <option value="intersection-strict">intersection-strict</option> + <option value="intersection-nonempty">intersection-nonempty</option> + </param> + <param name="id_attr" type="select" label="GTF attribute to output as the name for each contig - see HTSeq docs" + help="If in doubt, use gene name or if you need the id in your GTF, gene id"> + <option value="gene_name" selected="true">gene name</option> + <option value="gene_id">gene id</option> + <option value="transcript_id">transcript id</option> + <option value="transcript_name">transcript name</option> + </param> + <param name="feature_type" type="select" label="GTF feature type for counting reads over the supplied gene model- see HTSeq docs" + help="GTF feature type to count over - exon is a good choice with gene name as the contig to count over"> + <option value="exon" selected="true">exon</option> + <option value="CDS">CDS</option> + <option value="UTR">UTR</option> + <option value="transcript">transcript</option> + </param> + <param name="filter_extras" type="select" label="Filter any read with one or more flags" + help="eg the XS tag created by bowtie for multiple reads" optional="true" mutliple="true"> + <option value="">None</option> + <option value="XS">XS:i > 0 - More than one mapping position Bowtie</option> + <option value="XS:A">Might be useful for tophat</option> + </param> + + <param name="samfiles" type="data" label="bam/sam file from your history" format="sam,bam" size="100" multiple="true"/> + </inputs> + <outputs> + <data format="tabular" name="outfile" label="${title}_htseqsams2mx.xls" /> + </outputs> + <tests> + <test> + <param name="feature_type" value="exon" /> + <param name="gfffile" value="rn4_chr20_100k.gtf" /> + <param name="samfiles" value="rn4chr20test1.bam,rn4chr20test2.bam" ftype="bam"/> + <param name="id_attr" value="gene_name" /> + <param name="model" value="union" /> + <param name="stranded" value="no" /> + <param name="title" value="htseqtest" /> + <param name="mapqMin" value="0" /> + + <output name="outfile" file="htseqsams2mx_test1_out.xls" lines_diff="1"/> + </test> + </tests> + <help> + +**What this tool does** + +Counts reads in multiple sam/bam format mapped files and generates a matrix ideal for edgeR and other count based tools +It uses HTSeq to count your sam reads over a gene model supplied as a GTF file +The output is a tabular text (columnar - spreadsheet) file containing the +count matrix for downstream processing. Each row contains the counts from each sample for each +of the non-emtpy GTF input file contigs matching the GTF attribute choice above. +You probably want to use gene level GTF output attribute and count reads that overlap +GTF exons for RNA-seq. Or you can count over exons by using transcript level output names or ids. Etc. + +---- + +**Author's plea on replicates** + +If you want to interpret the downstream p values in terms of rejecting or accepting the null hypothesis +under random sampling with replacement from the universe of possible biological/experimental replicates from which your data was derived, +which is what published p values are often assumed to do, then you need biological +(or for cell culture material experimental) replicates. + +Using technical or no replicates means the downstream p values are not interpretable the way most people would assume +they are - ie as the probability of obtaining a result as or more extreme as your experimental data +in millions of experiments conducted using the same methods under the null hypothesis. + +There is no way around this and it is scientific fraud to ignore this issue and publish bogus p values derived from +technical or no replicates without making the lack of biological or experimental error in the p value calculations +clear to your readers so they can adjust their expectations. However, the buck stops here at higher level inference. +If you have no replicates, you must not use this tool as the p values are uninterpretable. So there. + +See your stats 101 notes on the central limit theorem and test statistics for a refresher or talk to a +statistician if this makes no sense please. + +**Attribution** + +This Galaxy tool relies on HTSeq_ from http://www-huber.embl.de/users/anders/HTSeq/doc/index.html +for the tricky work of counting. That code includes the following attribution: + +## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology +## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General +## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3 + +It will be automatically installed if you use the toolshed as in general, you probably should. +HTSeq_ must be installed with this tool if you install manually. + +Otherwise, all code and documentation comprising this tool including the requirement +for more than one sample bam +was written by Ross Lazarus and is +licensed to you under the LGPL_ like other rgenetics artefacts + +Sorry, I don't use readgroups so had no reason to code read groups. Contributions welcome. Send code + +.. _LGPL: http://www.gnu.org/copyleft/lesser.html +.. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html + </help> + +</tool>