# HG changeset patch # User fubar # Date 1375962106 14400 # Node ID d917cde0f94b913534b169550a569bbc2eb2d871 # Parent 3906fe90d881c8ad263d5cbe0ad16afbf801d13a Uploaded diff -r 3906fe90d881 -r d917cde0f94b htseq_bams_to_count_matrix/generatetest.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/htseq_bams_to_count_matrix/generatetest.sh Thu Aug 08 07:41:46 2013 -0400 @@ -0,0 +1,1 @@ +python ../htseqsams2mx.py -g rn4_chr20_100k.gtf -o test.xls --samf "'rn4chr20test1.bam','col1'" --samf "'rn4chr20test2.bam','col2'" diff -r 3906fe90d881 -r d917cde0f94b htseq_bams_to_count_matrix/htseqsams2mx.xml --- a/htseq_bams_to_count_matrix/htseqsams2mx.xml Wed Aug 07 21:54:01 2013 -0400 +++ b/htseq_bams_to_count_matrix/htseqsams2mx.xml Thu Aug 08 07:41:46 2013 -0400 @@ -1,4 +1,4 @@ - + using HTSeq code numpy - pysam freetype matplotliblite htseq diff -r 3906fe90d881 -r d917cde0f94b htseq_bams_to_count_matrix/htseqsams2mx.xml~ --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/htseq_bams_to_count_matrix/htseqsams2mx.xml~ Thu Aug 08 07:41:46 2013 -0400 @@ -0,0 +1,145 @@ + + using HTSeq code + + + + + numpy + freetype + matplotliblite + htseq + + + htseqsams2mx.py -g "$gfffile" -o "$outfile" -m "$model" --id_attribute "$id_attr" --feature_type "$feature_type" + --samf "'$firstsamf','${firstsamf.name}'" + #if $secondsamf: + --samf "'$secondsamf','${secondsamf.name}'" + #end if + #if $thirdsamf: + --samf "'$thirdsamf','${thirdsamf.name}'" + #end if + #if $fourthsamf: + --samf "'$fourthsamf','${fourthsamf.name}'" + #end if + #for $s in $samfiles: + #if $s.samf: + --samf "'${s.samf}','${s.samf.name}'" + #end if + #end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What this tool does** + +Counts reads in multiple sam/bam format mapped files and generates a matrix ideal for edgeR and other count based tools +It uses HTSeq to count your sam reads over a gene model supplied as a GTF file +The output is a tabular text 
(columnar - spreadsheet) file containing the +count matrix for downstream processing. Each row contains the counts from each sample for each +of the non-empty GTF input file contigs matching the GTF attribute choice above. +You probably want to use gene level GTF output attribute and count reads that overlap +GTF exons for RNA-seq. Or you can count over exons by using transcript level output names or ids. Etc. + +---- + +**Author's plea on replicates** + +If you want to interpret the downstream p values in terms of rejecting or accepting the null hypothesis +under random sampling with replacement from the universe of possible biological/experimental replicates from which your data was derived, +which is what published p values are often assumed to do, then you need biological +(or, for cell culture material, experimental) replicates. + +Using technical or no replicates means the downstream p values are not interpretable the way most people would assume +they are - i.e. as the probability of obtaining a result as extreme as, or more extreme than, your experimental data +in millions of experiments conducted using the same methods under the null hypothesis. + +There is no way around this and it is scientific fraud to ignore this issue and publish bogus p values derived from +technical or no replicates without making the lack of biological or experimental error in the p value calculations +clear to your readers so they can adjust their expectations. However, the buck stops here at higher level inference. +If you have no replicates, you must not use this tool as the p values are uninterpretable. So there. + +See your stats 101 notes on the central limit theorem and test statistics for a refresher or talk to a +statistician if this makes no sense please. + +**Attribution** + +This Galaxy tool relies on HTSeq_ from http://www-huber.embl.de/users/anders/HTSeq/doc/index.html +for the tricky work of counting. 
That code includes the following attribution: + +## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology +## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General +## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3 + +It will be automatically installed if you use the toolshed, as in general you probably should. +HTSeq_ must be installed with this tool if you install manually. + +Otherwise, all code and documentation comprising this tool including the requirement +for more than one sample bam +was written by Ross Lazarus and is +licensed to you under the LGPL_ like other rgenetics artefacts + +Sorry, I don't use read groups so they are not handled - contributions welcome - send code + +.. _LGPL: http://www.gnu.org/copyleft/lesser.html +.. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html + + + diff -r 3906fe90d881 -r d917cde0f94b htseq_bams_to_count_matrix/tool_dependencies.xml --- a/htseq_bams_to_count_matrix/tool_dependencies.xml Wed Aug 07 21:54:01 2013 -0400 +++ b/htseq_bams_to_count_matrix/tool_dependencies.xml Thu Aug 08 07:41:46 2013 -0400 @@ -3,9 +3,6 @@ - - - @@ -22,9 +19,6 @@ - - - @@ -40,7 +34,7 @@ - Installation of HTSeq requires Python 2.5+ (does not yet work with Python 3), pysam and the Numpy Python package. + Installation of HTSeq requires Python 2.5+ (does not yet work with Python 3), and the Numpy Python package. Note this uses the matplotlib lite version dependent on the lite version of numpy - no atlas compilation