Mercurial > repos > fubar > htseq_bams_to_count_matrix
changeset 38:7a4061baef5c draft
moved to iuc repos and pysam bump so version bump
author | fubar |
---|---|
date | Tue, 19 Nov 2013 20:25:26 -0500 |
parents | f69b55c71ae5 |
children | cc4efb5bd07e |
files | htseq_bams_to_count_matrix/htseqsams2mx.py htseq_bams_to_count_matrix/htseqsams2mx.xml htseq_bams_to_count_matrix/tool_dependencies.xml |
diffstat | 3 files changed, 80 insertions(+), 124 deletions(-) [+] |
line wrap: on
line diff
--- a/htseq_bams_to_count_matrix/htseqsams2mx.py Thu Aug 08 20:03:48 2013 -0400 +++ b/htseq_bams_to_count_matrix/htseqsams2mx.py Tue Nov 19 20:25:26 2013 -0500 @@ -60,7 +60,7 @@ self.msg = msg -def htseqMX(gff_filename,sam_filenames,colnames,sam_exts,sam_bais,opts): +def htseqMX(gff_filename,sam_filenames,colnames,opts): """ Code taken from count.py in Simon Anders HTSeq distribution Wrapped in a loop to accept multiple bam/sam files and their names from galaxy to @@ -90,13 +90,11 @@ features = HTSeq.GenomicArrayOfSets( "auto", opts.stranded != "no" ) mapqMin = int(opts.mapqMin) counts = {} - nreads = 0 empty = 0 ambiguous = 0 notaligned = 0 lowqual = 0 nonunique = 0 - filtered = 0 # new filter_extras - need a better way to do this - independent filter tool? gff = HTSeq.GFF_Reader( gff_filename ) try: for i,f in enumerate(gff): @@ -123,29 +121,19 @@ sys.stdout.write( "Warning: No features of type '%s' found.\n" % opts.feature_type ) for sami,sam_filename in enumerate(sam_filenames): colname = colnames[sami] - isbam = sam_exts[sami] == 'bam' - hasbai = sam_bais[sami] > '' - if hasbai: - tempname = os.path.splitext(os.path.basename(sam_filename))[0] - tempbam = '%s.bam' % tempname - tempbai = '%s.bai' % tempname - os.link(sam_filename,tempbam) - os.link(sam_bais[sami],tempbai) + isbam = colname.endswith('.bam') try: if isbam: - if hasbai: - read_seq = HTSeq.BAM_Reader ( tempbam ) - else: - read_seq = HTSeq.BAM_Reader( sam_filename ) + read_seq = HTSeq.BAM_Reader( sam_filename ) else: read_seq = HTSeq.SAM_Reader( sam_filename ) first_read = iter(read_seq).next() pe_mode = first_read.paired_end except: if isbam: - print >> sys.stderr, "Error occured when reading first line of bam file %s colname=%s \n" % (sam_filename,colname ) + sys.stderr.write( "Error occured when reading first line of bam file %s colname=%s \n" % (sam_filename,colname) ) else: - print >> sys.stderr, "Error occured when reading first line of sam file %s colname=%s \n" % (sam_filename,colname ) + sys.stderr.write( "Error occured when reading first line of sam file %s colname=%s \n" % (sam_filename,colname )) raise try: @@ -153,17 +141,11 @@ read_seq_pe_file = read_seq read_seq = HTSeq.pair_SAM_alignments( read_seq ) for seqi,r in enumerate(read_seq): - nreads += 1 if not pe_mode: if not r.aligned: notaligned += 1 continue try: - if len(opts.filter_extras) > 0: - for extra in opts.filter_extras: - if r.optional_field(extra): - filtered += 1 - continue if r.optional_field( "NH" ) > 1: nonunique += 1 continue @@ -240,10 +222,10 @@ else: rr = r[0] if r[0] is not None else r[1] empty += 1 - if not opts.quiet: - sys.stdout.write( ( "Warning: Skipping read '%s', because chromosome " + - "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) % - ( rr.read.name, iv.chrom ) ) + #if not quiet: + # sys.stderr.write( ( "Warning: Skipping read '%s', because chromosome " + + # "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) % + # ( rr.read.name, iv.chrom ) ) except: if not pe_mode: sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() ) @@ -252,13 +234,13 @@ raise if not opts.quiet: - sys.stdout.write( "%d sam %s processed for %s.\n" % ( seqi, "lines " if not pe_mode else "line pairs", colname ) ) - return counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads + sys.stdout.write( "%d sam %s processed.\n" % ( seqi, "lines " if not pe_mode else "line pairs" ) ) + return counts,empty,ambiguous,lowqual,notaligned,nonunique warnings.showwarning = my_showwarning assert os.path.isfile(gff_filename),'## unable to open supplied gff file %s' % gff_filename try: - counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads = count_reads_in_features( sam_filenames, colnames, gff_filename,opts) + counts,empty,ambiguous,lowqual,notaligned,nonunique = count_reads_in_features( sam_filenames, colnames, gff_filename,opts) except: sys.stderr.write( "Error: %s\n" % str( sys.exc_info()[1] ) ) sys.stderr.write( "[Exception type: %s, raised in %s:%d]\n" % @@ -266,7 +248,7 @@ os.path.basename(traceback.extract_tb( sys.exc_info()[2] )[-1][0]), traceback.extract_tb( sys.exc_info()[2] )[-1][1] ) ) sys.exit( 1 ) - return counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads + return counts,empty,ambiguous,lowqual,notaligned,nonunique def usage(): @@ -296,7 +278,6 @@ op.add_option('-o', '--outfname', default=None) op.add_option('-f','--forceName', default="false") op.add_option('--samf', default=[], action="append") - op.add_option('--filter_extras', default=[], action="append") op.add_option('--mapqMin', default='0') op.add_option( "-t", "--type", type="string", dest="featuretype", default = "exon", help = "feature type (3rd column in GFF file) to be used, " + @@ -316,43 +297,24 @@ outfname = opts.outfname sam_filenames = [] colnames = [] - samf = opts.samf - samfsplit = [x.split(',') for x in samf] # one per samf set - samsets = [] - for samfs in samfsplit: - samset = [x.replace("'","") for x in samfs] - samset = [x.replace('"','') for x in samset] - samsets.append(samset) - samsets = [x for x in samsets if x[0].lower() != 'none'] - # just cannot stop getting these on cl! wtf in cheetah for a repeat group? - samfnames = [x[0] for x in samsets] - if len(set(samfnames)) != len(samfnames): - samnames = [] - delme = [] - for i,s in enumerate(samfnames): - if s in samnames: - delme.append(i) - print sys.stdout,'## WARNING htseqsams2mx: Duplicate input sam file %s in %s - ignoring dupe in 0 based position %s' %\ - (s,','.join(samfnames), str(delme)) - else: - samnames.append(s) # first time - samsets = [x for i,x in enumerate(samsets) if not (i in delme)] - samfnames = [x[0] for x in samsets] - scolnames = [x[1]for x in samsets] - assert len(samfnames) == len(scolnames), '##ERROR sams2mx: Count of sam/cname not consistent - %d/%d' % (len(samfnames),len(scolnames)) - sam_exts = [x[2] for x in samsets] - assert len(samfnames) == len(sam_exts), '##ERROR sams2mx: Count of extensions not consistent - %d/%d' % (len(samfnames),len(sam_exts)) - sam_bais = [x[3] for x in samsets] # these only exist for bams and need to be finessed with a symlink so pysam will just work - for i,b in enumerate(samfnames): - assert os.path.isfile(b),'## Supplied input sam file "%s" not found' % b - sam_filenames.append(b) - sampName = scolnames[i] # better be unique - sampName = sampName.replace('#','') # for R - sampName = sampName.replace('(','') # for R - sampName = sampName.replace(')','') # for R - sampName = sampName.replace(' ','_') # for R - colnames.append(sampName) - counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads = htseqMX(gff_file, sam_filenames,colnames,sam_exts,sam_bais,opts) + samdat = opts.samf + samf = [x.split(',')[0].replace("'",'').replace('"','') for x in samdat] # get rid of wrapper supplied quotes + assert len(set(samf)) == len(samf),'## ERROR sams2mx: Duplicate input sam file in %s' % ','.join(samf) + scolnames = [x.split(',')[1].replace("'",'').replace('"','') for x in samdat] + assert len(samf) == len(scolnames), '##ERROR sams2mx: Count of sam/cname not consistent - %d/%d' % (len(samf),len(scolname)) + for i,b in enumerate(samf): + if b != 'None': + assert os.path.isfile(b),'## Supplied input sam file "%s" not found' % b + sam_filenames.append(b) + sampName = scolnames[i] # better be unique + if sampName == "": + sampName = b # for test + sampName = sampName.replace('#','') # for R + sampName = sampName.replace('(','') # for R + sampName = sampName.replace(')','') # for R + sampName = sampName.replace(' ','_') # for R + colnames.append(sampName) + counts,empty,ambiguous,lowqual,notaligned,nonunique = htseqMX(gff_file, sam_filenames,colnames,opts) heads = '\t'.join(['Contig',] + colnames) res = [heads,] contigs = counts.keys() @@ -372,10 +334,9 @@ outf.write('\n') outf.close() walltime = int(time.time() - starttime) - accumulatornames = ('walltime (seconds)','total reads read','total reads counted','number of contigs','total empty reads','total ambiguous reads','total low quality reads', - 'total not aligned reads','total not unique mapping reads','extra filtered reads','empty contigs') - accums = (walltime,nreads,totalc,len(contigs),empty,ambiguous,lowqual,notaligned,nonunique,filtered,emptycontigs) - fracs = (1.0,1.0,float(totalc)/nreads,1.0,float(empty)/nreads,float(ambiguous)/nreads,float(lowqual)/nreads,float(notaligned)/nreads,float(nonunique)/nreads,float(filtered)/nreads,float(emptycontigs)/len(contigs)) - notes = ['%s = %d (%2.3f %%)' % (accumulatornames[i],x,100.0*fracs[i]) for i,x in enumerate(accums)] - print >> sys.stdout, '\n'.join(notes) + accumulatornames = ('walltimeseconds','totreadscounted','ncontigs','emptyreads','ambiguousreads','lowqualreads', + 'notalignedreads','nonuniquereads','emptycontigs') + accums = (walltime,totalc,len(contigs),empty,ambiguous,lowqual,notaligned,nonunique,emptycontigs) + notes = ['%s=%d' % (accumulatornames[i],x) for i,x in enumerate(accums)] + print >> sys.stdout, ','.join(notes) sys.exit(0)
--- a/htseq_bams_to_count_matrix/htseqsams2mx.xml Thu Aug 08 20:03:48 2013 -0400 +++ b/htseq_bams_to_count_matrix/htseqsams2mx.xml Tue Nov 19 20:25:26 2013 -0500 @@ -1,30 +1,32 @@ -<tool id="htseqsams2mx" name="SAM/BAM to count matrix" version="0.41"> +<tool id="htseqsams2mx" name="SAM/BAM to count matrix" version="0.4"> <description>using HTSeq code</description> <stdio> - <exit_code range="666" level="warning" - description="Exit code 666 encountered" /> + <regex match=".*" source="both" level="warning" description="chatter from HTSeq:"/> </stdio> <requirements> - <requirement type="package" version="1.7.1">numpy</requirement> - <requirement type="package" version="0.7.5">pysam</requirement> - <requirement type="package" version="2.4.11">freetype</requirement> - <requirement type="package" version="1.2.1">matplotliblite</requirement> + <requirement type="package" version="1.7">numpy</requirement> + <requirement type="package" version="0.7.6">pysam</requirement> + <requirement type="package" version="2.4">freetype</requirement> + <requirement type="package" version="1.2">matplotlib</requirement> <requirement type="package" version="0.5.4p3">htseq</requirement> </requirements> <command interpreter="python"> htseqsams2mx.py -g "$gfffile" -o "$outfile" -m "$model" --id_attribute "$id_attr" --feature_type "$feature_type" - --mapqMin $mapqMin --samf "'${firstsamf}','${firstsamf.name}','${firstsamf.ext}','${firstsamf.metadata.bam_index}'" - #if $secondsamf.ext != 'data': - --samf "'${secondsamf}','${secondsamf.name}','${secondsamf.ext}','${secondsamf.metadata.bam_index}'" + --samf "'$firstsamf','${firstsamf.name}'" + #if $secondsamf: + --samf "'$secondsamf','${secondsamf.name}'" + #end if + #if $thirdsamf: + --samf "'$thirdsamf','${thirdsamf.name}'" + #end if + #if $fourthsamf: + --samf "'$fourthsamf','${fourthsamf.name}'" #end if #for $s in $samfiles: - #if $s.samf.ext != 'data': - --samf "'${s.samf}','${s.samf.name}','${s.samf.ext}','${s.samf.metadata.bam_index}'" + #if $s.samf: + --samf "'${s.samf}','${s.samf.name}'" #end if #end for - #if $filter_extras: - --filter_extras "$filter_extras" - #end if </command> <inputs> <param format="gtf" name="gfffile" type="data" label="Gene model (GFF) file to count reads over from your current history" size="100" /> @@ -54,23 +56,16 @@ <option value="UTR">UTR</option> <option value="transcript">transcript</option> </param> - <param name="filter_extras" type="select" label="Filter any read with one or more flags" - help="eg the XS tag created by bowtie for multiple reads" optional="true" mutliple="true"> - <option value="">None</option> - <option value="XS">XS:i > 0 - More than one mapping position Bowtie</option> - <option value="XS:A">Might be useful for tophat</option> - </param> - - <param name="firstsamf" type="data" label="bam/sam file from your history" format="sam,bam" size="100" - help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs" - optional="false"/> - <param name="secondsamf" type="data" label="Additional bam/sam file from your history" format="sam,bam" size="100" - help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs" - optional="false"/> - <repeat name="samfiles" min="16" - title="Specify additional bam/sam file inputs" help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs"> - <param name="samf" type="data" label="Additional bam/sam file from your history" format="sam,bam" size="100" - optional="true"/> + <param name="firstsamf" type="data" label="First bam/sam file from your history to count reads overlapping gene model regions" + format="sam,bam" optional="false"/> + <param name="secondsamf" type="data" label="Another bam/sam file from your history to count reads overlapping gene model regions" + format="sam,bam" optional="false"/> + <param name="thirdsamf" type="data" label="Another bam/sam file from your history to count reads overlapping gene model regions" + format="sam,bam" optional="true"/> + <param name="fourthsamf" type="data" label="Another bam/sam file from your history to count reads overlapping gene model regions" + format="sam,bam" optional="true"/> + <repeat name="samfiles" title="Use this to add all needed additional bam/sam files from your history to count reads overlapping gene model regions"> + <param name="samf" type="data" label="Additional bam/sam file from your history" format="sam,bam" size="100" optional="true"/> </repeat> </inputs> <outputs> @@ -141,7 +136,7 @@ was written by Ross Lazarus and is licensed to you under the LGPL_ like other rgenetics artefacts -Sorry, I don't use readgroups so no reason to code read groups. Contributions welcome. Send code +Sorry, don't use so can't be buggered with read groups - contributions welcome - send code .. _LGPL: http://www.gnu.org/copyleft/lesser.html .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
--- a/htseq_bams_to_count_matrix/tool_dependencies.xml Thu Aug 08 20:03:48 2013 -0400 +++ b/htseq_bams_to_count_matrix/tool_dependencies.xml Tue Nov 19 20:25:26 2013 -0500 @@ -1,16 +1,16 @@ <?xml version="1.0"?> <tool_dependency> - <package name="numpy" version="1.7.1"> - <repository changeset_revision="af9633757cf0" name="package_numpy_1_7" owner="blankenberg" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + <package name="numpy" version="1.7"> + <repository changeset_revision="84125ffacb90" name="package_numpy_1_7" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu" /> </package> - <package name="pysam" version="0.7.5"> - <repository changeset_revision="a4e35f23093f" name="package_pysam_0_7_5" owner="fubar" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + <package name="pysam" version="0.7.6"> + <repository changeset_revision="247e5e5bee87" name="package_pysam_0_7_6" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu" /> </package> - <package name="freetype" version="2.4.11"> - <repository changeset_revision="4e54e357ac25" name="package_freetype_2_4" owner="bgruening" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" /> + <package name="freetype" version="2.4"> + <repository changeset_revision="fe5cfaf931ff" name="package_freetype_2_4" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" /> </package> - <package name="matplotliblite" version="1.2.1"> - <repository changeset_revision="8df6bbf48c3a" name="package_matplotlib_lite" owner="fubar" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" /> + <package name="matplotlib" version="1.2"> + <repository changeset_revision="966f29c955b9" name="package_matplotlib_1_2" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" /> </package> <package name="htseq" version="0.5.4p3"> <install version="1.0"> @@ -19,17 +19,17 @@ <action type="make_directory">$INSTALL_DIR/lib/python</action> <!-- Not sure why these must be made apriori, but install fails otherwise --> <action type="make_directory">$INSTALL_DIR/lib64/python</action> <!-- Not sure why these must be made apriori, but install fails otherwise --> <action type="set_environment_for_install"> - <repository changeset_revision="af9633757cf0" name="package_numpy_1_7" owner="blankenberg" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu"> - <package name="numpy" version="1.7.1" /> + <repository changeset_revision="84125ffacb90" name="package_numpy_1_7" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu"> + <package name="numpy" version="1.7" /> </repository> - <repository changeset_revision="a4e35f23093f" name="package_pysam_0_7_5" owner="fubar" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu"> - <package name="pysam" version="0.7.5" /> + <repository changeset_revision="247e5e5bee87" name="package_pysam_0_7_6" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu"> + <package name="pysam" version="0.7.6" /> </repository> - <repository changeset_revision="4e54e357ac25" name="package_freetype_2_4" owner="bgruening" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/"> - <package name="freetype" version="2.4.11" /> + <repository changeset_revision="fe5cfaf931ff" name="package_freetype_2_4" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/"> + <package name="freetype" version="2.4" /> </repository> - <repository changeset_revision="8df6bbf48c3a" name="package_matplotlib_lite" owner="fubar" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/"> - <package name="matplotliblite" version="1.2.1" /> + <repository changeset_revision="966f29c955b9" name="package_matplotlib_1_2" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/"> + <package name="matplotlib_1_2" version="1.2" /> </repository> </action> <action type="shell_command">export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python:$INSTALL_DIR/lib64/python && python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin</action>