# HG changeset patch # User devteam # Date 1392346925 18000 # Node ID c25cd3e674a5cbe2a8f894711efce2441e8941d9 # Parent 468673afa3483c175ae7b074d1a2f26240176456 Uploaded diff -r 468673afa348 -r c25cd3e674a5 picard_wrapper.py --- a/picard_wrapper.py Thu Feb 13 21:39:50 2014 -0500 +++ b/picard_wrapper.py Thu Feb 13 22:02:05 2014 -0500 @@ -15,7 +15,7 @@ - + @@ -33,20 +33,20 @@ def timenow(): """return current time as a string - """ +""" return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) class PicardBase(): """ - simple base class with some utilities for Picard - adapted and merged with Kelly Vincent's code april 2011 Ross - lots of changes... - """ +simple base class with some utilities for Picard +adapted and merged with Kelly Vincent's code april 2011 Ross +lots of changes... +""" def __init__(self, opts=None,arg0=None): """ common stuff needed at init for a picard tool - """ +""" assert opts <> None, 'PicardBase needs opts at init' self.opts = opts if self.opts.outdir == None: @@ -69,7 +69,7 @@ except: pass self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname) - self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname) + self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname) self.setLogging(logfname=self.log_filename) def baseName(self,name=None): @@ -77,7 +77,7 @@ def setLogging(self,logfname="picard_wrapper.log"): """setup a logger - """ +""" logging.basicConfig(level=logging.INFO, filename=logfname, filemode='a') @@ -85,7 +85,7 @@ def readLarge(self,fname=None): """ read a potentially huge file. - """ +""" try: # get stderr, allowing for case where it's very large tmp = open( fname, 'rb' ) @@ -102,14 +102,14 @@ pass tmp.close() except Exception, e: - stop_err( 'Read Large Exception : %s' % str( e ) ) + stop_err( 'Read Large Exception : %s' % str( e ) ) return s def runCL(self,cl=None,output_dir=None): """ construct and run a command line - we have galaxy's temp path as opt.temp_dir so don't really need isolation - sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts - """ +we have galaxy's temp path as opt.temp_dir so don't really need isolation +sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts +""" assert cl <> None, 'PicardBase runCL needs a command line as cl' if output_dir == None: output_dir = self.opts.outdir @@ -124,7 +124,7 @@ tlf.close() tef.close() stderrs = self.readLarge(temperr) - stdouts = self.readLarge(templog) + stdouts = self.readLarge(templog) if rval > 0: s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs) stdouts = '%s\n%s' % (stdouts,stderrs) @@ -133,23 +133,26 @@ logging.info(s) os.unlink(templog) # always os.unlink(temperr) # always - return s, stdouts, rval # sometimes s is an output + return s, stdouts, rval # sometimes s is an output def runPic(self, jar, cl): """ - cl should be everything after the jar file name in the command - """ +cl should be everything after the jar file name in the command +""" runme = ['java -Xmx%s' % self.opts.maxjheap] runme.append(" -Djava.io.tmpdir='%s' " % self.opts.tmpdir) runme.append('-jar %s' % jar) runme += cl + + print runme,self.opts.outdir + s,stdouts,rval = self.runCL(cl=runme, output_dir=self.opts.outdir) return stdouts,rval def samToBam(self,infile=None,outdir=None): """ - use samtools view to convert sam to bam - """ +use samtools view to convert sam to bam +""" fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam') cl = ['samtools view -h -b -S -o ',tempbam,infile] tlog,stdouts,rval = self.runCL(cl,outdir) @@ -157,7 +160,7 @@ def sortSam(self, infile=None,outfile=None,outdir=None): """ - """ +""" print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir) cl = ['samtools sort',infile,outfile] tlog,stdouts,rval = self.runCL(cl,outdir) @@ -172,20 +175,20 @@ def prettyPicout(self,transpose,maxrows): """organize picard outpouts into a report html page - """ +""" res = [] try: r = open(self.metricsOut,'r').readlines() except: - r = [] + r = [] if len(r) > 0: res.append('Picard on line resources
\n') if transpose: - res.append('Picard output (transposed to make it easier to see)
\n') + res.append('Picard output (transposed to make it easier to see)
\n') else: - res.append('Picard output
\n') + res.append('Picard output
\n') res.append('\n') dat = [] heads = [] @@ -208,30 +211,30 @@ if len(dat) > 0: if transpose and not thist: tdat = map(None,*dat) # transpose an arbitrary list of lists - tdat = ['\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)] + tdat = ['\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)] else: tdat = ['\t'.join(x).strip() for x in dat] # back to strings :( tdat = ['\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)] res += tdat dat = [] - res.append('
%s%s 
%s%s 
%s
\n') + res.append('\n') return res def fixPicardOutputs(self,transpose,maxloglines): """ - picard produces long hard to read tab header files - make them available but present them transposed for readability - """ +picard produces long hard to read tab header files +make them available but present them transposed for readability +""" logging.shutdown() self.cleanup() # remove temp files stored in delme rstyle="""""" +tr.d0 td {background-color: oldlace; color: black;} +tr.d1 td {background-color: aliceblue; color: black;} +""" res = [rstyle,] - res.append(galhtmlprefix % self.progname) + res.append(galhtmlprefix % self.progname) res.append(galhtmlattr % (self.picname,timenow())) - flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')] + flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')] pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf'] if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs for p in pdflist: @@ -241,22 +244,22 @@ if mimghref in flist: imghref=mimghref # only one for thumbnail...it's a multi page pdf res.append('
\n') - res.append('\n' % (p,imghref)) - res.append('
\n') + res.append('\n' % (p,imghref)) + res.append('\n') if len(flist) > 0: res.append('The following output files were created (click the filename to view/download a copy):
') res.append('\n') for i,f in enumerate(flist): fn = os.path.split(f)[-1] res.append('\n' % (fn,fn)) - res.append('
%s

\n') + res.append('

\n') pres = self.prettyPicout(transpose,maxloglines) if len(pres) > 0: res += pres l = open(self.log_filename,'r').readlines() llen = len(l) - if llen > 0: - res.append('Picard Tool Run Log


\n') + if llen > 0: + res.append('Picard Tool Run Log
\n') rlog = ['
',]
             if llen > maxloglines:
                 n = min(50,int(maxloglines/2))
@@ -271,21 +274,21 @@
             res += rlog
         else:
             res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
-        res.append('
The freely available Picard software \n') - res.append( 'generated all outputs reported here running as a Galaxy tool') - res.append(galhtmlpostfix) + res.append('
The freely available Picard software \n') + res.append( 'generated all outputs reported here running as a Galaxy tool') + res.append(galhtmlpostfix) outf = open(self.opts.htmlout,'w') - outf.write(''.join(res)) + outf.write(''.join(res)) outf.write('\n') outf.close() def makePicInterval(self,inbed=None,outf=None): """ - picard wants bait and target files to have the same header length as the incoming bam/sam - a meaningful (ie accurate) representation will fail because of this - so this hack - it would be far better to be able to supply the original bed untouched - Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list - """ +picard wants bait and target files to have the same header length as the incoming bam/sam +a meaningful (ie accurate) representation will fail because of this - so this hack +it would be far better to be able to supply the original bed untouched +Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list +""" assert inbed <> None bed = open(inbed,'r').readlines() sbed = [x.split('\t') for x in bed] # lengths MUST be 5 @@ -320,19 +323,19 @@ def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None): """ - interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs! - Do the work of removing all the error sequences - pysam is cool - infile = pysam.Samfile( "-", "r" ) - outfile = pysam.Samfile( "-", "w", template = infile ) - for s in infile: outfile.write(s) +interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs! +Do the work of removing all the error sequences +pysam is cool +infile = pysam.Samfile( "-", "r" ) +outfile = pysam.Samfile( "-", "w", template = infile ) +for s in infile: outfile.write(s) - errors from ValidateSameFile.jar look like - WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing - ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary. - ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041 +errors from ValidateSameFile.jar look like +WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing +ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary. +ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041 - """ +""" assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path' removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2] @@ -364,7 +367,7 @@ def __main__(): doFix = False # tools returning htmlfile don't need this doTranspose = True # default - maxloglines = 100 # default + maxloglines = 100 # default #Parse Command Line op = optparse.OptionParser() # All tools @@ -374,12 +377,12 @@ op.add_option('-n', '--title', default="Pick a Picard Tool") op.add_option('-t', '--htmlout', default=None) op.add_option('-d', '--outdir', default=None) - op.add_option('-x', '--maxjheap', default='4g') + op.add_option('-x', '--maxjheap', default='3000m') op.add_option('-b', '--bisulphite', default='false') - op.add_option('-s', '--sortorder', default='query') + op.add_option('-s', '--sortorder', default='query') op.add_option('','--tmpdir', default='/tmp') - op.add_option('-j','--jar',default='') - op.add_option('','--picard-cmd',default=None) + op.add_option('-j','--jar',default='') + op.add_option('','--picard-cmd',default=None) # Many tools op.add_option( '', '--output-format', dest='output_format', help='Output format' ) op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' ) @@ -404,7 +407,7 @@ # FixMateInformation and validate # CollectGcBiasMetrics op.add_option('', '--windowsize', default='100') - op.add_option('', '--mingenomefrac', default='0.00001') + op.add_option('', '--mingenomefrac', default='0.00001') # AddOrReplaceReadGroups op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' ) op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' ) @@ -420,7 +423,7 @@ # ReplaceSamHeader op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' ) - op.add_option('','--assumesorted', default='true') + op.add_option('','--assumesorted', default='true') op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*") #estimatelibrarycomplexity op.add_option('','--minid', default="5") @@ -439,7 +442,6 @@ op.add_option('','--probability', default="1") op.add_option('','--seed', default="1") - ## opts, args = op.parse_args() opts.sortme = opts.assumesorted == 'false' assert opts.input <> None @@ -542,7 +544,7 @@ stdouts,rval = pic.runPic(opts.jar, cl) elif pic.picname == 'CollectAlignmentSummaryMetrics': - # Why do we do this fakefasta thing? + # Why do we do this fakefasta thing? # Because we need NO fai to be available or picard barfs unless it matches the input data. # why? Dunno Seems to work without complaining if the .bai file is AWOL.... fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name)) @@ -564,7 +566,7 @@ cl.append('TMP_DIR=%s' % opts.tmpdir) if not opts.assumesorted.lower() == 'true': # we need to sort input sortedfile = '%s.sorted' % os.path.basename(opts.input) - if opts.datatype == 'sam': # need to work with a bam + if opts.datatype == 'sam': # need to work with a bam tlog,tempbam,trval = pic.samToBam(opts.input,opts.outdir) pic.delme.append(tempbam) try: @@ -579,14 +581,14 @@ cl.append('INPUT=%s.bam' % os.path.abspath(os.path.join(opts.outdir,sortedfile))) pic.delme.append(os.path.join(opts.outdir,sortedfile)) else: - cl.append('INPUT=%s' % os.path.abspath(opts.input)) + cl.append('INPUT=%s' % os.path.abspath(opts.input)) stdouts,rval = pic.runPic(opts.jar, cl) elif pic.picname == 'CollectGcBiasMetrics': assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data. - # why? Dunno + # why? Dunno fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name)) try: os.symlink(ref_file_name,fakefasta) @@ -621,11 +623,11 @@ elif pic.picname == 'CollectInsertSizeMetrics': """ - picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations" - --histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel" - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file" - - """ +picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations" +--histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel" +-j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file" + +""" isPDF = 'InsertSizeHist.pdf' pdfpath = os.path.join(opts.outdir,isPDF) histpdf = 'InsertSizeHist.pdf' @@ -633,8 +635,8 @@ cl.append('O=%s' % pic.metricsOut) cl.append('HISTOGRAM_FILE=%s' % histpdf) #if opts.taillimit <> '0': # this was deprecated although still mentioned in the docs at 1.56 - # cl.append('TAIL_LIMIT=%s' % opts.taillimit) - if opts.histwidth <> '0': + # cl.append('TAIL_LIMIT=%s' % opts.taillimit) + if opts.histwidth <> '0': cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth) if float( opts.minpct) > 0.0: cl.append('MINIMUM_PCT=%s' % opts.minpct) @@ -645,7 +647,7 @@ malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists] cl += malist stdouts,rval = pic.runPic(opts.jar, cl) - if os.path.exists(pdfpath): # automake thumbnail - will be added to html + if os.path.exists(pdfpath): # automake thumbnail - will be added to html cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath] pic.runCL(cl=cl2,output_dir=opts.outdir) else: @@ -663,7 +665,7 @@ # input cl.append('INPUT=%s' % opts.input) # outputs - cl.append('OUTPUT=%s' % opts.output) + cl.append('OUTPUT=%s' % opts.output) cl.append('METRICS_FILE=%s' % pic.metricsOut ) # remove or mark duplicates cl.append('REMOVE_DUPLICATES=%s' % opts.remdups) @@ -711,7 +713,7 @@ if opts.targetbed == opts.baitbed: # same file sometimes targetf = baitf else: - targetf = pic.makePicInterval(opts.targetbed,targetfname) + targetf = pic.makePicInterval(opts.targetbed,targetfname) cl.append('BAIT_INTERVALS=%s' % baitf) cl.append('TARGET_INTERVALS=%s' % targetf) cl.append('INPUT=%s' % os.path.abspath(opts.input)) @@ -725,7 +727,7 @@ sortedfile = os.path.join(opts.outdir,'rgValidate.sorted') stf = open(pic.log_filename,'w') tlog = None - if opts.datatype == 'sam': # need to work with a bam + if opts.datatype == 'sam': # need to work with a bam tlog,tempbam,rval = pic.samToBam(opts.input,opts.outdir) try: tlog = pic.sortSam(tempbam,sortedfile,opts.outdir) @@ -740,7 +742,7 @@ print '##tlog=',tlog stf.write(tlog) stf.write('\n') - sortedfile = '%s.bam' % sortedfile # samtools does that + sortedfile = '%s.bam' % sortedfile # samtools does that cl.append('O=%s' % pic.metricsOut) cl.append('TMP_DIR=%s' % opts.tmpdir) cl.append('I=%s' % sortedfile) @@ -752,7 +754,7 @@ if opts.bisulphite.lower() <> 'false': cl.append('IS_BISULFITE_SEQUENCED=true') if opts.ref <> None or opts.ref_file <> None: - cl.append('R=%s' % ref_file_name) + cl.append('R=%s' % ref_file_name) stdouts,rval = pic.runPic(opts.jar,cl) if opts.datatype == 'sam': pic.delme.append(tempbam) @@ -763,6 +765,34 @@ pic.delme.append(sortedfile) # not wanted stf.close() pic.cleanup() + +####liubo added CleanSam tool#### + elif pic.picname == 'CleanSam': + # input + cl.append('INPUT=%s' % opts.input) + # output + cl.append('OUTPUT=%s' % tempout) + stdouts,rval = pic.runPic(opts.jar, cl) + haveTempout = True + + elif pic.picname == 'SortSam': + cl.append('I=%s' % opts.input) + cl.append('O=%s' % tempout) + cl.append('SORT_ORDER=%s' % opts.sortorder) + stdouts,rval = pic.runPic(opts.jar,cl) + haveTempout = True + + elif pic.picname == 'BuildBamIndex': + cl.append('I=%s' % opts.input) + cl.append('O=%s' % tempout) + stdouts,rval = pic.runPic(opts.jar,cl) + haveTempout = True + + elif pic.picname == 'SamFormatConverter': + cl.append('INPUT=%s' % opts.input) + cl.append('OUTPUT=%s' % tempout) + pic.runPic(opts.jar, cl) + haveTempout = True elif pic.picname == "DownsampleSam": cl.append('I=%s' % opts.input) mystring = opts.output @@ -775,17 +805,13 @@ stdouts,rval = pic.runPic(opts.jar, cl) myoutput = mystringsam.replace(".sam", "") os.rename(mystringsam,myoutput) - elif pic.picname == 'SamFormatConverter': - cl.append('INPUT=%s' % opts.input) - cl.append('OUTPUT=%s' % tempout) - pic.runPic(opts.jar, cl) - haveTempout = True - + + else: print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname sys.exit(1) if haveTempout: - # Some Picard tools produced a potentially intermediate bam file. + # Some Picard tools produced a potentially intermediate bam file. # Either just move to final location or create sam if os.path.exists(tempout): shutil.move(tempout, os.path.abspath(opts.output)) @@ -795,4 +821,3 @@ print >> sys.stderr, '## exit code=%d; stdout=%s' % (rval,stdouts) # signal failure if __name__=="__main__": __main__() -