picard1106: picard_wrapper.py comparison

comparison picard_wrapper.py @ 42:c25cd3e674a5 draft

Uploaded

author	devteam
date	Thu, 13 Feb 2014 22:02:05 -0500
parents	1f4e2e13d9dc
children

comparison

equal deleted inserted replaced

-:468673afa348
+:c25cd3e674a5
 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-<meta name="generator" content="Bluefish 2.2.3" />
+<meta name="generator" content="Galaxy %s tool output - see http://getgalaxy.org/" />
 <title></title>
 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
 </head>
 <body>
 <div class="document">
 sys.exit()
 def timenow():
 """return current time as a string
 """
 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
 class PicardBase():
 """
 simple base class with some utilities for Picard
 adapted and merged with Kelly Vincent's code april 2011 Ross
 lots of changes...
 """
 def __init__(self, opts=None,arg0=None):
 """ common stuff needed at init for a picard tool
 """
 assert opts <> None, 'PicardBase needs opts at init'
 self.opts = opts
 if self.opts.outdir == None:
 self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir
 assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in'
 try:
 os.makedirs(opts.tmpdir)
 except:
 pass
 self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
-self.metricsOut =  os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
+self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
 self.setLogging(logfname=self.log_filename)
 def baseName(self,name=None):
 return os.path.splitext(os.path.basename(name))[0]
 def setLogging(self,logfname="picard_wrapper.log"):
 """setup a logger
 """
 logging.basicConfig(level=logging.INFO,
 filename=logfname,
 filemode='a')
 def readLarge(self,fname=None):
 """ read a potentially huge file.
 """
 try:
 # get stderr, allowing for case where it's very large
 tmp = open( fname, 'rb' )
 s = ''
 buffsize = 1048576
 break
 except OverflowError:
 pass
 tmp.close()
 except Exception, e:
 stop_err( 'Read Large Exception : %s' % str( e ) )
 return s
 def runCL(self,cl=None,output_dir=None):
 """ construct and run a command line
 we have galaxy's temp path as opt.temp_dir so don't really need isolation
 sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
 """
 assert cl <> None, 'PicardBase runCL needs a command line as cl'
 if output_dir == None:
 output_dir = self.opts.outdir
 if type(cl) == type([]):
 cl = ' '.join(cl)
 process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir)
 rval = process.wait()
 tlf.close()
 tef.close()
 stderrs = self.readLarge(temperr)
 stdouts = self.readLarge(templog)
 if rval > 0:
 s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs)
 stdouts = '%s\n%s' % (stdouts,stderrs)
 else:
 s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval)
 logging.info(s)
 os.unlink(templog) # always
 os.unlink(temperr) # always
-return s, stdouts, rval  # sometimes s is an output
+return s, stdouts, rval # sometimes s is an output
 def runPic(self, jar, cl):
 """
 cl should be everything after the jar file name in the command
 """
 runme = ['java -Xmx%s' % self.opts.maxjheap]
 runme.append(" -Djava.io.tmpdir='%s' " % self.opts.tmpdir)
 runme.append('-jar %s' % jar)
 runme += cl
+print runme,self.opts.outdir
 s,stdouts,rval = self.runCL(cl=runme, output_dir=self.opts.outdir)
 return stdouts,rval
 def samToBam(self,infile=None,outdir=None):
 """
 use samtools view to convert sam to bam
 """
 fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam')
 cl = ['samtools view -h -b -S -o ',tempbam,infile]
 tlog,stdouts,rval = self.runCL(cl,outdir)
 return tlog,tempbam,rval
 def sortSam(self, infile=None,outfile=None,outdir=None):
 """
 """
 print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir)
 cl = ['samtools sort',infile,outfile]
 tlog,stdouts,rval = self.runCL(cl,outdir)
 return tlog
 except:
 pass
 def prettyPicout(self,transpose,maxrows):
 """organize picard outpouts into a report html page
 """
 res = []
 try:
 r = open(self.metricsOut,'r').readlines()
 except:
 r = []
 if len(r) > 0:
 res.append('<b>Picard on line resources</b><ul>\n')
 res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
 res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
 if transpose:
 res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n')
 else:
 res.append('<b>Picard output</b><hr/>\n')
 res.append('<table cellpadding="3" >\n')
 dat = []
 heads = []
 lastr = len(r) - 1
 # special case for estimate library complexity hist
 res += hres
 heads = []
 if len(dat) > 0:
 if transpose and not thist:
 tdat = map(None,*dat) # transpose an arbitrary list of lists
 tdat = ['<tr class="d%d"><td>%s</td><td>%s&nbsp;</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
 else:
 tdat = ['\t'.join(x).strip() for x in dat] # back to strings :(
 tdat = ['<tr class="d%d"><td colspan="2">%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)]
 res += tdat
 dat = []
 res.append('</table>\n')
 return res
 def fixPicardOutputs(self,transpose,maxloglines):
 """
 picard produces long hard to read tab header files
 make them available but present them transposed for readability
 """
 logging.shutdown()
 self.cleanup() # remove temp files stored in delme
 rstyle="""<style type="text/css">
 tr.d0 td {background-color: oldlace; color: black;}
 tr.d1 td {background-color: aliceblue; color: black;}
 </style>"""
 res = [rstyle,]
 res.append(galhtmlprefix % self.progname)
 res.append(galhtmlattr % (self.picname,timenow()))
 flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
 pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
 if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
 for p in pdflist:
 pbase = os.path.splitext(p)[0] # removes .pdf
 imghref = '%s.jpg' % pbase
 mimghref = '%s-0.jpg' % pbase # multiple pages pdf -> multiple thumbnails without asking!
 if mimghref in flist:
 imghref=mimghref # only one for thumbnail...it's a multi page pdf
 res.append('<table cellpadding="10"><tr><td>\n')
 res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref))
 res.append('</tr></td></table>\n')
 if len(flist) > 0:
 res.append('<b>The following output files were created (click the filename to view/download a copy):</b><hr/>')
 res.append('<table>\n')
 for i,f in enumerate(flist):
 fn = os.path.split(f)[-1]
 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
 res.append('</table><p/>\n')
 pres = self.prettyPicout(transpose,maxloglines)
 if len(pres) > 0:
 res += pres
 l = open(self.log_filename,'r').readlines()
 llen = len(l)
 if llen > 0:
 res.append('<b>Picard Tool Run Log</b><hr/>\n')
 rlog = ['<pre>',]
 if llen > maxloglines:
 n = min(50,int(maxloglines/2))
 rlog += l[:n]
 rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines))
 if llen > maxloglines:
 rlog.append('\n<b>## WARNING - %d log lines truncated - <a href="%s">%s</a> contains entire output</b>' % (llen - maxloglines,self.log_filename,self.log_filename))
 res += rlog
 else:
 res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
 res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
 res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool')
 res.append(galhtmlpostfix)
 outf = open(self.opts.htmlout,'w')
 outf.write(''.join(res))
 outf.write('\n')
 outf.close()
 def makePicInterval(self,inbed=None,outf=None):
 """
 picard wants bait and target files to have the same header length as the incoming bam/sam
 a meaningful (ie accurate) representation will fail because of this - so this hack
 it would be far better to be able to supply the original bed untouched
 Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list
 """
 assert inbed <> None
 bed = open(inbed,'r').readlines()
 sbed = [x.split('\t') for x in bed] # lengths MUST be 5
 lens = [len(x) for x in sbed]
 strands = [x[3] for x in sbed if not x[3] in ['+','-']]
 o.close()
 return outf
 def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None):
 """
 interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
 Do the work of removing all the error sequences
 pysam is cool
 infile = pysam.Samfile( "-", "r" )
 outfile = pysam.Samfile( "-", "w", template = infile )
 for s in infile: outfile.write(s)
 errors from ValidateSameFile.jar look like
 WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
 ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
 ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041
 """
 assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam
 assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path'
 removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2]
 remDict = dict(zip(removeNames,range(len(removeNames))))
 infile = pysam.Samfile(insam,'rb')
 def __main__():
 doFix = False # tools returning htmlfile don't need this
 doTranspose = True # default
 maxloglines = 100 # default
 #Parse Command Line
 op = optparse.OptionParser()
 # All tools
 op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' )
 op.add_option('-e', '--inputext', default=None)
 op.add_option('-o', '--output', default=None)
 op.add_option('-n', '--title', default="Pick a Picard Tool")
 op.add_option('-t', '--htmlout', default=None)
 op.add_option('-d', '--outdir', default=None)
-op.add_option('-x', '--maxjheap', default='4g')
+op.add_option('-x', '--maxjheap', default='3000m')
 op.add_option('-b', '--bisulphite', default='false')
 op.add_option('-s', '--sortorder', default='query')
 op.add_option('','--tmpdir', default='/tmp')
 op.add_option('-j','--jar',default='')
 op.add_option('','--picard-cmd',default=None)
 # Many tools
 op.add_option( '', '--output-format', dest='output_format', help='Output format' )
 op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' )
 op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None )
 # CreateSequenceDictionary
 op.add_option('', '--maxinsert', default="20")
 op.add_option('', '--adaptors', default='')
 # FixMateInformation and validate
 # CollectGcBiasMetrics
 op.add_option('', '--windowsize', default='100')
 op.add_option('', '--mingenomefrac', default='0.00001')
 # AddOrReplaceReadGroups
 op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
 op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
 op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' )
 op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. run barcode) ' )
 op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' )
 op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' )
 # ReplaceSamHeader
 op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' )
 op.add_option('','--assumesorted', default='true')
 op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*")
 #estimatelibrarycomplexity
 op.add_option('','--minid', default="5")
 op.add_option('','--maxdiff', default="0.03")
 op.add_option('','--minmeanq', default="20")
 op.add_option('','--samout', default=None)
 #downsample
 op.add_option('','--probability', default="1")
 op.add_option('','--seed', default="1")
-##
 opts, args = op.parse_args()
 opts.sortme = opts.assumesorted == 'false'
 assert opts.input <> None
 # need to add
 # instance that does all the work
 if float(opts.optdupdist) > 0:
 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
 stdouts,rval = pic.runPic(opts.jar, cl)
 elif pic.picname == 'CollectAlignmentSummaryMetrics':
 # Why do we do this fakefasta thing?
 # Because we need NO fai to be available or picard barfs unless it matches the input data.
 # why? Dunno Seems to work without complaining if the .bai file is AWOL....
 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
 try:
 os.symlink(ref_file_name,fakefasta)
 cl.append('OUTPUT=%s' % pic.metricsOut)
 cl.append('R=%s' % fakefasta)
 cl.append('TMP_DIR=%s' % opts.tmpdir)
 if not opts.assumesorted.lower() == 'true': # we need to sort input
 sortedfile = '%s.sorted' % os.path.basename(opts.input)
 if opts.datatype == 'sam': # need to work with a bam
 tlog,tempbam,trval = pic.samToBam(opts.input,opts.outdir)
 pic.delme.append(tempbam)
 try:
 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
 except:
 except : # bug - [bam_sort_core] not being ignored - TODO fixme
 print '## exception %s on sorting bam file %s' % (sys.exc_info()[0],opts.input)
 cl.append('INPUT=%s.bam' % os.path.abspath(os.path.join(opts.outdir,sortedfile)))
 pic.delme.append(os.path.join(opts.outdir,sortedfile))
 else:
 cl.append('INPUT=%s' % os.path.abspath(opts.input))
 stdouts,rval = pic.runPic(opts.jar, cl)
 elif pic.picname == 'CollectGcBiasMetrics':
 assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name
 # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
 # why? Dunno
 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
 try:
 os.symlink(ref_file_name,fakefasta)
 except:
 s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy'
 lf.write('\n')
 lf.close()
 elif pic.picname == 'CollectInsertSizeMetrics':
 """ <command interpreter="python">
 picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
 --histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel"
 -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
 </command>
 """
 isPDF = 'InsertSizeHist.pdf'
 pdfpath = os.path.join(opts.outdir,isPDF)
 histpdf = 'InsertSizeHist.pdf'
 cl.append('I=%s' % opts.input)
 cl.append('O=%s' % pic.metricsOut)
 cl.append('HISTOGRAM_FILE=%s' % histpdf)
 #if opts.taillimit <> '0': # this was deprecated although still mentioned in the docs at 1.56
-#    cl.append('TAIL_LIMIT=%s' % opts.taillimit)
+# cl.append('TAIL_LIMIT=%s' % opts.taillimit)
-if  opts.histwidth <> '0':
+if opts.histwidth <> '0':
 cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth)
 if float( opts.minpct) > 0.0:
 cl.append('MINIMUM_PCT=%s' % opts.minpct)
 if float(opts.deviations) > 0.0:
 cl.append('DEVIATIONS=%s' % opts.deviations)
 if opts.malevel:
 malists = opts.malevel.split(',')
 malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists]
 cl += malist
 stdouts,rval = pic.runPic(opts.jar, cl)
 if os.path.exists(pdfpath): # automake thumbnail - will be added to html
 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
 pic.runCL(cl=cl2,output_dir=opts.outdir)
 else:
 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
 s += 'This <b>always happens if single ended data was provided</b> to this tool,\n'
 # assume sorted even if header says otherwise
 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
 # input
 cl.append('INPUT=%s' % opts.input)
 # outputs
 cl.append('OUTPUT=%s' % opts.output)
 cl.append('METRICS_FILE=%s' % pic.metricsOut )
 # remove or mark duplicates
 cl.append('REMOVE_DUPLICATES=%s' % opts.remdups)
 # the regular expression to be used to parse reads in incoming SAM file
 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
 targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target')
 baitf = pic.makePicInterval(opts.baitbed,baitfname)
 if opts.targetbed == opts.baitbed: # same file sometimes
 targetf = baitf
 else:
 targetf = pic.makePicInterval(opts.targetbed,targetfname)
 cl.append('BAIT_INTERVALS=%s' % baitf)
 cl.append('TARGET_INTERVALS=%s' % targetf)
 cl.append('INPUT=%s' % os.path.abspath(opts.input))
 cl.append('OUTPUT=%s' % pic.metricsOut)
 cl.append('TMP_DIR=%s' % opts.tmpdir)
 import pysam
 doTranspose = False
 sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
 stf = open(pic.log_filename,'w')
 tlog = None
 if opts.datatype == 'sam': # need to work with a bam
 tlog,tempbam,rval = pic.samToBam(opts.input,opts.outdir)
 try:
 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
 except:
 print '## exception on sorting sam file %s' % opts.input
 print '## exception on sorting bam file %s' % opts.input
 if tlog:
 print '##tlog=',tlog
 stf.write(tlog)
 stf.write('\n')
 sortedfile = '%s.bam' % sortedfile # samtools does that
 cl.append('O=%s' % pic.metricsOut)
 cl.append('TMP_DIR=%s' % opts.tmpdir)
 cl.append('I=%s' % sortedfile)
 opts.maxerrors = '99999999'
 cl.append('MAX_OUTPUT=%s' % opts.maxerrors)
 igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None']
 cl.append(' '.join(igs))
 if opts.bisulphite.lower() <> 'false':
 cl.append('IS_BISULFITE_SEQUENCED=true')
 if opts.ref <> None or opts.ref_file <> None:
-cl.append('R=%s' %  ref_file_name)
+cl.append('R=%s' % ref_file_name)
 stdouts,rval = pic.runPic(opts.jar,cl)
 if opts.datatype == 'sam':
 pic.delme.append(tempbam)
 newsam = opts.output
 outformat = 'bam'
 pe = open(pic.metricsOut,'r').readlines()
 pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat)
 pic.delme.append(sortedfile) # not wanted
 stf.close()
 pic.cleanup()
+####liubo added CleanSam tool####
+elif pic.picname == 'CleanSam':
+# input
+cl.append('INPUT=%s' % opts.input)
+# output
+cl.append('OUTPUT=%s' % tempout)
+stdouts,rval = pic.runPic(opts.jar, cl)
+haveTempout = True
+elif pic.picname == 'SortSam':
+cl.append('I=%s' % opts.input)
+cl.append('O=%s' % tempout)
+cl.append('SORT_ORDER=%s' % opts.sortorder)
+stdouts,rval = pic.runPic(opts.jar,cl)
+haveTempout = True
+elif pic.picname == 'BuildBamIndex':
+cl.append('I=%s' % opts.input)
+cl.append('O=%s' % tempout)
+stdouts,rval = pic.runPic(opts.jar,cl)
+haveTempout = True
+elif pic.picname == 'SamFormatConverter':
+cl.append('INPUT=%s' % opts.input)
+cl.append('OUTPUT=%s' % tempout)
+pic.runPic(opts.jar, cl)
+haveTempout = True
 elif pic.picname == "DownsampleSam":
 cl.append('I=%s' % opts.input)
 mystring = opts.output
 mystringsam = mystring + ".sam"
 cl.append('O=%s' % mystringsam)
 if float(opts.seed) > 0:
 cl.append('RANDOM_SEED=%s' % opts.seed)
 stdouts,rval = pic.runPic(opts.jar, cl)
 myoutput = mystringsam.replace(".sam", "")
 os.rename(mystringsam,myoutput)
-elif pic.picname == 'SamFormatConverter':
-cl.append('INPUT=%s' % opts.input)
-cl.append('OUTPUT=%s' % tempout)
-pic.runPic(opts.jar, cl)
-haveTempout = True
 else:
 print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
 sys.exit(1)
 if haveTempout:
 # Some Picard tools produced a potentially intermediate bam file.
 # Either just move to final location or create sam
 if os.path.exists(tempout):
 shutil.move(tempout, os.path.abspath(opts.output))
 if opts.htmlout <> None or doFix: # return a pretty html page
 pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines)
 if rval <> 0:
 print >> sys.stderr, '## exit code=%d; stdout=%s' % (rval,stdouts)
 # signal failure
 if __name__=="__main__": __main__()

Mercurial > repos > devteam > picard1106

comparison picard_wrapper.py @ 42:c25cd3e674a5 draft