# HG changeset patch
# User devteam
# Date 1392346925 18000
# Node ID c25cd3e674a5cbe2a8f894711efce2441e8941d9
# Parent 468673afa3483c175ae7b074d1a2f26240176456
Uploaded
diff -r 468673afa348 -r c25cd3e674a5 picard_wrapper.py
--- a/picard_wrapper.py Thu Feb 13 21:39:50 2014 -0500
+++ b/picard_wrapper.py Thu Feb 13 22:02:05 2014 -0500
@@ -15,7 +15,7 @@
-
+
@@ -33,20 +33,20 @@
def timenow():
"""return current time as a string
- """
+"""
return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
class PicardBase():
"""
- simple base class with some utilities for Picard
- adapted and merged with Kelly Vincent's code april 2011 Ross
- lots of changes...
- """
+simple base class with some utilities for Picard
+adapted and merged with Kelly Vincent's code april 2011 Ross
+lots of changes...
+"""
def __init__(self, opts=None,arg0=None):
""" common stuff needed at init for a picard tool
- """
+"""
assert opts <> None, 'PicardBase needs opts at init'
self.opts = opts
if self.opts.outdir == None:
@@ -69,7 +69,7 @@
except:
pass
self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
- self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
+ self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
self.setLogging(logfname=self.log_filename)
def baseName(self,name=None):
@@ -77,7 +77,7 @@
def setLogging(self,logfname="picard_wrapper.log"):
"""setup a logger
- """
+"""
logging.basicConfig(level=logging.INFO,
filename=logfname,
filemode='a')
@@ -85,7 +85,7 @@
def readLarge(self,fname=None):
""" read a potentially huge file.
- """
+"""
try:
# get stderr, allowing for case where it's very large
tmp = open( fname, 'rb' )
@@ -102,14 +102,14 @@
pass
tmp.close()
except Exception, e:
- stop_err( 'Read Large Exception : %s' % str( e ) )
+ stop_err( 'Read Large Exception : %s' % str( e ) )
return s
def runCL(self,cl=None,output_dir=None):
""" construct and run a command line
- we have galaxy's temp path as opt.temp_dir so don't really need isolation
- sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
- """
+we have galaxy's temp path as opt.temp_dir so don't really need isolation
+sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
+"""
assert cl <> None, 'PicardBase runCL needs a command line as cl'
if output_dir == None:
output_dir = self.opts.outdir
@@ -124,7 +124,7 @@
tlf.close()
tef.close()
stderrs = self.readLarge(temperr)
- stdouts = self.readLarge(templog)
+ stdouts = self.readLarge(templog)
if rval > 0:
s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs)
stdouts = '%s\n%s' % (stdouts,stderrs)
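In outline, runCL joins the command tokens, runs them through the shell with stdout and stderr redirected to temporary files inside the job's output directory, reads those files back afterwards (they can be huge, hence readLarge), and deletes them. A simplified sketch of that pattern, leaving out the logging and the chunked reader:

import os
import subprocess
import tempfile

def run_cl(cl, output_dir):
    # run a command whose output may be very large, capturing stdout/stderr
    # to temp files in the output directory instead of holding them in pipes
    cmd = ' '.join(cl)
    fd_out, templog = tempfile.mkstemp(dir=output_dir, suffix='.log')
    fd_err, temperr = tempfile.mkstemp(dir=output_dir, suffix='.err')
    with os.fdopen(fd_out, 'w') as tlf, os.fdopen(fd_err, 'w') as tef:
        rval = subprocess.call(cmd, shell=True, cwd=output_dir, stdout=tlf, stderr=tef)
    stdouts = open(templog).read()
    stderrs = open(temperr).read()
    os.unlink(templog)
    os.unlink(temperr)
    return stdouts, stderrs, rval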
@@ -133,23 +133,26 @@
logging.info(s)
os.unlink(templog) # always
os.unlink(temperr) # always
- return s, stdouts, rval # sometimes s is an output
+ return s, stdouts, rval # sometimes s is an output
def runPic(self, jar, cl):
"""
- cl should be everything after the jar file name in the command
- """
+cl should be everything after the jar file name in the command
+"""
runme = ['java -Xmx%s' % self.opts.maxjheap]
runme.append(" -Djava.io.tmpdir='%s' " % self.opts.tmpdir)
runme.append('-jar %s' % jar)
runme += cl
+
+ print runme,self.opts.outdir
+
s,stdouts,rval = self.runCL(cl=runme, output_dir=self.opts.outdir)
return stdouts,rval
def samToBam(self,infile=None,outdir=None):
"""
- use samtools view to convert sam to bam
- """
+use samtools view to convert sam to bam
+"""
fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam')
cl = ['samtools view -h -b -S -o ',tempbam,infile]
tlog,stdouts,rval = self.runCL(cl,outdir)
@@ -157,7 +160,7 @@
def sortSam(self, infile=None,outfile=None,outdir=None):
"""
- """
+"""
print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir)
cl = ['samtools sort',infile,outfile]
tlog,stdouts,rval = self.runCL(cl,outdir)
@@ -172,20 +175,20 @@
def prettyPicout(self,transpose,maxrows):
"""organize picard outpouts into a report html page
- """
+"""
res = []
try:
r = open(self.metricsOut,'r').readlines()
except:
- r = []
+ r = []
if len(r) > 0:
res.append('Picard on line resources\n')
if transpose:
- res.append('Picard output (transposed to make it easier to see)\n')
+ res.append('Picard output (transposed to make it easier to see)\n')
else:
- res.append('Picard output\n')
+ res.append('Picard output\n')
res.append('<table>\n')
dat = []
heads = []
@@ -208,30 +211,30 @@
if len(dat) > 0:
if transpose and not thist:
tdat = map(None,*dat) # transpose an arbitrary list of lists
- tdat = ['<tr class="d%d"><td>%s</td><td>%s</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
+ tdat = ['<tr class="d%d"><td>%s</td><td>%s</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
else:
tdat = ['\t'.join(x).strip() for x in dat] # back to strings :(
tdat = ['<tr class="d%d"><td>%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)]
res += tdat
dat = []
- res.append('</table>\n')
+ res.append('</table>\n')
return res
def fixPicardOutputs(self,transpose,maxloglines):
"""
- picard produces long hard to read tab header files
- make them available but present them transposed for readability
- """
+picard produces long hard to read tab header files
+make them available but present them transposed for readability
+"""
logging.shutdown()
self.cleanup() # remove temp files stored in delme
rstyle=""""""
+tr.d0 td {background-color: oldlace; color: black;}
+tr.d1 td {background-color: aliceblue; color: black;}
+"""
res = [rstyle,]
- res.append(galhtmlprefix % self.progname)
+ res.append(galhtmlprefix % self.progname)
res.append(galhtmlattr % (self.picname,timenow()))
- flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
+ flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
for p in pdflist:
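The hunk above is where the metrics rows are built: map(None,*dat) is the Python 2 idiom for transposing a ragged list of lists, and each emitted row alternates between the d0 and d1 classes that rstyle colours. A small sketch of the same idea in 2/3-portable form (cell markup illustrative):

try:
    from itertools import zip_longest                      # Python 3
except ImportError:
    from itertools import izip_longest as zip_longest      # Python 2 equivalent of map(None,*dat)

def pretty_rows(dat, nheads=0):
    # transpose header/value rows, then band them with the d0/d1 CSS classes
    tdat = list(zip_longest(*dat))
    return ['<tr class="d%d"><td>%s</td><td>%s</td></tr>\n'
            % ((i + nheads) % 2, x[0], x[1]) for i, x in enumerate(tdat)]

For example, pretty_rows([['CATEGORY','READS'],['PAIR','1234']]) yields one banded name/value row per metric column.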
@@ -241,22 +244,22 @@
if mimghref in flist:
imghref=mimghref # only one for thumbnail...it's a multi page pdf
res.append('\n')
+ res.append('<a href="%s"><img src="%s"></a>\n' % (p,imghref))
+ res.append('\n')
if len(flist) > 0:
res.append('The following output files were created (click the filename to view/download a copy):')
res.append('\n')
for i,f in enumerate(flist):
fn = os.path.split(f)[-1]
res.append('\n')
+ res.append('\n')
pres = self.prettyPicout(transpose,maxloglines)
if len(pres) > 0:
res += pres
l = open(self.log_filename,'r').readlines()
llen = len(l)
- if llen > 0:
- res.append('Picard Tool Run Log\n')
+ if llen > 0:
+ res.append('Picard Tool Run Log\n')
rlog = ['<pre>',]
if llen > maxloglines:
n = min(50,int(maxloglines/2))
@@ -271,21 +274,21 @@
res += rlog
else:
res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
- res.append('The freely available Picard software \n')
- res.append( 'generated all outputs reported here running as a Galaxy tool')
- res.append(galhtmlpostfix)
+ res.append('The freely available Picard software \n')
+ res.append( 'generated all outputs reported here running as a Galaxy tool')
+ res.append(galhtmlpostfix)
outf = open(self.opts.htmlout,'w')
- outf.write(''.join(res))
+ outf.write(''.join(res))
outf.write('\n')
outf.close()
def makePicInterval(self,inbed=None,outf=None):
"""
- picard wants bait and target files to have the same header length as the incoming bam/sam
- a meaningful (ie accurate) representation will fail because of this - so this hack
- it would be far better to be able to supply the original bed untouched
- Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list
- """
+picard wants bait and target files to have the same header length as the incoming bam/sam
+a meaningful (ie accurate) representation will fail because of this - so this hack
+it would be far better to be able to supply the original bed untouched
+Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list
+"""
assert inbed <> None
bed = open(inbed,'r').readlines()
sbed = [x.split('\t') for x in bed] # lengths MUST be 5
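As the makePicInterval docstring explains, Picard only accepts bait/target interval files whose header matches the incoming bam/sam, which is why the wrapper rewrites the bed at all. A rough sketch of that requirement, pulling the header straight from the bam with samtools view -H (helper name hypothetical):

import subprocess

def make_interval_file(bam_path, bed_rows, out_path):
    # interval lists start with the SAM header of the bam they will be used
    # against, followed by 5-column rows: chrom, start, end, strand, name
    header = subprocess.check_output(['samtools', 'view', '-H', bam_path]).decode()
    with open(out_path, 'w') as out:
        out.write(header)
        for row in bed_rows:
            out.write('\t'.join(row).rstrip() + '\n')
    return out_path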
@@ -320,19 +323,19 @@
def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None):
"""
- interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
- Do the work of removing all the error sequences
- pysam is cool
- infile = pysam.Samfile( "-", "r" )
- outfile = pysam.Samfile( "-", "w", template = infile )
- for s in infile: outfile.write(s)
+interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
+Do the work of removing all the error sequences
+pysam is cool
+infile = pysam.Samfile( "-", "r" )
+outfile = pysam.Samfile( "-", "w", template = infile )
+for s in infile: outfile.write(s)
- errors from ValidateSameFile.jar look like
- WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
- ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
- ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041
+errors from ValidateSameFile.jar look like
+WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
+ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
+ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041
- """
+"""
assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam
assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path'
removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2]
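Combining the pysam calls quoted in the docstring with the error-name parsing just above gives the gist of cleanSam: collect the read names Picard complained about and copy everything else through; dropping by name also drops the mate, which is the paired-end worry the docstring mentions. A sketch along those lines:

import pysam

def clean_sam(insam, outsam, picard_errors):
    # names appear as the second comma-separated field of ValidateSamFile messages,
    # e.g. "ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary."
    bad = set(x.split(',')[1].replace(' Read name ', '')
              for x in picard_errors if len(x.split(',')) > 2)
    infile = pysam.Samfile(insam, 'r')
    outfile = pysam.Samfile(outsam, 'w', template=infile)
    for s in infile:
        if s.qname not in bad:
            outfile.write(s)
    outfile.close()
    infile.close()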
@@ -364,7 +367,7 @@
def __main__():
doFix = False # tools returning htmlfile don't need this
doTranspose = True # default
- maxloglines = 100 # default
+ maxloglines = 100 # default
#Parse Command Line
op = optparse.OptionParser()
# All tools
@@ -374,12 +377,12 @@
op.add_option('-n', '--title', default="Pick a Picard Tool")
op.add_option('-t', '--htmlout', default=None)
op.add_option('-d', '--outdir', default=None)
- op.add_option('-x', '--maxjheap', default='4g')
+ op.add_option('-x', '--maxjheap', default='3000m')
op.add_option('-b', '--bisulphite', default='false')
- op.add_option('-s', '--sortorder', default='query')
+ op.add_option('-s', '--sortorder', default='query')
op.add_option('','--tmpdir', default='/tmp')
- op.add_option('-j','--jar',default='')
- op.add_option('','--picard-cmd',default=None)
+ op.add_option('-j','--jar',default='')
+ op.add_option('','--picard-cmd',default=None)
# Many tools
op.add_option( '', '--output-format', dest='output_format', help='Output format' )
op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' )
@@ -404,7 +407,7 @@
# FixMateInformation and validate
# CollectGcBiasMetrics
op.add_option('', '--windowsize', default='100')
- op.add_option('', '--mingenomefrac', default='0.00001')
+ op.add_option('', '--mingenomefrac', default='0.00001')
# AddOrReplaceReadGroups
op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
@@ -420,7 +423,7 @@
# ReplaceSamHeader
op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' )
- op.add_option('','--assumesorted', default='true')
+ op.add_option('','--assumesorted', default='true')
op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*")
#estimatelibrarycomplexity
op.add_option('','--minid', default="5")
@@ -439,7 +442,6 @@
op.add_option('','--probability', default="1")
op.add_option('','--seed', default="1")
- ##
opts, args = op.parse_args()
opts.sortme = opts.assumesorted == 'false'
assert opts.input <> None
@@ -542,7 +544,7 @@
stdouts,rval = pic.runPic(opts.jar, cl)
elif pic.picname == 'CollectAlignmentSummaryMetrics':
- # Why do we do this fakefasta thing?
+ # Why do we do this fakefasta thing?
# Because we need NO fai to be available or picard barfs unless it matches the input data.
# why? Dunno Seems to work without complaining if the .bai file is AWOL....
fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
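The fakefasta business in the comments above simply hands Picard a reference path with no pre-existing .fai or .dict sitting next to it, so nothing mismatched gets picked up. A minimal sketch of that step, copying when a symlink is not possible:

import os
import shutil

def fake_reference(ref_path, out_dir):
    # expose the reference under a throwaway name in the job directory so no
    # stale index beside the real file gets picked up
    fake = os.path.join(out_dir, '%s_fake.fasta' % os.path.basename(ref_path))
    try:
        os.symlink(ref_path, fake)
    except OSError:
        shutil.copy(ref_path, fake)
    return fake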
@@ -564,7 +566,7 @@
cl.append('TMP_DIR=%s' % opts.tmpdir)
if not opts.assumesorted.lower() == 'true': # we need to sort input
sortedfile = '%s.sorted' % os.path.basename(opts.input)
- if opts.datatype == 'sam': # need to work with a bam
+ if opts.datatype == 'sam': # need to work with a bam
tlog,tempbam,trval = pic.samToBam(opts.input,opts.outdir)
pic.delme.append(tempbam)
try:
@@ -579,14 +581,14 @@
cl.append('INPUT=%s.bam' % os.path.abspath(os.path.join(opts.outdir,sortedfile)))
pic.delme.append(os.path.join(opts.outdir,sortedfile))
else:
- cl.append('INPUT=%s' % os.path.abspath(opts.input))
+ cl.append('INPUT=%s' % os.path.abspath(opts.input))
stdouts,rval = pic.runPic(opts.jar, cl)
elif pic.picname == 'CollectGcBiasMetrics':
assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name
# sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
- # why? Dunno
+ # why? Dunno
fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
try:
os.symlink(ref_file_name,fakefasta)
@@ -621,11 +623,11 @@
elif pic.picname == 'CollectInsertSizeMetrics':
"""
- picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
- --histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel"
- -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
-
- """
+picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
+--histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel"
+-j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
+
+"""
isPDF = 'InsertSizeHist.pdf'
pdfpath = os.path.join(opts.outdir,isPDF)
histpdf = 'InsertSizeHist.pdf'
@@ -633,8 +635,8 @@
cl.append('O=%s' % pic.metricsOut)
cl.append('HISTOGRAM_FILE=%s' % histpdf)
#if opts.taillimit <> '0': # this was deprecated although still mentioned in the docs at 1.56
- # cl.append('TAIL_LIMIT=%s' % opts.taillimit)
- if opts.histwidth <> '0':
+ # cl.append('TAIL_LIMIT=%s' % opts.taillimit)
+ if opts.histwidth <> '0':
cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth)
if float( opts.minpct) > 0.0:
cl.append('MINIMUM_PCT=%s' % opts.minpct)
@@ -645,7 +647,7 @@
malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists]
cl += malist
stdouts,rval = pic.runPic(opts.jar, cl)
- if os.path.exists(pdfpath): # automake thumbnail - will be added to html
+ if os.path.exists(pdfpath): # automake thumbnail - will be added to html
cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
pic.runCL(cl=cl2,output_dir=opts.outdir)
else:
@@ -663,7 +665,7 @@
# input
cl.append('INPUT=%s' % opts.input)
# outputs
- cl.append('OUTPUT=%s' % opts.output)
+ cl.append('OUTPUT=%s' % opts.output)
cl.append('METRICS_FILE=%s' % pic.metricsOut )
# remove or mark duplicates
cl.append('REMOVE_DUPLICATES=%s' % opts.remdups)
@@ -711,7 +713,7 @@
if opts.targetbed == opts.baitbed: # same file sometimes
targetf = baitf
else:
- targetf = pic.makePicInterval(opts.targetbed,targetfname)
+ targetf = pic.makePicInterval(opts.targetbed,targetfname)
cl.append('BAIT_INTERVALS=%s' % baitf)
cl.append('TARGET_INTERVALS=%s' % targetf)
cl.append('INPUT=%s' % os.path.abspath(opts.input))
@@ -725,7 +727,7 @@
sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
stf = open(pic.log_filename,'w')
tlog = None
- if opts.datatype == 'sam': # need to work with a bam
+ if opts.datatype == 'sam': # need to work with a bam
tlog,tempbam,rval = pic.samToBam(opts.input,opts.outdir)
try:
tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
@@ -740,7 +742,7 @@
print '##tlog=',tlog
stf.write(tlog)
stf.write('\n')
- sortedfile = '%s.bam' % sortedfile # samtools does that
+ sortedfile = '%s.bam' % sortedfile # samtools does that
cl.append('O=%s' % pic.metricsOut)
cl.append('TMP_DIR=%s' % opts.tmpdir)
cl.append('I=%s' % sortedfile)
@@ -752,7 +754,7 @@
if opts.bisulphite.lower() <> 'false':
cl.append('IS_BISULFITE_SEQUENCED=true')
if opts.ref <> None or opts.ref_file <> None:
- cl.append('R=%s' % ref_file_name)
+ cl.append('R=%s' % ref_file_name)
stdouts,rval = pic.runPic(opts.jar,cl)
if opts.datatype == 'sam':
pic.delme.append(tempbam)
@@ -763,6 +765,34 @@
pic.delme.append(sortedfile) # not wanted
stf.close()
pic.cleanup()
+
+####liubo added CleanSam tool####
+ elif pic.picname == 'CleanSam':
+ # input
+ cl.append('INPUT=%s' % opts.input)
+ # output
+ cl.append('OUTPUT=%s' % tempout)
+ stdouts,rval = pic.runPic(opts.jar, cl)
+ haveTempout = True
+
+ elif pic.picname == 'SortSam':
+ cl.append('I=%s' % opts.input)
+ cl.append('O=%s' % tempout)
+ cl.append('SORT_ORDER=%s' % opts.sortorder)
+ stdouts,rval = pic.runPic(opts.jar,cl)
+ haveTempout = True
+
+ elif pic.picname == 'BuildBamIndex':
+ cl.append('I=%s' % opts.input)
+ cl.append('O=%s' % tempout)
+ stdouts,rval = pic.runPic(opts.jar,cl)
+ haveTempout = True
+
+ elif pic.picname == 'SamFormatConverter':
+ cl.append('INPUT=%s' % opts.input)
+ cl.append('OUTPUT=%s' % tempout)
+ pic.runPic(opts.jar, cl)
+ haveTempout = True
elif pic.picname == "DownsampleSam":
cl.append('I=%s' % opts.input)
mystring = opts.output
@@ -775,17 +805,13 @@
stdouts,rval = pic.runPic(opts.jar, cl)
myoutput = mystringsam.replace(".sam", "")
os.rename(mystringsam,myoutput)
- elif pic.picname == 'SamFormatConverter':
- cl.append('INPUT=%s' % opts.input)
- cl.append('OUTPUT=%s' % tempout)
- pic.runPic(opts.jar, cl)
- haveTempout = True
-
+
+
else:
print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
sys.exit(1)
if haveTempout:
- # Some Picard tools produced a potentially intermediate bam file.
+ # Some Picard tools produced a potentially intermediate bam file.
# Either just move to final location or create sam
if os.path.exists(tempout):
shutil.move(tempout, os.path.abspath(opts.output))
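All of the haveTempout branches (CleanSam, SortSam, BuildBamIndex, SamFormatConverter) write to tempout and rely on this closing block to move the file into place or report failure. Roughly, with an illustrative function name:

import os
import shutil
import sys

def finalise_output(tempout, final_path, rval, stdouts):
    # move the intermediate file to its destination if it was produced;
    # on a non-zero exit code surface the tool's output and signal failure
    if os.path.exists(tempout):
        shutil.move(tempout, os.path.abspath(final_path))
    if rval != 0:
        sys.stderr.write('## exit code=%d; stdout=%s\n' % (rval, stdouts))
        sys.exit(1)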
@@ -795,4 +821,3 @@
print >> sys.stderr, '## exit code=%d; stdout=%s' % (rval,stdouts)
# signal failure
if __name__=="__main__": __main__()
-