33
+ − 1 #!/usr/bin/env python
+ − 2 """
+ − 3 Originally written by Kelly Vincent
+ − 4 pretty output and additional picard wrappers by Ross Lazarus for rgenetics
+ − 5 Runs all available wrapped Picard tools.
+ − 6 usage: picard_wrapper.py [options]
+ − 7 code Ross wrote licensed under the LGPL
+ − 8 see http://www.gnu.org/copyleft/lesser.html
+ − 9 """
+ − 10
+ − 11 import optparse, os, sys, subprocess, tempfile, shutil, time, logging
+ − 12
+ − 13 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
+ − 14 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+ − 15 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ − 16 <head>
+ − 17 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
42
+ − 18 <meta name="generator" content="Galaxy %s tool output - see http://getgalaxy.org/" />
33
+ − 19 <title></title>
+ − 20 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
+ − 21 </head>
+ − 22 <body>
+ − 23 <div class="document">
+ − 24 """
+ − 25 galhtmlattr = """Galaxy tool %s run at %s</b><br/>"""
+ − 26 galhtmlpostfix = """</div></body></html>\n"""
+ − 27
+ − 28
+ − 29 def stop_err( msg ):
+ − 30 sys.stderr.write( '%s\n' % msg )
+ − 31 sys.exit()
+ − 32
+ − 33
+ − 34 def timenow():
+ − 35 """return current time as a string
42
+ − 36 """
33
+ − 37 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
+ − 38
+ − 39
+ − 40 class PicardBase():
+ − 41 """
42
+ − 42 simple base class with some utilities for Picard
+ − 43 adapted and merged with Kelly Vincent's code april 2011 Ross
+ − 44 lots of changes...
+ − 45 """
33
+ − 46
+ − 47 def __init__(self, opts=None,arg0=None):
+ − 48 """ common stuff needed at init for a picard tool
42
+ − 49 """
33
+ − 50 assert opts <> None, 'PicardBase needs opts at init'
+ − 51 self.opts = opts
+ − 52 if self.opts.outdir == None:
+ − 53 self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir
+ − 54 assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in'
+ − 55 self.picname = self.baseName(opts.jar)
+ − 56 if self.picname.startswith('picard'):
+ − 57 self.picname = opts.picard_cmd # special case for some tools like replaceheader?
+ − 58 self.progname = self.baseName(arg0)
+ − 59 self.version = '0.002'
+ − 60 self.delme = [] # list of files to destroy
+ − 61 self.title = opts.title
+ − 62 self.inputfile = opts.input
+ − 63 try:
+ − 64 os.makedirs(opts.outdir)
+ − 65 except:
+ − 66 pass
+ − 67 try:
+ − 68 os.makedirs(opts.tmpdir)
+ − 69 except:
+ − 70 pass
+ − 71 self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
42
+ − 72 self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
33
+ − 73 self.setLogging(logfname=self.log_filename)
+ − 74
+ − 75 def baseName(self,name=None):
+ − 76 return os.path.splitext(os.path.basename(name))[0]
+ − 77
+ − 78 def setLogging(self,logfname="picard_wrapper.log"):
+ − 79 """setup a logger
42
+ − 80 """
33
+ − 81 logging.basicConfig(level=logging.INFO,
+ − 82 filename=logfname,
+ − 83 filemode='a')
+ − 84
+ − 85
+ − 86 def readLarge(self,fname=None):
+ − 87 """ read a potentially huge file.
42
+ − 88 """
33
+ − 89 try:
+ − 90 # get stderr, allowing for case where it's very large
+ − 91 tmp = open( fname, 'rb' )
+ − 92 s = ''
+ − 93 buffsize = 1048576
+ − 94 try:
+ − 95 while True:
+ − 96 more = tmp.read( buffsize )
+ − 97 if len(more) > 0:
+ − 98 s += more
+ − 99 else:
+ − 100 break
+ − 101 except OverflowError:
+ − 102 pass
+ − 103 tmp.close()
+ − 104 except Exception, e:
42
+ − 105 stop_err( 'Read Large Exception : %s' % str( e ) )
33
+ − 106 return s
+ − 107
+ − 108 def runCL(self,cl=None,output_dir=None):
+ − 109 """ construct and run a command line
42
+ − 110 we have galaxy's temp path as opt.temp_dir so don't really need isolation
+ − 111 sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
+ − 112 """
33
+ − 113 assert cl <> None, 'PicardBase runCL needs a command line as cl'
+ − 114 if output_dir == None:
+ − 115 output_dir = self.opts.outdir
+ − 116 if type(cl) == type([]):
+ − 117 cl = ' '.join(cl)
+ − 118 fd,templog = tempfile.mkstemp(dir=output_dir,suffix='rgtempRun.txt')
+ − 119 tlf = open(templog,'wb')
+ − 120 fd,temperr = tempfile.mkstemp(dir=output_dir,suffix='rgtempErr.txt')
+ − 121 tef = open(temperr,'wb')
+ − 122 process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir)
+ − 123 rval = process.wait()
+ − 124 tlf.close()
+ − 125 tef.close()
+ − 126 stderrs = self.readLarge(temperr)
42
+ − 127 stdouts = self.readLarge(templog)
33
+ − 128 if rval > 0:
+ − 129 s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs)
+ − 130 stdouts = '%s\n%s' % (stdouts,stderrs)
+ − 131 else:
+ − 132 s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval)
+ − 133 logging.info(s)
+ − 134 os.unlink(templog) # always
+ − 135 os.unlink(temperr) # always
42
+ − 136 return s, stdouts, rval # sometimes s is an output
33
+ − 137
+ − 138 def runPic(self, jar, cl):
+ − 139 """
42
+ − 140 cl should be everything after the jar file name in the command
+ − 141 """
33
+ − 142 runme = ['java -Xmx%s' % self.opts.maxjheap]
+ − 143 runme.append(" -Djava.io.tmpdir='%s' " % self.opts.tmpdir)
+ − 144 runme.append('-jar %s' % jar)
+ − 145 runme += cl
42
+ − 146
+ − 147 print runme,self.opts.outdir
+ − 148
33
+ − 149 s,stdouts,rval = self.runCL(cl=runme, output_dir=self.opts.outdir)
+ − 150 return stdouts,rval
+ − 151
+ − 152 def samToBam(self,infile=None,outdir=None):
+ − 153 """
42
+ − 154 use samtools view to convert sam to bam
+ − 155 """
33
+ − 156 fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam')
+ − 157 cl = ['samtools view -h -b -S -o ',tempbam,infile]
+ − 158 tlog,stdouts,rval = self.runCL(cl,outdir)
+ − 159 return tlog,tempbam,rval
+ − 160
+ − 161 def sortSam(self, infile=None,outfile=None,outdir=None):
+ − 162 """
42
+ − 163 """
33
+ − 164 print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir)
+ − 165 cl = ['samtools sort',infile,outfile]
+ − 166 tlog,stdouts,rval = self.runCL(cl,outdir)
+ − 167 return tlog
+ − 168
+ − 169 def cleanup(self):
+ − 170 for fname in self.delme:
+ − 171 try:
+ − 172 os.unlink(fname)
+ − 173 except:
+ − 174 pass
+ − 175
+ − 176 def prettyPicout(self,transpose,maxrows):
+ − 177 """organize picard outpouts into a report html page
42
+ − 178 """
33
+ − 179 res = []
+ − 180 try:
+ − 181 r = open(self.metricsOut,'r').readlines()
+ − 182 except:
42
+ − 183 r = []
33
+ − 184 if len(r) > 0:
+ − 185 res.append('<b>Picard on line resources</b><ul>\n')
+ − 186 res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
+ − 187 res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
+ − 188 if transpose:
42
+ − 189 res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n')
33
+ − 190 else:
42
+ − 191 res.append('<b>Picard output</b><hr/>\n')
33
+ − 192 res.append('<table cellpadding="3" >\n')
+ − 193 dat = []
+ − 194 heads = []
+ − 195 lastr = len(r) - 1
+ − 196 # special case for estimate library complexity hist
+ − 197 thist = False
+ − 198 for i,row in enumerate(r):
+ − 199 if row.strip() > '':
+ − 200 srow = row.split('\t')
+ − 201 if row.startswith('#'):
+ − 202 heads.append(row.strip()) # want strings
+ − 203 else:
+ − 204 dat.append(srow) # want lists
+ − 205 if row.startswith('## HISTOGRAM'):
+ − 206 thist = True
+ − 207 if len(heads) > 0:
+ − 208 hres = ['<tr class="d%d"><td colspan="2">%s</td></tr>' % (i % 2,x) for i,x in enumerate(heads)]
+ − 209 res += hres
+ − 210 heads = []
+ − 211 if len(dat) > 0:
+ − 212 if transpose and not thist:
+ − 213 tdat = map(None,*dat) # transpose an arbitrary list of lists
42
+ − 214 tdat = ['<tr class="d%d"><td>%s</td><td>%s </td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
33
+ − 215 else:
+ − 216 tdat = ['\t'.join(x).strip() for x in dat] # back to strings :(
+ − 217 tdat = ['<tr class="d%d"><td colspan="2">%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)]
+ − 218 res += tdat
+ − 219 dat = []
42
+ − 220 res.append('</table>\n')
33
+ − 221 return res
+ − 222
+ − 223 def fixPicardOutputs(self,transpose,maxloglines):
+ − 224 """
42
+ − 225 picard produces long hard to read tab header files
+ − 226 make them available but present them transposed for readability
+ − 227 """
33
+ − 228 logging.shutdown()
+ − 229 self.cleanup() # remove temp files stored in delme
+ − 230 rstyle="""<style type="text/css">
42
+ − 231 tr.d0 td {background-color: oldlace; color: black;}
+ − 232 tr.d1 td {background-color: aliceblue; color: black;}
+ − 233 </style>"""
33
+ − 234 res = [rstyle,]
42
+ − 235 res.append(galhtmlprefix % self.progname)
33
+ − 236 res.append(galhtmlattr % (self.picname,timenow()))
42
+ − 237 flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
33
+ − 238 pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
+ − 239 if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
+ − 240 for p in pdflist:
+ − 241 pbase = os.path.splitext(p)[0] # removes .pdf
+ − 242 imghref = '%s.jpg' % pbase
+ − 243 mimghref = '%s-0.jpg' % pbase # multiple pages pdf -> multiple thumbnails without asking!
+ − 244 if mimghref in flist:
+ − 245 imghref=mimghref # only one for thumbnail...it's a multi page pdf
+ − 246 res.append('<table cellpadding="10"><tr><td>\n')
42
+ − 247 res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref))
+ − 248 res.append('</tr></td></table>\n')
33
+ − 249 if len(flist) > 0:
+ − 250 res.append('<b>The following output files were created (click the filename to view/download a copy):</b><hr/>')
+ − 251 res.append('<table>\n')
+ − 252 for i,f in enumerate(flist):
+ − 253 fn = os.path.split(f)[-1]
+ − 254 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
42
+ − 255 res.append('</table><p/>\n')
33
+ − 256 pres = self.prettyPicout(transpose,maxloglines)
+ − 257 if len(pres) > 0:
+ − 258 res += pres
+ − 259 l = open(self.log_filename,'r').readlines()
+ − 260 llen = len(l)
42
+ − 261 if llen > 0:
+ − 262 res.append('<b>Picard Tool Run Log</b><hr/>\n')
33
+ − 263 rlog = ['<pre>',]
+ − 264 if llen > maxloglines:
+ − 265 n = min(50,int(maxloglines/2))
+ − 266 rlog += l[:n]
+ − 267 rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines))
+ − 268 rlog += l[-n:]
+ − 269 else:
+ − 270 rlog += l
+ − 271 rlog.append('</pre>')
+ − 272 if llen > maxloglines:
+ − 273 rlog.append('\n<b>## WARNING - %d log lines truncated - <a href="%s">%s</a> contains entire output</b>' % (llen - maxloglines,self.log_filename,self.log_filename))
+ − 274 res += rlog
+ − 275 else:
+ − 276 res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
42
+ − 277 res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
+ − 278 res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool')
+ − 279 res.append(galhtmlpostfix)
33
+ − 280 outf = open(self.opts.htmlout,'w')
42
+ − 281 outf.write(''.join(res))
33
+ − 282 outf.write('\n')
+ − 283 outf.close()
+ − 284
+ − 285 def makePicInterval(self,inbed=None,outf=None):
+ − 286 """
42
+ − 287 picard wants bait and target files to have the same header length as the incoming bam/sam
+ − 288 a meaningful (ie accurate) representation will fail because of this - so this hack
+ − 289 it would be far better to be able to supply the original bed untouched
+ − 290 Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list
+ − 291 """
33
+ − 292 assert inbed <> None
+ − 293 bed = open(inbed,'r').readlines()
+ − 294 sbed = [x.split('\t') for x in bed] # lengths MUST be 5
+ − 295 lens = [len(x) for x in sbed]
+ − 296 strands = [x[3] for x in sbed if not x[3] in ['+','-']]
+ − 297 maxl = max(lens)
+ − 298 minl = min(lens)
+ − 299 e = []
+ − 300 if maxl <> minl:
+ − 301 e.append("## Input error: Inconsistent field count in %s - please read the documentation on bait/target format requirements, fix and try again" % inbed)
+ − 302 if maxl <> 5:
+ − 303 e.append("## Input error: %d fields found in %s, 5 required - please read the warning and documentation on bait/target format requirements, fix and try again" % (maxl,inbed))
+ − 304 if len(strands) > 0:
+ − 305 e.append("## Input error: Fourth column in %s is not the required strand (+ or -) - please read the warning and documentation on bait/target format requirements, fix and try again" % (inbed))
+ − 306 if len(e) > 0: # write to stderr and quit
+ − 307 print >> sys.stderr, '\n'.join(e)
+ − 308 sys.exit(1)
+ − 309 thead = os.path.join(self.opts.outdir,'tempSamHead.txt')
+ − 310 if self.opts.datatype == 'sam':
+ − 311 cl = ['samtools view -H -S',self.opts.input,'>',thead]
+ − 312 else:
+ − 313 cl = ['samtools view -H',self.opts.input,'>',thead]
+ − 314 self.runCL(cl=cl,output_dir=self.opts.outdir)
+ − 315 head = open(thead,'r').readlines()
+ − 316 s = '## got %d rows of header\n' % (len(head))
+ − 317 logging.info(s)
+ − 318 o = open(outf,'w')
+ − 319 o.write(''.join(head))
+ − 320 o.write(''.join(bed))
+ − 321 o.close()
+ − 322 return outf
+ − 323
+ − 324 def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None):
+ − 325 """
42
+ − 326 interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
+ − 327 Do the work of removing all the error sequences
+ − 328 pysam is cool
+ − 329 infile = pysam.Samfile( "-", "r" )
+ − 330 outfile = pysam.Samfile( "-", "w", template = infile )
+ − 331 for s in infile: outfile.write(s)
33
+ − 332
42
+ − 333 errors from ValidateSameFile.jar look like
+ − 334 WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
+ − 335 ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
+ − 336 ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041
33
+ − 337
42
+ − 338 """
33
+ − 339 assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam
+ − 340 assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path'
+ − 341 removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2]
+ − 342 remDict = dict(zip(removeNames,range(len(removeNames))))
+ − 343 infile = pysam.Samfile(insam,'rb')
+ − 344 info = 'found %d error sequences in picardErrors, %d unique' % (len(removeNames),len(remDict))
+ − 345 if len(removeNames) > 0:
+ − 346 outfile = pysam.Samfile(newsam,'wb',template=infile) # template must be an open file
+ − 347 i = 0
+ − 348 j = 0
+ − 349 for row in infile:
+ − 350 dropme = remDict.get(row.qname,None) # keep if None
+ − 351 if not dropme:
+ − 352 outfile.write(row)
+ − 353 j += 1
+ − 354 else: # discard
+ − 355 i += 1
+ − 356 info = '%s\n%s' % (info, 'Discarded %d lines writing %d to %s from %s' % (i,j,newsam,insam))
+ − 357 outfile.close()
+ − 358 infile.close()
+ − 359 else: # we really want a nullop or a simple pointer copy
+ − 360 infile.close()
+ − 361 if newsam:
+ − 362 shutil.copy(insam,newsam)
+ − 363 logging.info(info)
+ − 364
+ − 365
+ − 366
+ − 367 def __main__():
+ − 368 doFix = False # tools returning htmlfile don't need this
+ − 369 doTranspose = True # default
42
+ − 370 maxloglines = 100 # default
33
+ − 371 #Parse Command Line
+ − 372 op = optparse.OptionParser()
+ − 373 # All tools
+ − 374 op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' )
+ − 375 op.add_option('-e', '--inputext', default=None)
+ − 376 op.add_option('-o', '--output', default=None)
+ − 377 op.add_option('-n', '--title', default="Pick a Picard Tool")
+ − 378 op.add_option('-t', '--htmlout', default=None)
+ − 379 op.add_option('-d', '--outdir', default=None)
42
+ − 380 op.add_option('-x', '--maxjheap', default='3000m')
33
+ − 381 op.add_option('-b', '--bisulphite', default='false')
42
+ − 382 op.add_option('-s', '--sortorder', default='query')
33
+ − 383 op.add_option('','--tmpdir', default='/tmp')
42
+ − 384 op.add_option('-j','--jar',default='')
+ − 385 op.add_option('','--picard-cmd',default=None)
33
+ − 386 # Many tools
+ − 387 op.add_option( '', '--output-format', dest='output_format', help='Output format' )
+ − 388 op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' )
+ − 389 op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None )
+ − 390 # CreateSequenceDictionary
+ − 391 op.add_option( '', '--ref-file', dest='ref_file', help='Fasta to use as reference', default=None )
+ − 392 op.add_option( '', '--species-name', dest='species_name', help='Species name to use in creating dict file from fasta file' )
+ − 393 op.add_option( '', '--build-name', dest='build_name', help='Name of genome assembly to use in creating dict file from fasta file' )
+ − 394 op.add_option( '', '--trunc-names', dest='trunc_names', help='Truncate sequence names at first whitespace from fasta file' )
+ − 395 # MarkDuplicates
+ − 396 op.add_option( '', '--remdups', default='true', help='Remove duplicates from output file' )
+ − 397 op.add_option( '', '--optdupdist', default="100", help='Maximum pixels between two identical sequences in order to consider them optical duplicates.' )
+ − 398 # CollectInsertSizeMetrics
+ − 399 op.add_option('', '--taillimit', default="0")
+ − 400 op.add_option('', '--histwidth', default="0")
+ − 401 op.add_option('', '--minpct', default="0.01")
+ − 402 op.add_option('', '--malevel', default='')
+ − 403 op.add_option('', '--deviations', default="0.0")
+ − 404 # CollectAlignmentSummaryMetrics
+ − 405 op.add_option('', '--maxinsert', default="20")
+ − 406 op.add_option('', '--adaptors', default='')
+ − 407 # FixMateInformation and validate
+ − 408 # CollectGcBiasMetrics
+ − 409 op.add_option('', '--windowsize', default='100')
42
+ − 410 op.add_option('', '--mingenomefrac', default='0.00001')
33
+ − 411 # AddOrReplaceReadGroups
+ − 412 op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
+ − 413 op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
+ − 414 op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' )
+ − 415 op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. run barcode) ' )
+ − 416 op.add_option( '', '--rg-sm', dest='rg_sample', help='Read Group sample name' )
+ − 417 op.add_option( '', '--rg-id', dest='rg_id', help='Read Group ID' )
+ − 418 op.add_option( '', '--rg-cn', dest='rg_seq_center', help='Read Group sequencing center name' )
+ − 419 op.add_option( '', '--rg-ds', dest='rg_desc', help='Read Group description' )
+ − 420 # ReorderSam
+ − 421 op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' )
+ − 422 op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' )
+ − 423 # ReplaceSamHeader
+ − 424 op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' )
+ − 425
42
+ − 426 op.add_option('','--assumesorted', default='true')
33
+ − 427 op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*")
+ − 428 #estimatelibrarycomplexity
+ − 429 op.add_option('','--minid', default="5")
+ − 430 op.add_option('','--maxdiff', default="0.03")
+ − 431 op.add_option('','--minmeanq', default="20")
+ − 432 #hsmetrics
+ − 433 op.add_option('','--baitbed', default=None)
+ − 434 op.add_option('','--targetbed', default=None)
+ − 435 #validate
+ − 436 op.add_option('','--ignoreflags', action='append', type="string")
+ − 437 op.add_option('','--maxerrors', default=None)
+ − 438 op.add_option('','--datatype', default=None)
+ − 439 op.add_option('','--bamout', default=None)
+ − 440 op.add_option('','--samout', default=None)
+ − 441 #downsample
+ − 442 op.add_option('','--probability', default="1")
+ − 443 op.add_option('','--seed', default="1")
+ − 444
+ − 445 opts, args = op.parse_args()
+ − 446 opts.sortme = opts.assumesorted == 'false'
+ − 447 assert opts.input <> None
+ − 448 # need to add
+ − 449 # instance that does all the work
+ − 450 pic = PicardBase(opts,sys.argv[0])
+ − 451
+ − 452 tmp_dir = opts.outdir
+ − 453 haveTempout = False # we use this where sam output is an option
+ − 454 rval = 0
+ − 455 stdouts = 'Not run yet'
+ − 456 # set ref and dict files to use (create if necessary)
+ − 457 ref_file_name = opts.ref
+ − 458 if opts.ref_file <> None:
+ − 459 csd = 'CreateSequenceDictionary'
+ − 460 realjarpath = os.path.split(opts.jar)[0]
+ − 461 jarpath = os.path.join(realjarpath,'%s.jar' % csd) # for refseq
+ − 462 tmp_ref_fd, tmp_ref_name = tempfile.mkstemp( dir=opts.tmpdir , prefix = pic.picname)
+ − 463 ref_file_name = '%s.fasta' % tmp_ref_name
+ − 464 # build dict
+ − 465 dict_file_name = '%s.dict' % tmp_ref_name
+ − 466 os.symlink( opts.ref_file, ref_file_name )
+ − 467 cl = ['REFERENCE=%s' % ref_file_name]
+ − 468 cl.append('OUTPUT=%s' % dict_file_name)
+ − 469 cl.append('URI=%s' % os.path.basename( opts.ref_file ))
+ − 470 cl.append('TRUNCATE_NAMES_AT_WHITESPACE=%s' % opts.trunc_names)
+ − 471 if opts.species_name:
+ − 472 cl.append('SPECIES=%s' % opts.species_name)
+ − 473 if opts.build_name:
+ − 474 cl.append('GENOME_ASSEMBLY=%s' % opts.build_name)
+ − 475 pic.delme.append(dict_file_name)
+ − 476 pic.delme.append(ref_file_name)
+ − 477 pic.delme.append(tmp_ref_name)
+ − 478 stdouts,rval = pic.runPic(jarpath, cl)
+ − 479 # run relevant command(s)
+ − 480
+ − 481 # define temporary output
+ − 482 # if output is sam, it must have that extension, otherwise bam will be produced
+ − 483 # specify sam or bam file with extension
+ − 484 if opts.output_format == 'sam':
+ − 485 suff = '.sam'
+ − 486 else:
+ − 487 suff = ''
+ − 488 tmp_fd, tempout = tempfile.mkstemp( dir=opts.tmpdir, suffix=suff )
+ − 489
+ − 490 cl = ['VALIDATION_STRINGENCY=LENIENT',]
+ − 491
+ − 492 if pic.picname == 'AddOrReplaceReadGroups':
+ − 493 # sort order to match Galaxy's default
+ − 494 cl.append('SORT_ORDER=coordinate')
+ − 495 # input
+ − 496 cl.append('INPUT=%s' % opts.input)
+ − 497 # outputs
+ − 498 cl.append('OUTPUT=%s' % tempout)
+ − 499 # required read groups
+ − 500 cl.append('RGLB="%s"' % opts.rg_library)
+ − 501 cl.append('RGPL="%s"' % opts.rg_platform)
+ − 502 cl.append('RGPU="%s"' % opts.rg_plat_unit)
+ − 503 cl.append('RGSM="%s"' % opts.rg_sample)
+ − 504 if opts.rg_id:
+ − 505 cl.append('RGID="%s"' % opts.rg_id)
+ − 506 # optional read groups
+ − 507 if opts.rg_seq_center:
+ − 508 cl.append('RGCN="%s"' % opts.rg_seq_center)
+ − 509 if opts.rg_desc:
+ − 510 cl.append('RGDS="%s"' % opts.rg_desc)
+ − 511 stdouts,rval = pic.runPic(opts.jar, cl)
+ − 512 haveTempout = True
+ − 513
+ − 514 elif pic.picname == 'BamIndexStats':
+ − 515 tmp_fd, tmp_name = tempfile.mkstemp( dir=tmp_dir )
+ − 516 tmp_bam_name = '%s.bam' % tmp_name
+ − 517 tmp_bai_name = '%s.bai' % tmp_bam_name
+ − 518 os.symlink( opts.input, tmp_bam_name )
+ − 519 os.symlink( opts.bai_file, tmp_bai_name )
+ − 520 cl.append('INPUT=%s' % ( tmp_bam_name ))
+ − 521 pic.delme.append(tmp_bam_name)
+ − 522 pic.delme.append(tmp_bai_name)
+ − 523 pic.delme.append(tmp_name)
+ − 524 stdouts,rval = pic.runPic( opts.jar, cl )
+ − 525 f = open(pic.metricsOut,'a')
+ − 526 f.write(stdouts) # got this on stdout from runCl
+ − 527 f.write('\n')
+ − 528 f.close()
+ − 529 doTranspose = False # but not transposed
+ − 530
+ − 531 elif pic.picname == 'EstimateLibraryComplexity':
+ − 532 cl.append('I=%s' % opts.input)
+ − 533 cl.append('O=%s' % pic.metricsOut)
+ − 534 if float(opts.minid) > 0:
+ − 535 cl.append('MIN_IDENTICAL_BASES=%s' % opts.minid)
+ − 536 if float(opts.maxdiff) > 0.0:
+ − 537 cl.append('MAX_DIFF_RATE=%s' % opts.maxdiff)
+ − 538 if float(opts.minmeanq) > 0:
+ − 539 cl.append('MIN_MEAN_QUALITY=%s' % opts.minmeanq)
+ − 540 if opts.readregex > '':
+ − 541 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
+ − 542 if float(opts.optdupdist) > 0:
+ − 543 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
+ − 544 stdouts,rval = pic.runPic(opts.jar, cl)
+ − 545
+ − 546 elif pic.picname == 'CollectAlignmentSummaryMetrics':
42
+ − 547 # Why do we do this fakefasta thing?
33
+ − 548 # Because we need NO fai to be available or picard barfs unless it matches the input data.
+ − 549 # why? Dunno Seems to work without complaining if the .bai file is AWOL....
+ − 550 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
+ − 551 try:
+ − 552 os.symlink(ref_file_name,fakefasta)
+ − 553 except:
+ − 554 s = '## unable to symlink %s to %s - different devices? Will shutil.copy'
+ − 555 info = s
+ − 556 shutil.copy(ref_file_name,fakefasta)
+ − 557 pic.delme.append(fakefasta)
+ − 558 cl.append('ASSUME_SORTED=true')
+ − 559 adaptlist = opts.adaptors.split(',')
+ − 560 adaptorseqs = ['ADAPTER_SEQUENCE=%s' % x for x in adaptlist]
+ − 561 cl += adaptorseqs
+ − 562 cl.append('IS_BISULFITE_SEQUENCED=%s' % opts.bisulphite)
+ − 563 cl.append('MAX_INSERT_SIZE=%s' % opts.maxinsert)
+ − 564 cl.append('OUTPUT=%s' % pic.metricsOut)
+ − 565 cl.append('R=%s' % fakefasta)
+ − 566 cl.append('TMP_DIR=%s' % opts.tmpdir)
+ − 567 if not opts.assumesorted.lower() == 'true': # we need to sort input
+ − 568 sortedfile = '%s.sorted' % os.path.basename(opts.input)
42
+ − 569 if opts.datatype == 'sam': # need to work with a bam
33
+ − 570 tlog,tempbam,trval = pic.samToBam(opts.input,opts.outdir)
+ − 571 pic.delme.append(tempbam)
+ − 572 try:
+ − 573 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
+ − 574 except:
+ − 575 print '## exception on sorting sam file %s' % opts.input
+ − 576 else: # is already bam
+ − 577 try:
+ − 578 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
+ − 579 except : # bug - [bam_sort_core] not being ignored - TODO fixme
+ − 580 print '## exception %s on sorting bam file %s' % (sys.exc_info()[0],opts.input)
+ − 581 cl.append('INPUT=%s.bam' % os.path.abspath(os.path.join(opts.outdir,sortedfile)))
+ − 582 pic.delme.append(os.path.join(opts.outdir,sortedfile))
+ − 583 else:
42
+ − 584 cl.append('INPUT=%s' % os.path.abspath(opts.input))
33
+ − 585 stdouts,rval = pic.runPic(opts.jar, cl)
+ − 586
+ − 587
+ − 588 elif pic.picname == 'CollectGcBiasMetrics':
+ − 589 assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name
+ − 590 # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
42
+ − 591 # why? Dunno
33
+ − 592 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
+ − 593 try:
+ − 594 os.symlink(ref_file_name,fakefasta)
+ − 595 except:
+ − 596 s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy'
+ − 597 info = s
+ − 598 shutil.copy(ref_file_name,fakefasta)
+ − 599 pic.delme.append(fakefasta)
+ − 600 x = 'rgPicardGCBiasMetrics'
+ − 601 pdfname = '%s.pdf' % x
+ − 602 jpgname = '%s.jpg' % x
+ − 603 tempout = os.path.join(opts.outdir,'rgPicardGCBiasMetrics.out')
+ − 604 temppdf = os.path.join(opts.outdir,pdfname)
+ − 605 cl.append('R=%s' % fakefasta)
+ − 606 cl.append('WINDOW_SIZE=%s' % opts.windowsize)
+ − 607 cl.append('MINIMUM_GENOME_FRACTION=%s' % opts.mingenomefrac)
+ − 608 cl.append('INPUT=%s' % opts.input)
+ − 609 cl.append('OUTPUT=%s' % tempout)
+ − 610 cl.append('TMP_DIR=%s' % opts.tmpdir)
+ − 611 cl.append('CHART_OUTPUT=%s' % temppdf)
+ − 612 cl.append('SUMMARY_OUTPUT=%s' % pic.metricsOut)
+ − 613 stdouts,rval = pic.runPic(opts.jar, cl)
+ − 614 if os.path.isfile(temppdf):
+ − 615 cl2 = ['convert','-resize x400',temppdf,os.path.join(opts.outdir,jpgname)] # make the jpg for fixPicardOutputs to find
+ − 616 s,stdouts,rval = pic.runCL(cl=cl2,output_dir=opts.outdir)
+ − 617 else:
+ − 618 s='### runGC: Unable to find pdf %s - please check the log for the causal problem\n' % temppdf
+ − 619 lf = open(pic.log_filename,'a')
+ − 620 lf.write(s)
+ − 621 lf.write('\n')
+ − 622 lf.close()
+ − 623
+ − 624 elif pic.picname == 'CollectInsertSizeMetrics':
+ − 625 """ <command interpreter="python">
42
+ − 626 picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
+ − 627 --histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel"
+ − 628 -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
+ − 629 </command>
+ − 630 """
33
+ − 631 isPDF = 'InsertSizeHist.pdf'
+ − 632 pdfpath = os.path.join(opts.outdir,isPDF)
+ − 633 histpdf = 'InsertSizeHist.pdf'
+ − 634 cl.append('I=%s' % opts.input)
+ − 635 cl.append('O=%s' % pic.metricsOut)
+ − 636 cl.append('HISTOGRAM_FILE=%s' % histpdf)
+ − 637 #if opts.taillimit <> '0': # this was deprecated although still mentioned in the docs at 1.56
42
+ − 638 # cl.append('TAIL_LIMIT=%s' % opts.taillimit)
+ − 639 if opts.histwidth <> '0':
33
+ − 640 cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth)
+ − 641 if float( opts.minpct) > 0.0:
+ − 642 cl.append('MINIMUM_PCT=%s' % opts.minpct)
+ − 643 if float(opts.deviations) > 0.0:
+ − 644 cl.append('DEVIATIONS=%s' % opts.deviations)
+ − 645 if opts.malevel:
+ − 646 malists = opts.malevel.split(',')
+ − 647 malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists]
+ − 648 cl += malist
+ − 649 stdouts,rval = pic.runPic(opts.jar, cl)
42
+ − 650 if os.path.exists(pdfpath): # automake thumbnail - will be added to html
33
+ − 651 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
+ − 652 pic.runCL(cl=cl2,output_dir=opts.outdir)
+ − 653 else:
+ − 654 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
+ − 655 s += 'This <b>always happens if single ended data was provided</b> to this tool,\n'
+ − 656 s += 'so please double check that your input data really is paired-end NGS data.<br/>\n'
+ − 657 s += 'If your input was paired data this may be a bug worth reporting to the galaxy-bugs list\n<br/>'
+ − 658 logging.info(s)
+ − 659 if len(stdouts) > 0:
+ − 660 logging.info(stdouts)
+ − 661
+ − 662 elif pic.picname == 'MarkDuplicates':
+ − 663 # assume sorted even if header says otherwise
+ − 664 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
+ − 665 # input
+ − 666 cl.append('INPUT=%s' % opts.input)
+ − 667 # outputs
42
+ − 668 cl.append('OUTPUT=%s' % opts.output)
33
+ − 669 cl.append('METRICS_FILE=%s' % pic.metricsOut )
+ − 670 # remove or mark duplicates
+ − 671 cl.append('REMOVE_DUPLICATES=%s' % opts.remdups)
+ − 672 # the regular expression to be used to parse reads in incoming SAM file
+ − 673 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
+ − 674 # maximum offset between two duplicate clusters
+ − 675 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
+ − 676 stdouts,rval = pic.runPic(opts.jar, cl)
+ − 677
+ − 678 elif pic.picname == 'FixMateInformation':
+ − 679 cl.append('I=%s' % opts.input)
+ − 680 cl.append('O=%s' % tempout)
+ − 681 cl.append('SORT_ORDER=%s' % opts.sortorder)
+ − 682 stdouts,rval = pic.runPic(opts.jar,cl)
+ − 683 haveTempout = True
+ − 684
+ − 685 elif pic.picname == 'ReorderSam':
+ − 686 # input
+ − 687 cl.append('INPUT=%s' % opts.input)
+ − 688 # output
+ − 689 cl.append('OUTPUT=%s' % tempout)
+ − 690 # reference
+ − 691 cl.append('REFERENCE=%s' % ref_file_name)
+ − 692 # incomplete dict concordance
+ − 693 if opts.allow_inc_dict_concord == 'true':
+ − 694 cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true')
+ − 695 # contig length discordance
+ − 696 if opts.allow_contig_len_discord == 'true':
+ − 697 cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true')
+ − 698 stdouts,rval = pic.runPic(opts.jar, cl)
+ − 699 haveTempout = True
+ − 700
+ − 701 elif pic.picname == 'ReplaceSamHeader':
+ − 702 cl.append('INPUT=%s' % opts.input)
+ − 703 cl.append('OUTPUT=%s' % tempout)
+ − 704 cl.append('HEADER=%s' % opts.header_file)
+ − 705 stdouts,rval = pic.runPic(opts.jar, cl)
+ − 706 haveTempout = True
+ − 707
+ − 708 elif pic.picname == 'CalculateHsMetrics':
+ − 709 maxloglines = 100
+ − 710 baitfname = os.path.join(opts.outdir,'rgPicardHsMetrics.bait')
+ − 711 targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target')
+ − 712 baitf = pic.makePicInterval(opts.baitbed,baitfname)
+ − 713 if opts.targetbed == opts.baitbed: # same file sometimes
+ − 714 targetf = baitf
+ − 715 else:
42
+ − 716 targetf = pic.makePicInterval(opts.targetbed,targetfname)
33
+ − 717 cl.append('BAIT_INTERVALS=%s' % baitf)
+ − 718 cl.append('TARGET_INTERVALS=%s' % targetf)
+ − 719 cl.append('INPUT=%s' % os.path.abspath(opts.input))
+ − 720 cl.append('OUTPUT=%s' % pic.metricsOut)
+ − 721 cl.append('TMP_DIR=%s' % opts.tmpdir)
+ − 722 stdouts,rval = pic.runPic(opts.jar,cl)
+ − 723
+ − 724 elif pic.picname == 'ValidateSamFile':
+ − 725 import pysam
+ − 726 doTranspose = False
+ − 727 sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
+ − 728 stf = open(pic.log_filename,'w')
+ − 729 tlog = None
42
+ − 730 if opts.datatype == 'sam': # need to work with a bam
33
+ − 731 tlog,tempbam,rval = pic.samToBam(opts.input,opts.outdir)
+ − 732 try:
+ − 733 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
+ − 734 except:
+ − 735 print '## exception on sorting sam file %s' % opts.input
+ − 736 else: # is already bam
+ − 737 try:
+ − 738 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
+ − 739 except: # bug - [bam_sort_core] not being ignored - TODO fixme
+ − 740 print '## exception on sorting bam file %s' % opts.input
+ − 741 if tlog:
+ − 742 print '##tlog=',tlog
+ − 743 stf.write(tlog)
+ − 744 stf.write('\n')
42
+ − 745 sortedfile = '%s.bam' % sortedfile # samtools does that
33
+ − 746 cl.append('O=%s' % pic.metricsOut)
+ − 747 cl.append('TMP_DIR=%s' % opts.tmpdir)
+ − 748 cl.append('I=%s' % sortedfile)
+ − 749 opts.maxerrors = '99999999'
+ − 750 cl.append('MAX_OUTPUT=%s' % opts.maxerrors)
+ − 751 if opts.ignoreflags[0] <> 'None': # picard error values to ignore
+ − 752 igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None']
+ − 753 cl.append(' '.join(igs))
+ − 754 if opts.bisulphite.lower() <> 'false':
+ − 755 cl.append('IS_BISULFITE_SEQUENCED=true')
+ − 756 if opts.ref <> None or opts.ref_file <> None:
42
+ − 757 cl.append('R=%s' % ref_file_name)
33
+ − 758 stdouts,rval = pic.runPic(opts.jar,cl)
+ − 759 if opts.datatype == 'sam':
+ − 760 pic.delme.append(tempbam)
+ − 761 newsam = opts.output
+ − 762 outformat = 'bam'
+ − 763 pe = open(pic.metricsOut,'r').readlines()
+ − 764 pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat)
+ − 765 pic.delme.append(sortedfile) # not wanted
+ − 766 stf.close()
+ − 767 pic.cleanup()
42
+ − 768
+ − 769 ####liubo added CleanSam tool####
+ − 770 elif pic.picname == 'CleanSam':
+ − 771 # input
+ − 772 cl.append('INPUT=%s' % opts.input)
+ − 773 # output
+ − 774 cl.append('OUTPUT=%s' % tempout)
+ − 775 stdouts,rval = pic.runPic(opts.jar, cl)
+ − 776 haveTempout = True
+ − 777
+ − 778 elif pic.picname == 'SortSam':
+ − 779 cl.append('I=%s' % opts.input)
+ − 780 cl.append('O=%s' % tempout)
+ − 781 cl.append('SORT_ORDER=%s' % opts.sortorder)
+ − 782 stdouts,rval = pic.runPic(opts.jar,cl)
+ − 783 haveTempout = True
+ − 784
+ − 785 elif pic.picname == 'BuildBamIndex':
+ − 786 cl.append('I=%s' % opts.input)
+ − 787 cl.append('O=%s' % tempout)
+ − 788 stdouts,rval = pic.runPic(opts.jar,cl)
+ − 789 haveTempout = True
+ − 790
+ − 791 elif pic.picname == 'SamFormatConverter':
+ − 792 cl.append('INPUT=%s' % opts.input)
+ − 793 cl.append('OUTPUT=%s' % tempout)
+ − 794 pic.runPic(opts.jar, cl)
+ − 795 haveTempout = True
33
+ − 796 elif pic.picname == "DownsampleSam":
+ − 797 cl.append('I=%s' % opts.input)
+ − 798 mystring = opts.output
+ − 799 mystringsam = mystring + ".sam"
+ − 800 cl.append('O=%s' % mystringsam)
+ − 801 if float(opts.probability) > 0:
+ − 802 cl.append('PROBABILITY=%s' % opts.probability)
+ − 803 if float(opts.seed) > 0:
+ − 804 cl.append('RANDOM_SEED=%s' % opts.seed)
+ − 805 stdouts,rval = pic.runPic(opts.jar, cl)
+ − 806 myoutput = mystringsam.replace(".sam", "")
+ − 807 os.rename(mystringsam,myoutput)
42
+ − 808
+ − 809
33
+ − 810 else:
+ − 811 print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
+ − 812 sys.exit(1)
+ − 813 if haveTempout:
42
+ − 814 # Some Picard tools produced a potentially intermediate bam file.
33
+ − 815 # Either just move to final location or create sam
+ − 816 if os.path.exists(tempout):
+ − 817 shutil.move(tempout, os.path.abspath(opts.output))
+ − 818 if opts.htmlout <> None or doFix: # return a pretty html page
+ − 819 pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines)
+ − 820 if rval <> 0:
+ − 821 print >> sys.stderr, '## exit code=%d; stdout=%s' % (rval,stdouts)
+ − 822 # signal failure
+ − 823 if __name__=="__main__": __main__()