comparison picard_wrapper.py @ 133:c127b3314d53 draft

Uploaded by devteam on Wed, 26 Feb 2014 02:11:58 -0500
Comparison of 132:b2dd3911c3f5 with 133:c127b3314d53
1 #!/usr/bin/env python
2 """
3 Originally written by Kelly Vincent
4 pretty output and additional picard wrappers by Ross Lazarus for rgenetics
5 Runs all available wrapped Picard tools.
6 usage: picard_wrapper.py [options]
7 code Ross wrote licensed under the LGPL
8 see http://www.gnu.org/copyleft/lesser.html
9 """
10
11 import optparse, os, sys, subprocess, tempfile, shutil, time, logging
12
13 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
14 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
15 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
16 <head>
17 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
18 <meta name="generator" content="Bluefish 2.2.4" />
19 <title></title>
20 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
21 </head>
22 <body>
23 <div class="document">
24 """
25 galhtmlattr = """<b>Galaxy tool %s run at %s</b><br/>"""
26 galhtmlpostfix = """</div></body></html>\n"""
27
28
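# Illustrative note (added commentary, not part of the original tool): fixPicardOutputs() below
# assembles the report page from these fragments, roughly as
#   page = [rstyle, galhtmlprefix, galhtmlattr % (picname, timenow())] + output_rows + [galhtmlpostfix]
#   open(htmlout, 'w').write(''.join(page))
# where picname and htmlout are taken from the wrapper's options.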
29 def stop_err( msg ):
30 sys.stderr.write( '%s\n' % msg )
31 sys.exit(1)
32
33
34 def timenow():
35 """return current time as a string
36 """
37 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
38
39
40 class PicardBase():
41 """
42 simple base class with some utilities for Picard
43 adapted and merged with Kelly Vincent's code april 2011 Ross
44 lots of changes...
45 """
46
47 def __init__(self, opts=None,arg0=None):
48 """ common stuff needed at init for a picard tool
49 """
50 assert opts <> None, 'PicardBase needs opts at init'
51 self.opts = opts
52 if self.opts.outdir == None:
53 self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir
54 assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in'
55 self.picname = self.baseName(opts.jar)
56 if self.picname.startswith('picard'):
57 self.picname = opts.picard_cmd # special case for some tools like replaceheader?
58 self.progname = self.baseName(arg0)
59 self.version = '0.002'
60 self.delme = [] # list of files to destroy
61 self.title = opts.title
62 self.inputfile = opts.input
63 try:
64 os.makedirs(opts.outdir)
65 except:
66 pass
67 try:
68 os.makedirs(opts.tmpdir)
69 except:
70 pass
71 self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
72 self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
73 self.setLogging(logfname=self.log_filename)
74
75 def baseName(self,name=None):
76 return os.path.splitext(os.path.basename(name))[0]
77
78 def setLogging(self,logfname="picard_wrapper.log"):
79 """setup a logger
80 """
81 logging.basicConfig(level=logging.INFO,
82 filename=logfname,
83 filemode='a')
84
85
86 def readLarge(self,fname=None):
87 """ read a potentially huge file.
88 """
89 try:
90 # get stderr, allowing for case where it's very large
91 tmp = open( fname, 'rb' )
92 s = ''
93 buffsize = 1048576
94 try:
95 while True:
96 more = tmp.read( buffsize )
97 if len(more) > 0:
98 s += more
99 else:
100 break
101 except OverflowError:
102 pass
103 tmp.close()
104 except Exception, e:
105 stop_err( 'Read Large Exception : %s' % str( e ) )
106 return s
107
108 def runCL(self,cl=None,output_dir=None):
109 """ construct and run a command line
110 we have galaxy's temp path as opt.temp_dir so don't really need isolation
111 sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
112 """
113 assert cl <> None, 'PicardBase runCL needs a command line as cl'
114 if output_dir == None:
115 output_dir = self.opts.outdir
116 if type(cl) == type([]):
117 cl = ' '.join(cl)
118 fd,templog = tempfile.mkstemp(dir=output_dir,suffix='rgtempRun.txt')
119 tlf = open(templog,'wb')
120 fd,temperr = tempfile.mkstemp(dir=output_dir,suffix='rgtempErr.txt')
121 tef = open(temperr,'wb')
122 process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir)
123 rval = process.wait()
124 tlf.close()
125 tef.close()
126 stderrs = self.readLarge(temperr)
127 stdouts = self.readLarge(templog)
128 if rval > 0:
129 s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs)
130 stdouts = '%s\n%s' % (stdouts,stderrs)
131 else:
132 s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval)
133 logging.info(s)
134 os.unlink(templog) # always
135 os.unlink(temperr) # always
136 return s, stdouts, rval # sometimes s is an output
137
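# Usage sketch for runCL() (illustrative only; header_txt is a made-up name - compare the real call
# in makePicInterval() below):
#   s, stdouts, rval = self.runCL(cl=['samtools view -H', self.opts.input, '>', header_txt],
#                                 output_dir=self.opts.outdir)
# The list is joined into a single shell command, stdout/stderr are captured to temp files, and the
# caller gets back a one-line summary, the full stdout text and the process return code.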
138 def runPic(self, jar, cl):
139 """
140 cl should be everything after the jar file name in the command
141 """
142 runme = ['java -Xmx%s' % self.opts.maxjheap]
143 runme.append(" -Djava.io.tmpdir='%s' " % self.opts.tmpdir)
144 runme.append('-jar %s' % jar)
145 runme += cl
146
147 print runme,self.opts.outdir
148
149 s,stdouts,rval = self.runCL(cl=runme, output_dir=self.opts.outdir)
150 return stdouts,rval
151
152 def samToBam(self,infile=None,outdir=None):
153 """
154 use samtools view to convert sam to bam
155 """
156 fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam')
157 cl = ['samtools view -h -b -S -o ',tempbam,infile]
158 tlog,stdouts,rval = self.runCL(cl,outdir)
159 return tlog,tempbam,rval
160
161 def sortSam(self, infile=None,outfile=None,outdir=None):
162 """
163 """
164 print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir)
165 cl = ['samtools sort',infile,outfile]
166 tlog,stdouts,rval = self.runCL(cl,outdir)
167 return tlog
168
169 def cleanup(self):
170 for fname in self.delme:
171 try:
172 os.unlink(fname)
173 except:
174 pass
175
176 def prettyPicout(self,transpose,maxrows):
177 """organize picard outpouts into a report html page
178 """
179 res = []
180 try:
181 r = open(self.metricsOut,'r').readlines()
182 except:
183 r = []
184 if len(r) > 0:
185 res.append('<b>Picard online resources</b><ul>\n')
186 res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
187 res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
188 if transpose:
189 res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n')
190 else:
191 res.append('<b>Picard output</b><hr/>\n')
192 res.append('<table cellpadding="3" >\n')
193 dat = []
194 heads = []
195 lastr = len(r) - 1
196 # special case for estimate library complexity hist
197 thist = False
198 for i,row in enumerate(r):
199 if row.strip() > '':
200 srow = row.split('\t')
201 if row.startswith('#'):
202 heads.append(row.strip()) # want strings
203 else:
204 dat.append(srow) # want lists
205 if row.startswith('## HISTOGRAM'):
206 thist = True
207 if len(heads) > 0:
208 hres = ['<tr class="d%d"><td colspan="2">%s</td></tr>' % (i % 2,x) for i,x in enumerate(heads)]
209 res += hres
210 heads = []
211 if len(dat) > 0:
212 if transpose and not thist:
213 tdat = map(None,*dat) # transpose an arbitrary list of lists
214 tdat = ['<tr class="d%d"><td>%s</td><td>%s&nbsp;</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
215 else:
216 tdat = ['\t'.join(x).strip() for x in dat] # back to strings :(
217 tdat = ['<tr class="d%d"><td colspan="2">%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)]
218 res += tdat
219 dat = []
220 res.append('</table>\n')
221 return res
222
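# Note on the transpose in prettyPicout() above: map(None, *dat) is the Python 2 idiom for
# transposing a list of rows (short rows are padded with None), e.g.
#   map(None, *[['a', '1'], ['b', '2']])   ->   [('a', 'b'), ('1', '2')]
# itertools.izip_longest(*dat) would be the more explicit spelling of the same thing.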
223 def fixPicardOutputs(self,transpose,maxloglines):
224 """
225 picard produces long hard to read tab header files
226 make them available but present them transposed for readability
227 """
228 logging.shutdown()
229 self.cleanup() # remove temp files stored in delme
230 rstyle="""<style type="text/css">
231 tr.d0 td {background-color: oldlace; color: black;}
232 tr.d1 td {background-color: aliceblue; color: black;}
233 </style>"""
234 res = [rstyle,]
235 res.append(galhtmlprefix) # galhtmlprefix has no %s placeholder, so interpolating self.progname here would raise TypeError
236 res.append(galhtmlattr % (self.picname,timenow()))
237 flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
238 pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
239 if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
240 for p in pdflist:
241 pbase = os.path.splitext(p)[0] # removes .pdf
242 imghref = '%s.jpg' % pbase
243 mimghref = '%s-0.jpg' % pbase # multiple pages pdf -> multiple thumbnails without asking!
244 if mimghref in flist:
245 imghref=mimghref # only one for thumbnail...it's a multi page pdf
246 res.append('<table cellpadding="10"><tr><td>\n')
247 res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref))
248 res.append('</td></tr></table>\n')
249 if len(flist) > 0:
250 res.append('<b>The following output files were created (click the filename to view/download a copy):</b><hr/>')
251 res.append('<table>\n')
252 for i,f in enumerate(flist):
253 fn = os.path.split(f)[-1]
254 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
255 res.append('</table><p/>\n')
256 pres = self.prettyPicout(transpose,maxloglines)
257 if len(pres) > 0:
258 res += pres
259 l = open(self.log_filename,'r').readlines()
260 llen = len(l)
261 if llen > 0:
262 res.append('<b>Picard Tool Run Log</b><hr/>\n')
263 rlog = ['<pre>',]
264 if llen > maxloglines:
265 n = min(50,int(maxloglines/2))
266 rlog += l[:n]
267 rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines))
268 rlog += l[-n:]
269 else:
270 rlog += l
271 rlog.append('</pre>')
272 if llen > maxloglines:
273 rlog.append('\n<b>## WARNING - %d log lines truncated - <a href="%s">%s</a> contains entire output</b>' % (llen - maxloglines,self.log_filename,self.log_filename))
274 res += rlog
275 else:
276 res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
277 res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
278 res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool')
279 res.append(galhtmlpostfix)
280 outf = open(self.opts.htmlout,'w')
281 outf.write(''.join(res))
282 outf.write('\n')
283 outf.close()
284
285 def makePicInterval(self,inbed=None,outf=None):
286 """
287 picard wants bait and target files to have the same header length as the incoming bam/sam
288 a meaningful (ie accurate) representation will fail because of this - so this hack
289 it would be far better to be able to supply the original bed untouched
290 Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list
291 """
292 assert inbed <> None
293 bed = open(inbed,'r').readlines()
294 sbed = [x.split('\t') for x in bed] # lengths MUST be 5
295 lens = [len(x) for x in sbed]
296 strands = [x[3] for x in sbed if len(x) > 3 and not x[3] in ['+','-']] # guard short rows so the field-count check below reports them cleanly
297 maxl = max(lens)
298 minl = min(lens)
299 e = []
300 if maxl <> minl:
301 e.append("## Input error: Inconsistent field count in %s - please read the documentation on bait/target format requirements, fix and try again" % inbed)
302 if maxl <> 5:
303 e.append("## Input error: %d fields found in %s, 5 required - please read the warning and documentation on bait/target format requirements, fix and try again" % (maxl,inbed))
304 if len(strands) > 0:
305 e.append("## Input error: Fourth column in %s is not the required strand (+ or -) - please read the warning and documentation on bait/target format requirements, fix and try again" % (inbed))
306 if len(e) > 0: # write to stderr and quit
307 print >> sys.stderr, '\n'.join(e)
308 sys.exit(1)
309 thead = os.path.join(self.opts.outdir,'tempSamHead.txt')
310 if self.opts.datatype == 'sam':
311 cl = ['samtools view -H -S',self.opts.input,'>',thead]
312 else:
313 cl = ['samtools view -H',self.opts.input,'>',thead]
314 self.runCL(cl=cl,output_dir=self.opts.outdir)
315 head = open(thead,'r').readlines()
316 s = '## got %d rows of header\n' % (len(head))
317 logging.info(s)
318 o = open(outf,'w')
319 o.write(''.join(head))
320 o.write(''.join(bed))
321 o.close()
322 return outf
323
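# Illustrative bait/target row for makePicInterval() (an example inferred from the checks above,
# not an official Picard sample): five tab-separated fields with the strand in column four, e.g.
#   chr1    1000    2000    +    target_region_1
# The sam/bam header is prepended so the resulting interval list carries the same sequence
# dictionary as the input alignments, which is what Picard insists on.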
324 def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None):
325 """
326 interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
327 Do the work of removing all the error sequences
328 pysam is cool
329 infile = pysam.Samfile( "-", "r" )
330 outfile = pysam.Samfile( "-", "w", template = infile )
331 for s in infile: outfile.write(s)
332
333 errors from ValidateSamFile.jar look like
334 WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
335 ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
336 ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041
337
338 """
339 assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam
340 assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path'
341 removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2]
342 remDict = dict(zip(removeNames,range(len(removeNames))))
343 infile = pysam.Samfile(insam,'rb')
344 info = 'found %d error sequences in picardErrors, %d unique' % (len(removeNames),len(remDict))
345 if len(removeNames) > 0:
346 outfile = pysam.Samfile(newsam,'wb',template=infile) # template must be an open file
347 i = 0
348 j = 0
349 for row in infile:
350 dropme = remDict.get(row.qname,None) # None means the read name never appeared in the picard errors
351 if dropme == None: # 'if not dropme' would wrongly keep the read indexed 0 in remDict
352 outfile.write(row)
353 j += 1
354 else: # discard
355 i += 1
356 info = '%s\n%s' % (info, 'Discarded %d lines writing %d to %s from %s' % (i,j,newsam,insam))
357 outfile.close()
358 infile.close()
359 else: # we really want a nullop or a simple pointer copy
360 infile.close()
361 if newsam:
362 shutil.copy(insam,newsam)
363 logging.info(info)
364
365
366
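# Worked example for cleanSam()'s read-name extraction (illustrative, using the sample error line
# quoted in the docstring above):
#   x = 'ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.'
#   x.split(',')[1].replace(' Read name ', '')   ->   'SRR006041.1042721'
# Error lines with fewer than three comma-separated pieces are skipped by the len(x.split(',')) > 2 guard.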
367 def __main__():
368 doFix = False # tools returning htmlfile don't need this
369 doTranspose = True # default
370 maxloglines = 100 # default
371 #Parse Command Line
372 op = optparse.OptionParser()
373 # All tools
374 op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' )
375 op.add_option('-e', '--inputext', default=None)
376 op.add_option('-o', '--output', default=None)
377 op.add_option('-n', '--title', default="Pick a Picard Tool")
378 op.add_option('-t', '--htmlout', default=None)
379 op.add_option('-d', '--outdir', default=None)
380 op.add_option('-x', '--maxjheap', default='3000m')
381 op.add_option('-b', '--bisulphite', default='false')
382 op.add_option('-s', '--sortorder', default='query')
383 op.add_option('','--tmpdir', default='/tmp')
384 op.add_option('-j','--jar',default='')
385 op.add_option('','--picard-cmd',default=None)
386 # Many tools
387 op.add_option( '', '--output-format', dest='output_format', help='Output format' )
388 op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' )
389 op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None )
390 # CreateSequenceDictionary
391 op.add_option( '', '--ref-file', dest='ref_file', help='Fasta to use as reference', default=None )
392 op.add_option( '', '--species-name', dest='species_name', help='Species name to use in creating dict file from fasta file' )
393 op.add_option( '', '--build-name', dest='build_name', help='Name of genome assembly to use in creating dict file from fasta file' )
394 op.add_option( '', '--trunc-names', dest='trunc_names', help='Truncate sequence names at first whitespace from fasta file' )
395 # MarkDuplicates
396 op.add_option( '', '--remdups', default='true', help='Remove duplicates from output file' )
397 op.add_option( '', '--optdupdist', default="100", help='Maximum pixels between two identical sequences in order to consider them optical duplicates.' )
398 # CollectInsertSizeMetrics
399 op.add_option('', '--taillimit', default="0")
400 op.add_option('', '--histwidth', default="0")
401 op.add_option('', '--minpct', default="0.01")
402 op.add_option('', '--malevel', default='')
403 op.add_option('', '--deviations', default="0.0")
404 # CollectAlignmentSummaryMetrics
405 op.add_option('', '--maxinsert', default="20")
406 op.add_option('', '--adaptors', default='')
407 # FixMateInformation and validate
408 # CollectGcBiasMetrics
409 op.add_option('', '--windowsize', default='100')
410 op.add_option('', '--mingenomefrac', default='0.00001')
411 # AddOrReplaceReadGroups
412 op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
413 op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
414 op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' )
415 op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. run barcode) ' )
416 op.add_option( '', '--rg-sm', dest='rg_sample', help='Read Group sample name' )
417 op.add_option( '', '--rg-id', dest='rg_id', help='Read Group ID' )
418 op.add_option( '', '--rg-cn', dest='rg_seq_center', help='Read Group sequencing center name' )
419 op.add_option( '', '--rg-ds', dest='rg_desc', help='Read Group description' )
420 # ReorderSam
421 op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' )
422 op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' )
423 # ReplaceSamHeader
424 op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' )
425
426 op.add_option('','--assumesorted', default='true')
427 op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*")
428 #estimatelibrarycomplexity
429 op.add_option('','--minid', default="5")
430 op.add_option('','--maxdiff', default="0.03")
431 op.add_option('','--minmeanq', default="20")
432 #hsmetrics
433 op.add_option('','--baitbed', default=None)
434 op.add_option('','--targetbed', default=None)
435 #validate
436 op.add_option('','--ignoreflags', action='append', type="string")
437 op.add_option('','--maxerrors', default=None)
438 op.add_option('','--datatype', default=None)
439 op.add_option('','--bamout', default=None)
440 op.add_option('','--samout', default=None)
441 #downsample
442 op.add_option('','--probability', default="1")
443 op.add_option('','--seed', default="1")
444 #meanqualitybycycle
445 op.add_option('','--pfreadsonly', default='false')
446 op.add_option('','--alignedreadsonly', default='false')
447 #collectrnaseqmetrics
448 op.add_option('','--ignoreseq', action='append', type="string")
449 op.add_option('','--minlength', default=None)
450 op.add_option('','--refflat', default=None)
451 op.add_option('','--strandspecificity', default=None)
452 op.add_option('','--ribosomalintervals', default=None)
453 op.add_option('','--rrnafragmentpercentage', default=None)
454
455 opts, args = op.parse_args()
456 opts.sortme = opts.assumesorted == 'false'
457 assert opts.input <> None
458 # need to add
459 # instance that does all the work
460 pic = PicardBase(opts,sys.argv[0])
461
462 tmp_dir = opts.outdir
463 haveTempout = False # we use this where sam output is an option
464 rval = 0
465 stdouts = 'Not run yet'
466 # set ref and dict files to use (create if necessary)
467 ref_file_name = opts.ref
468 if opts.ref_file <> None:
469 csd = 'CreateSequenceDictionary'
470 realjarpath = os.path.split(opts.jar)[0]
471 jarpath = os.path.join(realjarpath,'%s.jar' % csd) # for refseq
472 tmp_ref_fd, tmp_ref_name = tempfile.mkstemp( dir=opts.tmpdir , prefix = pic.picname)
473 ref_file_name = '%s.fasta' % tmp_ref_name
474 # build dict
475 dict_file_name = '%s.dict' % tmp_ref_name
476 os.symlink( opts.ref_file, ref_file_name )
477 cl = ['REFERENCE=%s' % ref_file_name]
478 cl.append('OUTPUT=%s' % dict_file_name)
479 cl.append('URI=%s' % os.path.basename( opts.ref_file ))
480 cl.append('TRUNCATE_NAMES_AT_WHITESPACE=%s' % opts.trunc_names)
481 if opts.species_name:
482 cl.append('SPECIES=%s' % opts.species_name)
483 if opts.build_name:
484 cl.append('GENOME_ASSEMBLY=%s' % opts.build_name)
485 pic.delme.append(dict_file_name)
486 pic.delme.append(ref_file_name)
487 pic.delme.append(tmp_ref_name)
488 stdouts,rval = pic.runPic(jarpath, cl)
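# For orientation only (an illustrative rendering with made-up paths): with the options above,
# runPic() ends up shelling out to something like
#   java -Xmx3000m -Djava.io.tmpdir='/tmp' -jar /path/to/CreateSequenceDictionary.jar \
#     REFERENCE=/tmp/xxx.fasta OUTPUT=/tmp/xxx.dict URI=genome.fa TRUNCATE_NAMES_AT_WHITESPACE=true
# i.e. the java invocation built in runPic() followed by the KEY=VALUE pairs collected in cl.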
489 # run relevant command(s)
490
491 # define temporary output
492 # if output is sam, it must have that extension, otherwise bam will be produced
493 # specify sam or bam file with extension
494 if opts.output_format == 'sam':
495 suff = '.sam'
496 else:
497 suff = ''
498 tmp_fd, tempout = tempfile.mkstemp( dir=opts.tmpdir, suffix=suff )
499
500 cl = ['VALIDATION_STRINGENCY=LENIENT',]
501
502 if pic.picname == 'AddOrReplaceReadGroups':
503 # sort order to match Galaxy's default
504 cl.append('SORT_ORDER=coordinate')
505 # input
506 cl.append('INPUT=%s' % opts.input)
507 # outputs
508 cl.append('OUTPUT=%s' % tempout)
509 # required read groups
510 cl.append('RGLB="%s"' % opts.rg_library)
511 cl.append('RGPL="%s"' % opts.rg_platform)
512 cl.append('RGPU="%s"' % opts.rg_plat_unit)
513 cl.append('RGSM="%s"' % opts.rg_sample)
514 if opts.rg_id:
515 cl.append('RGID="%s"' % opts.rg_id)
516 # optional read groups
517 if opts.rg_seq_center:
518 cl.append('RGCN="%s"' % opts.rg_seq_center)
519 if opts.rg_desc:
520 cl.append('RGDS="%s"' % opts.rg_desc)
521 stdouts,rval = pic.runPic(opts.jar, cl)
522 haveTempout = True
523
524 elif pic.picname == 'BamIndexStats':
525 tmp_fd, tmp_name = tempfile.mkstemp( dir=tmp_dir )
526 tmp_bam_name = '%s.bam' % tmp_name
527 tmp_bai_name = '%s.bai' % tmp_bam_name
528 os.symlink( opts.input, tmp_bam_name )
529 os.symlink( opts.bai_file, tmp_bai_name )
530 cl.append('INPUT=%s' % ( tmp_bam_name ))
531 pic.delme.append(tmp_bam_name)
532 pic.delme.append(tmp_bai_name)
533 pic.delme.append(tmp_name)
534 stdouts,rval = pic.runPic( opts.jar, cl )
535 f = open(pic.metricsOut,'a')
536 f.write(stdouts) # got this on stdout from runCl
537 f.write('\n')
538 f.close()
539 doTranspose = False # but not transposed
540
541 elif pic.picname == 'EstimateLibraryComplexity':
542 cl.append('I=%s' % opts.input)
543 cl.append('O=%s' % pic.metricsOut)
544 if float(opts.minid) > 0:
545 cl.append('MIN_IDENTICAL_BASES=%s' % opts.minid)
546 if float(opts.maxdiff) > 0.0:
547 cl.append('MAX_DIFF_RATE=%s' % opts.maxdiff)
548 if float(opts.minmeanq) > 0:
549 cl.append('MIN_MEAN_QUALITY=%s' % opts.minmeanq)
550 if opts.readregex > '':
551 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
552 if float(opts.optdupdist) > 0:
553 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
554 stdouts,rval = pic.runPic(opts.jar, cl)
555
556 elif pic.picname == 'CollectAlignmentSummaryMetrics':
557 # Why do we do this fakefasta thing?
558 # Because we need NO fai to be available or picard barfs unless it matches the input data.
559 # why? Dunno. Seems to work without complaining if the .bai file is AWOL....
560 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
561 try:
562 os.symlink(ref_file_name,fakefasta)
563 except:
564 s = '## unable to symlink %s to %s - different devices? Will shutil.copy' % (ref_file_name,fakefasta)
565 info = s
566 shutil.copy(ref_file_name,fakefasta)
567 pic.delme.append(fakefasta)
568 cl.append('ASSUME_SORTED=true')
569 adaptlist = opts.adaptors.split(',')
570 adaptorseqs = ['ADAPTER_SEQUENCE=%s' % x for x in adaptlist]
571 cl += adaptorseqs
572 cl.append('IS_BISULFITE_SEQUENCED=%s' % opts.bisulphite)
573 cl.append('MAX_INSERT_SIZE=%s' % opts.maxinsert)
574 cl.append('OUTPUT=%s' % pic.metricsOut)
575 cl.append('R=%s' % fakefasta)
576 cl.append('TMP_DIR=%s' % opts.tmpdir)
577 if not opts.assumesorted.lower() == 'true': # we need to sort input
578 sortedfile = '%s.sorted' % os.path.basename(opts.input)
579 if opts.datatype == 'sam': # need to work with a bam
580 tlog,tempbam,trval = pic.samToBam(opts.input,opts.outdir)
581 pic.delme.append(tempbam)
582 try:
583 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
584 except:
585 print '## exception on sorting sam file %s' % opts.input
586 else: # is already bam
587 try:
588 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
589 except : # bug - [bam_sort_core] not being ignored - TODO fixme
590 print '## exception %s on sorting bam file %s' % (sys.exc_info()[0],opts.input)
591 cl.append('INPUT=%s.bam' % os.path.abspath(os.path.join(opts.outdir,sortedfile)))
592 pic.delme.append(os.path.join(opts.outdir,sortedfile))
593 else:
594 cl.append('INPUT=%s' % os.path.abspath(opts.input))
595 stdouts,rval = pic.runPic(opts.jar, cl)
596
597
598 elif pic.picname == 'CollectGcBiasMetrics':
599 assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name
600 # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
601 # why? Dunno
602 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
603 try:
604 os.symlink(ref_file_name,fakefasta)
605 except:
606 s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy' % (ref_file_name,fakefasta)
607 info = s
608 shutil.copy(ref_file_name,fakefasta)
609 pic.delme.append(fakefasta)
610 x = 'rgPicardGCBiasMetrics'
611 pdfname = '%s.pdf' % x
612 jpgname = '%s.jpg' % x
613 tempout = os.path.join(opts.outdir,'rgPicardGCBiasMetrics.out')
614 temppdf = os.path.join(opts.outdir,pdfname)
615 cl.append('R=%s' % fakefasta)
616 cl.append('WINDOW_SIZE=%s' % opts.windowsize)
617 cl.append('MINIMUM_GENOME_FRACTION=%s' % opts.mingenomefrac)
618 cl.append('INPUT=%s' % opts.input)
619 cl.append('OUTPUT=%s' % tempout)
620 cl.append('TMP_DIR=%s' % opts.tmpdir)
621 cl.append('CHART_OUTPUT=%s' % temppdf)
622 cl.append('SUMMARY_OUTPUT=%s' % pic.metricsOut)
623 stdouts,rval = pic.runPic(opts.jar, cl)
624 if os.path.isfile(temppdf):
625 cl2 = ['convert','-resize x400',temppdf,os.path.join(opts.outdir,jpgname)] # make the jpg for fixPicardOutputs to find
626 s,stdouts,rval = pic.runCL(cl=cl2,output_dir=opts.outdir)
627 else:
628 s='### runGC: Unable to find pdf %s - please check the log for the causal problem\n' % temppdf
629 lf = open(pic.log_filename,'a')
630 lf.write(s)
631 lf.write('\n')
632 lf.close()
633
634 elif pic.picname == 'CollectInsertSizeMetrics':
635 """ <command interpreter="python">
636 picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
637 --histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel"
638 -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
639 </command>
640 """
641 isPDF = 'InsertSizeHist.pdf'
642 pdfpath = os.path.join(opts.outdir,isPDF)
643 histpdf = 'InsertSizeHist.pdf'
644 cl.append('I=%s' % opts.input)
645 cl.append('O=%s' % pic.metricsOut)
646 cl.append('HISTOGRAM_FILE=%s' % histpdf)
647 #if opts.taillimit <> '0': # this was deprecated although still mentioned in the docs at 1.56
648 # cl.append('TAIL_LIMIT=%s' % opts.taillimit)
649 if opts.histwidth <> '0':
650 cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth)
651 if float( opts.minpct) > 0.0:
652 cl.append('MINIMUM_PCT=%s' % opts.minpct)
653 if float(opts.deviations) > 0.0:
654 cl.append('DEVIATIONS=%s' % opts.deviations)
655 if opts.malevel:
656 malists = opts.malevel.split(',')
657 malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists]
658 cl += malist
659 stdouts,rval = pic.runPic(opts.jar, cl)
660 if os.path.exists(pdfpath): # automake thumbnail - will be added to html
661 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
662 pic.runCL(cl=cl2,output_dir=opts.outdir)
663 else:
664 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
665 s += 'This <b>always happens if single ended data was provided</b> to this tool,\n'
666 s += 'so please double check that your input data really is paired-end NGS data.<br/>\n'
667 s += 'If your input was paired data this may be a bug worth reporting to the galaxy-bugs list\n<br/>'
668 logging.info(s)
669 if len(stdouts) > 0:
670 logging.info(stdouts)
671
672 elif pic.picname == 'MarkDuplicates':
673 # assume sorted even if header says otherwise
674 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
675 # input
676 cl.append('INPUT=%s' % opts.input)
677 # outputs
678 cl.append('OUTPUT=%s' % opts.output)
679 cl.append('METRICS_FILE=%s' % pic.metricsOut )
680 # remove or mark duplicates
681 cl.append('REMOVE_DUPLICATES=%s' % opts.remdups)
682 # the regular expression to be used to parse reads in incoming SAM file
683 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
684 # maximum offset between two duplicate clusters
685 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
686 stdouts,rval = pic.runPic(opts.jar, cl)
687
688 elif pic.picname == 'FixMateInformation':
689 cl.append('I=%s' % opts.input)
690 cl.append('O=%s' % tempout)
691 cl.append('SORT_ORDER=%s' % opts.sortorder)
692 stdouts,rval = pic.runPic(opts.jar,cl)
693 haveTempout = True
694
695 elif pic.picname == 'ReorderSam':
696 # input
697 cl.append('INPUT=%s' % opts.input)
698 # output
699 cl.append('OUTPUT=%s' % tempout)
700 # reference
701 cl.append('REFERENCE=%s' % ref_file_name)
702 # incomplete dict concordance
703 if opts.allow_inc_dict_concord == 'true':
704 cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true')
705 # contig length discordance
706 if opts.allow_contig_len_discord == 'true':
707 cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true')
708 stdouts,rval = pic.runPic(opts.jar, cl)
709 haveTempout = True
710
711 elif pic.picname == 'ReplaceSamHeader':
712 cl.append('INPUT=%s' % opts.input)
713 cl.append('OUTPUT=%s' % tempout)
714 cl.append('HEADER=%s' % opts.header_file)
715 stdouts,rval = pic.runPic(opts.jar, cl)
716 haveTempout = True
717
718 elif pic.picname == 'CalculateHsMetrics':
719 maxloglines = 100
720 baitfname = os.path.join(opts.outdir,'rgPicardHsMetrics.bait')
721 targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target')
722 baitf = pic.makePicInterval(opts.baitbed,baitfname)
723 if opts.targetbed == opts.baitbed: # same file sometimes
724 targetf = baitf
725 else:
726 targetf = pic.makePicInterval(opts.targetbed,targetfname)
727 cl.append('BAIT_INTERVALS=%s' % baitf)
728 cl.append('TARGET_INTERVALS=%s' % targetf)
729 cl.append('INPUT=%s' % os.path.abspath(opts.input))
730 cl.append('OUTPUT=%s' % pic.metricsOut)
731 cl.append('TMP_DIR=%s' % opts.tmpdir)
732 stdouts,rval = pic.runPic(opts.jar,cl)
733
734 elif pic.picname == 'ValidateSamFile':
735 import pysam
736 doTranspose = False
737 sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
738 stf = open(pic.log_filename,'w')
739 tlog = None
740 if opts.datatype == 'sam': # need to work with a bam
741 tlog,tempbam,rval = pic.samToBam(opts.input,opts.outdir)
742 try:
743 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
744 except:
745 print '## exception on sorting sam file %s' % opts.input
746 else: # is already bam
747 try:
748 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
749 except: # bug - [bam_sort_core] not being ignored - TODO fixme
750 print '## exception on sorting bam file %s' % opts.input
751 if tlog:
752 print '##tlog=',tlog
753 stf.write(tlog)
754 stf.write('\n')
755 sortedfile = '%s.bam' % sortedfile # samtools does that
756 cl.append('O=%s' % pic.metricsOut)
757 cl.append('TMP_DIR=%s' % opts.tmpdir)
758 cl.append('I=%s' % sortedfile)
759 opts.maxerrors = '99999999'
760 cl.append('MAX_OUTPUT=%s' % opts.maxerrors)
761 if opts.ignoreflags[0] <> 'None': # picard error values to ignore
762 igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None']
763 cl.append(' '.join(igs))
764 if opts.bisulphite.lower() <> 'false':
765 cl.append('IS_BISULFITE_SEQUENCED=true')
766 if opts.ref <> None or opts.ref_file <> None:
767 cl.append('R=%s' % ref_file_name)
768 stdouts,rval = pic.runPic(opts.jar,cl)
769 if opts.datatype == 'sam':
770 pic.delme.append(tempbam)
771 newsam = opts.output
772 outformat = 'bam'
773 pe = open(pic.metricsOut,'r').readlines()
774 pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat)
775 pic.delme.append(sortedfile) # not wanted
776 stf.close()
777 pic.cleanup()
778
779 ####liubo added CleanSam tool####
780 elif pic.picname == 'CleanSam':
781 # input
782 cl.append('INPUT=%s' % opts.input)
783 # output
784 cl.append('OUTPUT=%s' % tempout)
785 stdouts,rval = pic.runPic(opts.jar, cl)
786 haveTempout = True
787
788 elif pic.picname == 'SortSam':
789 cl.append('I=%s' % opts.input)
790 cl.append('O=%s' % tempout)
791 cl.append('SORT_ORDER=%s' % opts.sortorder)
792 stdouts,rval = pic.runPic(opts.jar,cl)
793 haveTempout = True
794
795 elif pic.picname == 'BuildBamIndex':
796 cl.append('I=%s' % opts.input)
797 cl.append('O=%s' % tempout)
798 stdouts,rval = pic.runPic(opts.jar,cl)
799 haveTempout = True
800
801 elif pic.picname == 'SamFormatConverter':
802 cl.append('INPUT=%s' % opts.input)
803 cl.append('OUTPUT=%s' % tempout)
804 stdouts,rval = pic.runPic(opts.jar, cl)
805 haveTempout = True
806 elif pic.picname == "DownsampleSam":
807 cl.append('I=%s' % opts.input)
808 mystring = opts.output
809 mystringsam = mystring + ".sam"
810 cl.append('O=%s' % mystringsam)
811 if float(opts.probability) > 0:
812 cl.append('PROBABILITY=%s' % opts.probability)
813 if float(opts.seed) > 0:
814 cl.append('RANDOM_SEED=%s' % opts.seed)
815 stdouts,rval = pic.runPic(opts.jar, cl)
816 myoutput = mystringsam.replace(".sam", "")
817 os.rename(mystringsam,myoutput)
818
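# Note on the rename above (an inference from the earlier comment "if output is sam, it must have
# that extension, otherwise bam will be produced"): DownsampleSam is pointed at opts.output + '.sam'
# so Picard emits SAM, and the result is then renamed back to the extension-less path Galaxy expects.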
819 elif pic.picname == 'MeanQualityByCycle':
820 isPDF = 'MeanQualityByCycle.pdf'
821 pdfpath = os.path.join(opts.outdir,isPDF)
822 histpdf = isPDF
823 cl.append('I=%s' % opts.input)
824 cl.append('O=%s' % pic.metricsOut)
825 cl.append('CHART_OUTPUT=%s' % histpdf)
826 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
827 cl.append('ALIGNED_READS_ONLY=%s' % (opts.alignedreadsonly))
828 cl.append('PF_READS_ONLY=%s' % (opts.pfreadsonly))
829 stdouts,rval = pic.runPic(opts.jar, cl)
830 if os.path.exists(pdfpath): # automake thumbnail - will be added to html
831 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
832 pic.runCL(cl=cl2,output_dir=opts.outdir)
833 else:
834 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
835 logging.info(s)
836 if len(stdouts) > 0:
837 logging.info(stdouts)
838
839 elif pic.picname == 'CollectRnaSeqMetrics':
840 isPDF = pic.picname + '.pdf'
841 pdfpath = os.path.join(opts.outdir,isPDF)
842 histpdf = isPDF
843 cl.append('I=%s' % opts.input)
844 cl.append('O=%s' % pic.metricsOut)
845 cl.append('CHART_OUTPUT=%s' % histpdf)
846 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
847 cl.append('MINIMUM_LENGTH=%s' % (opts.minlength))
848 cl.append('REF_FLAT=%s' % (opts.refflat))
849 cl.append('STRAND_SPECIFICITY=%s' % (opts.strandspecificity))
850 cl.append('RIBOSOMAL_INTERVALS=%s' % (opts.ribosomalintervals))
851 cl.append('RRNA_FRAGMENT_PERCENTAGE=%s' % (opts.rrnafragmentpercentage))
852 if opts.ignoreseq[0] <> 'None': # picard error values to ignore
853 igs = ['IGNORE_SEQUENCE=%s' % x for x in opts.ignoreseq if x <> 'None']
854 cl.append(' '.join(igs))
855 if opts.malevel:
856 malists = opts.malevel.split(',')
857 malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists]
858 cl += malist
859 stdouts,rval = pic.runPic(opts.jar, cl)
860 if os.path.exists(pdfpath): # automake thumbnail - will be added to html
861 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
862 pic.runCL(cl=cl2,output_dir=opts.outdir)
863 else:
864 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
865 logging.info(s)
866 if len(stdouts) > 0:
867 logging.info(stdouts)
868
869 else:
870 print >> sys.stderr,'picard_wrapper.py got an unknown tool name - %s' % pic.picname
871 sys.exit(1)
872 if haveTempout:
873 # Some Picard tools produced a potentially intermediate bam file.
874 # Either just move to final location or create sam
875 if os.path.exists(tempout):
876 shutil.move(tempout, os.path.abspath(opts.output))
877 if opts.htmlout <> None or doFix: # return a pretty html page
878 pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines)
879 if rval <> 0:
880 print >> sys.stderr, '## exit code=%d; stdout=%s' % (rval,stdouts)
881 # signal failure
882 if __name__=="__main__": __main__()