comparison picard_wrapper.py @ 22:b0585923decc draft

Uploaded
author devteam
date Tue, 11 Feb 2014 18:43:11 -0500
parents
children
comparison
equal deleted inserted replaced
21:c2f6ec2fee7e 22:b0585923decc
1 #!/usr/bin/env python
2 """
3 Originally written by Kelly Vincent
4 pretty output and additional picard wrappers by Ross Lazarus for rgenetics
5 Runs all available wrapped Picard tools.
6 usage: picard_wrapper.py [options]
7 code Ross wrote licensed under the LGPL
8 see http://www.gnu.org/copyleft/lesser.html
9 """
10
11 import optparse, os, sys, subprocess, tempfile, shutil, time, logging
12
13 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
14 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
15 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
16 <head>
17 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
18 <meta name="generator" content="Bluefish 2.2.3" />
19 <title></title>
20 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
21 </head>
22 <body>
23 <div class="document">
24 """
25 galhtmlattr = """Galaxy tool %s run at %s</b><br/>"""
26 galhtmlpostfix = """</div></body></html>\n"""
27
28
29 def stop_err( msg ):
30 sys.stderr.write( '%s\n' % msg )
31 sys.exit()
32
33
34 def timenow():
35 """return current time as a string
36 """
37 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
38
39
40 class PicardBase():
41 """
42 simple base class with some utilities for Picard
43 adapted and merged with Kelly Vincent's code april 2011 Ross
44 lots of changes...
45 """
46
47 def __init__(self, opts=None,arg0=None):
48 """ common stuff needed at init for a picard tool
49 """
50 assert opts <> None, 'PicardBase needs opts at init'
51 self.opts = opts
52 if self.opts.outdir == None:
53 self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir
54 assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in'
55 self.picname = self.baseName(opts.jar)
56 if self.picname.startswith('picard'):
57 self.picname = opts.picard_cmd # special case for some tools like replaceheader?
58 self.progname = self.baseName(arg0)
59 self.version = '0.002'
60 self.delme = [] # list of files to destroy
61 self.title = opts.title
62 self.inputfile = opts.input
63 try:
64 os.makedirs(opts.outdir)
65 except:
66 pass
67 try:
68 os.makedirs(opts.tmpdir)
69 except:
70 pass
71 self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
72 self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
73 self.setLogging(logfname=self.log_filename)
74
75 def baseName(self,name=None):
76 return os.path.splitext(os.path.basename(name))[0]
77
78 def setLogging(self,logfname="picard_wrapper.log"):
79 """setup a logger
80 """
81 logging.basicConfig(level=logging.INFO,
82 filename=logfname,
83 filemode='a')
84
85
86 def readLarge(self,fname=None):
87 """ read a potentially huge file.
88 """
89 try:
90 # get stderr, allowing for case where it's very large
91 tmp = open( fname, 'rb' )
92 s = ''
93 buffsize = 1048576
94 try:
95 while True:
96 more = tmp.read( buffsize )
97 if len(more) > 0:
98 s += more
99 else:
100 break
101 except OverflowError:
102 pass
103 tmp.close()
104 except Exception, e:
105 stop_err( 'Read Large Exception : %s' % str( e ) )
106 return s
107
108 def runCL(self,cl=None,output_dir=None):
109 """ construct and run a command line
110 we have galaxy's temp path as opt.temp_dir so don't really need isolation
111 sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
112 """
113 assert cl <> None, 'PicardBase runCL needs a command line as cl'
114 if output_dir == None:
115 output_dir = self.opts.outdir
116 if type(cl) == type([]):
117 cl = ' '.join(cl)
118 fd,templog = tempfile.mkstemp(dir=output_dir,suffix='rgtempRun.txt')
119 tlf = open(templog,'wb')
120 fd,temperr = tempfile.mkstemp(dir=output_dir,suffix='rgtempErr.txt')
121 tef = open(temperr,'wb')
122 process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir)
123 rval = process.wait()
124 tlf.close()
125 tef.close()
126 stderrs = self.readLarge(temperr)
127 stdouts = self.readLarge(templog)
128 if rval > 0:
129 s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs)
130 stdouts = '%s\n%s' % (stdouts,stderrs)
131 else:
132 s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval)
133 logging.info(s)
134 os.unlink(templog) # always
135 os.unlink(temperr) # always
136 return s, stdouts, rval # sometimes s is an output
137
138 def runPic(self, jar, cl):
139 """
140 cl should be everything after the jar file name in the command
141 """
142 runme = ['java -Xmx%s' % self.opts.maxjheap]
143 runme.append(" -Djava.io.tmpdir='%s' " % self.opts.tmpdir)
144 runme.append('-jar %s' % jar)
145 runme += cl
146
147 print runme,self.opts.outdir
148
149 s,stdouts,rval = self.runCL(cl=runme, output_dir=self.opts.outdir)
150 return stdouts,rval
151
152 def samToBam(self,infile=None,outdir=None):
153 """
154 use samtools view to convert sam to bam
155 """
156 fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam')
157 cl = ['samtools view -h -b -S -o ',tempbam,infile]
158 tlog,stdouts,rval = self.runCL(cl,outdir)
159 return tlog,tempbam,rval
160
161 def sortSam(self, infile=None,outfile=None,outdir=None):
162 """
163 """
164 print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir)
165 cl = ['samtools sort',infile,outfile]
166 tlog,stdouts,rval = self.runCL(cl,outdir)
167 return tlog
168
169 def cleanup(self):
170 for fname in self.delme:
171 try:
172 os.unlink(fname)
173 except:
174 pass
175
176 def prettyPicout(self,transpose,maxrows):
177 """organize picard outpouts into a report html page
178 """
179 res = []
180 try:
181 r = open(self.metricsOut,'r').readlines()
182 except:
183 r = []
184 if len(r) > 0:
185 res.append('<b>Picard on line resources</b><ul>\n')
186 res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
187 res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
188 if transpose:
189 res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n')
190 else:
191 res.append('<b>Picard output</b><hr/>\n')
192 res.append('<table cellpadding="3" >\n')
193 dat = []
194 heads = []
195 lastr = len(r) - 1
196 # special case for estimate library complexity hist
197 thist = False
198 for i,row in enumerate(r):
199 if row.strip() > '':
200 srow = row.split('\t')
201 if row.startswith('#'):
202 heads.append(row.strip()) # want strings
203 else:
204 dat.append(srow) # want lists
205 if row.startswith('## HISTOGRAM'):
206 thist = True
207 if len(heads) > 0:
208 hres = ['<tr class="d%d"><td colspan="2">%s</td></tr>' % (i % 2,x) for i,x in enumerate(heads)]
209 res += hres
210 heads = []
211 if len(dat) > 0:
212 if transpose and not thist:
213 tdat = map(None,*dat) # transpose an arbitrary list of lists
214 tdat = ['<tr class="d%d"><td>%s</td><td>%s&nbsp;</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
215 else:
216 tdat = ['\t'.join(x).strip() for x in dat] # back to strings :(
217 tdat = ['<tr class="d%d"><td colspan="2">%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)]
218 res += tdat
219 dat = []
220 res.append('</table>\n')
221 return res
222
223 def fixPicardOutputs(self,transpose,maxloglines):
224 """
225 picard produces long hard to read tab header files
226 make them available but present them transposed for readability
227 """
228 logging.shutdown()
229 self.cleanup() # remove temp files stored in delme
230 rstyle="""<style type="text/css">
231 tr.d0 td {background-color: oldlace; color: black;}
232 tr.d1 td {background-color: aliceblue; color: black;}
233 </style>"""
234 res = [rstyle,]
235 res.append(galhtmlprefix % self.progname)
236 res.append(galhtmlattr % (self.picname,timenow()))
237 flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
238 pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
239 if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
240 for p in pdflist:
241 pbase = os.path.splitext(p)[0] # removes .pdf
242 imghref = '%s.jpg' % pbase
243 mimghref = '%s-0.jpg' % pbase # multiple pages pdf -> multiple thumbnails without asking!
244 if mimghref in flist:
245 imghref=mimghref # only one for thumbnail...it's a multi page pdf
246 res.append('<table cellpadding="10"><tr><td>\n')
247 res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref))
248 res.append('</tr></td></table>\n')
249 if len(flist) > 0:
250 res.append('<b>The following output files were created (click the filename to view/download a copy):</b><hr/>')
251 res.append('<table>\n')
252 for i,f in enumerate(flist):
253 fn = os.path.split(f)[-1]
254 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
255 res.append('</table><p/>\n')
256 pres = self.prettyPicout(transpose,maxloglines)
257 if len(pres) > 0:
258 res += pres
259 l = open(self.log_filename,'r').readlines()
260 llen = len(l)
261 if llen > 0:
262 res.append('<b>Picard Tool Run Log</b><hr/>\n')
263 rlog = ['<pre>',]
264 if llen > maxloglines:
265 n = min(50,int(maxloglines/2))
266 rlog += l[:n]
267 rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines))
268 rlog += l[-n:]
269 else:
270 rlog += l
271 rlog.append('</pre>')
272 if llen > maxloglines:
273 rlog.append('\n<b>## WARNING - %d log lines truncated - <a href="%s">%s</a> contains entire output</b>' % (llen - maxloglines,self.log_filename,self.log_filename))
274 res += rlog
275 else:
276 res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
277 res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
278 res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool')
279 res.append(galhtmlpostfix)
280 outf = open(self.opts.htmlout,'w')
281 outf.write(''.join(res))
282 outf.write('\n')
283 outf.close()
284
285 def makePicInterval(self,inbed=None,outf=None):
286 """
287 picard wants bait and target files to have the same header length as the incoming bam/sam
288 a meaningful (ie accurate) representation will fail because of this - so this hack
289 it would be far better to be able to supply the original bed untouched
290 Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list
291 """
292 assert inbed <> None
293 bed = open(inbed,'r').readlines()
294 sbed = [x.split('\t') for x in bed] # lengths MUST be 5
295 lens = [len(x) for x in sbed]
296 strands = [x[3] for x in sbed if not x[3] in ['+','-']]
297 maxl = max(lens)
298 minl = min(lens)
299 e = []
300 if maxl <> minl:
301 e.append("## Input error: Inconsistent field count in %s - please read the documentation on bait/target format requirements, fix and try again" % inbed)
302 if maxl <> 5:
303 e.append("## Input error: %d fields found in %s, 5 required - please read the warning and documentation on bait/target format requirements, fix and try again" % (maxl,inbed))
304 if len(strands) > 0:
305 e.append("## Input error: Fourth column in %s is not the required strand (+ or -) - please read the warning and documentation on bait/target format requirements, fix and try again" % (inbed))
306 if len(e) > 0: # write to stderr and quit
307 print >> sys.stderr, '\n'.join(e)
308 sys.exit(1)
309 thead = os.path.join(self.opts.outdir,'tempSamHead.txt')
310 if self.opts.datatype == 'sam':
311 cl = ['samtools view -H -S',self.opts.input,'>',thead]
312 else:
313 cl = ['samtools view -H',self.opts.input,'>',thead]
314 self.runCL(cl=cl,output_dir=self.opts.outdir)
315 head = open(thead,'r').readlines()
316 s = '## got %d rows of header\n' % (len(head))
317 logging.info(s)
318 o = open(outf,'w')
319 o.write(''.join(head))
320 o.write(''.join(bed))
321 o.close()
322 return outf
323
324 def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None):
325 """
326 interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
327 Do the work of removing all the error sequences
328 pysam is cool
329 infile = pysam.Samfile( "-", "r" )
330 outfile = pysam.Samfile( "-", "w", template = infile )
331 for s in infile: outfile.write(s)
332
333 errors from ValidateSameFile.jar look like
334 WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
335 ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
336 ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041
337
338 """
339 assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam
340 assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path'
341 removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2]
342 remDict = dict(zip(removeNames,range(len(removeNames))))
343 infile = pysam.Samfile(insam,'rb')
344 info = 'found %d error sequences in picardErrors, %d unique' % (len(removeNames),len(remDict))
345 if len(removeNames) > 0:
346 outfile = pysam.Samfile(newsam,'wb',template=infile) # template must be an open file
347 i = 0
348 j = 0
349 for row in infile:
350 dropme = remDict.get(row.qname,None) # keep if None
351 if not dropme:
352 outfile.write(row)
353 j += 1
354 else: # discard
355 i += 1
356 info = '%s\n%s' % (info, 'Discarded %d lines writing %d to %s from %s' % (i,j,newsam,insam))
357 outfile.close()
358 infile.close()
359 else: # we really want a nullop or a simple pointer copy
360 infile.close()
361 if newsam:
362 shutil.copy(insam,newsam)
363 logging.info(info)
364
365
366
367 def __main__():
368 doFix = False # tools returning htmlfile don't need this
369 doTranspose = True # default
370 maxloglines = 100 # default
371 #Parse Command Line
372 op = optparse.OptionParser()
373 # All tools
374 op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' )
375 op.add_option('-e', '--inputext', default=None)
376 op.add_option('-o', '--output', default=None)
377 op.add_option('-n', '--title', default="Pick a Picard Tool")
378 op.add_option('-t', '--htmlout', default=None)
379 op.add_option('-d', '--outdir', default=None)
380 op.add_option('-x', '--maxjheap', default='3000m')
381 op.add_option('-b', '--bisulphite', default='false')
382 op.add_option('-s', '--sortorder', default='query')
383 op.add_option('','--tmpdir', default='/tmp')
384 op.add_option('-j','--jar',default='')
385 op.add_option('','--picard-cmd',default=None)
386 # Many tools
387 op.add_option( '', '--output-format', dest='output_format', help='Output format' )
388 op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' )
389 op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None )
390 # CreateSequenceDictionary
391 op.add_option( '', '--ref-file', dest='ref_file', help='Fasta to use as reference', default=None )
392 op.add_option( '', '--species-name', dest='species_name', help='Species name to use in creating dict file from fasta file' )
393 op.add_option( '', '--build-name', dest='build_name', help='Name of genome assembly to use in creating dict file from fasta file' )
394 op.add_option( '', '--trunc-names', dest='trunc_names', help='Truncate sequence names at first whitespace from fasta file' )
395 # MarkDuplicates
396 op.add_option( '', '--remdups', default='true', help='Remove duplicates from output file' )
397 op.add_option( '', '--optdupdist', default="100", help='Maximum pixels between two identical sequences in order to consider them optical duplicates.' )
398 # CollectInsertSizeMetrics
399 op.add_option('', '--taillimit', default="0")
400 op.add_option('', '--histwidth', default="0")
401 op.add_option('', '--minpct', default="0.01")
402 op.add_option('', '--malevel', default='')
403 op.add_option('', '--deviations', default="0.0")
404 # CollectAlignmentSummaryMetrics
405 op.add_option('', '--maxinsert', default="20")
406 op.add_option('', '--adaptors', default='')
407 # FixMateInformation and validate
408 # CollectGcBiasMetrics
409 op.add_option('', '--windowsize', default='100')
410 op.add_option('', '--mingenomefrac', default='0.00001')
411 # AddOrReplaceReadGroups
412 op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
413 op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
414 op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' )
415 op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. run barcode) ' )
416 op.add_option( '', '--rg-sm', dest='rg_sample', help='Read Group sample name' )
417 op.add_option( '', '--rg-id', dest='rg_id', help='Read Group ID' )
418 op.add_option( '', '--rg-cn', dest='rg_seq_center', help='Read Group sequencing center name' )
419 op.add_option( '', '--rg-ds', dest='rg_desc', help='Read Group description' )
420 # ReorderSam
421 op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' )
422 op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' )
423 # ReplaceSamHeader
424 op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' )
425
426 op.add_option('','--assumesorted', default='true')
427 op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*")
428 #estimatelibrarycomplexity
429 op.add_option('','--minid', default="5")
430 op.add_option('','--maxdiff', default="0.03")
431 op.add_option('','--minmeanq', default="20")
432 #hsmetrics
433 op.add_option('','--baitbed', default=None)
434 op.add_option('','--targetbed', default=None)
435 #validate
436 op.add_option('','--ignoreflags', action='append', type="string")
437 op.add_option('','--maxerrors', default=None)
438 op.add_option('','--datatype', default=None)
439 op.add_option('','--bamout', default=None)
440 op.add_option('','--samout', default=None)
441
442 opts, args = op.parse_args()
443 opts.sortme = opts.assumesorted == 'false'
444 assert opts.input <> None
445 # need to add
446 # instance that does all the work
447 pic = PicardBase(opts,sys.argv[0])
448
449 tmp_dir = opts.outdir
450 haveTempout = False # we use this where sam output is an option
451 rval = 0
452 stdouts = 'Not run yet'
453 # set ref and dict files to use (create if necessary)
454 ref_file_name = opts.ref
455 if opts.ref_file <> None:
456 csd = 'CreateSequenceDictionary'
457 realjarpath = os.path.split(opts.jar)[0]
458 jarpath = os.path.join(realjarpath,'%s.jar' % csd) # for refseq
459 tmp_ref_fd, tmp_ref_name = tempfile.mkstemp( dir=opts.tmpdir , prefix = pic.picname)
460 ref_file_name = '%s.fasta' % tmp_ref_name
461 # build dict
462 dict_file_name = '%s.dict' % tmp_ref_name
463 os.symlink( opts.ref_file, ref_file_name )
464 cl = ['REFERENCE=%s' % ref_file_name]
465 cl.append('OUTPUT=%s' % dict_file_name)
466 cl.append('URI=%s' % os.path.basename( opts.ref_file ))
467 cl.append('TRUNCATE_NAMES_AT_WHITESPACE=%s' % opts.trunc_names)
468 if opts.species_name:
469 cl.append('SPECIES=%s' % opts.species_name)
470 if opts.build_name:
471 cl.append('GENOME_ASSEMBLY=%s' % opts.build_name)
472 pic.delme.append(dict_file_name)
473 pic.delme.append(ref_file_name)
474 pic.delme.append(tmp_ref_name)
475 stdouts,rval = pic.runPic(jarpath, cl)
476 # run relevant command(s)
477
478 # define temporary output
479 # if output is sam, it must have that extension, otherwise bam will be produced
480 # specify sam or bam file with extension
481 if opts.output_format == 'sam':
482 suff = '.sam'
483 else:
484 suff = ''
485 tmp_fd, tempout = tempfile.mkstemp( dir=opts.tmpdir, suffix=suff )
486
487 cl = ['VALIDATION_STRINGENCY=LENIENT',]
488
489 if pic.picname == 'AddOrReplaceReadGroups':
490 # sort order to match Galaxy's default
491 cl.append('SORT_ORDER=coordinate')
492 # input
493 cl.append('INPUT=%s' % opts.input)
494 # outputs
495 cl.append('OUTPUT=%s' % tempout)
496 # required read groups
497 cl.append('RGLB="%s"' % opts.rg_library)
498 cl.append('RGPL="%s"' % opts.rg_platform)
499 cl.append('RGPU="%s"' % opts.rg_plat_unit)
500 cl.append('RGSM="%s"' % opts.rg_sample)
501 if opts.rg_id:
502 cl.append('RGID="%s"' % opts.rg_id)
503 # optional read groups
504 if opts.rg_seq_center:
505 cl.append('RGCN="%s"' % opts.rg_seq_center)
506 if opts.rg_desc:
507 cl.append('RGDS="%s"' % opts.rg_desc)
508 stdouts,rval = pic.runPic(opts.jar, cl)
509 haveTempout = True
510
511 elif pic.picname == 'BamIndexStats':
512 tmp_fd, tmp_name = tempfile.mkstemp( dir=tmp_dir )
513 tmp_bam_name = '%s.bam' % tmp_name
514 tmp_bai_name = '%s.bai' % tmp_bam_name
515 os.symlink( opts.input, tmp_bam_name )
516 os.symlink( opts.bai_file, tmp_bai_name )
517 cl.append('INPUT=%s' % ( tmp_bam_name ))
518 pic.delme.append(tmp_bam_name)
519 pic.delme.append(tmp_bai_name)
520 pic.delme.append(tmp_name)
521 stdouts,rval = pic.runPic( opts.jar, cl )
522 f = open(pic.metricsOut,'a')
523 f.write(stdouts) # got this on stdout from runCl
524 f.write('\n')
525 f.close()
526 doTranspose = False # but not transposed
527
528 elif pic.picname == 'EstimateLibraryComplexity':
529 cl.append('I=%s' % opts.input)
530 cl.append('O=%s' % pic.metricsOut)
531 if float(opts.minid) > 0:
532 cl.append('MIN_IDENTICAL_BASES=%s' % opts.minid)
533 if float(opts.maxdiff) > 0.0:
534 cl.append('MAX_DIFF_RATE=%s' % opts.maxdiff)
535 if float(opts.minmeanq) > 0:
536 cl.append('MIN_MEAN_QUALITY=%s' % opts.minmeanq)
537 if opts.readregex > '':
538 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
539 if float(opts.optdupdist) > 0:
540 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
541 stdouts,rval = pic.runPic(opts.jar, cl)
542
543 elif pic.picname == 'CollectAlignmentSummaryMetrics':
544 # Why do we do this fakefasta thing?
545 # Because we need NO fai to be available or picard barfs unless it matches the input data.
546 # why? Dunno Seems to work without complaining if the .bai file is AWOL....
547 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
548 try:
549 os.symlink(ref_file_name,fakefasta)
550 except:
551 s = '## unable to symlink %s to %s - different devices? Will shutil.copy'
552 info = s
553 shutil.copy(ref_file_name,fakefasta)
554 pic.delme.append(fakefasta)
555 cl.append('ASSUME_SORTED=true')
556 adaptlist = opts.adaptors.split(',')
557 adaptorseqs = ['ADAPTER_SEQUENCE=%s' % x for x in adaptlist]
558 cl += adaptorseqs
559 cl.append('IS_BISULFITE_SEQUENCED=%s' % opts.bisulphite)
560 cl.append('MAX_INSERT_SIZE=%s' % opts.maxinsert)
561 cl.append('OUTPUT=%s' % pic.metricsOut)
562 cl.append('R=%s' % fakefasta)
563 cl.append('TMP_DIR=%s' % opts.tmpdir)
564 if not opts.assumesorted.lower() == 'true': # we need to sort input
565 sortedfile = '%s.sorted' % os.path.basename(opts.input)
566 if opts.datatype == 'sam': # need to work with a bam
567 tlog,tempbam,trval = pic.samToBam(opts.input,opts.outdir)
568 pic.delme.append(tempbam)
569 try:
570 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
571 except:
572 print '## exception on sorting sam file %s' % opts.input
573 else: # is already bam
574 try:
575 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
576 except : # bug - [bam_sort_core] not being ignored - TODO fixme
577 print '## exception %s on sorting bam file %s' % (sys.exc_info()[0],opts.input)
578 cl.append('INPUT=%s.bam' % os.path.abspath(os.path.join(opts.outdir,sortedfile)))
579 pic.delme.append(os.path.join(opts.outdir,sortedfile))
580 else:
581 cl.append('INPUT=%s' % os.path.abspath(opts.input))
582 stdouts,rval = pic.runPic(opts.jar, cl)
583
584
585 elif pic.picname == 'CollectGcBiasMetrics':
586 assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name
587 # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
588 # why? Dunno
589 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
590 try:
591 os.symlink(ref_file_name,fakefasta)
592 except:
593 s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy'
594 info = s
595 shutil.copy(ref_file_name,fakefasta)
596 pic.delme.append(fakefasta)
597 x = 'rgPicardGCBiasMetrics'
598 pdfname = '%s.pdf' % x
599 jpgname = '%s.jpg' % x
600 tempout = os.path.join(opts.outdir,'rgPicardGCBiasMetrics.out')
601 temppdf = os.path.join(opts.outdir,pdfname)
602 cl.append('R=%s' % fakefasta)
603 cl.append('WINDOW_SIZE=%s' % opts.windowsize)
604 cl.append('MINIMUM_GENOME_FRACTION=%s' % opts.mingenomefrac)
605 cl.append('INPUT=%s' % opts.input)
606 cl.append('OUTPUT=%s' % tempout)
607 cl.append('TMP_DIR=%s' % opts.tmpdir)
608 cl.append('CHART_OUTPUT=%s' % temppdf)
609 cl.append('SUMMARY_OUTPUT=%s' % pic.metricsOut)
610 stdouts,rval = pic.runPic(opts.jar, cl)
611 if os.path.isfile(temppdf):
612 cl2 = ['convert','-resize x400',temppdf,os.path.join(opts.outdir,jpgname)] # make the jpg for fixPicardOutputs to find
613 s,stdouts,rval = pic.runCL(cl=cl2,output_dir=opts.outdir)
614 else:
615 s='### runGC: Unable to find pdf %s - please check the log for the causal problem\n' % temppdf
616 lf = open(pic.log_filename,'a')
617 lf.write(s)
618 lf.write('\n')
619 lf.close()
620
621 elif pic.picname == 'CollectInsertSizeMetrics':
622 """ <command interpreter="python">
623 picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
624 --histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel"
625 -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
626 </command>
627 """
628 isPDF = 'InsertSizeHist.pdf'
629 pdfpath = os.path.join(opts.outdir,isPDF)
630 histpdf = 'InsertSizeHist.pdf'
631 cl.append('I=%s' % opts.input)
632 cl.append('O=%s' % pic.metricsOut)
633 cl.append('HISTOGRAM_FILE=%s' % histpdf)
634 #if opts.taillimit <> '0': # this was deprecated although still mentioned in the docs at 1.56
635 # cl.append('TAIL_LIMIT=%s' % opts.taillimit)
636 if opts.histwidth <> '0':
637 cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth)
638 if float( opts.minpct) > 0.0:
639 cl.append('MINIMUM_PCT=%s' % opts.minpct)
640 if float(opts.deviations) > 0.0:
641 cl.append('DEVIATIONS=%s' % opts.deviations)
642 if opts.malevel:
643 malists = opts.malevel.split(',')
644 malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists]
645 cl += malist
646 stdouts,rval = pic.runPic(opts.jar, cl)
647 if os.path.exists(pdfpath): # automake thumbnail - will be added to html
648 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
649 pic.runCL(cl=cl2,output_dir=opts.outdir)
650 else:
651 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
652 s += 'This <b>always happens if single ended data was provided</b> to this tool,\n'
653 s += 'so please double check that your input data really is paired-end NGS data.<br/>\n'
654 s += 'If your input was paired data this may be a bug worth reporting to the galaxy-bugs list\n<br/>'
655 logging.info(s)
656 if len(stdouts) > 0:
657 logging.info(stdouts)
658
659 elif pic.picname == 'MarkDuplicates':
660 # assume sorted even if header says otherwise
661 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
662 # input
663 cl.append('INPUT=%s' % opts.input)
664 # outputs
665 cl.append('OUTPUT=%s' % opts.output)
666 cl.append('METRICS_FILE=%s' % pic.metricsOut )
667 # remove or mark duplicates
668 cl.append('REMOVE_DUPLICATES=%s' % opts.remdups)
669 # the regular expression to be used to parse reads in incoming SAM file
670 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
671 # maximum offset between two duplicate clusters
672 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
673 stdouts,rval = pic.runPic(opts.jar, cl)
674
675 elif pic.picname == 'FixMateInformation':
676 cl.append('I=%s' % opts.input)
677 cl.append('O=%s' % tempout)
678 cl.append('SORT_ORDER=%s' % opts.sortorder)
679 stdouts,rval = pic.runPic(opts.jar,cl)
680 haveTempout = True
681
682 elif pic.picname == 'ReorderSam':
683 # input
684 cl.append('INPUT=%s' % opts.input)
685 # output
686 cl.append('OUTPUT=%s' % tempout)
687 # reference
688 cl.append('REFERENCE=%s' % ref_file_name)
689 # incomplete dict concordance
690 if opts.allow_inc_dict_concord == 'true':
691 cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true')
692 # contig length discordance
693 if opts.allow_contig_len_discord == 'true':
694 cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true')
695 stdouts,rval = pic.runPic(opts.jar, cl)
696 haveTempout = True
697
698 elif pic.picname == 'ReplaceSamHeader':
699 cl.append('INPUT=%s' % opts.input)
700 cl.append('OUTPUT=%s' % tempout)
701 cl.append('HEADER=%s' % opts.header_file)
702 stdouts,rval = pic.runPic(opts.jar, cl)
703 haveTempout = True
704
705 elif pic.picname == 'CalculateHsMetrics':
706 maxloglines = 100
707 baitfname = os.path.join(opts.outdir,'rgPicardHsMetrics.bait')
708 targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target')
709 baitf = pic.makePicInterval(opts.baitbed,baitfname)
710 if opts.targetbed == opts.baitbed: # same file sometimes
711 targetf = baitf
712 else:
713 targetf = pic.makePicInterval(opts.targetbed,targetfname)
714 cl.append('BAIT_INTERVALS=%s' % baitf)
715 cl.append('TARGET_INTERVALS=%s' % targetf)
716 cl.append('INPUT=%s' % os.path.abspath(opts.input))
717 cl.append('OUTPUT=%s' % pic.metricsOut)
718 cl.append('TMP_DIR=%s' % opts.tmpdir)
719 stdouts,rval = pic.runPic(opts.jar,cl)
720
721 elif pic.picname == 'ValidateSamFile':
722 import pysam
723 doTranspose = False
724 sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
725 stf = open(pic.log_filename,'w')
726 tlog = None
727 if opts.datatype == 'sam': # need to work with a bam
728 tlog,tempbam,rval = pic.samToBam(opts.input,opts.outdir)
729 try:
730 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
731 except:
732 print '## exception on sorting sam file %s' % opts.input
733 else: # is already bam
734 try:
735 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
736 except: # bug - [bam_sort_core] not being ignored - TODO fixme
737 print '## exception on sorting bam file %s' % opts.input
738 if tlog:
739 print '##tlog=',tlog
740 stf.write(tlog)
741 stf.write('\n')
742 sortedfile = '%s.bam' % sortedfile # samtools does that
743 cl.append('O=%s' % pic.metricsOut)
744 cl.append('TMP_DIR=%s' % opts.tmpdir)
745 cl.append('I=%s' % sortedfile)
746 opts.maxerrors = '99999999'
747 cl.append('MAX_OUTPUT=%s' % opts.maxerrors)
748 if opts.ignoreflags[0] <> 'None': # picard error values to ignore
749 igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None']
750 cl.append(' '.join(igs))
751 if opts.bisulphite.lower() <> 'false':
752 cl.append('IS_BISULFITE_SEQUENCED=true')
753 if opts.ref <> None or opts.ref_file <> None:
754 cl.append('R=%s' % ref_file_name)
755 stdouts,rval = pic.runPic(opts.jar,cl)
756 if opts.datatype == 'sam':
757 pic.delme.append(tempbam)
758 newsam = opts.output
759 outformat = 'bam'
760 pe = open(pic.metricsOut,'r').readlines()
761 pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat)
762 pic.delme.append(sortedfile) # not wanted
763 stf.close()
764 pic.cleanup()
765
766 ####liubo added CleanSam tool####
767 elif pic.picname == 'CleanSam':
768 # input
769 cl.append('INPUT=%s' % opts.input)
770 # output
771 cl.append('OUTPUT=%s' % tempout)
772 stdouts,rval = pic.runPic(opts.jar, cl)
773 haveTempout = True
774
775 elif pic.picname == 'SortSam':
776 cl.append('I=%s' % opts.input)
777 cl.append('O=%s' % tempout)
778 cl.append('SORT_ORDER=%s' % opts.sortorder)
779 stdouts,rval = pic.runPic(opts.jar,cl)
780 haveTempout = True
781
782 elif pic.picname == 'BuildBamIndex':
783 cl.append('I=%s' % opts.input)
784 cl.append('O=%s' % tempout)
785 stdouts,rval = pic.runPic(opts.jar,cl)
786 haveTempout = True
787
788 elif pic.picname == 'SamFormatConverter':
789 cl.append('INPUT=%s' % opts.input)
790 cl.append('OUTPUT=%s' % tempout)
791 pic.runPic(opts.jar, cl)
792 haveTempout = True
793
794
795 else:
796 print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
797 sys.exit(1)
798 if haveTempout:
799 # Some Picard tools produced a potentially intermediate bam file.
800 # Either just move to final location or create sam
801 if os.path.exists(tempout):
802 shutil.move(tempout, os.path.abspath(opts.output))
803 if opts.htmlout <> None or doFix: # return a pretty html page
804 pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines)
805 if rval <> 0:
806 print >> sys.stderr, '## exit code=%d; stdout=%s' % (rval,stdouts)
807 # signal failure
808 if __name__=="__main__": __main__()