comparison picard_wrapper.py @ 135:1790970e47fe draft

Uploaded
author devteam
date Wed, 26 Feb 2014 17:29:45 -0500
parents
children b2ca2d181fb4
comparison
equal deleted inserted replaced
134:522c7184b0f2 135:1790970e47fe
1 #!/usr/bin/env python
2 """
3 Originally written by Kelly Vincent
4 pretty output and additional picard wrappers by Ross Lazarus for rgenetics
5 Runs all available wrapped Picard tools.
6 usage: picard_wrapper.py [options]
7 code Ross wrote licensed under the LGPL
8 see http://www.gnu.org/copyleft/lesser.html
9 """
10
11 import optparse, os, sys, subprocess, tempfile, shutil, time, logging
12
# XHTML boilerplate emitted at the top of every generated report page;
# the single %s is filled with the wrapper program's name when the report
# is assembled (see PicardBase.fixPicardOutputs).
galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="Galaxy %s tool output - see http://getgalaxy.org/" />
<title></title>
<link rel="stylesheet" href="/static/style/base.css" type="text/css" />
</head>
<body>
<div class="document">
"""
# Attribution line for the report: the two %s placeholders receive the
# Picard tool name and a timestamp (see fixPicardOutputs).
galhtmlattr = """Galaxy tool %s run at %s</b><br/>"""
# Closes the <div>/<body>/<html> elements opened by galhtmlprefix.
galhtmlpostfix = """</div></body></html>\n"""
27
28
def stop_err( msg ):
    """Write msg to stderr and abort the script with a non-zero exit status.

    The previous bare sys.exit() exited with status 0, so a fatal error
    (e.g. from readLarge) was reported to the caller - such as the Galaxy
    job runner - as success.  Exit with 1 so failures are visible.
    """
    sys.stderr.write( '%s\n' % msg )
    sys.exit(1)
32
33
def timenow():
    """Return the current local time as a 'dd/mm/YYYY HH:MM:SS' string."""
    stamp = time.localtime(time.time())
    return time.strftime('%d/%m/%Y %H:%M:%S', stamp)
38
39
class PicardBase():
    """
    simple base class with some utilities for Picard
    adapted and merged with Kelly Vincent's code april 2011 Ross
    lots of changes...

    Shared plumbing for every wrapped Picard tool: option handling, a
    per-tool log file, shelled-out command execution (runCL/runPic),
    samtools conversion helpers, and HTML report generation.
    """

    def __init__(self, opts=None,arg0=None):
        """ common stuff needed at init for a picard tool

        opts: parsed optparse options object (must not be None).
        arg0: normally sys.argv[0]; only used to derive a display name
        for the generated report.
        """
        assert opts <> None, 'PicardBase needs opts at init'
        self.opts = opts
        if self.opts.outdir == None:
            self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir
        assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in'
        # tool name is derived from the jar file name, eg MarkDuplicates.jar -> MarkDuplicates
        self.picname = self.baseName(opts.jar)
        if self.picname.startswith('picard'):
            self.picname = opts.picard_cmd # special case for some tools like replaceheader?
        self.progname = self.baseName(arg0)
        self.version = '0.002'
        self.delme = [] # list of files to destroy
        self.title = opts.title
        self.inputfile = opts.input
        # best-effort directory creation - an already-existing dir is fine
        try:
            os.makedirs(opts.outdir)
        except:
            pass
        try:
            os.makedirs(opts.tmpdir)
        except:
            pass
        self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
        self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
        self.setLogging(logfname=self.log_filename)

    def baseName(self,name=None):
        """Return name stripped of both its directory and its extension."""
        return os.path.splitext(os.path.basename(name))[0]

    def setLogging(self,logfname="picard_wrapper.log"):
        """setup a logger

        Configures the root logger to append INFO-level messages to logfname.
        """
        logging.basicConfig(level=logging.INFO,
                    filename=logfname,
                    filemode='a')

    def readLarge(self,fname=None):
        """ read a potentially huge file.

        Reads in 1MB chunks rather than one big read; any failure (eg a
        missing file) aborts the whole script via stop_err.
        """
        try:
            # get stderr, allowing for case where it's very large
            tmp = open( fname, 'rb' )
            s = ''
            buffsize = 1048576
            try:
                while True:
                    more = tmp.read( buffsize )
                    if len(more) > 0:
                        s += more
                    else:
                        break
            except OverflowError:
                pass
            tmp.close()
        except Exception, e:
            stop_err( 'Read Large Exception : %s' % str( e ) )
        return s

    def runCL(self,cl=None,output_dir=None):
        """ construct and run a command line
        we have galaxy's temp path as opt.temp_dir so don't really need isolation
        sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts

        cl may be a list of tokens (joined with spaces) or a ready string.
        stdout/stderr are captured to temp files (removed afterwards) so
        very large outputs never sit in a pipe buffer.
        Returns (status_message, captured_stdout, return_code); on failure
        stderr is folded into the returned stdout text.
        """
        assert cl <> None, 'PicardBase runCL needs a command line as cl'
        if output_dir == None:
            output_dir = self.opts.outdir
        if type(cl) == type([]):
            cl = ' '.join(cl)
        fd,templog = tempfile.mkstemp(dir=output_dir,suffix='rgtempRun.txt')
        tlf = open(templog,'wb')
        fd,temperr = tempfile.mkstemp(dir=output_dir,suffix='rgtempErr.txt')
        tef = open(temperr,'wb')
        process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir)
        rval = process.wait()
        tlf.close()
        tef.close()
        stderrs = self.readLarge(temperr)
        stdouts = self.readLarge(templog)
        if rval > 0:
            s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs)
            stdouts = '%s\n%s' % (stdouts,stderrs) # make stderr visible to callers that only keep stdout
        else:
            s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval)
        logging.info(s)
        os.unlink(templog) # always
        os.unlink(temperr) # always
        return s, stdouts, rval # sometimes s is an output

    def runPic(self, jar, cl):
        """
        cl should be everything after the jar file name in the command

        Builds the java invocation (heap size and tmp dir taken from opts)
        and delegates to runCL.  Returns (stdout_text, return_code).
        """
        runme = ['java -Xmx%s' % self.opts.maxjheap]
        runme.append(" -Djava.io.tmpdir='%s' " % self.opts.tmpdir)
        runme.append('-jar %s' % jar)
        runme += cl
        s,stdouts,rval = self.runCL(cl=runme, output_dir=self.opts.outdir)
        return stdouts,rval

    def samToBam(self,infile=None,outdir=None):
        """
        use samtools view to convert sam to bam

        Returns (runCL_status, temp_bam_path, return_code); the caller is
        responsible for cleaning up the temp bam (usually via self.delme).
        """
        fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam')
        cl = ['samtools view -h -b -S -o ',tempbam,infile]
        tlog,stdouts,rval = self.runCL(cl,outdir)
        return tlog,tempbam,rval

    def sortSam(self, infile=None,outfile=None,outdir=None):
        """Run 'samtools sort' on infile; samtools appends .bam to outfile
        (callers add the '.bam' suffix themselves afterwards).
        """
        print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir)
        cl = ['samtools sort',infile,outfile]
        tlog,stdouts,rval = self.runCL(cl,outdir)
        return tlog

    def cleanup(self):
        """Delete every temp file registered in self.delme, ignoring errors."""
        for fname in self.delme:
            try:
                os.unlink(fname)
            except:
                pass

    def prettyPicout(self,transpose,maxrows):
        """organize picard outputs into a report html page

        Parses the tab-separated metrics file into html table rows; '#'
        header lines and data lines are rendered separately, and the data
        may be transposed for readability.  Returns a list of html
        fragments (empty if there is no metrics file).
        NOTE(review): maxrows is accepted but never used in this method.
        """
        res = []
        try:
            r = open(self.metricsOut,'r').readlines()
        except:
            r = [] # no metrics file - emit an empty report section
        if len(r) > 0:
            res.append('<b>Picard on line resources</b><ul>\n')
            res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
            res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
            if transpose:
                res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n')
            else:
                res.append('<b>Picard output</b><hr/>\n')
            res.append('<table cellpadding="3" >\n')
            dat = []
            heads = []
            lastr = len(r) - 1 # NOTE(review): computed but never used below
            # special case for estimate library complexity hist
            thist = False
            for i,row in enumerate(r):
                if row.strip() > '':
                    srow = row.split('\t')
                    if row.startswith('#'):
                        heads.append(row.strip()) # want strings
                    else:
                        dat.append(srow) # want lists
                if row.startswith('## HISTOGRAM'):
                    thist = True # histogram sections are rendered untransposed below
                if len(heads) > 0:
                    # flush accumulated header lines as full-width table rows
                    hres = ['<tr class="d%d"><td colspan="2">%s</td></tr>' % (i % 2,x) for i,x in enumerate(heads)]
                    res += hres
                    heads = []
                if len(dat) > 0:
                    if transpose and not thist:
                        tdat = map(None,*dat) # transpose an arbitrary list of lists
                        tdat = ['<tr class="d%d"><td>%s</td><td>%s&nbsp;</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
                    else:
                        tdat = ['\t'.join(x).strip() for x in dat] # back to strings :(
                        tdat = ['<tr class="d%d"><td colspan="2">%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)]
                    res += tdat
                    dat = []
            res.append('</table>\n')
        return res

    def fixPicardOutputs(self,transpose,maxloglines):
        """
        picard produces long hard to read tab header files
        make them available but present them transposed for readability

        Assembles the final html report - pdf thumbnails, output file
        listing, pretty-printed metrics and a (possibly truncated) run
        log - and writes it to opts.htmlout.  Also removes registered
        temp files first via cleanup().
        """
        logging.shutdown()
        self.cleanup() # remove temp files stored in delme
        # alternating row colours for the metrics tables built in prettyPicout
        rstyle="""<style type="text/css">
tr.d0 td {background-color: oldlace; color: black;}
tr.d1 td {background-color: aliceblue; color: black;}
</style>"""
        res = [rstyle,]
        res.append(galhtmlprefix % self.progname)
        res.append(galhtmlattr % (self.picname,timenow()))
        flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
        pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
        if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
            for p in pdflist:
                pbase = os.path.splitext(p)[0] # removes .pdf
                imghref = '%s.jpg' % pbase
                mimghref = '%s-0.jpg' % pbase # multiple pages pdf -> multiple thumbnails without asking!
                if mimghref in flist:
                    imghref=mimghref # only one for thumbnail...it's a multi page pdf
                res.append('<table cellpadding="10"><tr><td>\n')
                res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref))
                res.append('</tr></td></table>\n')
        if len(flist) > 0:
            res.append('<b>The following output files were created (click the filename to view/download a copy):</b><hr/>')
            res.append('<table>\n')
            for i,f in enumerate(flist):
                fn = os.path.split(f)[-1]
                res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
            res.append('</table><p/>\n')
        pres = self.prettyPicout(transpose,maxloglines)
        if len(pres) > 0:
            res += pres
        l = open(self.log_filename,'r').readlines()
        llen = len(l)
        if llen > 0:
            res.append('<b>Picard Tool Run Log</b><hr/>\n')
            rlog = ['<pre>',]
            if llen > maxloglines:
                # keep the first and last n lines with a deletion marker between
                n = min(50,int(maxloglines/2))
                rlog += l[:n]
                rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines))
                rlog += l[-n:]
            else:
                rlog += l
            rlog.append('</pre>')
            if llen > maxloglines:
                rlog.append('\n<b>## WARNING - %d log lines truncated - <a href="%s">%s</a> contains entire output</b>' % (llen - maxloglines,self.log_filename,self.log_filename))
            res += rlog
        else:
            res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
        res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
        res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool')
        res.append(galhtmlpostfix)
        outf = open(self.opts.htmlout,'w')
        outf.write(''.join(res))
        outf.write('\n')
        outf.close()

    def makePicInterval(self,inbed=None,outf=None):
        """
        picard wants bait and target files to have the same header length as the incoming bam/sam
        a meaningful (ie accurate) representation will fail because of this - so this hack
        it would be far better to be able to supply the original bed untouched
        Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list

        Validates that inbed has exactly 5 tab-separated fields per row
        with a +/- strand in column 4, then writes the input bam/sam
        header followed by the bed rows to outf.  Exits the script on a
        validation failure.  Returns outf.
        """
        assert inbed <> None
        bed = open(inbed,'r').readlines()
        sbed = [x.split('\t') for x in bed] # lengths MUST be 5
        lens = [len(x) for x in sbed]
        strands = [x[3] for x in sbed if not x[3] in ['+','-']]
        maxl = max(lens)
        minl = min(lens)
        e = []
        if maxl <> minl:
            e.append("## Input error: Inconsistent field count in %s - please read the documentation on bait/target format requirements, fix and try again" % inbed)
        if maxl <> 5:
            e.append("## Input error: %d fields found in %s, 5 required - please read the warning and documentation on bait/target format requirements, fix and try again" % (maxl,inbed))
        if len(strands) > 0:
            e.append("## Input error: Fourth column in %s is not the required strand (+ or -) - please read the warning and documentation on bait/target format requirements, fix and try again" % (inbed))
        if len(e) > 0: # write to stderr and quit
            print >> sys.stderr, '\n'.join(e)
            sys.exit(1)
        # grab the sam/bam header so the interval file matches the input's header
        thead = os.path.join(self.opts.outdir,'tempSamHead.txt')
        if self.opts.datatype == 'sam':
            cl = ['samtools view -H -S',self.opts.input,'>',thead]
        else:
            cl = ['samtools view -H',self.opts.input,'>',thead]
        self.runCL(cl=cl,output_dir=self.opts.outdir)
        head = open(thead,'r').readlines()
        s = '## got %d rows of header\n' % (len(head))
        logging.info(s)
        o = open(outf,'w')
        o.write(''.join(head))
        o.write(''.join(bed))
        o.close()
        return outf

    def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None):
        """
        interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
        Do the work of removing all the error sequences
        pysam is cool
        infile = pysam.Samfile( "-", "r" )
        outfile = pysam.Samfile( "-", "w", template = infile )
        for s in infile: outfile.write(s)

        errors from ValidateSameFile.jar look like
        WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
        ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
        ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041

        NOTE(review): pysam is imported inside __main__'s ValidateSamFile
        branch, not at module level - verify the name actually resolves
        when this method runs.
        NOTE(review): outformat is accepted but never used here.
        """
        assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam
        assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path'
        # second comma-separated field of each error line is ' Read name XXX'
        removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2]
        remDict = dict(zip(removeNames,range(len(removeNames))))
        infile = pysam.Samfile(insam,'rb')
        info = 'found %d error sequences in picardErrors, %d unique' % (len(removeNames),len(remDict))
        if len(removeNames) > 0:
            outfile = pysam.Samfile(newsam,'wb',template=infile) # template must be an open file
            i = 0
            j = 0
            for row in infile:
                dropme = remDict.get(row.qname,None) # keep if None
                # NOTE(review): remDict maps name -> list index, so the first
                # name maps to 0 which is falsy - reads matching it are KEPT,
                # not dropped.  Looks like a bug; an 'is None' test would fix it.
                if not dropme:
                    outfile.write(row)
                    j += 1
                else: # discard
                    i += 1
            info = '%s\n%s' % (info, 'Discarded %d lines writing %d to %s from %s' % (i,j,newsam,insam))
            outfile.close()
            infile.close()
        else: # we really want a nullop or a simple pointer copy
            infile.close()
            if newsam:
                shutil.copy(insam,newsam)
        logging.info(info)
361
362
363
364 def __main__():
365 doFix = False # tools returning htmlfile don't need this
366 doTranspose = True # default
367 maxloglines = 100 # default
368 #Parse Command Line
369 op = optparse.OptionParser()
370 # All tools
371 op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' )
372 op.add_option('-e', '--inputext', default=None)
373 op.add_option('-o', '--output', default=None)
374 op.add_option('-n', '--title', default="Pick a Picard Tool")
375 op.add_option('-t', '--htmlout', default=None)
376 op.add_option('-d', '--outdir', default=None)
377 op.add_option('-x', '--maxjheap', default='3000m')
378 op.add_option('-b', '--bisulphite', default='false')
379 op.add_option('-s', '--sortorder', default='query')
380 op.add_option('','--tmpdir', default='/tmp')
381 op.add_option('-j','--jar',default='')
382 op.add_option('','--picard-cmd',default=None)
383 # Many tools
384 op.add_option( '', '--output-format', dest='output_format', help='Output format' )
385 op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' )
386 op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None )
387 # CreateSequenceDictionary
388 op.add_option( '', '--ref-file', dest='ref_file', help='Fasta to use as reference', default=None )
389 op.add_option( '', '--species-name', dest='species_name', help='Species name to use in creating dict file from fasta file' )
390 op.add_option( '', '--build-name', dest='build_name', help='Name of genome assembly to use in creating dict file from fasta file' )
391 op.add_option( '', '--trunc-names', dest='trunc_names', help='Truncate sequence names at first whitespace from fasta file' )
392 # MarkDuplicates
393 op.add_option( '', '--remdups', default='true', help='Remove duplicates from output file' )
394 op.add_option( '', '--optdupdist', default="100", help='Maximum pixels between two identical sequences in order to consider them optical duplicates.' )
395 # CollectInsertSizeMetrics
396 op.add_option('', '--taillimit', default="0")
397 op.add_option('', '--histwidth', default="0")
398 op.add_option('', '--minpct', default="0.01")
399 op.add_option('', '--malevel', default='')
400 op.add_option('', '--deviations', default="0.0")
401 # CollectAlignmentSummaryMetrics
402 op.add_option('', '--maxinsert', default="20")
403 op.add_option('', '--adaptors', default='')
404 # FixMateInformation and validate
405 # CollectGcBiasMetrics
406 op.add_option('', '--windowsize', default='100')
407 op.add_option('', '--mingenomefrac', default='0.00001')
408 # AddOrReplaceReadGroups
409 op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
410 op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
411 op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' )
412 op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. run barcode) ' )
413 op.add_option( '', '--rg-sm', dest='rg_sample', help='Read Group sample name' )
414 op.add_option( '', '--rg-id', dest='rg_id', help='Read Group ID' )
415 op.add_option( '', '--rg-cn', dest='rg_seq_center', help='Read Group sequencing center name' )
416 op.add_option( '', '--rg-ds', dest='rg_desc', help='Read Group description' )
417 # ReorderSam
418 op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' )
419 op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' )
420 # ReplaceSamHeader
421 op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' )
422
423 op.add_option('','--assumesorted', default='true')
424 op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*")
425 #estimatelibrarycomplexity
426 op.add_option('','--minid', default="5")
427 op.add_option('','--maxdiff', default="0.03")
428 op.add_option('','--minmeanq', default="20")
429 #hsmetrics
430 op.add_option('','--baitbed', default=None)
431 op.add_option('','--targetbed', default=None)
432 #validate
433 op.add_option('','--ignoreflags', action='append', type="string")
434 op.add_option('','--maxerrors', default=None)
435 op.add_option('','--datatype', default=None)
436 op.add_option('','--bamout', default=None)
437 op.add_option('','--samout', default=None)
438 #downsample
439 op.add_option('','--probability', default="1")
440 op.add_option('','--seed', default="1")
441 #meanqualitybycycle
442 op.add_option('','--pfreadsonly', default='false')
443 op.add_option('','--alignedreadsonly', default='false')
444 #collectrnaseqmetrics
445 op.add_option('','--ignoreseq', action='append', type="string")
446 op.add_option('','--minlength', default=None)
447 op.add_option('','--refflat', default=None)
448 op.add_option('','--strandspecificity', default=None)
449 op.add_option('','--ribosomalintervals', default='null')
450 op.add_option('','--rrnafragmentpercentage', default=None)
451
452 opts, args = op.parse_args()
453 opts.sortme = opts.assumesorted == 'false'
454 assert opts.input <> None
455 # need to add
456 # instance that does all the work
457 pic = PicardBase(opts,sys.argv[0])
458
459 tmp_dir = opts.outdir
460 haveTempout = False # we use this where sam output is an option
461 rval = 0
462 stdouts = 'Not run yet'
463 # set ref and dict files to use (create if necessary)
464 ref_file_name = opts.ref
465 if opts.ref_file <> None:
466 csd = 'CreateSequenceDictionary'
467 realjarpath = os.path.split(opts.jar)[0]
468 jarpath = os.path.join(realjarpath,'%s.jar' % csd) # for refseq
469 tmp_ref_fd, tmp_ref_name = tempfile.mkstemp( dir=opts.tmpdir , prefix = pic.picname)
470 ref_file_name = '%s.fasta' % tmp_ref_name
471 # build dict
472 dict_file_name = '%s.dict' % tmp_ref_name
473 os.symlink( opts.ref_file, ref_file_name )
474 cl = ['REFERENCE=%s' % ref_file_name]
475 cl.append('OUTPUT=%s' % dict_file_name)
476 cl.append('URI=%s' % os.path.basename( opts.ref_file ))
477 cl.append('TRUNCATE_NAMES_AT_WHITESPACE=%s' % opts.trunc_names)
478 if opts.species_name:
479 cl.append('SPECIES=%s' % opts.species_name)
480 if opts.build_name:
481 cl.append('GENOME_ASSEMBLY=%s' % opts.build_name)
482 pic.delme.append(dict_file_name)
483 pic.delme.append(ref_file_name)
484 pic.delme.append(tmp_ref_name)
485 stdouts,rval = pic.runPic(jarpath, cl)
486 # run relevant command(s)
487
488 # define temporary output
489 # if output is sam, it must have that extension, otherwise bam will be produced
490 # specify sam or bam file with extension
491 if opts.output_format == 'sam':
492 suff = '.sam'
493 else:
494 suff = ''
495 tmp_fd, tempout = tempfile.mkstemp( dir=opts.tmpdir, suffix=suff )
496
497 cl = ['VALIDATION_STRINGENCY=LENIENT',]
498
499 if pic.picname == 'AddOrReplaceReadGroups':
500 # sort order to match Galaxy's default
501 cl.append('SORT_ORDER=coordinate')
502 # input
503 cl.append('INPUT=%s' % opts.input)
504 # outputs
505 cl.append('OUTPUT=%s' % tempout)
506 # required read groups
507 cl.append('RGLB="%s"' % opts.rg_library)
508 cl.append('RGPL="%s"' % opts.rg_platform)
509 cl.append('RGPU="%s"' % opts.rg_plat_unit)
510 cl.append('RGSM="%s"' % opts.rg_sample)
511 if opts.rg_id:
512 cl.append('RGID="%s"' % opts.rg_id)
513 # optional read groups
514 if opts.rg_seq_center:
515 cl.append('RGCN="%s"' % opts.rg_seq_center)
516 if opts.rg_desc:
517 cl.append('RGDS="%s"' % opts.rg_desc)
518 stdouts,rval = pic.runPic(opts.jar, cl)
519 haveTempout = True
520
521 elif pic.picname == 'BamIndexStats':
522 tmp_fd, tmp_name = tempfile.mkstemp( dir=tmp_dir )
523 tmp_bam_name = '%s.bam' % tmp_name
524 tmp_bai_name = '%s.bai' % tmp_bam_name
525 os.symlink( opts.input, tmp_bam_name )
526 os.symlink( opts.bai_file, tmp_bai_name )
527 cl.append('INPUT=%s' % ( tmp_bam_name ))
528 pic.delme.append(tmp_bam_name)
529 pic.delme.append(tmp_bai_name)
530 pic.delme.append(tmp_name)
531 stdouts,rval = pic.runPic( opts.jar, cl )
532 f = open(pic.metricsOut,'a')
533 f.write(stdouts) # got this on stdout from runCl
534 f.write('\n')
535 f.close()
536 doTranspose = False # but not transposed
537
538 elif pic.picname == 'EstimateLibraryComplexity':
539 cl.append('I=%s' % opts.input)
540 cl.append('O=%s' % pic.metricsOut)
541 if float(opts.minid) > 0:
542 cl.append('MIN_IDENTICAL_BASES=%s' % opts.minid)
543 if float(opts.maxdiff) > 0.0:
544 cl.append('MAX_DIFF_RATE=%s' % opts.maxdiff)
545 if float(opts.minmeanq) > 0:
546 cl.append('MIN_MEAN_QUALITY=%s' % opts.minmeanq)
547 if opts.readregex > '':
548 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
549 if float(opts.optdupdist) > 0:
550 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
551 stdouts,rval = pic.runPic(opts.jar, cl)
552
553 elif pic.picname == 'CollectAlignmentSummaryMetrics':
554 # Why do we do this fakefasta thing?
555 # Because we need NO fai to be available or picard barfs unless it matches the input data.
556 # why? Dunno Seems to work without complaining if the .bai file is AWOL....
557 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
558 try:
559 os.symlink(ref_file_name,fakefasta)
560 except:
561 s = '## unable to symlink %s to %s - different devices? Will shutil.copy'
562 info = s
563 shutil.copy(ref_file_name,fakefasta)
564 pic.delme.append(fakefasta)
565 cl.append('ASSUME_SORTED=true')
566 adaptlist = opts.adaptors.split(',')
567 adaptorseqs = ['ADAPTER_SEQUENCE=%s' % x for x in adaptlist]
568 cl += adaptorseqs
569 cl.append('IS_BISULFITE_SEQUENCED=%s' % opts.bisulphite)
570 cl.append('MAX_INSERT_SIZE=%s' % opts.maxinsert)
571 cl.append('OUTPUT=%s' % pic.metricsOut)
572 cl.append('R=%s' % fakefasta)
573 cl.append('TMP_DIR=%s' % opts.tmpdir)
574 if not opts.assumesorted.lower() == 'true': # we need to sort input
575 sortedfile = '%s.sorted' % os.path.basename(opts.input)
576 if opts.datatype == 'sam': # need to work with a bam
577 tlog,tempbam,trval = pic.samToBam(opts.input,opts.outdir)
578 pic.delme.append(tempbam)
579 try:
580 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
581 except:
582 print '## exception on sorting sam file %s' % opts.input
583 else: # is already bam
584 try:
585 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
586 except : # bug - [bam_sort_core] not being ignored - TODO fixme
587 print '## exception %s on sorting bam file %s' % (sys.exc_info()[0],opts.input)
588 cl.append('INPUT=%s.bam' % os.path.abspath(os.path.join(opts.outdir,sortedfile)))
589 pic.delme.append(os.path.join(opts.outdir,sortedfile))
590 else:
591 cl.append('INPUT=%s' % os.path.abspath(opts.input))
592 stdouts,rval = pic.runPic(opts.jar, cl)
593
594
595 elif pic.picname == 'CollectGcBiasMetrics':
596 assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name
597 # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
598 # why? Dunno
599 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
600 try:
601 os.symlink(ref_file_name,fakefasta)
602 except:
603 s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy'
604 info = s
605 shutil.copy(ref_file_name,fakefasta)
606 pic.delme.append(fakefasta)
607 x = 'rgPicardGCBiasMetrics'
608 pdfname = '%s.pdf' % x
609 jpgname = '%s.jpg' % x
610 tempout = os.path.join(opts.outdir,'rgPicardGCBiasMetrics.out')
611 temppdf = os.path.join(opts.outdir,pdfname)
612 cl.append('R=%s' % fakefasta)
613 cl.append('WINDOW_SIZE=%s' % opts.windowsize)
614 cl.append('MINIMUM_GENOME_FRACTION=%s' % opts.mingenomefrac)
615 cl.append('INPUT=%s' % opts.input)
616 cl.append('OUTPUT=%s' % tempout)
617 cl.append('TMP_DIR=%s' % opts.tmpdir)
618 cl.append('CHART_OUTPUT=%s' % temppdf)
619 cl.append('SUMMARY_OUTPUT=%s' % pic.metricsOut)
620 stdouts,rval = pic.runPic(opts.jar, cl)
621 if os.path.isfile(temppdf):
622 cl2 = ['convert','-resize x400',temppdf,os.path.join(opts.outdir,jpgname)] # make the jpg for fixPicardOutputs to find
623 s,stdouts,rval = pic.runCL(cl=cl2,output_dir=opts.outdir)
624 else:
625 s='### runGC: Unable to find pdf %s - please check the log for the causal problem\n' % temppdf
626 lf = open(pic.log_filename,'a')
627 lf.write(s)
628 lf.write('\n')
629 lf.close()
630
631 elif pic.picname == 'CollectInsertSizeMetrics':
632 """ <command interpreter="python">
633 picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
634 --histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel"
635 -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
636 </command>
637 """
638 isPDF = 'InsertSizeHist.pdf'
639 pdfpath = os.path.join(opts.outdir,isPDF)
640 histpdf = 'InsertSizeHist.pdf'
641 cl.append('I=%s' % opts.input)
642 cl.append('O=%s' % pic.metricsOut)
643 cl.append('HISTOGRAM_FILE=%s' % histpdf)
644 #if opts.taillimit <> '0': # this was deprecated although still mentioned in the docs at 1.56
645 # cl.append('TAIL_LIMIT=%s' % opts.taillimit)
646 if opts.histwidth <> '0':
647 cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth)
648 if float( opts.minpct) > 0.0:
649 cl.append('MINIMUM_PCT=%s' % opts.minpct)
650 if float(opts.deviations) > 0.0:
651 cl.append('DEVIATIONS=%s' % opts.deviations)
652 if opts.malevel:
653 malists = opts.malevel.split(',')
654 malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists]
655 cl += malist
656 stdouts,rval = pic.runPic(opts.jar, cl)
657 if os.path.exists(pdfpath): # automake thumbnail - will be added to html
658 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
659 pic.runCL(cl=cl2,output_dir=opts.outdir)
660 else:
661 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
662 s += 'This <b>always happens if single ended data was provided</b> to this tool,\n'
663 s += 'so please double check that your input data really is paired-end NGS data.<br/>\n'
664 s += 'If your input was paired data this may be a bug worth reporting to the galaxy-bugs list\n<br/>'
665 logging.info(s)
666 if len(stdouts) > 0:
667 logging.info(stdouts)
668
669 elif pic.picname == 'MarkDuplicates':
670 # assume sorted even if header says otherwise
671 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
672 # input
673 cl.append('INPUT=%s' % opts.input)
674 # outputs
675 cl.append('OUTPUT=%s' % opts.output)
676 cl.append('METRICS_FILE=%s' % pic.metricsOut )
677 # remove or mark duplicates
678 cl.append('REMOVE_DUPLICATES=%s' % opts.remdups)
679 # the regular expression to be used to parse reads in incoming SAM file
680 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
681 # maximum offset between two duplicate clusters
682 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
683 stdouts,rval = pic.runPic(opts.jar, cl)
684
685 elif pic.picname == 'FixMateInformation':
686 cl.append('I=%s' % opts.input)
687 cl.append('O=%s' % tempout)
688 cl.append('SORT_ORDER=%s' % opts.sortorder)
689 stdouts,rval = pic.runPic(opts.jar,cl)
690 haveTempout = True
691
692 elif pic.picname == 'ReorderSam':
693 # input
694 cl.append('INPUT=%s' % opts.input)
695 # output
696 cl.append('OUTPUT=%s' % tempout)
697 # reference
698 cl.append('REFERENCE=%s' % ref_file_name)
699 # incomplete dict concordance
700 if opts.allow_inc_dict_concord == 'true':
701 cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true')
702 # contig length discordance
703 if opts.allow_contig_len_discord == 'true':
704 cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true')
705 stdouts,rval = pic.runPic(opts.jar, cl)
706 haveTempout = True
707
# ReplaceSamHeader: swap the header of the input SAM/BAM with a user-supplied one.
# Writes to a temp file; the shared haveTempout logic below moves it into place.
708 elif pic.picname == 'ReplaceSamHeader':
709 cl.append('INPUT=%s' % opts.input)
710 cl.append('OUTPUT=%s' % tempout)
711 cl.append('HEADER=%s' % opts.header_file)
712 stdouts,rval = pic.runPic(opts.jar, cl)
713 haveTempout = True
714
# CalculateHsMetrics: hybrid-selection metrics. Bait/target BED files are first
# converted to Picard interval-list format; when both BEDs are the same file the
# bait interval list is reused rather than converted twice.
715 elif pic.picname == 'CalculateHsMetrics':
716 maxloglines = 100
717 baitfname = os.path.join(opts.outdir,'rgPicardHsMetrics.bait')
718 targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target')
719 baitf = pic.makePicInterval(opts.baitbed,baitfname)
720 if opts.targetbed == opts.baitbed: # same file sometimes
721 targetf = baitf
722 else:
723 targetf = pic.makePicInterval(opts.targetbed,targetfname)
724 cl.append('BAIT_INTERVALS=%s' % baitf)
725 cl.append('TARGET_INTERVALS=%s' % targetf)
726 cl.append('INPUT=%s' % os.path.abspath(opts.input))
727 cl.append('OUTPUT=%s' % pic.metricsOut)
728 cl.append('TMP_DIR=%s' % opts.tmpdir)
729 stdouts,rval = pic.runPic(opts.jar,cl)
730
# ValidateSamFile: Picard validation. Input is coerced to a coordinate-sorted BAM
# first (SAM input is converted, then sorted; BAM input is sorted directly).
731 elif pic.picname == 'ValidateSamFile':
732 import pysam
733 doTranspose = False
734 sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
735 stf = open(pic.log_filename,'w')
736 tlog = None
737 if opts.datatype == 'sam': # need to work with a bam
738 tlog,tempbam,rval = pic.samToBam(opts.input,opts.outdir)
739 try:
740 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
# NOTE(review): bare except hides the real error; only a message is printed
# and processing continues with a possibly-missing sorted file.
741 except:
742 print '## exception on sorting sam file %s' % opts.input
743 else: # is already bam
744 try:
745 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
746 except: # bug - [bam_sort_core] not being ignored - TODO fixme
747 print '## exception on sorting bam file %s' % opts.input
748 if tlog:
749 print '##tlog=',tlog
750 stf.write(tlog)
751 stf.write('\n')
# samtools sort appends a .bam suffix to the requested output prefix.
752 sortedfile = '%s.bam' % sortedfile # samtools does that
753 cl.append('O=%s' % pic.metricsOut)
754 cl.append('TMP_DIR=%s' % opts.tmpdir)
755 cl.append('I=%s' % sortedfile)
# Force an effectively unlimited error count so the report is complete.
756 opts.maxerrors = '99999999'
757 cl.append('MAX_OUTPUT=%s' % opts.maxerrors)
758 if opts.ignoreflags[0] <> 'None': # picard error values to ignore
759 igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None']
760 cl.append(' '.join(igs))
761 if opts.bisulphite.lower() <> 'false':
762 cl.append('IS_BISULFITE_SEQUENCED=true')
# NOTE(review): ref_file_name is not assigned anywhere in this branch — it is
# presumably set earlier in this function from opts.ref/opts.ref_file; if either
# option is given but that earlier setup did not run, this raises NameError.
763 if opts.ref <> None or opts.ref_file <> None:
764 cl.append('R=%s' % ref_file_name)
765 stdouts,rval = pic.runPic(opts.jar,cl)
766 if opts.datatype == 'sam':
767 pic.delme.append(tempbam)
768 newsam = opts.output
769 outformat = 'bam'
# Feed Picard's error report back into cleanSam to strip offending reads.
770 pe = open(pic.metricsOut,'r').readlines()
771 pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat)
772 pic.delme.append(sortedfile) # not wanted
773 stf.close()
774 pic.cleanup()
775
776 ####liubo added CleanSam tool####
# CleanSam: fix soft-clipping past end-of-reference and MAPQ of unmapped reads.
777 elif pic.picname == 'CleanSam':
778 # input
779 cl.append('INPUT=%s' % opts.input)
780 # output
781 cl.append('OUTPUT=%s' % tempout)
782 stdouts,rval = pic.runPic(opts.jar, cl)
783 haveTempout = True
784
# SortSam: sort into the order requested by the tool form (opts.sortorder).
785 elif pic.picname == 'SortSam':
786 cl.append('I=%s' % opts.input)
787 cl.append('O=%s' % tempout)
788 cl.append('SORT_ORDER=%s' % opts.sortorder)
789 stdouts,rval = pic.runPic(opts.jar,cl)
790 haveTempout = True
791
# BuildBamIndex: write a .bai index for the input BAM.
792 elif pic.picname == 'BuildBamIndex':
793 cl.append('I=%s' % opts.input)
794 cl.append('O=%s' % tempout)
795 stdouts,rval = pic.runPic(opts.jar,cl)
796 haveTempout = True
797
# SamFormatConverter: SAM<->BAM conversion.
# NOTE(review): unlike every other branch, runPic's (stdouts, rval) return is
# discarded here, so the rval/stdouts checked at the bottom of this function
# are stale values from whatever ran before — a failure in this tool would not
# be reported. Should be: stdouts,rval = pic.runPic(opts.jar, cl)
798 elif pic.picname == 'SamFormatConverter':
799 cl.append('INPUT=%s' % opts.input)
800 cl.append('OUTPUT=%s' % tempout)
801 pic.runPic(opts.jar, cl)
802 haveTempout = True
# DownsampleSam: randomly subsample reads. Output is written with a .sam
# suffix then renamed back to the Galaxy-assigned output path.
803 elif pic.picname == "DownsampleSam":
804 cl.append('I=%s' % opts.input)
805 mystring = opts.output
806 mystringsam = mystring + ".sam"
807 cl.append('O=%s' % mystringsam)
# Only pass PROBABILITY/RANDOM_SEED when positive, letting Picard defaults apply.
808 if float(opts.probability) > 0:
809 cl.append('PROBABILITY=%s' % opts.probability)
810 if float(opts.seed) > 0:
811 cl.append('RANDOM_SEED=%s' % opts.seed)
812 stdouts,rval = pic.runPic(opts.jar, cl)
813 myoutput = mystringsam.replace(".sam", "")
814 os.rename(mystringsam,myoutput)
815
# MeanQualityByCycle: per-cycle quality metrics plus a PDF chart; the chart is
# converted to a jpg thumbnail with ImageMagick mogrify for the html report.
816 elif pic.picname == 'MeanQualityByCycle':
817 isPDF = 'MeanQualityByCycle.pdf'
818 pdfpath = os.path.join(opts.outdir,isPDF)
819 histpdf = isPDF
820 cl.append('I=%s' % opts.input)
821 cl.append('O=%s' % pic.metricsOut)
822 cl.append('CHART_OUTPUT=%s' % histpdf)
823 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
824 cl.append('ALIGNED_READS_ONLY=%s' % (opts.alignedreadsonly))
825 cl.append('PF_READS_ONLY=%s' % (opts.pfreadsonly))
826 stdouts,rval = pic.runPic(opts.jar, cl)
827 if os.path.exists(pdfpath): # automake thumbnail - will be added to html
828 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
829 pic.runCL(cl=cl2,output_dir=opts.outdir)
830 else:
# Missing PDF is logged, not fatal — the metrics file may still be useful.
831 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
832 logging.info(s)
833 if len(stdouts) > 0:
834 logging.info(stdouts)
835
# CollectRnaSeqMetrics: RNA-seq metrics with optional ignored sequences and
# metric accumulation levels; chart handling mirrors MeanQualityByCycle above.
836 elif pic.picname == 'CollectRnaSeqMetrics':
837 isPDF = pic.picname + '.pdf'
838 pdfpath = os.path.join(opts.outdir,isPDF)
839 histpdf = isPDF
840 cl.append('I=%s' % opts.input)
841 cl.append('O=%s' % pic.metricsOut)
842 cl.append('CHART_OUTPUT=%s' % histpdf)
843 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
844 cl.append('MINIMUM_LENGTH=%s' % (opts.minlength))
845 cl.append('REF_FLAT=%s' % (opts.refflat))
846 cl.append('STRAND_SPECIFICITY=%s' % (opts.strandspecificity))
847 cl.append('RIBOSOMAL_INTERVALS=%s' % (opts.ribosomalintervals))
848 cl.append('RRNA_FRAGMENT_PERCENTAGE=%s' % (opts.rrnafragmentpercentage))
849 if opts.ignoreseq: # picard error values to ignore
850 igs = ['IGNORE_SEQUENCE=%s' % x for x in opts.ignoreseq if x <> 'None']
851 cl.append(' '.join(igs))
# malevel is a comma-separated list; one METRIC_ACCUMULATION_LEVEL arg per entry.
852 if opts.malevel:
853 malists = opts.malevel.split(',')
854 malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists]
855 cl += malist
856 stdouts,rval = pic.runPic(opts.jar, cl)
857 if os.path.exists(pdfpath): # automake thumbnail - will be added to html
858 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
859 pic.runCL(cl=cl2,output_dir=opts.outdir)
860 else:
861 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
862 logging.info(s)
863 if len(stdouts) > 0:
864 logging.info(stdouts)
865
# Unknown tool name: report and exit non-zero so Galaxy marks the job failed.
866 else:
867 print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
868 sys.exit(1)
# Shared post-processing for all tools that wrote to a temp output file.
869 if haveTempout:
870 # Some Picard tools produced a potentially intermediate bam file.
871 # Either just move to final location or create sam
872 if os.path.exists(tempout):
873 shutil.move(tempout, os.path.abspath(opts.output))
874 if opts.htmlout <> None or doFix: # return a pretty html page
875 pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines)
# Non-zero Picard exit status: echo code and captured stdout to stderr so the
# Galaxy job is flagged as failed (see SamFormatConverter note above re stale rval).
876 if rval <> 0:
877 print >> sys.stderr, '## exit code=%d; stdout=%s' % (rval,stdouts)
878 # signal failure
# Script entry point; __main__() is defined earlier in this file (outside this view).
879 if __name__=="__main__": __main__()