comparison picard_wrapper.py @ 33:1f4e2e13d9dc draft

Uploaded
author devteam
date Thu, 13 Feb 2014 16:01:51 -0500
parents
children c25cd3e674a5
comparison
equal deleted inserted replaced
32:0a8761ed9d53 33:1f4e2e13d9dc
1 #!/usr/bin/env python
2 """
3 Originally written by Kelly Vincent
4 pretty output and additional picard wrappers by Ross Lazarus for rgenetics
5 Runs all available wrapped Picard tools.
6 usage: picard_wrapper.py [options]
7 code Ross wrote licensed under the LGPL
8 see http://www.gnu.org/copyleft/lesser.html
9 """
10
11 import optparse, os, sys, subprocess, tempfile, shutil, time, logging
12
13 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
14 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
15 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
16 <head>
17 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
18 <meta name="generator" content="Bluefish 2.2.3" />
19 <title></title>
20 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
21 </head>
22 <body>
23 <div class="document">
24 """
25 galhtmlattr = """Galaxy tool %s run at %s</b><br/>"""
26 galhtmlpostfix = """</div></body></html>\n"""
27
28
29 def stop_err( msg ):
30 sys.stderr.write( '%s\n' % msg )
31 sys.exit()
32
33
34 def timenow():
35 """return current time as a string
36 """
37 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
38
39
40 class PicardBase():
41 """
42 simple base class with some utilities for Picard
43 adapted and merged with Kelly Vincent's code april 2011 Ross
44 lots of changes...
45 """
46
47 def __init__(self, opts=None,arg0=None):
48 """ common stuff needed at init for a picard tool
49 """
50 assert opts <> None, 'PicardBase needs opts at init'
51 self.opts = opts
52 if self.opts.outdir == None:
53 self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir
54 assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in'
55 self.picname = self.baseName(opts.jar)
56 if self.picname.startswith('picard'):
57 self.picname = opts.picard_cmd # special case for some tools like replaceheader?
58 self.progname = self.baseName(arg0)
59 self.version = '0.002'
60 self.delme = [] # list of files to destroy
61 self.title = opts.title
62 self.inputfile = opts.input
63 try:
64 os.makedirs(opts.outdir)
65 except:
66 pass
67 try:
68 os.makedirs(opts.tmpdir)
69 except:
70 pass
71 self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
72 self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
73 self.setLogging(logfname=self.log_filename)
74
75 def baseName(self,name=None):
76 return os.path.splitext(os.path.basename(name))[0]
77
78 def setLogging(self,logfname="picard_wrapper.log"):
79 """setup a logger
80 """
81 logging.basicConfig(level=logging.INFO,
82 filename=logfname,
83 filemode='a')
84
85
86 def readLarge(self,fname=None):
87 """ read a potentially huge file.
88 """
89 try:
90 # get stderr, allowing for case where it's very large
91 tmp = open( fname, 'rb' )
92 s = ''
93 buffsize = 1048576
94 try:
95 while True:
96 more = tmp.read( buffsize )
97 if len(more) > 0:
98 s += more
99 else:
100 break
101 except OverflowError:
102 pass
103 tmp.close()
104 except Exception, e:
105 stop_err( 'Read Large Exception : %s' % str( e ) )
106 return s
107
108 def runCL(self,cl=None,output_dir=None):
109 """ construct and run a command line
110 we have galaxy's temp path as opt.temp_dir so don't really need isolation
111 sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
112 """
113 assert cl <> None, 'PicardBase runCL needs a command line as cl'
114 if output_dir == None:
115 output_dir = self.opts.outdir
116 if type(cl) == type([]):
117 cl = ' '.join(cl)
118 fd,templog = tempfile.mkstemp(dir=output_dir,suffix='rgtempRun.txt')
119 tlf = open(templog,'wb')
120 fd,temperr = tempfile.mkstemp(dir=output_dir,suffix='rgtempErr.txt')
121 tef = open(temperr,'wb')
122 process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir)
123 rval = process.wait()
124 tlf.close()
125 tef.close()
126 stderrs = self.readLarge(temperr)
127 stdouts = self.readLarge(templog)
128 if rval > 0:
129 s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs)
130 stdouts = '%s\n%s' % (stdouts,stderrs)
131 else:
132 s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval)
133 logging.info(s)
134 os.unlink(templog) # always
135 os.unlink(temperr) # always
136 return s, stdouts, rval # sometimes s is an output
137
138 def runPic(self, jar, cl):
139 """
140 cl should be everything after the jar file name in the command
141 """
142 runme = ['java -Xmx%s' % self.opts.maxjheap]
143 runme.append(" -Djava.io.tmpdir='%s' " % self.opts.tmpdir)
144 runme.append('-jar %s' % jar)
145 runme += cl
146 s,stdouts,rval = self.runCL(cl=runme, output_dir=self.opts.outdir)
147 return stdouts,rval
148
149 def samToBam(self,infile=None,outdir=None):
150 """
151 use samtools view to convert sam to bam
152 """
153 fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam')
154 cl = ['samtools view -h -b -S -o ',tempbam,infile]
155 tlog,stdouts,rval = self.runCL(cl,outdir)
156 return tlog,tempbam,rval
157
158 def sortSam(self, infile=None,outfile=None,outdir=None):
159 """
160 """
161 print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir)
162 cl = ['samtools sort',infile,outfile]
163 tlog,stdouts,rval = self.runCL(cl,outdir)
164 return tlog
165
166 def cleanup(self):
167 for fname in self.delme:
168 try:
169 os.unlink(fname)
170 except:
171 pass
172
173 def prettyPicout(self,transpose,maxrows):
174 """organize picard outpouts into a report html page
175 """
176 res = []
177 try:
178 r = open(self.metricsOut,'r').readlines()
179 except:
180 r = []
181 if len(r) > 0:
182 res.append('<b>Picard on line resources</b><ul>\n')
183 res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
184 res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
185 if transpose:
186 res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n')
187 else:
188 res.append('<b>Picard output</b><hr/>\n')
189 res.append('<table cellpadding="3" >\n')
190 dat = []
191 heads = []
192 lastr = len(r) - 1
193 # special case for estimate library complexity hist
194 thist = False
195 for i,row in enumerate(r):
196 if row.strip() > '':
197 srow = row.split('\t')
198 if row.startswith('#'):
199 heads.append(row.strip()) # want strings
200 else:
201 dat.append(srow) # want lists
202 if row.startswith('## HISTOGRAM'):
203 thist = True
204 if len(heads) > 0:
205 hres = ['<tr class="d%d"><td colspan="2">%s</td></tr>' % (i % 2,x) for i,x in enumerate(heads)]
206 res += hres
207 heads = []
208 if len(dat) > 0:
209 if transpose and not thist:
210 tdat = map(None,*dat) # transpose an arbitrary list of lists
211 tdat = ['<tr class="d%d"><td>%s</td><td>%s&nbsp;</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
212 else:
213 tdat = ['\t'.join(x).strip() for x in dat] # back to strings :(
214 tdat = ['<tr class="d%d"><td colspan="2">%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)]
215 res += tdat
216 dat = []
217 res.append('</table>\n')
218 return res
219
220 def fixPicardOutputs(self,transpose,maxloglines):
221 """
222 picard produces long hard to read tab header files
223 make them available but present them transposed for readability
224 """
225 logging.shutdown()
226 self.cleanup() # remove temp files stored in delme
227 rstyle="""<style type="text/css">
228 tr.d0 td {background-color: oldlace; color: black;}
229 tr.d1 td {background-color: aliceblue; color: black;}
230 </style>"""
231 res = [rstyle,]
232 res.append(galhtmlprefix % self.progname)
233 res.append(galhtmlattr % (self.picname,timenow()))
234 flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
235 pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
236 if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
237 for p in pdflist:
238 pbase = os.path.splitext(p)[0] # removes .pdf
239 imghref = '%s.jpg' % pbase
240 mimghref = '%s-0.jpg' % pbase # multiple pages pdf -> multiple thumbnails without asking!
241 if mimghref in flist:
242 imghref=mimghref # only one for thumbnail...it's a multi page pdf
243 res.append('<table cellpadding="10"><tr><td>\n')
244 res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref))
245 res.append('</tr></td></table>\n')
246 if len(flist) > 0:
247 res.append('<b>The following output files were created (click the filename to view/download a copy):</b><hr/>')
248 res.append('<table>\n')
249 for i,f in enumerate(flist):
250 fn = os.path.split(f)[-1]
251 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
252 res.append('</table><p/>\n')
253 pres = self.prettyPicout(transpose,maxloglines)
254 if len(pres) > 0:
255 res += pres
256 l = open(self.log_filename,'r').readlines()
257 llen = len(l)
258 if llen > 0:
259 res.append('<b>Picard Tool Run Log</b><hr/>\n')
260 rlog = ['<pre>',]
261 if llen > maxloglines:
262 n = min(50,int(maxloglines/2))
263 rlog += l[:n]
264 rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines))
265 rlog += l[-n:]
266 else:
267 rlog += l
268 rlog.append('</pre>')
269 if llen > maxloglines:
270 rlog.append('\n<b>## WARNING - %d log lines truncated - <a href="%s">%s</a> contains entire output</b>' % (llen - maxloglines,self.log_filename,self.log_filename))
271 res += rlog
272 else:
273 res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
274 res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
275 res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool')
276 res.append(galhtmlpostfix)
277 outf = open(self.opts.htmlout,'w')
278 outf.write(''.join(res))
279 outf.write('\n')
280 outf.close()
281
282 def makePicInterval(self,inbed=None,outf=None):
283 """
284 picard wants bait and target files to have the same header length as the incoming bam/sam
285 a meaningful (ie accurate) representation will fail because of this - so this hack
286 it would be far better to be able to supply the original bed untouched
287 Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list
288 """
289 assert inbed <> None
290 bed = open(inbed,'r').readlines()
291 sbed = [x.split('\t') for x in bed] # lengths MUST be 5
292 lens = [len(x) for x in sbed]
293 strands = [x[3] for x in sbed if not x[3] in ['+','-']]
294 maxl = max(lens)
295 minl = min(lens)
296 e = []
297 if maxl <> minl:
298 e.append("## Input error: Inconsistent field count in %s - please read the documentation on bait/target format requirements, fix and try again" % inbed)
299 if maxl <> 5:
300 e.append("## Input error: %d fields found in %s, 5 required - please read the warning and documentation on bait/target format requirements, fix and try again" % (maxl,inbed))
301 if len(strands) > 0:
302 e.append("## Input error: Fourth column in %s is not the required strand (+ or -) - please read the warning and documentation on bait/target format requirements, fix and try again" % (inbed))
303 if len(e) > 0: # write to stderr and quit
304 print >> sys.stderr, '\n'.join(e)
305 sys.exit(1)
306 thead = os.path.join(self.opts.outdir,'tempSamHead.txt')
307 if self.opts.datatype == 'sam':
308 cl = ['samtools view -H -S',self.opts.input,'>',thead]
309 else:
310 cl = ['samtools view -H',self.opts.input,'>',thead]
311 self.runCL(cl=cl,output_dir=self.opts.outdir)
312 head = open(thead,'r').readlines()
313 s = '## got %d rows of header\n' % (len(head))
314 logging.info(s)
315 o = open(outf,'w')
316 o.write(''.join(head))
317 o.write(''.join(bed))
318 o.close()
319 return outf
320
321 def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None):
322 """
323 interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
324 Do the work of removing all the error sequences
325 pysam is cool
326 infile = pysam.Samfile( "-", "r" )
327 outfile = pysam.Samfile( "-", "w", template = infile )
328 for s in infile: outfile.write(s)
329
330 errors from ValidateSameFile.jar look like
331 WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
332 ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
333 ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041
334
335 """
336 assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam
337 assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path'
338 removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2]
339 remDict = dict(zip(removeNames,range(len(removeNames))))
340 infile = pysam.Samfile(insam,'rb')
341 info = 'found %d error sequences in picardErrors, %d unique' % (len(removeNames),len(remDict))
342 if len(removeNames) > 0:
343 outfile = pysam.Samfile(newsam,'wb',template=infile) # template must be an open file
344 i = 0
345 j = 0
346 for row in infile:
347 dropme = remDict.get(row.qname,None) # keep if None
348 if not dropme:
349 outfile.write(row)
350 j += 1
351 else: # discard
352 i += 1
353 info = '%s\n%s' % (info, 'Discarded %d lines writing %d to %s from %s' % (i,j,newsam,insam))
354 outfile.close()
355 infile.close()
356 else: # we really want a nullop or a simple pointer copy
357 infile.close()
358 if newsam:
359 shutil.copy(insam,newsam)
360 logging.info(info)
361
362
363
364 def __main__():
365 doFix = False # tools returning htmlfile don't need this
366 doTranspose = True # default
367 maxloglines = 100 # default
368 #Parse Command Line
369 op = optparse.OptionParser()
370 # All tools
371 op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' )
372 op.add_option('-e', '--inputext', default=None)
373 op.add_option('-o', '--output', default=None)
374 op.add_option('-n', '--title', default="Pick a Picard Tool")
375 op.add_option('-t', '--htmlout', default=None)
376 op.add_option('-d', '--outdir', default=None)
377 op.add_option('-x', '--maxjheap', default='4g')
378 op.add_option('-b', '--bisulphite', default='false')
379 op.add_option('-s', '--sortorder', default='query')
380 op.add_option('','--tmpdir', default='/tmp')
381 op.add_option('-j','--jar',default='')
382 op.add_option('','--picard-cmd',default=None)
383 # Many tools
384 op.add_option( '', '--output-format', dest='output_format', help='Output format' )
385 op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' )
386 op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None )
387 # CreateSequenceDictionary
388 op.add_option( '', '--ref-file', dest='ref_file', help='Fasta to use as reference', default=None )
389 op.add_option( '', '--species-name', dest='species_name', help='Species name to use in creating dict file from fasta file' )
390 op.add_option( '', '--build-name', dest='build_name', help='Name of genome assembly to use in creating dict file from fasta file' )
391 op.add_option( '', '--trunc-names', dest='trunc_names', help='Truncate sequence names at first whitespace from fasta file' )
392 # MarkDuplicates
393 op.add_option( '', '--remdups', default='true', help='Remove duplicates from output file' )
394 op.add_option( '', '--optdupdist', default="100", help='Maximum pixels between two identical sequences in order to consider them optical duplicates.' )
395 # CollectInsertSizeMetrics
396 op.add_option('', '--taillimit', default="0")
397 op.add_option('', '--histwidth', default="0")
398 op.add_option('', '--minpct', default="0.01")
399 op.add_option('', '--malevel', default='')
400 op.add_option('', '--deviations', default="0.0")
401 # CollectAlignmentSummaryMetrics
402 op.add_option('', '--maxinsert', default="20")
403 op.add_option('', '--adaptors', default='')
404 # FixMateInformation and validate
405 # CollectGcBiasMetrics
406 op.add_option('', '--windowsize', default='100')
407 op.add_option('', '--mingenomefrac', default='0.00001')
408 # AddOrReplaceReadGroups
409 op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
410 op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
411 op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' )
412 op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. run barcode) ' )
413 op.add_option( '', '--rg-sm', dest='rg_sample', help='Read Group sample name' )
414 op.add_option( '', '--rg-id', dest='rg_id', help='Read Group ID' )
415 op.add_option( '', '--rg-cn', dest='rg_seq_center', help='Read Group sequencing center name' )
416 op.add_option( '', '--rg-ds', dest='rg_desc', help='Read Group description' )
417 # ReorderSam
418 op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' )
419 op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' )
420 # ReplaceSamHeader
421 op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' )
422
423 op.add_option('','--assumesorted', default='true')
424 op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*")
425 #estimatelibrarycomplexity
426 op.add_option('','--minid', default="5")
427 op.add_option('','--maxdiff', default="0.03")
428 op.add_option('','--minmeanq', default="20")
429 #hsmetrics
430 op.add_option('','--baitbed', default=None)
431 op.add_option('','--targetbed', default=None)
432 #validate
433 op.add_option('','--ignoreflags', action='append', type="string")
434 op.add_option('','--maxerrors', default=None)
435 op.add_option('','--datatype', default=None)
436 op.add_option('','--bamout', default=None)
437 op.add_option('','--samout', default=None)
438 #downsample
439 op.add_option('','--probability', default="1")
440 op.add_option('','--seed', default="1")
441
442 ##
443 opts, args = op.parse_args()
444 opts.sortme = opts.assumesorted == 'false'
445 assert opts.input <> None
446 # need to add
447 # instance that does all the work
448 pic = PicardBase(opts,sys.argv[0])
449
450 tmp_dir = opts.outdir
451 haveTempout = False # we use this where sam output is an option
452 rval = 0
453 stdouts = 'Not run yet'
454 # set ref and dict files to use (create if necessary)
455 ref_file_name = opts.ref
456 if opts.ref_file <> None:
457 csd = 'CreateSequenceDictionary'
458 realjarpath = os.path.split(opts.jar)[0]
459 jarpath = os.path.join(realjarpath,'%s.jar' % csd) # for refseq
460 tmp_ref_fd, tmp_ref_name = tempfile.mkstemp( dir=opts.tmpdir , prefix = pic.picname)
461 ref_file_name = '%s.fasta' % tmp_ref_name
462 # build dict
463 dict_file_name = '%s.dict' % tmp_ref_name
464 os.symlink( opts.ref_file, ref_file_name )
465 cl = ['REFERENCE=%s' % ref_file_name]
466 cl.append('OUTPUT=%s' % dict_file_name)
467 cl.append('URI=%s' % os.path.basename( opts.ref_file ))
468 cl.append('TRUNCATE_NAMES_AT_WHITESPACE=%s' % opts.trunc_names)
469 if opts.species_name:
470 cl.append('SPECIES=%s' % opts.species_name)
471 if opts.build_name:
472 cl.append('GENOME_ASSEMBLY=%s' % opts.build_name)
473 pic.delme.append(dict_file_name)
474 pic.delme.append(ref_file_name)
475 pic.delme.append(tmp_ref_name)
476 stdouts,rval = pic.runPic(jarpath, cl)
477 # run relevant command(s)
478
479 # define temporary output
480 # if output is sam, it must have that extension, otherwise bam will be produced
481 # specify sam or bam file with extension
482 if opts.output_format == 'sam':
483 suff = '.sam'
484 else:
485 suff = ''
486 tmp_fd, tempout = tempfile.mkstemp( dir=opts.tmpdir, suffix=suff )
487
488 cl = ['VALIDATION_STRINGENCY=LENIENT',]
489
490 if pic.picname == 'AddOrReplaceReadGroups':
491 # sort order to match Galaxy's default
492 cl.append('SORT_ORDER=coordinate')
493 # input
494 cl.append('INPUT=%s' % opts.input)
495 # outputs
496 cl.append('OUTPUT=%s' % tempout)
497 # required read groups
498 cl.append('RGLB="%s"' % opts.rg_library)
499 cl.append('RGPL="%s"' % opts.rg_platform)
500 cl.append('RGPU="%s"' % opts.rg_plat_unit)
501 cl.append('RGSM="%s"' % opts.rg_sample)
502 if opts.rg_id:
503 cl.append('RGID="%s"' % opts.rg_id)
504 # optional read groups
505 if opts.rg_seq_center:
506 cl.append('RGCN="%s"' % opts.rg_seq_center)
507 if opts.rg_desc:
508 cl.append('RGDS="%s"' % opts.rg_desc)
509 stdouts,rval = pic.runPic(opts.jar, cl)
510 haveTempout = True
511
512 elif pic.picname == 'BamIndexStats':
513 tmp_fd, tmp_name = tempfile.mkstemp( dir=tmp_dir )
514 tmp_bam_name = '%s.bam' % tmp_name
515 tmp_bai_name = '%s.bai' % tmp_bam_name
516 os.symlink( opts.input, tmp_bam_name )
517 os.symlink( opts.bai_file, tmp_bai_name )
518 cl.append('INPUT=%s' % ( tmp_bam_name ))
519 pic.delme.append(tmp_bam_name)
520 pic.delme.append(tmp_bai_name)
521 pic.delme.append(tmp_name)
522 stdouts,rval = pic.runPic( opts.jar, cl )
523 f = open(pic.metricsOut,'a')
524 f.write(stdouts) # got this on stdout from runCl
525 f.write('\n')
526 f.close()
527 doTranspose = False # but not transposed
528
529 elif pic.picname == 'EstimateLibraryComplexity':
530 cl.append('I=%s' % opts.input)
531 cl.append('O=%s' % pic.metricsOut)
532 if float(opts.minid) > 0:
533 cl.append('MIN_IDENTICAL_BASES=%s' % opts.minid)
534 if float(opts.maxdiff) > 0.0:
535 cl.append('MAX_DIFF_RATE=%s' % opts.maxdiff)
536 if float(opts.minmeanq) > 0:
537 cl.append('MIN_MEAN_QUALITY=%s' % opts.minmeanq)
538 if opts.readregex > '':
539 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
540 if float(opts.optdupdist) > 0:
541 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
542 stdouts,rval = pic.runPic(opts.jar, cl)
543
544 elif pic.picname == 'CollectAlignmentSummaryMetrics':
545 # Why do we do this fakefasta thing?
546 # Because we need NO fai to be available or picard barfs unless it matches the input data.
547 # why? Dunno Seems to work without complaining if the .bai file is AWOL....
548 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
549 try:
550 os.symlink(ref_file_name,fakefasta)
551 except:
552 s = '## unable to symlink %s to %s - different devices? Will shutil.copy'
553 info = s
554 shutil.copy(ref_file_name,fakefasta)
555 pic.delme.append(fakefasta)
556 cl.append('ASSUME_SORTED=true')
557 adaptlist = opts.adaptors.split(',')
558 adaptorseqs = ['ADAPTER_SEQUENCE=%s' % x for x in adaptlist]
559 cl += adaptorseqs
560 cl.append('IS_BISULFITE_SEQUENCED=%s' % opts.bisulphite)
561 cl.append('MAX_INSERT_SIZE=%s' % opts.maxinsert)
562 cl.append('OUTPUT=%s' % pic.metricsOut)
563 cl.append('R=%s' % fakefasta)
564 cl.append('TMP_DIR=%s' % opts.tmpdir)
565 if not opts.assumesorted.lower() == 'true': # we need to sort input
566 sortedfile = '%s.sorted' % os.path.basename(opts.input)
567 if opts.datatype == 'sam': # need to work with a bam
568 tlog,tempbam,trval = pic.samToBam(opts.input,opts.outdir)
569 pic.delme.append(tempbam)
570 try:
571 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
572 except:
573 print '## exception on sorting sam file %s' % opts.input
574 else: # is already bam
575 try:
576 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
577 except : # bug - [bam_sort_core] not being ignored - TODO fixme
578 print '## exception %s on sorting bam file %s' % (sys.exc_info()[0],opts.input)
579 cl.append('INPUT=%s.bam' % os.path.abspath(os.path.join(opts.outdir,sortedfile)))
580 pic.delme.append(os.path.join(opts.outdir,sortedfile))
581 else:
582 cl.append('INPUT=%s' % os.path.abspath(opts.input))
583 stdouts,rval = pic.runPic(opts.jar, cl)
584
585
586 elif pic.picname == 'CollectGcBiasMetrics':
587 assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name
588 # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
589 # why? Dunno
590 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
591 try:
592 os.symlink(ref_file_name,fakefasta)
593 except:
594 s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy'
595 info = s
596 shutil.copy(ref_file_name,fakefasta)
597 pic.delme.append(fakefasta)
598 x = 'rgPicardGCBiasMetrics'
599 pdfname = '%s.pdf' % x
600 jpgname = '%s.jpg' % x
601 tempout = os.path.join(opts.outdir,'rgPicardGCBiasMetrics.out')
602 temppdf = os.path.join(opts.outdir,pdfname)
603 cl.append('R=%s' % fakefasta)
604 cl.append('WINDOW_SIZE=%s' % opts.windowsize)
605 cl.append('MINIMUM_GENOME_FRACTION=%s' % opts.mingenomefrac)
606 cl.append('INPUT=%s' % opts.input)
607 cl.append('OUTPUT=%s' % tempout)
608 cl.append('TMP_DIR=%s' % opts.tmpdir)
609 cl.append('CHART_OUTPUT=%s' % temppdf)
610 cl.append('SUMMARY_OUTPUT=%s' % pic.metricsOut)
611 stdouts,rval = pic.runPic(opts.jar, cl)
612 if os.path.isfile(temppdf):
613 cl2 = ['convert','-resize x400',temppdf,os.path.join(opts.outdir,jpgname)] # make the jpg for fixPicardOutputs to find
614 s,stdouts,rval = pic.runCL(cl=cl2,output_dir=opts.outdir)
615 else:
616 s='### runGC: Unable to find pdf %s - please check the log for the causal problem\n' % temppdf
617 lf = open(pic.log_filename,'a')
618 lf.write(s)
619 lf.write('\n')
620 lf.close()
621
622 elif pic.picname == 'CollectInsertSizeMetrics':
623 """ <command interpreter="python">
624 picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
625 --histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel"
626 -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
627 </command>
628 """
629 isPDF = 'InsertSizeHist.pdf'
630 pdfpath = os.path.join(opts.outdir,isPDF)
631 histpdf = 'InsertSizeHist.pdf'
632 cl.append('I=%s' % opts.input)
633 cl.append('O=%s' % pic.metricsOut)
634 cl.append('HISTOGRAM_FILE=%s' % histpdf)
635 #if opts.taillimit <> '0': # this was deprecated although still mentioned in the docs at 1.56
636 # cl.append('TAIL_LIMIT=%s' % opts.taillimit)
637 if opts.histwidth <> '0':
638 cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth)
639 if float( opts.minpct) > 0.0:
640 cl.append('MINIMUM_PCT=%s' % opts.minpct)
641 if float(opts.deviations) > 0.0:
642 cl.append('DEVIATIONS=%s' % opts.deviations)
643 if opts.malevel:
644 malists = opts.malevel.split(',')
645 malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists]
646 cl += malist
647 stdouts,rval = pic.runPic(opts.jar, cl)
648 if os.path.exists(pdfpath): # automake thumbnail - will be added to html
649 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
650 pic.runCL(cl=cl2,output_dir=opts.outdir)
651 else:
652 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
653 s += 'This <b>always happens if single ended data was provided</b> to this tool,\n'
654 s += 'so please double check that your input data really is paired-end NGS data.<br/>\n'
655 s += 'If your input was paired data this may be a bug worth reporting to the galaxy-bugs list\n<br/>'
656 logging.info(s)
657 if len(stdouts) > 0:
658 logging.info(stdouts)
659
660 elif pic.picname == 'MarkDuplicates':
661 # assume sorted even if header says otherwise
662 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
663 # input
664 cl.append('INPUT=%s' % opts.input)
665 # outputs
666 cl.append('OUTPUT=%s' % opts.output)
667 cl.append('METRICS_FILE=%s' % pic.metricsOut )
668 # remove or mark duplicates
669 cl.append('REMOVE_DUPLICATES=%s' % opts.remdups)
670 # the regular expression to be used to parse reads in incoming SAM file
671 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
672 # maximum offset between two duplicate clusters
673 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
674 stdouts,rval = pic.runPic(opts.jar, cl)
675
676 elif pic.picname == 'FixMateInformation':
677 cl.append('I=%s' % opts.input)
678 cl.append('O=%s' % tempout)
679 cl.append('SORT_ORDER=%s' % opts.sortorder)
680 stdouts,rval = pic.runPic(opts.jar,cl)
681 haveTempout = True
682
683 elif pic.picname == 'ReorderSam':
684 # input
685 cl.append('INPUT=%s' % opts.input)
686 # output
687 cl.append('OUTPUT=%s' % tempout)
688 # reference
689 cl.append('REFERENCE=%s' % ref_file_name)
690 # incomplete dict concordance
691 if opts.allow_inc_dict_concord == 'true':
692 cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true')
693 # contig length discordance
694 if opts.allow_contig_len_discord == 'true':
695 cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true')
696 stdouts,rval = pic.runPic(opts.jar, cl)
697 haveTempout = True
698
699 elif pic.picname == 'ReplaceSamHeader':
700 cl.append('INPUT=%s' % opts.input)
701 cl.append('OUTPUT=%s' % tempout)
702 cl.append('HEADER=%s' % opts.header_file)
703 stdouts,rval = pic.runPic(opts.jar, cl)
704 haveTempout = True
705
706 elif pic.picname == 'CalculateHsMetrics':
707 maxloglines = 100
708 baitfname = os.path.join(opts.outdir,'rgPicardHsMetrics.bait')
709 targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target')
710 baitf = pic.makePicInterval(opts.baitbed,baitfname)
711 if opts.targetbed == opts.baitbed: # same file sometimes
712 targetf = baitf
713 else:
714 targetf = pic.makePicInterval(opts.targetbed,targetfname)
715 cl.append('BAIT_INTERVALS=%s' % baitf)
716 cl.append('TARGET_INTERVALS=%s' % targetf)
717 cl.append('INPUT=%s' % os.path.abspath(opts.input))
718 cl.append('OUTPUT=%s' % pic.metricsOut)
719 cl.append('TMP_DIR=%s' % opts.tmpdir)
720 stdouts,rval = pic.runPic(opts.jar,cl)
721
722 elif pic.picname == 'ValidateSamFile':
723 import pysam
724 doTranspose = False
725 sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
726 stf = open(pic.log_filename,'w')
727 tlog = None
728 if opts.datatype == 'sam': # need to work with a bam
729 tlog,tempbam,rval = pic.samToBam(opts.input,opts.outdir)
730 try:
731 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
732 except:
733 print '## exception on sorting sam file %s' % opts.input
734 else: # is already bam
735 try:
736 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
737 except: # bug - [bam_sort_core] not being ignored - TODO fixme
738 print '## exception on sorting bam file %s' % opts.input
739 if tlog:
740 print '##tlog=',tlog
741 stf.write(tlog)
742 stf.write('\n')
743 sortedfile = '%s.bam' % sortedfile # samtools does that
744 cl.append('O=%s' % pic.metricsOut)
745 cl.append('TMP_DIR=%s' % opts.tmpdir)
746 cl.append('I=%s' % sortedfile)
747 opts.maxerrors = '99999999'
748 cl.append('MAX_OUTPUT=%s' % opts.maxerrors)
749 if opts.ignoreflags[0] <> 'None': # picard error values to ignore
750 igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None']
751 cl.append(' '.join(igs))
752 if opts.bisulphite.lower() <> 'false':
753 cl.append('IS_BISULFITE_SEQUENCED=true')
754 if opts.ref <> None or opts.ref_file <> None:
755 cl.append('R=%s' % ref_file_name)
756 stdouts,rval = pic.runPic(opts.jar,cl)
757 if opts.datatype == 'sam':
758 pic.delme.append(tempbam)
759 newsam = opts.output
760 outformat = 'bam'
761 pe = open(pic.metricsOut,'r').readlines()
762 pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat)
763 pic.delme.append(sortedfile) # not wanted
764 stf.close()
765 pic.cleanup()
766 elif pic.picname == "DownsampleSam":
767 cl.append('I=%s' % opts.input)
768 mystring = opts.output
769 mystringsam = mystring + ".sam"
770 cl.append('O=%s' % mystringsam)
771 if float(opts.probability) > 0:
772 cl.append('PROBABILITY=%s' % opts.probability)
773 if float(opts.seed) > 0:
774 cl.append('RANDOM_SEED=%s' % opts.seed)
775 stdouts,rval = pic.runPic(opts.jar, cl)
776 myoutput = mystringsam.replace(".sam", "")
777 os.rename(mystringsam,myoutput)
778 elif pic.picname == 'SamFormatConverter':
779 cl.append('INPUT=%s' % opts.input)
780 cl.append('OUTPUT=%s' % tempout)
781 pic.runPic(opts.jar, cl)
782 haveTempout = True
783
784 else:
785 print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
786 sys.exit(1)
787 if haveTempout:
788 # Some Picard tools produced a potentially intermediate bam file.
789 # Either just move to final location or create sam
790 if os.path.exists(tempout):
791 shutil.move(tempout, os.path.abspath(opts.output))
792 if opts.htmlout <> None or doFix: # return a pretty html page
793 pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines)
794 if rval <> 0:
795 print >> sys.stderr, '## exit code=%d; stdout=%s' % (rval,stdouts)
796 # signal failure
797 if __name__=="__main__": __main__()
798