comparison picard_wrapper.py @ 42:c25cd3e674a5 draft

Uploaded
author devteam
date Thu, 13 Feb 2014 22:02:05 -0500
parents 1f4e2e13d9dc
children
comparison
equal deleted inserted replaced
41:468673afa348 42:c25cd3e674a5
13 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> 13 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
14 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 14 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
15 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> 15 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
16 <head> 16 <head>
17 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> 17 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
18 <meta name="generator" content="Bluefish 2.2.3" /> 18 <meta name="generator" content="Galaxy %s tool output - see http://getgalaxy.org/" />
19 <title></title> 19 <title></title>
20 <link rel="stylesheet" href="/static/style/base.css" type="text/css" /> 20 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
21 </head> 21 </head>
22 <body> 22 <body>
23 <div class="document"> 23 <div class="document">
31 sys.exit() 31 sys.exit()
32 32
33 33
34 def timenow(): 34 def timenow():
35 """return current time as a string 35 """return current time as a string
36 """ 36 """
37 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) 37 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
38 38
39 39
40 class PicardBase(): 40 class PicardBase():
41 """ 41 """
42 simple base class with some utilities for Picard 42 simple base class with some utilities for Picard
43 adapted and merged with Kelly Vincent's code april 2011 Ross 43 adapted and merged with Kelly Vincent's code april 2011 Ross
44 lots of changes... 44 lots of changes...
45 """ 45 """
46 46
47 def __init__(self, opts=None,arg0=None): 47 def __init__(self, opts=None,arg0=None):
48 """ common stuff needed at init for a picard tool 48 """ common stuff needed at init for a picard tool
49 """ 49 """
50 assert opts <> None, 'PicardBase needs opts at init' 50 assert opts <> None, 'PicardBase needs opts at init'
51 self.opts = opts 51 self.opts = opts
52 if self.opts.outdir == None: 52 if self.opts.outdir == None:
53 self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir 53 self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir
54 assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in' 54 assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in'
67 try: 67 try:
68 os.makedirs(opts.tmpdir) 68 os.makedirs(opts.tmpdir)
69 except: 69 except:
70 pass 70 pass
71 self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname) 71 self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
72 self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname) 72 self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
73 self.setLogging(logfname=self.log_filename) 73 self.setLogging(logfname=self.log_filename)
74 74
75 def baseName(self,name=None): 75 def baseName(self,name=None):
76 return os.path.splitext(os.path.basename(name))[0] 76 return os.path.splitext(os.path.basename(name))[0]
77 77
78 def setLogging(self,logfname="picard_wrapper.log"): 78 def setLogging(self,logfname="picard_wrapper.log"):
79 """setup a logger 79 """setup a logger
80 """ 80 """
81 logging.basicConfig(level=logging.INFO, 81 logging.basicConfig(level=logging.INFO,
82 filename=logfname, 82 filename=logfname,
83 filemode='a') 83 filemode='a')
84 84
85 85
86 def readLarge(self,fname=None): 86 def readLarge(self,fname=None):
87 """ read a potentially huge file. 87 """ read a potentially huge file.
88 """ 88 """
89 try: 89 try:
90 # get stderr, allowing for case where it's very large 90 # get stderr, allowing for case where it's very large
91 tmp = open( fname, 'rb' ) 91 tmp = open( fname, 'rb' )
92 s = '' 92 s = ''
93 buffsize = 1048576 93 buffsize = 1048576
100 break 100 break
101 except OverflowError: 101 except OverflowError:
102 pass 102 pass
103 tmp.close() 103 tmp.close()
104 except Exception, e: 104 except Exception, e:
105 stop_err( 'Read Large Exception : %s' % str( e ) ) 105 stop_err( 'Read Large Exception : %s' % str( e ) )
106 return s 106 return s
107 107
108 def runCL(self,cl=None,output_dir=None): 108 def runCL(self,cl=None,output_dir=None):
109 """ construct and run a command line 109 """ construct and run a command line
110 we have galaxy's temp path as opt.temp_dir so don't really need isolation 110 we have galaxy's temp path as opt.temp_dir so don't really need isolation
111 sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts 111 sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
112 """ 112 """
113 assert cl <> None, 'PicardBase runCL needs a command line as cl' 113 assert cl <> None, 'PicardBase runCL needs a command line as cl'
114 if output_dir == None: 114 if output_dir == None:
115 output_dir = self.opts.outdir 115 output_dir = self.opts.outdir
116 if type(cl) == type([]): 116 if type(cl) == type([]):
117 cl = ' '.join(cl) 117 cl = ' '.join(cl)
122 process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir) 122 process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir)
123 rval = process.wait() 123 rval = process.wait()
124 tlf.close() 124 tlf.close()
125 tef.close() 125 tef.close()
126 stderrs = self.readLarge(temperr) 126 stderrs = self.readLarge(temperr)
127 stdouts = self.readLarge(templog) 127 stdouts = self.readLarge(templog)
128 if rval > 0: 128 if rval > 0:
129 s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs) 129 s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs)
130 stdouts = '%s\n%s' % (stdouts,stderrs) 130 stdouts = '%s\n%s' % (stdouts,stderrs)
131 else: 131 else:
132 s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval) 132 s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval)
133 logging.info(s) 133 logging.info(s)
134 os.unlink(templog) # always 134 os.unlink(templog) # always
135 os.unlink(temperr) # always 135 os.unlink(temperr) # always
136 return s, stdouts, rval # sometimes s is an output 136 return s, stdouts, rval # sometimes s is an output
137 137
138 def runPic(self, jar, cl): 138 def runPic(self, jar, cl):
139 """ 139 """
140 cl should be everything after the jar file name in the command 140 cl should be everything after the jar file name in the command
141 """ 141 """
142 runme = ['java -Xmx%s' % self.opts.maxjheap] 142 runme = ['java -Xmx%s' % self.opts.maxjheap]
143 runme.append(" -Djava.io.tmpdir='%s' " % self.opts.tmpdir) 143 runme.append(" -Djava.io.tmpdir='%s' " % self.opts.tmpdir)
144 runme.append('-jar %s' % jar) 144 runme.append('-jar %s' % jar)
145 runme += cl 145 runme += cl
146
147 print runme,self.opts.outdir
148
146 s,stdouts,rval = self.runCL(cl=runme, output_dir=self.opts.outdir) 149 s,stdouts,rval = self.runCL(cl=runme, output_dir=self.opts.outdir)
147 return stdouts,rval 150 return stdouts,rval
148 151
149 def samToBam(self,infile=None,outdir=None): 152 def samToBam(self,infile=None,outdir=None):
150 """ 153 """
151 use samtools view to convert sam to bam 154 use samtools view to convert sam to bam
152 """ 155 """
153 fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam') 156 fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam')
154 cl = ['samtools view -h -b -S -o ',tempbam,infile] 157 cl = ['samtools view -h -b -S -o ',tempbam,infile]
155 tlog,stdouts,rval = self.runCL(cl,outdir) 158 tlog,stdouts,rval = self.runCL(cl,outdir)
156 return tlog,tempbam,rval 159 return tlog,tempbam,rval
157 160
158 def sortSam(self, infile=None,outfile=None,outdir=None): 161 def sortSam(self, infile=None,outfile=None,outdir=None):
159 """ 162 """
160 """ 163 """
161 print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir) 164 print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir)
162 cl = ['samtools sort',infile,outfile] 165 cl = ['samtools sort',infile,outfile]
163 tlog,stdouts,rval = self.runCL(cl,outdir) 166 tlog,stdouts,rval = self.runCL(cl,outdir)
164 return tlog 167 return tlog
165 168
170 except: 173 except:
171 pass 174 pass
172 175
173 def prettyPicout(self,transpose,maxrows): 176 def prettyPicout(self,transpose,maxrows):
174 """organize picard outpouts into a report html page 177 """organize picard outpouts into a report html page
175 """ 178 """
176 res = [] 179 res = []
177 try: 180 try:
178 r = open(self.metricsOut,'r').readlines() 181 r = open(self.metricsOut,'r').readlines()
179 except: 182 except:
180 r = [] 183 r = []
181 if len(r) > 0: 184 if len(r) > 0:
182 res.append('<b>Picard on line resources</b><ul>\n') 185 res.append('<b>Picard on line resources</b><ul>\n')
183 res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n') 186 res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
184 res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n') 187 res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
185 if transpose: 188 if transpose:
186 res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n') 189 res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n')
187 else: 190 else:
188 res.append('<b>Picard output</b><hr/>\n') 191 res.append('<b>Picard output</b><hr/>\n')
189 res.append('<table cellpadding="3" >\n') 192 res.append('<table cellpadding="3" >\n')
190 dat = [] 193 dat = []
191 heads = [] 194 heads = []
192 lastr = len(r) - 1 195 lastr = len(r) - 1
193 # special case for estimate library complexity hist 196 # special case for estimate library complexity hist
206 res += hres 209 res += hres
207 heads = [] 210 heads = []
208 if len(dat) > 0: 211 if len(dat) > 0:
209 if transpose and not thist: 212 if transpose and not thist:
210 tdat = map(None,*dat) # transpose an arbitrary list of lists 213 tdat = map(None,*dat) # transpose an arbitrary list of lists
211 tdat = ['<tr class="d%d"><td>%s</td><td>%s&nbsp;</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)] 214 tdat = ['<tr class="d%d"><td>%s</td><td>%s&nbsp;</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
212 else: 215 else:
213 tdat = ['\t'.join(x).strip() for x in dat] # back to strings :( 216 tdat = ['\t'.join(x).strip() for x in dat] # back to strings :(
214 tdat = ['<tr class="d%d"><td colspan="2">%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)] 217 tdat = ['<tr class="d%d"><td colspan="2">%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)]
215 res += tdat 218 res += tdat
216 dat = [] 219 dat = []
217 res.append('</table>\n') 220 res.append('</table>\n')
218 return res 221 return res
219 222
220 def fixPicardOutputs(self,transpose,maxloglines): 223 def fixPicardOutputs(self,transpose,maxloglines):
221 """ 224 """
222 picard produces long hard to read tab header files 225 picard produces long hard to read tab header files
223 make them available but present them transposed for readability 226 make them available but present them transposed for readability
224 """ 227 """
225 logging.shutdown() 228 logging.shutdown()
226 self.cleanup() # remove temp files stored in delme 229 self.cleanup() # remove temp files stored in delme
227 rstyle="""<style type="text/css"> 230 rstyle="""<style type="text/css">
228 tr.d0 td {background-color: oldlace; color: black;} 231 tr.d0 td {background-color: oldlace; color: black;}
229 tr.d1 td {background-color: aliceblue; color: black;} 232 tr.d1 td {background-color: aliceblue; color: black;}
230 </style>""" 233 </style>"""
231 res = [rstyle,] 234 res = [rstyle,]
232 res.append(galhtmlprefix % self.progname) 235 res.append(galhtmlprefix % self.progname)
233 res.append(galhtmlattr % (self.picname,timenow())) 236 res.append(galhtmlattr % (self.picname,timenow()))
234 flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')] 237 flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
235 pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf'] 238 pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
236 if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs 239 if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
237 for p in pdflist: 240 for p in pdflist:
238 pbase = os.path.splitext(p)[0] # removes .pdf 241 pbase = os.path.splitext(p)[0] # removes .pdf
239 imghref = '%s.jpg' % pbase 242 imghref = '%s.jpg' % pbase
240 mimghref = '%s-0.jpg' % pbase # multiple pages pdf -> multiple thumbnails without asking! 243 mimghref = '%s-0.jpg' % pbase # multiple pages pdf -> multiple thumbnails without asking!
241 if mimghref in flist: 244 if mimghref in flist:
242 imghref=mimghref # only one for thumbnail...it's a multi page pdf 245 imghref=mimghref # only one for thumbnail...it's a multi page pdf
243 res.append('<table cellpadding="10"><tr><td>\n') 246 res.append('<table cellpadding="10"><tr><td>\n')
244 res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref)) 247 res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref))
245 res.append('</tr></td></table>\n') 248 res.append('</tr></td></table>\n')
246 if len(flist) > 0: 249 if len(flist) > 0:
247 res.append('<b>The following output files were created (click the filename to view/download a copy):</b><hr/>') 250 res.append('<b>The following output files were created (click the filename to view/download a copy):</b><hr/>')
248 res.append('<table>\n') 251 res.append('<table>\n')
249 for i,f in enumerate(flist): 252 for i,f in enumerate(flist):
250 fn = os.path.split(f)[-1] 253 fn = os.path.split(f)[-1]
251 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn)) 254 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
252 res.append('</table><p/>\n') 255 res.append('</table><p/>\n')
253 pres = self.prettyPicout(transpose,maxloglines) 256 pres = self.prettyPicout(transpose,maxloglines)
254 if len(pres) > 0: 257 if len(pres) > 0:
255 res += pres 258 res += pres
256 l = open(self.log_filename,'r').readlines() 259 l = open(self.log_filename,'r').readlines()
257 llen = len(l) 260 llen = len(l)
258 if llen > 0: 261 if llen > 0:
259 res.append('<b>Picard Tool Run Log</b><hr/>\n') 262 res.append('<b>Picard Tool Run Log</b><hr/>\n')
260 rlog = ['<pre>',] 263 rlog = ['<pre>',]
261 if llen > maxloglines: 264 if llen > maxloglines:
262 n = min(50,int(maxloglines/2)) 265 n = min(50,int(maxloglines/2))
263 rlog += l[:n] 266 rlog += l[:n]
264 rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines)) 267 rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines))
269 if llen > maxloglines: 272 if llen > maxloglines:
270 rlog.append('\n<b>## WARNING - %d log lines truncated - <a href="%s">%s</a> contains entire output</b>' % (llen - maxloglines,self.log_filename,self.log_filename)) 273 rlog.append('\n<b>## WARNING - %d log lines truncated - <a href="%s">%s</a> contains entire output</b>' % (llen - maxloglines,self.log_filename,self.log_filename))
271 res += rlog 274 res += rlog
272 else: 275 else:
273 res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename) 276 res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
274 res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n') 277 res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
275 res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool') 278 res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool')
276 res.append(galhtmlpostfix) 279 res.append(galhtmlpostfix)
277 outf = open(self.opts.htmlout,'w') 280 outf = open(self.opts.htmlout,'w')
278 outf.write(''.join(res)) 281 outf.write(''.join(res))
279 outf.write('\n') 282 outf.write('\n')
280 outf.close() 283 outf.close()
281 284
282 def makePicInterval(self,inbed=None,outf=None): 285 def makePicInterval(self,inbed=None,outf=None):
283 """ 286 """
284 picard wants bait and target files to have the same header length as the incoming bam/sam 287 picard wants bait and target files to have the same header length as the incoming bam/sam
285 a meaningful (ie accurate) representation will fail because of this - so this hack 288 a meaningful (ie accurate) representation will fail because of this - so this hack
286 it would be far better to be able to supply the original bed untouched 289 it would be far better to be able to supply the original bed untouched
287 Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list 290 Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list
288 """ 291 """
289 assert inbed <> None 292 assert inbed <> None
290 bed = open(inbed,'r').readlines() 293 bed = open(inbed,'r').readlines()
291 sbed = [x.split('\t') for x in bed] # lengths MUST be 5 294 sbed = [x.split('\t') for x in bed] # lengths MUST be 5
292 lens = [len(x) for x in sbed] 295 lens = [len(x) for x in sbed]
293 strands = [x[3] for x in sbed if not x[3] in ['+','-']] 296 strands = [x[3] for x in sbed if not x[3] in ['+','-']]
318 o.close() 321 o.close()
319 return outf 322 return outf
320 323
321 def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None): 324 def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None):
322 """ 325 """
323 interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs! 326 interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
324 Do the work of removing all the error sequences 327 Do the work of removing all the error sequences
325 pysam is cool 328 pysam is cool
326 infile = pysam.Samfile( "-", "r" ) 329 infile = pysam.Samfile( "-", "r" )
327 outfile = pysam.Samfile( "-", "w", template = infile ) 330 outfile = pysam.Samfile( "-", "w", template = infile )
328 for s in infile: outfile.write(s) 331 for s in infile: outfile.write(s)
329 332
330 errors from ValidateSameFile.jar look like 333 errors from ValidateSameFile.jar look like
331 WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing 334 WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
332 ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary. 335 ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
333 ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041 336 ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041
334 337
335 """ 338 """
336 assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam 339 assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam
337 assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path' 340 assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path'
338 removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2] 341 removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2]
339 remDict = dict(zip(removeNames,range(len(removeNames)))) 342 remDict = dict(zip(removeNames,range(len(removeNames))))
340 infile = pysam.Samfile(insam,'rb') 343 infile = pysam.Samfile(insam,'rb')
362 365
363 366
364 def __main__(): 367 def __main__():
365 doFix = False # tools returning htmlfile don't need this 368 doFix = False # tools returning htmlfile don't need this
366 doTranspose = True # default 369 doTranspose = True # default
367 maxloglines = 100 # default 370 maxloglines = 100 # default
368 #Parse Command Line 371 #Parse Command Line
369 op = optparse.OptionParser() 372 op = optparse.OptionParser()
370 # All tools 373 # All tools
371 op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' ) 374 op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' )
372 op.add_option('-e', '--inputext', default=None) 375 op.add_option('-e', '--inputext', default=None)
373 op.add_option('-o', '--output', default=None) 376 op.add_option('-o', '--output', default=None)
374 op.add_option('-n', '--title', default="Pick a Picard Tool") 377 op.add_option('-n', '--title', default="Pick a Picard Tool")
375 op.add_option('-t', '--htmlout', default=None) 378 op.add_option('-t', '--htmlout', default=None)
376 op.add_option('-d', '--outdir', default=None) 379 op.add_option('-d', '--outdir', default=None)
377 op.add_option('-x', '--maxjheap', default='4g') 380 op.add_option('-x', '--maxjheap', default='3000m')
378 op.add_option('-b', '--bisulphite', default='false') 381 op.add_option('-b', '--bisulphite', default='false')
379 op.add_option('-s', '--sortorder', default='query') 382 op.add_option('-s', '--sortorder', default='query')
380 op.add_option('','--tmpdir', default='/tmp') 383 op.add_option('','--tmpdir', default='/tmp')
381 op.add_option('-j','--jar',default='') 384 op.add_option('-j','--jar',default='')
382 op.add_option('','--picard-cmd',default=None) 385 op.add_option('','--picard-cmd',default=None)
383 # Many tools 386 # Many tools
384 op.add_option( '', '--output-format', dest='output_format', help='Output format' ) 387 op.add_option( '', '--output-format', dest='output_format', help='Output format' )
385 op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' ) 388 op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' )
386 op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None ) 389 op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None )
387 # CreateSequenceDictionary 390 # CreateSequenceDictionary
402 op.add_option('', '--maxinsert', default="20") 405 op.add_option('', '--maxinsert', default="20")
403 op.add_option('', '--adaptors', default='') 406 op.add_option('', '--adaptors', default='')
404 # FixMateInformation and validate 407 # FixMateInformation and validate
405 # CollectGcBiasMetrics 408 # CollectGcBiasMetrics
406 op.add_option('', '--windowsize', default='100') 409 op.add_option('', '--windowsize', default='100')
407 op.add_option('', '--mingenomefrac', default='0.00001') 410 op.add_option('', '--mingenomefrac', default='0.00001')
408 # AddOrReplaceReadGroups 411 # AddOrReplaceReadGroups
409 op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' ) 412 op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
410 op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' ) 413 op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
411 op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' ) 414 op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' )
412 op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. run barcode) ' ) 415 op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. run barcode) ' )
418 op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' ) 421 op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' )
419 op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' ) 422 op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' )
420 # ReplaceSamHeader 423 # ReplaceSamHeader
421 op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' ) 424 op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' )
422 425
423 op.add_option('','--assumesorted', default='true') 426 op.add_option('','--assumesorted', default='true')
424 op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*") 427 op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*")
425 #estimatelibrarycomplexity 428 #estimatelibrarycomplexity
426 op.add_option('','--minid', default="5") 429 op.add_option('','--minid', default="5")
427 op.add_option('','--maxdiff', default="0.03") 430 op.add_option('','--maxdiff', default="0.03")
428 op.add_option('','--minmeanq', default="20") 431 op.add_option('','--minmeanq', default="20")
437 op.add_option('','--samout', default=None) 440 op.add_option('','--samout', default=None)
438 #downsample 441 #downsample
439 op.add_option('','--probability', default="1") 442 op.add_option('','--probability', default="1")
440 op.add_option('','--seed', default="1") 443 op.add_option('','--seed', default="1")
441 444
442 ##
443 opts, args = op.parse_args() 445 opts, args = op.parse_args()
444 opts.sortme = opts.assumesorted == 'false' 446 opts.sortme = opts.assumesorted == 'false'
445 assert opts.input <> None 447 assert opts.input <> None
446 # need to add 448 # need to add
447 # instance that does all the work 449 # instance that does all the work
540 if float(opts.optdupdist) > 0: 542 if float(opts.optdupdist) > 0:
541 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist) 543 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
542 stdouts,rval = pic.runPic(opts.jar, cl) 544 stdouts,rval = pic.runPic(opts.jar, cl)
543 545
544 elif pic.picname == 'CollectAlignmentSummaryMetrics': 546 elif pic.picname == 'CollectAlignmentSummaryMetrics':
545 # Why do we do this fakefasta thing? 547 # Why do we do this fakefasta thing?
546 # Because we need NO fai to be available or picard barfs unless it matches the input data. 548 # Because we need NO fai to be available or picard barfs unless it matches the input data.
547 # why? Dunno Seems to work without complaining if the .bai file is AWOL.... 549 # why? Dunno Seems to work without complaining if the .bai file is AWOL....
548 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name)) 550 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
549 try: 551 try:
550 os.symlink(ref_file_name,fakefasta) 552 os.symlink(ref_file_name,fakefasta)
562 cl.append('OUTPUT=%s' % pic.metricsOut) 564 cl.append('OUTPUT=%s' % pic.metricsOut)
563 cl.append('R=%s' % fakefasta) 565 cl.append('R=%s' % fakefasta)
564 cl.append('TMP_DIR=%s' % opts.tmpdir) 566 cl.append('TMP_DIR=%s' % opts.tmpdir)
565 if not opts.assumesorted.lower() == 'true': # we need to sort input 567 if not opts.assumesorted.lower() == 'true': # we need to sort input
566 sortedfile = '%s.sorted' % os.path.basename(opts.input) 568 sortedfile = '%s.sorted' % os.path.basename(opts.input)
567 if opts.datatype == 'sam': # need to work with a bam 569 if opts.datatype == 'sam': # need to work with a bam
568 tlog,tempbam,trval = pic.samToBam(opts.input,opts.outdir) 570 tlog,tempbam,trval = pic.samToBam(opts.input,opts.outdir)
569 pic.delme.append(tempbam) 571 pic.delme.append(tempbam)
570 try: 572 try:
571 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir) 573 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
572 except: 574 except:
577 except : # bug - [bam_sort_core] not being ignored - TODO fixme 579 except : # bug - [bam_sort_core] not being ignored - TODO fixme
578 print '## exception %s on sorting bam file %s' % (sys.exc_info()[0],opts.input) 580 print '## exception %s on sorting bam file %s' % (sys.exc_info()[0],opts.input)
579 cl.append('INPUT=%s.bam' % os.path.abspath(os.path.join(opts.outdir,sortedfile))) 581 cl.append('INPUT=%s.bam' % os.path.abspath(os.path.join(opts.outdir,sortedfile)))
580 pic.delme.append(os.path.join(opts.outdir,sortedfile)) 582 pic.delme.append(os.path.join(opts.outdir,sortedfile))
581 else: 583 else:
582 cl.append('INPUT=%s' % os.path.abspath(opts.input)) 584 cl.append('INPUT=%s' % os.path.abspath(opts.input))
583 stdouts,rval = pic.runPic(opts.jar, cl) 585 stdouts,rval = pic.runPic(opts.jar, cl)
584 586
585 587
586 elif pic.picname == 'CollectGcBiasMetrics': 588 elif pic.picname == 'CollectGcBiasMetrics':
587 assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name 589 assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name
588 # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data. 590 # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
589 # why? Dunno 591 # why? Dunno
590 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name)) 592 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
591 try: 593 try:
592 os.symlink(ref_file_name,fakefasta) 594 os.symlink(ref_file_name,fakefasta)
593 except: 595 except:
594 s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy' 596 s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy'
619 lf.write('\n') 621 lf.write('\n')
620 lf.close() 622 lf.close()
621 623
622 elif pic.picname == 'CollectInsertSizeMetrics': 624 elif pic.picname == 'CollectInsertSizeMetrics':
623 """ <command interpreter="python"> 625 """ <command interpreter="python">
624 picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations" 626 picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
625 --histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel" 627 --histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel"
626 -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file" 628 -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
627 </command> 629 </command>
628 """ 630 """
629 isPDF = 'InsertSizeHist.pdf' 631 isPDF = 'InsertSizeHist.pdf'
630 pdfpath = os.path.join(opts.outdir,isPDF) 632 pdfpath = os.path.join(opts.outdir,isPDF)
631 histpdf = 'InsertSizeHist.pdf' 633 histpdf = 'InsertSizeHist.pdf'
632 cl.append('I=%s' % opts.input) 634 cl.append('I=%s' % opts.input)
633 cl.append('O=%s' % pic.metricsOut) 635 cl.append('O=%s' % pic.metricsOut)
634 cl.append('HISTOGRAM_FILE=%s' % histpdf) 636 cl.append('HISTOGRAM_FILE=%s' % histpdf)
635 #if opts.taillimit <> '0': # this was deprecated although still mentioned in the docs at 1.56 637 #if opts.taillimit <> '0': # this was deprecated although still mentioned in the docs at 1.56
636 # cl.append('TAIL_LIMIT=%s' % opts.taillimit) 638 # cl.append('TAIL_LIMIT=%s' % opts.taillimit)
637 if opts.histwidth <> '0': 639 if opts.histwidth <> '0':
638 cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth) 640 cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth)
639 if float( opts.minpct) > 0.0: 641 if float( opts.minpct) > 0.0:
640 cl.append('MINIMUM_PCT=%s' % opts.minpct) 642 cl.append('MINIMUM_PCT=%s' % opts.minpct)
641 if float(opts.deviations) > 0.0: 643 if float(opts.deviations) > 0.0:
642 cl.append('DEVIATIONS=%s' % opts.deviations) 644 cl.append('DEVIATIONS=%s' % opts.deviations)
643 if opts.malevel: 645 if opts.malevel:
644 malists = opts.malevel.split(',') 646 malists = opts.malevel.split(',')
645 malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists] 647 malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists]
646 cl += malist 648 cl += malist
647 stdouts,rval = pic.runPic(opts.jar, cl) 649 stdouts,rval = pic.runPic(opts.jar, cl)
648 if os.path.exists(pdfpath): # automake thumbnail - will be added to html 650 if os.path.exists(pdfpath): # automake thumbnail - will be added to html
649 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath] 651 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
650 pic.runCL(cl=cl2,output_dir=opts.outdir) 652 pic.runCL(cl=cl2,output_dir=opts.outdir)
651 else: 653 else:
652 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath 654 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
653 s += 'This <b>always happens if single ended data was provided</b> to this tool,\n' 655 s += 'This <b>always happens if single ended data was provided</b> to this tool,\n'
661 # assume sorted even if header says otherwise 663 # assume sorted even if header says otherwise
662 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted)) 664 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
663 # input 665 # input
664 cl.append('INPUT=%s' % opts.input) 666 cl.append('INPUT=%s' % opts.input)
665 # outputs 667 # outputs
666 cl.append('OUTPUT=%s' % opts.output) 668 cl.append('OUTPUT=%s' % opts.output)
667 cl.append('METRICS_FILE=%s' % pic.metricsOut ) 669 cl.append('METRICS_FILE=%s' % pic.metricsOut )
668 # remove or mark duplicates 670 # remove or mark duplicates
669 cl.append('REMOVE_DUPLICATES=%s' % opts.remdups) 671 cl.append('REMOVE_DUPLICATES=%s' % opts.remdups)
670 # the regular expression to be used to parse reads in incoming SAM file 672 # the regular expression to be used to parse reads in incoming SAM file
671 cl.append('READ_NAME_REGEX="%s"' % opts.readregex) 673 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
709 targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target') 711 targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target')
710 baitf = pic.makePicInterval(opts.baitbed,baitfname) 712 baitf = pic.makePicInterval(opts.baitbed,baitfname)
711 if opts.targetbed == opts.baitbed: # same file sometimes 713 if opts.targetbed == opts.baitbed: # same file sometimes
712 targetf = baitf 714 targetf = baitf
713 else: 715 else:
714 targetf = pic.makePicInterval(opts.targetbed,targetfname) 716 targetf = pic.makePicInterval(opts.targetbed,targetfname)
715 cl.append('BAIT_INTERVALS=%s' % baitf) 717 cl.append('BAIT_INTERVALS=%s' % baitf)
716 cl.append('TARGET_INTERVALS=%s' % targetf) 718 cl.append('TARGET_INTERVALS=%s' % targetf)
717 cl.append('INPUT=%s' % os.path.abspath(opts.input)) 719 cl.append('INPUT=%s' % os.path.abspath(opts.input))
718 cl.append('OUTPUT=%s' % pic.metricsOut) 720 cl.append('OUTPUT=%s' % pic.metricsOut)
719 cl.append('TMP_DIR=%s' % opts.tmpdir) 721 cl.append('TMP_DIR=%s' % opts.tmpdir)
723 import pysam 725 import pysam
724 doTranspose = False 726 doTranspose = False
725 sortedfile = os.path.join(opts.outdir,'rgValidate.sorted') 727 sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
726 stf = open(pic.log_filename,'w') 728 stf = open(pic.log_filename,'w')
727 tlog = None 729 tlog = None
728 if opts.datatype == 'sam': # need to work with a bam 730 if opts.datatype == 'sam': # need to work with a bam
729 tlog,tempbam,rval = pic.samToBam(opts.input,opts.outdir) 731 tlog,tempbam,rval = pic.samToBam(opts.input,opts.outdir)
730 try: 732 try:
731 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir) 733 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
732 except: 734 except:
733 print '## exception on sorting sam file %s' % opts.input 735 print '## exception on sorting sam file %s' % opts.input
738 print '## exception on sorting bam file %s' % opts.input 740 print '## exception on sorting bam file %s' % opts.input
739 if tlog: 741 if tlog:
740 print '##tlog=',tlog 742 print '##tlog=',tlog
741 stf.write(tlog) 743 stf.write(tlog)
742 stf.write('\n') 744 stf.write('\n')
743 sortedfile = '%s.bam' % sortedfile # samtools does that 745 sortedfile = '%s.bam' % sortedfile # samtools does that
744 cl.append('O=%s' % pic.metricsOut) 746 cl.append('O=%s' % pic.metricsOut)
745 cl.append('TMP_DIR=%s' % opts.tmpdir) 747 cl.append('TMP_DIR=%s' % opts.tmpdir)
746 cl.append('I=%s' % sortedfile) 748 cl.append('I=%s' % sortedfile)
747 opts.maxerrors = '99999999' 749 opts.maxerrors = '99999999'
748 cl.append('MAX_OUTPUT=%s' % opts.maxerrors) 750 cl.append('MAX_OUTPUT=%s' % opts.maxerrors)
750 igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None'] 752 igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None']
751 cl.append(' '.join(igs)) 753 cl.append(' '.join(igs))
752 if opts.bisulphite.lower() <> 'false': 754 if opts.bisulphite.lower() <> 'false':
753 cl.append('IS_BISULFITE_SEQUENCED=true') 755 cl.append('IS_BISULFITE_SEQUENCED=true')
754 if opts.ref <> None or opts.ref_file <> None: 756 if opts.ref <> None or opts.ref_file <> None:
755 cl.append('R=%s' % ref_file_name) 757 cl.append('R=%s' % ref_file_name)
756 stdouts,rval = pic.runPic(opts.jar,cl) 758 stdouts,rval = pic.runPic(opts.jar,cl)
757 if opts.datatype == 'sam': 759 if opts.datatype == 'sam':
758 pic.delme.append(tempbam) 760 pic.delme.append(tempbam)
759 newsam = opts.output 761 newsam = opts.output
760 outformat = 'bam' 762 outformat = 'bam'
761 pe = open(pic.metricsOut,'r').readlines() 763 pe = open(pic.metricsOut,'r').readlines()
762 pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat) 764 pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat)
763 pic.delme.append(sortedfile) # not wanted 765 pic.delme.append(sortedfile) # not wanted
764 stf.close() 766 stf.close()
765 pic.cleanup() 767 pic.cleanup()
768
769 ####liubo added CleanSam tool####
770 elif pic.picname == 'CleanSam':
771 # input
772 cl.append('INPUT=%s' % opts.input)
773 # output
774 cl.append('OUTPUT=%s' % tempout)
775 stdouts,rval = pic.runPic(opts.jar, cl)
776 haveTempout = True
777
778 elif pic.picname == 'SortSam':
779 cl.append('I=%s' % opts.input)
780 cl.append('O=%s' % tempout)
781 cl.append('SORT_ORDER=%s' % opts.sortorder)
782 stdouts,rval = pic.runPic(opts.jar,cl)
783 haveTempout = True
784
785 elif pic.picname == 'BuildBamIndex':
786 cl.append('I=%s' % opts.input)
787 cl.append('O=%s' % tempout)
788 stdouts,rval = pic.runPic(opts.jar,cl)
789 haveTempout = True
790
791 elif pic.picname == 'SamFormatConverter':
792 cl.append('INPUT=%s' % opts.input)
793 cl.append('OUTPUT=%s' % tempout)
794 pic.runPic(opts.jar, cl)
795 haveTempout = True
766 elif pic.picname == "DownsampleSam": 796 elif pic.picname == "DownsampleSam":
767 cl.append('I=%s' % opts.input) 797 cl.append('I=%s' % opts.input)
768 mystring = opts.output 798 mystring = opts.output
769 mystringsam = mystring + ".sam" 799 mystringsam = mystring + ".sam"
770 cl.append('O=%s' % mystringsam) 800 cl.append('O=%s' % mystringsam)
773 if float(opts.seed) > 0: 803 if float(opts.seed) > 0:
774 cl.append('RANDOM_SEED=%s' % opts.seed) 804 cl.append('RANDOM_SEED=%s' % opts.seed)
775 stdouts,rval = pic.runPic(opts.jar, cl) 805 stdouts,rval = pic.runPic(opts.jar, cl)
776 myoutput = mystringsam.replace(".sam", "") 806 myoutput = mystringsam.replace(".sam", "")
777 os.rename(mystringsam,myoutput) 807 os.rename(mystringsam,myoutput)
778 elif pic.picname == 'SamFormatConverter': 808
779 cl.append('INPUT=%s' % opts.input) 809
780 cl.append('OUTPUT=%s' % tempout)
781 pic.runPic(opts.jar, cl)
782 haveTempout = True
783
784 else: 810 else:
785 print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname 811 print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
786 sys.exit(1) 812 sys.exit(1)
787 if haveTempout: 813 if haveTempout:
788 # Some Picard tools produced a potentially intermediate bam file. 814 # Some Picard tools produced a potentially intermediate bam file.
789 # Either just move to final location or create sam 815 # Either just move to final location or create sam
790 if os.path.exists(tempout): 816 if os.path.exists(tempout):
791 shutil.move(tempout, os.path.abspath(opts.output)) 817 shutil.move(tempout, os.path.abspath(opts.output))
792 if opts.htmlout <> None or doFix: # return a pretty html page 818 if opts.htmlout <> None or doFix: # return a pretty html page
793 pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines) 819 pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines)
794 if rval <> 0: 820 if rval <> 0:
795 print >> sys.stderr, '## exit code=%d; stdout=%s' % (rval,stdouts) 821 print >> sys.stderr, '## exit code=%d; stdout=%s' % (rval,stdouts)
796 # signal failure 822 # signal failure
797 if __name__=="__main__": __main__() 823 if __name__=="__main__": __main__()
798