Mercurial > repos > devteam > picard1106

--- a/picard_wrapper.py	Thu Feb 13 21:39:50 2014 -0500
+++ b/picard_wrapper.py	Thu Feb 13 22:02:05 2014 -0500
@@ -15,7 +15,7 @@
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-<meta name="generator" content="Bluefish 2.2.3" />
+<meta name="generator" content="Galaxy %s tool output - see http://getgalaxy.org/" />
 <title></title>
 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
 </head>
@@ -33,20 +33,20 @@

 def timenow():
     """return current time as a string
-    """
+"""
     return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))


 class PicardBase():
     """
-    simple base class with some utilities for Picard
-    adapted and merged with Kelly Vincent's code april 2011 Ross
-    lots of changes...
-    """
+simple base class with some utilities for Picard
+adapted and merged with Kelly Vincent's code april 2011 Ross
+lots of changes...
+"""

     def __init__(self, opts=None,arg0=None):
         """ common stuff needed at init for a picard tool
-        """
+"""
         assert opts <> None, 'PicardBase needs opts at init'
         self.opts = opts
         if self.opts.outdir == None:
@@ -69,7 +69,7 @@
         except:
             pass
         self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
-        self.metricsOut =  os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
+        self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
         self.setLogging(logfname=self.log_filename)

     def baseName(self,name=None):
@@ -77,7 +77,7 @@

     def setLogging(self,logfname="picard_wrapper.log"):
         """setup a logger
-        """
+"""
         logging.basicConfig(level=logging.INFO,
                     filename=logfname,
                     filemode='a')
@@ -85,7 +85,7 @@

     def readLarge(self,fname=None):
         """ read a potentially huge file.
-        """
+"""
         try:
             # get stderr, allowing for case where it's very large
             tmp = open( fname, 'rb' )
@@ -102,14 +102,14 @@
                 pass
             tmp.close()
         except Exception, e:
-            stop_err( 'Read Large Exception : %s' % str( e ) )
+            stop_err( 'Read Large Exception : %s' % str( e ) )
         return s

     def runCL(self,cl=None,output_dir=None):
         """ construct and run a command line
-        we have galaxy's temp path as opt.temp_dir so don't really need isolation
-        sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
-        """
+we have galaxy's temp path as opt.temp_dir so don't really need isolation
+sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
+"""
         assert cl <> None, 'PicardBase runCL needs a command line as cl'
         if output_dir == None:
             output_dir = self.opts.outdir
@@ -124,7 +124,7 @@
         tlf.close()
         tef.close()
         stderrs = self.readLarge(temperr)
-        stdouts = self.readLarge(templog)
+        stdouts = self.readLarge(templog)
         if rval > 0:
             s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs)
             stdouts = '%s\n%s' % (stdouts,stderrs)
@@ -133,23 +133,26 @@
         logging.info(s)
         os.unlink(templog) # always
         os.unlink(temperr) # always
-        return s, stdouts, rval  # sometimes s is an output
+        return s, stdouts, rval # sometimes s is an output

     def runPic(self, jar, cl):
         """
-        cl should be everything after the jar file name in the command
-        """
+cl should be everything after the jar file name in the command
+"""
         runme = ['java -Xmx%s' % self.opts.maxjheap]
         runme.append(" -Djava.io.tmpdir='%s' " % self.opts.tmpdir)
         runme.append('-jar %s' % jar)
         runme += cl
+
+        print runme,self.opts.outdir
+
         s,stdouts,rval = self.runCL(cl=runme, output_dir=self.opts.outdir)
         return stdouts,rval

     def samToBam(self,infile=None,outdir=None):
         """
-        use samtools view to convert sam to bam
-        """
+use samtools view to convert sam to bam
+"""
         fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam')
         cl = ['samtools view -h -b -S -o ',tempbam,infile]
         tlog,stdouts,rval = self.runCL(cl,outdir)
@@ -157,7 +160,7 @@

     def sortSam(self, infile=None,outfile=None,outdir=None):
         """
-        """
+"""
         print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir)
         cl = ['samtools sort',infile,outfile]
         tlog,stdouts,rval = self.runCL(cl,outdir)
@@ -172,20 +175,20 @@

     def prettyPicout(self,transpose,maxrows):
         """organize picard outpouts into a report html page
-        """
+"""
         res = []
         try:
             r = open(self.metricsOut,'r').readlines()
         except:
-            r = []
+            r = []
         if len(r) > 0:
             res.append('<b>Picard on line resources</b><ul>\n')
             res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
             res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
             if transpose:
-                res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n')
+                res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n')
             else:
-                res.append('<b>Picard output</b><hr/>\n')
+                res.append('<b>Picard output</b><hr/>\n')
             res.append('<table cellpadding="3" >\n')
             dat = []
             heads = []
@@ -208,30 +211,30 @@
             if len(dat) > 0:
                 if transpose and not thist:
                     tdat = map(None,*dat) # transpose an arbitrary list of lists
-                    tdat = ['<tr class="d%d"><td>%s</td><td>%s&nbsp;</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
+                    tdat = ['<tr class="d%d"><td>%s</td><td>%s&nbsp;</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
                 else:
                     tdat = ['\t'.join(x).strip() for x in dat] # back to strings :(
                     tdat = ['<tr class="d%d"><td colspan="2">%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)]
                 res += tdat
                 dat = []
-            res.append('</table>\n')
+            res.append('</table>\n')
         return res

     def fixPicardOutputs(self,transpose,maxloglines):
         """
-        picard produces long hard to read tab header files
-        make them available but present them transposed for readability
-        """
+picard produces long hard to read tab header files
+make them available but present them transposed for readability
+"""
         logging.shutdown()
         self.cleanup() # remove temp files stored in delme
         rstyle="""<style type="text/css">
-        tr.d0 td {background-color: oldlace; color: black;}
-        tr.d1 td {background-color: aliceblue; color: black;}
-        </style>"""
+tr.d0 td {background-color: oldlace; color: black;}
+tr.d1 td {background-color: aliceblue; color: black;}
+</style>"""
         res = [rstyle,]
-        res.append(galhtmlprefix % self.progname)
+        res.append(galhtmlprefix % self.progname)
         res.append(galhtmlattr % (self.picname,timenow()))
-        flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
+        flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
         pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
         if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
             for p in pdflist:
@@ -241,22 +244,22 @@
                 if mimghref in flist:
                     imghref=mimghref # only one for thumbnail...it's a multi page pdf
                 res.append('<table cellpadding="10"><tr><td>\n')
-                res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref))
-                res.append('</tr></td></table>\n')
+                res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref))
+                res.append('</tr></td></table>\n')
         if len(flist) > 0:
             res.append('<b>The following output files were created (click the filename to view/download a copy):</b><hr/>')
             res.append('<table>\n')
             for i,f in enumerate(flist):
                 fn = os.path.split(f)[-1]
                 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
-            res.append('</table><p/>\n')
+            res.append('</table><p/>\n')
         pres = self.prettyPicout(transpose,maxloglines)
         if len(pres) > 0:
             res += pres
         l = open(self.log_filename,'r').readlines()
         llen = len(l)
-        if llen > 0:
-            res.append('<b>Picard Tool Run Log</b><hr/>\n')
+        if llen > 0:
+            res.append('<b>Picard Tool Run Log</b><hr/>\n')
             rlog = ['<pre>',]
             if llen > maxloglines:
                 n = min(50,int(maxloglines/2))
@@ -271,21 +274,21 @@
             res += rlog
         else:
             res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
-        res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
-        res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool')
-        res.append(galhtmlpostfix)
+        res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
+        res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool')
+        res.append(galhtmlpostfix)
         outf = open(self.opts.htmlout,'w')
-        outf.write(''.join(res))
+        outf.write(''.join(res))
         outf.write('\n')
         outf.close()

     def makePicInterval(self,inbed=None,outf=None):
         """
-        picard wants bait and target files to have the same header length as the incoming bam/sam
-        a meaningful (ie accurate) representation will fail because of this - so this hack
-        it would be far better to be able to supply the original bed untouched
-        Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list
-        """
+picard wants bait and target files to have the same header length as the incoming bam/sam
+a meaningful (ie accurate) representation will fail because of this - so this hack
+it would be far better to be able to supply the original bed untouched
+Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list
+"""
         assert inbed <> None
         bed = open(inbed,'r').readlines()
         sbed = [x.split('\t') for x in bed] # lengths MUST be 5
@@ -320,19 +323,19 @@

     def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None):
         """
-        interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
-        Do the work of removing all the error sequences
-        pysam is cool
-        infile = pysam.Samfile( "-", "r" )
-        outfile = pysam.Samfile( "-", "w", template = infile )
-        for s in infile: outfile.write(s)
+interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
+Do the work of removing all the error sequences
+pysam is cool
+infile = pysam.Samfile( "-", "r" )
+outfile = pysam.Samfile( "-", "w", template = infile )
+for s in infile: outfile.write(s)

-        errors from ValidateSameFile.jar look like
-        WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
-        ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
-        ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041
+errors from ValidateSameFile.jar look like
+WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
+ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
+ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041

-        """
+"""
         assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam
         assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path'
         removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2]
@@ -364,7 +367,7 @@
 def __main__():
     doFix = False # tools returning htmlfile don't need this
     doTranspose = True # default
-    maxloglines = 100 # default
+    maxloglines = 100 # default
     #Parse Command Line
     op = optparse.OptionParser()
     # All tools
@@ -374,12 +377,12 @@
     op.add_option('-n', '--title', default="Pick a Picard Tool")
     op.add_option('-t', '--htmlout', default=None)
     op.add_option('-d', '--outdir', default=None)
-    op.add_option('-x', '--maxjheap', default='4g')
+    op.add_option('-x', '--maxjheap', default='3000m')
     op.add_option('-b', '--bisulphite', default='false')
-    op.add_option('-s', '--sortorder', default='query')
+    op.add_option('-s', '--sortorder', default='query')
     op.add_option('','--tmpdir', default='/tmp')
-    op.add_option('-j','--jar',default='')
-    op.add_option('','--picard-cmd',default=None)
+    op.add_option('-j','--jar',default='')
+    op.add_option('','--picard-cmd',default=None)
     # Many tools
     op.add_option( '', '--output-format', dest='output_format', help='Output format' )
     op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' )
@@ -404,7 +407,7 @@
     # FixMateInformation and validate
     # CollectGcBiasMetrics
     op.add_option('', '--windowsize', default='100')
-    op.add_option('', '--mingenomefrac', default='0.00001')
+    op.add_option('', '--mingenomefrac', default='0.00001')
     # AddOrReplaceReadGroups
     op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
     op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
@@ -420,7 +423,7 @@
     # ReplaceSamHeader
     op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' )

-    op.add_option('','--assumesorted', default='true')
+    op.add_option('','--assumesorted', default='true')
     op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*")
     #estimatelibrarycomplexity
     op.add_option('','--minid', default="5")
@@ -439,7 +442,6 @@
     op.add_option('','--probability', default="1")
     op.add_option('','--seed', default="1")

-    ##
     opts, args = op.parse_args()
     opts.sortme = opts.assumesorted == 'false'
     assert opts.input <> None
@@ -542,7 +544,7 @@
         stdouts,rval = pic.runPic(opts.jar, cl)

     elif pic.picname == 'CollectAlignmentSummaryMetrics':
-        # Why do we do this fakefasta thing?
+        # Why do we do this fakefasta thing?
         # Because we need NO fai to be available or picard barfs unless it matches the input data.
         # why? Dunno Seems to work without complaining if the .bai file is AWOL....
         fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
@@ -564,7 +566,7 @@
         cl.append('TMP_DIR=%s' % opts.tmpdir)
         if not opts.assumesorted.lower() == 'true': # we need to sort input
             sortedfile = '%s.sorted' % os.path.basename(opts.input)
-            if opts.datatype == 'sam': # need to work with a bam
+            if opts.datatype == 'sam': # need to work with a bam
                 tlog,tempbam,trval = pic.samToBam(opts.input,opts.outdir)
                 pic.delme.append(tempbam)
                 try:
@@ -579,14 +581,14 @@
             cl.append('INPUT=%s.bam' % os.path.abspath(os.path.join(opts.outdir,sortedfile)))
             pic.delme.append(os.path.join(opts.outdir,sortedfile))
         else:
-            cl.append('INPUT=%s' % os.path.abspath(opts.input))
+            cl.append('INPUT=%s' % os.path.abspath(opts.input))
         stdouts,rval = pic.runPic(opts.jar, cl)


     elif pic.picname == 'CollectGcBiasMetrics':
         assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name
         # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
-        # why? Dunno
+        # why? Dunno
         fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
         try:
             os.symlink(ref_file_name,fakefasta)
@@ -621,11 +623,11 @@

     elif pic.picname == 'CollectInsertSizeMetrics':
         """ <command interpreter="python">
-   picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
-   --histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel"
-   -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
-  </command>
-        """
+picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
+--histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel"
+-j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
+</command>
+"""
         isPDF = 'InsertSizeHist.pdf'
         pdfpath = os.path.join(opts.outdir,isPDF)
         histpdf = 'InsertSizeHist.pdf'
@@ -633,8 +635,8 @@
         cl.append('O=%s' % pic.metricsOut)
         cl.append('HISTOGRAM_FILE=%s' % histpdf)
         #if opts.taillimit <> '0': # this was deprecated although still mentioned in the docs at 1.56
-        #    cl.append('TAIL_LIMIT=%s' % opts.taillimit)
-        if  opts.histwidth <> '0':
+        # cl.append('TAIL_LIMIT=%s' % opts.taillimit)
+        if opts.histwidth <> '0':
             cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth)
         if float( opts.minpct) > 0.0:
             cl.append('MINIMUM_PCT=%s' % opts.minpct)
@@ -645,7 +647,7 @@
             malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists]
             cl += malist
         stdouts,rval = pic.runPic(opts.jar, cl)
-        if os.path.exists(pdfpath): # automake thumbnail - will be added to html
+        if os.path.exists(pdfpath): # automake thumbnail - will be added to html
             cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
             pic.runCL(cl=cl2,output_dir=opts.outdir)
         else:
@@ -663,7 +665,7 @@
         # input
         cl.append('INPUT=%s' % opts.input)
         # outputs
-        cl.append('OUTPUT=%s' % opts.output)
+        cl.append('OUTPUT=%s' % opts.output)
         cl.append('METRICS_FILE=%s' % pic.metricsOut )
         # remove or mark duplicates
         cl.append('REMOVE_DUPLICATES=%s' % opts.remdups)
@@ -711,7 +713,7 @@
         if opts.targetbed == opts.baitbed: # same file sometimes
             targetf = baitf
         else:
-            targetf = pic.makePicInterval(opts.targetbed,targetfname)
+            targetf = pic.makePicInterval(opts.targetbed,targetfname)
         cl.append('BAIT_INTERVALS=%s' % baitf)
         cl.append('TARGET_INTERVALS=%s' % targetf)
         cl.append('INPUT=%s' % os.path.abspath(opts.input))
@@ -725,7 +727,7 @@
         sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
         stf = open(pic.log_filename,'w')
         tlog = None
-        if opts.datatype == 'sam': # need to work with a bam
+        if opts.datatype == 'sam': # need to work with a bam
             tlog,tempbam,rval = pic.samToBam(opts.input,opts.outdir)
             try:
                 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
@@ -740,7 +742,7 @@
             print '##tlog=',tlog
             stf.write(tlog)
             stf.write('\n')
-        sortedfile = '%s.bam' % sortedfile # samtools does that
+        sortedfile = '%s.bam' % sortedfile # samtools does that
         cl.append('O=%s' % pic.metricsOut)
         cl.append('TMP_DIR=%s' % opts.tmpdir)
         cl.append('I=%s' % sortedfile)
@@ -752,7 +754,7 @@
         if opts.bisulphite.lower() <> 'false':
             cl.append('IS_BISULFITE_SEQUENCED=true')
         if opts.ref <> None or opts.ref_file <> None:
-            cl.append('R=%s' %  ref_file_name)
+            cl.append('R=%s' % ref_file_name)
         stdouts,rval = pic.runPic(opts.jar,cl)
         if opts.datatype == 'sam':
             pic.delme.append(tempbam)
@@ -763,6 +765,34 @@
         pic.delme.append(sortedfile) # not wanted
         stf.close()
         pic.cleanup()
+
+####liubo added CleanSam tool####
+    elif pic.picname == 'CleanSam':
+        # input
+        cl.append('INPUT=%s' % opts.input)
+        # output
+        cl.append('OUTPUT=%s' % tempout)
+        stdouts,rval = pic.runPic(opts.jar, cl)
+        haveTempout = True
+
+    elif pic.picname == 'SortSam':
+        cl.append('I=%s' % opts.input)
+        cl.append('O=%s' % tempout)
+        cl.append('SORT_ORDER=%s' % opts.sortorder)
+        stdouts,rval = pic.runPic(opts.jar,cl)
+        haveTempout = True
+
+    elif pic.picname == 'BuildBamIndex':
+        cl.append('I=%s' % opts.input)
+        cl.append('O=%s' % tempout)
+        stdouts,rval = pic.runPic(opts.jar,cl)
+        haveTempout = True
+
+    elif pic.picname == 'SamFormatConverter':
+        cl.append('INPUT=%s' % opts.input)
+        cl.append('OUTPUT=%s' % tempout)
+        pic.runPic(opts.jar, cl)
+        haveTempout = True
     elif pic.picname == "DownsampleSam":
         cl.append('I=%s' % opts.input)
         mystring = opts.output
@@ -775,17 +805,13 @@
         stdouts,rval = pic.runPic(opts.jar, cl)
         myoutput = mystringsam.replace(".sam", "")
         os.rename(mystringsam,myoutput)
-    elif pic.picname == 'SamFormatConverter':
-        cl.append('INPUT=%s' % opts.input)
-        cl.append('OUTPUT=%s' % tempout)
-        pic.runPic(opts.jar, cl)
-        haveTempout = True
-
+
+
     else:
         print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
         sys.exit(1)
     if haveTempout:
-        # Some Picard tools produced a potentially intermediate bam file.
+        # Some Picard tools produced a potentially intermediate bam file.
         # Either just move to final location or create sam
         if os.path.exists(tempout):
             shutil.move(tempout, os.path.abspath(opts.output))
@@ -795,4 +821,3 @@
         print >> sys.stderr, '## exit code=%d; stdout=%s' % (rval,stdouts)
         # signal failure
 if __name__=="__main__": __main__()
-