changeset 4:a82400332451

Use shear_wrapper.py to generate reference indexes when needed
author Jim Johnson <jj@umn.edu>
date Tue, 16 Jul 2013 12:38:48 -0500
parents 630532975fa9
children aaaa5a071ff0
files datatypes_conf.xml lib/galaxy/datatypes/shear.py shear_assemble.xml shear_sv.xml shear_wrapper.py tool_dependencies.xml
diffstat 6 files changed, 364 insertions(+), 31 deletions(-) [+]
line wrap: on
line diff
--- a/datatypes_conf.xml	Mon Jul 08 01:10:38 2013 -0500
+++ b/datatypes_conf.xml	Tue Jul 16 12:38:48 2013 -0500
@@ -1,6 +1,10 @@
 <?xml version="1.0"?>
 <datatypes>
+    <datatype_files>
+        <datatype_file name="shear.py"/>
+    </datatype_files>
     <registration>
         <datatype extension="shear.sdi" type="galaxy.datatypes.tabular:Tabular" subclass="True"/>
+        <datatype extension="shear.svidx" type="galaxy.datatypes.shear:ShearSvIndex" />
     </registration>
 </datatypes>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/galaxy/datatypes/shear.py	Tue Jul 16 12:38:48 2013 -0500
@@ -0,0 +1,65 @@
+import sys,os
+import logging
+import galaxy.datatypes.data
+from galaxy.datatypes.data import Text
+from galaxy.datatypes.metadata import MetadataElement
+
+class ShearSvIndex( Text ):
+    MetadataElement( name="base_name", desc="base name for this index set", default='shear_ref', set_in_upload=True, readonly=True )
+    """
+    generated files:
+      copy fasta:
+        base_name.fa
+      index fasta with samtools index:
+        base_name.fa.fai
+      create 2bit with faToTwoBit:
+        base_name.2bit
+      create bwa index:
+        base_name.fa.amb
+        base_name.fa.ann
+        base_name.fa.bwt
+        base_name.fa.pac
+        base_name.fa.rbwt
+        base_name.fa.rpac
+        base_name.fa.rsa
+        base_name.fa.sa
+    """
+    file_ext = 'shear.svidx'
+    composite_type = 'auto_primary_file'
+    allow_datatype_change = False
+
+    def __init__(self, **kwd):
+        """Initialize  datatype"""
+        Text.__init__( self, **kwd )
+
+    def init_meta( self, dataset, copy_from=None ):
+        Text.init_meta( self, dataset, copy_from=copy_from )
+
+    def generate_primary_file( self, dataset = None ):
+        """ 
+        This is called only at upload to write the html file
+        cannot rename the datasets here - they come with the default unfortunately
+        """
+
+    def regenerate_primary_file(self,dataset):
+        """
+        cannot do this until we are setting metadata 
+        """
+        bn = dataset.metadata.base_name
+        f = file(dataset.file_name,'w')
+        f.write(bn)
+        f.close()
+
+    def set_meta( self, dataset, overwrite = True, **kwd ):
+        efp = dataset.extra_files_path
+        flist = os.listdir(efp)
+        for i,fname in enumerate(flist):
+            if fname.endswith('.fa'):
+                dataset.metadata.base_name = fname[:-3]
+                break
+            if fname.endswith('.2bit'):
+                dataset.metadata.base_name = fname[:-5]
+                break
+        self.regenerate_primary_file(dataset)
+
+
--- a/shear_assemble.xml	Mon Jul 08 01:10:38 2013 -0500
+++ b/shear_assemble.xml	Tue Jul 16 12:38:48 2013 -0500
@@ -6,15 +6,21 @@
   <!--
   <version_command></version_command>
   -->
-  <command>java -jar \$SHEAR_JAR_PATH/SHEAR.jar assemble -s $sdi_file 
-     #if $genomeSource.refGenomeSource == 'indexed':
-       -f $genomeSource.ref_fastas.fields.path
-     #else:
-       -f $genomeSource.ref_fasta
-     #end if
-     -o $output_fasta
+  <command interpreter="python">
+    shear_wrapper.py -j \$SHEAR_JAR_PATH/SHEAR.jar --command assemble 
+    -p $prefix
+    -s $sdi_file
+    #if $genomeSource.refGenomeSource == 'indexed':
+      -f $genomeSource.ref_fastas.fields.path
+    #else:
+      -f $genomeSource.ref_fasta
+    #end if
+    -o $output_fasta
   </command>
   <inputs>
+    <param name="prefix" type="text" value="shear_assemble" label="Prefix for all generated files">
+      <validator type="regex" message="Prefix should start with a letter and contain only letter, digit, and '_' or '-' characters">[a-zA-Z0-9][_a-zA-Z0-9-]*</validator>
+    </param>
     <param name="sdi_file" type="data" format="shear.sdi" label="SDI file produced by SHEAR's 'sv' command containing the SVs to use to create the new genomic sequence."/> 
     <!-- reference data -->
     <conditional name="genomeSource">
@@ -39,13 +45,14 @@
     <exit_code range="1:" level="fatal" description="Error" />
   </stdio>
   <outputs>
-    <data format="fasta" name="output_fasta" label="${tool.name} on ${on_string}: personal genome"  />
+    <data format="fasta" name="output_fasta" label="${tool.name} on ${on_string}: ${prefix}.fa"  />
   </outputs>
   <tests>
     <test>
-      <param name="sdi_file" ftype="shear.sdi" value="shear_sv.sdi"/>
+      <param name="prefix" value="shear_assembly" />
+      <param name="sdi_file" value="shear_sv.sdi" ftype="shear.sdi" />
       <param name="refGenomeSource" value="history"/>
-      <param name="ref_fasta" ftype="fasta" value="syn.fa"/>
+      <param name="ref_fasta" value="syn.fa" ftype="fasta" />
       <output name="output_fasta"  file="simulated-data.fa"/>
     </test>
   </tests>
--- a/shear_sv.xml	Mon Jul 08 01:10:38 2013 -0500
+++ b/shear_sv.xml	Tue Jul 16 12:38:48 2013 -0500
@@ -11,27 +11,38 @@
   <!--
   <version_command></version_command>
   -->
-  <command>java -jar  \$SHEAR_JAR_PATH/SHEAR.jar sv -p $prefix 
-     -b $bamfile 
+  <command interpreter="python">
+    shear_wrapper.py -j \$SHEAR_JAR_PATH/SHEAR.jar --command sv 
+    -p $prefix 
+    -b $bamfile 
      #if $genomeSource.refGenomeSource == 'indexed':
        -f $genomeSource.ref_fastas.fields.path 
        -i $genomeSource.bwa_indices.fields.path 
        -t $genomeSource.twobit_indices.fields.path
+     #elif $genomeSource.refGenomeSource == 'svidx':
+       -f $genomeSource.svidx.extra_files_path/${genomeSource.svidx.metadata.base_name}.fa
+       -i $genomeSource.svidx.extra_files_path/${genomeSource.svidx.metadata.base_name}.fa
+       -t $genomeSource.svidx.extra_files_path/${genomeSource.svidx.metadata.base_name}.2bit
      #else:
        -f $genomeSource.ref_fasta
-       -i $genomeSource.bwa_index
-       -t $genomeSource.twobit
+       #if $genomeSource.save_svidx:
+         -D $sv_idx.extra_files_path
+       #end if
      #end if
+     --report $report
+     --sdi $sdi
   </command>
   <inputs>
     <param name="bamfile" type="data" format="bam" label="BAM alignment file containing the input sequences to the assembly."/> 
+    <param name="prefix" type="text" value="shear_sv" label="Prefix for all generated files">
+      <validator type="regex" message="Prefix should start with a letter and contain only letter, digit, and '_' or '-' characters">[a-zA-Z0-9][_a-zA-Z0-9-]*</validator>
+    </param>
     <!-- reference data -->
     <conditional name="genomeSource">
       <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a cached file?">
         <option value="indexed" selected="true">Use a cached reference genome</option>
-        <!--
-        <option value="history">Use one from the history</option>
-        -->
+        <option value="history">Use fasta from the history</option>
+        <option value="svidx">Use SHEAR sv index from the history</option>
       </param>
       <when value="indexed">
         <param name="ref_fastas" type="select" label="Select a reference genome fasta">
@@ -59,31 +70,32 @@
           </options>
         </param>
       </when>
-      <!--
       <when value="history">
-        <param name="ref_fasta" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
-        <param name="twobit" type="data" format="twobit" metadata_name="dbkey" label="Select a reference from history" />
-        <param name="bwa_index" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
+        <param name="ref_fasta" type="data" format="fasta" label="Select a reference from history" />
+        <param name="save_svidx" type="boolean" truevalue="yes" falsevalue="no" label="Save sv generated reference indexes"/> 
       </when>
-      -->
+      <when value="svidx">
+        <param name="svidx" type="data" format="shear.svidx" label="Select a reference from history" />
+      </when>
     </conditional>
-    <param name="prefix" type="hidden" value="shear_sv" label="Prefix for all generated files"/>
   </inputs>
   <stdio>
     <exit_code range="1:" level="fatal" description="Error" />
   </stdio>
   <outputs>
-    <data format="shear.sdi" name="sdi" label="${tool.name} on ${on_string}: shear_sv.sdi" from_work_dir="shear_sv.sdi" />
-    <data format="txt" name="report" label="${tool.name} on ${on_string}: shear_sv.report" from_work_dir="shear_sv.report"/>
+    <data format="shear.sdi" name="sdi" label="${tool.name} on ${on_string}: ${prefix}.sdi" />
+    <data format="txt" name="report" label="${tool.name} on ${on_string}: ${prefix}.report" />
+    <data format="shear.svidx" name="sv_idx" label="${tool.name} on ${on_string}: ${prefix}.svidx">
+      <filter>genomeSource['refGenomeSource'] == 'history' and genomeSource['save_svidx'] == True</filter>
+    </data>
   </outputs>
   <tests>
     <test>
-      <param name="bamfile" ftype="bam" value="simulated-data.bam"/>
+      <param name="prefix" value="shear_test" />
+      <param name="bamfile" value="simulated-data.bam" ftype="bam" />
       <param name="refGenomeSource" value="history"/>
-      <param name="ref_fasta" ftype="fasta" value="syn.fa"/>
-      <param name="bwa_index" ftype="" value="syn.fa"/>
-      <param name="twobit" ftype="twobit" value="syn.2bit"/>
-      <output name="shear.sdi"  file="shear_sv.sdi"/>
+      <param name="ref_fasta" value="syn.fa" ftype="fasta" />
+      <output name="sdi"  file="shear_sv.sdi"/>
       <output name="report"  file="shear_sv.report"/>
     </test>
   </tests>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/shear_wrapper.py	Tue Jul 16 12:38:48 2013 -0500
@@ -0,0 +1,241 @@
+#!/usr/bin/env python
+
+import optparse, os, shutil, subprocess, sys, tempfile, os.path
+
+def stop_err( msg, code ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit(code)
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-j', '--jar_path', dest='jar_path', help='Path to SHEAR.jar' )
+    parser.add_option( '-C', '--command', dest='command', help='SHEAR command to run' )
+    parser.add_option( '-p', '--prefix', dest='prefix', help='Prefix for all generated files. Required.' )
+    parser.add_option( '-b', '--bam', dest='bam', help='BAM alignment file containing the input sequences to the assembly.' )
+    parser.add_option( '-B', '--bai', dest='bai', help='BAM index file (.bai)' )
+    parser.add_option( '-f', '--fasta_reference', dest='fasta', help='The reference sequence fasta file' )
+    parser.add_option( '-F', '--fasta_index', dest='fai', help='The .fai index file for the reference sequence fasta' )
+    parser.add_option( '-t', '--twobit', dest='twobit', help='The .2bit encoding of the reference sequence fasta generated by faToTwoBit' )
+    parser.add_option( '-i', '--bwa_index', dest='bwa_index', help='The bwa index of the reference sequence' )
+    parser.add_option( '-r', '--report', dest='report', help='The SHEAR output report' )
+    parser.add_option( '-s', '--sdi', dest='sdi', help='The SHEAR sdi input from the SHEAR sv command' )
+    parser.add_option( '-o', '--output', dest='output', help='The SHEAR output assembly fasta file' )
+    parser.add_option( '-D', '--svidx_dir', dest='svidx_dir', help='The SHEAR output assembly fasta file' )
+    (options, args) = parser.parse_args()
+
+    def make_ref(src, dest, copy=False):
+        if copy:
+            shutil.copy( src, dest )
+        else:
+            os.symlink( src, dest )
+
+    # output version # of tool
+    #try:
+    #except:
+    #    sys.stdout.write( 'Could not determine BWA version\n' )
+    if not options.jar_path:
+        stop_err('path to SHEAR.jar is required',1)
+    if options.command:
+        command = options.command
+    else:
+        stop_err('SHEAR command is required',1)
+    args = [ 'java','-jar' ]
+    args.append( options.jar_path )
+    args.append( command )
+    # Check required params for command
+    buffsize = 1048576 
+    copy_ref = False
+    prefix = options.prefix if options.prefix and len(options.prefix) > 0 else 'ref'
+    if command in ['sv']:
+        args.append( '-p' )
+        args.append( prefix )
+    if options.svidx_dir and command in ['sv']:
+        if not os.path.isdir(options.svidx_dir):
+            os.makedirs(options.svidx_dir)
+        ref_prefix = os.path.join(options.svidx_dir,prefix)
+        copy_ref = True
+    else:
+        ref_prefix = os.path.join(os.getcwd(),prefix)
+
+    if command in ['sv','assemble']:
+        ref_fasta = ref_prefix + '.fa'
+        ref_index = ref_fasta + '.fai'
+        if options.fasta:
+            make_ref( options.fasta, ref_fasta, copy=copy_ref )
+        else:
+            stop_err('Reference fasta file is required',3)
+        # if reference fasta index is not supllied, generate it
+        if options.fai:
+            make_ref( options.fai, ref_index, copy=copy_ref )
+        elif os.path.exists(options.fasta + '.fai'):
+            make_ref( options.fasta + '.fai', ref_index, copy=copy_ref )
+        else:
+            # generate fasta fai index
+            cmd1 = 'samtools faidx %s' % (ref_fasta )
+            try:
+                tmp_name = tempfile.NamedTemporaryFile(prefix='fai_', suffix='.err').name
+                tmp_stderr = open( tmp_name, 'wb' )
+                proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                if returncode != 0:
+                    stderr = ''
+                    # get stderr, allowing for case where it's very large
+                    tmp_stderr = open( tmp, 'rb' )
+                    try:
+                        while True:
+                            stderr += tmp_stderr.read( buffsize )
+                            if not stderr or len( stderr ) % buffsize != 0:
+                                break
+                    except OverflowError:
+                        pass
+                    raise Exception, stderr
+            except Exception, e:
+                stop_err( 'Error indexing reference sequence fasta file. ' + str( e ),5 )
+        args.append( '-f' )
+        args.append( ref_fasta )
+
+    if command in ['sv']:
+        if not options.bam:
+            stop_err('BAM alignment file is required',2)
+
+        # if reference 2bit is not supplied, generate it
+        twobit = ref_prefix + '.2bit'
+        if options.twobit:
+            make_ref( options.twobit, twobit, copy=copy_ref )
+        else:
+            # generate 2bit index
+            cmd1 = 'faToTwoBit %s %s' % (ref_fasta, twobit )
+            try:
+                tmp_name = tempfile.NamedTemporaryFile(prefix='twobit_', suffix='.err').name
+                tmp_stderr = open( tmp_name, 'wb' )
+                proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                if returncode != 0:
+                    stderr = ''
+                    # get stderr, allowing for case where it's very large
+                    tmp_stderr = open( tmp, 'rb' )
+                    try:
+                        while True:
+                            stderr += tmp_stderr.read( buffsize )
+                            if not stderr or len( stderr ) % buffsize != 0:
+                                break
+                    except OverflowError:
+                        pass
+                    raise Exception, stderr
+            except Exception, e:
+                stop_err( 'Error indexing reference sequence fasta file. ' + str( e ),6 )
+        args.append( '-t' )
+        args.append( twobit )
+
+        # if bwa index is not supplied, generate it
+        bwa_index = ref_fasta
+        if options.bwa_index:
+            if copy_ref:
+                shutil.copytree(os.path.dirname(bwa_index),options.svidx_dir)
+            bwa_index = options.bwa_index
+        else:
+            ONE_GB = 2**30
+            if os.stat( ref_fasta ).st_size <= ONE_GB: #use 1 GB as cut off for memory vs. max of 2gb database size; this is somewhat arbitrary
+                index_algorithm = 'is'
+            else:
+                index_algorithm = 'bwtsw'
+            cmd1 = 'bwa index -a %s %s' % ( index_algorithm, ref_fasta )
+            try:
+                tmp_name = tempfile.NamedTemporaryFile(prefix='bwa_index_', suffix='.err').name
+                tmp_stderr = open( tmp_name, 'wb' )
+                proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                if returncode != 0:
+                    stderr = ''
+                    # get stderr, allowing for case where it's very large
+                    tmp_stderr = open( tmp, 'rb' )
+                    try:
+                        while True:
+                            stderr += tmp_stderr.read( buffsize )
+                            if not stderr or len( stderr ) % buffsize != 0:
+                                break
+                    except OverflowError:
+                        pass
+                    raise Exception, stderr
+                bwa_index = ref_fasta
+            except Exception, e:
+                # clean up temp dirs
+                stop_err( 'Error creating bwa index. ' + str( e ),7 )
+        args.append( '-i' )
+        args.append( bwa_index )
+
+        bam_file = os.path.join(os.getcwd(),prefix + '.bam')
+        bai_file = bam_file + '.bai'
+        if options.bam:
+            os.symlink( options.bam, bam_file )
+        else:
+            stop_err('BAM alignment file is required',2)
+        # if bam index is not supplied, generate it
+        if options.bai:
+            os.symlink( options.bai, bai_file )
+        elif os.path.exists(options.bam + '.bai'):
+            os.symlink( options.bam + '.bai', bai_file )
+        else:
+            # generate bam index
+            cmd1 = 'samtools index %s %s' % (bam_file, bai_file )
+            try:
+                tmp_name = tempfile.NamedTemporaryFile(prefix='bai_', suffix='.err').name
+                tmp_stderr = open( tmp_name, 'wb' )
+                proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                if returncode != 0:
+                    stderr = ''
+                    # get stderr, allowing for case where it's very large
+                    tmp_stderr = open( tmp, 'rb' )
+                    try:
+                        while True:
+                            stderr += tmp_stderr.read( buffsize )
+                            if not stderr or len( stderr ) % buffsize != 0:
+                                break
+                    except OverflowError:
+                        pass
+                    raise Exception, stderr
+            except Exception, e:
+                stop_err( 'Error indexing BAM alignment file. ' + str( e ),4)
+        args.append( '-b' )
+        args.append( bam_file )
+    
+    if command in ['assemble']:
+        if not options.sdi or not options.output:
+            stop_err('SHEAR .sdi file and output file are required for assemble',2)
+        args.append( '-s' )
+        args.append( options.sdi )
+        args.append( '-o' )
+        args.append( options.output )
+
+    # Run SHEAR command
+    print >> sys.stdout, "%s" % ' '.join(args)
+    try:
+        proc = subprocess.Popen( args=args, shell=False )
+        returncode = proc.wait()
+        if returncode != 0:
+            stop_err( 'Error running SHEAR ' + command, returncode )
+        if command in ['sv']:
+            report_path = os.path.join(os.getcwd(),prefix + '.report')
+            if os.path.exists(report_path):
+                if options.report:
+                    shutil.copy(report_path,options.report)
+            else:
+                raise Exception, 'no report file'
+            sdi_path = os.path.join(os.getcwd(),prefix + '.sdi')
+            if os.path.exists(sdi_path):
+                if options.sdi:
+                    shutil.copy(sdi_path,options.sdi)
+            else:
+                raise Exception, 'no sdi file'
+    except Exception, e:
+        # clean up temp dirs
+        stop_err( 'Error running SHEAR %s %s' % (command,str(e)),9 )
+
+if __name__=="__main__": __main__()
+
--- a/tool_dependencies.xml	Mon Jul 08 01:10:38 2013 -0500
+++ b/tool_dependencies.xml	Tue Jul 16 12:38:48 2013 -0500
@@ -1,6 +1,5 @@
 <?xml version="1.0"?>
 <tool_dependency>
-http://testtoolshed.g2.bx.psu.edu/
     <package name="samtools" version="0.1.18">
         <repository changeset_revision="5f7ec5048224" name="package_samtools_0_1_18" owner="devteam" toolshed="http://testtoolshed.g2.bx.psu.edu" />
     </package>
@@ -12,6 +11,11 @@
     </package>
     <package name="crest" version="1.0.1">
         <repository changeset_revision="89a9d6ce7f96" name="package_crest_1_0_1" owner="jjohnson" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+          <readme>
+CREST requires: 
+  Bioperl package Bio::DB:Sam  
+  cap3 which only has binary executable downloads from:   http://seq.cs.iastate.edu/cap3.html
+          </readme>
     </package>
     <package name="shear" version="0.1.2">
         <repository changeset_revision="c2ab94f12a7a" name="package_shear_0_1_2" owner="jjohnson" toolshed="http://testtoolshed.g2.bx.psu.edu" />