Mercurial > repos > jjohnson > shear
changeset 4:a82400332451
Use shear_wrapper.py to generate reference indexes when needed
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Tue, 16 Jul 2013 12:38:48 -0500 |
parents | 630532975fa9 |
children | aaaa5a071ff0 |
files | datatypes_conf.xml lib/galaxy/datatypes/shear.py shear_assemble.xml shear_sv.xml shear_wrapper.py tool_dependencies.xml |
diffstat | 6 files changed, 364 insertions(+), 31 deletions(-) [+] |
line wrap: on
line diff
--- a/datatypes_conf.xml Mon Jul 08 01:10:38 2013 -0500 +++ b/datatypes_conf.xml Tue Jul 16 12:38:48 2013 -0500 @@ -1,6 +1,10 @@ <?xml version="1.0"?> <datatypes> + <datatype_files> + <datatype_file name="shear.py"/> + </datatype_files> <registration> <datatype extension="shear.sdi" type="galaxy.datatypes.tabular:Tabular" subclass="True"/> + <datatype extension="shear.svidx" type="galaxy.datatypes.shear:ShearSvIndex" /> </registration> </datatypes>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/galaxy/datatypes/shear.py Tue Jul 16 12:38:48 2013 -0500 @@ -0,0 +1,65 @@ +import sys,os +import logging +import galaxy.datatypes.data +from galaxy.datatypes.data import Text +from galaxy.datatypes.metadata import MetadataElement + +class ShearSvIndex( Text ): + MetadataElement( name="base_name", desc="base name for this index set", default='shear_ref', set_in_upload=True, readonly=True ) + """ + generated files: + copy fasta: + base_name.fa + index fasta with samtools index: + base_name.fa.fai + create 2bit with faToTwoBit: + base_name.2bit + create bwa index: + base_name.fa.amb + base_name.fa.ann + base_name.fa.bwt + base_name.fa.pac + base_name.fa.rbwt + base_name.fa.rpac + base_name.fa.rsa + base_name.fa.sa + """ + file_ext = 'shear.svidx' + composite_type = 'auto_primary_file' + allow_datatype_change = False + + def __init__(self, **kwd): + """Initialize datatype""" + Text.__init__( self, **kwd ) + + def init_meta( self, dataset, copy_from=None ): + Text.init_meta( self, dataset, copy_from=copy_from ) + + def generate_primary_file( self, dataset = None ): + """ + This is called only at upload to write the html file + cannot rename the datasets here - they come with the default unfortunately + """ + + def regenerate_primary_file(self,dataset): + """ + cannot do this until we are setting metadata + """ + bn = dataset.metadata.base_name + f = file(dataset.file_name,'w') + f.write(bn) + f.close() + + def set_meta( self, dataset, overwrite = True, **kwd ): + efp = dataset.extra_files_path + flist = os.listdir(efp) + for i,fname in enumerate(flist): + if fname.endswith('.fa'): + dataset.metadata.base_name = fname[:-3] + break + if fname.endswith('.2bit'): + dataset.metadata.base_name = fname[:-5] + break + self.regenerate_primary_file(dataset) + +
--- a/shear_assemble.xml Mon Jul 08 01:10:38 2013 -0500 +++ b/shear_assemble.xml Tue Jul 16 12:38:48 2013 -0500 @@ -6,15 +6,21 @@ <!-- <version_command></version_command> --> - <command>java -jar \$SHEAR_JAR_PATH/SHEAR.jar assemble -s $sdi_file - #if $genomeSource.refGenomeSource == 'indexed': - -f $genomeSource.ref_fastas.fields.path - #else: - -f $genomeSource.ref_fasta - #end if - -o $output_fasta + <command interpreter="python"> + shear_wrapper.py -j \$SHEAR_JAR_PATH/SHEAR.jar --command assemble + -p $prefix + -s $sdi_file + #if $genomeSource.refGenomeSource == 'indexed': + -f $genomeSource.ref_fastas.fields.path + #else: + -f $genomeSource.ref_fasta + #end if + -o $output_fasta </command> <inputs> + <param name="prefix" type="text" value="shear_assemble" label="Prefix for all generated files"> + <validator type="regex" message="Prefix should start with a letter and contain only letter, digit, and '_' or '-' characters">[a-zA-Z0-9][_a-zA-Z0-9-]*</validator> + </param> <param name="sdi_file" type="data" format="shear.sdi" label="SDI file produced by SHEAR's 'sv' command containing the SVs to use to create the new genomic sequence."/> <!-- reference data --> <conditional name="genomeSource"> @@ -39,13 +45,14 @@ <exit_code range="1:" level="fatal" description="Error" /> </stdio> <outputs> - <data format="fasta" name="output_fasta" label="${tool.name} on ${on_string}: personal genome" /> + <data format="fasta" name="output_fasta" label="${tool.name} on ${on_string}: ${prefix}.fa" /> </outputs> <tests> <test> - <param name="sdi_file" ftype="shear.sdi" value="shear_sv.sdi"/> + <param name="prefix" value="shear_assembly" /> + <param name="sdi_file" value="shear_sv.sdi" ftype="shear.sdi" /> <param name="refGenomeSource" value="history"/> - <param name="ref_fasta" ftype="fasta" value="syn.fa"/> + <param name="ref_fasta" value="syn.fa" ftype="fasta" /> <output name="output_fasta" file="simulated-data.fa"/> </test> </tests>
--- a/shear_sv.xml Mon Jul 08 01:10:38 2013 -0500 +++ b/shear_sv.xml Tue Jul 16 12:38:48 2013 -0500 @@ -11,27 +11,38 @@ <!-- <version_command></version_command> --> - <command>java -jar \$SHEAR_JAR_PATH/SHEAR.jar sv -p $prefix - -b $bamfile + <command interpreter="python"> + shear_wrapper.py -j \$SHEAR_JAR_PATH/SHEAR.jar --command sv + -p $prefix + -b $bamfile #if $genomeSource.refGenomeSource == 'indexed': -f $genomeSource.ref_fastas.fields.path -i $genomeSource.bwa_indices.fields.path -t $genomeSource.twobit_indices.fields.path + #elif $genomeSource.refGenomeSource == 'svidx': + -f $genomeSource.svidx.extra_files_path/${genomeSource.svidx.metadata.base_name}.fa + -i $genomeSource.svidx.extra_files_path/${genomeSource.svidx.metadata.base_name}.fa + -t $genomeSource.svidx.extra_files_path/${genomeSource.svidx.metadata.base_name}.2bit #else: -f $genomeSource.ref_fasta - -i $genomeSource.bwa_index - -t $genomeSource.twobit + #if $genomeSource.save_svidx: + -D $sv_idx.extra_files_path + #end if #end if + --report $report + --sdi $sdi </command> <inputs> <param name="bamfile" type="data" format="bam" label="BAM alignment file containing the input sequences to the assembly."/> + <param name="prefix" type="text" value="shear_sv" label="Prefix for all generated files"> + <validator type="regex" message="Prefix should start with a letter and contain only letter, digit, and '_' or '-' characters">[a-zA-Z0-9][_a-zA-Z0-9-]*</validator> + </param> <!-- reference data --> <conditional name="genomeSource"> <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a cached file?"> <option value="indexed" selected="true">Use a cached reference genome</option> - <!-- - <option value="history">Use one from the history</option> - --> + <option value="history">Use fasta from the history</option> + <option value="svidx">Use SHEAR sv index from the history</option> </param> <when value="indexed"> <param name="ref_fastas" type="select" label="Select a reference genome fasta"> @@ -59,31 +70,32 @@ </options> </param> </when> - <!-- <when value="history"> - <param name="ref_fasta" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" /> - <param name="twobit" type="data" format="twobit" metadata_name="dbkey" label="Select a reference from history" /> - <param name="bwa_index" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" /> + <param name="ref_fasta" type="data" format="fasta" label="Select a reference from history" /> + <param name="save_svidx" type="boolean" truevalue="yes" falsevalue="no" label="Save sv generated reference indexes"/> </when> - --> + <when value="svidx"> + <param name="svidx" type="data" format="shear.svidx" label="Select a reference from history" /> + </when> </conditional> - <param name="prefix" type="hidden" value="shear_sv" label="Prefix for all generated files"/> </inputs> <stdio> <exit_code range="1:" level="fatal" description="Error" /> </stdio> <outputs> - <data format="shear.sdi" name="sdi" label="${tool.name} on ${on_string}: shear_sv.sdi" from_work_dir="shear_sv.sdi" /> - <data format="txt" name="report" label="${tool.name} on ${on_string}: shear_sv.report" from_work_dir="shear_sv.report"/> + <data format="shear.sdi" name="sdi" label="${tool.name} on ${on_string}: ${prefix}.sdi" /> + <data format="txt" name="report" label="${tool.name} on ${on_string}: ${prefix}.report" /> + <data format="shear.svidx" name="sv_idx" label="${tool.name} on ${on_string}: ${prefix}.svidx"> + <filter>genomeSource['refGenomeSource'] == 'history' and genomeSource['save_svidx'] == True</filter> + </data> </outputs> <tests> <test> - <param name="bamfile" ftype="bam" value="simulated-data.bam"/> + <param name="prefix" value="shear_test" /> + <param name="bamfile" value="simulated-data.bam" ftype="bam" /> <param name="refGenomeSource" value="history"/> - <param name="ref_fasta" ftype="fasta" value="syn.fa"/> - <param name="bwa_index" ftype="" value="syn.fa"/> - <param name="twobit" ftype="twobit" value="syn.2bit"/> - <output name="shear.sdi" file="shear_sv.sdi"/> + <param name="ref_fasta" value="syn.fa" ftype="fasta" /> + <output name="sdi" file="shear_sv.sdi"/> <output name="report" file="shear_sv.report"/> </test> </tests>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/shear_wrapper.py Tue Jul 16 12:38:48 2013 -0500 @@ -0,0 +1,241 @@ +#!/usr/bin/env python + +import optparse, os, shutil, subprocess, sys, tempfile, os.path + +def stop_err( msg, code ): + sys.stderr.write( '%s\n' % msg ) + sys.exit(code) + +def __main__(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option( '-j', '--jar_path', dest='jar_path', help='Path to SHEAR.jar' ) + parser.add_option( '-C', '--command', dest='command', help='SHEAR command to run' ) + parser.add_option( '-p', '--prefix', dest='prefix', help='Prefix for all generated files. Required.' ) + parser.add_option( '-b', '--bam', dest='bam', help='BAM alignment file containing the input sequences to the assembly.' ) + parser.add_option( '-B', '--bai', dest='bai', help='BAM index file (.bai)' ) + parser.add_option( '-f', '--fasta_reference', dest='fasta', help='The reference sequence fasta file' ) + parser.add_option( '-F', '--fasta_index', dest='fai', help='The .fai index file for the reference sequence fasta' ) + parser.add_option( '-t', '--twobit', dest='twobit', help='The .2bit encoding of the reference sequence fasta generated by faToTwoBit' ) + parser.add_option( '-i', '--bwa_index', dest='bwa_index', help='The bwa index of the reference sequence' ) + parser.add_option( '-r', '--report', dest='report', help='The SHEAR output report' ) + parser.add_option( '-s', '--sdi', dest='sdi', help='The SHEAR sdi input from the SHEAR sv command' ) + parser.add_option( '-o', '--output', dest='output', help='The SHEAR output assembly fasta file' ) + parser.add_option( '-D', '--svidx_dir', dest='svidx_dir', help='The SHEAR output assembly fasta file' ) + (options, args) = parser.parse_args() + + def make_ref(src, dest, copy=False): + if copy: + shutil.copy( src, dest ) + else: + os.symlink( src, dest ) + + # output version # of tool + #try: + #except: + # sys.stdout.write( 'Could not determine BWA version\n' ) + if not options.jar_path: + stop_err('path to SHEAR.jar is required',1) + if options.command: + command = options.command + else: + stop_err('SHEAR command is required',1) + args = [ 'java','-jar' ] + args.append( options.jar_path ) + args.append( command ) + # Check required params for command + buffsize = 1048576 + copy_ref = False + prefix = options.prefix if options.prefix and len(options.prefix) > 0 else 'ref' + if command in ['sv']: + args.append( '-p' ) + args.append( prefix ) + if options.svidx_dir and command in ['sv']: + if not os.path.isdir(options.svidx_dir): + os.makedirs(options.svidx_dir) + ref_prefix = os.path.join(options.svidx_dir,prefix) + copy_ref = True + else: + ref_prefix = os.path.join(os.getcwd(),prefix) + + if command in ['sv','assemble']: + ref_fasta = ref_prefix + '.fa' + ref_index = ref_fasta + '.fai' + if options.fasta: + make_ref( options.fasta, ref_fasta, copy=copy_ref ) + else: + stop_err('Reference fasta file is required',3) + # if reference fasta index is not supllied, generate it + if options.fai: + make_ref( options.fai, ref_index, copy=copy_ref ) + elif os.path.exists(options.fasta + '.fai'): + make_ref( options.fasta + '.fai', ref_index, copy=copy_ref ) + else: + # generate fasta fai index + cmd1 = 'samtools faidx %s' % (ref_fasta ) + try: + tmp_name = tempfile.NamedTemporaryFile(prefix='fai_', suffix='.err').name + tmp_stderr = open( tmp_name, 'wb' ) + proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + if returncode != 0: + stderr = '' + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + raise Exception, stderr + except Exception, e: + stop_err( 'Error indexing reference sequence fasta file. ' + str( e ),5 ) + args.append( '-f' ) + args.append( ref_fasta ) + + if command in ['sv']: + if not options.bam: + stop_err('BAM alignment file is required',2) + + # if reference 2bit is not supplied, generate it + twobit = ref_prefix + '.2bit' + if options.twobit: + make_ref( options.twobit, twobit, copy=copy_ref ) + else: + # generate 2bit index + cmd1 = 'faToTwoBit %s %s' % (ref_fasta, twobit ) + try: + tmp_name = tempfile.NamedTemporaryFile(prefix='twobit_', suffix='.err').name + tmp_stderr = open( tmp_name, 'wb' ) + proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + if returncode != 0: + stderr = '' + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + raise Exception, stderr + except Exception, e: + stop_err( 'Error indexing reference sequence fasta file. ' + str( e ),6 ) + args.append( '-t' ) + args.append( twobit ) + + # if bwa index is not supplied, generate it + bwa_index = ref_fasta + if options.bwa_index: + if copy_ref: + shutil.copytree(os.path.dirname(bwa_index),options.svidx_dir) + bwa_index = options.bwa_index + else: + ONE_GB = 2**30 + if os.stat( ref_fasta ).st_size <= ONE_GB: #use 1 GB as cut off for memory vs. max of 2gb database size; this is somewhat arbitrary + index_algorithm = 'is' + else: + index_algorithm = 'bwtsw' + cmd1 = 'bwa index -a %s %s' % ( index_algorithm, ref_fasta ) + try: + tmp_name = tempfile.NamedTemporaryFile(prefix='bwa_index_', suffix='.err').name + tmp_stderr = open( tmp_name, 'wb' ) + proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + if returncode != 0: + stderr = '' + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + raise Exception, stderr + bwa_index = ref_fasta + except Exception, e: + # clean up temp dirs + stop_err( 'Error creating bwa index. ' + str( e ),7 ) + args.append( '-i' ) + args.append( bwa_index ) + + bam_file = os.path.join(os.getcwd(),prefix + '.bam') + bai_file = bam_file + '.bai' + if options.bam: + os.symlink( options.bam, bam_file ) + else: + stop_err('BAM alignment file is required',2) + # if bam index is not supplied, generate it + if options.bai: + os.symlink( options.bai, bai_file ) + elif os.path.exists(options.bam + '.bai'): + os.symlink( options.bam + '.bai', bai_file ) + else: + # generate bam index + cmd1 = 'samtools index %s %s' % (bam_file, bai_file ) + try: + tmp_name = tempfile.NamedTemporaryFile(prefix='bai_', suffix='.err').name + tmp_stderr = open( tmp_name, 'wb' ) + proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + if returncode != 0: + stderr = '' + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + raise Exception, stderr + except Exception, e: + stop_err( 'Error indexing BAM alignment file. ' + str( e ),4) + args.append( '-b' ) + args.append( bam_file ) + + if command in ['assemble']: + if not options.sdi or not options.output: + stop_err('SHEAR .sdi file and output file are required for assemble',2) + args.append( '-s' ) + args.append( options.sdi ) + args.append( '-o' ) + args.append( options.output ) + + # Run SHEAR command + print >> sys.stdout, "%s" % ' '.join(args) + try: + proc = subprocess.Popen( args=args, shell=False ) + returncode = proc.wait() + if returncode != 0: + stop_err( 'Error running SHEAR ' + command, returncode ) + if command in ['sv']: + report_path = os.path.join(os.getcwd(),prefix + '.report') + if os.path.exists(report_path): + if options.report: + shutil.copy(report_path,options.report) + else: + raise Exception, 'no report file' + sdi_path = os.path.join(os.getcwd(),prefix + '.sdi') + if os.path.exists(sdi_path): + if options.sdi: + shutil.copy(sdi_path,options.sdi) + else: + raise Exception, 'no sdi file' + except Exception, e: + # clean up temp dirs + stop_err( 'Error running SHEAR %s %s' % (command,str(e)),9 ) + +if __name__=="__main__": __main__() +
--- a/tool_dependencies.xml Mon Jul 08 01:10:38 2013 -0500 +++ b/tool_dependencies.xml Tue Jul 16 12:38:48 2013 -0500 @@ -1,6 +1,5 @@ <?xml version="1.0"?> <tool_dependency> -http://testtoolshed.g2.bx.psu.edu/ <package name="samtools" version="0.1.18"> <repository changeset_revision="5f7ec5048224" name="package_samtools_0_1_18" owner="devteam" toolshed="http://testtoolshed.g2.bx.psu.edu" /> </package> @@ -12,6 +11,11 @@ </package> <package name="crest" version="1.0.1"> <repository changeset_revision="89a9d6ce7f96" name="package_crest_1_0_1" owner="jjohnson" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + <readme> +CREST requires: + Bioperl package Bio::DB:Sam + cap3 which only has binary executable downloads from: http://seq.cs.iastate.edu/cap3.html + </readme> </package> <package name="shear" version="0.1.2"> <repository changeset_revision="c2ab94f12a7a" name="package_shear_0_1_2" owner="jjohnson" toolshed="http://testtoolshed.g2.bx.psu.edu" />