Mercurial > repos > jjohnson > shear
view shear_wrapper.py @ 8:8fac6c7d32fd
Copy reference fasta for shear sv to avoid GATK issue with symlinks
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Tue, 22 Oct 2013 09:44:32 -0500 |
parents | 0158f7356ffd |
children |
line wrap: on
line source
#!/usr/bin/env python import optparse, os, shutil, subprocess, sys, tempfile, os.path def stop_err( msg, code ): sys.stderr.write( '%s\n' % msg ) sys.exit(code) def __main__(): #Parse Command Line parser = optparse.OptionParser() parser.add_option( '-j', '--jar_path', dest='jar_path', help='Path to SHEAR.jar' ) parser.add_option( '-C', '--command', dest='command', help='SHEAR command to run' ) parser.add_option( '-p', '--prefix', dest='prefix', help='Prefix for all generated files. Required.' ) parser.add_option( '-b', '--bam', dest='bam', help='BAM alignment file containing the input sequences to the assembly.' ) parser.add_option( '-B', '--bai', dest='bai', help='BAM index file (.bai)' ) parser.add_option( '-f', '--fasta_reference', dest='fasta', help='The reference sequence fasta file' ) parser.add_option( '-F', '--fasta_index', dest='fai', help='The .fai index file for the reference sequence fasta' ) parser.add_option( '-t', '--twobit', dest='twobit', help='The .2bit encoding of the reference sequence fasta generated by faToTwoBit' ) parser.add_option( '-i', '--bwa_index', dest='bwa_index', help='The bwa index of the reference sequence' ) parser.add_option( '-R', '--report', dest='report', help='The SHEAR output report' ) parser.add_option( '-s', '--sdi', dest='sdi', help='The SHEAR sdi input from the SHEAR sv command' ) parser.add_option( '-o', '--output', dest='output', help='The SHEAR output assembly fasta file' ) parser.add_option( '-D', '--svidx_dir', dest='svidx_dir', help='The SHEAR output assembly fasta file' ) parser.add_option( '-S', '--sv-only', dest='sv_only', action="store_true", help='SV Only prediction mode.' ) parser.add_option( '-r', '--region', dest='region', help='Region of the input alignment to analyze' ) (options, args) = parser.parse_args() def make_ref(src, dest, copy=False): if copy: shutil.copy( src, dest ) else: os.symlink( src, dest ) # output version # of tool #try: #except: # sys.stdout.write( 'Could not determine BWA version\n' ) if not options.jar_path: stop_err('path to SHEAR.jar is required',1) if options.command: command = options.command else: stop_err('SHEAR command is required',1) args = [ 'java','-jar' ] args.append( options.jar_path ) args.append( command ) # Check required params for command buffsize = 1048576 copy_ref = True prefix = options.prefix if options.prefix and len(options.prefix) > 0 else 'ref' if command in ['sv']: args.append( '-p' ) args.append( prefix ) if options.sv_only: args.append('--sv-only') if options.region: args.append('-r') args.append(options.region) copy_ref = True # GATK has issue with symlinks for .fa and .fai if options.svidx_dir and command in ['sv']: if not os.path.isdir(options.svidx_dir): os.makedirs(options.svidx_dir) ref_prefix = os.path.join(options.svidx_dir,prefix) copy_ref = True else: ref_prefix = os.path.join(os.getcwd(),prefix) if command in ['sv','assemble']: ref_fasta = ref_prefix + '.fa' ref_index = ref_fasta + '.fai' if options.fasta: make_ref( options.fasta, ref_fasta, copy=copy_ref ) else: stop_err('Reference fasta file is required',3) # if reference fasta index is not supllied, generate it if options.fai: make_ref( options.fai, ref_index, copy=copy_ref ) elif os.path.exists(options.fasta + '.fai'): make_ref( options.fasta + '.fai', ref_index, copy=copy_ref ) else: # generate fasta fai index cmd1 = 'samtools faidx %s' % (ref_fasta ) try: tmp_name = tempfile.NamedTemporaryFile(prefix='fai_', suffix='.err').name tmp_stderr = open( tmp_name, 'wb' ) proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() ) returncode = proc.wait() tmp_stderr.close() if returncode != 0: stderr = '' # get stderr, allowing for case where it's very large tmp_stderr = open( tmp, 'rb' ) try: while True: stderr += tmp_stderr.read( buffsize ) if not stderr or len( stderr ) % buffsize != 0: break except OverflowError: pass raise Exception, stderr except Exception, e: stop_err( 'Error indexing reference sequence fasta file. ' + str( e ),5 ) args.append( '-f' ) args.append( ref_fasta ) if command in ['sv']: if not options.bam: stop_err('BAM alignment file is required',2) # if reference 2bit is not supplied, generate it twobit = ref_prefix + '.2bit' if options.twobit: make_ref( options.twobit, twobit, copy=copy_ref ) else: # generate 2bit index cmd1 = 'faToTwoBit %s %s' % (ref_fasta, twobit ) try: tmp_name = tempfile.NamedTemporaryFile(prefix='twobit_', suffix='.err').name tmp_stderr = open( tmp_name, 'wb' ) proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() ) returncode = proc.wait() tmp_stderr.close() if returncode != 0: stderr = '' # get stderr, allowing for case where it's very large tmp_stderr = open( tmp, 'rb' ) try: while True: stderr += tmp_stderr.read( buffsize ) if not stderr or len( stderr ) % buffsize != 0: break except OverflowError: pass raise Exception, stderr except Exception, e: stop_err( 'Error indexing reference sequence fasta file. ' + str( e ),6 ) args.append( '-t' ) args.append( twobit ) # if bwa index is not supplied, generate it bwa_index = ref_fasta if options.bwa_index: if copy_ref: shutil.copytree(os.path.dirname(bwa_index),options.svidx_dir) bwa_index = options.bwa_index else: ONE_GB = 2**30 if os.stat( ref_fasta ).st_size <= ONE_GB: #use 1 GB as cut off for memory vs. max of 2gb database size; this is somewhat arbitrary index_algorithm = 'is' else: index_algorithm = 'bwtsw' cmd1 = 'bwa index -a %s %s' % ( index_algorithm, ref_fasta ) try: tmp_name = tempfile.NamedTemporaryFile(prefix='bwa_index_', suffix='.err').name tmp_stderr = open( tmp_name, 'wb' ) proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() ) returncode = proc.wait() tmp_stderr.close() if returncode != 0: stderr = '' # get stderr, allowing for case where it's very large tmp_stderr = open( tmp, 'rb' ) try: while True: stderr += tmp_stderr.read( buffsize ) if not stderr or len( stderr ) % buffsize != 0: break except OverflowError: pass raise Exception, stderr bwa_index = ref_fasta except Exception, e: # clean up temp dirs stop_err( 'Error creating bwa index. ' + str( e ),7 ) args.append( '-i' ) args.append( bwa_index ) bam_file = os.path.join(os.getcwd(),prefix + '.bam') bai_file = bam_file + '.bai' if options.bam: os.symlink( options.bam, bam_file ) else: stop_err('BAM alignment file is required',2) # if bam index is not supplied, generate it if options.bai: os.symlink( options.bai, bai_file ) elif os.path.exists(options.bam + '.bai'): os.symlink( options.bam + '.bai', bai_file ) else: # generate bam index cmd1 = 'samtools index %s %s' % (bam_file, bai_file ) try: tmp_name = tempfile.NamedTemporaryFile(prefix='bai_', suffix='.err').name tmp_stderr = open( tmp_name, 'wb' ) proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() ) returncode = proc.wait() tmp_stderr.close() if returncode != 0: stderr = '' # get stderr, allowing for case where it's very large tmp_stderr = open( tmp, 'rb' ) try: while True: stderr += tmp_stderr.read( buffsize ) if not stderr or len( stderr ) % buffsize != 0: break except OverflowError: pass raise Exception, stderr except Exception, e: stop_err( 'Error indexing BAM alignment file. ' + str( e ),4) args.append( '-b' ) args.append( bam_file ) if command in ['assemble']: if not options.sdi or not options.output: stop_err('SHEAR .sdi file and output file are required for assemble',2) args.append( '-s' ) args.append( options.sdi ) args.append( '-o' ) args.append( options.output ) # Run SHEAR command print >> sys.stdout, "%s" % ' '.join(args) try: proc = subprocess.Popen( args=args, shell=False ) returncode = proc.wait() if returncode != 0: stop_err( 'Error running SHEAR ' + command, returncode ) except Exception, e: # clean up temp dirs stop_err( 'Error running SHEAR %s %s' % (command,str(e)),9 ) if __name__=="__main__": __main__()