Mercurial > repos > bgruening > bismark
diff bismark_wrapper.py @ 18:862fb59a9a25 draft
Uploaded
author | bgruening |
---|---|
date | Mon, 14 Apr 2014 16:42:38 -0400 |
parents | 73508c5b4273 |
children | 30caca800c9b |
line wrap: on
line diff
--- a/bismark_wrapper.py Sun Feb 24 14:49:36 2013 -0500 +++ b/bismark_wrapper.py Mon Apr 14 16:42:38 2014 -0400 @@ -1,6 +1,13 @@ #!/usr/bin/env python -import argparse, os, shutil, subprocess, sys, tempfile, fileinput +import argparse +import os +import shutil +import subprocess +import sys +import shlex +import tempfile +import fileinput import fileinput from glob import glob @@ -9,6 +16,8 @@ sys.exit() def __main__(): + + print 'tempfile_location',tempfile.gettempdir() #Parse Command Line parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.') parser.add_argument( '-p', '--num-threads', dest='num_threads', @@ -22,6 +31,8 @@ parser.add_argument( '--own-file', dest='own_file', help='' ) parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) parser.add_argument( '-O', '--output', dest='output' ) + + parser.add_argument( '--output-report-file', dest='output_report_file' ) parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" ) @@ -32,6 +43,7 @@ help='The forward reads file in Sanger FASTQ or FASTA format.' ) parser.add_argument( '-2', '--mate2', dest='mate2', help='The reverse reads file in Sanger FASTQ or FASTA format.' ) + parser.add_argument( '--sort-bam', dest='sort_bam', action="store_true" ) parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads', help='Additional output file with unmapped reads (single-end).' ) @@ -39,14 +51,16 @@ help='File name for unmapped reads (left, paired-end).' ) parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r', help='File name for unmapped reads (right, paired-end).' ) - - + + parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads', help='Additional output file with suppressed reads (single-end).' ) parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l', help='File name for suppressed reads (left, paired-end).' ) parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r', help='File name for suppressed reads (right, paired-end).' ) + parser.add_argument( '--stdout', dest='output_stdout', + help='File name for the standard output of bismark.' ) parser.add_argument( '--single-paired', dest='single_paired', @@ -93,7 +107,7 @@ parser.add_argument( '--chunkmbs', type=int, default=512 ) args = parser.parse_args() - + # Create bismark index if necessary. index_dir = "" if args.own_file: @@ -119,8 +133,12 @@ else: cmd_index = 'bismark_genome_preparation %s ' % ( tmp_index_dir ) if args.bismark_path: - # add the path to the bismark perl scripts, that is needed for galaxy - cmd_index = os.path.join(args.bismark_path, cmd_index) + if os.path.exists(args.bismark_path): + # add the path to the bismark perl scripts, that is needed for galaxy + cmd_index = os.path.join(args.bismark_path, cmd_index) + else: + # assume the same directory as that script + cmd_index = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd_index) try: tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name tmp_stderr = open( tmp, 'wb' ) @@ -147,15 +165,33 @@ stop_err( 'Error indexing reference sequence\n' + str( e ) ) index_dir = tmp_index_dir else: - index_dir = args.index_path + # bowtie path is the path to the index directory and the first path of the index file name + index_dir = os.path.dirname( args.index_path ) # Build bismark command + + """ + Bismark requires a large amount of temporary disc space. If that is not available, for example on a cluster you can hardcode the + TMP to some larger space. It's not recommended but it works. + """ + #tmp_bismark_dir = tempfile.mkdtemp( dir='/data/0/galaxy_db/tmp/' ) tmp_bismark_dir = tempfile.mkdtemp() output_dir = os.path.join( tmp_bismark_dir, 'results') - cmd = 'bismark %(args)s --temp_dir %(tmp_bismark_dir)s -o %(output_dir)s --quiet %(genome_folder)s %(reads)s' + cmd = 'bismark %(args)s --bam --temp_dir %(tmp_bismark_dir)s --gzip -o %(output_dir)s --quiet %(genome_folder)s %(reads)s' + + if args.fasta: + # the query input files (specified as mate1,mate2 or singles) are FastA + cmd = '%s %s' % (cmd, '--fasta') + elif args.fastq: + cmd = '%s %s' % (cmd, '--fastq') + if args.bismark_path: # add the path to the bismark perl scripts, that is needed for galaxy - cmd = os.path.join(args.bismark_path, cmd) + if os.path.exists(args.bismark_path): + cmd = os.path.join(args.bismark_path, cmd) + else: + # assume the same directory as that script + cmd = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd) arguments = { 'genome_folder': index_dir, @@ -178,7 +214,7 @@ if not args.bowtie2: # use bowtie specific options - #additional_opts += ' --best ' # bug in bismark, --best is not available only --non-best, best is default + #additional_opts += ' --best ' # bug in bismark, --best is not available as option. Only --non-best, best-mode is activated by default if args.seed_mismatches: # --seedmms additional_opts += ' -n %s ' % args.seed_mismatches @@ -188,7 +224,7 @@ # alignment options if args.bowtie2: - additional_opts += ' -p %s --bowtie2 ' % args.num_threads + additional_opts += ' -p %s --bowtie2 ' % (int(args.num_threads/2)) #divides by 2 here since bismark will spawn 2 (original top and original bottom) jobs with -p threads each if args.seed_mismatches: additional_opts += ' -N %s ' % args.seed_mismatches if args.seed_len: @@ -220,9 +256,11 @@ arguments.update( {'args': additional_opts, 'reads': reads} ) - # Final command: + # Final bismark command: cmd = cmd % arguments - + print 'bismark_cmd:', cmd + #sys.stderr.write( cmd ) + #sys.exit(1) # Run try: tmp_out = tempfile.NamedTemporaryFile().name @@ -231,36 +269,37 @@ tmp_stderr = open( tmp_err, 'wb' ) proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) returncode = proc.wait() - tmp_stderr.close() - # get stderr, allowing for case where it's very large - tmp_stderr = open( tmp_err, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass + + if returncode != 0: + tmp_stdout.close() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp_err, 'rb' ) + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + + raise Exception, stderr tmp_stdout.close() tmp_stderr.close() - if returncode != 0: - raise Exception, stderr - + # TODO: look for errors in program output. except Exception, e: - stop_err( 'Error in bismark:\n' + str( e ) ) - + stop_err( 'Error in bismark:\n' + str( e ) ) # collect and copy output files - """ if args.output_report_file: output_report_file = open(args.output_report_file, 'w+') - for line in fileinput.input(glob( os.path.join( output_dir, '*.txt') )): + for line in fileinput.input(glob( os.path.join( output_dir, '*report.txt') )): output_report_file.write(line) output_report_file.close() - """ + if args.output_suppressed_reads: shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads ) @@ -276,7 +315,51 @@ if args.output_unmapped_reads_r: shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r ) - shutil.move( glob( os.path.join( output_dir, '*.sam'))[0] , args.output) + try: + """ + merge all bam files + """ + #tmp_out = tempfile.NamedTemporaryFile( dir=output_dir ).name + tmp_stdout = open( tmp_out, 'wab' ) + tmp_err = tempfile.NamedTemporaryFile( dir=output_dir ).name + tmp_stderr = open( tmp_err, 'wb' ) + + tmp_res = tempfile.NamedTemporaryFile( dir= output_dir).name + + bam_files = glob( os.path.join( output_dir, '*.bam') ) + if len( bam_files ) > 1: + cmd = 'samtools merge -@ %s -f %s %s ' % ( args.num_threads, tmp_res, ' '.join( bam_files ) ) + + proc = subprocess.Popen( args=shlex.split( cmd ), stdout=subprocess.PIPE ) + + returncode = proc.wait() + tmp_stdout.close() + tmp_stderr.close() + if returncode != 0: + raise Exception, open( tmp_stderr.name ).read() + else: + tmp_res = bam_files[0] + + bam_path = "%s" % tmp_res + + if os.path.exists( bam_path ): + if args.sort_bam: + cmd = 'samtools sort -@ %s %s %s' % (args.num_threads, bam_path, args.output) + else: + shutil.copy( bam_path, args.output ) + else: + stop_err( 'BAM file no found:\n' + str( bam_path ) ) + + + + # TODO: look for errors in program output. + except Exception, e: + stop_err( 'Error in merging bam files:\n' + str( e ) ) + + + if args.output_stdout: + # copy the temporary saved stdout from bismark + shutil.move( tmp_out, args.output_stdout ) # Clean up temp dirs if args.own_file: @@ -284,5 +367,7 @@ shutil.rmtree( tmp_index_dir ) if os.path.exists( tmp_bismark_dir ): shutil.rmtree( tmp_bismark_dir ) + if os.path.exists( output_dir ): + shutil.rmtree( output_dir ) if __name__=="__main__": __main__()