comparison bismark_wrapper.py @ 18:862fb59a9a25 draft

Uploaded
author bgruening
date Mon, 14 Apr 2014 16:42:38 -0400
parents 73508c5b4273
children 30caca800c9b
comparison
equal deleted inserted replaced
17:73508c5b4273 18:862fb59a9a25
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import argparse, os, shutil, subprocess, sys, tempfile, fileinput 3 import argparse
4 import os
5 import shutil
6 import subprocess
7 import sys
8 import shlex
9 import tempfile
10 import fileinput
4 import fileinput 11 import fileinput
5 from glob import glob 12 from glob import glob
6 13
7 def stop_err( msg ): 14 def stop_err( msg ):
8 sys.stderr.write( "%s\n" % msg ) 15 sys.stderr.write( "%s\n" % msg )
9 sys.exit() 16 sys.exit()
10 17
11 def __main__(): 18 def __main__():
19
20 print 'tempfile_location',tempfile.gettempdir()
12 #Parse Command Line 21 #Parse Command Line
13 parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.') 22 parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.')
14 parser.add_argument( '-p', '--num-threads', dest='num_threads', 23 parser.add_argument( '-p', '--num-threads', dest='num_threads',
15 type=int, default=4, help='Use this many threads to align reads. The default is 4.' ) 24 type=int, default=4, help='Use this many threads to align reads. The default is 4.' )
16 25
20 29
21 # input options 30 # input options
22 parser.add_argument( '--own-file', dest='own_file', help='' ) 31 parser.add_argument( '--own-file', dest='own_file', help='' )
23 parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) 32 parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' )
24 parser.add_argument( '-O', '--output', dest='output' ) 33 parser.add_argument( '-O', '--output', dest='output' )
34
35
25 parser.add_argument( '--output-report-file', dest='output_report_file' ) 36 parser.add_argument( '--output-report-file', dest='output_report_file' )
26 parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" ) 37 parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" )
27 38
28 parser.add_argument( '--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', default=False) 39 parser.add_argument( '--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', default=False)
29 40
30 41
31 parser.add_argument( '-1', '--mate1', dest='mate1', 42 parser.add_argument( '-1', '--mate1', dest='mate1',
32 help='The forward reads file in Sanger FASTQ or FASTA format.' ) 43 help='The forward reads file in Sanger FASTQ or FASTA format.' )
33 parser.add_argument( '-2', '--mate2', dest='mate2', 44 parser.add_argument( '-2', '--mate2', dest='mate2',
34 help='The reverse reads file in Sanger FASTQ or FASTA format.' ) 45 help='The reverse reads file in Sanger FASTQ or FASTA format.' )
46 parser.add_argument( '--sort-bam', dest='sort_bam', action="store_true" )
35 47
36 parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads', 48 parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads',
37 help='Additional output file with unmapped reads (single-end).' ) 49 help='Additional output file with unmapped reads (single-end).' )
38 parser.add_argument( '--output-unmapped-reads-l', dest='output_unmapped_reads_l', 50 parser.add_argument( '--output-unmapped-reads-l', dest='output_unmapped_reads_l',
39 help='File name for unmapped reads (left, paired-end).' ) 51 help='File name for unmapped reads (left, paired-end).' )
40 parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r', 52 parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r',
41 help='File name for unmapped reads (right, paired-end).' ) 53 help='File name for unmapped reads (right, paired-end).' )
42 54
43 55
44 parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads', 56 parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads',
45 help='Additional output file with suppressed reads (single-end).' ) 57 help='Additional output file with suppressed reads (single-end).' )
46 parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l', 58 parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l',
47 help='File name for suppressed reads (left, paired-end).' ) 59 help='File name for suppressed reads (left, paired-end).' )
48 parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r', 60 parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r',
49 help='File name for suppressed reads (right, paired-end).' ) 61 help='File name for suppressed reads (right, paired-end).' )
62 parser.add_argument( '--stdout', dest='output_stdout',
63 help='File name for the standard output of bismark.' )
50 64
51 65
52 parser.add_argument( '--single-paired', dest='single_paired', 66 parser.add_argument( '--single-paired', dest='single_paired',
53 help='The single-end reads file in Sanger FASTQ or FASTA format.' ) 67 help='The single-end reads file in Sanger FASTQ or FASTA format.' )
54 68
91 this parameter up to dedicate more memory to the descriptors. Default: 512. 105 this parameter up to dedicate more memory to the descriptors. Default: 512.
92 """ 106 """
93 parser.add_argument( '--chunkmbs', type=int, default=512 ) 107 parser.add_argument( '--chunkmbs', type=int, default=512 )
94 108
95 args = parser.parse_args() 109 args = parser.parse_args()
96 110
97 # Create bismark index if necessary. 111 # Create bismark index if necessary.
98 index_dir = "" 112 index_dir = ""
99 if args.own_file: 113 if args.own_file:
100 """ 114 """
101 Create a temporary index with the offered files from the user. 115 Create a temporary index with the offered files from the user.
117 if args.bowtie2: 131 if args.bowtie2:
118 cmd_index = 'bismark_genome_preparation --bowtie2 %s ' % ( tmp_index_dir ) 132 cmd_index = 'bismark_genome_preparation --bowtie2 %s ' % ( tmp_index_dir )
119 else: 133 else:
120 cmd_index = 'bismark_genome_preparation %s ' % ( tmp_index_dir ) 134 cmd_index = 'bismark_genome_preparation %s ' % ( tmp_index_dir )
121 if args.bismark_path: 135 if args.bismark_path:
122 # add the path to the bismark perl scripts, that is needed for galaxy 136 if os.path.exists(args.bismark_path):
123 cmd_index = os.path.join(args.bismark_path, cmd_index) 137 # add the path to the bismark perl scripts, that is needed for galaxy
138 cmd_index = os.path.join(args.bismark_path, cmd_index)
139 else:
140 # assume the same directory as that script
141 cmd_index = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd_index)
124 try: 142 try:
125 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name 143 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
126 tmp_stderr = open( tmp, 'wb' ) 144 tmp_stderr = open( tmp, 'wb' )
127 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stdout=open(os.devnull, 'wb'), stderr=tmp_stderr.fileno() ) 145 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stdout=open(os.devnull, 'wb'), stderr=tmp_stderr.fileno() )
128 returncode = proc.wait() 146 returncode = proc.wait()
145 if os.path.exists( tmp_index_dir ): 163 if os.path.exists( tmp_index_dir ):
146 shutil.rmtree( tmp_index_dir ) 164 shutil.rmtree( tmp_index_dir )
147 stop_err( 'Error indexing reference sequence\n' + str( e ) ) 165 stop_err( 'Error indexing reference sequence\n' + str( e ) )
148 index_dir = tmp_index_dir 166 index_dir = tmp_index_dir
149 else: 167 else:
150 index_dir = args.index_path 168 # bowtie path is the path to the index directory and the first path of the index file name
169 index_dir = os.path.dirname( args.index_path )
151 170
152 # Build bismark command 171 # Build bismark command
172
173 """
174 Bismark requires a large amount of temporary disc space. If that is not available, for example on a cluster you can hardcode the
175 TMP to some larger space. It's not recommended but it works.
176 """
177 #tmp_bismark_dir = tempfile.mkdtemp( dir='/data/0/galaxy_db/tmp/' )
153 tmp_bismark_dir = tempfile.mkdtemp() 178 tmp_bismark_dir = tempfile.mkdtemp()
154 output_dir = os.path.join( tmp_bismark_dir, 'results') 179 output_dir = os.path.join( tmp_bismark_dir, 'results')
155 cmd = 'bismark %(args)s --temp_dir %(tmp_bismark_dir)s -o %(output_dir)s --quiet %(genome_folder)s %(reads)s' 180 cmd = 'bismark %(args)s --bam --temp_dir %(tmp_bismark_dir)s --gzip -o %(output_dir)s --quiet %(genome_folder)s %(reads)s'
181
182 if args.fasta:
183 # the query input files (specified as mate1,mate2 or singles) are FastA
184 cmd = '%s %s' % (cmd, '--fasta')
185 elif args.fastq:
186 cmd = '%s %s' % (cmd, '--fastq')
187
156 if args.bismark_path: 188 if args.bismark_path:
157 # add the path to the bismark perl scripts, that is needed for galaxy 189 # add the path to the bismark perl scripts, that is needed for galaxy
158 cmd = os.path.join(args.bismark_path, cmd) 190 if os.path.exists(args.bismark_path):
191 cmd = os.path.join(args.bismark_path, cmd)
192 else:
193 # assume the same directory as that script
194 cmd = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd)
159 195
160 arguments = { 196 arguments = {
161 'genome_folder': index_dir, 197 'genome_folder': index_dir,
162 'args': '', 198 'args': '',
163 'tmp_bismark_dir': tmp_bismark_dir, 199 'tmp_bismark_dir': tmp_bismark_dir,
176 reads = ' %s ' % ( args.single_paired ) 212 reads = ' %s ' % ( args.single_paired )
177 213
178 214
179 if not args.bowtie2: 215 if not args.bowtie2:
180 # use bowtie specific options 216 # use bowtie specific options
181 #additional_opts += ' --best ' # bug in bismark, --best is not available only --non-best, best is default 217 #additional_opts += ' --best ' # bug in bismark, --best is not available as option. Only --non-best, best-mode is activated by default
182 if args.seed_mismatches: 218 if args.seed_mismatches:
183 # --seedmms 219 # --seedmms
184 additional_opts += ' -n %s ' % args.seed_mismatches 220 additional_opts += ' -n %s ' % args.seed_mismatches
185 if args.seed_len: 221 if args.seed_len:
186 # --seedlen 222 # --seedlen
187 additional_opts += ' -l %s ' % args.seed_len 223 additional_opts += ' -l %s ' % args.seed_len
188 224
189 # alignment options 225 # alignment options
190 if args.bowtie2: 226 if args.bowtie2:
191 additional_opts += ' -p %s --bowtie2 ' % args.num_threads 227 additional_opts += ' -p %s --bowtie2 ' % (int(args.num_threads/2)) #divides by 2 here since bismark will spawn 2 (original top and original bottom) jobs with -p threads each
192 if args.seed_mismatches: 228 if args.seed_mismatches:
193 additional_opts += ' -N %s ' % args.seed_mismatches 229 additional_opts += ' -N %s ' % args.seed_mismatches
194 if args.seed_len: 230 if args.seed_len:
195 additional_opts += ' -L %s ' % args.seed_len 231 additional_opts += ' -L %s ' % args.seed_len
196 if args.seed_extention_attempts: 232 if args.seed_extention_attempts:
218 if args.output_suppressed_reads or ( args.output_suppressed_reads_l and args.output_suppressed_reads_r): 254 if args.output_suppressed_reads or ( args.output_suppressed_reads_l and args.output_suppressed_reads_r):
219 additional_opts += ' --ambiguous ' 255 additional_opts += ' --ambiguous '
220 256
221 arguments.update( {'args': additional_opts, 'reads': reads} ) 257 arguments.update( {'args': additional_opts, 'reads': reads} )
222 258
223 # Final command: 259 # Final bismark command:
224 cmd = cmd % arguments 260 cmd = cmd % arguments
225 261 print 'bismark_cmd:', cmd
262 #sys.stderr.write( cmd )
263 #sys.exit(1)
226 # Run 264 # Run
227 try: 265 try:
228 tmp_out = tempfile.NamedTemporaryFile().name 266 tmp_out = tempfile.NamedTemporaryFile().name
229 tmp_stdout = open( tmp_out, 'wb' ) 267 tmp_stdout = open( tmp_out, 'wb' )
230 tmp_err = tempfile.NamedTemporaryFile().name 268 tmp_err = tempfile.NamedTemporaryFile().name
231 tmp_stderr = open( tmp_err, 'wb' ) 269 tmp_stderr = open( tmp_err, 'wb' )
232 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) 270 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
233 returncode = proc.wait() 271 returncode = proc.wait()
234 tmp_stderr.close() 272
235 # get stderr, allowing for case where it's very large 273 if returncode != 0:
236 tmp_stderr = open( tmp_err, 'rb' ) 274 tmp_stdout.close()
237 stderr = '' 275 tmp_stderr.close()
238 buffsize = 1048576 276 # get stderr, allowing for case where it's very large
239 try: 277 tmp_stderr = open( tmp_err, 'rb' )
240 while True: 278 stderr = ''
241 stderr += tmp_stderr.read( buffsize ) 279 buffsize = 1048576
242 if not stderr or len( stderr ) % buffsize != 0: 280 try:
243 break 281 while True:
244 except OverflowError: 282 stderr += tmp_stderr.read( buffsize )
245 pass 283 if not stderr or len( stderr ) % buffsize != 0:
284 break
285 except OverflowError:
286 pass
287
288 raise Exception, stderr
246 tmp_stdout.close() 289 tmp_stdout.close()
247 tmp_stderr.close() 290 tmp_stderr.close()
248 if returncode != 0: 291
249 raise Exception, stderr
250
251 # TODO: look for errors in program output. 292 # TODO: look for errors in program output.
252 except Exception, e: 293 except Exception, e:
253 stop_err( 'Error in bismark:\n' + str( e ) ) 294 stop_err( 'Error in bismark:\n' + str( e ) )
254
255 295
256 # collect and copy output files 296 # collect and copy output files
257 """
258 if args.output_report_file: 297 if args.output_report_file:
259 output_report_file = open(args.output_report_file, 'w+') 298 output_report_file = open(args.output_report_file, 'w+')
260 for line in fileinput.input(glob( os.path.join( output_dir, '*.txt') )): 299 for line in fileinput.input(glob( os.path.join( output_dir, '*report.txt') )):
261 output_report_file.write(line) 300 output_report_file.write(line)
262 output_report_file.close() 301 output_report_file.close()
263 """ 302
264 303
265 if args.output_suppressed_reads: 304 if args.output_suppressed_reads:
266 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads ) 305 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads )
267 if args.output_suppressed_reads_l: 306 if args.output_suppressed_reads_l:
268 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l ) 307 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l )
274 if args.output_unmapped_reads_l: 313 if args.output_unmapped_reads_l:
275 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l ) 314 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l )
276 if args.output_unmapped_reads_r: 315 if args.output_unmapped_reads_r:
277 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r ) 316 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r )
278 317
279 shutil.move( glob( os.path.join( output_dir, '*.sam'))[0] , args.output) 318 try:
319 """
320 merge all bam files
321 """
322 #tmp_out = tempfile.NamedTemporaryFile( dir=output_dir ).name
323 tmp_stdout = open( tmp_out, 'wab' )
324 tmp_err = tempfile.NamedTemporaryFile( dir=output_dir ).name
325 tmp_stderr = open( tmp_err, 'wb' )
326
327 tmp_res = tempfile.NamedTemporaryFile( dir= output_dir).name
328
329 bam_files = glob( os.path.join( output_dir, '*.bam') )
330 if len( bam_files ) > 1:
331 cmd = 'samtools merge -@ %s -f %s %s ' % ( args.num_threads, tmp_res, ' '.join( bam_files ) )
332
333 proc = subprocess.Popen( args=shlex.split( cmd ), stdout=subprocess.PIPE )
334
335 returncode = proc.wait()
336 tmp_stdout.close()
337 tmp_stderr.close()
338 if returncode != 0:
339 raise Exception, open( tmp_stderr.name ).read()
340 else:
341 tmp_res = bam_files[0]
342
343 bam_path = "%s" % tmp_res
344
345 if os.path.exists( bam_path ):
346 if args.sort_bam:
347 cmd = 'samtools sort -@ %s %s %s' % (args.num_threads, bam_path, args.output)
348 else:
349 shutil.copy( bam_path, args.output )
350 else:
351 stop_err( 'BAM file no found:\n' + str( bam_path ) )
352
353
354
355 # TODO: look for errors in program output.
356 except Exception, e:
357 stop_err( 'Error in merging bam files:\n' + str( e ) )
358
359
360 if args.output_stdout:
361 # copy the temporary saved stdout from bismark
362 shutil.move( tmp_out, args.output_stdout )
280 363
281 # Clean up temp dirs 364 # Clean up temp dirs
282 if args.own_file: 365 if args.own_file:
283 if os.path.exists( tmp_index_dir ): 366 if os.path.exists( tmp_index_dir ):
284 shutil.rmtree( tmp_index_dir ) 367 shutil.rmtree( tmp_index_dir )
285 if os.path.exists( tmp_bismark_dir ): 368 if os.path.exists( tmp_bismark_dir ):
286 shutil.rmtree( tmp_bismark_dir ) 369 shutil.rmtree( tmp_bismark_dir )
370 if os.path.exists( output_dir ):
371 shutil.rmtree( output_dir )
287 372
288 if __name__=="__main__": __main__() 373 if __name__=="__main__": __main__()