Mercurial > repos > bgruening > bismark
comparison bismark_wrapper.py @ 18:862fb59a9a25 draft
Uploaded
author | bgruening |
---|---|
date | Mon, 14 Apr 2014 16:42:38 -0400 |
parents | 73508c5b4273 |
children | 30caca800c9b |
comparison
equal
deleted
inserted
replaced
17:73508c5b4273 | 18:862fb59a9a25 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 import argparse, os, shutil, subprocess, sys, tempfile, fileinput | 3 import argparse |
4 import os | |
5 import shutil | |
6 import subprocess | |
7 import sys | |
8 import shlex | |
9 import tempfile | |
10 import fileinput | |
4 import fileinput | 11 import fileinput |
5 from glob import glob | 12 from glob import glob |
6 | 13 |
7 def stop_err( msg ): | 14 def stop_err( msg ): |
8 sys.stderr.write( "%s\n" % msg ) | 15 sys.stderr.write( "%s\n" % msg ) |
9 sys.exit() | 16 sys.exit() |
10 | 17 |
11 def __main__(): | 18 def __main__(): |
19 | |
20 print 'tempfile_location',tempfile.gettempdir() | |
12 #Parse Command Line | 21 #Parse Command Line |
13 parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.') | 22 parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.') |
14 parser.add_argument( '-p', '--num-threads', dest='num_threads', | 23 parser.add_argument( '-p', '--num-threads', dest='num_threads', |
15 type=int, default=4, help='Use this many threads to align reads. The default is 4.' ) | 24 type=int, default=4, help='Use this many threads to align reads. The default is 4.' ) |
16 | 25 |
20 | 29 |
21 # input options | 30 # input options |
22 parser.add_argument( '--own-file', dest='own_file', help='' ) | 31 parser.add_argument( '--own-file', dest='own_file', help='' ) |
23 parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) | 32 parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) |
24 parser.add_argument( '-O', '--output', dest='output' ) | 33 parser.add_argument( '-O', '--output', dest='output' ) |
34 | |
35 | |
25 parser.add_argument( '--output-report-file', dest='output_report_file' ) | 36 parser.add_argument( '--output-report-file', dest='output_report_file' ) |
26 parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" ) | 37 parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" ) |
27 | 38 |
28 parser.add_argument( '--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', default=False) | 39 parser.add_argument( '--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', default=False) |
29 | 40 |
30 | 41 |
31 parser.add_argument( '-1', '--mate1', dest='mate1', | 42 parser.add_argument( '-1', '--mate1', dest='mate1', |
32 help='The forward reads file in Sanger FASTQ or FASTA format.' ) | 43 help='The forward reads file in Sanger FASTQ or FASTA format.' ) |
33 parser.add_argument( '-2', '--mate2', dest='mate2', | 44 parser.add_argument( '-2', '--mate2', dest='mate2', |
34 help='The reverse reads file in Sanger FASTQ or FASTA format.' ) | 45 help='The reverse reads file in Sanger FASTQ or FASTA format.' ) |
46 parser.add_argument( '--sort-bam', dest='sort_bam', action="store_true" ) | |
35 | 47 |
36 parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads', | 48 parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads', |
37 help='Additional output file with unmapped reads (single-end).' ) | 49 help='Additional output file with unmapped reads (single-end).' ) |
38 parser.add_argument( '--output-unmapped-reads-l', dest='output_unmapped_reads_l', | 50 parser.add_argument( '--output-unmapped-reads-l', dest='output_unmapped_reads_l', |
39 help='File name for unmapped reads (left, paired-end).' ) | 51 help='File name for unmapped reads (left, paired-end).' ) |
40 parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r', | 52 parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r', |
41 help='File name for unmapped reads (right, paired-end).' ) | 53 help='File name for unmapped reads (right, paired-end).' ) |
42 | 54 |
43 | 55 |
44 parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads', | 56 parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads', |
45 help='Additional output file with suppressed reads (single-end).' ) | 57 help='Additional output file with suppressed reads (single-end).' ) |
46 parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l', | 58 parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l', |
47 help='File name for suppressed reads (left, paired-end).' ) | 59 help='File name for suppressed reads (left, paired-end).' ) |
48 parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r', | 60 parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r', |
49 help='File name for suppressed reads (right, paired-end).' ) | 61 help='File name for suppressed reads (right, paired-end).' ) |
62 parser.add_argument( '--stdout', dest='output_stdout', | |
63 help='File name for the standard output of bismark.' ) | |
50 | 64 |
51 | 65 |
52 parser.add_argument( '--single-paired', dest='single_paired', | 66 parser.add_argument( '--single-paired', dest='single_paired', |
53 help='The single-end reads file in Sanger FASTQ or FASTA format.' ) | 67 help='The single-end reads file in Sanger FASTQ or FASTA format.' ) |
54 | 68 |
91 this parameter up to dedicate more memory to the descriptors. Default: 512. | 105 this parameter up to dedicate more memory to the descriptors. Default: 512. |
92 """ | 106 """ |
93 parser.add_argument( '--chunkmbs', type=int, default=512 ) | 107 parser.add_argument( '--chunkmbs', type=int, default=512 ) |
94 | 108 |
95 args = parser.parse_args() | 109 args = parser.parse_args() |
96 | 110 |
97 # Create bismark index if necessary. | 111 # Create bismark index if necessary. |
98 index_dir = "" | 112 index_dir = "" |
99 if args.own_file: | 113 if args.own_file: |
100 """ | 114 """ |
101 Create a temporary index with the offered files from the user. | 115 Create a temporary index with the offered files from the user. |
117 if args.bowtie2: | 131 if args.bowtie2: |
118 cmd_index = 'bismark_genome_preparation --bowtie2 %s ' % ( tmp_index_dir ) | 132 cmd_index = 'bismark_genome_preparation --bowtie2 %s ' % ( tmp_index_dir ) |
119 else: | 133 else: |
120 cmd_index = 'bismark_genome_preparation %s ' % ( tmp_index_dir ) | 134 cmd_index = 'bismark_genome_preparation %s ' % ( tmp_index_dir ) |
121 if args.bismark_path: | 135 if args.bismark_path: |
122 # add the path to the bismark perl scripts, that is needed for galaxy | 136 if os.path.exists(args.bismark_path): |
123 cmd_index = os.path.join(args.bismark_path, cmd_index) | 137 # add the path to the bismark perl scripts, that is needed for galaxy |
138 cmd_index = os.path.join(args.bismark_path, cmd_index) | |
139 else: | |
140 # assume the same directory as that script | |
141 cmd_index = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd_index) | |
124 try: | 142 try: |
125 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name | 143 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name |
126 tmp_stderr = open( tmp, 'wb' ) | 144 tmp_stderr = open( tmp, 'wb' ) |
127 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stdout=open(os.devnull, 'wb'), stderr=tmp_stderr.fileno() ) | 145 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stdout=open(os.devnull, 'wb'), stderr=tmp_stderr.fileno() ) |
128 returncode = proc.wait() | 146 returncode = proc.wait() |
145 if os.path.exists( tmp_index_dir ): | 163 if os.path.exists( tmp_index_dir ): |
146 shutil.rmtree( tmp_index_dir ) | 164 shutil.rmtree( tmp_index_dir ) |
147 stop_err( 'Error indexing reference sequence\n' + str( e ) ) | 165 stop_err( 'Error indexing reference sequence\n' + str( e ) ) |
148 index_dir = tmp_index_dir | 166 index_dir = tmp_index_dir |
149 else: | 167 else: |
150 index_dir = args.index_path | 168 # bowtie path is the path to the index directory and the first path of the index file name |
169 index_dir = os.path.dirname( args.index_path ) | |
151 | 170 |
152 # Build bismark command | 171 # Build bismark command |
172 | |
173 """ | |
174 Bismark requires a large amount of temporary disc space. If that is not available, for example on a cluster you can hardcode the | |
175 TMP to some larger space. It's not recommended but it works. | |
176 """ | |
177 #tmp_bismark_dir = tempfile.mkdtemp( dir='/data/0/galaxy_db/tmp/' ) | |
153 tmp_bismark_dir = tempfile.mkdtemp() | 178 tmp_bismark_dir = tempfile.mkdtemp() |
154 output_dir = os.path.join( tmp_bismark_dir, 'results') | 179 output_dir = os.path.join( tmp_bismark_dir, 'results') |
155 cmd = 'bismark %(args)s --temp_dir %(tmp_bismark_dir)s -o %(output_dir)s --quiet %(genome_folder)s %(reads)s' | 180 cmd = 'bismark %(args)s --bam --temp_dir %(tmp_bismark_dir)s --gzip -o %(output_dir)s --quiet %(genome_folder)s %(reads)s' |
181 | |
182 if args.fasta: | |
183 # the query input files (specified as mate1,mate2 or singles) are FastA | |
184 cmd = '%s %s' % (cmd, '--fasta') | |
185 elif args.fastq: | |
186 cmd = '%s %s' % (cmd, '--fastq') | |
187 | |
156 if args.bismark_path: | 188 if args.bismark_path: |
157 # add the path to the bismark perl scripts, that is needed for galaxy | 189 # add the path to the bismark perl scripts, that is needed for galaxy |
158 cmd = os.path.join(args.bismark_path, cmd) | 190 if os.path.exists(args.bismark_path): |
191 cmd = os.path.join(args.bismark_path, cmd) | |
192 else: | |
193 # assume the same directory as that script | |
194 cmd = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd) | |
159 | 195 |
160 arguments = { | 196 arguments = { |
161 'genome_folder': index_dir, | 197 'genome_folder': index_dir, |
162 'args': '', | 198 'args': '', |
163 'tmp_bismark_dir': tmp_bismark_dir, | 199 'tmp_bismark_dir': tmp_bismark_dir, |
176 reads = ' %s ' % ( args.single_paired ) | 212 reads = ' %s ' % ( args.single_paired ) |
177 | 213 |
178 | 214 |
179 if not args.bowtie2: | 215 if not args.bowtie2: |
180 # use bowtie specific options | 216 # use bowtie specific options |
181 #additional_opts += ' --best ' # bug in bismark, --best is not available only --non-best, best is default | 217 #additional_opts += ' --best ' # bug in bismark, --best is not available as option. Only --non-best, best-mode is activated by default |
182 if args.seed_mismatches: | 218 if args.seed_mismatches: |
183 # --seedmms | 219 # --seedmms |
184 additional_opts += ' -n %s ' % args.seed_mismatches | 220 additional_opts += ' -n %s ' % args.seed_mismatches |
185 if args.seed_len: | 221 if args.seed_len: |
186 # --seedlen | 222 # --seedlen |
187 additional_opts += ' -l %s ' % args.seed_len | 223 additional_opts += ' -l %s ' % args.seed_len |
188 | 224 |
189 # alignment options | 225 # alignment options |
190 if args.bowtie2: | 226 if args.bowtie2: |
191 additional_opts += ' -p %s --bowtie2 ' % args.num_threads | 227 additional_opts += ' -p %s --bowtie2 ' % (int(args.num_threads/2)) #divides by 2 here since bismark will spawn 2 (original top and original bottom) jobs with -p threads each |
192 if args.seed_mismatches: | 228 if args.seed_mismatches: |
193 additional_opts += ' -N %s ' % args.seed_mismatches | 229 additional_opts += ' -N %s ' % args.seed_mismatches |
194 if args.seed_len: | 230 if args.seed_len: |
195 additional_opts += ' -L %s ' % args.seed_len | 231 additional_opts += ' -L %s ' % args.seed_len |
196 if args.seed_extention_attempts: | 232 if args.seed_extention_attempts: |
218 if args.output_suppressed_reads or ( args.output_suppressed_reads_l and args.output_suppressed_reads_r): | 254 if args.output_suppressed_reads or ( args.output_suppressed_reads_l and args.output_suppressed_reads_r): |
219 additional_opts += ' --ambiguous ' | 255 additional_opts += ' --ambiguous ' |
220 | 256 |
221 arguments.update( {'args': additional_opts, 'reads': reads} ) | 257 arguments.update( {'args': additional_opts, 'reads': reads} ) |
222 | 258 |
223 # Final command: | 259 # Final bismark command: |
224 cmd = cmd % arguments | 260 cmd = cmd % arguments |
225 | 261 print 'bismark_cmd:', cmd |
262 #sys.stderr.write( cmd ) | |
263 #sys.exit(1) | |
226 # Run | 264 # Run |
227 try: | 265 try: |
228 tmp_out = tempfile.NamedTemporaryFile().name | 266 tmp_out = tempfile.NamedTemporaryFile().name |
229 tmp_stdout = open( tmp_out, 'wb' ) | 267 tmp_stdout = open( tmp_out, 'wb' ) |
230 tmp_err = tempfile.NamedTemporaryFile().name | 268 tmp_err = tempfile.NamedTemporaryFile().name |
231 tmp_stderr = open( tmp_err, 'wb' ) | 269 tmp_stderr = open( tmp_err, 'wb' ) |
232 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) | 270 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) |
233 returncode = proc.wait() | 271 returncode = proc.wait() |
234 tmp_stderr.close() | 272 |
235 # get stderr, allowing for case where it's very large | 273 if returncode != 0: |
236 tmp_stderr = open( tmp_err, 'rb' ) | 274 tmp_stdout.close() |
237 stderr = '' | 275 tmp_stderr.close() |
238 buffsize = 1048576 | 276 # get stderr, allowing for case where it's very large |
239 try: | 277 tmp_stderr = open( tmp_err, 'rb' ) |
240 while True: | 278 stderr = '' |
241 stderr += tmp_stderr.read( buffsize ) | 279 buffsize = 1048576 |
242 if not stderr or len( stderr ) % buffsize != 0: | 280 try: |
243 break | 281 while True: |
244 except OverflowError: | 282 stderr += tmp_stderr.read( buffsize ) |
245 pass | 283 if not stderr or len( stderr ) % buffsize != 0: |
284 break | |
285 except OverflowError: | |
286 pass | |
287 | |
288 raise Exception, stderr | |
246 tmp_stdout.close() | 289 tmp_stdout.close() |
247 tmp_stderr.close() | 290 tmp_stderr.close() |
248 if returncode != 0: | 291 |
249 raise Exception, stderr | |
250 | |
251 # TODO: look for errors in program output. | 292 # TODO: look for errors in program output. |
252 except Exception, e: | 293 except Exception, e: |
253 stop_err( 'Error in bismark:\n' + str( e ) ) | 294 stop_err( 'Error in bismark:\n' + str( e ) ) |
254 | |
255 | 295 |
256 # collect and copy output files | 296 # collect and copy output files |
257 """ | |
258 if args.output_report_file: | 297 if args.output_report_file: |
259 output_report_file = open(args.output_report_file, 'w+') | 298 output_report_file = open(args.output_report_file, 'w+') |
260 for line in fileinput.input(glob( os.path.join( output_dir, '*.txt') )): | 299 for line in fileinput.input(glob( os.path.join( output_dir, '*report.txt') )): |
261 output_report_file.write(line) | 300 output_report_file.write(line) |
262 output_report_file.close() | 301 output_report_file.close() |
263 """ | 302 |
264 | 303 |
265 if args.output_suppressed_reads: | 304 if args.output_suppressed_reads: |
266 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads ) | 305 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads ) |
267 if args.output_suppressed_reads_l: | 306 if args.output_suppressed_reads_l: |
268 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l ) | 307 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l ) |
274 if args.output_unmapped_reads_l: | 313 if args.output_unmapped_reads_l: |
275 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l ) | 314 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l ) |
276 if args.output_unmapped_reads_r: | 315 if args.output_unmapped_reads_r: |
277 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r ) | 316 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r ) |
278 | 317 |
279 shutil.move( glob( os.path.join( output_dir, '*.sam'))[0] , args.output) | 318 try: |
319 """ | |
320 merge all bam files | |
321 """ | |
322 #tmp_out = tempfile.NamedTemporaryFile( dir=output_dir ).name | |
323 tmp_stdout = open( tmp_out, 'wab' ) | |
324 tmp_err = tempfile.NamedTemporaryFile( dir=output_dir ).name | |
325 tmp_stderr = open( tmp_err, 'wb' ) | |
326 | |
327 tmp_res = tempfile.NamedTemporaryFile( dir= output_dir).name | |
328 | |
329 bam_files = glob( os.path.join( output_dir, '*.bam') ) | |
330 if len( bam_files ) > 1: | |
331 cmd = 'samtools merge -@ %s -f %s %s ' % ( args.num_threads, tmp_res, ' '.join( bam_files ) ) | |
332 | |
333 proc = subprocess.Popen( args=shlex.split( cmd ), stdout=subprocess.PIPE ) | |
334 | |
335 returncode = proc.wait() | |
336 tmp_stdout.close() | |
337 tmp_stderr.close() | |
338 if returncode != 0: | |
339 raise Exception, open( tmp_stderr.name ).read() | |
340 else: | |
341 tmp_res = bam_files[0] | |
342 | |
343 bam_path = "%s" % tmp_res | |
344 | |
345 if os.path.exists( bam_path ): | |
346 if args.sort_bam: | |
347 cmd = 'samtools sort -@ %s %s %s' % (args.num_threads, bam_path, args.output) | |
348 else: | |
349 shutil.copy( bam_path, args.output ) | |
350 else: | |
351 stop_err( 'BAM file no found:\n' + str( bam_path ) ) | |
352 | |
353 | |
354 | |
355 # TODO: look for errors in program output. | |
356 except Exception, e: | |
357 stop_err( 'Error in merging bam files:\n' + str( e ) ) | |
358 | |
359 | |
360 if args.output_stdout: | |
361 # copy the temporary saved stdout from bismark | |
362 shutil.move( tmp_out, args.output_stdout ) | |
280 | 363 |
281 # Clean up temp dirs | 364 # Clean up temp dirs |
282 if args.own_file: | 365 if args.own_file: |
283 if os.path.exists( tmp_index_dir ): | 366 if os.path.exists( tmp_index_dir ): |
284 shutil.rmtree( tmp_index_dir ) | 367 shutil.rmtree( tmp_index_dir ) |
285 if os.path.exists( tmp_bismark_dir ): | 368 if os.path.exists( tmp_bismark_dir ): |
286 shutil.rmtree( tmp_bismark_dir ) | 369 shutil.rmtree( tmp_bismark_dir ) |
370 if os.path.exists( output_dir ): | |
371 shutil.rmtree( output_dir ) | |
287 | 372 |
288 if __name__=="__main__": __main__() | 373 if __name__=="__main__": __main__() |