Mercurial > repos > bgruening > bismark
comparison bismark_wrapper.py @ 7:63d9e369c669 draft
Uploaded
author | bgruening |
---|---|
date | Thu, 18 Oct 2012 04:06:44 -0400 |
parents | |
children | 221e740377ca |
comparison
equal
deleted
inserted
replaced
6:1c97427d28a3 | 7:63d9e369c669 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import argparse, os, shutil, subprocess, sys, tempfile, fileinput | |
4 import fileinput | |
5 from glob import glob | |
6 | |
7 def stop_err( msg ): | |
8 sys.stderr.write( "%s\n" % msg ) | |
9 sys.exit() | |
10 | |
11 def __main__(): | |
12 #Parse Command Line | |
13 parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.') | |
14 parser.add_argument( '-p', '--num-threads', dest='num_threads', | |
15 type=int, default=4, help='Use this many threads to align reads. The default is 4.' ) | |
16 | |
17 # input options | |
18 parser.add_argument( '--own-file', dest='own_file', help='' ) | |
19 parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) | |
20 parser.add_argument( '-O', '--output', dest='output' ) | |
21 parser.add_argument( '--output-report-file', dest='output_report_file' ) | |
22 parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" ) | |
23 | |
24 parser.add_argument( '--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', default=False) | |
25 | |
26 | |
27 parser.add_argument( '-1', '--mate1', dest='mate1', | |
28 help='The forward reads file in Sanger FASTQ or FASTA format.' ) | |
29 parser.add_argument( '-2', '--mate2', dest='mate2', | |
30 help='The reverse reads file in Sanger FASTQ or FASTA format.' ) | |
31 | |
32 parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads', | |
33 help='Additional output file with unmapped reads (single-end).' ) | |
34 parser.add_argument( '--output-unmapped-reads-l', dest='output_unmapped_reads_l', | |
35 help='File name for unmapped reads (left, paired-end).' ) | |
36 parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r', | |
37 help='File name for unmapped reads (right, paired-end).' ) | |
38 | |
39 | |
40 parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads', | |
41 help='Additional output file with suppressed reads (single-end).' ) | |
42 parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l', | |
43 help='File name for suppressed reads (left, paired-end).' ) | |
44 parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r', | |
45 help='File name for suppressed reads (right, paired-end).' ) | |
46 | |
47 | |
48 parser.add_argument( '--single-paired', dest='single_paired', | |
49 help='The single-end reads file in Sanger FASTQ or FASTA format.' ) | |
50 | |
51 parser.add_argument( '--fastq', action='store_true', help='Query filetype is in FASTQ format') | |
52 parser.add_argument( '--fasta', action='store_true', help='Query filetype is in FASTA format') | |
53 parser.add_argument( '--phred64-quals', dest='phred64', action="store_true" ) | |
54 | |
55 | |
56 parser.add_argument( '--skip-reads', dest='skip_reads', type=int ) | |
57 parser.add_argument( '--qupto', type=int) | |
58 | |
59 | |
60 # paired end options | |
61 parser.add_argument( '-I', '--minins', dest='min_insert' ) | |
62 parser.add_argument( '-X', '--maxins', dest='max_insert' ) | |
63 parser.add_argument( '--no-mixed', dest='no_mixed', action="store_true" ) | |
64 parser.add_argument( '--no-discordant', dest='no_discordant', action="store_true" ) | |
65 | |
66 #parse general options | |
67 # default 20 | |
68 parser.add_argument( '--seed-len', dest='seed_len', type=int) | |
69 # default 15 | |
70 parser.add_argument( '--seed-extention-attempts', dest='seed_extention_attempts', type=int ) | |
71 # default 0 | |
72 parser.add_argument( '--seed-mismatches', dest='seed_mismatches', type=int ) | |
73 # default 2 | |
74 parser.add_argument( '--max-reseed', dest='max_reseed', type=int ) | |
75 """ | |
76 # default 70 | |
77 parser.add_argument( '--maqerr', dest='maqerr', type=int ) | |
78 """ | |
79 | |
80 """ | |
81 The number of megabytes of memory a given thread is given to store path | |
82 descriptors in --best mode. Best-first search must keep track of many paths | |
83 at once to ensure it is always extending the path with the lowest cumulative | |
84 cost. Bowtie tries to minimize the memory impact of the descriptors, but | |
85 they can still grow very large in some cases. If you receive an error message | |
86 saying that chunk memory has been exhausted in --best mode, try adjusting | |
87 this parameter up to dedicate more memory to the descriptors. Default: 512. | |
88 """ | |
89 parser.add_argument( '--chunkmbs', type=int, default=512 ) | |
90 | |
91 args = parser.parse_args() | |
92 | |
93 # Create bismark index if necessary. | |
94 index_dir = "" | |
95 if args.own_file: | |
96 """ | |
97 Create a temporary index with the offered files from the user. | |
98 Utilizing the script: bismark_genome_preparation | |
99 bismark_genome_preparation --bowtie2 hg19/ | |
100 """ | |
101 tmp_index_dir = tempfile.mkdtemp() | |
102 index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( args.own_file )[1].split( '.' )[:-1] ) ) | |
103 try: | |
104 """ | |
105 Create a hard link pointing to args.own_file named 'index_path'.fa. | |
106 """ | |
107 os.symlink( args.own_file, index_path + '.fa' ) | |
108 except Exception, e: | |
109 if os.path.exists( tmp_index_dir ): | |
110 shutil.rmtree( tmp_index_dir ) | |
111 stop_err( 'Error in linking the reference database.\n' + str( e ) ) | |
112 # bismark_genome_preparation needs the complete path to the folder in which the database is stored | |
113 cmd_index = 'bismark_genome_preparation --bowtie2 %s ' % ( tmp_index_dir ) | |
114 try: | |
115 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name | |
116 tmp_stderr = open( tmp, 'wb' ) | |
117 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stdout=open(os.devnull, 'wb'), stderr=tmp_stderr.fileno() ) | |
118 returncode = proc.wait() | |
119 tmp_stderr.close() | |
120 # get stderr, allowing for case where it's very large | |
121 tmp_stderr = open( tmp, 'rb' ) | |
122 stderr = '' | |
123 buffsize = 1048576 | |
124 try: | |
125 while True: | |
126 stderr += tmp_stderr.read( buffsize ) | |
127 if not stderr or len( stderr ) % buffsize != 0: | |
128 break | |
129 except OverflowError: | |
130 pass | |
131 tmp_stderr.close() | |
132 if returncode != 0: | |
133 raise Exception, stderr | |
134 except Exception, e: | |
135 if os.path.exists( tmp_index_dir ): | |
136 shutil.rmtree( tmp_index_dir ) | |
137 stop_err( 'Error indexing reference sequence\n' + str( e ) ) | |
138 index_dir = tmp_index_dir | |
139 else: | |
140 index_dir = args.index_path | |
141 | |
142 # Build bismark command | |
143 tmp_bismark_dir = tempfile.mkdtemp() | |
144 output_dir = os.path.join( tmp_bismark_dir, 'results') | |
145 cmd = 'bismark -p %(num_threads)s %(args)s --temp_dir %(tmp_bismark_dir)s -o %(output_dir)s --quiet --bowtie2 %(genome_folder)s %(reads)s' | |
146 | |
147 arguments = { | |
148 'genome_folder': index_dir, | |
149 'num_threads': args.num_threads, | |
150 'args': '', | |
151 'tmp_bismark_dir': tmp_bismark_dir, | |
152 'output_dir': output_dir, | |
153 } | |
154 | |
155 additional_opts = '' | |
156 # Set up the reads | |
157 if args.mate_paired: | |
158 # paired-end reads library | |
159 reads = '-1 %s ' % ( args.mate1 ) | |
160 reads += ' -2 %s ' % ( args.mate2 ) | |
161 additional_opts += ' -I %s -X %s ' % (args.min_insert, args.max_insert) | |
162 else: | |
163 # single paired reads library | |
164 reads = ' %s ' % ( args.single_paired ) | |
165 | |
166 # alignment options | |
167 if args.seed_mismatches: | |
168 additional_opts += ' -N %s ' % args.seed_mismatches | |
169 if args.seed_len: | |
170 additional_opts += ' -L %s ' % args.seed_len | |
171 if args.seed_extention_attempts: | |
172 additional_opts += ' -D %s ' % args.seed_extention_attempts | |
173 if args.max_reseed: | |
174 additional_opts += ' -R %s ' % args.max_reseed | |
175 """ | |
176 if args.maqerr: | |
177 additional_opts += ' --maqerr %s ' % args.maqerr | |
178 """ | |
179 if args.no_discordant: | |
180 additional_opts += ' --no-discordant ' | |
181 if args.no_mixed: | |
182 additional_opts += ' --no-mixed ' | |
183 if args.skip_reads: | |
184 additional_opts += ' --skip %s ' % args.skip_reads | |
185 if args.qupto: | |
186 additional_opts += ' --qupto %s ' % args.qupto | |
187 if args.phred64: | |
188 additional_opts += ' --phred64-quals ' | |
189 if args.suppress_header: | |
190 additional_opts += ' --sam-no-hd ' | |
191 if args.output_unmapped_reads or ( args.output_unmapped_reads_l and args.output_unmapped_reads_r): | |
192 additional_opts += ' --un ' | |
193 if args.output_suppressed_reads or ( args.output_suppressed_reads_l and args.output_suppressed_reads_r): | |
194 additional_opts += ' --ambiguous ' | |
195 | |
196 arguments.update( {'args': additional_opts, 'reads': reads} ) | |
197 | |
198 # Final command: | |
199 cmd = cmd % arguments | |
200 | |
201 # Run | |
202 try: | |
203 tmp_out = tempfile.NamedTemporaryFile().name | |
204 tmp_stdout = open( tmp_out, 'wb' ) | |
205 tmp_err = tempfile.NamedTemporaryFile().name | |
206 tmp_stderr = open( tmp_err, 'wb' ) | |
207 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) | |
208 returncode = proc.wait() | |
209 tmp_stderr.close() | |
210 # get stderr, allowing for case where it's very large | |
211 tmp_stderr = open( tmp_err, 'rb' ) | |
212 stderr = '' | |
213 buffsize = 1048576 | |
214 try: | |
215 while True: | |
216 stderr += tmp_stderr.read( buffsize ) | |
217 if not stderr or len( stderr ) % buffsize != 0: | |
218 break | |
219 except OverflowError: | |
220 pass | |
221 tmp_stdout.close() | |
222 tmp_stderr.close() | |
223 if returncode != 0: | |
224 raise Exception, stderr | |
225 | |
226 # TODO: look for errors in program output. | |
227 except Exception, e: | |
228 stop_err( 'Error in bismark:\n' + str( e ) ) | |
229 | |
230 | |
231 # collect and copy output files | |
232 """ | |
233 if args.output_report_file: | |
234 output_report_file = open(args.output_report_file, 'w+') | |
235 for line in fileinput.input(glob( os.path.join( output_dir, '*.txt') )): | |
236 output_report_file.write(line) | |
237 output_report_file.close() | |
238 """ | |
239 | |
240 if args.output_suppressed_reads: | |
241 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads ) | |
242 if args.output_suppressed_reads_l: | |
243 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l ) | |
244 if args.output_suppressed_reads_r: | |
245 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_2.txt'))[0], args.output_suppressed_reads_r ) | |
246 | |
247 if args.output_unmapped_reads: | |
248 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads.txt'))[0], args.output_unmapped_reads ) | |
249 if args.output_unmapped_reads_l: | |
250 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l ) | |
251 if args.output_unmapped_reads_r: | |
252 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r ) | |
253 | |
254 shutil.move( glob( os.path.join( output_dir, '*.sam'))[0] , args.output) | |
255 | |
256 # Clean up temp dirs | |
257 if args.own_file: | |
258 if os.path.exists( tmp_index_dir ): | |
259 shutil.rmtree( tmp_index_dir ) | |
260 if os.path.exists( tmp_bismark_dir ): | |
261 shutil.rmtree( tmp_bismark_dir ) | |
262 | |
263 if __name__=="__main__": __main__() |