comparison bismark_wrapper.py @ 7:63d9e369c669 draft

Uploaded
author bgruening
date Thu, 18 Oct 2012 04:06:44 -0400
parents
children 221e740377ca
comparison
equal deleted inserted replaced
6:1c97427d28a3 7:63d9e369c669
1 #!/usr/bin/env python
2
3 import argparse, os, shutil, subprocess, sys, tempfile, fileinput
4 import fileinput
5 from glob import glob
6
7 def stop_err( msg ):
8 sys.stderr.write( "%s\n" % msg )
9 sys.exit()
10
11 def __main__():
12 #Parse Command Line
13 parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.')
14 parser.add_argument( '-p', '--num-threads', dest='num_threads',
15 type=int, default=4, help='Use this many threads to align reads. The default is 4.' )
16
17 # input options
18 parser.add_argument( '--own-file', dest='own_file', help='' )
19 parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' )
20 parser.add_argument( '-O', '--output', dest='output' )
21 parser.add_argument( '--output-report-file', dest='output_report_file' )
22 parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" )
23
24 parser.add_argument( '--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', default=False)
25
26
27 parser.add_argument( '-1', '--mate1', dest='mate1',
28 help='The forward reads file in Sanger FASTQ or FASTA format.' )
29 parser.add_argument( '-2', '--mate2', dest='mate2',
30 help='The reverse reads file in Sanger FASTQ or FASTA format.' )
31
32 parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads',
33 help='Additional output file with unmapped reads (single-end).' )
34 parser.add_argument( '--output-unmapped-reads-l', dest='output_unmapped_reads_l',
35 help='File name for unmapped reads (left, paired-end).' )
36 parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r',
37 help='File name for unmapped reads (right, paired-end).' )
38
39
40 parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads',
41 help='Additional output file with suppressed reads (single-end).' )
42 parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l',
43 help='File name for suppressed reads (left, paired-end).' )
44 parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r',
45 help='File name for suppressed reads (right, paired-end).' )
46
47
48 parser.add_argument( '--single-paired', dest='single_paired',
49 help='The single-end reads file in Sanger FASTQ or FASTA format.' )
50
51 parser.add_argument( '--fastq', action='store_true', help='Query filetype is in FASTQ format')
52 parser.add_argument( '--fasta', action='store_true', help='Query filetype is in FASTA format')
53 parser.add_argument( '--phred64-quals', dest='phred64', action="store_true" )
54
55
56 parser.add_argument( '--skip-reads', dest='skip_reads', type=int )
57 parser.add_argument( '--qupto', type=int)
58
59
60 # paired end options
61 parser.add_argument( '-I', '--minins', dest='min_insert' )
62 parser.add_argument( '-X', '--maxins', dest='max_insert' )
63 parser.add_argument( '--no-mixed', dest='no_mixed', action="store_true" )
64 parser.add_argument( '--no-discordant', dest='no_discordant', action="store_true" )
65
66 #parse general options
67 # default 20
68 parser.add_argument( '--seed-len', dest='seed_len', type=int)
69 # default 15
70 parser.add_argument( '--seed-extention-attempts', dest='seed_extention_attempts', type=int )
71 # default 0
72 parser.add_argument( '--seed-mismatches', dest='seed_mismatches', type=int )
73 # default 2
74 parser.add_argument( '--max-reseed', dest='max_reseed', type=int )
75 """
76 # default 70
77 parser.add_argument( '--maqerr', dest='maqerr', type=int )
78 """
79
80 """
81 The number of megabytes of memory a given thread is given to store path
82 descriptors in --best mode. Best-first search must keep track of many paths
83 at once to ensure it is always extending the path with the lowest cumulative
84 cost. Bowtie tries to minimize the memory impact of the descriptors, but
85 they can still grow very large in some cases. If you receive an error message
86 saying that chunk memory has been exhausted in --best mode, try adjusting
87 this parameter up to dedicate more memory to the descriptors. Default: 512.
88 """
89 parser.add_argument( '--chunkmbs', type=int, default=512 )
90
91 args = parser.parse_args()
92
93 # Create bismark index if necessary.
94 index_dir = ""
95 if args.own_file:
96 """
97 Create a temporary index with the offered files from the user.
98 Utilizing the script: bismark_genome_preparation
99 bismark_genome_preparation --bowtie2 hg19/
100 """
101 tmp_index_dir = tempfile.mkdtemp()
102 index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( args.own_file )[1].split( '.' )[:-1] ) )
103 try:
104 """
105 Create a hard link pointing to args.own_file named 'index_path'.fa.
106 """
107 os.symlink( args.own_file, index_path + '.fa' )
108 except Exception, e:
109 if os.path.exists( tmp_index_dir ):
110 shutil.rmtree( tmp_index_dir )
111 stop_err( 'Error in linking the reference database.\n' + str( e ) )
112 # bismark_genome_preparation needs the complete path to the folder in which the database is stored
113 cmd_index = 'bismark_genome_preparation --bowtie2 %s ' % ( tmp_index_dir )
114 try:
115 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
116 tmp_stderr = open( tmp, 'wb' )
117 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stdout=open(os.devnull, 'wb'), stderr=tmp_stderr.fileno() )
118 returncode = proc.wait()
119 tmp_stderr.close()
120 # get stderr, allowing for case where it's very large
121 tmp_stderr = open( tmp, 'rb' )
122 stderr = ''
123 buffsize = 1048576
124 try:
125 while True:
126 stderr += tmp_stderr.read( buffsize )
127 if not stderr or len( stderr ) % buffsize != 0:
128 break
129 except OverflowError:
130 pass
131 tmp_stderr.close()
132 if returncode != 0:
133 raise Exception, stderr
134 except Exception, e:
135 if os.path.exists( tmp_index_dir ):
136 shutil.rmtree( tmp_index_dir )
137 stop_err( 'Error indexing reference sequence\n' + str( e ) )
138 index_dir = tmp_index_dir
139 else:
140 index_dir = args.index_path
141
142 # Build bismark command
143 tmp_bismark_dir = tempfile.mkdtemp()
144 output_dir = os.path.join( tmp_bismark_dir, 'results')
145 cmd = 'bismark -p %(num_threads)s %(args)s --temp_dir %(tmp_bismark_dir)s -o %(output_dir)s --quiet --bowtie2 %(genome_folder)s %(reads)s'
146
147 arguments = {
148 'genome_folder': index_dir,
149 'num_threads': args.num_threads,
150 'args': '',
151 'tmp_bismark_dir': tmp_bismark_dir,
152 'output_dir': output_dir,
153 }
154
155 additional_opts = ''
156 # Set up the reads
157 if args.mate_paired:
158 # paired-end reads library
159 reads = '-1 %s ' % ( args.mate1 )
160 reads += ' -2 %s ' % ( args.mate2 )
161 additional_opts += ' -I %s -X %s ' % (args.min_insert, args.max_insert)
162 else:
163 # single paired reads library
164 reads = ' %s ' % ( args.single_paired )
165
166 # alignment options
167 if args.seed_mismatches:
168 additional_opts += ' -N %s ' % args.seed_mismatches
169 if args.seed_len:
170 additional_opts += ' -L %s ' % args.seed_len
171 if args.seed_extention_attempts:
172 additional_opts += ' -D %s ' % args.seed_extention_attempts
173 if args.max_reseed:
174 additional_opts += ' -R %s ' % args.max_reseed
175 """
176 if args.maqerr:
177 additional_opts += ' --maqerr %s ' % args.maqerr
178 """
179 if args.no_discordant:
180 additional_opts += ' --no-discordant '
181 if args.no_mixed:
182 additional_opts += ' --no-mixed '
183 if args.skip_reads:
184 additional_opts += ' --skip %s ' % args.skip_reads
185 if args.qupto:
186 additional_opts += ' --qupto %s ' % args.qupto
187 if args.phred64:
188 additional_opts += ' --phred64-quals '
189 if args.suppress_header:
190 additional_opts += ' --sam-no-hd '
191 if args.output_unmapped_reads or ( args.output_unmapped_reads_l and args.output_unmapped_reads_r):
192 additional_opts += ' --un '
193 if args.output_suppressed_reads or ( args.output_suppressed_reads_l and args.output_suppressed_reads_r):
194 additional_opts += ' --ambiguous '
195
196 arguments.update( {'args': additional_opts, 'reads': reads} )
197
198 # Final command:
199 cmd = cmd % arguments
200
201 # Run
202 try:
203 tmp_out = tempfile.NamedTemporaryFile().name
204 tmp_stdout = open( tmp_out, 'wb' )
205 tmp_err = tempfile.NamedTemporaryFile().name
206 tmp_stderr = open( tmp_err, 'wb' )
207 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
208 returncode = proc.wait()
209 tmp_stderr.close()
210 # get stderr, allowing for case where it's very large
211 tmp_stderr = open( tmp_err, 'rb' )
212 stderr = ''
213 buffsize = 1048576
214 try:
215 while True:
216 stderr += tmp_stderr.read( buffsize )
217 if not stderr or len( stderr ) % buffsize != 0:
218 break
219 except OverflowError:
220 pass
221 tmp_stdout.close()
222 tmp_stderr.close()
223 if returncode != 0:
224 raise Exception, stderr
225
226 # TODO: look for errors in program output.
227 except Exception, e:
228 stop_err( 'Error in bismark:\n' + str( e ) )
229
230
231 # collect and copy output files
232 """
233 if args.output_report_file:
234 output_report_file = open(args.output_report_file, 'w+')
235 for line in fileinput.input(glob( os.path.join( output_dir, '*.txt') )):
236 output_report_file.write(line)
237 output_report_file.close()
238 """
239
240 if args.output_suppressed_reads:
241 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads )
242 if args.output_suppressed_reads_l:
243 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l )
244 if args.output_suppressed_reads_r:
245 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_2.txt'))[0], args.output_suppressed_reads_r )
246
247 if args.output_unmapped_reads:
248 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads.txt'))[0], args.output_unmapped_reads )
249 if args.output_unmapped_reads_l:
250 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l )
251 if args.output_unmapped_reads_r:
252 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r )
253
254 shutil.move( glob( os.path.join( output_dir, '*.sam'))[0] , args.output)
255
256 # Clean up temp dirs
257 if args.own_file:
258 if os.path.exists( tmp_index_dir ):
259 shutil.rmtree( tmp_index_dir )
260 if os.path.exists( tmp_bismark_dir ):
261 shutil.rmtree( tmp_bismark_dir )
262
263 if __name__=="__main__": __main__()