Mercurial > repos > oinizan > demultiplex
comparison demultiplex.py.save2 @ 0:da4101033e10 draft default tip
planemo upload
author | oinizan |
---|---|
date | Wed, 18 Oct 2017 05:30:40 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:da4101033e10 |
---|---|
1 #!/usr/bin/env python2.7 | |
2 # | |
3 # Copyright (C) 2014 INRA | |
4 # | |
5 # This program is free software: you can redistribute it and/or modify | |
6 # it under the terms of the GNU General Public License as published by | |
7 # the Free Software Foundation, either version 3 of the License, or | |
8 # (at your option) any later version. | |
9 # | |
10 # This program is distributed in the hope that it will be useful, | |
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 # GNU General Public License for more details. | |
14 # | |
15 # You should have received a copy of the GNU General Public License | |
16 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
17 # | |
18 | |
19 __author__ = 'Plateforme bioinformatique Toulouse and SIGENAE' | |
20 __copyright__ = 'Copyright (C) 2015 INRA' | |
21 __license__ = 'GNU General Public License' | |
22 __version__ = '1.2.0' | |
23 __email__ = 'frogs@inra.fr' | |
24 __status__ = 'prod' | |
25 | |
26 import os | |
27 import sys | |
28 import gzip | |
29 import time | |
30 import tarfile | |
31 import argparse | |
32 | |
# Directory that holds this script; its parent is searched for "libexec" and "lib".
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
# PATH: put the sibling "libexec" directory in front of the existing search path.
BIN_DIR = os.path.abspath(os.path.join(CURRENT_DIR, os.pardir, "libexec"))
os.environ['PATH'] = os.pathsep.join([BIN_DIR, os.environ['PATH']])
# PYTHONPATH: expose the sibling "lib" directory to this process and to child processes.
LIB_DIR = os.path.abspath(os.path.join(CURRENT_DIR, os.pardir, "lib"))
sys.path.append(LIB_DIR)
if 'PYTHONPATH' in os.environ:
    os.environ['PYTHONPATH'] = os.pathsep.join([os.environ['PYTHONPATH'], LIB_DIR])
else:
    os.environ['PYTHONPATH'] = LIB_DIR
42 | |
43 from frogsUtils import * | |
44 | |
45 | |
46 ################################################################################################################################################## | |
47 # | |
48 # COMMAND LINES | |
49 # | |
50 ################################################################################################################################################## | |
class Demultiplex(Cmd):
    """
    @summary : Demultiplex samples.
    """
    def __init__(self, R1_input_file, R2_input_file, barcode_file, mismatches, end, global_tmp_files, R1_output_files, R2_output_files, demultiplex_err_files1, demultiplex_err_files2, demultiplex_log):
        """
        @param R1_input_file : [str] Path to the R1 fastq file.
        @param R2_input_file : [str] Path to the R2 fastq file, or None when there is no R2 file.
        @param barcode_file : [str] Path to barcodes and samples (one line by sample) description file. Line format : SAMPLE_NAME<TAB>BARCODE.
        @param mismatches : [int] Number of mismatches allowed.
        @param end : [str] Barcode end, given to splitbc.pl as the option '--<end>' (the callers in this file use 'bol' or 'eol').
        @param global_tmp_files : Temporary files manager (the callers in this file give a TmpFiles). Its 'tmp_dir' and 'prefix' are read and its 'dirs' and 'files' lists are extended with every path declared here.
        @param R1_output_files : [list] Filled in place with the paths to the R1 fastq files (one by sample).
        @param R2_output_files : [list] Filled in place with the paths to the R2 fastq files (one by sample). Left untouched when R2_input_file is None.
        @param demultiplex_err_files1 : [list] Filled in place with the paths to the ambiguous and unmatched R1 files.
        @param demultiplex_err_files2 : [list] Filled in place with the paths to the ambiguous and unmatched R2 files. Left untouched when R2_input_file is None.
        @param demultiplex_log : [str] Path to the file where the command standard output is appended.
        """
        paired = R2_input_file is not None

        tmp_files = TmpFiles( global_tmp_files.tmp_dir )
        tmp_folder = os.path.join( global_tmp_files.tmp_dir, global_tmp_files.prefix + "_tmp", tmp_files.prefix )
        global_tmp_files.dirs.append(tmp_folder)
        if not os.path.exists(tmp_folder):
            os.makedirs(tmp_folder)

        # Update output data
        self.samples_names = list()
        with open( barcode_file ) as FH_barcode:
            for line in FH_barcode:
                if line.strip() == "":
                    # A blank line describes no sample and would break the unpacking below.
                    continue
                # Split from the right: the sample name may itself contain whitespace.
                sample_name, barcode = line.strip().rsplit(None, 1)
                # OI DEBUG
                print("Line is " + line)
                print("Sample name is " + sample_name)
                print("Barcode is " + barcode)
                print("Append to R1_output_files file: " + tmp_folder + " " + sample_name + " " + '_R1.fastq')
                R1_sample_file = os.path.join(tmp_folder, sample_name + '_R1.fastq')
                R1_output_files.append( R1_sample_file )
                global_tmp_files.files.append( R1_sample_file )
                if paired:
                    R2_sample_file = os.path.join(tmp_folder, sample_name + '_R2.fastq')
                    R2_output_files.append( R2_sample_file )
                    global_tmp_files.files.append( R2_sample_file )
                self.samples_names.append( sample_name.replace(' ', '_') )

        self.R1_input_file = R1_input_file
        self.ambiguous = os.path.join(tmp_folder, 'ambiguous_R1.fastq')
        self.unmatched = os.path.join(tmp_folder, 'unmatched_R1.fastq')
        demultiplex_err_files1.extend( [self.ambiguous, self.unmatched] )
        global_tmp_files.files.extend( [self.ambiguous, self.unmatched] )
        if paired:
            R2_err_files = [os.path.join(tmp_folder, 'ambiguous_R2.fastq'), os.path.join(tmp_folder, 'unmatched_R2.fastq')]
            demultiplex_err_files2.extend( R2_err_files )
            global_tmp_files.files.extend( R2_err_files )

        # Set class: the command line is the same for both layouts except for the R2 specific parts.
        exec_parameters = R1_input_file
        if paired:
            exec_parameters += ' ' + R2_input_file
        exec_parameters += ' --' + end + ' --bcfile ' + barcode_file + ' --mismatches ' + str(mismatches) +\
                           ' --trim --no_adapt --prefix-r1 ' + os.path.join(tmp_folder, '%_R1.fastq')
        if paired:
            exec_parameters += ' --prefix-r2 ' + os.path.join(tmp_folder, '%_R2.fastq')
        exec_parameters += ' >> ' + demultiplex_log
        Cmd.__init__( self,
                      'perl ' + os.environ['TOOL_DIRECTORY'] + '/splitbc.pl',
                      'Demultiplex reads.',
                      exec_parameters,
                      None )

    def parser(self, log_file):
        """
        @summary : Parse the command results to add information in log_file.
        @param log_file : [str] Path to the sample process log file.
        """
        # Parse output
        nb_seq_before = get_fastq_nb_seq(self.R1_input_file)
        nb_seq_unmatched = get_fastq_nb_seq(self.unmatched)
        nb_seq_ambiguous = get_fastq_nb_seq(self.ambiguous)
        # Write result
        FH_log = Logger( log_file )
        FH_log.write( 'Results :\n' )
        FH_log.write( '\tnb seq before demultiplexing : ' + str(nb_seq_before) + '\n' )
        FH_log.write( '\tnb seq after process matched : ' + str(nb_seq_before - nb_seq_unmatched) + '\n' )
        FH_log.write( '\tnb seq after process non-ambiguous : ' + str(nb_seq_before - nb_seq_unmatched - nb_seq_ambiguous) + '\n' )
        FH_log.close()

    def get_version(self):
        """
        @summary : Returns the program version number.
        @return : version number if this is possible, otherwise this method return 'unknown'.
        """
        return Cmd.get_version(self, 'stdout')
140 | |
141 | |
class Archive(Cmd):
    """
    @summary : Creates an archive with files.
    """
    def __init__(self, archived_files, archive_path):
        """
        @summary : Moves the files in a staging folder created next to the archive and prepares the tar command run from this folder.
        @param archived_files: [list] Files added in final archive. They are moved (not copied) in the staging folder.
        @param archive_path: [str] Path to the new archive.
        """
        # Validate before touching the file system so that a refused call leaves no staging folder behind.
        if len(archived_files) == 0:
            raise Exception( "At least one file must be add to the archive '" + archive_path + "'." )

        tmp_files = TmpFiles( os.path.dirname(archive_path) )
        tmp_folder = os.path.join( tmp_files.tmp_dir, tmp_files.prefix )
        tmp_files.dirs.append(tmp_folder)
        if not os.path.exists(tmp_folder):
            os.makedirs(tmp_folder)

        archived_basenames = list()
        for current in archived_files:
            current_basename = os.path.basename(current)
            if os.path.dirname(current) != tmp_folder:
                staged_path = os.path.join(tmp_folder, current_basename)
                os.rename(current, staged_path)
                tmp_files.files.append(staged_path)
            archived_basenames.append(current_basename)

        # OI DEBUG
        print("archive_path is file or directory: " + str(os.path.isfile(archive_path)) + " or " + str(os.path.isdir(archive_path)))
        print("tmp_folder is file or directory: " + str(os.path.isfile(tmp_folder)) + " or " + str(os.path.isdir(tmp_folder)))
        for current_basename in archived_basenames:
            print("archived basename is " + current_basename)
            # The files live in the staging folder: test this location and not the current working directory.
            staged_path = os.path.join(tmp_folder, current_basename)
            print("archived basename is file or directory: " + str(os.path.isfile(staged_path)) + " or " + str(os.path.isdir(staged_path)))
        # END OI DEBUG

        # 'tar -C' changes to the staging folder, so the members are stored under their basenames.
        Cmd.__init__( self,
                      'tar',
                      'Archives files.',
                      '-zcf ' + archive_path + ' -C ' + tmp_folder + " " + " ".join(archived_basenames),
                      None )

        self.Files = tmp_files

    def parser(self, log_file):
        """
        @summary : Removes the staging folder and the files moved into it.
        @param log_file : [str] Path to the log file (not used here).
        """
        self.Files.deleteAll()
191 | |
192 | |
193 ################################################################################################################################################## | |
194 # | |
195 # FUNCTIONS | |
196 # | |
197 ################################################################################################################################################## | |
def is_gzip( file ):
    """
    @summary: Tells if a file is gziped by trying to read its first line with the gzip reader.
    @param file : [str] Path to processed file.
    @return: [bool] True if the file is gziped.
    @note: Opening errors (e.g. missing file) are not caught and reach the caller.
    """
    # Local import: zlib is only needed to name the decompression error.
    import zlib
    with gzip.open( file ) as FH_input:
        try:
            FH_input.readline()
        except (IOError, OSError, EOFError, zlib.error):
            # Only reading/decompression failures mean "not gzip"; KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return False
    return True
213 | |
def split_barcode_file( barcode_file, barcodes_file_list, global_tmp_files ):
    """
    @summary: In case of double multiplexe, split barcode file in one forward and multiple reverse barcode files.
              The file 'forward_bc' contains one line BARCODE_FW<TAB>BARCODE_FW by distinct forward barcode.
              Each file '<BARCODE_FW>_reverse_bc' contains one line SAMPLE_NAME<TAB>BARCODE_RV by sample with this forward barcode.
    @param barcode_file: [str] Path to the input barcode file. Line format : SAMPLE_NAME<TAB>BARCODE_FW<TAB>BARCODE_RV.
    @param barcodes_file_list: [list] Filled in place with the paths to the output barcode files: the forward file first, then the reverse files in the order of first appearance of their forward barcode.
    @param global_tmp_files: Temporary files manager. The files are written in its 'tmp_dir' and their paths are appended to its 'files' list.
    """
    out_dir = global_tmp_files.tmp_dir
    forward_barcodes = list()  # Distinct forward barcodes, in order of first appearance
    reverse_by_forward = dict()  # By forward barcode, the "SAMPLE<TAB>BARCODE_RV" lines
    with open(barcode_file, "r") as FH_barcode:
        for line in FH_barcode:
            line = line.strip()
            if line == "":
                continue
            # Split from the right so that a sample name containing whitespace stays in one piece.
            sample, forward, reverse = line.rsplit(None, 2)
            if forward not in reverse_by_forward:
                forward_barcodes.append(forward)
                reverse_by_forward[forward] = list()
            reverse_by_forward[forward].append(sample + "\t" + reverse)

    if len(forward_barcodes) == 0:
        raise ValueError( "The barcode file '" + barcode_file + "' does not contain any barcode." )

    forward_path = os.path.join(out_dir, "forward_bc")
    barcodes_file_list.append(forward_path)
    global_tmp_files.files.append(forward_path)
    with open(forward_path, "w") as FH_out:
        FH_out.write("\n".join([forward + "\t" + forward for forward in forward_barcodes]) + "\n")

    for forward in forward_barcodes:
        reverse_path = os.path.join(out_dir, forward + "_reverse_bc")
        barcodes_file_list.append(reverse_path)
        global_tmp_files.files.append(reverse_path)
        with open(reverse_path, "w") as FH_out:
            FH_out.write("\n".join(reverse_by_forward[forward]) + "\n")
248 | |
def get_fastq_nb_seq( fastq_file ):
    """
    @summary: Returns the number of sequences in fastq_file.
    @param fastq_file: [str] Path to the fastq file processed (gziped or not).
    @return: [int] The number of sequences, computed as the number of lines divided by 4.
    """
    opener = gzip.open if is_gzip(fastq_file) else open
    with opener( fastq_file ) as FH_input:
        nb_line = sum(1 for line in FH_input)
    # Floor division keeps the result an int on Python 3 as well as on Python 2.
    return nb_line // 4
266 | |
def concat_files(list_input, output_file):
    """
    @summary: Concatenates files, in the order of the list, into output_file.
    @param list_input: [list] Paths to the files to concatenate. With an empty list the output is created empty.
    @param output_file: [str] Path to the concatenated file. An existing file is overwritten.
    """
    # Local import: shutil is only needed here.
    import shutil
    with open(output_file, "w") as FH_out:
        for current_file in list_input:
            with open(current_file) as FH_in:
                # Chunked copy: no need to buffer the lines by hand.
                shutil.copyfileobj(FH_in, FH_out)
285 | |
def summarise_results( summary_file, barcode_file, log_file ):
    """
    @summary: Writes one summary of results (number of sequences by sample) from the demultiplexing log.
    @param summary_file: [str] The output file (format: TSV, columns: sample and count). The rows follow the order of the barcode file and end with 'unmatched' and 'ambiguous'.
    @param barcode_file: [str] Path to the barcode file. The first field of each line is the sample name.
    @param log_file: [str] Path to the demultiplexing log. On each line the first field is a name and the second one a count. The counts of a name present several times are summed.
    """
    counts = dict()
    ordered_names = list()  # Keeps the output order independent from the dict implementation

    def declare(name):
        if name not in counts:
            counts[name] = 0
            ordered_names.append(name)

    with open(barcode_file) as FH_barcode:
        for line in FH_barcode:
            fields = line.split()
            if len(fields) != 0:  # Skip blank lines
                declare(fields[0])
    declare("unmatched")
    declare("ambiguous")

    with open(log_file, "r") as FH_log:
        for line in FH_log:
            # Header and total lines do not describe one sample.
            if line.startswith("Barcode") or line.startswith("total"):
                continue
            # An opening parenthesis stuck to the count is detached before the split.
            fields = line.replace('(', '\t').split()
            # Blank or single-field lines carry no count.
            if len(fields) >= 2 and fields[0] in counts:
                counts[fields[0]] += int(fields[1])

    with open(summary_file, "w") as FH_summary:
        FH_summary.write( "#sample\tcount\n" )
        for name in ordered_names:
            FH_summary.write( name + '\t' + str(counts[name]) + '\n' )
314 | |
315 | |
316 ################################################################################################################################################## | |
317 # | |
318 # MAIN | |
319 # | |
320 ################################################################################################################################################## | |
if __name__ == "__main__":
    # Manage parameters
    parser = argparse.ArgumentParser(
        description='Split by samples the reads in function of inner barcode.'
    )
    parser.add_argument('-m', '--mismatches', type=int, default=0, help="Number of mismatches allowed in barcode. [Default: %(default)s]")
    parser.add_argument('-e', '--end', type=str, default="bol", help="barcode is at the begining of the forward end (bol) or of the reverse (eol) or both (both). [Default: %(default)s]")
    parser.add_argument( '--debug', default=False, action='store_true', help="Keep temporary files to debug program." )
    parser.add_argument( '-v', '--version', action='version', version=__version__ )
    # Inputs
    group_input = parser.add_argument_group( 'Inputs' )
    group_input.add_argument( '--input-R1', required=True, help='The R1 sequence file with all samples (format: fastq).' )
    group_input.add_argument( '--input-R2', default=None, help='The R2 sequence file with all samples (format: fastq).' )
    # The barcode file is read unconditionally below: require it instead of failing later on None.
    group_input.add_argument( '--input-barcode', required=True, help='This file describes barcodes and samples (one line by sample). Line format : SAMPLE_NAME<TAB>BARCODE or SAMPLE_NAME<TAB>BARCODE_FW<TAB>BARCODE_RV.' )
    group_output = parser.add_argument_group( 'Outputs' )
    # Outputs
    group_output.add_argument( '--output-demultiplexed', default="demultiplexed_read.tar.gz", help='The tar file containing R1 files and R2 files for each sample (format: tar). [Default: %(default)s]' )
    group_output.add_argument( '--output-excluded', default="undemultiplexed_read.tar.gz", help='The tar file containing R1 files and R2 files not demultiplexed (format: tar). [Default: %(default)s]' )
    group_output.add_argument( '-s', '--summary', default='summary.tsv', help='TSV file with summary of filters results (format: TSV). [Default: %(default)s]')
    group_output.add_argument( '-l', '--log-file', default=sys.stdout, help='This output file will contain several information on executed commands.')
    args = parser.parse_args()
    prevent_shell_injections(args)

    Logger.static_write(args.log_file, "## Application\nSoftware :" + sys.argv[0] + " (version : " + str(__version__) + ")\nCommand : " + " ".join(sys.argv) + "\n\n")

    # Process
    paired = args.input_R2 is not None
    output_dir = os.path.split(args.output_demultiplexed)[0]
    R1_files = list()
    R2_files = list()
    tmp_barcode_files = list()
    tmp_R1_files = list()
    tmp_R2_files = list()
    demultiplex_err_files1 = list()
    demultiplex_err_files2 = list()
    excluded_R1_file = os.path.join(output_dir, os.path.basename(args.input_R1) + "_excluded_demult")
    excluded_R2_file = None
    if paired:
        excluded_R2_file = os.path.join(output_dir, os.path.basename(args.input_R2) + "_excluded_demult")

    tmp_files = TmpFiles( output_dir )
    demultiplex_log = tmp_files.add("Demult.log")
    tmp_folder = tmp_files.add_dir("tmp")
    os.mkdir(tmp_folder)

    try:
        # Process
        if args.end == "bol" or args.end == "eol":
            # Single barcode: one demultiplexing pass on the requested end.
            info = "\n#Demultiplexing " + os.path.basename(args.input_R1)
            if paired:
                info += " and " + os.path.basename(args.input_R2)
            info += " with " + os.path.basename(args.input_barcode) + " in " + args.end + " strand\n"
            Logger.static_write(args.log_file, info)
            Demultiplex(args.input_R1, args.input_R2, args.input_barcode, args.mismatches, args.end, tmp_files, R1_files, R2_files, demultiplex_err_files1, demultiplex_err_files2, demultiplex_log).submit( args.log_file )
            # OI DEBUG
            for r1file in R1_files:
                Logger.static_write(args.log_file, "\n\t#Archive demultiplexed files is " + r1file + " \n")
        else:
            # Double barcode: first pass on the forward barcodes, then one pass by forward
            # barcode on the reverse barcodes.
            split_barcode_file(args.input_barcode, tmp_barcode_files, tmp_files)
            info = "\n#Demultiplexing " + os.path.basename(args.input_R1)
            if paired:
                info += " and " + os.path.basename(args.input_R2)
            info += " with " + os.path.basename(tmp_barcode_files[0]) + " in bol strand\n"
            Logger.static_write(args.log_file, info)
            Demultiplex(args.input_R1, args.input_R2, tmp_barcode_files[0], args.mismatches, "bol", tmp_files, tmp_R1_files, tmp_R2_files, demultiplex_err_files1, demultiplex_err_files2, demultiplex_log).submit( args.log_file )
            for idx, read1_file in enumerate(tmp_R1_files):
                bc = os.path.basename(read1_file).replace("_R1.fastq", "")
                reverse_bc_file = os.path.join(tmp_files.tmp_dir, bc + "_reverse_bc")
                if reverse_bc_file not in tmp_barcode_files:
                    continue
                if os.stat(read1_file).st_size == 0:
                    # No read for this forward barcode: nothing to split on the reverse one.
                    continue
                read2_file = tmp_R2_files[idx] if paired else None
                info = "\n#Demultiplexing " + os.path.basename(read1_file)
                if paired:
                    info += " and " + os.path.basename(read2_file)
                info += " with " + bc + "_reverse_bc" + " in eol strand\n"
                Logger.static_write(args.log_file, info)
                Demultiplex(read1_file, read2_file, reverse_bc_file, args.mismatches, "eol", tmp_files, R1_files, R2_files, demultiplex_err_files1, demultiplex_err_files2, demultiplex_log).submit( args.log_file )

        Logger.static_write(args.log_file, "\n#Summarising result\n")
        summarise_results( args.summary, args.input_barcode, demultiplex_log )
        Logger.static_write(args.log_file, "\n#Concatenation of undemultiplexed files 1\n")
        concat_files(demultiplex_err_files1, excluded_R1_file )
        if len(R2_files) > 0:
            Logger.static_write(args.log_file, "\n#Concatenation of undemultiplexed files 2\n")
            concat_files(demultiplex_err_files2, excluded_R2_file )
            Logger.static_write(args.log_file, "\n#Archive demultiplexed R1 and R2 files\n")
            Archive(R1_files + R2_files, args.output_demultiplexed).submit( args.log_file )
            Logger.static_write(args.log_file, "\n#Archive undemultiplexed R1 and R2 files\n")
            Archive([excluded_R1_file, excluded_R2_file], args.output_excluded).submit( args.log_file )
        else:
            Logger.static_write(args.log_file, "\n#Archive demultiplexed files\n")
            Archive(R1_files, args.output_demultiplexed).submit( args.log_file )
            Logger.static_write(args.log_file, "\n#Archive undemultiplexed files\n")
            Logger.static_write(args.log_file, "\n#Excluded_R1_file is " + excluded_R1_file + "\n")
            Archive([excluded_R1_file], args.output_excluded).submit( args.log_file )

    # Remove temporary files
    finally:
        # The cleanup is back in service so that --debug decides again whether temporary files are kept.
        # NOTE(review): some registered files have already been moved by Archive; this assumes
        # TmpFiles.deleteAll() ignores missing paths - confirm against frogsUtils.
        if not args.debug:
            Logger.static_write(args.log_file, "\n#Removing temporary files\n")
            tmp_files.deleteAll()