annotate demultiplex.py.save @ 0:da4101033e10 draft default tip

planemo upload
author oinizan
date Wed, 18 Oct 2017 05:30:40 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
da4101033e10 planemo upload
oinizan
parents:
diff changeset
1 #!/usr/bin/env python2.7
da4101033e10 planemo upload
oinizan
parents:
diff changeset
2 #
da4101033e10 planemo upload
oinizan
parents:
diff changeset
3 # Copyright (C) 2014 INRA
da4101033e10 planemo upload
oinizan
parents:
diff changeset
4 #
da4101033e10 planemo upload
oinizan
parents:
diff changeset
5 # This program is free software: you can redistribute it and/or modify
da4101033e10 planemo upload
oinizan
parents:
diff changeset
6 # it under the terms of the GNU General Public License as published by
da4101033e10 planemo upload
oinizan
parents:
diff changeset
7 # the Free Software Foundation, either version 3 of the License, or
da4101033e10 planemo upload
oinizan
parents:
diff changeset
8 # (at your option) any later version.
da4101033e10 planemo upload
oinizan
parents:
diff changeset
9 #
da4101033e10 planemo upload
oinizan
parents:
diff changeset
10 # This program is distributed in the hope that it will be useful,
da4101033e10 planemo upload
oinizan
parents:
diff changeset
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
da4101033e10 planemo upload
oinizan
parents:
diff changeset
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
da4101033e10 planemo upload
oinizan
parents:
diff changeset
13 # GNU General Public License for more details.
da4101033e10 planemo upload
oinizan
parents:
diff changeset
14 #
da4101033e10 planemo upload
oinizan
parents:
diff changeset
15 # You should have received a copy of the GNU General Public License
da4101033e10 planemo upload
oinizan
parents:
diff changeset
16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
da4101033e10 planemo upload
oinizan
parents:
diff changeset
17 #
da4101033e10 planemo upload
oinizan
parents:
diff changeset
18
da4101033e10 planemo upload
oinizan
parents:
diff changeset
19 __author__ = 'Plateforme bioinformatique Toulouse and SIGENAE'
da4101033e10 planemo upload
oinizan
parents:
diff changeset
20 __copyright__ = 'Copyright (C) 2015 INRA'
da4101033e10 planemo upload
oinizan
parents:
diff changeset
21 __license__ = 'GNU General Public License'
da4101033e10 planemo upload
oinizan
parents:
diff changeset
22 __version__ = '1.2.0'
da4101033e10 planemo upload
oinizan
parents:
diff changeset
23 __email__ = 'frogs@inra.fr'
da4101033e10 planemo upload
oinizan
parents:
diff changeset
24 __status__ = 'prod'
da4101033e10 planemo upload
oinizan
parents:
diff changeset
25
da4101033e10 planemo upload
oinizan
parents:
diff changeset
26 import os
da4101033e10 planemo upload
oinizan
parents:
diff changeset
27 import sys
da4101033e10 planemo upload
oinizan
parents:
diff changeset
28 import gzip
da4101033e10 planemo upload
oinizan
parents:
diff changeset
29 import time
da4101033e10 planemo upload
oinizan
parents:
diff changeset
30 import tarfile
da4101033e10 planemo upload
oinizan
parents:
diff changeset
31 import argparse
da4101033e10 planemo upload
oinizan
parents:
diff changeset
32
da4101033e10 planemo upload
oinizan
parents:
diff changeset
# Directory that contains this script; "libexec" and "lib" are looked up next
# to it (i.e. in its parent directory).
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
# PATH: libexec is placed first so its programs take precedence.
BIN_DIR = os.path.abspath(os.path.join(os.path.dirname(CURRENT_DIR), "libexec"))
if os.environ.get('PATH'):
    os.environ['PATH'] = BIN_DIR + os.pathsep + os.environ['PATH']
else:
    # PATH unset or empty: do not raise KeyError and do not leave a dangling
    # separator (an empty PATH entry designates the current directory).
    os.environ['PATH'] = BIN_DIR
# PYTHONPATH: lib is appended both for this interpreter (sys.path) and for
# child processes (environment variable).
LIB_DIR = os.path.abspath(os.path.join(os.path.dirname(CURRENT_DIR), "lib"))
sys.path.append(LIB_DIR)
if os.environ.get('PYTHONPATH'):
    os.environ['PYTHONPATH'] = os.environ['PYTHONPATH'] + os.pathsep + LIB_DIR
else:
    os.environ['PYTHONPATH'] = LIB_DIR
da4101033e10 planemo upload
oinizan
parents:
diff changeset
42
da4101033e10 planemo upload
oinizan
parents:
diff changeset
43 from frogsUtils import *
da4101033e10 planemo upload
oinizan
parents:
diff changeset
44
da4101033e10 planemo upload
oinizan
parents:
diff changeset
45
da4101033e10 planemo upload
oinizan
parents:
diff changeset
46 ##################################################################################################################################################
da4101033e10 planemo upload
oinizan
parents:
diff changeset
47 #
da4101033e10 planemo upload
oinizan
parents:
diff changeset
48 # COMMAND LINES
da4101033e10 planemo upload
oinizan
parents:
diff changeset
49 #
da4101033e10 planemo upload
oinizan
parents:
diff changeset
50 ##################################################################################################################################################
da4101033e10 planemo upload
oinizan
parents:
diff changeset
class Demultiplex(Cmd):
    """
    @summary : Demultiplex samples.
    """
    def __init__(self, R1_input_file, R2_input_file, barcode_file, mismatches, end, global_tmp_files, R1_output_files, R2_output_files, demultiplex_err_files1, demultiplex_err_files2, demultiplex_log):
        """
        @param R1_input_file : [str] Path to the R1 fastq file.
        @param R2_input_file : [str] Path to the R2 fastq file, or None for single-end data.
        @param barcode_file : [str] Path to barcodes and samples (one line by sample) description file. Line format : SAMPLE_NAME<TAB>BARCODE.
        @param mismatches : [int] Number of mismatches allowed
        @param end : [str] barcode ends ? forward : bol or reverse : eol (def bol)
        @param global_tmp_files : [TmpFiles] Temporary files manager. Its tmp_dir and prefix locate the working folder; the folder and every expected output file are registered in its dirs and files lists.
        @param R1_output_files : [list] Paths to the R1 fastq files (one by sample). User provides an empty list.
        @param R2_output_files : [list] Paths to the R2 fastq files (one by sample). User provides an empty list. Filled only when R2_input_file is provided.
        @param demultiplex_err_files1 : [list] Paths to the R1 files with ambiguous and unmatched reads. User provides an empty list.
        @param demultiplex_err_files2 : [list] Paths to the R2 files with ambiguous and unmatched reads. User provides an empty list. Filled only when R2_input_file is provided.
        @param demultiplex_log : [str] Path to the file where the command standard output is appended.
        """
        is_paired = R2_input_file is not None

        tmp_files = TmpFiles( global_tmp_files.tmp_dir )

        tmp_folder = os.path.join( global_tmp_files.tmp_dir, global_tmp_files.prefix + "_tmp", tmp_files.prefix )
        global_tmp_files.dirs.append(tmp_folder)
        if not os.path.exists(tmp_folder):
            # NOTE(review): os.mkdir needs the parent "<prefix>_tmp" folder to exist already - presumably created by the caller; confirm.
            os.mkdir(tmp_folder)

        # Update output data
        self.samples_names = list()
        with open( barcode_file ) as FH_barcode:
            for line in FH_barcode:
                if not line.strip():
                    continue  # A blank line describes no sample
                # The barcode is the last field: the sample name keeps any inner whitespace
                sample_name, barcode = line.strip().rsplit(None, 1)
                sample_R1 = os.path.join(tmp_folder, sample_name + '_R1.fastq')
                R1_output_files.append( sample_R1 )
                global_tmp_files.files.append( sample_R1 )
                if is_paired:
                    sample_R2 = os.path.join(tmp_folder, sample_name + '_R2.fastq')
                    R2_output_files.append( sample_R2 )
                    global_tmp_files.files.append( sample_R2 )
                self.samples_names.append( sample_name.replace(' ', '_') )

        self.R1_input_file = R1_input_file
        self.ambiguous = os.path.join(tmp_folder, 'ambiguous_R1.fastq')
        self.unmatched = os.path.join(tmp_folder, 'unmatched_R1.fastq')
        demultiplex_err_files1.extend( [self.ambiguous, self.unmatched] )
        global_tmp_files.files.extend( [self.ambiguous, self.unmatched] )
        if is_paired:
            err_files_R2 = [os.path.join(tmp_folder, 'ambiguous_R2.fastq'), os.path.join(tmp_folder, 'unmatched_R2.fastq')]
            demultiplex_err_files2.extend( err_files_R2 )
            global_tmp_files.files.extend( err_files_R2 )

        # Set class: the paired-end command only differs by the R2 input and the R2 output pattern
        cmd_parameters = R1_input_file
        if is_paired:
            cmd_parameters += ' ' + R2_input_file
        cmd_parameters += ' --' + end + ' --bcfile ' + barcode_file + ' --mismatches ' + str(mismatches) + ' --trim --no_adapt --prefix-r1 ' + os.path.join(tmp_folder, '%_R1.fastq')
        if is_paired:
            cmd_parameters += ' --prefix-r2 ' + os.path.join(tmp_folder, '%_R2.fastq')
        cmd_parameters += ' >> ' + demultiplex_log
        Cmd.__init__( self,
                      'perl splitbc.pl',
                      'Demultiplex reads.',
                      cmd_parameters,
                      None )

    def parser(self, log_file):
        """
        @summary : Parse the command results to add information in log_file.
        @param log_file : [str] Path to the sample process log file.
        """
        # Parse output
        nb_seq_before = get_fastq_nb_seq(self.R1_input_file)
        nb_seq_unmatched = get_fastq_nb_seq(self.unmatched)
        nb_seq_ambiguous = get_fastq_nb_seq(self.ambiguous)
        # Write result
        FH_log = Logger( log_file )
        FH_log.write( 'Results :\n' )
        FH_log.write( '\tnb seq before demultiplexing : ' + str(nb_seq_before) + '\n' )
        FH_log.write( '\tnb seq after process matched : ' + str(nb_seq_before - nb_seq_unmatched) + '\n' )
        FH_log.write( '\tnb seq after process non-ambiguous : ' + str(nb_seq_before - nb_seq_unmatched - nb_seq_ambiguous) + '\n' )
        FH_log.close()

    def get_version(self):
        """
        @summary : Returns the program version number.
        @return : version number if this is possible, otherwise this method return 'unknown'.
        """
        return Cmd.get_version(self, 'stdout')
da4101033e10 planemo upload
oinizan
parents:
diff changeset
134
da4101033e10 planemo upload
oinizan
parents:
diff changeset
135
da4101033e10 planemo upload
oinizan
parents:
diff changeset
class Archive(Cmd):
    """
    @summary : Creates an archive with files.
    """
    def __init__(self, archived_files, archive_path):
        """
        @param archived_files: [list] Files added in final archive. Files located outside the temporary folder are moved into it.
        @param archive_path: [str] Path to the new archive.
        """
        # Validate before touching the disk so a failure leaves no temporary folder behind
        if not archived_files:
            raise Exception( "At least one file must be add to the archive '" + archive_path + "'." )

        tmp_files = TmpFiles( os.path.dirname(archive_path) )
        tmp_folder = os.path.join( tmp_files.tmp_dir, tmp_files.prefix )
        tmp_files.dirs.append(tmp_folder)
        if not os.path.exists(tmp_folder):
            os.makedirs(tmp_folder)

        # Gather every file in one folder: tar is run with -C on this folder and bare file names
        archived_basenames = list()
        for current in archived_files:
            current_basename = os.path.basename(current)
            if os.path.dirname(current) != tmp_folder:
                moved_path = os.path.join(tmp_folder, current_basename)
                os.rename(current, moved_path)
                tmp_files.files.append(moved_path)
            archived_basenames.append(current_basename)

        Cmd.__init__( self,
                      'tar',
                      'Archives files.',
                      '-zcf ' + archive_path + ' -C ' + tmp_folder + " " + " ".join(archived_basenames),
                      None )

        self.Files = tmp_files

    def parser(self, log_file):
        """
        @summary : Deletes the temporary folder and files registered while preparing the archive.
        @param log_file : [str] Path to the log file (unused by this method).
        """
        self.Files.deleteAll()
da4101033e10 planemo upload
oinizan
parents:
diff changeset
174
da4101033e10 planemo upload
oinizan
parents:
diff changeset
175
da4101033e10 planemo upload
oinizan
parents:
diff changeset
176 ##################################################################################################################################################
da4101033e10 planemo upload
oinizan
parents:
diff changeset
177 #
da4101033e10 planemo upload
oinizan
parents:
diff changeset
178 # FUNCTIONS
da4101033e10 planemo upload
oinizan
parents:
diff changeset
179 #
da4101033e10 planemo upload
oinizan
parents:
diff changeset
180 ##################################################################################################################################################
da4101033e10 planemo upload
oinizan
parents:
diff changeset
def is_gzip( file ):
    """
    @summary: Tells whether a file can be read as gzip by trying to read its first line.
    @return: [bool] True if the file is gziped.
    @param file : [str] Path to processed file.
    """
    import zlib  # Local import: only needed to name the decompression error

    with gzip.open( file ) as FH_input:
        try:
            FH_input.readline()
        except (IOError, OSError, EOFError, zlib.error):
            # Only read/decompression failures mean "not gzip"; KeyboardInterrupt
            # and programming errors are no longer swallowed.
            return False
    return True
da4101033e10 planemo upload
oinizan
parents:
diff changeset
196
da4101033e10 planemo upload
oinizan
parents:
diff changeset
def split_barcode_file( barcode_file, barcodes_file_list, global_tmp_files ):
    """
    @summary: In case of double multiplexing, splits the barcode file in one forward barcode file and one reverse barcode file per forward barcode.
    @param barcode_file: [str] Path to the input barcode file. Line format : SAMPLE_NAME BARCODE_FW BARCODE_RV (whitespace separated).
    @param barcodes_file_list: [list] Filled with the paths to the output barcode files: the forward file first, then the reverse files.
    @param global_tmp_files: [TmpFiles] Temporary files manager. Output files are written in its tmp_dir and registered in its files list.
    """
    out_dir = global_tmp_files.tmp_dir
    barcode_dict = {}
    with open(barcode_file, "r") as barcode_input:
        for l in barcode_input:
            if not l.strip():
                continue  # A blank line describes no sample
            [s, f, r] = l.strip().split()
            # Forward file: each distinct forward barcode once, used as its own name
            forward_line = f + "\t" + f
            if "forward_bc" not in barcode_dict:
                barcode_dict["forward_bc"] = [forward_line]
            elif forward_line not in barcode_dict["forward_bc"]:
                barcode_dict["forward_bc"].append(forward_line)
            # Reverse file of this forward barcode: sample name and reverse barcode
            reverse_key = f + "_reverse_bc"
            if reverse_key not in barcode_dict:
                barcode_dict[reverse_key] = [s + "\t" + r]
            else:
                barcode_dict[reverse_key].append(s + "\t" + r)

    # The forward file is always the first element added to the list
    forward_lines = barcode_dict.pop("forward_bc")
    forward_path = os.path.join(out_dir, "forward_bc")
    barcodes_file_list.append(forward_path)
    global_tmp_files.files.append(forward_path)
    with open(forward_path, "w") as FH_out:
        FH_out.write("\n".join(forward_lines) + "\n")

    for bc_file in barcode_dict:
        reverse_path = os.path.join(out_dir, bc_file)
        barcodes_file_list.append(reverse_path)
        global_tmp_files.files.append(reverse_path)
        with open(reverse_path, "w") as FH_out:
            FH_out.write("\n".join(barcode_dict[bc_file]) + "\n")
da4101033e10 planemo upload
oinizan
parents:
diff changeset
231
da4101033e10 planemo upload
oinizan
parents:
diff changeset
def get_fastq_nb_seq( fastq_file ):
    """
    @summary: Returns the number of sequences in fastq_file.
    @param fastq_file: [str] Path to the fastq file processed (plain or gziped).
    @return: [int] The number of sequences: the number of lines divided by 4, rounded down.
    """
    opener = gzip.open if is_gzip(fastq_file) else open
    nb_line = 0
    with opener( fastq_file ) as FH_input:
        for line in FH_input:
            nb_line += 1
    # Floor division: the count stays an integer under Python 3 as well as Python 2
    return nb_line // 4
da4101033e10 planemo upload
oinizan
parents:
diff changeset
249
da4101033e10 planemo upload
oinizan
parents:
diff changeset
def concat_files(list_input, output_file):
    """
    @summary: Concatenates the content of several files into one file.
    @param list_input: [list] Paths to the files to concatenate, in the order they must appear in the output.
    @param output_file: [str] Path to the output file. It is created or overwritten, and left empty if list_input is empty.
    """
    import shutil  # Local import: keeps this function usable without touching the file header

    with open(output_file, "w") as FH_out:
        for current_input in list_input:
            with open(current_input) as FH_in:
                # Chunked copy: bounded memory without a hand-made line buffer
                shutil.copyfileobj(FH_in, FH_out)
da4101033e10 planemo upload
oinizan
parents:
diff changeset
268
da4101033e10 planemo upload
oinizan
parents:
diff changeset
def summarise_results( summary_file, barcode_file, log_file ):
    """
    @summary: Writes the number of sequences by sample, plus the unmatched and ambiguous counts, from the demultiplexing log.
    @param summary_file: [str] Path to the output file (format: TSV, header "#sample<TAB>count").
    @param barcode_file: [str] Path to the barcode file. The first field of each line is the sample name.
    @param log_file: [str] Path to the demultiplexing log. Lines starting with "Barcode" or "total" are ignored; for the other lines the first field is the category and the second one its count.
    """
    # Every expected category starts at 0 so that it is reported even when absent from the log
    sample_dict = dict()
    with open(barcode_file) as FH_barcode:
        for line in FH_barcode:
            fields = line.split()
            if fields:  # A blank line describes no sample
                sample_dict[fields[0]] = 0
    sample_dict["unmatched"] = 0
    sample_dict["ambiguous"] = 0

    with open(log_file, "r") as FH_log:
        for line in FH_log:
            if line.startswith(("Barcode", "total")):
                continue
            # An opening parenthesis glued to the count is turned into a field separator
            fields = line.replace('(', '\t').split()
            if fields and fields[0] in sample_dict:
                # Counts are summed: the same category may appear on several lines
                sample_dict[fields[0]] += int(fields[1])

    with open(summary_file, "w") as FH_summary:
        FH_summary.write( "#sample\tcount\n" )
        for s in sample_dict:
            FH_summary.write(s + '\t' + str(sample_dict[s]) + '\n')
da4101033e10 planemo upload
oinizan
parents:
diff changeset
297
da4101033e10 planemo upload
oinizan
parents:
diff changeset
298
da4101033e10 planemo upload
oinizan
parents:
diff changeset
299 ##################################################################################################################################################
da4101033e10 planemo upload
oinizan
parents:
diff changeset
300 #
da4101033e10 planemo upload
oinizan
parents:
diff changeset
301 # MAIN
da4101033e10 planemo upload
oinizan
parents:
diff changeset
302 #
da4101033e10 planemo upload
oinizan
parents:
diff changeset
303 ##################################################################################################################################################
da4101033e10 planemo upload
oinizan
parents:
diff changeset
def _demultiplexing_header(read1_path, read2_path, barcode_name, strand):
    """Return the log header announcing one demultiplexing pass.

    read1_path   -- path of the R1 file processed by the pass.
    read2_path   -- path of the R2 file processed by the pass, or None when
                    there is no R2 file.
    barcode_name -- barcode file name displayed in the header.
    strand       -- read end reported in the header ("bol" or "eol").
    """
    header = "\n#Demultiplexing " + os.path.basename(read1_path)
    if read2_path is not None:
        header += " and " + os.path.basename(read2_path)
    return header + " with " + barcode_name + " in " + strand + " strand\n"


if __name__ == "__main__":
    # Manage parameters
    parser = argparse.ArgumentParser(
        description='Split by samples the reads in function of inner barcode.'
    )
    parser.add_argument('-m', '--mismatches', type=int, default=0, help="Number of mismatches allowed in barcode. [Default: %(default)s]")
    # NOTE: any value other than "bol" or "eol" is processed as "both" (see the dispatch below).
    parser.add_argument('-e', '--end', type=str, default="bol", help="barcode is at the begining of the forward end (bol) or of the reverse (eol) or both (both). [Default: %(default)s]")
    parser.add_argument('--debug', default=False, action='store_true', help="Keep temporary files to debug program.")
    parser.add_argument('-v', '--version', action='version', version=__version__)
    # Inputs
    group_input = parser.add_argument_group('Inputs')
    group_input.add_argument('--input-R1', required=True, help='The R1 sequence file with all samples (format: fastq).')
    group_input.add_argument('--input-R2', default=None, help='The R2 sequence file with all samples (format: fastq).')
    # The barcode file is dereferenced unconditionally below, so it is declared
    # as required: omitting it now yields a usage error instead of a traceback.
    group_input.add_argument('--input-barcode', required=True, help='This file describes barcodes and samples (one line by sample). Line format : SAMPLE_NAME<TAB>BARCODE or SAMPLE_NAME<TAB>BARCODE_FW<TAB>BARCODE_RV.')
    # Outputs
    group_output = parser.add_argument_group('Outputs')
    group_output.add_argument('--output-demultiplexed', default="demultiplexed_read.tar.gz", help='The tar file containing R1 files and R2 files for each sample (format: tar). [Default: %(default)s]')
    group_output.add_argument('--output-excluded', default="undemultiplexed_read.tar.gz", help='The tar file containing R1 files and R2 files not demultiplexed (format: tar). [Default: %(default)s]')
    group_output.add_argument('-s', '--summary', default='summary.tsv', help='TSV file with summary of filters results (format: TSV). [Default: %(default)s]')
    group_output.add_argument('-l', '--log-file', default=sys.stdout, help='This output file will contain several information on executed commands.')
    args = parser.parse_args()
    prevent_shell_injections(args)

    Logger.static_write(args.log_file, "## Application\nSoftware :" + sys.argv[0] + " (version : " + str(__version__) + ")\nCommand : " + " ".join(sys.argv) + "\n\n")

    # Process
    # Lists handed to the Demultiplex calls below.
    R1_files = []
    R2_files = []
    tmp_barcode_files = []
    tmp_R1_files = []
    tmp_R2_files = []
    demultiplex_err_files1 = []
    demultiplex_err_files2 = []

    # Working files are created next to the demultiplexed archive.
    out_dir = os.path.dirname(args.output_demultiplexed)
    excluded_R1_file = os.path.join(out_dir, os.path.basename(args.input_R1) + "_excluded_demult")
    # Always bound so that the archiving step below can never hit a NameError.
    excluded_R2_file = None
    if args.input_R2 is not None:
        excluded_R2_file = os.path.join(out_dir, os.path.basename(args.input_R2) + "_excluded_demult")

    tmp_files = TmpFiles(out_dir)
    demultiplex_log = tmp_files.add("Demult.log")
    tmp_folder = tmp_files.add_dir("tmp")
    os.mkdir(tmp_folder)

    try:
        if args.end == "bol" or args.end == "eol":
            # Single pass on the requested end, using the barcode file as provided.
            Logger.static_write(args.log_file, _demultiplexing_header(args.input_R1, args.input_R2, os.path.basename(args.input_barcode), args.end))
            Demultiplex(args.input_R1, args.input_R2, args.input_barcode, args.mismatches, args.end, tmp_files, R1_files, R2_files, demultiplex_err_files1, demultiplex_err_files2, demultiplex_log).submit(args.log_file)
        else:
            # Two passes: a first "bol" pass on the input reads with
            # tmp_barcode_files[0], then an "eol" pass on each non-empty
            # intermediate R1 file that has a matching "<name>_reverse_bc" file.
            split_barcode_file(args.input_barcode, tmp_barcode_files, tmp_files)
            Logger.static_write(args.log_file, _demultiplexing_header(args.input_R1, args.input_R2, os.path.basename(tmp_barcode_files[0]), "bol"))
            Demultiplex(args.input_R1, args.input_R2, tmp_barcode_files[0], args.mismatches, "bol", tmp_files, tmp_R1_files, tmp_R2_files, demultiplex_err_files1, demultiplex_err_files2, demultiplex_log).submit(args.log_file)
            for idx, read1_file in enumerate(tmp_R1_files):
                bc = os.path.basename(read1_file).replace("_R1.fastq", "")
                reverse_bc_file = os.path.join(tmp_files.tmp_dir, bc + "_reverse_bc")
                if reverse_bc_file not in tmp_barcode_files:
                    continue
                if os.stat(read1_file).st_size == 0:
                    continue
                read2_file = tmp_R2_files[idx] if args.input_R2 is not None else None
                Logger.static_write(args.log_file, _demultiplexing_header(read1_file, read2_file, bc + "_reverse_bc", "eol"))
                Demultiplex(read1_file, read2_file, reverse_bc_file, args.mismatches, "eol", tmp_files, R1_files, R2_files, demultiplex_err_files1, demultiplex_err_files2, demultiplex_log).submit(args.log_file)

        Logger.static_write(args.log_file, "\n#Summarising result\n")
        summarise_results(args.summary, args.input_barcode, demultiplex_log)

        Logger.static_write(args.log_file, "\n#Concatenation of undemultiplexed files 1\n")
        concat_files(demultiplex_err_files1, excluded_R1_file)
        if R2_files:
            Logger.static_write(args.log_file, "\n#Concatenation of undemultiplexed files 2\n")
            concat_files(demultiplex_err_files2, excluded_R2_file)
            Logger.static_write(args.log_file, "\n#Archive demultiplexed R1 and R2 files\n")
            Archive(R1_files + R2_files, args.output_demultiplexed).submit(args.log_file)
            Logger.static_write(args.log_file, "\n#Archive undemultiplexed R1 and R2 files\n")
            Archive([excluded_R1_file, excluded_R2_file], args.output_excluded).submit(args.log_file)
        else:
            Logger.static_write(args.log_file, "\n#Archive demultiplexed files\n")
            Archive(R1_files, args.output_demultiplexed).submit(args.log_file)
            Logger.static_write(args.log_file, "\n#Archive undemultiplexed files\n")
            Archive([excluded_R1_file], args.output_excluded).submit(args.log_file)

    # Remove temporary files (kept when --debug is set)
    finally:
        if not args.debug:
            Logger.static_write(args.log_file, "\n#Removing temporary files\n")
            tmp_files.deleteAll()
da4101033e10 planemo upload
oinizan
parents:
diff changeset
403