comparison scripts/S01_find_orf_on_multiple_alignment.py @ 2:0d2f72caea10 draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 44a89d5eeb82789bfc643b33c11f391281b6374b
author abims-sbr
date Wed, 27 Sep 2017 10:03:05 -0400
parents 567d5b771a90
children ff98ed7849fa
comparison
equal deleted inserted replaced
1:567d5b771a90 2:0d2f72caea10
57 return(bash_codeUniversel) 57 return(bash_codeUniversel)
58 ########################################################### 58 ###########################################################
59 59
60 60
61 ###################################################################################################################### 61 ######################################################################################################################
62 ##### DEF 3 : Test if the sequence is a multiple of 3, and if not correct the sequence to become a multiple of 3 ##### 62 ##### DEF 3 : Test if the sequence is a multiple of 3, and if not correct the sequence to become a multiple of 3 #####
63 ###################################################################################################################### 63 ######################################################################################################################
64 ### WEAKNESS OF THAT APPROACH = I remove extra base(s) at the end of the sequence ==> I can lost a codon, when I test ORF (as I will decay the ORF) 64 ### WEAKNESS OF THAT APPROACH = I remove extra base(s) at the end of the sequence ==> I can lost a codon, when I test ORF (as I will decay the ORF)
65 def multiple3(seq): 65 def multiple3(seq):
66 leng = len(seq) 66 leng = len(seq)
67 modulo = leng%3 67 modulo = leng%3
68 if modulo == 0: # the results of dividing leng per 3 is an integer 68 if modulo == 0: # the results of dividing leng per 3 is an integer
69 new_seq = seq 69 new_seq = seq
70 elif modulo == 1: # means 1 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) 70 elif modulo == 1: # means 1 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3)
71 new_seq = seq[:-1] # remove the last nc 71 new_seq = seq[:-1] # remove the last nc
72 elif modulo == 2: # means 2 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) 72 elif modulo == 2: # means 2 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3)
73 new_seq = seq[:-2] # remove the 2 last nc 73 new_seq = seq[:-2] # remove the 2 last nc
74 len1 = len(new_seq) 74 len1 = len(new_seq)
75 return(new_seq, modulo) 75 return(new_seq, modulo)
76 ########################################################## 76 ##########################################################
77 77
78 78
79 ############################# 79 #############################
94 base1 = string.capitalize(base1) 94 base1 = string.capitalize(base1)
95 base2 = seq_dna[i+1] 95 base2 = seq_dna[i+1]
96 base2 = string.capitalize(base2) 96 base2 = string.capitalize(base2)
97 base3 = seq_dna[i+2] 97 base3 = seq_dna[i+2]
98 base3 = string.capitalize(base3) 98 base3 = string.capitalize(base3)
99 99
100 codon = base1+base2+base3 100 codon = base1+base2+base3
101 codon = string.replace(codon, "T", "U") 101 codon = string.replace(codon, "T", "U")
102 102
103 if codon in bash_codeUniversel.keys(): 103 if codon in bash_codeUniversel.keys():
104 aa = bash_codeUniversel[codon] 104 aa = bash_codeUniversel[codon]
105 seq_aa = seq_aa + aa 105 seq_aa = seq_aa + aa
106 else: 106 else:
107 seq_aa = seq_aa +"?" ### Take account for gap "-" and "N" 107 seq_aa = seq_aa +"?" ### Take account for gap "-" and "N"
108 i = i + 3 108 i = i + 3
109 109
110 return(seq_aa) 110 return(seq_aa)
111 ########################################################## 111 ##########################################################
112 112
113 113
114 ###### DEF 4 - Part 2 - ###### 114 ###### DEF 4 - Part 2 - ######
115 ############################## 115 ##############################
116 def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel): 116 def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel):
117 117
118 ## 1 ## Get the list of aligned aa seq for the 3 ORF: 118 ## 1 ## Get the list of aligned aa seq for the 3 ORF:
119 bash_of_aligned_aa_seq_3ORF = {} 119 bash_of_aligned_aa_seq_3ORF = {}
120 bash_of_aligned_nuc_seq_3ORF = {} 120 bash_of_aligned_nuc_seq_3ORF = {}
121 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = [] 121 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = []
122 for fasta_name in bash_aligned_nc_seq.keys(): 122 for fasta_name in bash_aligned_nc_seq.keys():
123 ## 1.1. ## Get the raw sequence 123 ## 1.1. ## Get the raw sequence
124 sequence_nc = bash_aligned_nc_seq[fasta_name] 124 sequence_nc = bash_aligned_nc_seq[fasta_name]
125 125
126 ## 1.2. ## Check whether the sequence is multiple of 3, and correct it if not: 126 ## 1.2. ## Check whether the sequence is multiple of 3, and correct it if not:
127 new_sequence_nc, modulo = multiple3(sequence_nc) ### DEF 3 ### 127 new_sequence_nc, modulo = multiple3(sequence_nc) ### DEF 3 ###
128 128
129 ## 1.3. ## Get the 3 ORFs (nuc) for each sequence 129 ## 1.3. ## Get the 3 ORFs (nuc) for each sequence
130 seq_nuc_ORF1 = new_sequence_nc 130 seq_nuc_ORF1 = new_sequence_nc
131 seq_nuc_ORF2 = new_sequence_nc[1:-2] 131 seq_nuc_ORF2 = new_sequence_nc[1:-2]
132 seq_nuc_ORF3 = new_sequence_nc[2:-1] 132 seq_nuc_ORF3 = new_sequence_nc[2:-1]
133 seq_reversed=ReverseComplement2(seq_nuc_ORF1) 133 seq_reversed=ReverseComplement2(seq_nuc_ORF1)
134 seq_nuc_ORF4=seq_reversed 134 seq_nuc_ORF4=seq_reversed
135 seq_nuc_ORF5=seq_reversed[1:-2] 135 seq_nuc_ORF5=seq_reversed[1:-2]
136 seq_nuc_ORF6=seq_reversed[2:-1] 136 seq_nuc_ORF6=seq_reversed[2:-1]
137 137
138 LIST_6_ORF_nuc = [seq_nuc_ORF1, seq_nuc_ORF2, seq_nuc_ORF3,seq_nuc_ORF4,seq_nuc_ORF5,seq_nuc_ORF6] 138 LIST_6_ORF_nuc = [seq_nuc_ORF1, seq_nuc_ORF2, seq_nuc_ORF3,seq_nuc_ORF4,seq_nuc_ORF5,seq_nuc_ORF6]
139 bash_of_aligned_nuc_seq_3ORF[fasta_name] = LIST_6_ORF_nuc ### For each seq of the multialignment => give the 6 ORFs (in nuc) 139 bash_of_aligned_nuc_seq_3ORF[fasta_name] = LIST_6_ORF_nuc ### For each seq of the multialignment => give the 6 ORFs (in nuc)
140 140
141 ## 1.4. ## Get the 3 ORFs (aa) for each sequence 141 ## 1.4. ## Get the 3 ORFs (aa) for each sequence
142 seq_prot_ORF1 = simply_get_ORF(seq_nuc_ORF1,bash_codeUniversel) ### DEF 4 - Part 1 - ## 142 seq_prot_ORF1 = simply_get_ORF(seq_nuc_ORF1,bash_codeUniversel) ### DEF 4 - Part 1 - ##
143 seq_prot_ORF2 = simply_get_ORF(seq_nuc_ORF2,bash_codeUniversel) ### DEF 4 - Part 1 - ## 143 seq_prot_ORF2 = simply_get_ORF(seq_nuc_ORF2,bash_codeUniversel) ### DEF 4 - Part 1 - ##
144 seq_prot_ORF3 = simply_get_ORF(seq_nuc_ORF3,bash_codeUniversel) ### DEF 4 - Part 1 - ## 144 seq_prot_ORF3 = simply_get_ORF(seq_nuc_ORF3,bash_codeUniversel) ### DEF 4 - Part 1 - ##
145 seq_prot_ORF4 = simply_get_ORF(seq_nuc_ORF4,bash_codeUniversel) ### DEF 4 - Part 1 - ## 145 seq_prot_ORF4 = simply_get_ORF(seq_nuc_ORF4,bash_codeUniversel) ### DEF 4 - Part 1 - ##
146 seq_prot_ORF5 = simply_get_ORF(seq_nuc_ORF5,bash_codeUniversel) ### DEF 4 - Part 1 - ## 146 seq_prot_ORF5 = simply_get_ORF(seq_nuc_ORF5,bash_codeUniversel) ### DEF 4 - Part 1 - ##
147 seq_prot_ORF6 = simply_get_ORF(seq_nuc_ORF6,bash_codeUniversel) ### DEF 4 - Part 1 - ## 147 seq_prot_ORF6 = simply_get_ORF(seq_nuc_ORF6,bash_codeUniversel) ### DEF 4 - Part 1 - ##
148 148
149 LIST_6_ORF_aa = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3,seq_prot_ORF4,seq_prot_ORF5,seq_prot_ORF6] 149 LIST_6_ORF_aa = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3,seq_prot_ORF4,seq_prot_ORF5,seq_prot_ORF6]
150 bash_of_aligned_aa_seq_3ORF[fasta_name] = LIST_6_ORF_aa ### For each seq of the multialignment => give the 6 ORFs (in aa) 150 bash_of_aligned_aa_seq_3ORF[fasta_name] = LIST_6_ORF_aa ### For each seq of the multialignment => give the 6 ORFs (in aa)
151 151
152 ## 2 ## Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) 152 ## 2 ## Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF)
153 BEST_MAX = 0 153 BEST_MAX = 0
154 for i in [0,1,2,3,4,5]: ### Test the 6 ORFs 154 for i in [0,1,2,3,4,5]: ### Test the 6 ORFs
155 ORF_Aligned_aa = [] 155 ORF_Aligned_aa = []
156 ORF_Aligned_nuc = [] 156 ORF_Aligned_nuc = []
157 157
158 158
159 ## 2.1 ## Get the alignment of sequence for a given ORF 159 ## 2.1 ## Get the alignment of sequence for a given ORF
160 ## Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd 160 ## Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd
161 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): 161 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys():
162 ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i] 162 ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i]
163 aa_length = len(ORFsequence) 163 aa_length = len(ORFsequence)
164 ORF_Aligned_aa.append(ORFsequence) ### List of all sequences in the ORF nb "i" = 164 ORF_Aligned_aa.append(ORFsequence) ### List of all sequences in the ORF nb "i" =
165 165
166 n = i+1 166 n = i+1
167 167
168 for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys(): 168 for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys():
169 ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i] 169 ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i]
170 nuc_length = len(ORFsequence) 170 nuc_length = len(ORFsequence)
171 ORF_Aligned_nuc.append(ORFsequence) ### List of all sequences in the ORF nb "i" = 171 ORF_Aligned_nuc.append(ORFsequence) ### List of all sequences in the ORF nb "i" =
172 172
173 ## 2.2 ## Get the list of sublist of positions whithout codon stop in the alignment 173 ## 2.2 ## Get the list of sublist of positions whithout codon stop in the alignment
174 ## For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF) 174 ## For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF)
175 ## Next step is to get the longuest subsequence whithout stop 175 ## Next step is to get the longuest subsequence whithout stop
176 ## We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*" 176 ## We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*"
177 MAX_LENGTH = 0 177 MAX_LENGTH = 0
178 LONGUEST_SEGMENT_UNSTOPPED = "" 178 LONGUEST_SEGMENT_UNSTOPPED = ""
179 j = 0 # Start from first position in alignment 179 j = 0 # Start from first position in alignment
180 List_of_List_subsequences = [] 180 List_of_List_subsequences = []
181 List_positions_subsequence = [] 181 List_positions_subsequence = []
182 while j < aa_length: 182 while j < aa_length:
183 column = [] 183 column = []
184 for seq in ORF_Aligned_aa: 184 for seq in ORF_Aligned_aa:
185 column.append(seq[j]) 185 column.append(seq[j])
186 j = j+1 186 j = j+1
187 if "*" in column: 187 if "*" in column:
188 List_of_List_subsequences.append(List_positions_subsequence) ## Add previous list of positions 188 List_of_List_subsequences.append(List_positions_subsequence) ## Add previous list of positions
189 List_positions_subsequence = [] ## Re-initialyse list of positions 189 List_positions_subsequence = [] ## Re-initialyse list of positions
190 else: 190 else:
191 List_positions_subsequence.append(j) 191 List_positions_subsequence.append(j)
192 192
193 ## 2.3 ## Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF) 193 ## 2.3 ## Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF)
194 LONGUEST_SUBSEQUENCE_LIST_POSITION = [] 194 LONGUEST_SUBSEQUENCE_LIST_POSITION = []
195 MAX=0 195 MAX=0
196 for sublist in List_of_List_subsequences: 196 for sublist in List_of_List_subsequences:
197 if len(sublist) > MAX and len(sublist) > MINIMAL_CDS_LENGTH: 197 if len(sublist) > MAX and len(sublist) > MINIMAL_CDS_LENGTH:
198 MAX = len(sublist) 198 MAX = len(sublist)
199 LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist 199 LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist
200 200
201 ## 2.4. ## Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated) 201 ## 2.4. ## Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated)
202 if LONGUEST_SUBSEQUENCE_LIST_POSITION != []: 202 if LONGUEST_SUBSEQUENCE_LIST_POSITION != []:
203 if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0: 203 if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0:
204 CDS_maybe_truncated = 1 204 CDS_maybe_truncated = 1
205 else: 205 else:
206 CDS_maybe_truncated = 0 206 CDS_maybe_truncated = 0
207 else: 207 else:
208 CDS_maybe_truncated = 0 208 CDS_maybe_truncated = 0
209 209
210 210
211 ## 2.5 ## Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF) 211 ## 2.5 ## Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF)
212 ## Test whether it is the better ORF 212 ## Test whether it is the better ORF
213 if MAX > BEST_MAX: 213 if MAX > BEST_MAX:
214 BEST_MAX = MAX 214 BEST_MAX = MAX
237 ## 4 ## Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment 237 ## 4 ## Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment
238 pos_MIN_nuc = pos_MIN_aa * 3 238 pos_MIN_nuc = pos_MIN_aa * 3
239 pos_MAX_nuc = pos_MAX_aa * 3 239 pos_MAX_nuc = pos_MAX_aa * 3
240 240
241 BESTORF_bash_aligned_nc_seq = {} 241 BESTORF_bash_aligned_nc_seq = {}
242 BESTORF_bash_aligned_nc_seq_CODING = {} 242 BESTORF_bash_aligned_nc_seq_CODING = {}
243 for fasta_name in bash_aligned_nc_seq.keys(): 243 for fasta_name in bash_aligned_nc_seq.keys():
244 seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF] 244 seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF]
245 seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc] 245 seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc]
246 BESTORF_bash_aligned_nc_seq[fasta_name] = seq 246 BESTORF_bash_aligned_nc_seq[fasta_name] = seq
247 BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding 247 BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding
257 ### Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa 257 ### Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa
258 ########################################################################################################### 258 ###########################################################################################################
259 259
260 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} 260 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {}
261 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} 261 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {}
262 262
263 Ortho = 0 263 Ortho = 0
264 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): 264 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys():
265 seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] 265 seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name]
266 Ortho = detect_Methionine(seq_aa, Ortho) ### DEF6 ### 266 Ortho = detect_Methionine(seq_aa, Ortho) ### DEF6 ###
267 267
269 if Ortho == 1: 269 if Ortho == 1:
270 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING 270 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING
271 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING 271 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING
272 272
273 ## CASE 2: in case the CDS is truncated, so the "M" is maybe missing: 273 ## CASE 2: in case the CDS is truncated, so the "M" is maybe missing:
274 if Ortho == 0 and CDS_maybe_truncated == 1: 274 if Ortho == 0 and CDS_maybe_truncated == 1:
275 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING 275 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING
276 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING 276 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING
277 277
278 ## CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa): 278 ## CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa):
279 ## => the 2 bash "CDS_with_M" are left empty ("{}") 279 ## => the 2 bash "CDS_with_M" are left empty ("{}")
280 280
281 return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M) 281 return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M)
282 ########################################################## 282 ##########################################################
283 283
284 284
285 ################################################################################################## 285 ##################################################################################################
286 ###### DEF 5 : Detect all indices corresponding to all occurance of a substring in a string ###### 286 ###### DEF 5 : Detect all indices corresponding to all occurance of a substring in a string ######
295 return listindex 295 return listindex
296 ###################################################### 296 ######################################################
297 297
298 298
299 ############################################################ 299 ############################################################
300 ###### DEF 6 : Detect if methionin in the aa sequence ###### 300 ###### DEF 6 : Detect if methionin in the aa sequence ######
301 ############################################################ 301 ############################################################
302 def detect_Methionine(seq_aa, Ortho): 302 def detect_Methionine(seq_aa, Ortho):
303 303
304 ln = len(seq_aa) 304 ln = len(seq_aa)
305 nbre = sys.argv[2] 305 nbre = sys.argv[2]
306 CUTOFF_Last_50aa = ln - MINIMAL_CDS_LENGTH 306 CUTOFF_Last_50aa = ln - MINIMAL_CDS_LENGTH
307 #Ortho = 0 ## means orthologs not found 307 #Ortho = 0 ## means orthologs not found
308 308
309 ## Find all indices of occurances of "M" in a string of aa 309 ## Find all indices of occurances of "M" in a string of aa
310 list_indices = allindices(seq_aa, "M") ### DEF5 ### 310 list_indices = allindices(seq_aa, "M") ### DEF5 ###
311 311
312 ## If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS 312 ## If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS
313 if list_indices != []: 313 if list_indices != []:
314 first_M = list_indices[0] 314 first_M = list_indices[0]
315 if first_M < CUTOFF_Last_50aa: 315 if first_M < CUTOFF_Last_50aa:
316 Ortho = 1 ## means orthologs found 316 Ortho = 1 ## means orthologs found
317 317
318 return(Ortho) 318 return(Ortho)
319 ################################### 319 ###################################
322 322
323 323
324 324
325 325
326 ############################################################ 326 ############################################################
327 ###### DEF 7 : Reverse complement DNA sequence ###### 327 ###### DEF 7 : Reverse complement DNA sequence ######
328 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html 328 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html
329 ############################################################ 329 ############################################################
330 330
331 331
332 def ReverseComplement2(seq): 332 def ReverseComplement2(seq):
342 ####################### 342 #######################
343 ##### RUN RUN RUN ##### 343 ##### RUN RUN RUN #####
344 ####################### 344 #######################
345 import string, os, time, re, zipfile, sys 345 import string, os, time, re, zipfile, sys
346 346
347 infiles = sys.argv[1]
347 MINIMAL_CDS_LENGTH = int(sys.argv[3]) ## in aa number 348 MINIMAL_CDS_LENGTH = int(sys.argv[3]) ## in aa number
348 349
349 ## INPUT / OUTPUT 350 ## INPUT / OUTPUT
350 list_file = [] 351 list_file = str.split(infiles,",")
351 zfile = zipfile.ZipFile(sys.argv[1]) 352
352 for name in zfile.namelist() : 353 ### Get Universal Code
353 list_file.append(name)
354 zfile.extract(name, "./")
355
356 F2 = open(sys.argv[2], 'r') 354 F2 = open(sys.argv[2], 'r')
355 bash_codeUniversel = code_universel(F2) ### DEF2 ###
356 F2.close()
357 357
358 os.mkdir("04_BEST_ORF_nuc") 358 os.mkdir("04_BEST_ORF_nuc")
359 Path_OUT1 = "04_BEST_ORF_nuc" 359 Path_OUT1 = "04_BEST_ORF_nuc"
360 os.mkdir("04_BEST_ORF_aa") 360 os.mkdir("04_BEST_ORF_aa")
361 Path_OUT2 = "04_BEST_ORF_aa" 361 Path_OUT2 = "04_BEST_ORF_aa"
369 Path_OUT5 = "06_CDS_with_M_nuc" 369 Path_OUT5 = "06_CDS_with_M_nuc"
370 os.mkdir("06_CDS_with_M_aa") 370 os.mkdir("06_CDS_with_M_aa")
371 Path_OUT6 = "06_CDS_with_M_aa" 371 Path_OUT6 = "06_CDS_with_M_aa"
372 372
373 373
374 ### Get Universal Code 374
375 bash_codeUniversel = code_universel(F2) ### DEF2 ###
376 F2.close()
377 375
378 ### Get the Bash corresponding to an alignment file in fasta format 376 ### Get the Bash corresponding to an alignment file in fasta format
379 count_file_processed = 0 377 count_file_processed = 0
380 count_file_with_CDS = 0 378 count_file_with_CDS = 0
381 count_file_without_CDS = 0 379 count_file_without_CDS = 0
382 count_file_with_CDS_plus_M = 0 380 count_file_with_CDS_plus_M = 0
383 381
384 for file in list_file: 382 for file in list_file:
385 count_file_processed = count_file_processed + 1 383 count_file_processed = count_file_processed + 1
386 fasta_file_path = "./%s" %file 384 fasta_file_path = "./%s" %file
387 bash_fasta = dico(fasta_file_path) ### DEF 1 ### 385 bash_fasta = dico(fasta_file_path) ### DEF 1 ###
388 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### 386 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ###
389 387
390 ## a ## OUTPUT BESTORF_nuc 388 ## a ## OUTPUT BESTORF_nuc
391 if BESTORF_nuc != {}: 389 if BESTORF_nuc != {}:
392 count_file_with_CDS = count_file_with_CDS +1 390 count_file_with_CDS = count_file_with_CDS +1
393 OUT1 = open("%s/%s" %(Path_OUT1,file), "w") 391 OUT1 = open("%s/%s" %(Path_OUT1,file), "w")
396 OUT1.write("%s\n" %fasta_name) 394 OUT1.write("%s\n" %fasta_name)
397 OUT1.write("%s\n" %seq) 395 OUT1.write("%s\n" %seq)
398 OUT1.close() 396 OUT1.close()
399 else: 397 else:
400 count_file_without_CDS = count_file_without_CDS + 1 398 count_file_without_CDS = count_file_without_CDS + 1
401 399
402 400
403 ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!! 401 ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!!
404 if BESTORF_aa != {}: 402 if BESTORF_aa != {}:
405 OUT2 = open("%s/%s" %(Path_OUT2,file), "w") 403 OUT2 = open("%s/%s" %(Path_OUT2,file), "w")
406 for fasta_name in BESTORF_aa.keys(): 404 for fasta_name in BESTORF_aa.keys():
407 seq = BESTORF_aa[fasta_name] 405 seq = BESTORF_aa[fasta_name]
408 OUT2.write("%s\n" %fasta_name) 406 OUT2.write("%s\n" %fasta_name)
409 OUT2.write("%s\n" %seq) 407 OUT2.write("%s\n" %seq)
410 OUT2.close() 408 OUT2.close()
411 409
412 ## c ## OUTPUT BESTORF_aa 410 ## c ## OUTPUT BESTORF_aa
413 if BESTORF_nuc_CODING != {}: 411 if BESTORF_nuc_CODING != {}:
414 OUT3 = open("%s/%s" %(Path_OUT3,file), "w") 412 OUT3 = open("%s/%s" %(Path_OUT3,file), "w")
415 for fasta_name in BESTORF_nuc_CODING.keys(): 413 for fasta_name in BESTORF_nuc_CODING.keys():
423 OUT4 = open("%s/%s" %(Path_OUT4,file), "w") 421 OUT4 = open("%s/%s" %(Path_OUT4,file), "w")
424 for fasta_name in BESTORF_aa_CODING.keys(): 422 for fasta_name in BESTORF_aa_CODING.keys():
425 seq = BESTORF_aa_CODING[fasta_name] 423 seq = BESTORF_aa_CODING[fasta_name]
426 OUT4.write("%s\n" %fasta_name) 424 OUT4.write("%s\n" %fasta_name)
427 OUT4.write("%s\n" %seq) 425 OUT4.write("%s\n" %seq)
428 OUT4.close() 426 OUT4.close()
429 427
430 ## e ## OUTPUT BESTORF_nuc_CDS_with_M 428 ## e ## OUTPUT BESTORF_nuc_CDS_with_M
431 if BESTORF_nuc_CDS_with_M != {}: 429 if BESTORF_nuc_CDS_with_M != {}:
432 count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1 430 count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1
433 OUT5 = open("%s/%s" %(Path_OUT5,file), "w") 431 OUT5 = open("%s/%s" %(Path_OUT5,file), "w")
434 for fasta_name in BESTORF_nuc_CDS_with_M.keys(): 432 for fasta_name in BESTORF_nuc_CDS_with_M.keys():
435 seq = BESTORF_nuc_CDS_with_M[fasta_name] 433 seq = BESTORF_nuc_CDS_with_M[fasta_name]
436 OUT5.write("%s\n" %fasta_name) 434 OUT5.write("%s\n" %fasta_name)
437 OUT5.write("%s\n" %seq) 435 OUT5.write("%s\n" %seq)
438 OUT5.close() 436 OUT5.close()
439 437
440 ## f ## OUTPUT BESTORF_aa_CDS_with_M 438 ## f ## OUTPUT BESTORF_aa_CDS_with_M
441 if BESTORF_aa_CDS_with_M != {}: 439 if BESTORF_aa_CDS_with_M != {}:
442 OUT6 = open("%s/%s" %(Path_OUT6,file), "w") 440 OUT6 = open("%s/%s" %(Path_OUT6,file), "w")
443 for fasta_name in BESTORF_aa_CDS_with_M.keys(): 441 for fasta_name in BESTORF_aa_CDS_with_M.keys():
444 seq = BESTORF_aa_CDS_with_M[fasta_name] 442 seq = BESTORF_aa_CDS_with_M[fasta_name]
445 OUT6.write("%s\n" %fasta_name) 443 OUT6.write("%s\n" %fasta_name)
446 OUT6.write("%s\n" %seq) 444 OUT6.write("%s\n" %seq)
447 OUT6.close() 445 OUT6.close()
448 446
449 os.system("rm -rf %s" %file) 447 os.system("rm -rf %s" %file)
450 448
451 ## Print 449 ## Print
452 print "*************** CDS detection ***************" 450 print "*************** CDS detection ***************"
453 print "\nFiles processed: %d" %count_file_processed 451 print "\nFiles processed: %d" %count_file_processed
454 print "\tFiles with CDS: %d" %count_file_with_CDS 452 print "\tFiles with CDS: %d" %count_file_with_CDS
455 print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M 453 print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M
456 print "\tFiles without CDS: %d \n" %count_file_without_CDS 454 print "\tFiles without CDS: %d \n" %count_file_without_CDS
457 print "" 455 print ""
458
459 ## Zipfile
460 f_bestORF_nuc = zipfile.ZipFile("ORF_Search_bestORF_nuc.zip", "w")
461 f_bestORF_aa = zipfile.ZipFile("ORF_Search_bestORF_aa.zip", "w")
462 f_CDS_nuc = zipfile.ZipFile("ORF_Search_CDS_nuc.zip", "w")
463 f_CDS_aa = zipfile.ZipFile("ORF_Search_CDS_aa.zip", "w")
464 f_CDSM_nuc = zipfile.ZipFile("ORF_Search_CDSM_nuc.zip", "w")
465 f_CDSM_aa = zipfile.ZipFile("ORF_Search_CDSM_aa.zip", "w")
466
467 os.chdir("%s" %Path_OUT1)
468 folder = os.listdir("./")
469 for i in folder :
470 f_bestORF_nuc.write("./%s" %i)
471
472 os.chdir("../%s" %Path_OUT2)
473 folder = os.listdir("./")
474 for i in folder :
475 f_bestORF_aa.write("./%s" %i)
476
477 os.chdir("../%s" %Path_OUT3)
478 folder = os.listdir("./")
479 for i in folder :
480 f_CDS_nuc.write("./%s" %i)
481
482 os.chdir("../%s" %Path_OUT4)
483 folder = os.listdir("./")
484 for i in folder :
485 f_CDS_aa.write("./%s" %i)
486
487 os.chdir("../%s" %Path_OUT5)
488 folder = os.listdir("./")
489 for i in folder :
490 f_CDSM_nuc.write("./%s" %i)
491
492 os.chdir("../%s" %Path_OUT6)
493 folder = os.listdir("./")
494 for i in folder :
495 f_CDSM_aa.write("./%s" %i)