comparison scripts/S01_find_orf_on_multiple_alignment.py @ 11:06a28df198b6 draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1
author lecorguille
date Mon, 24 Sep 2018 03:58:34 -0400
parents 3d00be2d05f3
children
comparison
equal deleted inserted replaced
10:3d00be2d05f3 11:06a28df198b6
1 #!/usr/bin/python 1 #!/usr/bin/env python
2 # coding: utf8 2 # coding: utf8
3 ## Author: Eric Fontanillas 3 # Author: Eric Fontanillas
4 ## Modification: 03/09/14 by Julie BAFFARD 4 # Modification: 03/09/14 by Julie BAFFARD
5 ## Last modification : 05/03/18 by Victor Mataigne 5 # Last modification : 25/07/18 by Victor Mataigne
6 6
7 ## Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria 7 # Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria
8 ## CRITERIA 1 ## Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF 8 # CRITERIA 1 - Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF
9 ## CRITERIA 2 ## This longest part should be > 150nc or 50aa 9 # CRITERIA 2 - This longest part should be > 150nc or 50aa
10 ## CRITERIA 3 ## [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa 10 # CRITERIA 3 - [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa
11 ## OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA 11 # OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA
12 ## OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA 12 # OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA
13 13
14 #################################################### 14 import string, os, time, re, zipfile, sys, argparse
15 ###### DEF 2 : Create bash for genetic code ######## 15 from dico import dico
16 ####################################################
17 ### KEY = codon
18 ### VALUE = Amino Acid
19 16
20 def code_universel(F1): 17 def code_universel(F1):
18 """ Creates bash for genetic code (key : codon ; value : amino-acid) """
21 bash_codeUniversel = {} 19 bash_codeUniversel = {}
20
22 with open(F1, "r") as file: 21 with open(F1, "r") as file:
23 for line in file.readlines(): 22 for line in file.readlines():
24 L1 = string.split(line, " ") 23 L1 = string.split(line, " ")
25 length1 = len(L1) 24 length1 = len(L1)
26 if length1 == 3: 25 if length1 == 3:
29 bash_codeUniversel[key] = value 28 bash_codeUniversel[key] = value
30 else: 29 else:
31 key = L1[0] 30 key = L1[0]
32 value = L1[2] 31 value = L1[2]
33 bash_codeUniversel[key] = value 32 bash_codeUniversel[key] = value
33
34 return(bash_codeUniversel) 34 return(bash_codeUniversel)
35 ########################################################### 35
36
37
38 ######################################################################################################################
39 ##### DEF 3 : Test if the sequence is a multiple of 3, and if not correct the sequence to become a multiple of 3 #####
40 ######################################################################################################################
41 ### WEAKNESS OF THAT APPROACH = I remove extra base(s) at the end of the sequence ==> I can lost a codon, when I test ORF (as I will decay the ORF)
42 def multiple3(seq): 36 def multiple3(seq):
43 leng = len(seq) 37 """ Tests if the sequence is a multiple of 3, and if not removes extra-bases
44 modulo = leng%3 38 !! Possible to lost a codon, when I test ORF (as I will decay the ORF) """
45 if modulo == 0: # the results of dividing leng per 3 is an integer 39
46 new_seq = seq 40 m = len(seq)%3
47 elif modulo == 1: # means 1 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) 41 if m != 0 :
48 new_seq = seq[:-1] # remove the last nc 42 return seq[:-m], m
49 elif modulo == 2: # means 2 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) 43 else :
50 new_seq = seq[:-2] # remove the 2 last nc 44 return seq, m
51 len1 = len(new_seq) 45
52 return(new_seq, modulo) 46 def detect_Methionine(seq_aa, Ortho, minimal_cds_length):
53 ########################################################## 47 """ Detects if methionin in the aa sequence """
54 48
55 49 ln = len(seq_aa)
56 ############################# 50 CUTOFF_Last_50aa = ln - minimal_cds_length
57 ###### DEF 4 : GET ORF ###### 51
58 ############################# 52 # Find all indices of occurances of "M" in a string of aa
59 ##- MULTIPLE SEQUENCE BASED : Based on ALIGNMENT of several sequences 53 list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"]
60 ##- CRITERIA1: Get the segment in the alignment with no codon stop 54
61 55 # If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS
62 56 if list_indices != []:
63 ###### DEF 4 - Part 1 - ###### 57 first_M = list_indices[0]
64 ############################## 58 if first_M < CUTOFF_Last_50aa:
65 def simply_get_ORF(seq_dna, bash_codeUniversel): 59 Ortho = 1 # means orthologs found
66 seq_aa = "" 60
67 i = 0 61 return(Ortho)
68 len1 = len(seq_dna) 62
69 while i < len1: 63 def ReverseComplement2(seq):
70 base1 = seq_dna[i] 64 """ Reverse complement DNA sequence """
71 base1 = string.capitalize(base1) 65 seq1 = 'ATCGN-TAGCN-atcgn-tagcn-'
72 base2 = seq_dna[i+1] 66 seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 }
73 base2 = string.capitalize(base2) 67
74 base3 = seq_dna[i+2] 68 return "".join([seq_dict[base] for base in reversed(seq)])
75 base3 = string.capitalize(base3) 69
76 70 def simply_get_ORF(seq_dna, gen_code):
77 codon = base1+base2+base3 71 seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i+3] for i in range(0, len(seq_dna), 3)]
78 codon = string.replace(codon, "T", "U") 72 seq_by_aa = [gen_code[codon] if codon in gen_code.keys() else '?' for codon in seq_by_codons]
79 73
80 if codon in bash_codeUniversel.keys(): 74 return ''.join(seq_by_aa)
81 aa = bash_codeUniversel[codon] 75
82 seq_aa = seq_aa + aa 76 def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel, minimal_cds_length, min_spec):
83 else: 77 # Multiple sequence based : Based on the alignment of several sequences (orthogroup)
84 seq_aa = seq_aa +"?" ### Take account for gap "-" and "N" 78 # Criteria 1 : Get the segment in the alignment with no codon stop
85 i = i + 3 79
86 80 # 1 - Get the list of aligned aa seq for the 3 ORF:
87 return(seq_aa)
88 ##########################################################
89
90
91 ###### DEF 4 - Part 2 - ######
92 ##############################
93 def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel):
94
95 ## 1 ## Get the list of aligned aa seq for the 3 ORF:
96 bash_of_aligned_aa_seq_3ORF = {} 81 bash_of_aligned_aa_seq_3ORF = {}
97 bash_of_aligned_nuc_seq_3ORF = {} 82 bash_of_aligned_nuc_seq_3ORF = {}
98 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = [] 83 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = []
84
99 for fasta_name in bash_aligned_nc_seq.keys(): 85 for fasta_name in bash_aligned_nc_seq.keys():
100 ## 1.1. ## Get the raw sequence 86 # Get sequence, chek if multiple 3, then get 6 orfs
101 sequence_nc = bash_aligned_nc_seq[fasta_name] 87 sequence_nc = bash_aligned_nc_seq[fasta_name]
102 88 new_sequence_nc, modulo = multiple3(sequence_nc)
103 ## 1.2. ## Check whether the sequence is multiple of 3, and correct it if not: 89 new_sequence_rev = ReverseComplement2(new_sequence_nc)
104 new_sequence_nc, modulo = multiple3(sequence_nc) ### DEF 3 ### 90 # For each seq of the multialignment => give the 6 ORFs (in nuc)
105 91 bash_of_aligned_nuc_seq_3ORF[fasta_name] = [new_sequence_nc, new_sequence_nc[1:-2], new_sequence_nc[2:-1], new_sequence_rev, new_sequence_rev[1:-2], new_sequence_rev[2:-1]]
106 ## 1.3. ## Get the 3 ORFs (nuc) for each sequence 92
107 seq_nuc_ORF1 = new_sequence_nc 93 seq_prot_ORF1 = simply_get_ORF(new_sequence_nc, bash_codeUniversel)
108 seq_nuc_ORF2 = new_sequence_nc[1:-2] 94 seq_prot_ORF2 = simply_get_ORF(new_sequence_nc[1:-2], bash_codeUniversel)
109 seq_nuc_ORF3 = new_sequence_nc[2:-1] 95 seq_prot_ORF3 = simply_get_ORF(new_sequence_nc[2:-1], bash_codeUniversel)
110 seq_reversed=ReverseComplement2(seq_nuc_ORF1) 96 seq_prot_ORF4 = simply_get_ORF(new_sequence_rev, bash_codeUniversel)
111 seq_nuc_ORF4=seq_reversed 97 seq_prot_ORF5 = simply_get_ORF(new_sequence_rev[1:-2], bash_codeUniversel)
112 seq_nuc_ORF5=seq_reversed[1:-2] 98 seq_prot_ORF6 = simply_get_ORF(new_sequence_rev[2:-1], bash_codeUniversel)
113 seq_nuc_ORF6=seq_reversed[2:-1] 99
114 100 # For each seq of the multialignment => give the 6 ORFs (in aa)
115 LIST_6_ORF_nuc = [seq_nuc_ORF1, seq_nuc_ORF2, seq_nuc_ORF3,seq_nuc_ORF4,seq_nuc_ORF5,seq_nuc_ORF6] 101 bash_of_aligned_aa_seq_3ORF[fasta_name] = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3, seq_prot_ORF4, seq_prot_ORF5, seq_prot_ORF6]
116 bash_of_aligned_nuc_seq_3ORF[fasta_name] = LIST_6_ORF_nuc ### For each seq of the multialignment => give the 6 ORFs (in nuc) 102
117 103 # 2 - Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF)
118 ## 1.4. ## Get the 3 ORFs (aa) for each sequence
119 seq_prot_ORF1 = simply_get_ORF(seq_nuc_ORF1,bash_codeUniversel) ### DEF 4 - Part 1 - ##
120 seq_prot_ORF2 = simply_get_ORF(seq_nuc_ORF2,bash_codeUniversel) ### DEF 4 - Part 1 - ##
121 seq_prot_ORF3 = simply_get_ORF(seq_nuc_ORF3,bash_codeUniversel) ### DEF 4 - Part 1 - ##
122 seq_prot_ORF4 = simply_get_ORF(seq_nuc_ORF4,bash_codeUniversel) ### DEF 4 - Part 1 - ##
123 seq_prot_ORF5 = simply_get_ORF(seq_nuc_ORF5,bash_codeUniversel) ### DEF 4 - Part 1 - ##
124 seq_prot_ORF6 = simply_get_ORF(seq_nuc_ORF6,bash_codeUniversel) ### DEF 4 - Part 1 - ##
125
126 LIST_6_ORF_aa = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3,seq_prot_ORF4,seq_prot_ORF5,seq_prot_ORF6]
127 bash_of_aligned_aa_seq_3ORF[fasta_name] = LIST_6_ORF_aa ### For each seq of the multialignment => give the 6 ORFs (in aa)
128
129 ## 2 ## Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF)
130 BEST_MAX = 0 104 BEST_MAX = 0
131 for i in [0,1,2,3,4,5]: ### Test the 6 ORFs 105
106 for i in [0,1,2,3,4,5]: # Test the 6 ORFs
132 ORF_Aligned_aa = [] 107 ORF_Aligned_aa = []
133 ORF_Aligned_nuc = [] 108 ORF_Aligned_nuc = []
134 109
135 110 # 2.1 - Get the alignment of sequence for a given ORF
136 ## 2.1 ## Get the alignment of sequence for a given ORF 111 # Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd
137 ## Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd
138 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): 112 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys():
139 ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i] 113 ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i]
140 aa_length = len(ORFsequence) 114 aa_length = len(ORFsequence)
141 ORF_Aligned_aa.append(ORFsequence) ### List of all sequences in the ORF nb "i" = 115 ORF_Aligned_aa.append(ORFsequence) ### List of all sequences in the ORF nb "i" =
142 116
143 n = i+1 117 n = i+1
144 118
145 for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys(): 119 for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys():
146 ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i] 120 ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i]
147 nuc_length = len(ORFsequence) 121 nuc_length = len(ORFsequence)
148 ORF_Aligned_nuc.append(ORFsequence) ### List of all sequences in the ORF nb "i" = 122 ORF_Aligned_nuc.append(ORFsequence) # List of all sequences in the ORF nb "i" =
149 123
150 ## 2.2 ## Get the list of sublist of positions whithout codon stop in the alignment 124 # 2.2 - Get the list of sublist of positions whithout codon stop in the alignment
151 ## For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF) 125 # For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF)
152 ## Next step is to get the longuest subsequence whithout stop 126 # Next step is to get the longuest subsequence whithout stop
153 ## We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*" 127 # We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*"
154 MAX_LENGTH = 0 128 MAX_LENGTH = 0
155 LONGUEST_SEGMENT_UNSTOPPED = "" 129 LONGUEST_SEGMENT_UNSTOPPED = ""
156 j = 0 # Start from first position in alignment 130 j = 0 # Start from first position in alignment
157 List_of_List_subsequences = [] 131 List_of_List_subsequences = []
158 List_positions_subsequence = [] 132 List_positions_subsequence = []
160 column = [] 134 column = []
161 for seq in ORF_Aligned_aa: 135 for seq in ORF_Aligned_aa:
162 column.append(seq[j]) 136 column.append(seq[j])
163 j = j+1 137 j = j+1
164 if "*" in column: 138 if "*" in column:
165 List_of_List_subsequences.append(List_positions_subsequence) ## Add previous list of positions 139 List_of_List_subsequences.append(List_positions_subsequence) # Add previous list of positions
166 List_positions_subsequence = [] ## Re-initialyse list of positions 140 List_positions_subsequence = [] # Re-initialyse list of positions
167 else: 141 else:
168 List_positions_subsequence.append(j) 142 List_positions_subsequence.append(j)
169 143
170 ## 2.3 ## Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF) 144 # 2.3 - Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF)
171 LONGUEST_SUBSEQUENCE_LIST_POSITION = [] 145 LONGUEST_SUBSEQUENCE_LIST_POSITION = []
172 MAX=0 146 MAX=0
173 for sublist in List_of_List_subsequences: 147 for sublist in List_of_List_subsequences:
174 if len(sublist) > MAX and len(sublist) > MINIMAL_CDS_LENGTH: 148 if len(sublist) > MAX and len(sublist) > minimal_cds_length:
175 MAX = len(sublist) 149 MAX = len(sublist)
176 LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist 150 LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist
177 151
178 ## 2.4. ## Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated) 152 # 2.4. - Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated)
179 if LONGUEST_SUBSEQUENCE_LIST_POSITION != []: 153 if LONGUEST_SUBSEQUENCE_LIST_POSITION != []:
180 if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0: 154 if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0:
181 CDS_maybe_truncated = 1 155 CDS_maybe_truncated = 1
182 else: 156 else:
183 CDS_maybe_truncated = 0 157 CDS_maybe_truncated = 0
184 else: 158 else:
185 CDS_maybe_truncated = 0 159 CDS_maybe_truncated = 0
186 160
187 161
188 ## 2.5 ## Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF) 162 # 2.5 - Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF)
189 ## Test whether it is the better ORF 163 # Test whether it is the better ORF
190 if MAX > BEST_MAX: 164 if MAX > BEST_MAX:
191 BEST_MAX = MAX 165 BEST_MAX = MAX
192 BEST_ORF = i+1 166 BEST_ORF = i+1
193 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = LONGUEST_SUBSEQUENCE_LIST_POSITION 167 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = LONGUEST_SUBSEQUENCE_LIST_POSITION
194 168
195 169
196 ## 3 ## ONCE we have this better segment (BEST CODING SEGMENT) 170 # 3 - ONCE we have this better segment (BEST CODING SEGMENT)
197 ## ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position) 171 # ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position)
198 ## And get the INDEX of the best ORF [0, 1, or 2] 172 # And get the INDEX of the best ORF [0, 1, or 2]
199 if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []: 173 if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []:
200 pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0] 174 pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0]
201 pos_MIN_aa = pos_MIN_aa - 1 175 pos_MIN_aa = pos_MIN_aa - 1
202 pos_MAX_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[-1] 176 pos_MAX_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[-1]
203 177
204 178
205 BESTORF_bash_of_aligned_aa_seq = {} 179 BESTORF_bash_of_aligned_aa_seq = {}
206 BESTORF_bash_of_aligned_aa_seq_CODING = {} 180 BESTORF_bash_of_aligned_aa_seq_CODING = {}
207 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): 181 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys():
208 index_BEST_ORF = BEST_ORF-1 ### cause list going from 0 to 2 in LIST_3_ORF, while the ORF nb is indexed from 1 to 3 182 index_BEST_ORF = BEST_ORF-1 # cause list going from 0 to 2 in LIST_3_ORF, while the ORF nb is indexed from 1 to 3
209 seq = bash_of_aligned_aa_seq_3ORF[fasta_name][index_BEST_ORF] 183 seq = bash_of_aligned_aa_seq_3ORF[fasta_name][index_BEST_ORF]
210 seq_coding = seq[pos_MIN_aa:pos_MAX_aa] 184 seq_coding = seq[pos_MIN_aa:pos_MAX_aa]
211 BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq 185 BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq
212 BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding 186 BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding
213 187
214 ## 4 ## Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment 188 # 4 - Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment
215 pos_MIN_nuc = pos_MIN_aa * 3 189 pos_MIN_nuc = pos_MIN_aa * 3
216 pos_MAX_nuc = pos_MAX_aa * 3 190 pos_MAX_nuc = pos_MAX_aa * 3
217 191
218 BESTORF_bash_aligned_nc_seq = {} 192 BESTORF_bash_aligned_nc_seq = {}
219 BESTORF_bash_aligned_nc_seq_CODING = {} 193 BESTORF_bash_aligned_nc_seq_CODING = {}
221 seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF] 195 seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF]
222 seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc] 196 seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc]
223 BESTORF_bash_aligned_nc_seq[fasta_name] = seq 197 BESTORF_bash_aligned_nc_seq[fasta_name] = seq
224 BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding 198 BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding
225 199
226 else: ### no CDS found ### 200 else: # no CDS found
227 BESTORF_bash_aligned_nc_seq = {} 201 BESTORF_bash_aligned_nc_seq = {}
228 BESTORF_bash_aligned_nc_seq_CODING = {} 202 BESTORF_bash_aligned_nc_seq_CODING = {}
229 BESTORF_bash_of_aligned_aa_seq = {} 203 BESTORF_bash_of_aligned_aa_seq = {}
230 BESTORF_bash_of_aligned_aa_seq_CODING ={} 204 BESTORF_bash_of_aligned_aa_seq_CODING ={}
231 205
232 206 # Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa
233 207
234 ### Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa
235 ###########################################################################################################
236
237 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} 208 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {}
238 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} 209 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {}
239 210
240 Ortho = 0 211 Ortho = 0
241 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): 212 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys():
242 seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] 213 seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name]
243 Ortho = detect_Methionine(seq_aa, Ortho) ### DEF6 ### 214 Ortho = detect_Methionine(seq_aa, Ortho, minimal_cds_length) ### DEF6 ###
244 215
245 ## CASE 1: A "M" is present and correctly localized (not in last 50 aa) 216 # CASE 1: A "M" is present and correctly localized (not in last 50 aa)
246 if Ortho == 1: 217 if Ortho == 1:
247 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING 218 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING
248 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING 219 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING
249 220
250 ## CASE 2: in case the CDS is truncated, so the "M" is maybe missing: 221 # CASE 2: in case the CDS is truncated, so the "M" is maybe missing:
251 if Ortho == 0 and CDS_maybe_truncated == 1: 222 if Ortho == 0 and CDS_maybe_truncated == 1:
252 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING 223 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING
253 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING 224 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING
254 225
255 ## CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa): 226 # CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa):
256 ## => the 2 bash "CDS_with_M" are left empty ("{}") 227 ## => the 2 bash "CDS_with_M" are left empty ("{}")
257 228
258 return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M) 229 return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M)
259 ########################################################## 230
260 231 def write_output_file(results_dict, name_elems, path_out):
261 232 if results_dict != {}:
262 ################################################################################################## 233 name_elems[3] = str(len(results_dict.keys()))
263 ###### DEF 5 : Detect all indices corresponding to all occurance of a substring in a string ###### 234 new_name = "_".join(name_elems)
264 ################################################################################################## 235
265 def allindices(string, sub): 236 out1 = open("%s/%s" %(path_out,new_name), "w")
266 listindex=[] 237 for fasta_name in results_dict.keys():
267 offset=0 238 seq = results_dict[fasta_name]
268 i = string.find(sub, offset) 239 out1.write("%s\n" %fasta_name)
269 while i >= 0: 240 out1.write("%s\n" %seq)
270 listindex.append(i) 241 out1.close()
271 i = string.find(sub, i + 1) 242
272 return listindex 243 def main():
273 ###################################################### 244 parser = argparse.ArgumentParser()
274 245 parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt")
275 246 parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int)
276 ############################################################ 247 parser.add_argument("min_spec", help="Minimal number of species per alignment")
277 ###### DEF 6 : Detect if methionin in the aa sequence ###### 248 parser.add_argument("list_files", help="File with all input files names")
278 ############################################################ 249 args = parser.parse_args()
279 def detect_Methionine(seq_aa, Ortho): 250
280 251 minimal_cds_length = int(args.min_cds_len) # in aa number
281 ln = len(seq_aa) 252 bash_codeUniversel = code_universel(args.codeUniversel)
282 nbre = sys.argv[2] 253 minimum_species = int(args.min_spec)
283 CUTOFF_Last_50aa = ln - MINIMAL_CDS_LENGTH
284 #Ortho = 0 ## means orthologs not found
285
286 ## Find all indices of occurances of "M" in a string of aa
287 list_indices = allindices(seq_aa, "M") ### DEF5 ###
288
289 ## If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS
290 if list_indices != []:
291 first_M = list_indices[0]
292 if first_M < CUTOFF_Last_50aa:
293 Ortho = 1 ## means orthologs found
294
295 return(Ortho)
296 ###################################
297
298
299 ############################################################
300 ###### DEF 7 : Reverse complement DNA sequence ######
301 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html
302 ############################################################
303 def ReverseComplement2(seq):
304 # too lazy to construct the dictionary manually, use a dict comprehension
305 seq1 = 'ATCGN-TAGCN-atcgn-tagcn-'
306 seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 }
307 return "".join([seq_dict[base] for base in reversed(seq)])
308 ############################
309
310
311 #######################
312 ##### RUN RUN RUN #####
313 #######################
314 import string, os, time, re, zipfile, sys
315 from dico import dico
316
317 MINIMAL_CDS_LENGTH = int(sys.argv[2]) ## in aa number
318
319 ### Get Universal Code
320 bash_codeUniversel = code_universel(sys.argv[1]) ### DEF2 ###
321
322 ## INPUT from file containing list of species
323 list_files = []
324 with open(sys.argv[3], 'r') as f:
325 for line in f.readlines():
326 list_files.append(line.strip('\n'))
327
328 os.mkdir("04_BEST_ORF_nuc")
329 Path_OUT1 = "04_BEST_ORF_nuc"
330 os.mkdir("04_BEST_ORF_aa")
331 Path_OUT2 = "04_BEST_ORF_aa"
332
333 os.mkdir("05_CDS_nuc")
334 Path_OUT3 = "05_CDS_nuc"
335 os.mkdir("05_CDS_aa")
336 Path_OUT4 = "05_CDS_aa"
337
338 os.mkdir("06_CDS_with_M_nuc")
339 Path_OUT5 = "06_CDS_with_M_nuc"
340 os.mkdir("06_CDS_with_M_aa")
341 Path_OUT6 = "06_CDS_with_M_aa"
342
343 ### Get the Bash corresponding to an alignment file in fasta format
344 count_file_processed = 0
345 count_file_with_CDS = 0
346 count_file_without_CDS = 0
347 count_file_with_CDS_plus_M = 0
348
349 # ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name),
350 # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified.
351
352 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]
353
354 # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond
355 #n0 = 0
356
357 for file in list_files:
358 #n0 += 1
359
360 count_file_processed = count_file_processed + 1
361 nb_gp = file.split('_')[1] # Keep trace of the orthogroup number
362 fasta_file_path = "./%s" %file
363 bash_fasta = dico(fasta_file_path) ### DEF 1 ###
364 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ###
365 254
366 name_elems[1] = nb_gp 255 # Inputs from file containing list of species
367 256 list_files = []
368 ## a ## OUTPUT BESTORF_nuc 257 with open(args.list_files, 'r') as f:
369 if BESTORF_nuc != {}: 258 for line in f.readlines():
370 name_elems[3] = str(len(BESTORF_nuc.keys())) 259 list_files.append(line.strip('\n'))
371 new_name = "_".join(name_elems) 260
372 count_file_with_CDS = count_file_with_CDS +1 261 # Directories for results
373 OUT1 = open("%s/%s" %(Path_OUT1,new_name), "w") 262 dirs = ["04_BEST_ORF_nuc", "04_BEST_ORF_aa", "05_CDS_nuc", "05_CDS_aa", "06_CDS_with_M_nuc", "06_CDS_with_M_aa"]
374 for fasta_name in BESTORF_nuc.keys(): 263 for directory in dirs:
375 seq = BESTORF_nuc[fasta_name] 264 os.mkdir(directory)
376 OUT1.write("%s\n" %fasta_name) 265
377 OUT1.write("%s\n" %seq) 266 count_file_processed, count_file_with_CDS, count_file_without_CDS, count_file_with_CDS_plus_M = 0, 0, 0, 0
378 OUT1.close() 267 count_file_with_cds_and_enought_species, count_file_with_cds_M_and_enought_species = 0, 0
379 else: 268
380 count_file_without_CDS = count_file_without_CDS + 1 269 # ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name),
381 270 # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified.
382 ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!! 271 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]
383 if BESTORF_aa != {}: 272
384 name_elems[3] = str(len(BESTORF_aa.keys())) 273 # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond
385 new_name = "_".join(name_elems) 274 #n0 = 0
386 OUT2 = open("%s/%s" %(Path_OUT2,new_name), "w") 275
387 for fasta_name in BESTORF_aa.keys(): 276 for file in list_files:
388 seq = BESTORF_aa[fasta_name] 277 #n0 += 1
389 OUT2.write("%s\n" %fasta_name) 278
390 OUT2.write("%s\n" %seq) 279 count_file_processed = count_file_processed + 1
391 OUT2.close() 280 nb_gp = file.split('_')[1] # Keep trace of the orthogroup number
392 281 fasta_file_path = "./%s" %file
393 ## c ## OUTPUT BESTORF_aa 282 bash_fasta = dico(fasta_file_path)
394 if BESTORF_nuc_CODING != {}: 283 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species)
395 name_elems[3] = str(len(BESTORF_nuc_CODING.keys())) 284
396 new_name = "_".join(name_elems) 285 name_elems[1] = nb_gp
397 OUT3 = open("%s/%s" %(Path_OUT3,new_name), "w") 286
398 for fasta_name in BESTORF_nuc_CODING.keys(): 287 # Update counts and write group in corresponding output directory
399 seq = BESTORF_nuc_CODING[fasta_name] 288 if BESTORF_nuc != {}:
400 OUT3.write("%s\n" %fasta_name) 289 count_file_with_CDS += 1
401 OUT3.write("%s\n" %seq) 290 if len(BESTORF_nuc.keys()) >= minimum_species :
402 OUT3.close() 291 count_file_with_cds_and_enought_species += 1
403 292 write_output_file(BESTORF_nuc, name_elems, dirs[0]) # OUTPUT BESTORF_nuc
404 ## d ## OUTPUT BESTORF_aa_CODING 293 write_output_file(BESTORF_aa, name_elems, dirs[1]) # The most interesting
405 if BESTORF_aa_CODING != {}: 294 else:
406 name_elems[3] = str(len(BESTORF_aa_CODING.keys())) 295 count_file_without_CDS += 1
407 new_name = "_".join(name_elems) 296
408 OUT4 = open("%s/%s" %(Path_OUT4,new_name), "w") 297 if BESTORF_nuc_CODING != {} and len(BESTORF_nuc_CODING.keys()) >= minimum_species:
409 for fasta_name in BESTORF_aa_CODING.keys(): 298 write_output_file(BESTORF_nuc_CODING, name_elems, dirs[2])
410 seq = BESTORF_aa_CODING[fasta_name] 299 write_output_file(BESTORF_aa_CODING, name_elems, dirs[3])
411 OUT4.write("%s\n" %fasta_name) 300
412 OUT4.write("%s\n" %seq) 301 if BESTORF_nuc_CDS_with_M != {}:
413 OUT4.close() 302 count_file_with_CDS_plus_M += 1
414 303 if len(BESTORF_nuc_CDS_with_M.keys()) >= minimum_species :
415 ## e ## OUTPUT BESTORF_nuc_CDS_with_M 304 count_file_with_cds_M_and_enought_species += 1
416 if BESTORF_nuc_CDS_with_M != {}: 305 write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4])
417 name_elems[3] = str(len(BESTORF_nuc_CDS_with_M.keys())) 306 write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5])
418 new_name = "_".join(name_elems) 307
419 count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1 308 print "*************** CDS detection ***************"
420 OUT5 = open("%s/%s" %(Path_OUT5,new_name), "w") 309 print "\nFiles processed: %d" %count_file_processed
421 for fasta_name in BESTORF_nuc_CDS_with_M.keys(): 310 print "\tFiles with CDS: %d" %count_file_with_CDS
422 seq = BESTORF_nuc_CDS_with_M[fasta_name] 311 print "\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species)
423 OUT5.write("%s\n" %fasta_name) 312 print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M
424 OUT5.write("%s\n" %seq) 313 print "\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species)
425 OUT5.close() 314 print "\tFiles without CDS: %d \n" %count_file_without_CDS
426 315 print ""
427 ## f ## OUTPUT BESTORF_aa_CDS_with_M 316
428 if BESTORF_aa_CDS_with_M != {}: 317 if __name__ == '__main__':
429 name_elems[3] = str(len(BESTORF_aa_CDS_with_M.keys())) 318 main()
430 new_name = "_".join(name_elems)
431 OUT6 = open("%s/%s" %(Path_OUT6,new_name), "w")
432 for fasta_name in BESTORF_aa_CDS_with_M.keys():
433 seq = BESTORF_aa_CDS_with_M[fasta_name]
434 OUT6.write("%s\n" %fasta_name)
435 OUT6.write("%s\n" %seq)
436 OUT6.close()
437
438 #os.system("rm -rf %s" %file)
439
440 ## Print
441 print "*************** CDS detection ***************"
442 print "\nFiles processed: %d" %count_file_processed
443 print "\tFiles with CDS: %d" %count_file_with_CDS
444 print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M
445 print "\tFiles without CDS: %d \n" %count_file_without_CDS
446 print ""