Mercurial > repos > abims-sbr > cds_search
comparison scripts/S01_find_orf_on_multiple_alignment.py @ 2:0d2f72caea10 draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 44a89d5eeb82789bfc643b33c11f391281b6374b
| author | abims-sbr |
|---|---|
| date | Wed, 27 Sep 2017 10:03:05 -0400 |
| parents | 567d5b771a90 |
| children | ff98ed7849fa |
comparison
equal
deleted
inserted
replaced
| 1:567d5b771a90 | 2:0d2f72caea10 |
|---|---|
| 57 return(bash_codeUniversel) | 57 return(bash_codeUniversel) |
| 58 ########################################################### | 58 ########################################################### |
| 59 | 59 |
| 60 | 60 |
| 61 ###################################################################################################################### | 61 ###################################################################################################################### |
| 62 ##### DEF 3 : Test if the sequence is a multiple of 3, and if not correct the sequence to become a multiple of 3 ##### | 62 ##### DEF 3 : Test if the sequence is a multiple of 3, and if not correct the sequence to become a multiple of 3 ##### |
| 63 ###################################################################################################################### | 63 ###################################################################################################################### |
| 64 ### WEAKNESS OF THAT APPROACH = I remove extra base(s) at the end of the sequence ==> I can lost a codon, when I test ORF (as I will decay the ORF) | 64 ### WEAKNESS OF THAT APPROACH = I remove extra base(s) at the end of the sequence ==> I can lost a codon, when I test ORF (as I will decay the ORF) |
| 65 def multiple3(seq): | 65 def multiple3(seq): |
| 66 leng = len(seq) | 66 leng = len(seq) |
| 67 modulo = leng%3 | 67 modulo = leng%3 |
| 68 if modulo == 0: # the results of dividing leng per 3 is an integer | 68 if modulo == 0: # the results of dividing leng per 3 is an integer |
| 69 new_seq = seq | 69 new_seq = seq |
| 70 elif modulo == 1: # means 1 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) | 70 elif modulo == 1: # means 1 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) |
| 71 new_seq = seq[:-1] # remove the last nc | 71 new_seq = seq[:-1] # remove the last nc |
| 72 elif modulo == 2: # means 2 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) | 72 elif modulo == 2: # means 2 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) |
| 73 new_seq = seq[:-2] # remove the 2 last nc | 73 new_seq = seq[:-2] # remove the 2 last nc |
| 74 len1 = len(new_seq) | 74 len1 = len(new_seq) |
| 75 return(new_seq, modulo) | 75 return(new_seq, modulo) |
| 76 ########################################################## | 76 ########################################################## |
| 77 | 77 |
| 78 | 78 |
| 79 ############################# | 79 ############################# |
| 94 base1 = string.capitalize(base1) | 94 base1 = string.capitalize(base1) |
| 95 base2 = seq_dna[i+1] | 95 base2 = seq_dna[i+1] |
| 96 base2 = string.capitalize(base2) | 96 base2 = string.capitalize(base2) |
| 97 base3 = seq_dna[i+2] | 97 base3 = seq_dna[i+2] |
| 98 base3 = string.capitalize(base3) | 98 base3 = string.capitalize(base3) |
| 99 | 99 |
| 100 codon = base1+base2+base3 | 100 codon = base1+base2+base3 |
| 101 codon = string.replace(codon, "T", "U") | 101 codon = string.replace(codon, "T", "U") |
| 102 | 102 |
| 103 if codon in bash_codeUniversel.keys(): | 103 if codon in bash_codeUniversel.keys(): |
| 104 aa = bash_codeUniversel[codon] | 104 aa = bash_codeUniversel[codon] |
| 105 seq_aa = seq_aa + aa | 105 seq_aa = seq_aa + aa |
| 106 else: | 106 else: |
| 107 seq_aa = seq_aa +"?" ### Take account for gap "-" and "N" | 107 seq_aa = seq_aa +"?" ### Take account for gap "-" and "N" |
| 108 i = i + 3 | 108 i = i + 3 |
| 109 | 109 |
| 110 return(seq_aa) | 110 return(seq_aa) |
| 111 ########################################################## | 111 ########################################################## |
| 112 | 112 |
| 113 | 113 |
| 114 ###### DEF 4 - Part 2 - ###### | 114 ###### DEF 4 - Part 2 - ###### |
| 115 ############################## | 115 ############################## |
| 116 def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel): | 116 def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel): |
| 117 | 117 |
| 118 ## 1 ## Get the list of aligned aa seq for the 3 ORF: | 118 ## 1 ## Get the list of aligned aa seq for the 3 ORF: |
| 119 bash_of_aligned_aa_seq_3ORF = {} | 119 bash_of_aligned_aa_seq_3ORF = {} |
| 120 bash_of_aligned_nuc_seq_3ORF = {} | 120 bash_of_aligned_nuc_seq_3ORF = {} |
| 121 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = [] | 121 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = [] |
| 122 for fasta_name in bash_aligned_nc_seq.keys(): | 122 for fasta_name in bash_aligned_nc_seq.keys(): |
| 123 ## 1.1. ## Get the raw sequence | 123 ## 1.1. ## Get the raw sequence |
| 124 sequence_nc = bash_aligned_nc_seq[fasta_name] | 124 sequence_nc = bash_aligned_nc_seq[fasta_name] |
| 125 | 125 |
| 126 ## 1.2. ## Check whether the sequence is multiple of 3, and correct it if not: | 126 ## 1.2. ## Check whether the sequence is multiple of 3, and correct it if not: |
| 127 new_sequence_nc, modulo = multiple3(sequence_nc) ### DEF 3 ### | 127 new_sequence_nc, modulo = multiple3(sequence_nc) ### DEF 3 ### |
| 128 | 128 |
| 129 ## 1.3. ## Get the 3 ORFs (nuc) for each sequence | 129 ## 1.3. ## Get the 3 ORFs (nuc) for each sequence |
| 130 seq_nuc_ORF1 = new_sequence_nc | 130 seq_nuc_ORF1 = new_sequence_nc |
| 131 seq_nuc_ORF2 = new_sequence_nc[1:-2] | 131 seq_nuc_ORF2 = new_sequence_nc[1:-2] |
| 132 seq_nuc_ORF3 = new_sequence_nc[2:-1] | 132 seq_nuc_ORF3 = new_sequence_nc[2:-1] |
| 133 seq_reversed=ReverseComplement2(seq_nuc_ORF1) | 133 seq_reversed=ReverseComplement2(seq_nuc_ORF1) |
| 134 seq_nuc_ORF4=seq_reversed | 134 seq_nuc_ORF4=seq_reversed |
| 135 seq_nuc_ORF5=seq_reversed[1:-2] | 135 seq_nuc_ORF5=seq_reversed[1:-2] |
| 136 seq_nuc_ORF6=seq_reversed[2:-1] | 136 seq_nuc_ORF6=seq_reversed[2:-1] |
| 137 | 137 |
| 138 LIST_6_ORF_nuc = [seq_nuc_ORF1, seq_nuc_ORF2, seq_nuc_ORF3,seq_nuc_ORF4,seq_nuc_ORF5,seq_nuc_ORF6] | 138 LIST_6_ORF_nuc = [seq_nuc_ORF1, seq_nuc_ORF2, seq_nuc_ORF3,seq_nuc_ORF4,seq_nuc_ORF5,seq_nuc_ORF6] |
| 139 bash_of_aligned_nuc_seq_3ORF[fasta_name] = LIST_6_ORF_nuc ### For each seq of the multialignment => give the 6 ORFs (in nuc) | 139 bash_of_aligned_nuc_seq_3ORF[fasta_name] = LIST_6_ORF_nuc ### For each seq of the multialignment => give the 6 ORFs (in nuc) |
| 140 | 140 |
| 141 ## 1.4. ## Get the 3 ORFs (aa) for each sequence | 141 ## 1.4. ## Get the 3 ORFs (aa) for each sequence |
| 142 seq_prot_ORF1 = simply_get_ORF(seq_nuc_ORF1,bash_codeUniversel) ### DEF 4 - Part 1 - ## | 142 seq_prot_ORF1 = simply_get_ORF(seq_nuc_ORF1,bash_codeUniversel) ### DEF 4 - Part 1 - ## |
| 143 seq_prot_ORF2 = simply_get_ORF(seq_nuc_ORF2,bash_codeUniversel) ### DEF 4 - Part 1 - ## | 143 seq_prot_ORF2 = simply_get_ORF(seq_nuc_ORF2,bash_codeUniversel) ### DEF 4 - Part 1 - ## |
| 144 seq_prot_ORF3 = simply_get_ORF(seq_nuc_ORF3,bash_codeUniversel) ### DEF 4 - Part 1 - ## | 144 seq_prot_ORF3 = simply_get_ORF(seq_nuc_ORF3,bash_codeUniversel) ### DEF 4 - Part 1 - ## |
| 145 seq_prot_ORF4 = simply_get_ORF(seq_nuc_ORF4,bash_codeUniversel) ### DEF 4 - Part 1 - ## | 145 seq_prot_ORF4 = simply_get_ORF(seq_nuc_ORF4,bash_codeUniversel) ### DEF 4 - Part 1 - ## |
| 146 seq_prot_ORF5 = simply_get_ORF(seq_nuc_ORF5,bash_codeUniversel) ### DEF 4 - Part 1 - ## | 146 seq_prot_ORF5 = simply_get_ORF(seq_nuc_ORF5,bash_codeUniversel) ### DEF 4 - Part 1 - ## |
| 147 seq_prot_ORF6 = simply_get_ORF(seq_nuc_ORF6,bash_codeUniversel) ### DEF 4 - Part 1 - ## | 147 seq_prot_ORF6 = simply_get_ORF(seq_nuc_ORF6,bash_codeUniversel) ### DEF 4 - Part 1 - ## |
| 148 | 148 |
| 149 LIST_6_ORF_aa = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3,seq_prot_ORF4,seq_prot_ORF5,seq_prot_ORF6] | 149 LIST_6_ORF_aa = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3,seq_prot_ORF4,seq_prot_ORF5,seq_prot_ORF6] |
| 150 bash_of_aligned_aa_seq_3ORF[fasta_name] = LIST_6_ORF_aa ### For each seq of the multialignment => give the 6 ORFs (in aa) | 150 bash_of_aligned_aa_seq_3ORF[fasta_name] = LIST_6_ORF_aa ### For each seq of the multialignment => give the 6 ORFs (in aa) |
| 151 | 151 |
| 152 ## 2 ## Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) | 152 ## 2 ## Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) |
| 153 BEST_MAX = 0 | 153 BEST_MAX = 0 |
| 154 for i in [0,1,2,3,4,5]: ### Test the 6 ORFs | 154 for i in [0,1,2,3,4,5]: ### Test the 6 ORFs |
| 155 ORF_Aligned_aa = [] | 155 ORF_Aligned_aa = [] |
| 156 ORF_Aligned_nuc = [] | 156 ORF_Aligned_nuc = [] |
| 157 | 157 |
| 158 | 158 |
| 159 ## 2.1 ## Get the alignment of sequence for a given ORF | 159 ## 2.1 ## Get the alignment of sequence for a given ORF |
| 160 ## Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd | 160 ## Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd |
| 161 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): | 161 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): |
| 162 ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i] | 162 ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i] |
| 163 aa_length = len(ORFsequence) | 163 aa_length = len(ORFsequence) |
| 164 ORF_Aligned_aa.append(ORFsequence) ### List of all sequences in the ORF nb "i" = | 164 ORF_Aligned_aa.append(ORFsequence) ### List of all sequences in the ORF nb "i" = |
| 165 | 165 |
| 166 n = i+1 | 166 n = i+1 |
| 167 | 167 |
| 168 for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys(): | 168 for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys(): |
| 169 ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i] | 169 ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i] |
| 170 nuc_length = len(ORFsequence) | 170 nuc_length = len(ORFsequence) |
| 171 ORF_Aligned_nuc.append(ORFsequence) ### List of all sequences in the ORF nb "i" = | 171 ORF_Aligned_nuc.append(ORFsequence) ### List of all sequences in the ORF nb "i" = |
| 172 | 172 |
| 173 ## 2.2 ## Get the list of sublist of positions whithout codon stop in the alignment | 173 ## 2.2 ## Get the list of sublist of positions whithout codon stop in the alignment |
| 174 ## For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF) | 174 ## For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF) |
| 175 ## Next step is to get the longuest subsequence whithout stop | 175 ## Next step is to get the longuest subsequence whithout stop |
| 176 ## We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*" | 176 ## We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*" |
| 177 MAX_LENGTH = 0 | 177 MAX_LENGTH = 0 |
| 178 LONGUEST_SEGMENT_UNSTOPPED = "" | 178 LONGUEST_SEGMENT_UNSTOPPED = "" |
| 179 j = 0 # Start from first position in alignment | 179 j = 0 # Start from first position in alignment |
| 180 List_of_List_subsequences = [] | 180 List_of_List_subsequences = [] |
| 181 List_positions_subsequence = [] | 181 List_positions_subsequence = [] |
| 182 while j < aa_length: | 182 while j < aa_length: |
| 183 column = [] | 183 column = [] |
| 184 for seq in ORF_Aligned_aa: | 184 for seq in ORF_Aligned_aa: |
| 185 column.append(seq[j]) | 185 column.append(seq[j]) |
| 186 j = j+1 | 186 j = j+1 |
| 187 if "*" in column: | 187 if "*" in column: |
| 188 List_of_List_subsequences.append(List_positions_subsequence) ## Add previous list of positions | 188 List_of_List_subsequences.append(List_positions_subsequence) ## Add previous list of positions |
| 189 List_positions_subsequence = [] ## Re-initialyse list of positions | 189 List_positions_subsequence = [] ## Re-initialyse list of positions |
| 190 else: | 190 else: |
| 191 List_positions_subsequence.append(j) | 191 List_positions_subsequence.append(j) |
| 192 | 192 |
| 193 ## 2.3 ## Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF) | 193 ## 2.3 ## Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF) |
| 194 LONGUEST_SUBSEQUENCE_LIST_POSITION = [] | 194 LONGUEST_SUBSEQUENCE_LIST_POSITION = [] |
| 195 MAX=0 | 195 MAX=0 |
| 196 for sublist in List_of_List_subsequences: | 196 for sublist in List_of_List_subsequences: |
| 197 if len(sublist) > MAX and len(sublist) > MINIMAL_CDS_LENGTH: | 197 if len(sublist) > MAX and len(sublist) > MINIMAL_CDS_LENGTH: |
| 198 MAX = len(sublist) | 198 MAX = len(sublist) |
| 199 LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist | 199 LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist |
| 200 | 200 |
| 201 ## 2.4. ## Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated) | 201 ## 2.4. ## Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated) |
| 202 if LONGUEST_SUBSEQUENCE_LIST_POSITION != []: | 202 if LONGUEST_SUBSEQUENCE_LIST_POSITION != []: |
| 203 if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0: | 203 if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0: |
| 204 CDS_maybe_truncated = 1 | 204 CDS_maybe_truncated = 1 |
| 205 else: | 205 else: |
| 206 CDS_maybe_truncated = 0 | 206 CDS_maybe_truncated = 0 |
| 207 else: | 207 else: |
| 208 CDS_maybe_truncated = 0 | 208 CDS_maybe_truncated = 0 |
| 209 | 209 |
| 210 | 210 |
| 211 ## 2.5 ## Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF) | 211 ## 2.5 ## Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF) |
| 212 ## Test whether it is the better ORF | 212 ## Test whether it is the better ORF |
| 213 if MAX > BEST_MAX: | 213 if MAX > BEST_MAX: |
| 214 BEST_MAX = MAX | 214 BEST_MAX = MAX |
| 237 ## 4 ## Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment | 237 ## 4 ## Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment |
| 238 pos_MIN_nuc = pos_MIN_aa * 3 | 238 pos_MIN_nuc = pos_MIN_aa * 3 |
| 239 pos_MAX_nuc = pos_MAX_aa * 3 | 239 pos_MAX_nuc = pos_MAX_aa * 3 |
| 240 | 240 |
| 241 BESTORF_bash_aligned_nc_seq = {} | 241 BESTORF_bash_aligned_nc_seq = {} |
| 242 BESTORF_bash_aligned_nc_seq_CODING = {} | 242 BESTORF_bash_aligned_nc_seq_CODING = {} |
| 243 for fasta_name in bash_aligned_nc_seq.keys(): | 243 for fasta_name in bash_aligned_nc_seq.keys(): |
| 244 seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF] | 244 seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF] |
| 245 seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc] | 245 seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc] |
| 246 BESTORF_bash_aligned_nc_seq[fasta_name] = seq | 246 BESTORF_bash_aligned_nc_seq[fasta_name] = seq |
| 247 BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding | 247 BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding |
| 257 ### Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa | 257 ### Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa |
| 258 ########################################################################################################### | 258 ########################################################################################################### |
| 259 | 259 |
| 260 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} | 260 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} |
| 261 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} | 261 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} |
| 262 | 262 |
| 263 Ortho = 0 | 263 Ortho = 0 |
| 264 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): | 264 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): |
| 265 seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] | 265 seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] |
| 266 Ortho = detect_Methionine(seq_aa, Ortho) ### DEF6 ### | 266 Ortho = detect_Methionine(seq_aa, Ortho) ### DEF6 ### |
| 267 | 267 |
| 269 if Ortho == 1: | 269 if Ortho == 1: |
| 270 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING | 270 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING |
| 271 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING | 271 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING |
| 272 | 272 |
| 273 ## CASE 2: in case the CDS is truncated, so the "M" is maybe missing: | 273 ## CASE 2: in case the CDS is truncated, so the "M" is maybe missing: |
| 274 if Ortho == 0 and CDS_maybe_truncated == 1: | 274 if Ortho == 0 and CDS_maybe_truncated == 1: |
| 275 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING | 275 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING |
| 276 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING | 276 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING |
| 277 | 277 |
| 278 ## CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa): | 278 ## CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa): |
| 279 ## => the 2 bash "CDS_with_M" are left empty ("{}") | 279 ## => the 2 bash "CDS_with_M" are left empty ("{}") |
| 280 | 280 |
| 281 return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M) | 281 return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M) |
| 282 ########################################################## | 282 ########################################################## |
| 283 | 283 |
| 284 | 284 |
| 285 ################################################################################################## | 285 ################################################################################################## |
| 286 ###### DEF 5 : Detect all indices corresponding to all occurance of a substring in a string ###### | 286 ###### DEF 5 : Detect all indices corresponding to all occurance of a substring in a string ###### |
| 295 return listindex | 295 return listindex |
| 296 ###################################################### | 296 ###################################################### |
| 297 | 297 |
| 298 | 298 |
| 299 ############################################################ | 299 ############################################################ |
| 300 ###### DEF 6 : Detect if methionin in the aa sequence ###### | 300 ###### DEF 6 : Detect if methionin in the aa sequence ###### |
| 301 ############################################################ | 301 ############################################################ |
| 302 def detect_Methionine(seq_aa, Ortho): | 302 def detect_Methionine(seq_aa, Ortho): |
| 303 | 303 |
| 304 ln = len(seq_aa) | 304 ln = len(seq_aa) |
| 305 nbre = sys.argv[2] | 305 nbre = sys.argv[2] |
| 306 CUTOFF_Last_50aa = ln - MINIMAL_CDS_LENGTH | 306 CUTOFF_Last_50aa = ln - MINIMAL_CDS_LENGTH |
| 307 #Ortho = 0 ## means orthologs not found | 307 #Ortho = 0 ## means orthologs not found |
| 308 | 308 |
| 309 ## Find all indices of occurances of "M" in a string of aa | 309 ## Find all indices of occurances of "M" in a string of aa |
| 310 list_indices = allindices(seq_aa, "M") ### DEF5 ### | 310 list_indices = allindices(seq_aa, "M") ### DEF5 ### |
| 311 | 311 |
| 312 ## If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS | 312 ## If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS |
| 313 if list_indices != []: | 313 if list_indices != []: |
| 314 first_M = list_indices[0] | 314 first_M = list_indices[0] |
| 315 if first_M < CUTOFF_Last_50aa: | 315 if first_M < CUTOFF_Last_50aa: |
| 316 Ortho = 1 ## means orthologs found | 316 Ortho = 1 ## means orthologs found |
| 317 | 317 |
| 318 return(Ortho) | 318 return(Ortho) |
| 319 ################################### | 319 ################################### |
| 322 | 322 |
| 323 | 323 |
| 324 | 324 |
| 325 | 325 |
| 326 ############################################################ | 326 ############################################################ |
| 327 ###### DEF 7 : Reverse complement DNA sequence ###### | 327 ###### DEF 7 : Reverse complement DNA sequence ###### |
| 328 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html | 328 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html |
| 329 ############################################################ | 329 ############################################################ |
| 330 | 330 |
| 331 | 331 |
| 332 def ReverseComplement2(seq): | 332 def ReverseComplement2(seq): |
| 342 ####################### | 342 ####################### |
| 343 ##### RUN RUN RUN ##### | 343 ##### RUN RUN RUN ##### |
| 344 ####################### | 344 ####################### |
| 345 import string, os, time, re, zipfile, sys | 345 import string, os, time, re, zipfile, sys |
| 346 | 346 |
| 347 infiles = sys.argv[1] | |
| 347 MINIMAL_CDS_LENGTH = int(sys.argv[3]) ## in aa number | 348 MINIMAL_CDS_LENGTH = int(sys.argv[3]) ## in aa number |
| 348 | 349 |
| 349 ## INPUT / OUTPUT | 350 ## INPUT / OUTPUT |
| 350 list_file = [] | 351 list_file = str.split(infiles,",") |
| 351 zfile = zipfile.ZipFile(sys.argv[1]) | 352 |
| 352 for name in zfile.namelist() : | 353 ### Get Universal Code |
| 353 list_file.append(name) | |
| 354 zfile.extract(name, "./") | |
| 355 | |
| 356 F2 = open(sys.argv[2], 'r') | 354 F2 = open(sys.argv[2], 'r') |
| 355 bash_codeUniversel = code_universel(F2) ### DEF2 ### | |
| 356 F2.close() | |
| 357 | 357 |
| 358 os.mkdir("04_BEST_ORF_nuc") | 358 os.mkdir("04_BEST_ORF_nuc") |
| 359 Path_OUT1 = "04_BEST_ORF_nuc" | 359 Path_OUT1 = "04_BEST_ORF_nuc" |
| 360 os.mkdir("04_BEST_ORF_aa") | 360 os.mkdir("04_BEST_ORF_aa") |
| 361 Path_OUT2 = "04_BEST_ORF_aa" | 361 Path_OUT2 = "04_BEST_ORF_aa" |
| 369 Path_OUT5 = "06_CDS_with_M_nuc" | 369 Path_OUT5 = "06_CDS_with_M_nuc" |
| 370 os.mkdir("06_CDS_with_M_aa") | 370 os.mkdir("06_CDS_with_M_aa") |
| 371 Path_OUT6 = "06_CDS_with_M_aa" | 371 Path_OUT6 = "06_CDS_with_M_aa" |
| 372 | 372 |
| 373 | 373 |
| 374 ### Get Universal Code | 374 |
| 375 bash_codeUniversel = code_universel(F2) ### DEF2 ### | |
| 376 F2.close() | |
| 377 | 375 |
| 378 ### Get the Bash corresponding to an alignment file in fasta format | 376 ### Get the Bash corresponding to an alignment file in fasta format |
| 379 count_file_processed = 0 | 377 count_file_processed = 0 |
| 380 count_file_with_CDS = 0 | 378 count_file_with_CDS = 0 |
| 381 count_file_without_CDS = 0 | 379 count_file_without_CDS = 0 |
| 382 count_file_with_CDS_plus_M = 0 | 380 count_file_with_CDS_plus_M = 0 |
| 383 | 381 |
| 384 for file in list_file: | 382 for file in list_file: |
| 385 count_file_processed = count_file_processed + 1 | 383 count_file_processed = count_file_processed + 1 |
| 386 fasta_file_path = "./%s" %file | 384 fasta_file_path = "./%s" %file |
| 387 bash_fasta = dico(fasta_file_path) ### DEF 1 ### | 385 bash_fasta = dico(fasta_file_path) ### DEF 1 ### |
| 388 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### | 386 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### |
| 389 | 387 |
| 390 ## a ## OUTPUT BESTORF_nuc | 388 ## a ## OUTPUT BESTORF_nuc |
| 391 if BESTORF_nuc != {}: | 389 if BESTORF_nuc != {}: |
| 392 count_file_with_CDS = count_file_with_CDS +1 | 390 count_file_with_CDS = count_file_with_CDS +1 |
| 393 OUT1 = open("%s/%s" %(Path_OUT1,file), "w") | 391 OUT1 = open("%s/%s" %(Path_OUT1,file), "w") |
| 396 OUT1.write("%s\n" %fasta_name) | 394 OUT1.write("%s\n" %fasta_name) |
| 397 OUT1.write("%s\n" %seq) | 395 OUT1.write("%s\n" %seq) |
| 398 OUT1.close() | 396 OUT1.close() |
| 399 else: | 397 else: |
| 400 count_file_without_CDS = count_file_without_CDS + 1 | 398 count_file_without_CDS = count_file_without_CDS + 1 |
| 401 | 399 |
| 402 | 400 |
| 403 ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!! | 401 ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!! |
| 404 if BESTORF_aa != {}: | 402 if BESTORF_aa != {}: |
| 405 OUT2 = open("%s/%s" %(Path_OUT2,file), "w") | 403 OUT2 = open("%s/%s" %(Path_OUT2,file), "w") |
| 406 for fasta_name in BESTORF_aa.keys(): | 404 for fasta_name in BESTORF_aa.keys(): |
| 407 seq = BESTORF_aa[fasta_name] | 405 seq = BESTORF_aa[fasta_name] |
| 408 OUT2.write("%s\n" %fasta_name) | 406 OUT2.write("%s\n" %fasta_name) |
| 409 OUT2.write("%s\n" %seq) | 407 OUT2.write("%s\n" %seq) |
| 410 OUT2.close() | 408 OUT2.close() |
| 411 | 409 |
| 412 ## c ## OUTPUT BESTORF_aa | 410 ## c ## OUTPUT BESTORF_aa |
| 413 if BESTORF_nuc_CODING != {}: | 411 if BESTORF_nuc_CODING != {}: |
| 414 OUT3 = open("%s/%s" %(Path_OUT3,file), "w") | 412 OUT3 = open("%s/%s" %(Path_OUT3,file), "w") |
| 415 for fasta_name in BESTORF_nuc_CODING.keys(): | 413 for fasta_name in BESTORF_nuc_CODING.keys(): |
| 423 OUT4 = open("%s/%s" %(Path_OUT4,file), "w") | 421 OUT4 = open("%s/%s" %(Path_OUT4,file), "w") |
| 424 for fasta_name in BESTORF_aa_CODING.keys(): | 422 for fasta_name in BESTORF_aa_CODING.keys(): |
| 425 seq = BESTORF_aa_CODING[fasta_name] | 423 seq = BESTORF_aa_CODING[fasta_name] |
| 426 OUT4.write("%s\n" %fasta_name) | 424 OUT4.write("%s\n" %fasta_name) |
| 427 OUT4.write("%s\n" %seq) | 425 OUT4.write("%s\n" %seq) |
| 428 OUT4.close() | 426 OUT4.close() |
| 429 | 427 |
| 430 ## e ## OUTPUT BESTORF_nuc_CDS_with_M | 428 ## e ## OUTPUT BESTORF_nuc_CDS_with_M |
| 431 if BESTORF_nuc_CDS_with_M != {}: | 429 if BESTORF_nuc_CDS_with_M != {}: |
| 432 count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1 | 430 count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1 |
| 433 OUT5 = open("%s/%s" %(Path_OUT5,file), "w") | 431 OUT5 = open("%s/%s" %(Path_OUT5,file), "w") |
| 434 for fasta_name in BESTORF_nuc_CDS_with_M.keys(): | 432 for fasta_name in BESTORF_nuc_CDS_with_M.keys(): |
| 435 seq = BESTORF_nuc_CDS_with_M[fasta_name] | 433 seq = BESTORF_nuc_CDS_with_M[fasta_name] |
| 436 OUT5.write("%s\n" %fasta_name) | 434 OUT5.write("%s\n" %fasta_name) |
| 437 OUT5.write("%s\n" %seq) | 435 OUT5.write("%s\n" %seq) |
| 438 OUT5.close() | 436 OUT5.close() |
| 439 | 437 |
| 440 ## f ## OUTPUT BESTORF_aa_CDS_with_M | 438 ## f ## OUTPUT BESTORF_aa_CDS_with_M |
| 441 if BESTORF_aa_CDS_with_M != {}: | 439 if BESTORF_aa_CDS_with_M != {}: |
| 442 OUT6 = open("%s/%s" %(Path_OUT6,file), "w") | 440 OUT6 = open("%s/%s" %(Path_OUT6,file), "w") |
| 443 for fasta_name in BESTORF_aa_CDS_with_M.keys(): | 441 for fasta_name in BESTORF_aa_CDS_with_M.keys(): |
| 444 seq = BESTORF_aa_CDS_with_M[fasta_name] | 442 seq = BESTORF_aa_CDS_with_M[fasta_name] |
| 445 OUT6.write("%s\n" %fasta_name) | 443 OUT6.write("%s\n" %fasta_name) |
| 446 OUT6.write("%s\n" %seq) | 444 OUT6.write("%s\n" %seq) |
| 447 OUT6.close() | 445 OUT6.close() |
| 448 | 446 |
| 449 os.system("rm -rf %s" %file) | 447 os.system("rm -rf %s" %file) |
| 450 | 448 |
| 451 ## Print | 449 ## Print |
| 452 print "*************** CDS detection ***************" | 450 print "*************** CDS detection ***************" |
| 453 print "\nFiles processed: %d" %count_file_processed | 451 print "\nFiles processed: %d" %count_file_processed |
| 454 print "\tFiles with CDS: %d" %count_file_with_CDS | 452 print "\tFiles with CDS: %d" %count_file_with_CDS |
| 455 print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M | 453 print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M |
| 456 print "\tFiles without CDS: %d \n" %count_file_without_CDS | 454 print "\tFiles without CDS: %d \n" %count_file_without_CDS |
| 457 print "" | 455 print "" |
| 458 | |
| 459 ## Zipfile | |
| 460 f_bestORF_nuc = zipfile.ZipFile("ORF_Search_bestORF_nuc.zip", "w") | |
| 461 f_bestORF_aa = zipfile.ZipFile("ORF_Search_bestORF_aa.zip", "w") | |
| 462 f_CDS_nuc = zipfile.ZipFile("ORF_Search_CDS_nuc.zip", "w") | |
| 463 f_CDS_aa = zipfile.ZipFile("ORF_Search_CDS_aa.zip", "w") | |
| 464 f_CDSM_nuc = zipfile.ZipFile("ORF_Search_CDSM_nuc.zip", "w") | |
| 465 f_CDSM_aa = zipfile.ZipFile("ORF_Search_CDSM_aa.zip", "w") | |
| 466 | |
| 467 os.chdir("%s" %Path_OUT1) | |
| 468 folder = os.listdir("./") | |
| 469 for i in folder : | |
| 470 f_bestORF_nuc.write("./%s" %i) | |
| 471 | |
| 472 os.chdir("../%s" %Path_OUT2) | |
| 473 folder = os.listdir("./") | |
| 474 for i in folder : | |
| 475 f_bestORF_aa.write("./%s" %i) | |
| 476 | |
| 477 os.chdir("../%s" %Path_OUT3) | |
| 478 folder = os.listdir("./") | |
| 479 for i in folder : | |
| 480 f_CDS_nuc.write("./%s" %i) | |
| 481 | |
| 482 os.chdir("../%s" %Path_OUT4) | |
| 483 folder = os.listdir("./") | |
| 484 for i in folder : | |
| 485 f_CDS_aa.write("./%s" %i) | |
| 486 | |
| 487 os.chdir("../%s" %Path_OUT5) | |
| 488 folder = os.listdir("./") | |
| 489 for i in folder : | |
| 490 f_CDSM_nuc.write("./%s" %i) | |
| 491 | |
| 492 os.chdir("../%s" %Path_OUT6) | |
| 493 folder = os.listdir("./") | |
| 494 for i in folder : | |
| 495 f_CDSM_aa.write("./%s" %i) |
