Mercurial > repos > abims-sbr > cds_search
comparison scripts/S01_find_orf_on_multiple_alignment.py @ 11:06a28df198b6 draft default tip
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1
| author | lecorguille |
|---|---|
| date | Mon, 24 Sep 2018 03:58:34 -0400 |
| parents | 3d00be2d05f3 |
| children |
comparison
equal
deleted
inserted
replaced
| 10:3d00be2d05f3 | 11:06a28df198b6 |
|---|---|
| 1 #!/usr/bin/python | 1 #!/usr/bin/env python |
| 2 # coding: utf8 | 2 # coding: utf8 |
| 3 ## Author: Eric Fontanillas | 3 # Author: Eric Fontanillas |
| 4 ## Modification: 03/09/14 by Julie BAFFARD | 4 # Modification: 03/09/14 by Julie BAFFARD |
| 5 ## Last modification : 05/03/18 by Victor Mataigne | 5 # Last modification : 25/07/18 by Victor Mataigne |
| 6 | 6 |
| 7 ## Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria | 7 # Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria |
| 8 ## CRITERIA 1 ## Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF | 8 # CRITERIA 1 - Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF |
| 9 ## CRITERIA 2 ## This longest part should be > 150nc or 50aa | 9 # CRITERIA 2 - This longest part should be > 150nc or 50aa |
| 10 ## CRITERIA 3 ## [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa | 10 # CRITERIA 3 - [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa |
| 11 ## OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA | 11 # OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA |
| 12 ## OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA | 12 # OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA |
| 13 | 13 |
| 14 #################################################### | 14 import string, os, time, re, zipfile, sys, argparse |
| 15 ###### DEF 2 : Create bash for genetic code ######## | 15 from dico import dico |
| 16 #################################################### | |
| 17 ### KEY = codon | |
| 18 ### VALUE = Amino Acid | |
| 19 | 16 |
| 20 def code_universel(F1): | 17 def code_universel(F1): |
| 18 """ Creates bash for genetic code (key : codon ; value : amino-acid) """ | |
| 21 bash_codeUniversel = {} | 19 bash_codeUniversel = {} |
| 20 | |
| 22 with open(F1, "r") as file: | 21 with open(F1, "r") as file: |
| 23 for line in file.readlines(): | 22 for line in file.readlines(): |
| 24 L1 = string.split(line, " ") | 23 L1 = string.split(line, " ") |
| 25 length1 = len(L1) | 24 length1 = len(L1) |
| 26 if length1 == 3: | 25 if length1 == 3: |
| 29 bash_codeUniversel[key] = value | 28 bash_codeUniversel[key] = value |
| 30 else: | 29 else: |
| 31 key = L1[0] | 30 key = L1[0] |
| 32 value = L1[2] | 31 value = L1[2] |
| 33 bash_codeUniversel[key] = value | 32 bash_codeUniversel[key] = value |
| 33 | |
| 34 return(bash_codeUniversel) | 34 return(bash_codeUniversel) |
| 35 ########################################################### | 35 |
| 36 | |
| 37 | |
| 38 ###################################################################################################################### | |
| 39 ##### DEF 3 : Test if the sequence is a multiple of 3, and if not correct the sequence to become a multiple of 3 ##### | |
| 40 ###################################################################################################################### | |
| 41 ### WEAKNESS OF THAT APPROACH = I remove extra base(s) at the end of the sequence ==> I can lost a codon, when I test ORF (as I will decay the ORF) | |
| 42 def multiple3(seq): | 36 def multiple3(seq): |
| 43 leng = len(seq) | 37 """ Tests if the sequence is a multiple of 3, and if not removes extra-bases |
| 44 modulo = leng%3 | 38 !! Possible to lost a codon, when I test ORF (as I will decay the ORF) """ |
| 45 if modulo == 0: # the results of dividing leng per 3 is an integer | 39 |
| 46 new_seq = seq | 40 m = len(seq)%3 |
| 47 elif modulo == 1: # means 1 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) | 41 if m != 0 : |
| 48 new_seq = seq[:-1] # remove the last nc | 42 return seq[:-m], m |
| 49 elif modulo == 2: # means 2 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) | 43 else : |
| 50 new_seq = seq[:-2] # remove the 2 last nc | 44 return seq, m |
| 51 len1 = len(new_seq) | 45 |
| 52 return(new_seq, modulo) | 46 def detect_Methionine(seq_aa, Ortho, minimal_cds_length): |
| 53 ########################################################## | 47 """ Detects if methionin in the aa sequence """ |
| 54 | 48 |
| 55 | 49 ln = len(seq_aa) |
| 56 ############################# | 50 CUTOFF_Last_50aa = ln - minimal_cds_length |
| 57 ###### DEF 4 : GET ORF ###### | 51 |
| 58 ############################# | 52 # Find all indices of occurances of "M" in a string of aa |
| 59 ##- MULTIPLE SEQUENCE BASED : Based on ALIGNMENT of several sequences | 53 list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"] |
| 60 ##- CRITERIA1: Get the segment in the alignment with no codon stop | 54 |
| 61 | 55 # If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS |
| 62 | 56 if list_indices != []: |
| 63 ###### DEF 4 - Part 1 - ###### | 57 first_M = list_indices[0] |
| 64 ############################## | 58 if first_M < CUTOFF_Last_50aa: |
| 65 def simply_get_ORF(seq_dna, bash_codeUniversel): | 59 Ortho = 1 # means orthologs found |
| 66 seq_aa = "" | 60 |
| 67 i = 0 | 61 return(Ortho) |
| 68 len1 = len(seq_dna) | 62 |
| 69 while i < len1: | 63 def ReverseComplement2(seq): |
| 70 base1 = seq_dna[i] | 64 """ Reverse complement DNA sequence """ |
| 71 base1 = string.capitalize(base1) | 65 seq1 = 'ATCGN-TAGCN-atcgn-tagcn-' |
| 72 base2 = seq_dna[i+1] | 66 seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 } |
| 73 base2 = string.capitalize(base2) | 67 |
| 74 base3 = seq_dna[i+2] | 68 return "".join([seq_dict[base] for base in reversed(seq)]) |
| 75 base3 = string.capitalize(base3) | 69 |
| 76 | 70 def simply_get_ORF(seq_dna, gen_code): |
| 77 codon = base1+base2+base3 | 71 seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i+3] for i in range(0, len(seq_dna), 3)] |
| 78 codon = string.replace(codon, "T", "U") | 72 seq_by_aa = [gen_code[codon] if codon in gen_code.keys() else '?' for codon in seq_by_codons] |
| 79 | 73 |
| 80 if codon in bash_codeUniversel.keys(): | 74 return ''.join(seq_by_aa) |
| 81 aa = bash_codeUniversel[codon] | 75 |
| 82 seq_aa = seq_aa + aa | 76 def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel, minimal_cds_length, min_spec): |
| 83 else: | 77 # Multiple sequence based : Based on the alignment of several sequences (orthogroup) |
| 84 seq_aa = seq_aa +"?" ### Take account for gap "-" and "N" | 78 # Criteria 1 : Get the segment in the alignment with no codon stop |
| 85 i = i + 3 | 79 |
| 86 | 80 # 1 - Get the list of aligned aa seq for the 3 ORF: |
| 87 return(seq_aa) | |
| 88 ########################################################## | |
| 89 | |
| 90 | |
| 91 ###### DEF 4 - Part 2 - ###### | |
| 92 ############################## | |
| 93 def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel): | |
| 94 | |
| 95 ## 1 ## Get the list of aligned aa seq for the 3 ORF: | |
| 96 bash_of_aligned_aa_seq_3ORF = {} | 81 bash_of_aligned_aa_seq_3ORF = {} |
| 97 bash_of_aligned_nuc_seq_3ORF = {} | 82 bash_of_aligned_nuc_seq_3ORF = {} |
| 98 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = [] | 83 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = [] |
| 84 | |
| 99 for fasta_name in bash_aligned_nc_seq.keys(): | 85 for fasta_name in bash_aligned_nc_seq.keys(): |
| 100 ## 1.1. ## Get the raw sequence | 86 # Get sequence, chek if multiple 3, then get 6 orfs |
| 101 sequence_nc = bash_aligned_nc_seq[fasta_name] | 87 sequence_nc = bash_aligned_nc_seq[fasta_name] |
| 102 | 88 new_sequence_nc, modulo = multiple3(sequence_nc) |
| 103 ## 1.2. ## Check whether the sequence is multiple of 3, and correct it if not: | 89 new_sequence_rev = ReverseComplement2(new_sequence_nc) |
| 104 new_sequence_nc, modulo = multiple3(sequence_nc) ### DEF 3 ### | 90 # For each seq of the multialignment => give the 6 ORFs (in nuc) |
| 105 | 91 bash_of_aligned_nuc_seq_3ORF[fasta_name] = [new_sequence_nc, new_sequence_nc[1:-2], new_sequence_nc[2:-1], new_sequence_rev, new_sequence_rev[1:-2], new_sequence_rev[2:-1]] |
| 106 ## 1.3. ## Get the 3 ORFs (nuc) for each sequence | 92 |
| 107 seq_nuc_ORF1 = new_sequence_nc | 93 seq_prot_ORF1 = simply_get_ORF(new_sequence_nc, bash_codeUniversel) |
| 108 seq_nuc_ORF2 = new_sequence_nc[1:-2] | 94 seq_prot_ORF2 = simply_get_ORF(new_sequence_nc[1:-2], bash_codeUniversel) |
| 109 seq_nuc_ORF3 = new_sequence_nc[2:-1] | 95 seq_prot_ORF3 = simply_get_ORF(new_sequence_nc[2:-1], bash_codeUniversel) |
| 110 seq_reversed=ReverseComplement2(seq_nuc_ORF1) | 96 seq_prot_ORF4 = simply_get_ORF(new_sequence_rev, bash_codeUniversel) |
| 111 seq_nuc_ORF4=seq_reversed | 97 seq_prot_ORF5 = simply_get_ORF(new_sequence_rev[1:-2], bash_codeUniversel) |
| 112 seq_nuc_ORF5=seq_reversed[1:-2] | 98 seq_prot_ORF6 = simply_get_ORF(new_sequence_rev[2:-1], bash_codeUniversel) |
| 113 seq_nuc_ORF6=seq_reversed[2:-1] | 99 |
| 114 | 100 # For each seq of the multialignment => give the 6 ORFs (in aa) |
| 115 LIST_6_ORF_nuc = [seq_nuc_ORF1, seq_nuc_ORF2, seq_nuc_ORF3,seq_nuc_ORF4,seq_nuc_ORF5,seq_nuc_ORF6] | 101 bash_of_aligned_aa_seq_3ORF[fasta_name] = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3, seq_prot_ORF4, seq_prot_ORF5, seq_prot_ORF6] |
| 116 bash_of_aligned_nuc_seq_3ORF[fasta_name] = LIST_6_ORF_nuc ### For each seq of the multialignment => give the 6 ORFs (in nuc) | 102 |
| 117 | 103 # 2 - Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) |
| 118 ## 1.4. ## Get the 3 ORFs (aa) for each sequence | |
| 119 seq_prot_ORF1 = simply_get_ORF(seq_nuc_ORF1,bash_codeUniversel) ### DEF 4 - Part 1 - ## | |
| 120 seq_prot_ORF2 = simply_get_ORF(seq_nuc_ORF2,bash_codeUniversel) ### DEF 4 - Part 1 - ## | |
| 121 seq_prot_ORF3 = simply_get_ORF(seq_nuc_ORF3,bash_codeUniversel) ### DEF 4 - Part 1 - ## | |
| 122 seq_prot_ORF4 = simply_get_ORF(seq_nuc_ORF4,bash_codeUniversel) ### DEF 4 - Part 1 - ## | |
| 123 seq_prot_ORF5 = simply_get_ORF(seq_nuc_ORF5,bash_codeUniversel) ### DEF 4 - Part 1 - ## | |
| 124 seq_prot_ORF6 = simply_get_ORF(seq_nuc_ORF6,bash_codeUniversel) ### DEF 4 - Part 1 - ## | |
| 125 | |
| 126 LIST_6_ORF_aa = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3,seq_prot_ORF4,seq_prot_ORF5,seq_prot_ORF6] | |
| 127 bash_of_aligned_aa_seq_3ORF[fasta_name] = LIST_6_ORF_aa ### For each seq of the multialignment => give the 6 ORFs (in aa) | |
| 128 | |
| 129 ## 2 ## Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) | |
| 130 BEST_MAX = 0 | 104 BEST_MAX = 0 |
| 131 for i in [0,1,2,3,4,5]: ### Test the 6 ORFs | 105 |
| 106 for i in [0,1,2,3,4,5]: # Test the 6 ORFs | |
| 132 ORF_Aligned_aa = [] | 107 ORF_Aligned_aa = [] |
| 133 ORF_Aligned_nuc = [] | 108 ORF_Aligned_nuc = [] |
| 134 | 109 |
| 135 | 110 # 2.1 - Get the alignment of sequence for a given ORF |
| 136 ## 2.1 ## Get the alignment of sequence for a given ORF | 111 # Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd |
| 137 ## Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd | |
| 138 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): | 112 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): |
| 139 ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i] | 113 ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i] |
| 140 aa_length = len(ORFsequence) | 114 aa_length = len(ORFsequence) |
| 141 ORF_Aligned_aa.append(ORFsequence) ### List of all sequences in the ORF nb "i" = | 115 ORF_Aligned_aa.append(ORFsequence) ### List of all sequences in the ORF nb "i" = |
| 142 | 116 |
| 143 n = i+1 | 117 n = i+1 |
| 144 | 118 |
| 145 for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys(): | 119 for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys(): |
| 146 ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i] | 120 ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i] |
| 147 nuc_length = len(ORFsequence) | 121 nuc_length = len(ORFsequence) |
| 148 ORF_Aligned_nuc.append(ORFsequence) ### List of all sequences in the ORF nb "i" = | 122 ORF_Aligned_nuc.append(ORFsequence) # List of all sequences in the ORF nb "i" = |
| 149 | 123 |
| 150 ## 2.2 ## Get the list of sublist of positions whithout codon stop in the alignment | 124 # 2.2 - Get the list of sublist of positions whithout codon stop in the alignment |
| 151 ## For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF) | 125 # For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF) |
| 152 ## Next step is to get the longuest subsequence whithout stop | 126 # Next step is to get the longuest subsequence whithout stop |
| 153 ## We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*" | 127 # We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*" |
| 154 MAX_LENGTH = 0 | 128 MAX_LENGTH = 0 |
| 155 LONGUEST_SEGMENT_UNSTOPPED = "" | 129 LONGUEST_SEGMENT_UNSTOPPED = "" |
| 156 j = 0 # Start from first position in alignment | 130 j = 0 # Start from first position in alignment |
| 157 List_of_List_subsequences = [] | 131 List_of_List_subsequences = [] |
| 158 List_positions_subsequence = [] | 132 List_positions_subsequence = [] |
| 160 column = [] | 134 column = [] |
| 161 for seq in ORF_Aligned_aa: | 135 for seq in ORF_Aligned_aa: |
| 162 column.append(seq[j]) | 136 column.append(seq[j]) |
| 163 j = j+1 | 137 j = j+1 |
| 164 if "*" in column: | 138 if "*" in column: |
| 165 List_of_List_subsequences.append(List_positions_subsequence) ## Add previous list of positions | 139 List_of_List_subsequences.append(List_positions_subsequence) # Add previous list of positions |
| 166 List_positions_subsequence = [] ## Re-initialyse list of positions | 140 List_positions_subsequence = [] # Re-initialyse list of positions |
| 167 else: | 141 else: |
| 168 List_positions_subsequence.append(j) | 142 List_positions_subsequence.append(j) |
| 169 | 143 |
| 170 ## 2.3 ## Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF) | 144 # 2.3 - Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF) |
| 171 LONGUEST_SUBSEQUENCE_LIST_POSITION = [] | 145 LONGUEST_SUBSEQUENCE_LIST_POSITION = [] |
| 172 MAX=0 | 146 MAX=0 |
| 173 for sublist in List_of_List_subsequences: | 147 for sublist in List_of_List_subsequences: |
| 174 if len(sublist) > MAX and len(sublist) > MINIMAL_CDS_LENGTH: | 148 if len(sublist) > MAX and len(sublist) > minimal_cds_length: |
| 175 MAX = len(sublist) | 149 MAX = len(sublist) |
| 176 LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist | 150 LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist |
| 177 | 151 |
| 178 ## 2.4. ## Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated) | 152 # 2.4. - Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated) |
| 179 if LONGUEST_SUBSEQUENCE_LIST_POSITION != []: | 153 if LONGUEST_SUBSEQUENCE_LIST_POSITION != []: |
| 180 if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0: | 154 if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0: |
| 181 CDS_maybe_truncated = 1 | 155 CDS_maybe_truncated = 1 |
| 182 else: | 156 else: |
| 183 CDS_maybe_truncated = 0 | 157 CDS_maybe_truncated = 0 |
| 184 else: | 158 else: |
| 185 CDS_maybe_truncated = 0 | 159 CDS_maybe_truncated = 0 |
| 186 | 160 |
| 187 | 161 |
| 188 ## 2.5 ## Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF) | 162 # 2.5 - Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF) |
| 189 ## Test whether it is the better ORF | 163 # Test whether it is the better ORF |
| 190 if MAX > BEST_MAX: | 164 if MAX > BEST_MAX: |
| 191 BEST_MAX = MAX | 165 BEST_MAX = MAX |
| 192 BEST_ORF = i+1 | 166 BEST_ORF = i+1 |
| 193 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = LONGUEST_SUBSEQUENCE_LIST_POSITION | 167 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = LONGUEST_SUBSEQUENCE_LIST_POSITION |
| 194 | 168 |
| 195 | 169 |
| 196 ## 3 ## ONCE we have this better segment (BEST CODING SEGMENT) | 170 # 3 - ONCE we have this better segment (BEST CODING SEGMENT) |
| 197 ## ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position) | 171 # ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position) |
| 198 ## And get the INDEX of the best ORF [0, 1, or 2] | 172 # And get the INDEX of the best ORF [0, 1, or 2] |
| 199 if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []: | 173 if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []: |
| 200 pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0] | 174 pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0] |
| 201 pos_MIN_aa = pos_MIN_aa - 1 | 175 pos_MIN_aa = pos_MIN_aa - 1 |
| 202 pos_MAX_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[-1] | 176 pos_MAX_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[-1] |
| 203 | 177 |
| 204 | 178 |
| 205 BESTORF_bash_of_aligned_aa_seq = {} | 179 BESTORF_bash_of_aligned_aa_seq = {} |
| 206 BESTORF_bash_of_aligned_aa_seq_CODING = {} | 180 BESTORF_bash_of_aligned_aa_seq_CODING = {} |
| 207 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): | 181 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): |
| 208 index_BEST_ORF = BEST_ORF-1 ### cause list going from 0 to 2 in LIST_3_ORF, while the ORF nb is indexed from 1 to 3 | 182 index_BEST_ORF = BEST_ORF-1 # cause list going from 0 to 2 in LIST_3_ORF, while the ORF nb is indexed from 1 to 3 |
| 209 seq = bash_of_aligned_aa_seq_3ORF[fasta_name][index_BEST_ORF] | 183 seq = bash_of_aligned_aa_seq_3ORF[fasta_name][index_BEST_ORF] |
| 210 seq_coding = seq[pos_MIN_aa:pos_MAX_aa] | 184 seq_coding = seq[pos_MIN_aa:pos_MAX_aa] |
| 211 BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq | 185 BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq |
| 212 BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding | 186 BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding |
| 213 | 187 |
| 214 ## 4 ## Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment | 188 # 4 - Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment |
| 215 pos_MIN_nuc = pos_MIN_aa * 3 | 189 pos_MIN_nuc = pos_MIN_aa * 3 |
| 216 pos_MAX_nuc = pos_MAX_aa * 3 | 190 pos_MAX_nuc = pos_MAX_aa * 3 |
| 217 | 191 |
| 218 BESTORF_bash_aligned_nc_seq = {} | 192 BESTORF_bash_aligned_nc_seq = {} |
| 219 BESTORF_bash_aligned_nc_seq_CODING = {} | 193 BESTORF_bash_aligned_nc_seq_CODING = {} |
| 221 seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF] | 195 seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF] |
| 222 seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc] | 196 seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc] |
| 223 BESTORF_bash_aligned_nc_seq[fasta_name] = seq | 197 BESTORF_bash_aligned_nc_seq[fasta_name] = seq |
| 224 BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding | 198 BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding |
| 225 | 199 |
| 226 else: ### no CDS found ### | 200 else: # no CDS found |
| 227 BESTORF_bash_aligned_nc_seq = {} | 201 BESTORF_bash_aligned_nc_seq = {} |
| 228 BESTORF_bash_aligned_nc_seq_CODING = {} | 202 BESTORF_bash_aligned_nc_seq_CODING = {} |
| 229 BESTORF_bash_of_aligned_aa_seq = {} | 203 BESTORF_bash_of_aligned_aa_seq = {} |
| 230 BESTORF_bash_of_aligned_aa_seq_CODING ={} | 204 BESTORF_bash_of_aligned_aa_seq_CODING ={} |
| 231 | 205 |
| 232 | 206 # Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa |
| 233 | 207 |
| 234 ### Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa | |
| 235 ########################################################################################################### | |
| 236 | |
| 237 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} | 208 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} |
| 238 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} | 209 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} |
| 239 | 210 |
| 240 Ortho = 0 | 211 Ortho = 0 |
| 241 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): | 212 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): |
| 242 seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] | 213 seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] |
| 243 Ortho = detect_Methionine(seq_aa, Ortho) ### DEF6 ### | 214 Ortho = detect_Methionine(seq_aa, Ortho, minimal_cds_length) ### DEF6 ### |
| 244 | 215 |
| 245 ## CASE 1: A "M" is present and correctly localized (not in last 50 aa) | 216 # CASE 1: A "M" is present and correctly localized (not in last 50 aa) |
| 246 if Ortho == 1: | 217 if Ortho == 1: |
| 247 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING | 218 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING |
| 248 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING | 219 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING |
| 249 | 220 |
| 250 ## CASE 2: in case the CDS is truncated, so the "M" is maybe missing: | 221 # CASE 2: in case the CDS is truncated, so the "M" is maybe missing: |
| 251 if Ortho == 0 and CDS_maybe_truncated == 1: | 222 if Ortho == 0 and CDS_maybe_truncated == 1: |
| 252 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING | 223 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING |
| 253 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING | 224 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING |
| 254 | 225 |
| 255 ## CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa): | 226 # CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa): |
| 256 ## => the 2 bash "CDS_with_M" are left empty ("{}") | 227 ## => the 2 bash "CDS_with_M" are left empty ("{}") |
| 257 | 228 |
| 258 return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M) | 229 return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M) |
| 259 ########################################################## | 230 |
| 260 | 231 def write_output_file(results_dict, name_elems, path_out): |
| 261 | 232 if results_dict != {}: |
| 262 ################################################################################################## | 233 name_elems[3] = str(len(results_dict.keys())) |
| 263 ###### DEF 5 : Detect all indices corresponding to all occurance of a substring in a string ###### | 234 new_name = "_".join(name_elems) |
| 264 ################################################################################################## | 235 |
| 265 def allindices(string, sub): | 236 out1 = open("%s/%s" %(path_out,new_name), "w") |
| 266 listindex=[] | 237 for fasta_name in results_dict.keys(): |
| 267 offset=0 | 238 seq = results_dict[fasta_name] |
| 268 i = string.find(sub, offset) | 239 out1.write("%s\n" %fasta_name) |
| 269 while i >= 0: | 240 out1.write("%s\n" %seq) |
| 270 listindex.append(i) | 241 out1.close() |
| 271 i = string.find(sub, i + 1) | 242 |
| 272 return listindex | 243 def main(): |
| 273 ###################################################### | 244 parser = argparse.ArgumentParser() |
| 274 | 245 parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt") |
| 275 | 246 parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int) |
| 276 ############################################################ | 247 parser.add_argument("min_spec", help="Minimal number of species per alignment") |
| 277 ###### DEF 6 : Detect if methionin in the aa sequence ###### | 248 parser.add_argument("list_files", help="File with all input files names") |
| 278 ############################################################ | 249 args = parser.parse_args() |
| 279 def detect_Methionine(seq_aa, Ortho): | 250 |
| 280 | 251 minimal_cds_length = int(args.min_cds_len) # in aa number |
| 281 ln = len(seq_aa) | 252 bash_codeUniversel = code_universel(args.codeUniversel) |
| 282 nbre = sys.argv[2] | 253 minimum_species = int(args.min_spec) |
| 283 CUTOFF_Last_50aa = ln - MINIMAL_CDS_LENGTH | |
| 284 #Ortho = 0 ## means orthologs not found | |
| 285 | |
| 286 ## Find all indices of occurances of "M" in a string of aa | |
| 287 list_indices = allindices(seq_aa, "M") ### DEF5 ### | |
| 288 | |
| 289 ## If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS | |
| 290 if list_indices != []: | |
| 291 first_M = list_indices[0] | |
| 292 if first_M < CUTOFF_Last_50aa: | |
| 293 Ortho = 1 ## means orthologs found | |
| 294 | |
| 295 return(Ortho) | |
| 296 ################################### | |
| 297 | |
| 298 | |
| 299 ############################################################ | |
| 300 ###### DEF 7 : Reverse complement DNA sequence ###### | |
| 301 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html | |
| 302 ############################################################ | |
| 303 def ReverseComplement2(seq): | |
| 304 # too lazy to construct the dictionary manually, use a dict comprehension | |
| 305 seq1 = 'ATCGN-TAGCN-atcgn-tagcn-' | |
| 306 seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 } | |
| 307 return "".join([seq_dict[base] for base in reversed(seq)]) | |
| 308 ############################ | |
| 309 | |
| 310 | |
| 311 ####################### | |
| 312 ##### RUN RUN RUN ##### | |
| 313 ####################### | |
| 314 import string, os, time, re, zipfile, sys | |
| 315 from dico import dico | |
| 316 | |
| 317 MINIMAL_CDS_LENGTH = int(sys.argv[2]) ## in aa number | |
| 318 | |
| 319 ### Get Universal Code | |
| 320 bash_codeUniversel = code_universel(sys.argv[1]) ### DEF2 ### | |
| 321 | |
| 322 ## INPUT from file containing list of species | |
| 323 list_files = [] | |
| 324 with open(sys.argv[3], 'r') as f: | |
| 325 for line in f.readlines(): | |
| 326 list_files.append(line.strip('\n')) | |
| 327 | |
| 328 os.mkdir("04_BEST_ORF_nuc") | |
| 329 Path_OUT1 = "04_BEST_ORF_nuc" | |
| 330 os.mkdir("04_BEST_ORF_aa") | |
| 331 Path_OUT2 = "04_BEST_ORF_aa" | |
| 332 | |
| 333 os.mkdir("05_CDS_nuc") | |
| 334 Path_OUT3 = "05_CDS_nuc" | |
| 335 os.mkdir("05_CDS_aa") | |
| 336 Path_OUT4 = "05_CDS_aa" | |
| 337 | |
| 338 os.mkdir("06_CDS_with_M_nuc") | |
| 339 Path_OUT5 = "06_CDS_with_M_nuc" | |
| 340 os.mkdir("06_CDS_with_M_aa") | |
| 341 Path_OUT6 = "06_CDS_with_M_aa" | |
| 342 | |
| 343 ### Get the Bash corresponding to an alignment file in fasta format | |
| 344 count_file_processed = 0 | |
| 345 count_file_with_CDS = 0 | |
| 346 count_file_without_CDS = 0 | |
| 347 count_file_with_CDS_plus_M = 0 | |
| 348 | |
| 349 # ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name), | |
| 350 # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified. | |
| 351 | |
| 352 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"] | |
| 353 | |
| 354 # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond | |
| 355 #n0 = 0 | |
| 356 | |
| 357 for file in list_files: | |
| 358 #n0 += 1 | |
| 359 | |
| 360 count_file_processed = count_file_processed + 1 | |
| 361 nb_gp = file.split('_')[1] # Keep trace of the orthogroup number | |
| 362 fasta_file_path = "./%s" %file | |
| 363 bash_fasta = dico(fasta_file_path) ### DEF 1 ### | |
| 364 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### | |
| 365 | 254 |
| 366 name_elems[1] = nb_gp | 255 # Inputs from file containing list of species |
| 367 | 256 list_files = [] |
| 368 ## a ## OUTPUT BESTORF_nuc | 257 with open(args.list_files, 'r') as f: |
| 369 if BESTORF_nuc != {}: | 258 for line in f.readlines(): |
| 370 name_elems[3] = str(len(BESTORF_nuc.keys())) | 259 list_files.append(line.strip('\n')) |
| 371 new_name = "_".join(name_elems) | 260 |
| 372 count_file_with_CDS = count_file_with_CDS +1 | 261 # Directories for results |
| 373 OUT1 = open("%s/%s" %(Path_OUT1,new_name), "w") | 262 dirs = ["04_BEST_ORF_nuc", "04_BEST_ORF_aa", "05_CDS_nuc", "05_CDS_aa", "06_CDS_with_M_nuc", "06_CDS_with_M_aa"] |
| 374 for fasta_name in BESTORF_nuc.keys(): | 263 for directory in dirs: |
| 375 seq = BESTORF_nuc[fasta_name] | 264 os.mkdir(directory) |
| 376 OUT1.write("%s\n" %fasta_name) | 265 |
| 377 OUT1.write("%s\n" %seq) | 266 count_file_processed, count_file_with_CDS, count_file_without_CDS, count_file_with_CDS_plus_M = 0, 0, 0, 0 |
| 378 OUT1.close() | 267 count_file_with_cds_and_enought_species, count_file_with_cds_M_and_enought_species = 0, 0 |
| 379 else: | 268 |
| 380 count_file_without_CDS = count_file_without_CDS + 1 | 269 # ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name), |
| 381 | 270 # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified. |
| 382 ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!! | 271 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"] |
| 383 if BESTORF_aa != {}: | 272 |
| 384 name_elems[3] = str(len(BESTORF_aa.keys())) | 273 # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond |
| 385 new_name = "_".join(name_elems) | 274 #n0 = 0 |
| 386 OUT2 = open("%s/%s" %(Path_OUT2,new_name), "w") | 275 |
| 387 for fasta_name in BESTORF_aa.keys(): | 276 for file in list_files: |
| 388 seq = BESTORF_aa[fasta_name] | 277 #n0 += 1 |
| 389 OUT2.write("%s\n" %fasta_name) | 278 |
| 390 OUT2.write("%s\n" %seq) | 279 count_file_processed = count_file_processed + 1 |
| 391 OUT2.close() | 280 nb_gp = file.split('_')[1] # Keep trace of the orthogroup number |
| 392 | 281 fasta_file_path = "./%s" %file |
| 393 ## c ## OUTPUT BESTORF_aa | 282 bash_fasta = dico(fasta_file_path) |
| 394 if BESTORF_nuc_CODING != {}: | 283 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species) |
| 395 name_elems[3] = str(len(BESTORF_nuc_CODING.keys())) | 284 |
| 396 new_name = "_".join(name_elems) | 285 name_elems[1] = nb_gp |
| 397 OUT3 = open("%s/%s" %(Path_OUT3,new_name), "w") | 286 |
| 398 for fasta_name in BESTORF_nuc_CODING.keys(): | 287 # Update counts and write group in corresponding output directory |
| 399 seq = BESTORF_nuc_CODING[fasta_name] | 288 if BESTORF_nuc != {}: |
| 400 OUT3.write("%s\n" %fasta_name) | 289 count_file_with_CDS += 1 |
| 401 OUT3.write("%s\n" %seq) | 290 if len(BESTORF_nuc.keys()) >= minimum_species : |
| 402 OUT3.close() | 291 count_file_with_cds_and_enought_species += 1 |
| 403 | 292 write_output_file(BESTORF_nuc, name_elems, dirs[0]) # OUTPUT BESTORF_nuc |
| 404 ## d ## OUTPUT BESTORF_aa_CODING | 293 write_output_file(BESTORF_aa, name_elems, dirs[1]) # The most interesting |
| 405 if BESTORF_aa_CODING != {}: | 294 else: |
| 406 name_elems[3] = str(len(BESTORF_aa_CODING.keys())) | 295 count_file_without_CDS += 1 |
| 407 new_name = "_".join(name_elems) | 296 |
| 408 OUT4 = open("%s/%s" %(Path_OUT4,new_name), "w") | 297 if BESTORF_nuc_CODING != {} and len(BESTORF_nuc_CODING.keys()) >= minimum_species: |
| 409 for fasta_name in BESTORF_aa_CODING.keys(): | 298 write_output_file(BESTORF_nuc_CODING, name_elems, dirs[2]) |
| 410 seq = BESTORF_aa_CODING[fasta_name] | 299 write_output_file(BESTORF_aa_CODING, name_elems, dirs[3]) |
| 411 OUT4.write("%s\n" %fasta_name) | 300 |
| 412 OUT4.write("%s\n" %seq) | 301 if BESTORF_nuc_CDS_with_M != {}: |
| 413 OUT4.close() | 302 count_file_with_CDS_plus_M += 1 |
| 414 | 303 if len(BESTORF_nuc_CDS_with_M.keys()) >= minimum_species : |
| 415 ## e ## OUTPUT BESTORF_nuc_CDS_with_M | 304 count_file_with_cds_M_and_enought_species += 1 |
| 416 if BESTORF_nuc_CDS_with_M != {}: | 305 write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4]) |
| 417 name_elems[3] = str(len(BESTORF_nuc_CDS_with_M.keys())) | 306 write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5]) |
| 418 new_name = "_".join(name_elems) | 307 |
| 419 count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1 | 308 print "*************** CDS detection ***************" |
| 420 OUT5 = open("%s/%s" %(Path_OUT5,new_name), "w") | 309 print "\nFiles processed: %d" %count_file_processed |
| 421 for fasta_name in BESTORF_nuc_CDS_with_M.keys(): | 310 print "\tFiles with CDS: %d" %count_file_with_CDS |
| 422 seq = BESTORF_nuc_CDS_with_M[fasta_name] | 311 print "\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species) |
| 423 OUT5.write("%s\n" %fasta_name) | 312 print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M |
| 424 OUT5.write("%s\n" %seq) | 313 print "\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) |
| 425 OUT5.close() | 314 print "\tFiles without CDS: %d \n" %count_file_without_CDS |
| 426 | 315 print "" |
| 427 ## f ## OUTPUT BESTORF_aa_CDS_with_M | 316 |
| 428 if BESTORF_aa_CDS_with_M != {}: | 317 if __name__ == '__main__': |
| 429 name_elems[3] = str(len(BESTORF_aa_CDS_with_M.keys())) | 318 main() |
| 430 new_name = "_".join(name_elems) | |
| 431 OUT6 = open("%s/%s" %(Path_OUT6,new_name), "w") | |
| 432 for fasta_name in BESTORF_aa_CDS_with_M.keys(): | |
| 433 seq = BESTORF_aa_CDS_with_M[fasta_name] | |
| 434 OUT6.write("%s\n" %fasta_name) | |
| 435 OUT6.write("%s\n" %seq) | |
| 436 OUT6.close() | |
| 437 | |
| 438 #os.system("rm -rf %s" %file) | |
| 439 | |
| 440 ## Print | |
| 441 print "*************** CDS detection ***************" | |
| 442 print "\nFiles processed: %d" %count_file_processed | |
| 443 print "\tFiles with CDS: %d" %count_file_with_CDS | |
| 444 print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M | |
| 445 print "\tFiles without CDS: %d \n" %count_file_without_CDS | |
| 446 print "" |
