Mercurial > repos > abims-sbr > cds_search
view scripts/S01_find_orf_on_multiple_alignment.py @ 10:3d00be2d05f3 draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 7d45b8e07bf99420df9b195617515b8002b5eaf3
| author | abims-sbr |
|---|---|
| date | Fri, 06 Jul 2018 02:52:15 -0400 |
| parents | 640ef4c06ed5 |
| children | 06a28df198b6 |
line wrap: on
line source
#!/usr/bin/python # coding: utf8 ## Author: Eric Fontanillas ## Modification: 03/09/14 by Julie BAFFARD ## Last modification : 05/03/18 by Victor Mataigne ## Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria ## CRITERIA 1 ## Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF ## CRITERIA 2 ## This longest part should be > 150nc or 50aa ## CRITERIA 3 ## [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa ## OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA ## OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA #################################################### ###### DEF 2 : Create bash for genetic code ######## #################################################### ### KEY = codon ### VALUE = Amino Acid def code_universel(F1): bash_codeUniversel = {} with open(F1, "r") as file: for line in file.readlines(): L1 = string.split(line, " ") length1 = len(L1) if length1 == 3: key = L1[0] value = L1[2][:-1] bash_codeUniversel[key] = value else: key = L1[0] value = L1[2] bash_codeUniversel[key] = value return(bash_codeUniversel) ########################################################### ###################################################################################################################### ##### DEF 3 : Test if the sequence is a multiple of 3, and if not correct the sequence to become a multiple of 3 ##### ###################################################################################################################### ### WEAKNESS OF THAT APPROACH = I remove extra base(s) at the end of the sequence ==> I can lost a codon, when I test ORF (as I will decay the ORF) def multiple3(seq): leng = len(seq) modulo = leng%3 if modulo == 0: # the results of dividing leng per 3 is an integer new_seq = seq elif modulo == 1: # means 1 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) new_seq = seq[:-1] # remove the last nc elif modulo == 2: # means 2 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) new_seq = seq[:-2] # remove the 2 last nc len1 = len(new_seq) return(new_seq, modulo) ########################################################## ############################# ###### DEF 4 : GET ORF ###### ############################# ##- MULTIPLE SEQUENCE BASED : Based on ALIGNMENT of several sequences ##- CRITERIA1: Get the segment in the alignment with no codon stop ###### DEF 4 - Part 1 - ###### ############################## def simply_get_ORF(seq_dna, bash_codeUniversel): seq_aa = "" i = 0 len1 = len(seq_dna) while i < len1: base1 = seq_dna[i] base1 = string.capitalize(base1) base2 = seq_dna[i+1] base2 = string.capitalize(base2) base3 = seq_dna[i+2] base3 = string.capitalize(base3) codon = base1+base2+base3 codon = string.replace(codon, "T", "U") if codon in bash_codeUniversel.keys(): aa = bash_codeUniversel[codon] seq_aa = seq_aa + aa else: seq_aa = seq_aa +"?" ### Take account for gap "-" and "N" i = i + 3 return(seq_aa) ########################################################## ###### DEF 4 - Part 2 - ###### ############################## def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel): ## 1 ## Get the list of aligned aa seq for the 3 ORF: bash_of_aligned_aa_seq_3ORF = {} bash_of_aligned_nuc_seq_3ORF = {} BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = [] for fasta_name in bash_aligned_nc_seq.keys(): ## 1.1. ## Get the raw sequence sequence_nc = bash_aligned_nc_seq[fasta_name] ## 1.2. ## Check whether the sequence is multiple of 3, and correct it if not: new_sequence_nc, modulo = multiple3(sequence_nc) ### DEF 3 ### ## 1.3. ## Get the 3 ORFs (nuc) for each sequence seq_nuc_ORF1 = new_sequence_nc seq_nuc_ORF2 = new_sequence_nc[1:-2] seq_nuc_ORF3 = new_sequence_nc[2:-1] seq_reversed=ReverseComplement2(seq_nuc_ORF1) seq_nuc_ORF4=seq_reversed seq_nuc_ORF5=seq_reversed[1:-2] seq_nuc_ORF6=seq_reversed[2:-1] LIST_6_ORF_nuc = [seq_nuc_ORF1, seq_nuc_ORF2, seq_nuc_ORF3,seq_nuc_ORF4,seq_nuc_ORF5,seq_nuc_ORF6] bash_of_aligned_nuc_seq_3ORF[fasta_name] = LIST_6_ORF_nuc ### For each seq of the multialignment => give the 6 ORFs (in nuc) ## 1.4. ## Get the 3 ORFs (aa) for each sequence seq_prot_ORF1 = simply_get_ORF(seq_nuc_ORF1,bash_codeUniversel) ### DEF 4 - Part 1 - ## seq_prot_ORF2 = simply_get_ORF(seq_nuc_ORF2,bash_codeUniversel) ### DEF 4 - Part 1 - ## seq_prot_ORF3 = simply_get_ORF(seq_nuc_ORF3,bash_codeUniversel) ### DEF 4 - Part 1 - ## seq_prot_ORF4 = simply_get_ORF(seq_nuc_ORF4,bash_codeUniversel) ### DEF 4 - Part 1 - ## seq_prot_ORF5 = simply_get_ORF(seq_nuc_ORF5,bash_codeUniversel) ### DEF 4 - Part 1 - ## seq_prot_ORF6 = simply_get_ORF(seq_nuc_ORF6,bash_codeUniversel) ### DEF 4 - Part 1 - ## LIST_6_ORF_aa = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3,seq_prot_ORF4,seq_prot_ORF5,seq_prot_ORF6] bash_of_aligned_aa_seq_3ORF[fasta_name] = LIST_6_ORF_aa ### For each seq of the multialignment => give the 6 ORFs (in aa) ## 2 ## Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) BEST_MAX = 0 for i in [0,1,2,3,4,5]: ### Test the 6 ORFs ORF_Aligned_aa = [] ORF_Aligned_nuc = [] ## 2.1 ## Get the alignment of sequence for a given ORF ## Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i] aa_length = len(ORFsequence) ORF_Aligned_aa.append(ORFsequence) ### List of all sequences in the ORF nb "i" = n = i+1 for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys(): ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i] nuc_length = len(ORFsequence) ORF_Aligned_nuc.append(ORFsequence) ### List of all sequences in the ORF nb "i" = ## 2.2 ## Get the list of sublist of positions whithout codon stop in the alignment ## For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF) ## Next step is to get the longuest subsequence whithout stop ## We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*" MAX_LENGTH = 0 LONGUEST_SEGMENT_UNSTOPPED = "" j = 0 # Start from first position in alignment List_of_List_subsequences = [] List_positions_subsequence = [] while j < aa_length: column = [] for seq in ORF_Aligned_aa: column.append(seq[j]) j = j+1 if "*" in column: List_of_List_subsequences.append(List_positions_subsequence) ## Add previous list of positions List_positions_subsequence = [] ## Re-initialyse list of positions else: List_positions_subsequence.append(j) ## 2.3 ## Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF) LONGUEST_SUBSEQUENCE_LIST_POSITION = [] MAX=0 for sublist in List_of_List_subsequences: if len(sublist) > MAX and len(sublist) > MINIMAL_CDS_LENGTH: MAX = len(sublist) LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist ## 2.4. ## Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated) if LONGUEST_SUBSEQUENCE_LIST_POSITION != []: if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0: CDS_maybe_truncated = 1 else: CDS_maybe_truncated = 0 else: CDS_maybe_truncated = 0 ## 2.5 ## Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF) ## Test whether it is the better ORF if MAX > BEST_MAX: BEST_MAX = MAX BEST_ORF = i+1 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = LONGUEST_SUBSEQUENCE_LIST_POSITION ## 3 ## ONCE we have this better segment (BEST CODING SEGMENT) ## ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position) ## And get the INDEX of the best ORF [0, 1, or 2] if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []: pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0] pos_MIN_aa = pos_MIN_aa - 1 pos_MAX_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[-1] BESTORF_bash_of_aligned_aa_seq = {} BESTORF_bash_of_aligned_aa_seq_CODING = {} for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): index_BEST_ORF = BEST_ORF-1 ### cause list going from 0 to 2 in LIST_3_ORF, while the ORF nb is indexed from 1 to 3 seq = bash_of_aligned_aa_seq_3ORF[fasta_name][index_BEST_ORF] seq_coding = seq[pos_MIN_aa:pos_MAX_aa] BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding ## 4 ## Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment pos_MIN_nuc = pos_MIN_aa * 3 pos_MAX_nuc = pos_MAX_aa * 3 BESTORF_bash_aligned_nc_seq = {} BESTORF_bash_aligned_nc_seq_CODING = {} for fasta_name in bash_aligned_nc_seq.keys(): seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF] seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc] BESTORF_bash_aligned_nc_seq[fasta_name] = seq BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding else: ### no CDS found ### BESTORF_bash_aligned_nc_seq = {} BESTORF_bash_aligned_nc_seq_CODING = {} BESTORF_bash_of_aligned_aa_seq = {} BESTORF_bash_of_aligned_aa_seq_CODING ={} ### Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa ########################################################################################################### BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} Ortho = 0 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] Ortho = detect_Methionine(seq_aa, Ortho) ### DEF6 ### ## CASE 1: A "M" is present and correctly localized (not in last 50 aa) if Ortho == 1: BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING ## CASE 2: in case the CDS is truncated, so the "M" is maybe missing: if Ortho == 0 and CDS_maybe_truncated == 1: BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING ## CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa): ## => the 2 bash "CDS_with_M" are left empty ("{}") return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M) ########################################################## ################################################################################################## ###### DEF 5 : Detect all indices corresponding to all occurance of a substring in a string ###### ################################################################################################## def allindices(string, sub): listindex=[] offset=0 i = string.find(sub, offset) while i >= 0: listindex.append(i) i = string.find(sub, i + 1) return listindex ###################################################### ############################################################ ###### DEF 6 : Detect if methionin in the aa sequence ###### ############################################################ def detect_Methionine(seq_aa, Ortho): ln = len(seq_aa) nbre = sys.argv[2] CUTOFF_Last_50aa = ln - MINIMAL_CDS_LENGTH #Ortho = 0 ## means orthologs not found ## Find all indices of occurances of "M" in a string of aa list_indices = allindices(seq_aa, "M") ### DEF5 ### ## If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS if list_indices != []: first_M = list_indices[0] if first_M < CUTOFF_Last_50aa: Ortho = 1 ## means orthologs found return(Ortho) ################################### ############################################################ ###### DEF 7 : Reverse complement DNA sequence ###### ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html ############################################################ def ReverseComplement2(seq): # too lazy to construct the dictionary manually, use a dict comprehension seq1 = 'ATCGN-TAGCN-atcgn-tagcn-' seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 } return "".join([seq_dict[base] for base in reversed(seq)]) ############################ ####################### ##### RUN RUN RUN ##### ####################### import string, os, time, re, zipfile, sys from dico import dico MINIMAL_CDS_LENGTH = int(sys.argv[2]) ## in aa number ### Get Universal Code bash_codeUniversel = code_universel(sys.argv[1]) ### DEF2 ### ## INPUT from file containing list of species list_files = [] with open(sys.argv[3], 'r') as f: for line in f.readlines(): list_files.append(line.strip('\n')) os.mkdir("04_BEST_ORF_nuc") Path_OUT1 = "04_BEST_ORF_nuc" os.mkdir("04_BEST_ORF_aa") Path_OUT2 = "04_BEST_ORF_aa" os.mkdir("05_CDS_nuc") Path_OUT3 = "05_CDS_nuc" os.mkdir("05_CDS_aa") Path_OUT4 = "05_CDS_aa" os.mkdir("06_CDS_with_M_nuc") Path_OUT5 = "06_CDS_with_M_nuc" os.mkdir("06_CDS_with_M_aa") Path_OUT6 = "06_CDS_with_M_aa" ### Get the Bash corresponding to an alignment file in fasta format count_file_processed = 0 count_file_with_CDS = 0 count_file_without_CDS = 0 count_file_with_CDS_plus_M = 0 # ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name), # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified. name_elems = ["orthogroup", "0", "with", "0", "species.fasta"] # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond #n0 = 0 for file in list_files: #n0 += 1 count_file_processed = count_file_processed + 1 nb_gp = file.split('_')[1] # Keep trace of the orthogroup number fasta_file_path = "./%s" %file bash_fasta = dico(fasta_file_path) ### DEF 1 ### BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### name_elems[1] = nb_gp ## a ## OUTPUT BESTORF_nuc if BESTORF_nuc != {}: name_elems[3] = str(len(BESTORF_nuc.keys())) new_name = "_".join(name_elems) count_file_with_CDS = count_file_with_CDS +1 OUT1 = open("%s/%s" %(Path_OUT1,new_name), "w") for fasta_name in BESTORF_nuc.keys(): seq = BESTORF_nuc[fasta_name] OUT1.write("%s\n" %fasta_name) OUT1.write("%s\n" %seq) OUT1.close() else: count_file_without_CDS = count_file_without_CDS + 1 ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!! if BESTORF_aa != {}: name_elems[3] = str(len(BESTORF_aa.keys())) new_name = "_".join(name_elems) OUT2 = open("%s/%s" %(Path_OUT2,new_name), "w") for fasta_name in BESTORF_aa.keys(): seq = BESTORF_aa[fasta_name] OUT2.write("%s\n" %fasta_name) OUT2.write("%s\n" %seq) OUT2.close() ## c ## OUTPUT BESTORF_aa if BESTORF_nuc_CODING != {}: name_elems[3] = str(len(BESTORF_nuc_CODING.keys())) new_name = "_".join(name_elems) OUT3 = open("%s/%s" %(Path_OUT3,new_name), "w") for fasta_name in BESTORF_nuc_CODING.keys(): seq = BESTORF_nuc_CODING[fasta_name] OUT3.write("%s\n" %fasta_name) OUT3.write("%s\n" %seq) OUT3.close() ## d ## OUTPUT BESTORF_aa_CODING if BESTORF_aa_CODING != {}: name_elems[3] = str(len(BESTORF_aa_CODING.keys())) new_name = "_".join(name_elems) OUT4 = open("%s/%s" %(Path_OUT4,new_name), "w") for fasta_name in BESTORF_aa_CODING.keys(): seq = BESTORF_aa_CODING[fasta_name] OUT4.write("%s\n" %fasta_name) OUT4.write("%s\n" %seq) OUT4.close() ## e ## OUTPUT BESTORF_nuc_CDS_with_M if BESTORF_nuc_CDS_with_M != {}: name_elems[3] = str(len(BESTORF_nuc_CDS_with_M.keys())) new_name = "_".join(name_elems) count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1 OUT5 = open("%s/%s" %(Path_OUT5,new_name), "w") for fasta_name in BESTORF_nuc_CDS_with_M.keys(): seq = BESTORF_nuc_CDS_with_M[fasta_name] OUT5.write("%s\n" %fasta_name) OUT5.write("%s\n" %seq) OUT5.close() ## f ## OUTPUT BESTORF_aa_CDS_with_M if BESTORF_aa_CDS_with_M != {}: name_elems[3] = str(len(BESTORF_aa_CDS_with_M.keys())) new_name = "_".join(name_elems) OUT6 = open("%s/%s" %(Path_OUT6,new_name), "w") for fasta_name in BESTORF_aa_CDS_with_M.keys(): seq = BESTORF_aa_CDS_with_M[fasta_name] OUT6.write("%s\n" %fasta_name) OUT6.write("%s\n" %seq) OUT6.close() #os.system("rm -rf %s" %file) ## Print print "*************** CDS detection ***************" print "\nFiles processed: %d" %count_file_processed print "\tFiles with CDS: %d" %count_file_with_CDS print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M print "\tFiles without CDS: %d \n" %count_file_without_CDS print ""
