Mercurial > repos > abims-sbr > cds_search
diff scripts/S01_find_orf_on_multiple_alignment.py @ 1:567d5b771a90 draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit ab76075e541dd7ece1090f6b55ca508ec0fde39d
| author | lecorguille |
|---|---|
| date | Thu, 13 Apr 2017 09:47:57 -0400 |
| parents | |
| children | 0d2f72caea10 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/S01_find_orf_on_multiple_alignment.py Thu Apr 13 09:47:57 2017 -0400 @@ -0,0 +1,495 @@ +#!/usr/bin/python +## Author: Eric Fontanillas +## Last modification: 03/09/14 by Julie BAFFARD + +## Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria + ## CRITERIA 1 ## Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF + ## CRITERIA 2 ## This longest part should be > 150nc or 50aa + ## CRITERIA 3 ## [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa + ## OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA + ## OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA + + +############################### +##### DEF 1 : Dico fasta ##### +############################### +def dico(fasta_file_path): + F2 = open(fasta_file_path, "r") + dicoco = {} + while 1: + next2 = F2.readline() + if not next2: + break + if next2[0] == ">": + fasta_name_query = next2[:-1] + Sn = string.split(fasta_name_query, "||") + fasta_name_query = Sn[0] + next3 = F2.readline() + fasta_seq_query = next3[:-1] + dicoco[fasta_name_query]=fasta_seq_query + F2.close() + return(dicoco) +############################################################ + + +#################################################### +###### DEF 2 : Create bash for genetic code ######## +#################################################### +### KEY = codon +### VALUE = Amino Acid + +def code_universel(F1): + bash_codeUniversel = {} + while 1: + next = F1.readline() + if not next: break + L1 = string.split(next, " ") + length1 = len(L1) + if length1 == 3: + key = L1[0] + value = L1[2][:-1] + bash_codeUniversel[key] = value + else: + key = L1[0] + value = L1[2] + bash_codeUniversel[key] = value + F1.close() + return(bash_codeUniversel) +########################################################### + + +###################################################################################################################### +##### DEF 3 : Test if the sequence is a multiple of 3, and if not correct the sequence to become a multiple of 3 ##### +###################################################################################################################### +### WEAKNESS OF THAT APPROACH = I remove extra base(s) at the end of the sequence ==> I can lost a codon, when I test ORF (as I will decay the ORF) +def multiple3(seq): + leng = len(seq) + modulo = leng%3 + if modulo == 0: # the results of dividing leng per 3 is an integer + new_seq = seq + elif modulo == 1: # means 1 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) + new_seq = seq[:-1] # remove the last nc + elif modulo == 2: # means 2 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) + new_seq = seq[:-2] # remove the 2 last nc + len1 = len(new_seq) + return(new_seq, modulo) +########################################################## + + +############################# +###### DEF 4 : GET ORF ###### +############################# +##- MULTIPLE SEQUENCE BASED : Based on ALIGNMENT of several sequences +##- CRITERIA1: Get the segment in the alignment with no codon stop + + +###### DEF 4 - Part 1 - ###### +############################## +def simply_get_ORF(seq_dna, bash_codeUniversel): + seq_aa = "" + i = 0 + len1 = len(seq_dna) + while i < len1: + base1 = seq_dna[i] + base1 = string.capitalize(base1) + base2 = seq_dna[i+1] + base2 = string.capitalize(base2) + base3 = seq_dna[i+2] + base3 = string.capitalize(base3) + + codon = base1+base2+base3 + codon = string.replace(codon, "T", "U") + + if codon in bash_codeUniversel.keys(): + aa = bash_codeUniversel[codon] + seq_aa = seq_aa + aa + else: + seq_aa = seq_aa +"?" ### Take account for gap "-" and "N" + i = i + 3 + + return(seq_aa) +########################################################## + + +###### DEF 4 - Part 2 - ###### +############################## +def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel): + + ## 1 ## Get the list of aligned aa seq for the 3 ORF: + bash_of_aligned_aa_seq_3ORF = {} + bash_of_aligned_nuc_seq_3ORF = {} + BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = [] + for fasta_name in bash_aligned_nc_seq.keys(): + ## 1.1. ## Get the raw sequence + sequence_nc = bash_aligned_nc_seq[fasta_name] + + ## 1.2. ## Check whether the sequence is multiple of 3, and correct it if not: + new_sequence_nc, modulo = multiple3(sequence_nc) ### DEF 3 ### + + ## 1.3. ## Get the 3 ORFs (nuc) for each sequence + seq_nuc_ORF1 = new_sequence_nc + seq_nuc_ORF2 = new_sequence_nc[1:-2] + seq_nuc_ORF3 = new_sequence_nc[2:-1] + seq_reversed=ReverseComplement2(seq_nuc_ORF1) + seq_nuc_ORF4=seq_reversed + seq_nuc_ORF5=seq_reversed[1:-2] + seq_nuc_ORF6=seq_reversed[2:-1] + + LIST_6_ORF_nuc = [seq_nuc_ORF1, seq_nuc_ORF2, seq_nuc_ORF3,seq_nuc_ORF4,seq_nuc_ORF5,seq_nuc_ORF6] + bash_of_aligned_nuc_seq_3ORF[fasta_name] = LIST_6_ORF_nuc ### For each seq of the multialignment => give the 6 ORFs (in nuc) + + ## 1.4. ## Get the 3 ORFs (aa) for each sequence + seq_prot_ORF1 = simply_get_ORF(seq_nuc_ORF1,bash_codeUniversel) ### DEF 4 - Part 1 - ## + seq_prot_ORF2 = simply_get_ORF(seq_nuc_ORF2,bash_codeUniversel) ### DEF 4 - Part 1 - ## + seq_prot_ORF3 = simply_get_ORF(seq_nuc_ORF3,bash_codeUniversel) ### DEF 4 - Part 1 - ## + seq_prot_ORF4 = simply_get_ORF(seq_nuc_ORF4,bash_codeUniversel) ### DEF 4 - Part 1 - ## + seq_prot_ORF5 = simply_get_ORF(seq_nuc_ORF5,bash_codeUniversel) ### DEF 4 - Part 1 - ## + seq_prot_ORF6 = simply_get_ORF(seq_nuc_ORF6,bash_codeUniversel) ### DEF 4 - Part 1 - ## + + LIST_6_ORF_aa = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3,seq_prot_ORF4,seq_prot_ORF5,seq_prot_ORF6] + bash_of_aligned_aa_seq_3ORF[fasta_name] = LIST_6_ORF_aa ### For each seq of the multialignment => give the 6 ORFs (in aa) + + ## 2 ## Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) + BEST_MAX = 0 + for i in [0,1,2,3,4,5]: ### Test the 6 ORFs + ORF_Aligned_aa = [] + ORF_Aligned_nuc = [] + + + ## 2.1 ## Get the alignment of sequence for a given ORF + ## Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd + for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): + ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i] + aa_length = len(ORFsequence) + ORF_Aligned_aa.append(ORFsequence) ### List of all sequences in the ORF nb "i" = + + n = i+1 + + for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys(): + ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i] + nuc_length = len(ORFsequence) + ORF_Aligned_nuc.append(ORFsequence) ### List of all sequences in the ORF nb "i" = + + ## 2.2 ## Get the list of sublist of positions whithout codon stop in the alignment + ## For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF) + ## Next step is to get the longuest subsequence whithout stop + ## We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*" + MAX_LENGTH = 0 + LONGUEST_SEGMENT_UNSTOPPED = "" + j = 0 # Start from first position in alignment + List_of_List_subsequences = [] + List_positions_subsequence = [] + while j < aa_length: + column = [] + for seq in ORF_Aligned_aa: + column.append(seq[j]) + j = j+1 + if "*" in column: + List_of_List_subsequences.append(List_positions_subsequence) ## Add previous list of positions + List_positions_subsequence = [] ## Re-initialyse list of positions + else: + List_positions_subsequence.append(j) + + ## 2.3 ## Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF) + LONGUEST_SUBSEQUENCE_LIST_POSITION = [] + MAX=0 + for sublist in List_of_List_subsequences: + if len(sublist) > MAX and len(sublist) > MINIMAL_CDS_LENGTH: + MAX = len(sublist) + LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist + + ## 2.4. ## Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated) + if LONGUEST_SUBSEQUENCE_LIST_POSITION != []: + if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0: + CDS_maybe_truncated = 1 + else: + CDS_maybe_truncated = 0 + else: + CDS_maybe_truncated = 0 + + + ## 2.5 ## Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF) + ## Test whether it is the better ORF + if MAX > BEST_MAX: + BEST_MAX = MAX + BEST_ORF = i+1 + BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = LONGUEST_SUBSEQUENCE_LIST_POSITION + + + ## 3 ## ONCE we have this better segment (BEST CODING SEGMENT) + ## ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position) + ## And get the INDEX of the best ORF [0, 1, or 2] + if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []: + pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0] + pos_MIN_aa = pos_MIN_aa - 1 + pos_MAX_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[-1] + + + BESTORF_bash_of_aligned_aa_seq = {} + BESTORF_bash_of_aligned_aa_seq_CODING = {} + for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): + index_BEST_ORF = BEST_ORF-1 ### cause list going from 0 to 2 in LIST_3_ORF, while the ORF nb is indexed from 1 to 3 + seq = bash_of_aligned_aa_seq_3ORF[fasta_name][index_BEST_ORF] + seq_coding = seq[pos_MIN_aa:pos_MAX_aa] + BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq + BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding + + ## 4 ## Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment + pos_MIN_nuc = pos_MIN_aa * 3 + pos_MAX_nuc = pos_MAX_aa * 3 + + BESTORF_bash_aligned_nc_seq = {} + BESTORF_bash_aligned_nc_seq_CODING = {} + for fasta_name in bash_aligned_nc_seq.keys(): + seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF] + seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc] + BESTORF_bash_aligned_nc_seq[fasta_name] = seq + BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding + + else: ### no CDS found ### + BESTORF_bash_aligned_nc_seq = {} + BESTORF_bash_aligned_nc_seq_CODING = {} + BESTORF_bash_of_aligned_aa_seq = {} + BESTORF_bash_of_aligned_aa_seq_CODING ={} + + + + ### Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa + ########################################################################################################### + + BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} + BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} + + Ortho = 0 + for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): + seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] + Ortho = detect_Methionine(seq_aa, Ortho) ### DEF6 ### + + ## CASE 1: A "M" is present and correctly localized (not in last 50 aa) + if Ortho == 1: + BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING + BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING + + ## CASE 2: in case the CDS is truncated, so the "M" is maybe missing: + if Ortho == 0 and CDS_maybe_truncated == 1: + BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING + BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING + + ## CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa): + ## => the 2 bash "CDS_with_M" are left empty ("{}") + + return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M) +########################################################## + + +################################################################################################## +###### DEF 5 : Detect all indices corresponding to all occurance of a substring in a string ###### +################################################################################################## +def allindices(string, sub): + listindex=[] + offset=0 + i = string.find(sub, offset) + while i >= 0: + listindex.append(i) + i = string.find(sub, i + 1) + return listindex +###################################################### + + +############################################################ +###### DEF 6 : Detect if methionin in the aa sequence ###### +############################################################ +def detect_Methionine(seq_aa, Ortho): + + ln = len(seq_aa) + nbre = sys.argv[2] + CUTOFF_Last_50aa = ln - MINIMAL_CDS_LENGTH + #Ortho = 0 ## means orthologs not found + + ## Find all indices of occurances of "M" in a string of aa + list_indices = allindices(seq_aa, "M") ### DEF5 ### + + ## If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS + if list_indices != []: + first_M = list_indices[0] + if first_M < CUTOFF_Last_50aa: + Ortho = 1 ## means orthologs found + + return(Ortho) +################################### + + + + + + +############################################################ +###### DEF 7 : Reverse complement DNA sequence ###### +###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html +############################################################ + + +def ReverseComplement2(seq): + # too lazy to construct the dictionary manually, use a dict comprehension + seq1 = 'ATCG-TAGC-atcg-tagc-' + seq_dict = { seq1[i]:seq1[i+5] for i in range(20) if i < 5 or 10<=i<15 } + return "".join([seq_dict[base] for base in reversed(seq)]) + +################################### + + + +####################### +##### RUN RUN RUN ##### +####################### +import string, os, time, re, zipfile, sys + +MINIMAL_CDS_LENGTH = int(sys.argv[3]) ## in aa number + +## INPUT / OUTPUT +list_file = [] +zfile = zipfile.ZipFile(sys.argv[1]) +for name in zfile.namelist() : + list_file.append(name) + zfile.extract(name, "./") + +F2 = open(sys.argv[2], 'r') + +os.mkdir("04_BEST_ORF_nuc") +Path_OUT1 = "04_BEST_ORF_nuc" +os.mkdir("04_BEST_ORF_aa") +Path_OUT2 = "04_BEST_ORF_aa" + +os.mkdir("05_CDS_nuc") +Path_OUT3 = "05_CDS_nuc" +os.mkdir("05_CDS_aa") +Path_OUT4 = "05_CDS_aa" + +os.mkdir("06_CDS_with_M_nuc") +Path_OUT5 = "06_CDS_with_M_nuc" +os.mkdir("06_CDS_with_M_aa") +Path_OUT6 = "06_CDS_with_M_aa" + + +### Get Universal Code +bash_codeUniversel = code_universel(F2) ### DEF2 ### +F2.close() + +### Get the Bash corresponding to an alignment file in fasta format +count_file_processed = 0 +count_file_with_CDS = 0 +count_file_without_CDS = 0 +count_file_with_CDS_plus_M = 0 + +for file in list_file: + count_file_processed = count_file_processed + 1 + fasta_file_path = "./%s" %file + bash_fasta = dico(fasta_file_path) ### DEF 1 ### + BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### + + ## a ## OUTPUT BESTORF_nuc + if BESTORF_nuc != {}: + count_file_with_CDS = count_file_with_CDS +1 + OUT1 = open("%s/%s" %(Path_OUT1,file), "w") + for fasta_name in BESTORF_nuc.keys(): + seq = BESTORF_nuc[fasta_name] + OUT1.write("%s\n" %fasta_name) + OUT1.write("%s\n" %seq) + OUT1.close() + else: + count_file_without_CDS = count_file_without_CDS + 1 + + + ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!! + if BESTORF_aa != {}: + OUT2 = open("%s/%s" %(Path_OUT2,file), "w") + for fasta_name in BESTORF_aa.keys(): + seq = BESTORF_aa[fasta_name] + OUT2.write("%s\n" %fasta_name) + OUT2.write("%s\n" %seq) + OUT2.close() + + ## c ## OUTPUT BESTORF_aa + if BESTORF_nuc_CODING != {}: + OUT3 = open("%s/%s" %(Path_OUT3,file), "w") + for fasta_name in BESTORF_nuc_CODING.keys(): + seq = BESTORF_nuc_CODING[fasta_name] + OUT3.write("%s\n" %fasta_name) + OUT3.write("%s\n" %seq) + OUT3.close() + + ## d ## OUTPUT BESTORF_aa_CODING + if BESTORF_aa_CODING != {}: + OUT4 = open("%s/%s" %(Path_OUT4,file), "w") + for fasta_name in BESTORF_aa_CODING.keys(): + seq = BESTORF_aa_CODING[fasta_name] + OUT4.write("%s\n" %fasta_name) + OUT4.write("%s\n" %seq) + OUT4.close() + + ## e ## OUTPUT BESTORF_nuc_CDS_with_M + if BESTORF_nuc_CDS_with_M != {}: + count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1 + OUT5 = open("%s/%s" %(Path_OUT5,file), "w") + for fasta_name in BESTORF_nuc_CDS_with_M.keys(): + seq = BESTORF_nuc_CDS_with_M[fasta_name] + OUT5.write("%s\n" %fasta_name) + OUT5.write("%s\n" %seq) + OUT5.close() + + ## f ## OUTPUT BESTORF_aa_CDS_with_M + if BESTORF_aa_CDS_with_M != {}: + OUT6 = open("%s/%s" %(Path_OUT6,file), "w") + for fasta_name in BESTORF_aa_CDS_with_M.keys(): + seq = BESTORF_aa_CDS_with_M[fasta_name] + OUT6.write("%s\n" %fasta_name) + OUT6.write("%s\n" %seq) + OUT6.close() + + os.system("rm -rf %s" %file) + +## Print +print "*************** CDS detection ***************" +print "\nFiles processed: %d" %count_file_processed +print "\tFiles with CDS: %d" %count_file_with_CDS +print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M +print "\tFiles without CDS: %d \n" %count_file_without_CDS +print "" + +## Zipfile +f_bestORF_nuc = zipfile.ZipFile("ORF_Search_bestORF_nuc.zip", "w") +f_bestORF_aa = zipfile.ZipFile("ORF_Search_bestORF_aa.zip", "w") +f_CDS_nuc = zipfile.ZipFile("ORF_Search_CDS_nuc.zip", "w") +f_CDS_aa = zipfile.ZipFile("ORF_Search_CDS_aa.zip", "w") +f_CDSM_nuc = zipfile.ZipFile("ORF_Search_CDSM_nuc.zip", "w") +f_CDSM_aa = zipfile.ZipFile("ORF_Search_CDSM_aa.zip", "w") + +os.chdir("%s" %Path_OUT1) +folder = os.listdir("./") +for i in folder : + f_bestORF_nuc.write("./%s" %i) + +os.chdir("../%s" %Path_OUT2) +folder = os.listdir("./") +for i in folder : + f_bestORF_aa.write("./%s" %i) + +os.chdir("../%s" %Path_OUT3) +folder = os.listdir("./") +for i in folder : + f_CDS_nuc.write("./%s" %i) + +os.chdir("../%s" %Path_OUT4) +folder = os.listdir("./") +for i in folder : + f_CDS_aa.write("./%s" %i) + +os.chdir("../%s" %Path_OUT5) +folder = os.listdir("./") +for i in folder : + f_CDSM_nuc.write("./%s" %i) + +os.chdir("../%s" %Path_OUT6) +folder = os.listdir("./") +for i in folder : + f_CDSM_aa.write("./%s" %i)
