Mercurial > repos > abims-sbr > cds_search
diff scripts/S01_find_orf_on_multiple_alignment.py @ 2:0d2f72caea10 draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 44a89d5eeb82789bfc643b33c11f391281b6374b
| author | abims-sbr |
|---|---|
| date | Wed, 27 Sep 2017 10:03:05 -0400 |
| parents | 567d5b771a90 |
| children | ff98ed7849fa |
line wrap: on
line diff
--- a/scripts/S01_find_orf_on_multiple_alignment.py Thu Apr 13 09:47:57 2017 -0400 +++ b/scripts/S01_find_orf_on_multiple_alignment.py Wed Sep 27 10:03:05 2017 -0400 @@ -59,19 +59,19 @@ ###################################################################################################################### -##### DEF 3 : Test if the sequence is a multiple of 3, and if not correct the sequence to become a multiple of 3 ##### +##### DEF 3 : Test if the sequence is a multiple of 3, and if not correct the sequence to become a multiple of 3 ##### ###################################################################################################################### ### WEAKNESS OF THAT APPROACH = I remove extra base(s) at the end of the sequence ==> I can lost a codon, when I test ORF (as I will decay the ORF) def multiple3(seq): leng = len(seq) modulo = leng%3 if modulo == 0: # the results of dividing leng per 3 is an integer - new_seq = seq + new_seq = seq elif modulo == 1: # means 1 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) new_seq = seq[:-1] # remove the last nc elif modulo == 2: # means 2 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) new_seq = seq[:-2] # remove the 2 last nc - len1 = len(new_seq) + len1 = len(new_seq) return(new_seq, modulo) ########################################################## @@ -96,13 +96,13 @@ base2 = string.capitalize(base2) base3 = seq_dna[i+2] base3 = string.capitalize(base3) - + codon = base1+base2+base3 codon = string.replace(codon, "T", "U") if codon in bash_codeUniversel.keys(): aa = bash_codeUniversel[codon] - seq_aa = seq_aa + aa + seq_aa = seq_aa + aa else: seq_aa = seq_aa +"?" ### Take account for gap "-" and "N" i = i + 3 @@ -113,8 +113,8 @@ ###### DEF 4 - Part 2 - ###### ############################## -def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel): - +def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel): + ## 1 ## Get the list of aligned aa seq for the 3 ORF: bash_of_aligned_aa_seq_3ORF = {} bash_of_aligned_nuc_seq_3ORF = {} @@ -125,7 +125,7 @@ ## 1.2. ## Check whether the sequence is multiple of 3, and correct it if not: new_sequence_nc, modulo = multiple3(sequence_nc) ### DEF 3 ### - + ## 1.3. ## Get the 3 ORFs (nuc) for each sequence seq_nuc_ORF1 = new_sequence_nc seq_nuc_ORF2 = new_sequence_nc[1:-2] @@ -134,7 +134,7 @@ seq_nuc_ORF4=seq_reversed seq_nuc_ORF5=seq_reversed[1:-2] seq_nuc_ORF6=seq_reversed[2:-1] - + LIST_6_ORF_nuc = [seq_nuc_ORF1, seq_nuc_ORF2, seq_nuc_ORF3,seq_nuc_ORF4,seq_nuc_ORF5,seq_nuc_ORF6] bash_of_aligned_nuc_seq_3ORF[fasta_name] = LIST_6_ORF_nuc ### For each seq of the multialignment => give the 6 ORFs (in nuc) @@ -142,30 +142,30 @@ seq_prot_ORF1 = simply_get_ORF(seq_nuc_ORF1,bash_codeUniversel) ### DEF 4 - Part 1 - ## seq_prot_ORF2 = simply_get_ORF(seq_nuc_ORF2,bash_codeUniversel) ### DEF 4 - Part 1 - ## seq_prot_ORF3 = simply_get_ORF(seq_nuc_ORF3,bash_codeUniversel) ### DEF 4 - Part 1 - ## - seq_prot_ORF4 = simply_get_ORF(seq_nuc_ORF4,bash_codeUniversel) ### DEF 4 - Part 1 - ## + seq_prot_ORF4 = simply_get_ORF(seq_nuc_ORF4,bash_codeUniversel) ### DEF 4 - Part 1 - ## seq_prot_ORF5 = simply_get_ORF(seq_nuc_ORF5,bash_codeUniversel) ### DEF 4 - Part 1 - ## seq_prot_ORF6 = simply_get_ORF(seq_nuc_ORF6,bash_codeUniversel) ### DEF 4 - Part 1 - ## LIST_6_ORF_aa = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3,seq_prot_ORF4,seq_prot_ORF5,seq_prot_ORF6] - bash_of_aligned_aa_seq_3ORF[fasta_name] = LIST_6_ORF_aa ### For each seq of the multialignment => give the 6 ORFs (in aa) + bash_of_aligned_aa_seq_3ORF[fasta_name] = LIST_6_ORF_aa ### For each seq of the multialignment => give the 6 ORFs (in aa) ## 2 ## Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) BEST_MAX = 0 - for i in [0,1,2,3,4,5]: ### Test the 6 ORFs + for i in [0,1,2,3,4,5]: ### Test the 6 ORFs ORF_Aligned_aa = [] ORF_Aligned_nuc = [] - + ## 2.1 ## Get the alignment of sequence for a given ORF ## Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd - for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): + for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i] aa_length = len(ORFsequence) ORF_Aligned_aa.append(ORFsequence) ### List of all sequences in the ORF nb "i" = n = i+1 - - for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys(): + + for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys(): ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i] nuc_length = len(ORFsequence) ORF_Aligned_nuc.append(ORFsequence) ### List of all sequences in the ORF nb "i" = @@ -175,11 +175,11 @@ ## Next step is to get the longuest subsequence whithout stop ## We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*" MAX_LENGTH = 0 - LONGUEST_SEGMENT_UNSTOPPED = "" - j = 0 # Start from first position in alignment + LONGUEST_SEGMENT_UNSTOPPED = "" + j = 0 # Start from first position in alignment List_of_List_subsequences = [] List_positions_subsequence = [] - while j < aa_length: + while j < aa_length: column = [] for seq in ORF_Aligned_aa: column.append(seq[j]) @@ -189,7 +189,7 @@ List_positions_subsequence = [] ## Re-initialyse list of positions else: List_positions_subsequence.append(j) - + ## 2.3 ## Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF) LONGUEST_SUBSEQUENCE_LIST_POSITION = [] MAX=0 @@ -197,7 +197,7 @@ if len(sublist) > MAX and len(sublist) > MINIMAL_CDS_LENGTH: MAX = len(sublist) LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist - + ## 2.4. ## Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated) if LONGUEST_SUBSEQUENCE_LIST_POSITION != []: if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0: @@ -206,7 +206,7 @@ CDS_maybe_truncated = 0 else: CDS_maybe_truncated = 0 - + ## 2.5 ## Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF) ## Test whether it is the better ORF @@ -239,7 +239,7 @@ pos_MAX_nuc = pos_MAX_aa * 3 BESTORF_bash_aligned_nc_seq = {} - BESTORF_bash_aligned_nc_seq_CODING = {} + BESTORF_bash_aligned_nc_seq_CODING = {} for fasta_name in bash_aligned_nc_seq.keys(): seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF] seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc] @@ -259,7 +259,7 @@ BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} - + Ortho = 0 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] @@ -271,14 +271,14 @@ BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING ## CASE 2: in case the CDS is truncated, so the "M" is maybe missing: - if Ortho == 0 and CDS_maybe_truncated == 1: + if Ortho == 0 and CDS_maybe_truncated == 1: BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING ## CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa): ## => the 2 bash "CDS_with_M" are left empty ("{}") - - return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M) + + return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M) ########################################################## @@ -297,7 +297,7 @@ ############################################################ -###### DEF 6 : Detect if methionin in the aa sequence ###### +###### DEF 6 : Detect if methionin in the aa sequence ###### ############################################################ def detect_Methionine(seq_aa, Ortho): @@ -305,13 +305,13 @@ nbre = sys.argv[2] CUTOFF_Last_50aa = ln - MINIMAL_CDS_LENGTH #Ortho = 0 ## means orthologs not found - + ## Find all indices of occurances of "M" in a string of aa list_indices = allindices(seq_aa, "M") ### DEF5 ### - + ## If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS if list_indices != []: - first_M = list_indices[0] + first_M = list_indices[0] if first_M < CUTOFF_Last_50aa: Ortho = 1 ## means orthologs found @@ -324,7 +324,7 @@ ############################################################ -###### DEF 7 : Reverse complement DNA sequence ###### +###### DEF 7 : Reverse complement DNA sequence ###### ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html ############################################################ @@ -344,16 +344,16 @@ ####################### import string, os, time, re, zipfile, sys +infiles = sys.argv[1] MINIMAL_CDS_LENGTH = int(sys.argv[3]) ## in aa number ## INPUT / OUTPUT -list_file = [] -zfile = zipfile.ZipFile(sys.argv[1]) -for name in zfile.namelist() : - list_file.append(name) - zfile.extract(name, "./") - +list_file = str.split(infiles,",") + +### Get Universal Code F2 = open(sys.argv[2], 'r') +bash_codeUniversel = code_universel(F2) ### DEF2 ### +F2.close() os.mkdir("04_BEST_ORF_nuc") Path_OUT1 = "04_BEST_ORF_nuc" @@ -371,9 +371,7 @@ Path_OUT6 = "06_CDS_with_M_aa" -### Get Universal Code -bash_codeUniversel = code_universel(F2) ### DEF2 ### -F2.close() + ### Get the Bash corresponding to an alignment file in fasta format count_file_processed = 0 @@ -382,10 +380,10 @@ count_file_with_CDS_plus_M = 0 for file in list_file: - count_file_processed = count_file_processed + 1 + count_file_processed = count_file_processed + 1 fasta_file_path = "./%s" %file bash_fasta = dico(fasta_file_path) ### DEF 1 ### - BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### + BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### ## a ## OUTPUT BESTORF_nuc if BESTORF_nuc != {}: @@ -398,8 +396,8 @@ OUT1.close() else: count_file_without_CDS = count_file_without_CDS + 1 - - + + ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!! if BESTORF_aa != {}: OUT2 = open("%s/%s" %(Path_OUT2,file), "w") @@ -407,7 +405,7 @@ seq = BESTORF_aa[fasta_name] OUT2.write("%s\n" %fasta_name) OUT2.write("%s\n" %seq) - OUT2.close() + OUT2.close() ## c ## OUTPUT BESTORF_aa if BESTORF_nuc_CODING != {}: @@ -425,7 +423,7 @@ seq = BESTORF_aa_CODING[fasta_name] OUT4.write("%s\n" %fasta_name) OUT4.write("%s\n" %seq) - OUT4.close() + OUT4.close() ## e ## OUTPUT BESTORF_nuc_CDS_with_M if BESTORF_nuc_CDS_with_M != {}: @@ -435,7 +433,7 @@ seq = BESTORF_nuc_CDS_with_M[fasta_name] OUT5.write("%s\n" %fasta_name) OUT5.write("%s\n" %seq) - OUT5.close() + OUT5.close() ## f ## OUTPUT BESTORF_aa_CDS_with_M if BESTORF_aa_CDS_with_M != {}: @@ -446,50 +444,12 @@ OUT6.write("%s\n" %seq) OUT6.close() - os.system("rm -rf %s" %file) + os.system("rm -rf %s" %file) -## Print +## Print print "*************** CDS detection ***************" print "\nFiles processed: %d" %count_file_processed print "\tFiles with CDS: %d" %count_file_with_CDS print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M print "\tFiles without CDS: %d \n" %count_file_without_CDS print "" - -## Zipfile -f_bestORF_nuc = zipfile.ZipFile("ORF_Search_bestORF_nuc.zip", "w") -f_bestORF_aa = zipfile.ZipFile("ORF_Search_bestORF_aa.zip", "w") -f_CDS_nuc = zipfile.ZipFile("ORF_Search_CDS_nuc.zip", "w") -f_CDS_aa = zipfile.ZipFile("ORF_Search_CDS_aa.zip", "w") -f_CDSM_nuc = zipfile.ZipFile("ORF_Search_CDSM_nuc.zip", "w") -f_CDSM_aa = zipfile.ZipFile("ORF_Search_CDSM_aa.zip", "w") - -os.chdir("%s" %Path_OUT1) -folder = os.listdir("./") -for i in folder : - f_bestORF_nuc.write("./%s" %i) - -os.chdir("../%s" %Path_OUT2) -folder = os.listdir("./") -for i in folder : - f_bestORF_aa.write("./%s" %i) - -os.chdir("../%s" %Path_OUT3) -folder = os.listdir("./") -for i in folder : - f_CDS_nuc.write("./%s" %i) - -os.chdir("../%s" %Path_OUT4) -folder = os.listdir("./") -for i in folder : - f_CDS_aa.write("./%s" %i) - -os.chdir("../%s" %Path_OUT5) -folder = os.listdir("./") -for i in folder : - f_CDSM_nuc.write("./%s" %i) - -os.chdir("../%s" %Path_OUT6) -folder = os.listdir("./") -for i in folder : - f_CDSM_aa.write("./%s" %i)
