Mercurial > repos > abims-sbr > cds_search
diff scripts/S01_find_orf_on_multiple_alignment.py @ 8:716a45028e55 draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 90cfcf697b9f128e81bea1270378e59d63ab0a6f
| author | abims-sbr |
|---|---|
| date | Mon, 12 Mar 2018 06:30:49 -0400 |
| parents | 35e39b4128ba |
| children | 640ef4c06ed5 |
line wrap: on
line diff
--- a/scripts/S01_find_orf_on_multiple_alignment.py Wed Feb 28 10:38:40 2018 -0500 +++ b/scripts/S01_find_orf_on_multiple_alignment.py Mon Mar 12 06:30:49 2018 -0400 @@ -1,6 +1,8 @@ #!/usr/bin/python +# coding: utf8 ## Author: Eric Fontanillas -## Last modification: 03/09/14 by Julie BAFFARD +## Modification: 03/09/14 by Julie BAFFARD +## Last modification : 05/03/18 by Victor Mataigne ## Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria ## CRITERIA 1 ## Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF @@ -294,24 +296,16 @@ ################################### - - - - ############################################################ ###### DEF 7 : Reverse complement DNA sequence ###### ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html ############################################################ - - def ReverseComplement2(seq): # too lazy to construct the dictionary manually, use a dict comprehension seq1 = 'ATCG-TAGC-atcg-tagc-' seq_dict = { seq1[i]:seq1[i+5] for i in range(20) if i < 5 or 10<=i<15 } return "".join([seq_dict[base] for base in reversed(seq)]) - -################################### - +############################ ####################### @@ -344,25 +338,35 @@ os.mkdir("06_CDS_with_M_aa") Path_OUT6 = "06_CDS_with_M_aa" - - - ### Get the Bash corresponding to an alignment file in fasta format count_file_processed = 0 count_file_with_CDS = 0 count_file_without_CDS = 0 count_file_with_CDS_plus_M = 0 +# ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name), +# and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified. + +name_elems = ["orthogroup", "0", "with", "0", "species.fasta"] + +# by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond +n0 = 0 for file in list_file: + n0 += 1 + count_file_processed = count_file_processed + 1 fasta_file_path = "./%s" %file bash_fasta = dico(fasta_file_path) ### DEF 1 ### BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### + + name_elems[1] = str(n0) ## a ## OUTPUT BESTORF_nuc if BESTORF_nuc != {}: + name_elems[3] = str(len(BESTORF_nuc.keys())) + new_name = "_".join(name_elems) count_file_with_CDS = count_file_with_CDS +1 - OUT1 = open("%s/%s" %(Path_OUT1,file), "w") + OUT1 = open("%s/%s" %(Path_OUT1,new_name), "w") for fasta_name in BESTORF_nuc.keys(): seq = BESTORF_nuc[fasta_name] OUT1.write("%s\n" %fasta_name) @@ -371,10 +375,11 @@ else: count_file_without_CDS = count_file_without_CDS + 1 - ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!! if BESTORF_aa != {}: - OUT2 = open("%s/%s" %(Path_OUT2,file), "w") + name_elems[3] = str(len(BESTORF_aa.keys())) + new_name = "_".join(name_elems) + OUT2 = open("%s/%s" %(Path_OUT2,new_name), "w") for fasta_name in BESTORF_aa.keys(): seq = BESTORF_aa[fasta_name] OUT2.write("%s\n" %fasta_name) @@ -383,7 +388,9 @@ ## c ## OUTPUT BESTORF_aa if BESTORF_nuc_CODING != {}: - OUT3 = open("%s/%s" %(Path_OUT3,file), "w") + name_elems[3] = str(len(BESTORF_nuc_CODING.keys())) + new_name = "_".join(name_elems) + OUT3 = open("%s/%s" %(Path_OUT3,new_name), "w") for fasta_name in BESTORF_nuc_CODING.keys(): seq = BESTORF_nuc_CODING[fasta_name] OUT3.write("%s\n" %fasta_name) @@ -392,7 +399,9 @@ ## d ## OUTPUT BESTORF_aa_CODING if BESTORF_aa_CODING != {}: - OUT4 = open("%s/%s" %(Path_OUT4,file), "w") + name_elems[3] = str(len(BESTORF_aa_CODING.keys())) + new_name = "_".join(name_elems) + OUT4 = open("%s/%s" %(Path_OUT4,new_name), "w") for fasta_name in BESTORF_aa_CODING.keys(): seq = BESTORF_aa_CODING[fasta_name] OUT4.write("%s\n" %fasta_name) @@ -401,8 +410,10 @@ ## e ## OUTPUT BESTORF_nuc_CDS_with_M if BESTORF_nuc_CDS_with_M != {}: + name_elems[3] = str(len(BESTORF_nuc_CDS_with_M.keys())) + new_name = "_".join(name_elems) count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1 - OUT5 = open("%s/%s" %(Path_OUT5,file), "w") + OUT5 = open("%s/%s" %(Path_OUT5,new_name), "w") for fasta_name in BESTORF_nuc_CDS_with_M.keys(): seq = BESTORF_nuc_CDS_with_M[fasta_name] OUT5.write("%s\n" %fasta_name) @@ -411,7 +422,9 @@ ## f ## OUTPUT BESTORF_aa_CDS_with_M if BESTORF_aa_CDS_with_M != {}: - OUT6 = open("%s/%s" %(Path_OUT6,file), "w") + name_elems[3] = str(len(BESTORF_aa_CDS_with_M.keys())) + new_name = "_".join(name_elems) + OUT6 = open("%s/%s" %(Path_OUT6,new_name), "w") for fasta_name in BESTORF_aa_CDS_with_M.keys(): seq = BESTORF_aa_CDS_with_M[fasta_name] OUT6.write("%s\n" %fasta_name)
