cds_search: scripts/S01_find_orf_on_multiple

comparison scripts/S01_find_orf_on_multiple_alignment.py @ 8:716a45028e55 draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 90cfcf697b9f128e81bea1270378e59d63ab0a6f

author	abims-sbr
date	Mon, 12 Mar 2018 06:30:49 -0400
parents	35e39b4128ba
children	640ef4c06ed5

comparison

equal deleted inserted replaced

-:35e39b4128ba
+:716a45028e55
 #!/usr/bin/python
+# coding: utf8
 ## Author: Eric Fontanillas
-## Last modification: 03/09/14 by Julie BAFFARD
+## Modification: 03/09/14 by Julie BAFFARD
+## Last modification : 05/03/18 by Victor Mataigne
 ## Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria
 ## CRITERIA 1 ## Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF
 ## CRITERIA 2 ## This longest part should be > 150nc or 50aa
 ## CRITERIA 3 ## [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa
 return(Ortho)
 ###################################
 ############################################################
 ###### DEF 7 : Reverse complement DNA sequence ######
 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html
 ############################################################
 def ReverseComplement2(seq):
 # too lazy to construct the dictionary manually, use a dict comprehension
 seq1 = 'ATCG-TAGC-atcg-tagc-'
 seq_dict = { seq1[i]:seq1[i+5] for i in range(20) if i < 5 or 10<=i<15 }
 return "".join([seq_dict[base] for base in reversed(seq)])
+############################
-###################################
 #######################
 ##### RUN RUN RUN #####
 #######################
 os.mkdir("06_CDS_with_M_nuc")
 Path_OUT5 = "06_CDS_with_M_nuc"
 os.mkdir("06_CDS_with_M_aa")
 Path_OUT6 = "06_CDS_with_M_aa"
 ### Get the Bash corresponding to an alignment file in fasta format
 count_file_processed = 0
 count_file_with_CDS = 0
 count_file_without_CDS = 0
 count_file_with_CDS_plus_M = 0
+# ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name),
+# and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified.
+name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]
+# by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond
+n0 = 0
 for file in list_file:
+n0 += 1
 count_file_processed = count_file_processed + 1
 fasta_file_path = "./%s" %file
 bash_fasta = dico(fasta_file_path)   ### DEF 1 ###
 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M  = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel)   ### DEF 4 - PART 2 - ###
+name_elems[1] = str(n0)
 ## a ## OUTPUT BESTORF_nuc
 if BESTORF_nuc != {}:
+name_elems[3] = str(len(BESTORF_nuc.keys()))
+new_name = "_".join(name_elems)
 count_file_with_CDS = count_file_with_CDS +1
-OUT1 = open("%s/%s" %(Path_OUT1,file), "w")
+OUT1 = open("%s/%s" %(Path_OUT1,new_name), "w")
 for fasta_name in BESTORF_nuc.keys():
 seq = BESTORF_nuc[fasta_name]
 OUT1.write("%s\n" %fasta_name)
 OUT1.write("%s\n" %seq)
 OUT1.close()
 else:
 count_file_without_CDS = count_file_without_CDS + 1
 ## b ## OUTPUT BESTORF_nuc_CODING  ===> THE MOST INTERESTING!!!
 if BESTORF_aa != {}:
-OUT2 = open("%s/%s" %(Path_OUT2,file), "w")
+name_elems[3] = str(len(BESTORF_aa.keys()))
+new_name = "_".join(name_elems)
+OUT2 = open("%s/%s" %(Path_OUT2,new_name), "w")
 for fasta_name in BESTORF_aa.keys():
 seq = BESTORF_aa[fasta_name]
 OUT2.write("%s\n" %fasta_name)
 OUT2.write("%s\n" %seq)
 OUT2.close()
 ## c ## OUTPUT BESTORF_aa
 if BESTORF_nuc_CODING != {}:
-OUT3 = open("%s/%s" %(Path_OUT3,file), "w")
+name_elems[3] = str(len(BESTORF_nuc_CODING.keys()))
+new_name = "_".join(name_elems)
+OUT3 = open("%s/%s" %(Path_OUT3,new_name), "w")
 for fasta_name in BESTORF_nuc_CODING.keys():
 seq = BESTORF_nuc_CODING[fasta_name]
 OUT3.write("%s\n" %fasta_name)
 OUT3.write("%s\n" %seq)
 OUT3.close()
 ## d ## OUTPUT BESTORF_aa_CODING
 if BESTORF_aa_CODING != {}:
-OUT4 = open("%s/%s" %(Path_OUT4,file), "w")
+name_elems[3] = str(len(BESTORF_aa_CODING.keys()))
+new_name = "_".join(name_elems)
+OUT4 = open("%s/%s" %(Path_OUT4,new_name), "w")
 for fasta_name in BESTORF_aa_CODING.keys():
 seq = BESTORF_aa_CODING[fasta_name]
 OUT4.write("%s\n" %fasta_name)
 OUT4.write("%s\n" %seq)
 OUT4.close()
 ## e ## OUTPUT BESTORF_nuc_CDS_with_M
 if BESTORF_nuc_CDS_with_M != {}:
+name_elems[3] = str(len(BESTORF_nuc_CDS_with_M.keys()))
+new_name = "_".join(name_elems)
 count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1
-OUT5 = open("%s/%s" %(Path_OUT5,file), "w")
+OUT5 = open("%s/%s" %(Path_OUT5,new_name), "w")
 for fasta_name in BESTORF_nuc_CDS_with_M.keys():
 seq = BESTORF_nuc_CDS_with_M[fasta_name]
 OUT5.write("%s\n" %fasta_name)
 OUT5.write("%s\n" %seq)
 OUT5.close()
 ## f ## OUTPUT BESTORF_aa_CDS_with_M
 if BESTORF_aa_CDS_with_M != {}:
-OUT6 = open("%s/%s" %(Path_OUT6,file), "w")
+name_elems[3] = str(len(BESTORF_aa_CDS_with_M.keys()))
+new_name = "_".join(name_elems)
+OUT6 = open("%s/%s" %(Path_OUT6,new_name), "w")
 for fasta_name in BESTORF_aa_CDS_with_M.keys():
 seq = BESTORF_aa_CDS_with_M[fasta_name]
 OUT6.write("%s\n" %fasta_name)
 OUT6.write("%s\n" %seq)
 OUT6.close()

Mercurial > repos > abims-sbr > cds_search

comparison scripts/S01_find_orf_on_multiple_alignment.py @ 8:716a45028e55 draft