cds_search: scripts/S01_find_orf_on_multiple

comparison scripts/S01_find_orf_on_multiple_alignment.py @ 9:640ef4c06ed5 draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit f1ba8d136e0129f3e8435b25a95f70f697d51464-dirty

author	abims-sbr
date	Tue, 03 Jul 2018 10:54:18 -0400
parents	716a45028e55
children	3d00be2d05f3

comparison

equal deleted inserted replaced

-:716a45028e55
+:640ef4c06ed5
 ###### DEF 7 : Reverse complement DNA sequence ######
 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html
 ############################################################
 def ReverseComplement2(seq):
     """Return the reverse complement of the DNA sequence ``seq``.

     Upper- and lower-case A/T/C/G are complemented with case preserved.
     The ambiguous base N/n and the alignment gap '-' map to themselves.

     Raises KeyError if ``seq`` contains any other character.
     """
     # Pair every symbol with its complement explicitly rather than slicing
     # one packed string through index windows: the window-based table
     # stopped one index short and left lowercase 'n' unmapped, so any
     # sequence containing 'n' raised KeyError.
     bases = 'ATCGN-atcgn-'
     complements = 'TAGCN-tagcn-'
     seq_dict = dict(zip(bases, complements))
     return "".join([seq_dict[base] for base in reversed(seq)])
 ############################
 #######################
 ##### RUN RUN RUN #####
 #######################
 import string, os, time, re, zipfile, sys
 from dico import dico
-infiles = sys.argv[1]
+MINIMAL_CDS_LENGTH = int(sys.argv[2])  ## in aa number
-MINIMAL_CDS_LENGTH = int(sys.argv[3])  ## in aa number
-## INPUT / OUTPUT
-list_file = str.split(infiles,",")
 ### Get Universal Code
-bash_codeUniversel = code_universel(sys.argv[2])  ### DEF2 ###
+bash_codeUniversel = code_universel(sys.argv[1])  ### DEF2 ###
+## INPUT from file containing list of species
+list_files = []
+with open(sys.argv[3], 'r') as f:
+for line in f.readlines():
+list_files.append(line.strip('\n'))
 # Output directories for the best-ORF results; os.mkdir raises if one
 # of them already exists.
 Path_OUT1 = "04_BEST_ORF_nuc"
 Path_OUT2 = "04_BEST_ORF_aa"
 os.mkdir(Path_OUT1)
 os.mkdir(Path_OUT2)
 # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified.
 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]
 # Because the counter is fixed here, the output directories will have some "holes" (missing numbers), but the group numbers will match across directories
-n0 = 0
+#n0 = 0
-for file in list_file:
-n0 += 1
+for file in list_files:
+#n0 += 1
 count_file_processed = count_file_processed + 1
+nb_gp = file.split('_')[1] # Keep trace of the orthogroup number
 fasta_file_path = "./%s" %file
 bash_fasta = dico(fasta_file_path)   ### DEF 1 ###
 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M  = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel)   ### DEF 4 - PART 2 - ###
-name_elems[1] = str(n0)
+name_elems[1] = nb_gp
 ## a ## OUTPUT BESTORF_nuc
 if BESTORF_nuc != {}:
 name_elems[3] = str(len(BESTORF_nuc.keys()))
 new_name = "_".join(name_elems)
 seq = BESTORF_aa_CDS_with_M[fasta_name]
 OUT6.write("%s\n" %fasta_name)
 OUT6.write("%s\n" %seq)
 OUT6.close()
-os.system("rm -rf %s" %file)
+#os.system("rm -rf %s" %file)
 ## Print
 print "*************** CDS detection ***************"
 print "\nFiles processed: %d" %count_file_processed
 print "\tFiles with CDS: %d" %count_file_with_CDS

Mercurial > repos > abims-sbr > cds_search

comparison scripts/S01_find_orf_on_multiple_alignment.py @ 9:640ef4c06ed5 draft