comparison scripts/S01_find_orf_on_multiple_alignment.py @ 9:640ef4c06ed5 draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit f1ba8d136e0129f3e8435b25a95f70f697d51464-dirty
author abims-sbr
date Tue, 03 Jul 2018 10:54:18 -0400
parents 716a45028e55
children 3d00be2d05f3
comparison
equal deleted inserted replaced
8:716a45028e55 9:640ef4c06ed5
300 ###### DEF 7 : Reverse complement DNA sequence ###### 300 ###### DEF 7 : Reverse complement DNA sequence ######
301 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html 301 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html
302 ############################################################ 302 ############################################################
303 def ReverseComplement2(seq): 303 def ReverseComplement2(seq):
304 # too lazy to construct the dictionary manually, use a dict comprehension 304 # too lazy to construct the dictionary manually, use a dict comprehension
305 seq1 = 'ATCG-TAGC-atcg-tagc-' 305 seq1 = 'ATCGN-TAGCN-atcgn-tagcn-'
306 seq_dict = { seq1[i]:seq1[i+5] for i in range(20) if i < 5 or 10<=i<15 } 306 seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<16 }
307 return "".join([seq_dict[base] for base in reversed(seq)]) 307 return "".join([seq_dict[base] for base in reversed(seq)])
308 ############################ 308 ############################
309 309
310 310
311 ####################### 311 #######################
312 ##### RUN RUN RUN ##### 312 ##### RUN RUN RUN #####
313 ####################### 313 #######################
314 import string, os, time, re, zipfile, sys 314 import string, os, time, re, zipfile, sys
315 from dico import dico 315 from dico import dico
316 316
317 infiles = sys.argv[1] 317 MINIMAL_CDS_LENGTH = int(sys.argv[2]) ## in aa number
318 MINIMAL_CDS_LENGTH = int(sys.argv[3]) ## in aa number
319
320 ## INPUT / OUTPUT
321 list_file = str.split(infiles,",")
322 318
323 ### Get Universal Code 319 ### Get Universal Code
324 bash_codeUniversel = code_universel(sys.argv[2]) ### DEF2 ### 320 bash_codeUniversel = code_universel(sys.argv[1]) ### DEF2 ###
321
322 ## INPUT from file containing list of species
323 list_files = []
324 with open(sys.argv[3], 'r') as f:
325 for line in f.readlines():
326 list_files.append(line.strip('\n'))
325 327
326 os.mkdir("04_BEST_ORF_nuc") 328 os.mkdir("04_BEST_ORF_nuc")
327 Path_OUT1 = "04_BEST_ORF_nuc" 329 Path_OUT1 = "04_BEST_ORF_nuc"
328 os.mkdir("04_BEST_ORF_aa") 330 os.mkdir("04_BEST_ORF_aa")
329 Path_OUT2 = "04_BEST_ORF_aa" 331 Path_OUT2 = "04_BEST_ORF_aa"
348 # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified. 350 # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified.
349 351
350 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"] 352 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]
351 353
352 # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond 354 # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond
353 n0 = 0 355 #n0 = 0
354 for file in list_file: 356
355 n0 += 1 357 for file in list_files:
358 #n0 += 1
356 359
357 count_file_processed = count_file_processed + 1 360 count_file_processed = count_file_processed + 1
361 nb_gp = file.split('_')[1] # Keep trace of the orthogroup number
358 fasta_file_path = "./%s" %file 362 fasta_file_path = "./%s" %file
359 bash_fasta = dico(fasta_file_path) ### DEF 1 ### 363 bash_fasta = dico(fasta_file_path) ### DEF 1 ###
360 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### 364 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ###
361 365
362 name_elems[1] = str(n0) 366 name_elems[1] = nb_gp
363 367
364 ## a ## OUTPUT BESTORF_nuc 368 ## a ## OUTPUT BESTORF_nuc
365 if BESTORF_nuc != {}: 369 if BESTORF_nuc != {}:
366 name_elems[3] = str(len(BESTORF_nuc.keys())) 370 name_elems[3] = str(len(BESTORF_nuc.keys()))
367 new_name = "_".join(name_elems) 371 new_name = "_".join(name_elems)
429 seq = BESTORF_aa_CDS_with_M[fasta_name] 433 seq = BESTORF_aa_CDS_with_M[fasta_name]
430 OUT6.write("%s\n" %fasta_name) 434 OUT6.write("%s\n" %fasta_name)
431 OUT6.write("%s\n" %seq) 435 OUT6.write("%s\n" %seq)
432 OUT6.close() 436 OUT6.close()
433 437
434 os.system("rm -rf %s" %file) 438 #os.system("rm -rf %s" %file)
435 439
436 ## Print 440 ## Print
437 print "*************** CDS detection ***************" 441 print "*************** CDS detection ***************"
438 print "\nFiles processed: %d" %count_file_processed 442 print "\nFiles processed: %d" %count_file_processed
439 print "\tFiles with CDS: %d" %count_file_with_CDS 443 print "\tFiles with CDS: %d" %count_file_with_CDS