comparison scripts/S01_find_orf_on_multiple_alignment.py @ 8:716a45028e55 draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 90cfcf697b9f128e81bea1270378e59d63ab0a6f
author abims-sbr
date Mon, 12 Mar 2018 06:30:49 -0400
parents 35e39b4128ba
children 640ef4c06ed5
comparison
equal deleted inserted replaced
7:35e39b4128ba 8:716a45028e55
1 #!/usr/bin/python 1 #!/usr/bin/python
2 # coding: utf8
2 ## Author: Eric Fontanillas 3 ## Author: Eric Fontanillas
3 ## Last modification: 03/09/14 by Julie BAFFARD 4 ## Modification: 03/09/14 by Julie BAFFARD
5 ## Last modification : 05/03/18 by Victor Mataigne
4 6
5 ## Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria 7 ## Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria
6 ## CRITERIA 1 ## Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF 8 ## CRITERIA 1 ## Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF
7 ## CRITERIA 2 ## This longest part should be > 150nc or 50aa 9 ## CRITERIA 2 ## This longest part should be > 150nc or 50aa
8 ## CRITERIA 3 ## [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa 10 ## CRITERIA 3 ## [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa
292 294
293 return(Ortho) 295 return(Ortho)
294 ################################### 296 ###################################
295 297
296 298
297
298
299
300
301 ############################################################ 299 ############################################################
302 ###### DEF 7 : Reverse complement DNA sequence ###### 300 ###### DEF 7 : Reverse complement DNA sequence ######
303 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html 301 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html
304 ############################################################ 302 ############################################################
305
306
307 def ReverseComplement2(seq): 303 def ReverseComplement2(seq):
308 # too lazy to construct the dictionary manually, use a dict comprehension 304 # too lazy to construct the dictionary manually, use a dict comprehension
309 seq1 = 'ATCG-TAGC-atcg-tagc-' 305 seq1 = 'ATCG-TAGC-atcg-tagc-'
310 seq_dict = { seq1[i]:seq1[i+5] for i in range(20) if i < 5 or 10<=i<15 } 306 seq_dict = { seq1[i]:seq1[i+5] for i in range(20) if i < 5 or 10<=i<15 }
311 return "".join([seq_dict[base] for base in reversed(seq)]) 307 return "".join([seq_dict[base] for base in reversed(seq)])
312 308 ############################
313 ###################################
314
315 309
316 310
317 ####################### 311 #######################
318 ##### RUN RUN RUN ##### 312 ##### RUN RUN RUN #####
319 ####################### 313 #######################
342 os.mkdir("06_CDS_with_M_nuc") 336 os.mkdir("06_CDS_with_M_nuc")
343 Path_OUT5 = "06_CDS_with_M_nuc" 337 Path_OUT5 = "06_CDS_with_M_nuc"
344 os.mkdir("06_CDS_with_M_aa") 338 os.mkdir("06_CDS_with_M_aa")
345 Path_OUT6 = "06_CDS_with_M_aa" 339 Path_OUT6 = "06_CDS_with_M_aa"
346 340
347
348
349
350 ### Get the Bash corresponding to an alignment file in fasta format 341 ### Get the Bash corresponding to an alignment file in fasta format
351 count_file_processed = 0 342 count_file_processed = 0
352 count_file_with_CDS = 0 343 count_file_with_CDS = 0
353 count_file_without_CDS = 0 344 count_file_without_CDS = 0
354 count_file_with_CDS_plus_M = 0 345 count_file_with_CDS_plus_M = 0
355 346
347 # ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name),
348 # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified.
349
350 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]
351
352 # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond
353 n0 = 0
356 for file in list_file: 354 for file in list_file:
355 n0 += 1
356
357 count_file_processed = count_file_processed + 1 357 count_file_processed = count_file_processed + 1
358 fasta_file_path = "./%s" %file 358 fasta_file_path = "./%s" %file
359 bash_fasta = dico(fasta_file_path) ### DEF 1 ### 359 bash_fasta = dico(fasta_file_path) ### DEF 1 ###
360 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### 360 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ###
361
362 name_elems[1] = str(n0)
361 363
362 ## a ## OUTPUT BESTORF_nuc 364 ## a ## OUTPUT BESTORF_nuc
363 if BESTORF_nuc != {}: 365 if BESTORF_nuc != {}:
366 name_elems[3] = str(len(BESTORF_nuc.keys()))
367 new_name = "_".join(name_elems)
364 count_file_with_CDS = count_file_with_CDS +1 368 count_file_with_CDS = count_file_with_CDS +1
365 OUT1 = open("%s/%s" %(Path_OUT1,file), "w") 369 OUT1 = open("%s/%s" %(Path_OUT1,new_name), "w")
366 for fasta_name in BESTORF_nuc.keys(): 370 for fasta_name in BESTORF_nuc.keys():
367 seq = BESTORF_nuc[fasta_name] 371 seq = BESTORF_nuc[fasta_name]
368 OUT1.write("%s\n" %fasta_name) 372 OUT1.write("%s\n" %fasta_name)
369 OUT1.write("%s\n" %seq) 373 OUT1.write("%s\n" %seq)
370 OUT1.close() 374 OUT1.close()
371 else: 375 else:
372 count_file_without_CDS = count_file_without_CDS + 1 376 count_file_without_CDS = count_file_without_CDS + 1
373 377
374
375 ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!! 378 ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!!
376 if BESTORF_aa != {}: 379 if BESTORF_aa != {}:
377 OUT2 = open("%s/%s" %(Path_OUT2,file), "w") 380 name_elems[3] = str(len(BESTORF_aa.keys()))
381 new_name = "_".join(name_elems)
382 OUT2 = open("%s/%s" %(Path_OUT2,new_name), "w")
378 for fasta_name in BESTORF_aa.keys(): 383 for fasta_name in BESTORF_aa.keys():
379 seq = BESTORF_aa[fasta_name] 384 seq = BESTORF_aa[fasta_name]
380 OUT2.write("%s\n" %fasta_name) 385 OUT2.write("%s\n" %fasta_name)
381 OUT2.write("%s\n" %seq) 386 OUT2.write("%s\n" %seq)
382 OUT2.close() 387 OUT2.close()
383 388
384 ## c ## OUTPUT BESTORF_aa 389 ## c ## OUTPUT BESTORF_aa
385 if BESTORF_nuc_CODING != {}: 390 if BESTORF_nuc_CODING != {}:
386 OUT3 = open("%s/%s" %(Path_OUT3,file), "w") 391 name_elems[3] = str(len(BESTORF_nuc_CODING.keys()))
392 new_name = "_".join(name_elems)
393 OUT3 = open("%s/%s" %(Path_OUT3,new_name), "w")
387 for fasta_name in BESTORF_nuc_CODING.keys(): 394 for fasta_name in BESTORF_nuc_CODING.keys():
388 seq = BESTORF_nuc_CODING[fasta_name] 395 seq = BESTORF_nuc_CODING[fasta_name]
389 OUT3.write("%s\n" %fasta_name) 396 OUT3.write("%s\n" %fasta_name)
390 OUT3.write("%s\n" %seq) 397 OUT3.write("%s\n" %seq)
391 OUT3.close() 398 OUT3.close()
392 399
393 ## d ## OUTPUT BESTORF_aa_CODING 400 ## d ## OUTPUT BESTORF_aa_CODING
394 if BESTORF_aa_CODING != {}: 401 if BESTORF_aa_CODING != {}:
395 OUT4 = open("%s/%s" %(Path_OUT4,file), "w") 402 name_elems[3] = str(len(BESTORF_aa_CODING.keys()))
403 new_name = "_".join(name_elems)
404 OUT4 = open("%s/%s" %(Path_OUT4,new_name), "w")
396 for fasta_name in BESTORF_aa_CODING.keys(): 405 for fasta_name in BESTORF_aa_CODING.keys():
397 seq = BESTORF_aa_CODING[fasta_name] 406 seq = BESTORF_aa_CODING[fasta_name]
398 OUT4.write("%s\n" %fasta_name) 407 OUT4.write("%s\n" %fasta_name)
399 OUT4.write("%s\n" %seq) 408 OUT4.write("%s\n" %seq)
400 OUT4.close() 409 OUT4.close()
401 410
402 ## e ## OUTPUT BESTORF_nuc_CDS_with_M 411 ## e ## OUTPUT BESTORF_nuc_CDS_with_M
403 if BESTORF_nuc_CDS_with_M != {}: 412 if BESTORF_nuc_CDS_with_M != {}:
413 name_elems[3] = str(len(BESTORF_nuc_CDS_with_M.keys()))
414 new_name = "_".join(name_elems)
404 count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1 415 count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1
405 OUT5 = open("%s/%s" %(Path_OUT5,file), "w") 416 OUT5 = open("%s/%s" %(Path_OUT5,new_name), "w")
406 for fasta_name in BESTORF_nuc_CDS_with_M.keys(): 417 for fasta_name in BESTORF_nuc_CDS_with_M.keys():
407 seq = BESTORF_nuc_CDS_with_M[fasta_name] 418 seq = BESTORF_nuc_CDS_with_M[fasta_name]
408 OUT5.write("%s\n" %fasta_name) 419 OUT5.write("%s\n" %fasta_name)
409 OUT5.write("%s\n" %seq) 420 OUT5.write("%s\n" %seq)
410 OUT5.close() 421 OUT5.close()
411 422
412 ## f ## OUTPUT BESTORF_aa_CDS_with_M 423 ## f ## OUTPUT BESTORF_aa_CDS_with_M
413 if BESTORF_aa_CDS_with_M != {}: 424 if BESTORF_aa_CDS_with_M != {}:
414 OUT6 = open("%s/%s" %(Path_OUT6,file), "w") 425 name_elems[3] = str(len(BESTORF_aa_CDS_with_M.keys()))
426 new_name = "_".join(name_elems)
427 OUT6 = open("%s/%s" %(Path_OUT6,new_name), "w")
415 for fasta_name in BESTORF_aa_CDS_with_M.keys(): 428 for fasta_name in BESTORF_aa_CDS_with_M.keys():
416 seq = BESTORF_aa_CDS_with_M[fasta_name] 429 seq = BESTORF_aa_CDS_with_M[fasta_name]
417 OUT6.write("%s\n" %fasta_name) 430 OUT6.write("%s\n" %fasta_name)
418 OUT6.write("%s\n" %seq) 431 OUT6.write("%s\n" %seq)
419 OUT6.close() 432 OUT6.close()