Mercurial > repos > abims-sbr > cds_search
comparison scripts/S01_find_orf_on_multiple_alignment.py @ 8:716a45028e55 draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 90cfcf697b9f128e81bea1270378e59d63ab0a6f
| author | abims-sbr |
|---|---|
| date | Mon, 12 Mar 2018 06:30:49 -0400 |
| parents | 35e39b4128ba |
| children | 640ef4c06ed5 |
comparison
equal
deleted
inserted
replaced
| 7:35e39b4128ba | 8:716a45028e55 |
|---|---|
| 1 #!/usr/bin/python | 1 #!/usr/bin/python |
| 2 # coding: utf8 | |
| 2 ## Author: Eric Fontanillas | 3 ## Author: Eric Fontanillas |
| 3 ## Last modification: 03/09/14 by Julie BAFFARD | 4 ## Modification: 03/09/14 by Julie BAFFARD |
| 5 ## Last modification : 05/03/18 by Victor Mataigne | |
| 4 | 6 |
| 5 ## Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria | 7 ## Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria |
| 6 ## CRITERIA 1 ## Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF | 8 ## CRITERIA 1 ## Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF |
| 7 ## CRITERIA 2 ## This longest part should be > 150nc or 50aa | 9 ## CRITERIA 2 ## This longest part should be > 150nc or 50aa |
| 8 ## CRITERIA 3 ## [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa | 10 ## CRITERIA 3 ## [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa |
| 292 | 294 |
| 293 return(Ortho) | 295 return(Ortho) |
| 294 ################################### | 296 ################################### |
| 295 | 297 |
| 296 | 298 |
| 297 | |
| 298 | |
| 299 | |
| 300 | |
| 301 ############################################################ | 299 ############################################################ |
| 302 ###### DEF 7 : Reverse complement DNA sequence ###### | 300 ###### DEF 7 : Reverse complement DNA sequence ###### |
| 303 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html | 301 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html |
| 304 ############################################################ | 302 ############################################################ |
| 305 | |
| 306 | |
| 307 def ReverseComplement2(seq): | 303 def ReverseComplement2(seq): |
| 308 # too lazy to construct the dictionary manually, use a dict comprehension | 304 # too lazy to construct the dictionary manually, use a dict comprehension |
| 309 seq1 = 'ATCG-TAGC-atcg-tagc-' | 305 seq1 = 'ATCG-TAGC-atcg-tagc-' |
| 310 seq_dict = { seq1[i]:seq1[i+5] for i in range(20) if i < 5 or 10<=i<15 } | 306 seq_dict = { seq1[i]:seq1[i+5] for i in range(20) if i < 5 or 10<=i<15 } |
| 311 return "".join([seq_dict[base] for base in reversed(seq)]) | 307 return "".join([seq_dict[base] for base in reversed(seq)]) |
| 312 | 308 ############################ |
| 313 ################################### | |
| 314 | |
| 315 | 309 |
| 316 | 310 |
| 317 ####################### | 311 ####################### |
| 318 ##### RUN RUN RUN ##### | 312 ##### RUN RUN RUN ##### |
| 319 ####################### | 313 ####################### |
| 342 os.mkdir("06_CDS_with_M_nuc") | 336 os.mkdir("06_CDS_with_M_nuc") |
| 343 Path_OUT5 = "06_CDS_with_M_nuc" | 337 Path_OUT5 = "06_CDS_with_M_nuc" |
| 344 os.mkdir("06_CDS_with_M_aa") | 338 os.mkdir("06_CDS_with_M_aa") |
| 345 Path_OUT6 = "06_CDS_with_M_aa" | 339 Path_OUT6 = "06_CDS_with_M_aa" |
| 346 | 340 |
| 347 | |
| 348 | |
| 349 | |
| 350 ### Get the Bash corresponding to an alignment file in fasta format | 341 ### Get the Bash corresponding to an alignment file in fasta format |
| 351 count_file_processed = 0 | 342 count_file_processed = 0 |
| 352 count_file_with_CDS = 0 | 343 count_file_with_CDS = 0 |
| 353 count_file_without_CDS = 0 | 344 count_file_without_CDS = 0 |
| 354 count_file_with_CDS_plus_M = 0 | 345 count_file_with_CDS_plus_M = 0 |
| 355 | 346 |
| 347 # ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name), | |
| 348 # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified. | |
| 349 | |
| 350 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"] | |
| 351 | |
| 352 # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond | |
| 353 n0 = 0 | |
| 356 for file in list_file: | 354 for file in list_file: |
| 355 n0 += 1 | |
| 356 | |
| 357 count_file_processed = count_file_processed + 1 | 357 count_file_processed = count_file_processed + 1 |
| 358 fasta_file_path = "./%s" %file | 358 fasta_file_path = "./%s" %file |
| 359 bash_fasta = dico(fasta_file_path) ### DEF 1 ### | 359 bash_fasta = dico(fasta_file_path) ### DEF 1 ### |
| 360 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### | 360 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### |
| 361 | |
| 362 name_elems[1] = str(n0) | |
| 361 | 363 |
| 362 ## a ## OUTPUT BESTORF_nuc | 364 ## a ## OUTPUT BESTORF_nuc |
| 363 if BESTORF_nuc != {}: | 365 if BESTORF_nuc != {}: |
| 366 name_elems[3] = str(len(BESTORF_nuc.keys())) | |
| 367 new_name = "_".join(name_elems) | |
| 364 count_file_with_CDS = count_file_with_CDS +1 | 368 count_file_with_CDS = count_file_with_CDS +1 |
| 365 OUT1 = open("%s/%s" %(Path_OUT1,file), "w") | 369 OUT1 = open("%s/%s" %(Path_OUT1,new_name), "w") |
| 366 for fasta_name in BESTORF_nuc.keys(): | 370 for fasta_name in BESTORF_nuc.keys(): |
| 367 seq = BESTORF_nuc[fasta_name] | 371 seq = BESTORF_nuc[fasta_name] |
| 368 OUT1.write("%s\n" %fasta_name) | 372 OUT1.write("%s\n" %fasta_name) |
| 369 OUT1.write("%s\n" %seq) | 373 OUT1.write("%s\n" %seq) |
| 370 OUT1.close() | 374 OUT1.close() |
| 371 else: | 375 else: |
| 372 count_file_without_CDS = count_file_without_CDS + 1 | 376 count_file_without_CDS = count_file_without_CDS + 1 |
| 373 | 377 |
| 374 | |
| 375 ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!! | 378 ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!! |
| 376 if BESTORF_aa != {}: | 379 if BESTORF_aa != {}: |
| 377 OUT2 = open("%s/%s" %(Path_OUT2,file), "w") | 380 name_elems[3] = str(len(BESTORF_aa.keys())) |
| 381 new_name = "_".join(name_elems) | |
| 382 OUT2 = open("%s/%s" %(Path_OUT2,new_name), "w") | |
| 378 for fasta_name in BESTORF_aa.keys(): | 383 for fasta_name in BESTORF_aa.keys(): |
| 379 seq = BESTORF_aa[fasta_name] | 384 seq = BESTORF_aa[fasta_name] |
| 380 OUT2.write("%s\n" %fasta_name) | 385 OUT2.write("%s\n" %fasta_name) |
| 381 OUT2.write("%s\n" %seq) | 386 OUT2.write("%s\n" %seq) |
| 382 OUT2.close() | 387 OUT2.close() |
| 383 | 388 |
| 384 ## c ## OUTPUT BESTORF_aa | 389 ## c ## OUTPUT BESTORF_aa |
| 385 if BESTORF_nuc_CODING != {}: | 390 if BESTORF_nuc_CODING != {}: |
| 386 OUT3 = open("%s/%s" %(Path_OUT3,file), "w") | 391 name_elems[3] = str(len(BESTORF_nuc_CODING.keys())) |
| 392 new_name = "_".join(name_elems) | |
| 393 OUT3 = open("%s/%s" %(Path_OUT3,new_name), "w") | |
| 387 for fasta_name in BESTORF_nuc_CODING.keys(): | 394 for fasta_name in BESTORF_nuc_CODING.keys(): |
| 388 seq = BESTORF_nuc_CODING[fasta_name] | 395 seq = BESTORF_nuc_CODING[fasta_name] |
| 389 OUT3.write("%s\n" %fasta_name) | 396 OUT3.write("%s\n" %fasta_name) |
| 390 OUT3.write("%s\n" %seq) | 397 OUT3.write("%s\n" %seq) |
| 391 OUT3.close() | 398 OUT3.close() |
| 392 | 399 |
| 393 ## d ## OUTPUT BESTORF_aa_CODING | 400 ## d ## OUTPUT BESTORF_aa_CODING |
| 394 if BESTORF_aa_CODING != {}: | 401 if BESTORF_aa_CODING != {}: |
| 395 OUT4 = open("%s/%s" %(Path_OUT4,file), "w") | 402 name_elems[3] = str(len(BESTORF_aa_CODING.keys())) |
| 403 new_name = "_".join(name_elems) | |
| 404 OUT4 = open("%s/%s" %(Path_OUT4,new_name), "w") | |
| 396 for fasta_name in BESTORF_aa_CODING.keys(): | 405 for fasta_name in BESTORF_aa_CODING.keys(): |
| 397 seq = BESTORF_aa_CODING[fasta_name] | 406 seq = BESTORF_aa_CODING[fasta_name] |
| 398 OUT4.write("%s\n" %fasta_name) | 407 OUT4.write("%s\n" %fasta_name) |
| 399 OUT4.write("%s\n" %seq) | 408 OUT4.write("%s\n" %seq) |
| 400 OUT4.close() | 409 OUT4.close() |
| 401 | 410 |
| 402 ## e ## OUTPUT BESTORF_nuc_CDS_with_M | 411 ## e ## OUTPUT BESTORF_nuc_CDS_with_M |
| 403 if BESTORF_nuc_CDS_with_M != {}: | 412 if BESTORF_nuc_CDS_with_M != {}: |
| 413 name_elems[3] = str(len(BESTORF_nuc_CDS_with_M.keys())) | |
| 414 new_name = "_".join(name_elems) | |
| 404 count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1 | 415 count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1 |
| 405 OUT5 = open("%s/%s" %(Path_OUT5,file), "w") | 416 OUT5 = open("%s/%s" %(Path_OUT5,new_name), "w") |
| 406 for fasta_name in BESTORF_nuc_CDS_with_M.keys(): | 417 for fasta_name in BESTORF_nuc_CDS_with_M.keys(): |
| 407 seq = BESTORF_nuc_CDS_with_M[fasta_name] | 418 seq = BESTORF_nuc_CDS_with_M[fasta_name] |
| 408 OUT5.write("%s\n" %fasta_name) | 419 OUT5.write("%s\n" %fasta_name) |
| 409 OUT5.write("%s\n" %seq) | 420 OUT5.write("%s\n" %seq) |
| 410 OUT5.close() | 421 OUT5.close() |
| 411 | 422 |
| 412 ## f ## OUTPUT BESTORF_aa_CDS_with_M | 423 ## f ## OUTPUT BESTORF_aa_CDS_with_M |
| 413 if BESTORF_aa_CDS_with_M != {}: | 424 if BESTORF_aa_CDS_with_M != {}: |
| 414 OUT6 = open("%s/%s" %(Path_OUT6,file), "w") | 425 name_elems[3] = str(len(BESTORF_aa_CDS_with_M.keys())) |
| 426 new_name = "_".join(name_elems) | |
| 427 OUT6 = open("%s/%s" %(Path_OUT6,new_name), "w") | |
| 415 for fasta_name in BESTORF_aa_CDS_with_M.keys(): | 428 for fasta_name in BESTORF_aa_CDS_with_M.keys(): |
| 416 seq = BESTORF_aa_CDS_with_M[fasta_name] | 429 seq = BESTORF_aa_CDS_with_M[fasta_name] |
| 417 OUT6.write("%s\n" %fasta_name) | 430 OUT6.write("%s\n" %fasta_name) |
| 418 OUT6.write("%s\n" %seq) | 431 OUT6.write("%s\n" %seq) |
| 419 OUT6.close() | 432 OUT6.close() |
