Mercurial > repos > abims-sbr > cds_search
comparison scripts/S01_find_orf_on_multiple_alignment.py @ 9:640ef4c06ed5 draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit f1ba8d136e0129f3e8435b25a95f70f697d51464-dirty
| author | abims-sbr |
|---|---|
| date | Tue, 03 Jul 2018 10:54:18 -0400 |
| parents | 716a45028e55 |
| children | 3d00be2d05f3 |
comparison
equal
deleted
inserted
replaced
| 8:716a45028e55 | 9:640ef4c06ed5 |
|---|---|
| 300 ###### DEF 7 : Reverse complement DNA sequence ###### | 300 ###### DEF 7 : Reverse complement DNA sequence ###### |
| 301 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html | 301 ###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html |
| 302 ############################################################ | 302 ############################################################ |
| 303 def ReverseComplement2(seq): | 303 def ReverseComplement2(seq): |
| 304 # too lazy to construct the dictionary manually, use a dict comprehension | 304 # too lazy to construct the dictionary manually, use a dict comprehension |
| 305 seq1 = 'ATCG-TAGC-atcg-tagc-' | 305 seq1 = 'ATCGN-TAGCN-atcgn-tagcn-' |
| 306 seq_dict = { seq1[i]:seq1[i+5] for i in range(20) if i < 5 or 10<=i<15 } | 306 seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<16 } |
| 307 return "".join([seq_dict[base] for base in reversed(seq)]) | 307 return "".join([seq_dict[base] for base in reversed(seq)]) |
| 308 ############################ | 308 ############################ |
| 309 | 309 |
| 310 | 310 |
| 311 ####################### | 311 ####################### |
| 312 ##### RUN RUN RUN ##### | 312 ##### RUN RUN RUN ##### |
| 313 ####################### | 313 ####################### |
| 314 import string, os, time, re, zipfile, sys | 314 import string, os, time, re, zipfile, sys |
| 315 from dico import dico | 315 from dico import dico |
| 316 | 316 |
| 317 infiles = sys.argv[1] | 317 MINIMAL_CDS_LENGTH = int(sys.argv[2]) ## in aa number |
| 318 MINIMAL_CDS_LENGTH = int(sys.argv[3]) ## in aa number | |
| 319 | |
| 320 ## INPUT / OUTPUT | |
| 321 list_file = str.split(infiles,",") | |
| 322 | 318 |
| 323 ### Get Universal Code | 319 ### Get Universal Code |
| 324 bash_codeUniversel = code_universel(sys.argv[2]) ### DEF2 ### | 320 bash_codeUniversel = code_universel(sys.argv[1]) ### DEF2 ### |
| 321 | |
| 322 ## INPUT from file containing list of species | |
| 323 list_files = [] | |
| 324 with open(sys.argv[3], 'r') as f: | |
| 325 for line in f.readlines(): | |
| 326 list_files.append(line.strip('\n')) | |
| 325 | 327 |
| 326 os.mkdir("04_BEST_ORF_nuc") | 328 os.mkdir("04_BEST_ORF_nuc") |
| 327 Path_OUT1 = "04_BEST_ORF_nuc" | 329 Path_OUT1 = "04_BEST_ORF_nuc" |
| 328 os.mkdir("04_BEST_ORF_aa") | 330 os.mkdir("04_BEST_ORF_aa") |
| 329 Path_OUT2 = "04_BEST_ORF_aa" | 331 Path_OUT2 = "04_BEST_ORF_aa" |
| 348 # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified. | 350 # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified. |
| 349 | 351 |
| 350 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"] | 352 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"] |
| 351 | 353 |
| 352 # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond | 354 # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond |
| 353 n0 = 0 | 355 #n0 = 0 |
| 354 for file in list_file: | 356 |
| 355 n0 += 1 | 357 for file in list_files: |
| 358 #n0 += 1 | |
| 356 | 359 |
| 357 count_file_processed = count_file_processed + 1 | 360 count_file_processed = count_file_processed + 1 |
| 361 nb_gp = file.split('_')[1] # Keep trace of the orthogroup number | |
| 358 fasta_file_path = "./%s" %file | 362 fasta_file_path = "./%s" %file |
| 359 bash_fasta = dico(fasta_file_path) ### DEF 1 ### | 363 bash_fasta = dico(fasta_file_path) ### DEF 1 ### |
| 360 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### | 364 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### |
| 361 | 365 |
| 362 name_elems[1] = str(n0) | 366 name_elems[1] = nb_gp |
| 363 | 367 |
| 364 ## a ## OUTPUT BESTORF_nuc | 368 ## a ## OUTPUT BESTORF_nuc |
| 365 if BESTORF_nuc != {}: | 369 if BESTORF_nuc != {}: |
| 366 name_elems[3] = str(len(BESTORF_nuc.keys())) | 370 name_elems[3] = str(len(BESTORF_nuc.keys())) |
| 367 new_name = "_".join(name_elems) | 371 new_name = "_".join(name_elems) |
| 429 seq = BESTORF_aa_CDS_with_M[fasta_name] | 433 seq = BESTORF_aa_CDS_with_M[fasta_name] |
| 430 OUT6.write("%s\n" %fasta_name) | 434 OUT6.write("%s\n" %fasta_name) |
| 431 OUT6.write("%s\n" %seq) | 435 OUT6.write("%s\n" %seq) |
| 432 OUT6.close() | 436 OUT6.close() |
| 433 | 437 |
| 434 os.system("rm -rf %s" %file) | 438 #os.system("rm -rf %s" %file) |
| 435 | 439 |
| 436 ## Print | 440 ## Print |
| 437 print "*************** CDS detection ***************" | 441 print "*************** CDS detection ***************" |
| 438 print "\nFiles processed: %d" %count_file_processed | 442 print "\nFiles processed: %d" %count_file_processed |
| 439 print "\tFiles with CDS: %d" %count_file_with_CDS | 443 print "\tFiles with CDS: %d" %count_file_with_CDS |
