comparison scripts/S01_find_orf_on_multiple_alignment.py @ 3:ff98ed7849fa draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
author abims-sbr
date Wed, 17 Jan 2018 08:55:29 -0500
parents 0d2f72caea10
children 35e39b4128ba
comparison
equal deleted inserted replaced
2:0d2f72caea10 3:ff98ed7849fa
6 ## CRITERIA 1 ## Longest part of the sequence alignment without a stop codon "*", tested in the 3 potential ORFs 6 ## CRITERIA 1 ## Longest part of the sequence alignment without a stop codon "*", tested in the 3 potential ORFs
7 ## CRITERIA 2 ## This longest part should be > 150nc or 50aa 7 ## CRITERIA 2 ## This longest part should be > 150nc or 50aa
8 ## CRITERIA 3 ## [OPTIONAL] A start codon "M" should be present in this longest part, before the last 50 aa 8 ## CRITERIA 3 ## [OPTIONAL] A start codon "M" should be present in this longest part, before the last 50 aa
9 ## OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA 9 ## OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA
10 ## OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA 10 ## OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA
11
12
13 ###############################
14 ##### DEF 1 : Dico fasta #####
15 ###############################
16 def dico(fasta_file_path):
17 F2 = open(fasta_file_path, "r")
18 dicoco = {}
19 while 1:
20 next2 = F2.readline()
21 if not next2:
22 break
23 if next2[0] == ">":
24 fasta_name_query = next2[:-1]
25 Sn = string.split(fasta_name_query, "||")
26 fasta_name_query = Sn[0]
27 next3 = F2.readline()
28 fasta_seq_query = next3[:-1]
29 dicoco[fasta_name_query]=fasta_seq_query
30 F2.close()
31 return(dicoco)
32 ############################################################
33
34 11
35 #################################################### 12 ####################################################
36 ###### DEF 2 : Create bash for genetic code ######## 13 ###### DEF 2 : Create bash for genetic code ########
37 #################################################### 14 ####################################################
38 ### KEY = codon 15 ### KEY = codon
341 318
342 ####################### 319 #######################
343 ##### RUN RUN RUN ##### 320 ##### RUN RUN RUN #####
344 ####################### 321 #######################
345 import string, os, time, re, zipfile, sys 322 import string, os, time, re, zipfile, sys
323 from dico import dico
346 324
347 infiles = sys.argv[1] 325 infiles = sys.argv[1]
348 MINIMAL_CDS_LENGTH = int(sys.argv[3]) ## in aa number 326 MINIMAL_CDS_LENGTH = int(sys.argv[3]) ## in aa number
349 327
350 ## INPUT / OUTPUT 328 ## INPUT / OUTPUT
380 count_file_with_CDS_plus_M = 0 358 count_file_with_CDS_plus_M = 0
381 359
382 for file in list_file: 360 for file in list_file:
383 count_file_processed = count_file_processed + 1 361 count_file_processed = count_file_processed + 1
384 fasta_file_path = "./%s" %file 362 fasta_file_path = "./%s" %file
385 bash_fasta = dico(fasta_file_path) ### DEF 1 ### 363 fasta_file = open(fasta_file_path, "r")
364 bash_fasta = dico(fasta_file) ### DEF 1 ###
365 fasta_file.close()
386 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### 366 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ###
387 367
388 ## a ## OUTPUT BESTORF_nuc 368 ## a ## OUTPUT BESTORF_nuc
389 if BESTORF_nuc != {}: 369 if BESTORF_nuc != {}:
390 count_file_with_CDS = count_file_with_CDS +1 370 count_file_with_CDS = count_file_with_CDS +1