comparison scripts/S06_post_processing_of_pairwise.py @ 4:6709645eff5d draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
author abims-sbr
date Wed, 17 Jan 2018 08:53:53 -0500
parents c8af52875b0f
children
comparison
equal deleted inserted replaced
3:5f68b2fc02c1 4:6709645eff5d
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 ## AUTHOR: Eric Fontanillas 2 ## AUTHOR: Eric Fontanillas
3 ## LAST VERSION: 14/08/14 by Julie BAFFARD 3 ## LAST VERSION: 14/08/14 by Julie BAFFARD
4 4
5 MINIMUM_LENGTH = 1 #bp 5 MINIMUM_LENGTH = 1
6
7
8 ############################
9 ##### DEF1 : Get Pairs #####
10 ############################
def get_pairs(fasta_file_path):
    """Read a pairwise-match FASTA file and return its records.

    The file is read as consecutive groups of four lines: a query header
    (a line starting with ">"), the query sequence, the match header and
    the match sequence. Lines that do not start with ">" while a query
    header is expected (e.g. blank separator lines) are skipped.

    Parameters:
        fasta_file_path: path of the FASTA file to read.

    Returns:
        A list of [query_name, query_seq, match_name, match_seq] lists,
        in file order. Headers keep their leading ">".
    """
    list_pairwises = []

    # "with" guarantees the handle is closed even if reading fails.
    with open(fasta_file_path, "r") as fasta_file:
        while True:
            header_line = fasta_file.readline()
            if not header_line:  # empty string means end of file
                break
            if not header_line.startswith(">"):
                continue

            # The three lines following a query header complete the record.
            # Only line terminators are stripped, so a final line without a
            # trailing newline does not lose its last character.
            fasta_name_query = header_line.rstrip("\r\n")
            fasta_seq_query = fasta_file.readline().rstrip("\r\n")
            fasta_name_match = fasta_file.readline().rstrip("\r\n")
            fasta_seq_match = fasta_file.readline().rstrip("\r\n")

            list_pairwises.append(
                [fasta_name_query, fasta_seq_query, fasta_name_match, fasta_seq_match]
            )

    return list_pairwises
33 ##############################################
34
35 #################################
36 ##### DEF2 : Extract length #####
37 #################################
def extract_length(length_string):  # format length string = 57...902
    """Return the span covered by a "start...end" coordinate string.

    Parameters:
        length_string: text made of two integers separated by "...",
            e.g. "57...902".

    Returns:
        The second integer minus the first (845 for "57...902").

    Raises:
        IndexError: if the "..." separator is missing.
        ValueError: if either side is not an integer.
    """
    # str.split / int replace string.split / string.atoi, which no longer
    # exist in Python 3; the str methods behave the same on Python 2.
    bounds = length_string.split("...")
    return int(bounds[1]) - int(bounds[0])
45 ##############################################
46
47
48 ####################################
49 ##### DEF3 : Remove Redundancy #####
50 ####################################
def filter_redondancy_and_length(list_paireu, MIN_LENGTH):
    """Keep the longest match per query/match contig pair, then filter by length.

    Each header is split on "||": the first field minus its leading
    character (the ">") is the short contig name, and the second field of
    the query header is a "start...end" string handed to extract_length().

    Parameters:
        list_paireu: list of [query_name, query_seq, match_name, match_seq]
            records, as returned by get_pairs().
        MIN_LENGTH: threshold; only records whose matched length is
            strictly greater than this value are returned.

    Returns:
        A list of [query_name, query_seq, match_name, match_seq,
        length_matched] lists, one per distinct query/match contig pair.
    """
    # Longest record seen so far, keyed by "<short query>_<short match>".
    longest_by_binom = {}

    for pair in list_paireu:
        query_name, query_seq, match_name, match_seq = pair[:4]

        query_fields = query_name.split("||")
        short_query_name = query_fields[0][1:]
        length_matched = extract_length(query_fields[1])  ### DEF2 ###
        short_match_name = match_name.split("||")[0][1:]
        binom = "%s_%s" % (short_query_name, short_match_name)

        # Redundancy removed here: the same two contigs matching each other
        # at different positions -- only the longest such match is kept
        # (on a tie, the first one seen wins).
        # Redundancy NOT removed here: several contigs of one side matching
        # a single contig of the other side (the original notes say this is
        # counted in "09_formatMatch_getBackNucleotides.py").
        previous = longest_by_binom.get(binom)
        if previous is None or length_matched > previous[-1]:
            longest_by_binom[binom] = [
                query_name, query_seq, match_name, match_seq, length_matched
            ]

    # The stored length is the last element of each record.
    return [
        record for record in longest_by_binom.values()
        if record[-1] > MIN_LENGTH
    ]
88 ##############################################
89
90 6
91 ####################### 7 #######################
92 ##### RUN RUN RUN ##### 8 ##### RUN RUN RUN #####
93 ####################### 9 #######################
94 import string, os, time, re, sys 10 import string, os, time, re, sys
11 from functions import get_pairs, extract_length, filter_redondancy
95 12
96 ## 1 ## INPUT/OUTPUT 13 ## 1 ## INPUT/OUTPUT
97 SHORT_FILE = sys.argv[1] #short-name-query_short-name-db 14 SHORT_FILE = sys.argv[1] #short-name-query_short-name-db
98 15
99 F_IN = "%s/06_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE) 16 F_IN = "%s/06_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE)
104 F_OUT2 = "%s/09_onlyMatch_filtered_%s.fasta" %(SHORT_FILE, SHORT_FILE) 21 F_OUT2 = "%s/09_onlyMatch_filtered_%s.fasta" %(SHORT_FILE, SHORT_FILE)
105 File_OUT2 = open(F_OUT2, "w") 22 File_OUT2 = open(F_OUT2, "w")
106 23
107 ## 2 ## RUN 24 ## 2 ## RUN
108 list_pairwises = get_pairs(F_IN) ### DEF1 ### 25 list_pairwises = get_pairs(F_IN) ### DEF1 ###
109 list_pairwises_filtered1 = filter_redondancy_and_length(list_pairwises, MINIMUM_LENGTH) ### DEF3 ### 26 list_pairwises_filtered1 = filter_redondancy(list_pairwises, MINIMUM_LENGTH) ### DEF3 ###
110 27
111 i = 0 28 i = 0
112 for pair in list_pairwises_filtered1: 29 for pair in list_pairwises_filtered1:
113 i = i+1 30 i = i+1
114 31