comparison scripts/S06_post_processing_of_pairwise.py @ 1:c8af52875b0f draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit ab76075e541dd7ece1090f6b55ca508ec0fde39d
author lecorguille
date Thu, 13 Apr 2017 09:46:45 -0400
parents
children 6709645eff5d
comparison
equal deleted inserted replaced
0:e95d4b20c62d 1:c8af52875b0f
1 #!/usr/bin/env python
2 ## AUTHOR: Eric Fontanillas
3 ## LAST VERSION: 14/08/14 by Julie BAFFARD
4
5 MINIMUM_LENGTH = 1 #bp
6
7
8 ############################
9 ##### DEF1 : Get Pairs #####
10 ############################
11 def get_pairs(fasta_file_path):
12 F2 = open(fasta_file_path, "r")
13 list_pairwises = []
14 while 1:
15 next2 = F2.readline()
16 if not next2:
17 break
18 if next2[0] == ">":
19 fasta_name_query = next2[:-1]
20 next3 = F2.readline()
21 fasta_seq_query = next3[:-1]
22 next3 = F2.readline() ## jump one empty line (if any after the sequence)
23 fasta_name_match = next3[:-1]
24 next3 = F2.readline()
25 fasta_seq_match = next3[:-1]
26 pairwise = [fasta_name_query,fasta_seq_query,fasta_name_match,fasta_seq_match]
27
28 ## ADD pairwise with condition
29 list_pairwises.append(pairwise)
30 F2.close()
31
32 return(list_pairwises)
33 ##############################################
34
35 #################################
36 ##### DEF2 : Extract length #####
37 #################################
38 def extract_length(length_string): # format length string = 57...902
39 l3 = string.split(length_string, "...")
40 n1 = string.atoi(l3[0])
41 n2 = string.atoi(l3[1])
42 length = n2-n1
43
44 return(length)
45 ##############################################
46
47
48 ####################################
49 ##### DEF3 : Remove Redondancy #####
50 ####################################
51 def filter_redondancy_and_length(list_paireu, MIN_LENGTH):
52
53 bash1 = {}
54 list_pairout = []
55
56 for pair in list_paireu:
57 query_name = pair[0]
58 query_seq = pair[1]
59 match_name = pair[2]
60 match_seq = pair[3]
61
62 l1 = string.split(query_name, "||")
63 short_query_name = l1[0][1:]
64 length_matched = extract_length(l1[1]) ### DEF2 ###
65 l2 = string.split(match_name, "||")
66 short_match_name = l2[0][1:]
67 binom = "%s_%s" %(short_query_name, short_match_name)
68
69 ## TEST FOR REDONDANCY
70 ## REDONDANCY OF BINOME!!!! => MATCHE BETWEEN THE SAME 2 CONTIGS, BUT AT DIFFERENT POSITIONS ON THE CONTIG
71 ## REDONDANCY NOT REMOVED HERE:
72 ## 1/ Several "TERA" match with one "APN" (Counted in script "09_formatMatch_getBackNucleotides.py")
73 ## 2/ Several "APN" match with one "TERA" (Counted
74 if binom not in bash1.keys():
75 bash1[binom] = [query_name, query_seq, match_name, match_seq, length_matched]
76 else:
77 old_length = bash1[binom][-1]
78 if length_matched > old_length:
79 bash1[binom] = [query_name, query_seq, match_name, match_seq, length_matched]
80
81
82 for bino in bash1.keys():
83 length = bash1[bino][-1]
84 if length > MIN_LENGTH:
85 list_pairout.append(bash1[bino])
86
87 return(list_pairout)
88 ##############################################
89
90
91 #######################
92 ##### RUN RUN RUN #####
93 #######################
94 import string, os, time, re, sys
95
96 ## 1 ## INPUT/OUTPUT
97 SHORT_FILE = sys.argv[1] #short-name-query_short-name-db
98
99 F_IN = "%s/06_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE)
100
101 F_OUT = "%s/09_PairwiseMatch_filtered_%s.fasta" %(SHORT_FILE, SHORT_FILE)
102 File_OUT = open(F_OUT, "w")
103
104 F_OUT2 = "%s/09_onlyMatch_filtered_%s.fasta" %(SHORT_FILE, SHORT_FILE)
105 File_OUT2 = open(F_OUT2, "w")
106
107 ## 2 ## RUN
108 list_pairwises = get_pairs(F_IN) ### DEF1 ###
109 list_pairwises_filtered1 = filter_redondancy_and_length(list_pairwises, MINIMUM_LENGTH) ### DEF3 ###
110
111 i = 0
112 for pair in list_pairwises_filtered1:
113 i = i+1
114
115 ## Write pairwise alignment
116 File_OUT.write("%s\n" %pair[0])
117 File_OUT.write("%s\n" %pair[1])
118 File_OUT.write("%s\n" %pair[2])
119 File_OUT.write("%s\n" %pair[3])
120
121 ## Write only "matches" [AND UNGAP THEM: needed before the 2nd run of blast]
122 File_OUT2.write("%s\n" %pair[2])
123 seq_match = pair[3]
124 seq_match_ungapped = string.replace(seq_match, "-", "")
125 File_OUT2.write("%s\n" %seq_match_ungapped)
126
127 File_OUT.close()
128 File_OUT2.close()