comparison scripts/S08_script_extract_match_v20_blastx.py @ 4:6709645eff5d draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
author abims-sbr
date Wed, 17 Jan 2018 08:53:53 -0500
parents c8af52875b0f
children
comparison
equal deleted inserted replaced
3:5f68b2fc02c1 4:6709645eff5d
4 4
5 ### TBLASTX formatting 5 ### TBLASTX formatting
6 6
7 ### MATCH = Only the first match keeped 7 ### MATCH = Only the first match keeped
8 MATCH = 0 # Only 1rst match Wanted 8 MATCH = 0 # Only 1rst match Wanted
9 #MATCH = 1 # All match want 9 #MATCH = 1 # All match wanted
10
10 ### SUBMATCH = several part of a same sequence match with the query 11 ### SUBMATCH = several part of a same sequence match with the query
11 SUBMATCH = 0 # SUBMATCH NOT WANTED (ONLY 1rst HIT) 12 SUBMATCH = 0 # SUBMATCH NOT WANTED (ONLY 1rst HIT)
12 #SUBMATCH =1 # SUBMATCH WANTED 13 #SUBMATCH =1 # SUBMATCH WANTED
13 14
14 15
23 24
24 ### SPECIFICITY TBLASTX (/BLASTN) formatting: 25 ### SPECIFICITY TBLASTX (/BLASTN) formatting:
25 ## 1/ "TBLASTX" formatting => At start of "RUN RUN RUN" change the keyword 26 ## 1/ "TBLASTX" formatting => At start of "RUN RUN RUN" change the keyword
26 ## 2/ change line "if keyword in nextline:" in function "split_file" 27 ## 2/ change line "if keyword in nextline:" in function "split_file"
27 ## 3/ change "Strand" by "Frame" in function "get_information_on_matches" 28 ## 3/ change "Strand" by "Frame" in function "get_information_on_matches"
28
29
30
31 ########################################
32 ### DEF 1. Split each "BLASTN" event ###
33 ########################################
34 def split_file(path_in, keyword):
35
36 file_in = open(path_in, "r")
37
38 RUN = ''
39 BASH1={}
40
41 while 1:
42 nextline = file_in.readline()
43
44 ##################################
45 ### [A] FORMATTING QUERY NAME ###
46
47 # Get query name
48 if nextline[0:6]=='Query=':
49 L1 = string.split(nextline, "||")
50 L2 = string.split(L1[0], " ")
51 query = L2[1]
52 if query[-1] == "\n":
53 query = query[:-1]
54
55 ### [A] END FORMATTING QUERY NAME ###
56 ######################################
57
58
59 ### split the file with keyword ###
60 if keyword in nextline:
61 # Two cases here:
62 #1# If it is the first "RUN" in the block (i.e. the first occurence of "BLASTN" in the file), we have just to add the new lines in the "RUN" list ... 2nd , we have also to detect the 'key' of bash1, which is the "query" name ... and third we will have to save this "RUN" in the bash1, once we will have detected a new "RUN" (i.e. a new line beginning with "BLASTN".
63 #2# If it isn't the first run, we have the save the previous "RUN" in the "bash1", before to re-initialize the RUN list (RUN =[]), before to append lines to the new "RUN"
64
65 if RUN == '': # case #1#
66 RUN = RUN + nextline # we just added the first line of the file
67
68 else: # case #2# (there was a run before)
69 BASH1[query] = RUN # add the previous run to the bash
70 RUN = '' # re-initialize the "RUN"
71 RUN = RUN + nextline # add the line starting with the keyword ("BLASTN") (except the first line of the file (the first "RUN")
72
73 else: # Treatment of the subsequent lines of the one starting with the keyword ("BLASTN") (which is not treated here but previously)
74 RUN = RUN + nextline
75
76
77 if not nextline: # when no more line, we should record the last "RUN" in the bash1
78 BASH1[query] = RUN # add the last "RUN"
79 break
80
81
82 file_in.close()
83 return(BASH1)
84 #########################################################
85
86
87 ###############################################
88 ### DEF2 : Parse blast output for each query###
89 ###############################################
90
91 ### 2.1. detect matches (i.e. 'Sequences producing significant alignments:' ###
92 def detect_Matches(query, MATCH, WORK_DIR):
93 F5 = open("%s/blastRun2.tmp" %WORK_DIR, 'w')
94 F5.write(bash1[query])
95 F5.close()
96
97 F6 = open("%s/blastRun2.tmp" %WORK_DIR, 'r')
98 list1 =[]
99 list2 =[]
100
101 while 1:
102 nexteu = F6.readline()
103
104 if not nexteu : break
105
106 if "***** No hits found ******" in nexteu :
107 hit = 0
108 break
109
110 if 'Sequences producing significant alignments:' in nexteu:
111 hit = 1
112 F6.readline() # jump a line
113
114 while 1:
115 nexteu2 = F6.readline()
116
117 if nexteu2[0]==">": break
118
119 ######################################
120 ### [B] FORMAT MATCH NAME 1st STEP ###
121
122 if nexteu2 != '\n':
123 LL1 = string.split(nexteu2, " ") # specific NORTH database names !!!!!!!
124 match = LL1[0] #### SOUTH databank // NORTH will have "|" separators
125 list1.append(match)
126
127 match2 = ">" + LL1[0] # more complete name // still specific NORTH database names !!!!!!!
128 list2.append(match2) #### SOUTH databank // NORTH will have "|" separators
129
130 if MATCH == 0: ## Only read the 1rst line (i.e. the First Match)
131 break
132 else: ## Read the other lines (i.e. All the Matches)
133 continue
134
135 ### [B] END FORMAT MATCH NAME 1st STEP ###
136 ##########################################
137
138 break
139
140 F6.close()
141 return(list1, list2, hit) # list1 = short name // list2 = more complete name
142 ######################################
143
144
145 #########################################
146 ### DEF3 : Get Information on matches ###
147 #########################################
148 ### Function used in the next function (2.3.)
149 def get_information_on_matches(list_of_line):
150
151 for line in list_of_line:
152
153 ## Score and Expect
154 if "Score" in line:
155 #print line
156 line = line[:-1] # remove "\n"
157 S_line = string.split(line, " = ")
158 Expect = S_line[-1] ## ***** Expect
159 S_line2 = string.split(S_line[1], " bits ")
160 Score = string.atof(S_line2[0])
161
162
163 ## Identities/gaps/percent/divergence/length_matched
164 elif "Identities" in line:
165 line = line[:-1] # remove "\n"
166
167 g = 0
168 if "Gaps" in line:
169 #print "HIT!!!"
170 pre_S_line = string.split(line, ",")
171 identity_line = pre_S_line[0]
172 gaps_line = pre_S_line[1]
173 g = 1
174 else:
175 identity_line = line
176 g = 0
177
178 ## treat identity line
179 S_line = string.split(identity_line, " ")
180
181 identities = S_line[-2] ## ***** identities
182 #print "\t\tIdentities = %s" %identities
183
184 S_line2 = string.split(identities, "/")
185 hits = string.atof(S_line2[0]) ## ***** hits
186 length_matched = string.atof(S_line2[1]) ## ***** length_matched
187 abs_nb_differences = length_matched - hits ## ***** abs_nb_differences
188
189
190 ## identity_percent = S_line[-1]
191 identity_percent = hits/length_matched * 100 ## ***** identity_percent
192 #print "\t\tIdentity (percent) = %.2f" %identity_percent
193
194 divergence_percent = abs_nb_differences/length_matched*100 ## ***** divergence_percent
195 #print "\t\tDivergence (percent) = %.2f" %divergence_percent
196
197 ## treat gap line if any
198 if g ==1: # means there are gaps
199 S_line3 = string.split(gaps_line, " ")
200 gaps_part = S_line3[-2]
201 S_line4 = string.split(gaps_part, "/")
202 gaps_number = string.atoi(S_line4[0]) ## ***** gaps_number
203 #print "\t\tGaps number = %s" %gaps_number
204
205 real_differences = abs_nb_differences - gaps_number ## ***** real_differences
206 real_divergence_percent = (real_differences/length_matched)*100 ## ***** real_divergence_percent
207 #print "\t\tReal divergence (percent)= %.2f" %real_divergence_percent
208 else:
209 gaps_number = 0
210 #print "\t\tGaps number = %s" %gaps_number
211 real_differences = 0
212 real_divergence_percent = divergence_percent
213
214 ## Strand
215 #elif "Strand" in line:
216 # line = line[:-1] # remove "\n"
217 # S_line = string.split(line, " = ")
218 # strand = S_line[1]
219 # print "\t\tStrand = %s" %strand
220
221 ## Frame
222 elif "Frame" in line:
223 line = line[:-1] # remove "\n"
224 S_line = string.split(line, " = ")
225 frame = S_line[1]
226 #print "\t\tFrame = %s" %frame
227
228
229
230 list_informations=[length_matched, Expect, Score, identities, hits, identity_percent, divergence_percent,gaps_number, real_divergence_percent, frame, length_matched]
231
232 return(list_informations)
233 ##################################
234 29
235 30
236 ############################ 31 ############################
237 ### DEF4 : get sequences ### 32 ### DEF4 : get sequences ###
238 ############################ 33 ############################
274 Listing1.append(text1) # "text1" = the first lines (begin with "BLASTN 2.2.1 ...]" 69 Listing1.append(text1) # "text1" = the first lines (begin with "BLASTN 2.2.1 ...]"
275 Listing1.reverse() 70 Listing1.reverse()
276 71
277 Listing2 = Listing1[1:] # remove the first thing ("BLASTN ...") and keep only table beginning with a line with ">" 72 Listing2 = Listing1[1:] # remove the first thing ("BLASTN ...") and keep only table beginning with a line with ">"
278 SEK = len(Listing2) 73 SEK = len(Listing2)
279
280 NB_SEK = 0 74 NB_SEK = 0
281 75
282 for e1 in Listing2: # "Listing2" contents all the entries begining with ">" 76 for e1 in Listing2: # "Listing2" contents all the entries begining with ">"
283 NB_SEK = NB_SEK + 1 77 NB_SEK = NB_SEK + 1
284 list51 = [] 78 list51 = []
288 l = l+1 82 l = l+1
289 if "Score =" in line: 83 if "Score =" in line:
290 index = l 84 index = l
291 list51.append(l) # index of the lines with score 85 list51.append(l) # index of the lines with score
292 86
293 list51.reverse() 87 list51.reverse()
294
295 Listing3 = [] 88 Listing3 = []
296 89
297 for i5 in list51: 90 for i5 in list51:
298 e2 = e1[i5:] 91 e2 = e1[i5:]
299 Listing3.append(e2) 92 Listing3.append(e2)
307 100
308 SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME 101 SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME
309 102
310 SmallFastaName = SmallFastaName[1:-2] ### remove ">" and "\n" 103 SmallFastaName = SmallFastaName[1:-2] ### remove ">" and "\n"
311 104
312 105 """
106 3 lines below : only difference with S05
107 """
313 S1 = string.split(SmallFastaName, "||") 108 S1 = string.split(SmallFastaName, "||")
314 S2 = string.split(S1[0], " ") 109 S2 = string.split(S1[0], " ")
315
316
317 PutInFastaName1 = S2[0] 110 PutInFastaName1 = S2[0]
318 111
319 ### [C] END FORMAT MATCH NAME 2nd STEP ### 112 ### [C] END FORMAT MATCH NAME 2nd STEP ###
320 ########################################## 113 ##########################################
321 114
322 SUBSEK = len(Listing3) 115 SUBSEK = len(Listing3)
323 NB_SUBSEK = 0 116 NB_SUBSEK = 0
324 list_inBatch = [] 117 list_inBatch = []
118
325 ### IF NO SUBMATCH WANTED !!!! => ONLY KEEP THE FIRST HIT OF "LISTING3": 119 ### IF NO SUBMATCH WANTED !!!! => ONLY KEEP THE FIRST HIT OF "LISTING3":
326 if SUBMATCHEU == 0: # NO SUBMATCH WANTED !!!! 120 if SUBMATCHEU == 0: # NO SUBMATCH WANTED !!!!
327 Listing4 = [] 121 Listing4 = []
328 Listing4.append(Listing3[-1]) # Remove this line if submatch wanted!!! 122 Listing4.append(Listing3[-1]) # Remove this line if submatch wanted!!!
329 elif SUBMATCHEU == 1: 123 elif SUBMATCHEU == 1:
330 Listing4 = Listing3 124 Listing4 = Listing3
331 125
332 126 for l in Listing4: ## "listing3" contents
333 for l in Listing4: ## "listing3" contents
334
335 NB_SUBSEK = NB_SUBSEK+1 127 NB_SUBSEK = NB_SUBSEK+1
336 128
337 ll1 = string.replace(l[0], " ", "") 129 ll1 = string.replace(l[0], " ", "")
338 ll2 = string.replace(l[1], " ", "") 130 ll2 = string.replace(l[1], " ", "")
339 ll3 = string.replace(l[2], " ", "") 131 ll3 = string.replace(l[2], " ", "")
341 133
342 seq_query = "" 134 seq_query = ""
343 pos_query = [] 135 pos_query = []
344 seq_match = "" 136 seq_match = ""
345 pos_match = [] 137 pos_match = []
138
346 for line in l: 139 for line in l:
347 if "Query:" in line: 140 if "Query:" in line:
348 line = string.replace(line, " ", " ") # remove multiple spaces in line 141 line = string.replace(line, " ", " ") # remove multiple spaces in line
349 line = string.replace(line, " ", " ") 142 line = string.replace(line, " ", " ")
350 line = string.replace(line, " ", " ") 143 line = string.replace(line, " ", " ")
358 pos2 = lll1[3][:-1] 151 pos2 = lll1[3][:-1]
359 pos2 = string.atoi(pos2) 152 pos2 = string.atoi(pos2)
360 pos_query.append(pos2) 153 pos_query.append(pos2)
361 154
362 seq = lll1[2] 155 seq = lll1[2]
363 seq_query = seq_query + seq 156 seq_query = seq_query + seq
364
365 157
366 if "Sbjct:" in line: 158 if "Sbjct:" in line:
367 line = string.replace(line, " ", " ") # remove multiple spaces in line 159 line = string.replace(line, " ", " ") # remove multiple spaces in line
368 line = string.replace(line, " ", " ") 160 line = string.replace(line, " ", " ")
369 line = string.replace(line, " ", " ") 161 line = string.replace(line, " ", " ")
379 pos_match.append(pos2) 171 pos_match.append(pos2)
380 172
381 seq = lll2[2] 173 seq = lll2[2]
382 seq_match = seq_match + seq 174 seq_match = seq_match + seq
383 175
384 ## Get the query and matched sequences and the corresponding positions 176 ## Get the query and matched sequences and the corresponding positions
385
386 pos_query.sort() # rank small to big 177 pos_query.sort() # rank small to big
387 pos_query_start = pos_query[0] # get the smaller 178 pos_query_start = pos_query[0] # get the smaller
388 pos_query_end = pos_query[-1] # get the bigger 179 pos_query_end = pos_query[-1] # get the bigger
389 PutInFastaName3 = "%d...%d" %(pos_query_start, pos_query_end) 180 PutInFastaName3 = "%d...%d" %(pos_query_start, pos_query_end)
390 181
391
392 ###################################### 182 ######################################
393 ### [D] FORMAT QUERY NAME 2nd STEP ### 183 ### [D] FORMAT QUERY NAME 2nd STEP ###
394 184
395 FINAL_fasta_Name_Query = ">" + query + "||"+ PutInFastaName3 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK) 185 FINAL_fasta_Name_Query = ">" + query + "||"+ PutInFastaName3 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK)
396 186
397 ### [D] END FORMAT QUERY NAME 2nd STEP ### 187 ### [D] END FORMAT QUERY NAME 2nd STEP ###
398 ########################################## 188 ##########################################
399
400 189
401 pos_match.sort() 190 pos_match.sort()
402 pos_match_start = pos_match[0] 191 pos_match_start = pos_match[0]
403 pos_match_end = pos_match[-1] 192 pos_match_end = pos_match[-1]
404 PutInFastaName4 = "%d...%d" %(pos_match_start, pos_match_end) 193 PutInFastaName4 = "%d...%d" %(pos_match_start, pos_match_end)
405 194
406
407 ###################################### 195 ######################################
408 ### [E] FORMAT MATCH NAME 3rd STEP ### 196 ### [E] FORMAT MATCH NAME 3rd STEP ###
409 197
410 FINAL_fasta_Name_Match = ">" + PutInFastaName1 + "||" + PutInFastaName4 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK) 198 FINAL_fasta_Name_Match = ">" + PutInFastaName1 + "||" + PutInFastaName4 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK)
411 199
412 ### [E] END FORMAT MATCH NAME 3rd STEP ### 200 ### [E] END FORMAT MATCH NAME 3rd STEP ###
413 ########################################## 201 ##########################################
414 202
415
416
417 Pairwise = [FINAL_fasta_Name_Query , seq_query , FINAL_fasta_Name_Match , seq_match] # list with 4 members 203 Pairwise = [FINAL_fasta_Name_Query , seq_query , FINAL_fasta_Name_Match , seq_match] # list with 4 members
418 list_Pairwise.append(Pairwise) 204 list_Pairwise.append(Pairwise)
419 205
420 ### Get informations about matches 206 ### Get informations about matches
421 list_info = get_information_on_matches(l) ### DEF3 ### 207 list_info = get_information_on_matches(l) ### DEF3 ###
422
423 208
424 F8.close() 209 F8.close()
425 return(list_Pairwise, list_info) 210 return(list_Pairwise, list_info)
426 ######################################### 211 #########################################
427 212
428 213
429 ################### 214 ###################
430 ### RUN RUN RUN ### 215 ### RUN RUN RUN ###
431 ################### 216 ###################
432 import string, os, time, re, sys 217 import string, os, time, re, sys
218 from functions import split_file, detect_Matches, get_information_on_matches
433 219
434 ## 1 ## INPUT/OUTPUT 220 ## 1 ## INPUT/OUTPUT
435 SHORT_FILE = sys.argv[1] ## short-name-query_short-name-db 221 SHORT_FILE = sys.argv[1] ## short-name-query_short-name-db
436 222
223 """
224 04 and 06 for S05, 11 and 13 for S08
225 """
437 path_in = "%s/11_outputBlast_%s.txt" %(SHORT_FILE, SHORT_FILE) 226 path_in = "%s/11_outputBlast_%s.txt" %(SHORT_FILE, SHORT_FILE)
438 file_out = open("%s/13_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE),"w") 227 file_out = open("%s/13_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE),"w")
439 228
440 ## 2 ## RUN 229 ## 2 ## RUN
441 ## create Bash1 ## 230 ## create Bash1 ##
442 bash1 = split_file(path_in, "TBLASTX") ### DEF1 ### 231 bash1 = split_file(path_in, "TBLASTX") ### DEF1 ###
443 232
444 ## detect and save match ## 233 ## detect and save match ##
445 list_hits =[] 234 list_hits =[]
446 list_no_hits = [] 235 list_no_hits = []
447 j= 0 236 j= 0
449 lene = len(bash1.keys()) 238 lene = len(bash1.keys())
450 for query in bash1.keys(): 239 for query in bash1.keys():
451 j = j+1 240 j = j+1
452 241
453 ## 2.1. detect matches ## 242 ## 2.1. detect matches ##
454 list_match, list_match2, hit=detect_Matches(query, MATCH, SHORT_FILE) ### DEF2 ### 243 list_match, list_match2, hit=detect_Matches(query, MATCH, SHORT_FILE, bash1) ### DEF2 ###
455 244
456 if hit == 1: # match(es) 245 if hit == 1: # match(es)
457 list_hits.append(query) 246 list_hits.append(query)
458 if hit == 0: # no match for that sequence 247 if hit == 0: # no match for that sequence
459 list_no_hits.append(query) 248 list_no_hits.append(query)
460 249
461 ## 2.2. get sequences ## 250 ## 2.2. get sequences ##
462 if hit ==1: 251 if hit ==1:
463 list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, SHORT_FILE) ### DEF4 ### 252 list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, SHORT_FILE)
464 253
465 # divergencve 254 # divergencve
466 divergence = list_info[6] 255 divergence = list_info[6]
467 # gap number 256 # gap number
468 gap_number = list_info[7] 257 gap_number = list_info[7]
480 match_name = pairwise[2] 269 match_name = pairwise[2]
481 match_seq = pairwise[3] 270 match_seq = pairwise[3]
482 271
483 len_query_seq = len(query_seq) 272 len_query_seq = len(query_seq)
484 273
485
486 # If NO CONTROL FOR LENGTH, USE THE FOLLOWING LINES INSTEAD: 274 # If NO CONTROL FOR LENGTH, USE THE FOLLOWING LINES INSTEAD:
487 275
488 file_out.write("%s||%s||%s||%s||%s" %(query_name,divergence,gap_number,real_divergence,length_matched)) 276 file_out.write("%s||%s||%s||%s||%s" %(query_name,divergence,gap_number,real_divergence,length_matched))
489 file_out.write("\n") 277 file_out.write("\n")
490 file_out.write("%s" %query_seq) 278 file_out.write("%s" %query_seq)