comparison scripts/S05_script_extract_match_v20_blastx.py @ 1:c8af52875b0f draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit ab76075e541dd7ece1090f6b55ca508ec0fde39d
author lecorguille
date Thu, 13 Apr 2017 09:46:45 -0400
parents
children 6709645eff5d
comparison
equal deleted inserted replaced
0:e95d4b20c62d 1:c8af52875b0f
1 #!/usr/bin/env python
2 ## AUTHOR: Eric Fontanillas
3 ## LAST VERSION: 14/08/14 by Julie BAFFARD
4
5 ### TBLASTX formatting
6
7 ### MATCH = Only the first match keeped
8 MATCH = 0 # Only 1rst match Wanted
9 #MATCH = 1 # All match wanted
10
11 ### SUBMATCH = several part of a same sequence match with the query
12 SUBMATCH = 0 # SUBMATCH NOT WANTED (only the best hit)
13 #SUBMATCH =1 # SUBMATCH WANTED
14
15
16 ### NAME FORMATTING:
17 # [A] FORMAT QUERY NAME 1st STEP [IN DEF1]
18
19 # [B] FORMAT MATCH NAME 1st STEP [IN DEF2.1]
20
21 # [C] FORMAT MATCH NAME 2nd STEP [MIDDLE of DEF 2.3]
22 # [D] FORMAT QUERY NAME 2nd STEP [END of DEF 2.3]
23 # [E] FORMAT MATCH NAME 3rd STEP [END of DEF 2.3]
24
25 ### SPECIFICITY TBLASTX (/BLASTN) formatting:
26 ## 1/ "TBLASTX" formatting => At start of "RUN RUN RUN" change the keyword
27 ## 2/ change line "if keyword in nextline:" in function "split_file"
28 ## 3/ change "Strand" by "Frame" in function "get_information_on_matches"
29
30
31 ########################################
32 ### DEF 1. Split each "BLASTN" event ###
33 ########################################
34 def split_file(path_in, keyword):
35
36 file_in = open(path_in, "r")
37 RUN = ''
38 BASH1={}
39
40 while 1:
41 nextline = file_in.readline()
42
43 ##################################
44 ### [A] FORMATTING QUERY NAME ###
45
46 ### Get query name ###
47 if nextline[0:6]=='Query=':
48 L1 = string.split(nextline, "||")
49 L2 = string.split(L1[0], " ")
50 query = L2[1]
51 if query[-1] == "\n":
52 query = query[:-1]
53
54 ### [A] END FORMATTING QUERY NAME ###
55 ######################################
56
57
58 ### split the file with keyword ###
59 if keyword in nextline:
60
61 # Two cases here:
62 #1# If it is the first "RUN" in the block (i.e. the first occurence of "BLASTN" in the file), we have just to add the new lines in the "RUN" list ... 2nd , we have also to detect the 'key' of bash1, which is the "query" name ... and third we will have to save this "RUN" in the bash1, once we will have detected a new "RUN" (i.e. a new line beginning with "BLASTN".
63 #2# If it isn't the first run, we have the save the previous "RUN" in the "bash1", before to re-initialize the RUN list (RUN =[]), before to append lines to the new "RUN"
64
65 if RUN == '': # case #1#
66 RUN = RUN + nextline # we just added the first line of the file
67
68 else: # case #2# (there was a run before)
69 BASH1[query] = RUN # add the previous run to the bash
70 RUN = '' # re-initialize the "RUN"
71 RUN = RUN + nextline # add the line starting with the keyword ("BLASTN") (except the first line of the file (the first "RUN")
72
73 else: # Treatment of the subsequent lines of the one starting with the keyword ("BLASTN") (which is not treated here but previously)
74 RUN = RUN + nextline
75
76
77 if not nextline: # when no more line, we should record the last "RUN" in the bash1
78 BASH1[query] = RUN # add the last "RUN"
79 break
80
81 file_in.close()
82 return(BASH1)
83 #########################################################
84
85
86 ################################################
87 ### DEF2 : Parse blast output for each query ###
88 ################################################
89 ### detect matches (i.e. 'Sequences producing significant alignments:' ###
90
91 def detect_Matches(query, MATCH, WORK_DIR):
92 F5 = open("%s/blastRun2.tmp" %WORK_DIR, 'w')
93 F5.write(bash1[query])
94 F5.close()
95
96 F6 = open("%s/blastRun2.tmp" %WORK_DIR, 'r')
97 list1 =[]
98 list2 =[]
99
100 while 1:
101 nexteu = F6.readline()
102 if not nexteu : break
103
104 if "***** No hits found ******" in nexteu :
105 hit = 0
106 break
107
108 if 'Sequences producing significant alignments:' in nexteu:
109 hit = 1
110 F6.readline() # jump a line
111
112 while 1:
113 nexteu2 = F6.readline()
114 if nexteu2[0]==">": break
115
116 ######################################
117 ### [B] FORMAT MATCH NAME 1st STEP ###
118
119 if nexteu2 != '\n':
120 LL1 = string.split(nexteu2, " ") # specific NORTH database names !!!!!!!
121 match = LL1[0] #### SOUTH databank // NORTH will have "|" separators
122 list1.append(match)
123
124 match2 = ">" + LL1[0] # more complete name // still specific NORTH database names !!!!!!!
125 list2.append(match2) #### SOUTH databank // NORTH will have "|" separators
126
127 if MATCH == 0: ## Only read the 1rst line (i.e. the First Match)
128 break
129 else: ## Read the other lines (i.e. All the Matches)
130 continue
131
132 ### [B] END FORMAT MATCH NAME 1st STEP ###
133 ##########################################
134
135 break
136
137 F6.close()
138 return(list1, list2, hit) # list1 = short name // list2 = more complete name
139 #######################################
140
141
142 #########################################
143 ### DEF3 : Get Information on matches ###
144 #########################################
145 ### Function used in the next function (2.3.)
146
147 def get_information_on_matches(list_of_line):
148 for line in list_of_line:
149
150 ## Score and Expect
151 if "Score" in line:
152 line = line[:-1] # remove "\n"
153 S_line = string.split(line, " = ")
154 Expect = S_line[-1] ## ***** Expect
155 S_line2 = string.split(S_line[1], " bits ")
156 Score = string.atof(S_line2[0])
157
158 ## Identities/gaps/percent/divergence/length_matched
159 elif "Identities" in line:
160 line = line[:-1] # remove "\n"
161 g = 0
162
163 if "Gaps" in line:
164 pre_S_line = string.split(line, ",")
165 identity_line = pre_S_line[0]
166 gaps_line = pre_S_line[1]
167 g = 1
168 else:
169 identity_line = line
170 g = 0
171
172 ## treat identity line
173 S_line = string.split(identity_line, " ")
174
175 identities = S_line[-2] ## ***** identities
176
177 S_line2 = string.split(identities, "/")
178 hits = string.atof(S_line2[0]) ## ***** hits
179 length_matched = string.atof(S_line2[1]) ## ***** length_matched
180 abs_nb_differences = length_matched - hits ## ***** abs_nb_differences
181
182
183 identity_percent = hits/length_matched * 100 ## ***** identity_percent
184
185 divergence_percent = abs_nb_differences/length_matched*100 ## ***** divergence_percent
186
187 ## treat gap line if any
188 if g ==1: # means there are gaps
189 S_line3 = string.split(gaps_line, " ")
190 gaps_part = S_line3[-2]
191 S_line4 = string.split(gaps_part, "/")
192 gaps_number = string.atoi(S_line4[0]) ## ***** gaps_number
193
194 real_differences = abs_nb_differences - gaps_number ## ***** real_differences
195 real_divergence_percent = (real_differences/length_matched)*100 ## ***** real_divergence_percent
196 else:
197 gaps_number = 0
198 real_differences = 0
199 real_divergence_percent = divergence_percent
200
201 ## Frame
202 elif "Frame" in line:
203 line = line[:-1]
204 S_line = string.split(line, " = ")
205 frame = S_line[1]
206
207 list_informations=[length_matched, Expect, Score, identities, hits, identity_percent, divergence_percent,gaps_number, real_divergence_percent, frame, length_matched]
208
209 return(list_informations)
210 ########################################
211
212
213 ############################
214 ### DEF4 : get sequences ###
215 ############################
216 ### [+ get informations from the function 2.2.]
217
218 def get_sequences(query, list2, SUBMATCHEU,WORK_DIR):
219 list_Pairwise = []
220
221 F7 = open("%s/blastRun3.tmp" %WORK_DIR, 'w')
222 F7.write(bash1[query]) # bash1[query] ==> blast output for each query
223 F7.close()
224 F8 = open("%s/blastRun3.tmp" %WORK_DIR, 'r')
225
226 text1 = F8.readlines()
227
228 miniList = []
229 for name in list2: # "list2" contains name of matched sequences (long version! the list1 is the same list but for short version names). It was previously generated by "detect_Matches" function
230
231 l = -1
232 for n in text1:
233 l = l+1
234 if name in n:
235 i = l
236 miniList.append(i) # content positions in the list "text1", of all begining of match (e.g. >gnl|UG|Apo#S51012099 [...])
237
238 miniList.reverse()
239 if miniList != []:
240 length = len(miniList)
241 ii = 0
242
243 Listing1 = []
244 while ii < length:
245 iii = miniList[ii]
246 entry = text1[iii:]
247 text1 = text1[:iii]
248 Listing1.append(entry) # each "entry" = list of thing beginning by ">"
249 ii = ii+1 # Listing1 is a table of table!!
250
251 Listing1.append(text1) # "text1" = the first lines (begin with "BLASTN 2.2.1 ...]"
252 Listing1.reverse()
253
254 Listing2 = Listing1[1:] # remove the first thing ("BLASTN ...") and keep only table beginning with a line with ">"
255 SEK = len(Listing2)
256 NB_SEK = 0
257
258 for e1 in Listing2: # "Listing2" contents all the entries begining with ">"
259 NB_SEK = NB_SEK + 1
260 list51 = []
261
262 l = -1
263 for line in e1:
264 l = l+1
265 if "Score =" in line:
266 index = l
267 list51.append(l) # index of the lines with score
268
269 list51.reverse()
270 Listing3 = []
271
272 for i5 in list51:
273 e2 = e1[i5:]
274 Listing3.append(e2)
275 e1 = e1[:i5]
276
277 ######################################
278 ### [C] FORMAT MATCH NAME 2nd STEP ###
279
280 BigFastaName = e1 ### LIST OF LINES <=> What is remaining after removing all the hit with "Score =", so all the text comprise between ">" and the first "Score =" ==> Include Match name & "Length & empty lines
281
282 SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME
283 SmallFastaName = SmallFastaName[1:-1] ### remove ">" and "\n"
284
285 if SmallFastaName[-1] == " ":
286 SmallFastaName = SmallFastaName[:-1]
287
288 PutInFastaName1 = SmallFastaName
289
290 ### [C] END FORMAT MATCH NAME 2nd STEP ###
291 ##########################################
292
293 SUBSEK = len(Listing3)
294 NB_SUBSEK = 0
295 list_inBatch = []
296
297 ### IF NO SUBMATCH WANTED !!!! => ONLY KEEP THE FIRST HIT OF "LISTING3":
298 if SUBMATCHEU == 0: # NO SUBMATCH WANTED !!!!
299 Listing4 = []
300 Listing4.append(Listing3[-1]) # Remove this line if submatch wanted!!!
301 elif SUBMATCHEU == 1:
302 Listing4 = Listing3
303
304 for l in Listing4: ## "listing3" contents
305 NB_SUBSEK = NB_SUBSEK+1
306
307 ll1 = string.replace(l[0], " ", "")
308 ll2 = string.replace(l[1], " ", "")
309 ll3 = string.replace(l[2], " ", "")
310 PutInFastaName2 = ll1[:-1] + "||" + ll2[:-1] + "||" + ll3[:-1] # match information
311
312 seq_query = ""
313 pos_query = []
314 seq_match = ""
315 pos_match = []
316
317 for line in l:
318 if "Query:" in line:
319 line = string.replace(line, " ", " ") # remove multiple spaces in line
320 line = string.replace(line, " ", " ")
321 line = string.replace(line, " ", " ")
322
323 lll1 = string.split(line, " ") # split the line, 0: "Query=", 1:start, 2:seq, 3:end
324
325 pos1 = lll1[1]
326 pos1 = string.atoi(pos1)
327 pos_query.append(pos1)
328
329 pos2 = lll1[3][:-1]
330 pos2 = string.atoi(pos2)
331 pos_query.append(pos2)
332
333 seq = lll1[2]
334 seq_query = seq_query + seq
335
336 if "Sbjct:" in line:
337 line = string.replace(line, " ", " ") # remove multiple spaces in line
338 line = string.replace(line, " ", " ")
339 line = string.replace(line, " ", " ")
340
341 lll2 = string.split(line, " ") # split the line, 0: "Query=", 1:start, 2:seq, 3:end
342
343 pos1 = lll2[1]
344 pos1 = string.atoi(pos1)
345 pos_match.append(pos1)
346
347 pos2 = lll2[3][:-1]
348 pos2 = string.atoi(pos2)
349 pos_match.append(pos2)
350
351 seq = lll2[2]
352 seq_match = seq_match + seq
353
354 ## Get the query and matched sequences and the corresponding positions
355 pos_query.sort() # rank small to big
356 pos_query_start = pos_query[0] # get the smaller
357 pos_query_end = pos_query[-1] # get the bigger
358 PutInFastaName3 = "%d...%d" %(pos_query_start, pos_query_end)
359
360 ######################################
361 ### [D] FORMAT QUERY NAME 2nd STEP ###
362
363 FINAL_fasta_Name_Query = ">" + query + "||"+ PutInFastaName3 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK)
364
365 ### [D] END FORMAT QUERY NAME 2nd STEP ###
366 ##########################################
367
368 pos_match.sort()
369 pos_match_start = pos_match[0]
370 pos_match_end = pos_match[-1]
371 PutInFastaName4 = "%d...%d" %(pos_match_start, pos_match_end)
372
373 ######################################
374 ### [E] FORMAT MATCH NAME 3rd STEP ###
375
376 FINAL_fasta_Name_Match = ">" + PutInFastaName1 + "||" + PutInFastaName4 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK)
377
378 ### [E] END FORMAT MATCH NAME 3rd STEP ###
379 ##########################################
380
381 Pairwise = [FINAL_fasta_Name_Query , seq_query , FINAL_fasta_Name_Match , seq_match] # list with 4 members
382 list_Pairwise.append(Pairwise)
383
384 ### Get informations about matches
385 list_info = get_information_on_matches(l) ### DEF3 ###
386
387 F8.close()
388 return(list_Pairwise, list_info)
389 #########################################
390
391
392 ######################
393 ### 2. RUN RUN RUN ###
394 ######################
395 import string, os, time, re, sys
396
397 ## 1 ## INPUT/OUTPUT
398 SHORT_FILE = sys.argv[1] #short-name-query_short-name-db
399
400 path_in = "%s/04_outputBlast_%s.txt" %(SHORT_FILE, SHORT_FILE)
401 file_out = open("%s/06_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE),"w")
402
403 ## 2 ## RUN
404 ## create Bash1 ##
405 bash1 = split_file(path_in, "TBLASTX") ## DEF1 ##
406
407 ## detect and save match ##
408 list_hits =[]
409 list_no_hits = []
410 j= 0
411 k = 0
412 lene = len(bash1.keys())
413 for query in bash1.keys():
414 j = j+1
415
416 ## 2.1. detect matches ##
417 list_match, list_match2, hit=detect_Matches(query, MATCH, SHORT_FILE) ### DEF2 ###
418
419 if hit == 1: # match(es)
420 list_hits.append(query)
421 if hit == 0: # no match for that sequence
422 list_no_hits.append(query)
423
424 ## 2.2. get sequences ##
425 if hit ==1:
426 list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, SHORT_FILE) ### FUNCTION ###
427
428 # divergencve
429 divergence = list_info[6]
430 # gap number
431 gap_number = list_info[7]
432 # real divergence (divergence without accounting INDELs)
433 real_divergence = list_info[8]
434 # length matched
435 length_matched = list_info[10]
436
437 ### WRITE PAIRWISE ALIGNMENT IN OUTPUT FILES
438 for pairwise in list_pairwiseMatch:
439 k = k+1
440
441 query_name = pairwise[0]
442 query_seq = pairwise[1]
443 match_name = pairwise[2]
444 match_seq = pairwise[3]
445
446 len_query_seq = len(query_seq)
447
448 Lis1 = string.split(query_name, "||")
449 short_query_name = Lis1[0]
450 Lis2 = string.split(match_name, "||")
451 short_match_name = Lis2[0]
452
453 # If NO CONTROL FOR LENGTH, USE THE FOLLOWING LINES INSTEAD:
454
455 file_out.write("%s||%s||%s||%s||%s" %(query_name,divergence,gap_number,real_divergence,length_matched))
456 file_out.write("\n")
457 file_out.write("%s" %query_seq)
458 file_out.write("\n")
459
460 file_out.write("%s||%s||%s||%s||%s" %(match_name,divergence,gap_number,real_divergence,length_matched))
461 file_out.write("\n")
462 file_out.write("%s" %match_seq)
463 file_out.write("\n")
464
465 file_out.close()