comparison scripts/S08_script_extract_match_v20_blastx.py @ 1:c8af52875b0f draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit ab76075e541dd7ece1090f6b55ca508ec0fde39d
author lecorguille
date Thu, 13 Apr 2017 09:46:45 -0400
parents
children 6709645eff5d
comparison
equal deleted inserted replaced
0:e95d4b20c62d 1:c8af52875b0f
1 #!/usr/bin/env python
2 ## AUTHOR: Eric Fontanillas
3 ## LAST VERSION: 14/08/14 by Julie BAFFARD
4
5 ### TBLASTX formatting
6
7 ### MATCH = Only the first match keeped
8 MATCH = 0 # Only 1rst match Wanted
9 #MATCH = 1 # All match want
10 ### SUBMATCH = several part of a same sequence match with the query
11 SUBMATCH = 0 # SUBMATCH NOT WANTED (ONLY 1rst HIT)
12 #SUBMATCH =1 # SUBMATCH WANTED
13
14
15 ### NAME FORMATTING:
16 # [A] FORMAT QUERY NAME 1st STEP [IN DEF1]
17
18 # [B] FORMAT MATCH NAME 1st STEP [IN DEF2.1]
19
20 # [C] FORMAT MATCH NAME 2nd STEP [MIDDLE of DEF 2.3]
21 # [D] FORMAT QUERY NAME 2nd STEP [END of DEF 2.3]
22 # [E] FORMAT MATCH NAME 3rd STEP [END of DEF 2.3]
23
24 ### SPECIFICITY TBLASTX (/BLASTN) formatting:
25 ## 1/ "TBLASTX" formatting => At start of "RUN RUN RUN" change the keyword
26 ## 2/ change line "if keyword in nextline:" in function "split_file"
27 ## 3/ change "Strand" by "Frame" in function "get_information_on_matches"
28
29
30
31 ########################################
32 ### DEF 1. Split each "BLASTN" event ###
33 ########################################
34 def split_file(path_in, keyword):
35
36 file_in = open(path_in, "r")
37
38 RUN = ''
39 BASH1={}
40
41 while 1:
42 nextline = file_in.readline()
43
44 ##################################
45 ### [A] FORMATTING QUERY NAME ###
46
47 # Get query name
48 if nextline[0:6]=='Query=':
49 L1 = string.split(nextline, "||")
50 L2 = string.split(L1[0], " ")
51 query = L2[1]
52 if query[-1] == "\n":
53 query = query[:-1]
54
55 ### [A] END FORMATTING QUERY NAME ###
56 ######################################
57
58
59 ### split the file with keyword ###
60 if keyword in nextline:
61 # Two cases here:
62 #1# If it is the first "RUN" in the block (i.e. the first occurence of "BLASTN" in the file), we have just to add the new lines in the "RUN" list ... 2nd , we have also to detect the 'key' of bash1, which is the "query" name ... and third we will have to save this "RUN" in the bash1, once we will have detected a new "RUN" (i.e. a new line beginning with "BLASTN".
63 #2# If it isn't the first run, we have the save the previous "RUN" in the "bash1", before to re-initialize the RUN list (RUN =[]), before to append lines to the new "RUN"
64
65 if RUN == '': # case #1#
66 RUN = RUN + nextline # we just added the first line of the file
67
68 else: # case #2# (there was a run before)
69 BASH1[query] = RUN # add the previous run to the bash
70 RUN = '' # re-initialize the "RUN"
71 RUN = RUN + nextline # add the line starting with the keyword ("BLASTN") (except the first line of the file (the first "RUN")
72
73 else: # Treatment of the subsequent lines of the one starting with the keyword ("BLASTN") (which is not treated here but previously)
74 RUN = RUN + nextline
75
76
77 if not nextline: # when no more line, we should record the last "RUN" in the bash1
78 BASH1[query] = RUN # add the last "RUN"
79 break
80
81
82 file_in.close()
83 return(BASH1)
84 #########################################################
85
86
87 ###############################################
88 ### DEF2 : Parse blast output for each query###
89 ###############################################
90
91 ### 2.1. detect matches (i.e. 'Sequences producing significant alignments:' ###
92 def detect_Matches(query, MATCH, WORK_DIR):
93 F5 = open("%s/blastRun2.tmp" %WORK_DIR, 'w')
94 F5.write(bash1[query])
95 F5.close()
96
97 F6 = open("%s/blastRun2.tmp" %WORK_DIR, 'r')
98 list1 =[]
99 list2 =[]
100
101 while 1:
102 nexteu = F6.readline()
103
104 if not nexteu : break
105
106 if "***** No hits found ******" in nexteu :
107 hit = 0
108 break
109
110 if 'Sequences producing significant alignments:' in nexteu:
111 hit = 1
112 F6.readline() # jump a line
113
114 while 1:
115 nexteu2 = F6.readline()
116
117 if nexteu2[0]==">": break
118
119 ######################################
120 ### [B] FORMAT MATCH NAME 1st STEP ###
121
122 if nexteu2 != '\n':
123 LL1 = string.split(nexteu2, " ") # specific NORTH database names !!!!!!!
124 match = LL1[0] #### SOUTH databank // NORTH will have "|" separators
125 list1.append(match)
126
127 match2 = ">" + LL1[0] # more complete name // still specific NORTH database names !!!!!!!
128 list2.append(match2) #### SOUTH databank // NORTH will have "|" separators
129
130 if MATCH == 0: ## Only read the 1rst line (i.e. the First Match)
131 break
132 else: ## Read the other lines (i.e. All the Matches)
133 continue
134
135 ### [B] END FORMAT MATCH NAME 1st STEP ###
136 ##########################################
137
138 break
139
140 F6.close()
141 return(list1, list2, hit) # list1 = short name // list2 = more complete name
142 ######################################
143
144
145 #########################################
146 ### DEF3 : Get Information on matches ###
147 #########################################
148 ### Function used in the next function (2.3.)
149 def get_information_on_matches(list_of_line):
150
151 for line in list_of_line:
152
153 ## Score and Expect
154 if "Score" in line:
155 #print line
156 line = line[:-1] # remove "\n"
157 S_line = string.split(line, " = ")
158 Expect = S_line[-1] ## ***** Expect
159 S_line2 = string.split(S_line[1], " bits ")
160 Score = string.atof(S_line2[0])
161
162
163 ## Identities/gaps/percent/divergence/length_matched
164 elif "Identities" in line:
165 line = line[:-1] # remove "\n"
166
167 g = 0
168 if "Gaps" in line:
169 #print "HIT!!!"
170 pre_S_line = string.split(line, ",")
171 identity_line = pre_S_line[0]
172 gaps_line = pre_S_line[1]
173 g = 1
174 else:
175 identity_line = line
176 g = 0
177
178 ## treat identity line
179 S_line = string.split(identity_line, " ")
180
181 identities = S_line[-2] ## ***** identities
182 #print "\t\tIdentities = %s" %identities
183
184 S_line2 = string.split(identities, "/")
185 hits = string.atof(S_line2[0]) ## ***** hits
186 length_matched = string.atof(S_line2[1]) ## ***** length_matched
187 abs_nb_differences = length_matched - hits ## ***** abs_nb_differences
188
189
190 ## identity_percent = S_line[-1]
191 identity_percent = hits/length_matched * 100 ## ***** identity_percent
192 #print "\t\tIdentity (percent) = %.2f" %identity_percent
193
194 divergence_percent = abs_nb_differences/length_matched*100 ## ***** divergence_percent
195 #print "\t\tDivergence (percent) = %.2f" %divergence_percent
196
197 ## treat gap line if any
198 if g ==1: # means there are gaps
199 S_line3 = string.split(gaps_line, " ")
200 gaps_part = S_line3[-2]
201 S_line4 = string.split(gaps_part, "/")
202 gaps_number = string.atoi(S_line4[0]) ## ***** gaps_number
203 #print "\t\tGaps number = %s" %gaps_number
204
205 real_differences = abs_nb_differences - gaps_number ## ***** real_differences
206 real_divergence_percent = (real_differences/length_matched)*100 ## ***** real_divergence_percent
207 #print "\t\tReal divergence (percent)= %.2f" %real_divergence_percent
208 else:
209 gaps_number = 0
210 #print "\t\tGaps number = %s" %gaps_number
211 real_differences = 0
212 real_divergence_percent = divergence_percent
213
214 ## Strand
215 #elif "Strand" in line:
216 # line = line[:-1] # remove "\n"
217 # S_line = string.split(line, " = ")
218 # strand = S_line[1]
219 # print "\t\tStrand = %s" %strand
220
221 ## Frame
222 elif "Frame" in line:
223 line = line[:-1] # remove "\n"
224 S_line = string.split(line, " = ")
225 frame = S_line[1]
226 #print "\t\tFrame = %s" %frame
227
228
229
230 list_informations=[length_matched, Expect, Score, identities, hits, identity_percent, divergence_percent,gaps_number, real_divergence_percent, frame, length_matched]
231
232 return(list_informations)
233 ##################################
234
235
236 ############################
237 ### DEF4 : get sequences ###
238 ############################
239 ### [+ get informations from the DEF3.]
240 def get_sequences(query, list2, SUBMATCHEU, WORK_DIR):
241 list_Pairwise = []
242
243 F7 = open("%s/blastRun3.tmp" %WORK_DIR, 'w')
244 F7.write(bash1[query]) # bash1[query] ==> blast output for each query
245 F7.close()
246 F8 = open("%s/blastRun3.tmp" %WORK_DIR, 'r')
247
248 text1 = F8.readlines()
249
250 miniList = []
251 for name in list2: # "list2" contains name of matched sequences (long version! the list1 is the same list but for short version names). It was previously generated by "detect_Matches" function
252
253 l = -1
254 for n in text1:
255 l = l+1
256 if name in n:
257 i = l
258 miniList.append(i) # content positions in the list "text1", of all begining of match (e.g. >gnl|UG|Apo#S51012099 [...])
259
260 miniList.reverse()
261
262 if miniList != []:
263 length = len(miniList)
264
265 ii = 0
266 Listing1 = []
267 while ii < length:
268 iii = miniList[ii]
269 entry = text1[iii:]
270 text1 = text1[:iii]
271 Listing1.append(entry) # each "entry" = list of thing beginning by ">"
272 ii = ii+1 # Listing1 is a table of table!!
273
274 Listing1.append(text1) # "text1" = the first lines (begin with "BLASTN 2.2.1 ...]"
275 Listing1.reverse()
276
277 Listing2 = Listing1[1:] # remove the first thing ("BLASTN ...") and keep only table beginning with a line with ">"
278 SEK = len(Listing2)
279
280 NB_SEK = 0
281
282 for e1 in Listing2: # "Listing2" contents all the entries begining with ">"
283 NB_SEK = NB_SEK + 1
284 list51 = []
285
286 l = -1
287 for line in e1:
288 l = l+1
289 if "Score =" in line:
290 index = l
291 list51.append(l) # index of the lines with score
292
293 list51.reverse()
294
295 Listing3 = []
296
297 for i5 in list51:
298 e2 = e1[i5:]
299 Listing3.append(e2)
300 e1 = e1[:i5]
301
302
303 ######################################
304 ### [C] FORMAT MATCH NAME 2nd STEP ###
305
306 BigFastaName = e1 ### LIST OF LINES <=> What is remaining after removing all the hit with "Score =", so all the text comprise between ">" and the first "Score =" ==> Include Match name & "Length & empty lines
307
308 SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME
309
310 SmallFastaName = SmallFastaName[1:-2] ### remove ">" and "\n"
311
312
313 S1 = string.split(SmallFastaName, "||")
314 S2 = string.split(S1[0], " ")
315
316
317 PutInFastaName1 = S2[0]
318
319 ### [C] END FORMAT MATCH NAME 2nd STEP ###
320 ##########################################
321
322 SUBSEK = len(Listing3)
323 NB_SUBSEK = 0
324 list_inBatch = []
325 ### IF NO SUBMATCH WANTED !!!! => ONLY KEEP THE FIRST HIT OF "LISTING3":
326 if SUBMATCHEU == 0: # NO SUBMATCH WANTED !!!!
327 Listing4 = []
328 Listing4.append(Listing3[-1]) # Remove this line if submatch wanted!!!
329 elif SUBMATCHEU == 1:
330 Listing4 = Listing3
331
332
333 for l in Listing4: ## "listing3" contents
334
335 NB_SUBSEK = NB_SUBSEK+1
336
337 ll1 = string.replace(l[0], " ", "")
338 ll2 = string.replace(l[1], " ", "")
339 ll3 = string.replace(l[2], " ", "")
340 PutInFastaName2 = ll1[:-1] + "||" + ll2[:-1] + "||" + ll3[:-1] # match information
341
342 seq_query = ""
343 pos_query = []
344 seq_match = ""
345 pos_match = []
346 for line in l:
347 if "Query:" in line:
348 line = string.replace(line, " ", " ") # remove multiple spaces in line
349 line = string.replace(line, " ", " ")
350 line = string.replace(line, " ", " ")
351
352 lll1 = string.split(line, " ") # split the line, 0: "Query=", 1:start, 2:seq, 3:end
353
354 pos1 = lll1[1]
355 pos1 = string.atoi(pos1)
356 pos_query.append(pos1)
357
358 pos2 = lll1[3][:-1]
359 pos2 = string.atoi(pos2)
360 pos_query.append(pos2)
361
362 seq = lll1[2]
363 seq_query = seq_query + seq
364
365
366 if "Sbjct:" in line:
367 line = string.replace(line, " ", " ") # remove multiple spaces in line
368 line = string.replace(line, " ", " ")
369 line = string.replace(line, " ", " ")
370
371 lll2 = string.split(line, " ") # split the line, 0: "Query=", 1:start, 2:seq, 3:end
372
373 pos1 = lll2[1]
374 pos1 = string.atoi(pos1)
375 pos_match.append(pos1)
376
377 pos2 = lll2[3][:-1]
378 pos2 = string.atoi(pos2)
379 pos_match.append(pos2)
380
381 seq = lll2[2]
382 seq_match = seq_match + seq
383
384 ## Get the query and matched sequences and the corresponding positions
385
386 pos_query.sort() # rank small to big
387 pos_query_start = pos_query[0] # get the smaller
388 pos_query_end = pos_query[-1] # get the bigger
389 PutInFastaName3 = "%d...%d" %(pos_query_start, pos_query_end)
390
391
392 ######################################
393 ### [D] FORMAT QUERY NAME 2nd STEP ###
394
395 FINAL_fasta_Name_Query = ">" + query + "||"+ PutInFastaName3 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK)
396
397 ### [D] END FORMAT QUERY NAME 2nd STEP ###
398 ##########################################
399
400
401 pos_match.sort()
402 pos_match_start = pos_match[0]
403 pos_match_end = pos_match[-1]
404 PutInFastaName4 = "%d...%d" %(pos_match_start, pos_match_end)
405
406
407 ######################################
408 ### [E] FORMAT MATCH NAME 3rd STEP ###
409
410 FINAL_fasta_Name_Match = ">" + PutInFastaName1 + "||" + PutInFastaName4 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK)
411
412 ### [E] END FORMAT MATCH NAME 3rd STEP ###
413 ##########################################
414
415
416
417 Pairwise = [FINAL_fasta_Name_Query , seq_query , FINAL_fasta_Name_Match , seq_match] # list with 4 members
418 list_Pairwise.append(Pairwise)
419
420 ### Get informations about matches
421 list_info = get_information_on_matches(l) ### DEF3 ###
422
423
424 F8.close()
425 return(list_Pairwise, list_info)
426 #########################################
427
428
429 ###################
430 ### RUN RUN RUN ###
431 ###################
432 import string, os, time, re, sys
433
434 ## 1 ## INPUT/OUTPUT
435 SHORT_FILE = sys.argv[1] ## short-name-query_short-name-db
436
437 path_in = "%s/11_outputBlast_%s.txt" %(SHORT_FILE, SHORT_FILE)
438 file_out = open("%s/13_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE),"w")
439
440 ## 2 ## RUN
441 ## create Bash1 ##
442 bash1 = split_file(path_in, "TBLASTX") ### DEF1 ###
443
444 ## detect and save match ##
445 list_hits =[]
446 list_no_hits = []
447 j= 0
448 k = 0
449 lene = len(bash1.keys())
450 for query in bash1.keys():
451 j = j+1
452
453 ## 2.1. detect matches ##
454 list_match, list_match2, hit=detect_Matches(query, MATCH, SHORT_FILE) ### DEF2 ###
455
456 if hit == 1: # match(es)
457 list_hits.append(query)
458 if hit == 0: # no match for that sequence
459 list_no_hits.append(query)
460
461 ## 2.2. get sequences ##
462 if hit ==1:
463 list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, SHORT_FILE) ### DEF4 ###
464
465 # divergencve
466 divergence = list_info[6]
467 # gap number
468 gap_number = list_info[7]
469 # real divergence (divergence without accounting INDELs)
470 real_divergence = list_info[8]
471 # length matched
472 length_matched = list_info[10]
473
474 ### WRITE PAIRWISE ALIGNMENT IN OUTPUT FILES
475 for pairwise in list_pairwiseMatch:
476 k = k+1
477
478 query_name = pairwise[0]
479 query_seq = pairwise[1]
480 match_name = pairwise[2]
481 match_seq = pairwise[3]
482
483 len_query_seq = len(query_seq)
484
485
486 # If NO CONTROL FOR LENGTH, USE THE FOLLOWING LINES INSTEAD:
487
488 file_out.write("%s||%s||%s||%s||%s" %(query_name,divergence,gap_number,real_divergence,length_matched))
489 file_out.write("\n")
490 file_out.write("%s" %query_seq)
491 file_out.write("\n")
492
493 file_out.write("%s||%s||%s||%s||%s" %(match_name,divergence,gap_number,real_divergence,length_matched))
494 file_out.write("\n")
495 file_out.write("%s" %match_seq)
496 file_out.write("\n")
497
498
499 file_out.close()