comparison scripts/S05_script_extract_match_v20_blastx.py @ 4:6709645eff5d draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
author abims-sbr
date Wed, 17 Jan 2018 08:53:53 -0500
parents c8af52875b0f
children
comparison
equal deleted inserted replaced
3:5f68b2fc02c1 4:6709645eff5d
26 ## 1/ "TBLASTX" formatting => At start of "RUN RUN RUN" change the keyword 26 ## 1/ "TBLASTX" formatting => At start of "RUN RUN RUN" change the keyword
27 ## 2/ change line "if keyword in nextline:" in function "split_file" 27 ## 2/ change line "if keyword in nextline:" in function "split_file"
28 ## 3/ change "Strand" by "Frame" in function "get_information_on_matches" 28 ## 3/ change "Strand" by "Frame" in function "get_information_on_matches"
29 29
30 30
31 ########################################
32 ### DEF 1. Split each "BLASTN" event ###
33 ########################################
34 def split_file(path_in, keyword):
35
36 file_in = open(path_in, "r")
37 RUN = ''
38 BASH1={}
39
40 while 1:
41 nextline = file_in.readline()
42
43 ##################################
44 ### [A] FORMATTING QUERY NAME ###
45
46 ### Get query name ###
47 if nextline[0:6]=='Query=':
48 L1 = string.split(nextline, "||")
49 L2 = string.split(L1[0], " ")
50 query = L2[1]
51 if query[-1] == "\n":
52 query = query[:-1]
53
54 ### [A] END FORMATTING QUERY NAME ###
55 ######################################
56
57
58 ### split the file with keyword ###
59 if keyword in nextline:
60
61 # Two cases here:
62 #1# If it is the first "RUN" in the block (i.e. the first occurence of "BLASTN" in the file), we have just to add the new lines in the "RUN" list ... 2nd , we have also to detect the 'key' of bash1, which is the "query" name ... and third we will have to save this "RUN" in the bash1, once we will have detected a new "RUN" (i.e. a new line beginning with "BLASTN".
63 #2# If it isn't the first run, we have the save the previous "RUN" in the "bash1", before to re-initialize the RUN list (RUN =[]), before to append lines to the new "RUN"
64
65 if RUN == '': # case #1#
66 RUN = RUN + nextline # we just added the first line of the file
67
68 else: # case #2# (there was a run before)
69 BASH1[query] = RUN # add the previous run to the bash
70 RUN = '' # re-initialize the "RUN"
71 RUN = RUN + nextline # add the line starting with the keyword ("BLASTN") (except the first line of the file (the first "RUN")
72
73 else: # Treatment of the subsequent lines of the one starting with the keyword ("BLASTN") (which is not treated here but previously)
74 RUN = RUN + nextline
75
76
77 if not nextline: # when no more line, we should record the last "RUN" in the bash1
78 BASH1[query] = RUN # add the last "RUN"
79 break
80
81 file_in.close()
82 return(BASH1)
83 #########################################################
84
85
86 ################################################
87 ### DEF2 : Parse blast output for each query ###
88 ################################################
89 ### detect matches (i.e. 'Sequences producing significant alignments:' ###
90
91 def detect_Matches(query, MATCH, WORK_DIR):
92 F5 = open("%s/blastRun2.tmp" %WORK_DIR, 'w')
93 F5.write(bash1[query])
94 F5.close()
95
96 F6 = open("%s/blastRun2.tmp" %WORK_DIR, 'r')
97 list1 =[]
98 list2 =[]
99
100 while 1:
101 nexteu = F6.readline()
102 if not nexteu : break
103
104 if "***** No hits found ******" in nexteu :
105 hit = 0
106 break
107
108 if 'Sequences producing significant alignments:' in nexteu:
109 hit = 1
110 F6.readline() # jump a line
111
112 while 1:
113 nexteu2 = F6.readline()
114 if nexteu2[0]==">": break
115
116 ######################################
117 ### [B] FORMAT MATCH NAME 1st STEP ###
118
119 if nexteu2 != '\n':
120 LL1 = string.split(nexteu2, " ") # specific NORTH database names !!!!!!!
121 match = LL1[0] #### SOUTH databank // NORTH will have "|" separators
122 list1.append(match)
123
124 match2 = ">" + LL1[0] # more complete name // still specific NORTH database names !!!!!!!
125 list2.append(match2) #### SOUTH databank // NORTH will have "|" separators
126
127 if MATCH == 0: ## Only read the 1rst line (i.e. the First Match)
128 break
129 else: ## Read the other lines (i.e. All the Matches)
130 continue
131
132 ### [B] END FORMAT MATCH NAME 1st STEP ###
133 ##########################################
134
135 break
136
137 F6.close()
138 return(list1, list2, hit) # list1 = short name // list2 = more complete name
139 #######################################
140
141
142 #########################################
143 ### DEF3 : Get Information on matches ###
144 #########################################
145 ### Function used in the next function (2.3.)
146
147 def get_information_on_matches(list_of_line):
148 for line in list_of_line:
149
150 ## Score and Expect
151 if "Score" in line:
152 line = line[:-1] # remove "\n"
153 S_line = string.split(line, " = ")
154 Expect = S_line[-1] ## ***** Expect
155 S_line2 = string.split(S_line[1], " bits ")
156 Score = string.atof(S_line2[0])
157
158 ## Identities/gaps/percent/divergence/length_matched
159 elif "Identities" in line:
160 line = line[:-1] # remove "\n"
161 g = 0
162
163 if "Gaps" in line:
164 pre_S_line = string.split(line, ",")
165 identity_line = pre_S_line[0]
166 gaps_line = pre_S_line[1]
167 g = 1
168 else:
169 identity_line = line
170 g = 0
171
172 ## treat identity line
173 S_line = string.split(identity_line, " ")
174
175 identities = S_line[-2] ## ***** identities
176
177 S_line2 = string.split(identities, "/")
178 hits = string.atof(S_line2[0]) ## ***** hits
179 length_matched = string.atof(S_line2[1]) ## ***** length_matched
180 abs_nb_differences = length_matched - hits ## ***** abs_nb_differences
181
182
183 identity_percent = hits/length_matched * 100 ## ***** identity_percent
184
185 divergence_percent = abs_nb_differences/length_matched*100 ## ***** divergence_percent
186
187 ## treat gap line if any
188 if g ==1: # means there are gaps
189 S_line3 = string.split(gaps_line, " ")
190 gaps_part = S_line3[-2]
191 S_line4 = string.split(gaps_part, "/")
192 gaps_number = string.atoi(S_line4[0]) ## ***** gaps_number
193
194 real_differences = abs_nb_differences - gaps_number ## ***** real_differences
195 real_divergence_percent = (real_differences/length_matched)*100 ## ***** real_divergence_percent
196 else:
197 gaps_number = 0
198 real_differences = 0
199 real_divergence_percent = divergence_percent
200
201 ## Frame
202 elif "Frame" in line:
203 line = line[:-1]
204 S_line = string.split(line, " = ")
205 frame = S_line[1]
206
207 list_informations=[length_matched, Expect, Score, identities, hits, identity_percent, divergence_percent,gaps_number, real_divergence_percent, frame, length_matched]
208
209 return(list_informations)
210 ########################################
211
212
213 ############################ 31 ############################
214 ### DEF4 : get sequences ### 32 ### DEF4 : get sequences ###
215 ############################ 33 ############################
216 ### [+ get informations from the function 2.2.] 34 ### [+ get informations from the function 2.2.]
217
218 def get_sequences(query, list2, SUBMATCHEU,WORK_DIR): 35 def get_sequences(query, list2, SUBMATCHEU,WORK_DIR):
219 list_Pairwise = [] 36 list_Pairwise = []
220 37
221 F7 = open("%s/blastRun3.tmp" %WORK_DIR, 'w') 38 F7 = open("%s/blastRun3.tmp" %WORK_DIR, 'w')
222 F7.write(bash1[query]) # bash1[query] ==> blast output for each query 39 F7.write(bash1[query]) # bash1[query] ==> blast output for each query
233 l = l+1 50 l = l+1
234 if name in n: 51 if name in n:
235 i = l 52 i = l
236 miniList.append(i) # content positions in the list "text1", of all begining of match (e.g. >gnl|UG|Apo#S51012099 [...]) 53 miniList.append(i) # content positions in the list "text1", of all begining of match (e.g. >gnl|UG|Apo#S51012099 [...])
237 54
238 miniList.reverse() 55 miniList.reverse()
56
239 if miniList != []: 57 if miniList != []:
240 length = len(miniList) 58 length = len(miniList)
59
241 ii = 0 60 ii = 0
242
243 Listing1 = [] 61 Listing1 = []
244 while ii < length: 62 while ii < length:
245 iii = miniList[ii] 63 iii = miniList[ii]
246 entry = text1[iii:] 64 entry = text1[iii:]
247 text1 = text1[:iii] 65 text1 = text1[:iii]
277 ###################################### 95 ######################################
278 ### [C] FORMAT MATCH NAME 2nd STEP ### 96 ### [C] FORMAT MATCH NAME 2nd STEP ###
279 97
280 BigFastaName = e1 ### LIST OF LINES <=> What is remaining after removing all the hit with "Score =", so all the text comprise between ">" and the first "Score =" ==> Include Match name & "Length & empty lines 98 BigFastaName = e1 ### LIST OF LINES <=> What is remaining after removing all the hit with "Score =", so all the text comprise between ">" and the first "Score =" ==> Include Match name & "Length & empty lines
281 99
282 SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME 100 SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME
101
283 SmallFastaName = SmallFastaName[1:-1] ### remove ">" and "\n" 102 SmallFastaName = SmallFastaName[1:-1] ### remove ">" and "\n"
284 103
104 """
105 3 lines below : only difference with S08
106 """
285 if SmallFastaName[-1] == " ": 107 if SmallFastaName[-1] == " ":
286 SmallFastaName = SmallFastaName[:-1] 108 SmallFastaName = SmallFastaName[:-1]
287
288 PutInFastaName1 = SmallFastaName 109 PutInFastaName1 = SmallFastaName
289 110
290 ### [C] END FORMAT MATCH NAME 2nd STEP ### 111 ### [C] END FORMAT MATCH NAME 2nd STEP ###
291 ########################################## 112 ##########################################
292 113
387 F8.close() 208 F8.close()
388 return(list_Pairwise, list_info) 209 return(list_Pairwise, list_info)
389 ######################################### 210 #########################################
390 211
391 212
392 ###################### 213 ###################
393 ### 2. RUN RUN RUN ### 214 ### RUN RUN RUN ###
394 ###################### 215 ###################
395 import string, os, time, re, sys 216 import string, os, time, re, sys
217 from functions import split_file, detect_Matches, get_information_on_matches
396 218
397 ## 1 ## INPUT/OUTPUT 219 ## 1 ## INPUT/OUTPUT
398 SHORT_FILE = sys.argv[1] #short-name-query_short-name-db 220 SHORT_FILE = sys.argv[1] ## short-name-query_short-name-db
399 221
222 """
223 04 and 06 for S05, 11 and 13 for S08
224 """
400 path_in = "%s/04_outputBlast_%s.txt" %(SHORT_FILE, SHORT_FILE) 225 path_in = "%s/04_outputBlast_%s.txt" %(SHORT_FILE, SHORT_FILE)
401 file_out = open("%s/06_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE),"w") 226 file_out = open("%s/06_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE),"w")
402 227
403 ## 2 ## RUN 228 ## 2 ## RUN
404 ## create Bash1 ## 229 ## create Bash1 ##
405 bash1 = split_file(path_in, "TBLASTX") ## DEF1 ## 230 bash1 = split_file(path_in, "TBLASTX") ### DEF1 ###
406 231
407 ## detect and save match ## 232 ## detect and save match ##
408 list_hits =[] 233 list_hits =[]
409 list_no_hits = [] 234 list_no_hits = []
410 j= 0 235 j= 0
412 lene = len(bash1.keys()) 237 lene = len(bash1.keys())
413 for query in bash1.keys(): 238 for query in bash1.keys():
414 j = j+1 239 j = j+1
415 240
416 ## 2.1. detect matches ## 241 ## 2.1. detect matches ##
417 list_match, list_match2, hit=detect_Matches(query, MATCH, SHORT_FILE) ### DEF2 ### 242 list_match, list_match2, hit=detect_Matches(query, MATCH, SHORT_FILE, bash1) ### DEF2 ###
418 243
419 if hit == 1: # match(es) 244 if hit == 1: # match(es)
420 list_hits.append(query) 245 list_hits.append(query)
421 if hit == 0: # no match for that sequence 246 if hit == 0: # no match for that sequence
422 list_no_hits.append(query) 247 list_no_hits.append(query)
423 248
424 ## 2.2. get sequences ## 249 ## 2.2. get sequences ##
425 if hit ==1: 250 if hit ==1:
426 list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, SHORT_FILE) ### FUNCTION ### 251 list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, SHORT_FILE)#
427 252
428 # divergencve 253 # divergencve
429 divergence = list_info[6] 254 divergence = list_info[6]
430 # gap number 255 # gap number
431 gap_number = list_info[7] 256 gap_number = list_info[7]
443 match_name = pairwise[2] 268 match_name = pairwise[2]
444 match_seq = pairwise[3] 269 match_seq = pairwise[3]
445 270
446 len_query_seq = len(query_seq) 271 len_query_seq = len(query_seq)
447 272
273 """
274 4 lines below : only in S05
275 """
448 Lis1 = string.split(query_name, "||") 276 Lis1 = string.split(query_name, "||")
449 short_query_name = Lis1[0] 277 short_query_name = Lis1[0]
450 Lis2 = string.split(match_name, "||") 278 Lis2 = string.split(match_name, "||")
451 short_match_name = Lis2[0] 279 short_match_name = Lis2[0]
452 280