Mercurial > repos > abims-sbr > pairwise
comparison scripts/S05_script_extract_match_v20_blastx.py @ 1:c8af52875b0f draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit ab76075e541dd7ece1090f6b55ca508ec0fde39d
| author | lecorguille |
|---|---|
| date | Thu, 13 Apr 2017 09:46:45 -0400 |
| parents | |
| children | 6709645eff5d |
comparison
equal
deleted
inserted
replaced
| 0:e95d4b20c62d | 1:c8af52875b0f |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 ## AUTHOR: Eric Fontanillas | |
| 3 ## LAST VERSION: 14/08/14 by Julie BAFFARD | |
| 4 | |
| 5 ### TBLASTX formatting | |
| 6 | |
| 7 ### MATCH = Only the first match keeped | |
| 8 MATCH = 0 # Only 1rst match Wanted | |
| 9 #MATCH = 1 # All match wanted | |
| 10 | |
| 11 ### SUBMATCH = several part of a same sequence match with the query | |
| 12 SUBMATCH = 0 # SUBMATCH NOT WANTED (only the best hit) | |
| 13 #SUBMATCH =1 # SUBMATCH WANTED | |
| 14 | |
| 15 | |
| 16 ### NAME FORMATTING: | |
| 17 # [A] FORMAT QUERY NAME 1st STEP [IN DEF1] | |
| 18 | |
| 19 # [B] FORMAT MATCH NAME 1st STEP [IN DEF2.1] | |
| 20 | |
| 21 # [C] FORMAT MATCH NAME 2nd STEP [MIDDLE of DEF 2.3] | |
| 22 # [D] FORMAT QUERY NAME 2nd STEP [END of DEF 2.3] | |
| 23 # [E] FORMAT MATCH NAME 3rd STEP [END of DEF 2.3] | |
| 24 | |
| 25 ### SPECIFICITY TBLASTX (/BLASTN) formatting: | |
| 26 ## 1/ "TBLASTX" formatting => At start of "RUN RUN RUN" change the keyword | |
| 27 ## 2/ change line "if keyword in nextline:" in function "split_file" | |
| 28 ## 3/ change "Strand" by "Frame" in function "get_information_on_matches" | |
| 29 | |
| 30 | |
| 31 ######################################## | |
| 32 ### DEF 1. Split each "BLASTN" event ### | |
| 33 ######################################## | |
| 34 def split_file(path_in, keyword): | |
| 35 | |
| 36 file_in = open(path_in, "r") | |
| 37 RUN = '' | |
| 38 BASH1={} | |
| 39 | |
| 40 while 1: | |
| 41 nextline = file_in.readline() | |
| 42 | |
| 43 ################################## | |
| 44 ### [A] FORMATTING QUERY NAME ### | |
| 45 | |
| 46 ### Get query name ### | |
| 47 if nextline[0:6]=='Query=': | |
| 48 L1 = string.split(nextline, "||") | |
| 49 L2 = string.split(L1[0], " ") | |
| 50 query = L2[1] | |
| 51 if query[-1] == "\n": | |
| 52 query = query[:-1] | |
| 53 | |
| 54 ### [A] END FORMATTING QUERY NAME ### | |
| 55 ###################################### | |
| 56 | |
| 57 | |
| 58 ### split the file with keyword ### | |
| 59 if keyword in nextline: | |
| 60 | |
| 61 # Two cases here: | |
| 62 #1# If it is the first "RUN" in the block (i.e. the first occurence of "BLASTN" in the file), we have just to add the new lines in the "RUN" list ... 2nd , we have also to detect the 'key' of bash1, which is the "query" name ... and third we will have to save this "RUN" in the bash1, once we will have detected a new "RUN" (i.e. a new line beginning with "BLASTN". | |
| 63 #2# If it isn't the first run, we have the save the previous "RUN" in the "bash1", before to re-initialize the RUN list (RUN =[]), before to append lines to the new "RUN" | |
| 64 | |
| 65 if RUN == '': # case #1# | |
| 66 RUN = RUN + nextline # we just added the first line of the file | |
| 67 | |
| 68 else: # case #2# (there was a run before) | |
| 69 BASH1[query] = RUN # add the previous run to the bash | |
| 70 RUN = '' # re-initialize the "RUN" | |
| 71 RUN = RUN + nextline # add the line starting with the keyword ("BLASTN") (except the first line of the file (the first "RUN") | |
| 72 | |
| 73 else: # Treatment of the subsequent lines of the one starting with the keyword ("BLASTN") (which is not treated here but previously) | |
| 74 RUN = RUN + nextline | |
| 75 | |
| 76 | |
| 77 if not nextline: # when no more line, we should record the last "RUN" in the bash1 | |
| 78 BASH1[query] = RUN # add the last "RUN" | |
| 79 break | |
| 80 | |
| 81 file_in.close() | |
| 82 return(BASH1) | |
| 83 ######################################################### | |
| 84 | |
| 85 | |
| 86 ################################################ | |
| 87 ### DEF2 : Parse blast output for each query ### | |
| 88 ################################################ | |
| 89 ### detect matches (i.e. 'Sequences producing significant alignments:' ### | |
| 90 | |
| 91 def detect_Matches(query, MATCH, WORK_DIR): | |
| 92 F5 = open("%s/blastRun2.tmp" %WORK_DIR, 'w') | |
| 93 F5.write(bash1[query]) | |
| 94 F5.close() | |
| 95 | |
| 96 F6 = open("%s/blastRun2.tmp" %WORK_DIR, 'r') | |
| 97 list1 =[] | |
| 98 list2 =[] | |
| 99 | |
| 100 while 1: | |
| 101 nexteu = F6.readline() | |
| 102 if not nexteu : break | |
| 103 | |
| 104 if "***** No hits found ******" in nexteu : | |
| 105 hit = 0 | |
| 106 break | |
| 107 | |
| 108 if 'Sequences producing significant alignments:' in nexteu: | |
| 109 hit = 1 | |
| 110 F6.readline() # jump a line | |
| 111 | |
| 112 while 1: | |
| 113 nexteu2 = F6.readline() | |
| 114 if nexteu2[0]==">": break | |
| 115 | |
| 116 ###################################### | |
| 117 ### [B] FORMAT MATCH NAME 1st STEP ### | |
| 118 | |
| 119 if nexteu2 != '\n': | |
| 120 LL1 = string.split(nexteu2, " ") # specific NORTH database names !!!!!!! | |
| 121 match = LL1[0] #### SOUTH databank // NORTH will have "|" separators | |
| 122 list1.append(match) | |
| 123 | |
| 124 match2 = ">" + LL1[0] # more complete name // still specific NORTH database names !!!!!!! | |
| 125 list2.append(match2) #### SOUTH databank // NORTH will have "|" separators | |
| 126 | |
| 127 if MATCH == 0: ## Only read the 1rst line (i.e. the First Match) | |
| 128 break | |
| 129 else: ## Read the other lines (i.e. All the Matches) | |
| 130 continue | |
| 131 | |
| 132 ### [B] END FORMAT MATCH NAME 1st STEP ### | |
| 133 ########################################## | |
| 134 | |
| 135 break | |
| 136 | |
| 137 F6.close() | |
| 138 return(list1, list2, hit) # list1 = short name // list2 = more complete name | |
| 139 ####################################### | |
| 140 | |
| 141 | |
| 142 ######################################### | |
| 143 ### DEF3 : Get Information on matches ### | |
| 144 ######################################### | |
| 145 ### Function used in the next function (2.3.) | |
| 146 | |
| 147 def get_information_on_matches(list_of_line): | |
| 148 for line in list_of_line: | |
| 149 | |
| 150 ## Score and Expect | |
| 151 if "Score" in line: | |
| 152 line = line[:-1] # remove "\n" | |
| 153 S_line = string.split(line, " = ") | |
| 154 Expect = S_line[-1] ## ***** Expect | |
| 155 S_line2 = string.split(S_line[1], " bits ") | |
| 156 Score = string.atof(S_line2[0]) | |
| 157 | |
| 158 ## Identities/gaps/percent/divergence/length_matched | |
| 159 elif "Identities" in line: | |
| 160 line = line[:-1] # remove "\n" | |
| 161 g = 0 | |
| 162 | |
| 163 if "Gaps" in line: | |
| 164 pre_S_line = string.split(line, ",") | |
| 165 identity_line = pre_S_line[0] | |
| 166 gaps_line = pre_S_line[1] | |
| 167 g = 1 | |
| 168 else: | |
| 169 identity_line = line | |
| 170 g = 0 | |
| 171 | |
| 172 ## treat identity line | |
| 173 S_line = string.split(identity_line, " ") | |
| 174 | |
| 175 identities = S_line[-2] ## ***** identities | |
| 176 | |
| 177 S_line2 = string.split(identities, "/") | |
| 178 hits = string.atof(S_line2[0]) ## ***** hits | |
| 179 length_matched = string.atof(S_line2[1]) ## ***** length_matched | |
| 180 abs_nb_differences = length_matched - hits ## ***** abs_nb_differences | |
| 181 | |
| 182 | |
| 183 identity_percent = hits/length_matched * 100 ## ***** identity_percent | |
| 184 | |
| 185 divergence_percent = abs_nb_differences/length_matched*100 ## ***** divergence_percent | |
| 186 | |
| 187 ## treat gap line if any | |
| 188 if g ==1: # means there are gaps | |
| 189 S_line3 = string.split(gaps_line, " ") | |
| 190 gaps_part = S_line3[-2] | |
| 191 S_line4 = string.split(gaps_part, "/") | |
| 192 gaps_number = string.atoi(S_line4[0]) ## ***** gaps_number | |
| 193 | |
| 194 real_differences = abs_nb_differences - gaps_number ## ***** real_differences | |
| 195 real_divergence_percent = (real_differences/length_matched)*100 ## ***** real_divergence_percent | |
| 196 else: | |
| 197 gaps_number = 0 | |
| 198 real_differences = 0 | |
| 199 real_divergence_percent = divergence_percent | |
| 200 | |
| 201 ## Frame | |
| 202 elif "Frame" in line: | |
| 203 line = line[:-1] | |
| 204 S_line = string.split(line, " = ") | |
| 205 frame = S_line[1] | |
| 206 | |
| 207 list_informations=[length_matched, Expect, Score, identities, hits, identity_percent, divergence_percent,gaps_number, real_divergence_percent, frame, length_matched] | |
| 208 | |
| 209 return(list_informations) | |
| 210 ######################################## | |
| 211 | |
| 212 | |
| 213 ############################ | |
| 214 ### DEF4 : get sequences ### | |
| 215 ############################ | |
| 216 ### [+ get informations from the function 2.2.] | |
| 217 | |
| 218 def get_sequences(query, list2, SUBMATCHEU,WORK_DIR): | |
| 219 list_Pairwise = [] | |
| 220 | |
| 221 F7 = open("%s/blastRun3.tmp" %WORK_DIR, 'w') | |
| 222 F7.write(bash1[query]) # bash1[query] ==> blast output for each query | |
| 223 F7.close() | |
| 224 F8 = open("%s/blastRun3.tmp" %WORK_DIR, 'r') | |
| 225 | |
| 226 text1 = F8.readlines() | |
| 227 | |
| 228 miniList = [] | |
| 229 for name in list2: # "list2" contains name of matched sequences (long version! the list1 is the same list but for short version names). It was previously generated by "detect_Matches" function | |
| 230 | |
| 231 l = -1 | |
| 232 for n in text1: | |
| 233 l = l+1 | |
| 234 if name in n: | |
| 235 i = l | |
| 236 miniList.append(i) # content positions in the list "text1", of all begining of match (e.g. >gnl|UG|Apo#S51012099 [...]) | |
| 237 | |
| 238 miniList.reverse() | |
| 239 if miniList != []: | |
| 240 length = len(miniList) | |
| 241 ii = 0 | |
| 242 | |
| 243 Listing1 = [] | |
| 244 while ii < length: | |
| 245 iii = miniList[ii] | |
| 246 entry = text1[iii:] | |
| 247 text1 = text1[:iii] | |
| 248 Listing1.append(entry) # each "entry" = list of thing beginning by ">" | |
| 249 ii = ii+1 # Listing1 is a table of table!! | |
| 250 | |
| 251 Listing1.append(text1) # "text1" = the first lines (begin with "BLASTN 2.2.1 ...]" | |
| 252 Listing1.reverse() | |
| 253 | |
| 254 Listing2 = Listing1[1:] # remove the first thing ("BLASTN ...") and keep only table beginning with a line with ">" | |
| 255 SEK = len(Listing2) | |
| 256 NB_SEK = 0 | |
| 257 | |
| 258 for e1 in Listing2: # "Listing2" contents all the entries begining with ">" | |
| 259 NB_SEK = NB_SEK + 1 | |
| 260 list51 = [] | |
| 261 | |
| 262 l = -1 | |
| 263 for line in e1: | |
| 264 l = l+1 | |
| 265 if "Score =" in line: | |
| 266 index = l | |
| 267 list51.append(l) # index of the lines with score | |
| 268 | |
| 269 list51.reverse() | |
| 270 Listing3 = [] | |
| 271 | |
| 272 for i5 in list51: | |
| 273 e2 = e1[i5:] | |
| 274 Listing3.append(e2) | |
| 275 e1 = e1[:i5] | |
| 276 | |
| 277 ###################################### | |
| 278 ### [C] FORMAT MATCH NAME 2nd STEP ### | |
| 279 | |
| 280 BigFastaName = e1 ### LIST OF LINES <=> What is remaining after removing all the hit with "Score =", so all the text comprise between ">" and the first "Score =" ==> Include Match name & "Length & empty lines | |
| 281 | |
| 282 SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME | |
| 283 SmallFastaName = SmallFastaName[1:-1] ### remove ">" and "\n" | |
| 284 | |
| 285 if SmallFastaName[-1] == " ": | |
| 286 SmallFastaName = SmallFastaName[:-1] | |
| 287 | |
| 288 PutInFastaName1 = SmallFastaName | |
| 289 | |
| 290 ### [C] END FORMAT MATCH NAME 2nd STEP ### | |
| 291 ########################################## | |
| 292 | |
| 293 SUBSEK = len(Listing3) | |
| 294 NB_SUBSEK = 0 | |
| 295 list_inBatch = [] | |
| 296 | |
| 297 ### IF NO SUBMATCH WANTED !!!! => ONLY KEEP THE FIRST HIT OF "LISTING3": | |
| 298 if SUBMATCHEU == 0: # NO SUBMATCH WANTED !!!! | |
| 299 Listing4 = [] | |
| 300 Listing4.append(Listing3[-1]) # Remove this line if submatch wanted!!! | |
| 301 elif SUBMATCHEU == 1: | |
| 302 Listing4 = Listing3 | |
| 303 | |
| 304 for l in Listing4: ## "listing3" contents | |
| 305 NB_SUBSEK = NB_SUBSEK+1 | |
| 306 | |
| 307 ll1 = string.replace(l[0], " ", "") | |
| 308 ll2 = string.replace(l[1], " ", "") | |
| 309 ll3 = string.replace(l[2], " ", "") | |
| 310 PutInFastaName2 = ll1[:-1] + "||" + ll2[:-1] + "||" + ll3[:-1] # match information | |
| 311 | |
| 312 seq_query = "" | |
| 313 pos_query = [] | |
| 314 seq_match = "" | |
| 315 pos_match = [] | |
| 316 | |
| 317 for line in l: | |
| 318 if "Query:" in line: | |
| 319 line = string.replace(line, " ", " ") # remove multiple spaces in line | |
| 320 line = string.replace(line, " ", " ") | |
| 321 line = string.replace(line, " ", " ") | |
| 322 | |
| 323 lll1 = string.split(line, " ") # split the line, 0: "Query=", 1:start, 2:seq, 3:end | |
| 324 | |
| 325 pos1 = lll1[1] | |
| 326 pos1 = string.atoi(pos1) | |
| 327 pos_query.append(pos1) | |
| 328 | |
| 329 pos2 = lll1[3][:-1] | |
| 330 pos2 = string.atoi(pos2) | |
| 331 pos_query.append(pos2) | |
| 332 | |
| 333 seq = lll1[2] | |
| 334 seq_query = seq_query + seq | |
| 335 | |
| 336 if "Sbjct:" in line: | |
| 337 line = string.replace(line, " ", " ") # remove multiple spaces in line | |
| 338 line = string.replace(line, " ", " ") | |
| 339 line = string.replace(line, " ", " ") | |
| 340 | |
| 341 lll2 = string.split(line, " ") # split the line, 0: "Query=", 1:start, 2:seq, 3:end | |
| 342 | |
| 343 pos1 = lll2[1] | |
| 344 pos1 = string.atoi(pos1) | |
| 345 pos_match.append(pos1) | |
| 346 | |
| 347 pos2 = lll2[3][:-1] | |
| 348 pos2 = string.atoi(pos2) | |
| 349 pos_match.append(pos2) | |
| 350 | |
| 351 seq = lll2[2] | |
| 352 seq_match = seq_match + seq | |
| 353 | |
| 354 ## Get the query and matched sequences and the corresponding positions | |
| 355 pos_query.sort() # rank small to big | |
| 356 pos_query_start = pos_query[0] # get the smaller | |
| 357 pos_query_end = pos_query[-1] # get the bigger | |
| 358 PutInFastaName3 = "%d...%d" %(pos_query_start, pos_query_end) | |
| 359 | |
| 360 ###################################### | |
| 361 ### [D] FORMAT QUERY NAME 2nd STEP ### | |
| 362 | |
| 363 FINAL_fasta_Name_Query = ">" + query + "||"+ PutInFastaName3 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK) | |
| 364 | |
| 365 ### [D] END FORMAT QUERY NAME 2nd STEP ### | |
| 366 ########################################## | |
| 367 | |
| 368 pos_match.sort() | |
| 369 pos_match_start = pos_match[0] | |
| 370 pos_match_end = pos_match[-1] | |
| 371 PutInFastaName4 = "%d...%d" %(pos_match_start, pos_match_end) | |
| 372 | |
| 373 ###################################### | |
| 374 ### [E] FORMAT MATCH NAME 3rd STEP ### | |
| 375 | |
| 376 FINAL_fasta_Name_Match = ">" + PutInFastaName1 + "||" + PutInFastaName4 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK) | |
| 377 | |
| 378 ### [E] END FORMAT MATCH NAME 3rd STEP ### | |
| 379 ########################################## | |
| 380 | |
| 381 Pairwise = [FINAL_fasta_Name_Query , seq_query , FINAL_fasta_Name_Match , seq_match] # list with 4 members | |
| 382 list_Pairwise.append(Pairwise) | |
| 383 | |
| 384 ### Get informations about matches | |
| 385 list_info = get_information_on_matches(l) ### DEF3 ### | |
| 386 | |
| 387 F8.close() | |
| 388 return(list_Pairwise, list_info) | |
| 389 ######################################### | |
| 390 | |
| 391 | |
| 392 ###################### | |
| 393 ### 2. RUN RUN RUN ### | |
| 394 ###################### | |
| 395 import string, os, time, re, sys | |
| 396 | |
| 397 ## 1 ## INPUT/OUTPUT | |
| 398 SHORT_FILE = sys.argv[1] #short-name-query_short-name-db | |
| 399 | |
| 400 path_in = "%s/04_outputBlast_%s.txt" %(SHORT_FILE, SHORT_FILE) | |
| 401 file_out = open("%s/06_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE),"w") | |
| 402 | |
| 403 ## 2 ## RUN | |
| 404 ## create Bash1 ## | |
| 405 bash1 = split_file(path_in, "TBLASTX") ## DEF1 ## | |
| 406 | |
| 407 ## detect and save match ## | |
| 408 list_hits =[] | |
| 409 list_no_hits = [] | |
| 410 j= 0 | |
| 411 k = 0 | |
| 412 lene = len(bash1.keys()) | |
| 413 for query in bash1.keys(): | |
| 414 j = j+1 | |
| 415 | |
| 416 ## 2.1. detect matches ## | |
| 417 list_match, list_match2, hit=detect_Matches(query, MATCH, SHORT_FILE) ### DEF2 ### | |
| 418 | |
| 419 if hit == 1: # match(es) | |
| 420 list_hits.append(query) | |
| 421 if hit == 0: # no match for that sequence | |
| 422 list_no_hits.append(query) | |
| 423 | |
| 424 ## 2.2. get sequences ## | |
| 425 if hit ==1: | |
| 426 list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, SHORT_FILE) ### FUNCTION ### | |
| 427 | |
| 428 # divergencve | |
| 429 divergence = list_info[6] | |
| 430 # gap number | |
| 431 gap_number = list_info[7] | |
| 432 # real divergence (divergence without accounting INDELs) | |
| 433 real_divergence = list_info[8] | |
| 434 # length matched | |
| 435 length_matched = list_info[10] | |
| 436 | |
| 437 ### WRITE PAIRWISE ALIGNMENT IN OUTPUT FILES | |
| 438 for pairwise in list_pairwiseMatch: | |
| 439 k = k+1 | |
| 440 | |
| 441 query_name = pairwise[0] | |
| 442 query_seq = pairwise[1] | |
| 443 match_name = pairwise[2] | |
| 444 match_seq = pairwise[3] | |
| 445 | |
| 446 len_query_seq = len(query_seq) | |
| 447 | |
| 448 Lis1 = string.split(query_name, "||") | |
| 449 short_query_name = Lis1[0] | |
| 450 Lis2 = string.split(match_name, "||") | |
| 451 short_match_name = Lis2[0] | |
| 452 | |
| 453 # If NO CONTROL FOR LENGTH, USE THE FOLLOWING LINES INSTEAD: | |
| 454 | |
| 455 file_out.write("%s||%s||%s||%s||%s" %(query_name,divergence,gap_number,real_divergence,length_matched)) | |
| 456 file_out.write("\n") | |
| 457 file_out.write("%s" %query_seq) | |
| 458 file_out.write("\n") | |
| 459 | |
| 460 file_out.write("%s||%s||%s||%s||%s" %(match_name,divergence,gap_number,real_divergence,length_matched)) | |
| 461 file_out.write("\n") | |
| 462 file_out.write("%s" %match_seq) | |
| 463 file_out.write("\n") | |
| 464 | |
| 465 file_out.close() |
