Mercurial > repos > abims-sbr > pairwise
comparison scripts/S08_script_extract_match_v20_blastx.py @ 1:c8af52875b0f draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit ab76075e541dd7ece1090f6b55ca508ec0fde39d
| author | lecorguille |
|---|---|
| date | Thu, 13 Apr 2017 09:46:45 -0400 |
| parents | |
| children | 6709645eff5d |
comparison
equal
deleted
inserted
replaced
| 0:e95d4b20c62d | 1:c8af52875b0f |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 ## AUTHOR: Eric Fontanillas | |
| 3 ## LAST VERSION: 14/08/14 by Julie BAFFARD | |
| 4 | |
| 5 ### TBLASTX formatting | |
| 6 | |
| 7 ### MATCH = Only the first match keeped | |
| 8 MATCH = 0 # Only 1rst match Wanted | |
| 9 #MATCH = 1 # All match want | |
| 10 ### SUBMATCH = several part of a same sequence match with the query | |
| 11 SUBMATCH = 0 # SUBMATCH NOT WANTED (ONLY 1rst HIT) | |
| 12 #SUBMATCH =1 # SUBMATCH WANTED | |
| 13 | |
| 14 | |
| 15 ### NAME FORMATTING: | |
| 16 # [A] FORMAT QUERY NAME 1st STEP [IN DEF1] | |
| 17 | |
| 18 # [B] FORMAT MATCH NAME 1st STEP [IN DEF2.1] | |
| 19 | |
| 20 # [C] FORMAT MATCH NAME 2nd STEP [MIDDLE of DEF 2.3] | |
| 21 # [D] FORMAT QUERY NAME 2nd STEP [END of DEF 2.3] | |
| 22 # [E] FORMAT MATCH NAME 3rd STEP [END of DEF 2.3] | |
| 23 | |
| 24 ### SPECIFICITY TBLASTX (/BLASTN) formatting: | |
| 25 ## 1/ "TBLASTX" formatting => At start of "RUN RUN RUN" change the keyword | |
| 26 ## 2/ change line "if keyword in nextline:" in function "split_file" | |
| 27 ## 3/ change "Strand" by "Frame" in function "get_information_on_matches" | |
| 28 | |
| 29 | |
| 30 | |
| 31 ######################################## | |
| 32 ### DEF 1. Split each "BLASTN" event ### | |
| 33 ######################################## | |
| 34 def split_file(path_in, keyword): | |
| 35 | |
| 36 file_in = open(path_in, "r") | |
| 37 | |
| 38 RUN = '' | |
| 39 BASH1={} | |
| 40 | |
| 41 while 1: | |
| 42 nextline = file_in.readline() | |
| 43 | |
| 44 ################################## | |
| 45 ### [A] FORMATTING QUERY NAME ### | |
| 46 | |
| 47 # Get query name | |
| 48 if nextline[0:6]=='Query=': | |
| 49 L1 = string.split(nextline, "||") | |
| 50 L2 = string.split(L1[0], " ") | |
| 51 query = L2[1] | |
| 52 if query[-1] == "\n": | |
| 53 query = query[:-1] | |
| 54 | |
| 55 ### [A] END FORMATTING QUERY NAME ### | |
| 56 ###################################### | |
| 57 | |
| 58 | |
| 59 ### split the file with keyword ### | |
| 60 if keyword in nextline: | |
| 61 # Two cases here: | |
| 62 #1# If it is the first "RUN" in the block (i.e. the first occurence of "BLASTN" in the file), we have just to add the new lines in the "RUN" list ... 2nd , we have also to detect the 'key' of bash1, which is the "query" name ... and third we will have to save this "RUN" in the bash1, once we will have detected a new "RUN" (i.e. a new line beginning with "BLASTN". | |
| 63 #2# If it isn't the first run, we have the save the previous "RUN" in the "bash1", before to re-initialize the RUN list (RUN =[]), before to append lines to the new "RUN" | |
| 64 | |
| 65 if RUN == '': # case #1# | |
| 66 RUN = RUN + nextline # we just added the first line of the file | |
| 67 | |
| 68 else: # case #2# (there was a run before) | |
| 69 BASH1[query] = RUN # add the previous run to the bash | |
| 70 RUN = '' # re-initialize the "RUN" | |
| 71 RUN = RUN + nextline # add the line starting with the keyword ("BLASTN") (except the first line of the file (the first "RUN") | |
| 72 | |
| 73 else: # Treatment of the subsequent lines of the one starting with the keyword ("BLASTN") (which is not treated here but previously) | |
| 74 RUN = RUN + nextline | |
| 75 | |
| 76 | |
| 77 if not nextline: # when no more line, we should record the last "RUN" in the bash1 | |
| 78 BASH1[query] = RUN # add the last "RUN" | |
| 79 break | |
| 80 | |
| 81 | |
| 82 file_in.close() | |
| 83 return(BASH1) | |
| 84 ######################################################### | |
| 85 | |
| 86 | |
| 87 ############################################### | |
| 88 ### DEF2 : Parse blast output for each query### | |
| 89 ############################################### | |
| 90 | |
| 91 ### 2.1. detect matches (i.e. 'Sequences producing significant alignments:' ### | |
| 92 def detect_Matches(query, MATCH, WORK_DIR): | |
| 93 F5 = open("%s/blastRun2.tmp" %WORK_DIR, 'w') | |
| 94 F5.write(bash1[query]) | |
| 95 F5.close() | |
| 96 | |
| 97 F6 = open("%s/blastRun2.tmp" %WORK_DIR, 'r') | |
| 98 list1 =[] | |
| 99 list2 =[] | |
| 100 | |
| 101 while 1: | |
| 102 nexteu = F6.readline() | |
| 103 | |
| 104 if not nexteu : break | |
| 105 | |
| 106 if "***** No hits found ******" in nexteu : | |
| 107 hit = 0 | |
| 108 break | |
| 109 | |
| 110 if 'Sequences producing significant alignments:' in nexteu: | |
| 111 hit = 1 | |
| 112 F6.readline() # jump a line | |
| 113 | |
| 114 while 1: | |
| 115 nexteu2 = F6.readline() | |
| 116 | |
| 117 if nexteu2[0]==">": break | |
| 118 | |
| 119 ###################################### | |
| 120 ### [B] FORMAT MATCH NAME 1st STEP ### | |
| 121 | |
| 122 if nexteu2 != '\n': | |
| 123 LL1 = string.split(nexteu2, " ") # specific NORTH database names !!!!!!! | |
| 124 match = LL1[0] #### SOUTH databank // NORTH will have "|" separators | |
| 125 list1.append(match) | |
| 126 | |
| 127 match2 = ">" + LL1[0] # more complete name // still specific NORTH database names !!!!!!! | |
| 128 list2.append(match2) #### SOUTH databank // NORTH will have "|" separators | |
| 129 | |
| 130 if MATCH == 0: ## Only read the 1rst line (i.e. the First Match) | |
| 131 break | |
| 132 else: ## Read the other lines (i.e. All the Matches) | |
| 133 continue | |
| 134 | |
| 135 ### [B] END FORMAT MATCH NAME 1st STEP ### | |
| 136 ########################################## | |
| 137 | |
| 138 break | |
| 139 | |
| 140 F6.close() | |
| 141 return(list1, list2, hit) # list1 = short name // list2 = more complete name | |
| 142 ###################################### | |
| 143 | |
| 144 | |
| 145 ######################################### | |
| 146 ### DEF3 : Get Information on matches ### | |
| 147 ######################################### | |
| 148 ### Function used in the next function (2.3.) | |
| 149 def get_information_on_matches(list_of_line): | |
| 150 | |
| 151 for line in list_of_line: | |
| 152 | |
| 153 ## Score and Expect | |
| 154 if "Score" in line: | |
| 155 #print line | |
| 156 line = line[:-1] # remove "\n" | |
| 157 S_line = string.split(line, " = ") | |
| 158 Expect = S_line[-1] ## ***** Expect | |
| 159 S_line2 = string.split(S_line[1], " bits ") | |
| 160 Score = string.atof(S_line2[0]) | |
| 161 | |
| 162 | |
| 163 ## Identities/gaps/percent/divergence/length_matched | |
| 164 elif "Identities" in line: | |
| 165 line = line[:-1] # remove "\n" | |
| 166 | |
| 167 g = 0 | |
| 168 if "Gaps" in line: | |
| 169 #print "HIT!!!" | |
| 170 pre_S_line = string.split(line, ",") | |
| 171 identity_line = pre_S_line[0] | |
| 172 gaps_line = pre_S_line[1] | |
| 173 g = 1 | |
| 174 else: | |
| 175 identity_line = line | |
| 176 g = 0 | |
| 177 | |
| 178 ## treat identity line | |
| 179 S_line = string.split(identity_line, " ") | |
| 180 | |
| 181 identities = S_line[-2] ## ***** identities | |
| 182 #print "\t\tIdentities = %s" %identities | |
| 183 | |
| 184 S_line2 = string.split(identities, "/") | |
| 185 hits = string.atof(S_line2[0]) ## ***** hits | |
| 186 length_matched = string.atof(S_line2[1]) ## ***** length_matched | |
| 187 abs_nb_differences = length_matched - hits ## ***** abs_nb_differences | |
| 188 | |
| 189 | |
| 190 ## identity_percent = S_line[-1] | |
| 191 identity_percent = hits/length_matched * 100 ## ***** identity_percent | |
| 192 #print "\t\tIdentity (percent) = %.2f" %identity_percent | |
| 193 | |
| 194 divergence_percent = abs_nb_differences/length_matched*100 ## ***** divergence_percent | |
| 195 #print "\t\tDivergence (percent) = %.2f" %divergence_percent | |
| 196 | |
| 197 ## treat gap line if any | |
| 198 if g ==1: # means there are gaps | |
| 199 S_line3 = string.split(gaps_line, " ") | |
| 200 gaps_part = S_line3[-2] | |
| 201 S_line4 = string.split(gaps_part, "/") | |
| 202 gaps_number = string.atoi(S_line4[0]) ## ***** gaps_number | |
| 203 #print "\t\tGaps number = %s" %gaps_number | |
| 204 | |
| 205 real_differences = abs_nb_differences - gaps_number ## ***** real_differences | |
| 206 real_divergence_percent = (real_differences/length_matched)*100 ## ***** real_divergence_percent | |
| 207 #print "\t\tReal divergence (percent)= %.2f" %real_divergence_percent | |
| 208 else: | |
| 209 gaps_number = 0 | |
| 210 #print "\t\tGaps number = %s" %gaps_number | |
| 211 real_differences = 0 | |
| 212 real_divergence_percent = divergence_percent | |
| 213 | |
| 214 ## Strand | |
| 215 #elif "Strand" in line: | |
| 216 # line = line[:-1] # remove "\n" | |
| 217 # S_line = string.split(line, " = ") | |
| 218 # strand = S_line[1] | |
| 219 # print "\t\tStrand = %s" %strand | |
| 220 | |
| 221 ## Frame | |
| 222 elif "Frame" in line: | |
| 223 line = line[:-1] # remove "\n" | |
| 224 S_line = string.split(line, " = ") | |
| 225 frame = S_line[1] | |
| 226 #print "\t\tFrame = %s" %frame | |
| 227 | |
| 228 | |
| 229 | |
| 230 list_informations=[length_matched, Expect, Score, identities, hits, identity_percent, divergence_percent,gaps_number, real_divergence_percent, frame, length_matched] | |
| 231 | |
| 232 return(list_informations) | |
| 233 ################################## | |
| 234 | |
| 235 | |
| 236 ############################ | |
| 237 ### DEF4 : get sequences ### | |
| 238 ############################ | |
| 239 ### [+ get informations from the DEF3.] | |
| 240 def get_sequences(query, list2, SUBMATCHEU, WORK_DIR): | |
| 241 list_Pairwise = [] | |
| 242 | |
| 243 F7 = open("%s/blastRun3.tmp" %WORK_DIR, 'w') | |
| 244 F7.write(bash1[query]) # bash1[query] ==> blast output for each query | |
| 245 F7.close() | |
| 246 F8 = open("%s/blastRun3.tmp" %WORK_DIR, 'r') | |
| 247 | |
| 248 text1 = F8.readlines() | |
| 249 | |
| 250 miniList = [] | |
| 251 for name in list2: # "list2" contains name of matched sequences (long version! the list1 is the same list but for short version names). It was previously generated by "detect_Matches" function | |
| 252 | |
| 253 l = -1 | |
| 254 for n in text1: | |
| 255 l = l+1 | |
| 256 if name in n: | |
| 257 i = l | |
| 258 miniList.append(i) # content positions in the list "text1", of all begining of match (e.g. >gnl|UG|Apo#S51012099 [...]) | |
| 259 | |
| 260 miniList.reverse() | |
| 261 | |
| 262 if miniList != []: | |
| 263 length = len(miniList) | |
| 264 | |
| 265 ii = 0 | |
| 266 Listing1 = [] | |
| 267 while ii < length: | |
| 268 iii = miniList[ii] | |
| 269 entry = text1[iii:] | |
| 270 text1 = text1[:iii] | |
| 271 Listing1.append(entry) # each "entry" = list of thing beginning by ">" | |
| 272 ii = ii+1 # Listing1 is a table of table!! | |
| 273 | |
| 274 Listing1.append(text1) # "text1" = the first lines (begin with "BLASTN 2.2.1 ...]" | |
| 275 Listing1.reverse() | |
| 276 | |
| 277 Listing2 = Listing1[1:] # remove the first thing ("BLASTN ...") and keep only table beginning with a line with ">" | |
| 278 SEK = len(Listing2) | |
| 279 | |
| 280 NB_SEK = 0 | |
| 281 | |
| 282 for e1 in Listing2: # "Listing2" contents all the entries begining with ">" | |
| 283 NB_SEK = NB_SEK + 1 | |
| 284 list51 = [] | |
| 285 | |
| 286 l = -1 | |
| 287 for line in e1: | |
| 288 l = l+1 | |
| 289 if "Score =" in line: | |
| 290 index = l | |
| 291 list51.append(l) # index of the lines with score | |
| 292 | |
| 293 list51.reverse() | |
| 294 | |
| 295 Listing3 = [] | |
| 296 | |
| 297 for i5 in list51: | |
| 298 e2 = e1[i5:] | |
| 299 Listing3.append(e2) | |
| 300 e1 = e1[:i5] | |
| 301 | |
| 302 | |
| 303 ###################################### | |
| 304 ### [C] FORMAT MATCH NAME 2nd STEP ### | |
| 305 | |
| 306 BigFastaName = e1 ### LIST OF LINES <=> What is remaining after removing all the hit with "Score =", so all the text comprise between ">" and the first "Score =" ==> Include Match name & "Length & empty lines | |
| 307 | |
| 308 SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME | |
| 309 | |
| 310 SmallFastaName = SmallFastaName[1:-2] ### remove ">" and "\n" | |
| 311 | |
| 312 | |
| 313 S1 = string.split(SmallFastaName, "||") | |
| 314 S2 = string.split(S1[0], " ") | |
| 315 | |
| 316 | |
| 317 PutInFastaName1 = S2[0] | |
| 318 | |
| 319 ### [C] END FORMAT MATCH NAME 2nd STEP ### | |
| 320 ########################################## | |
| 321 | |
| 322 SUBSEK = len(Listing3) | |
| 323 NB_SUBSEK = 0 | |
| 324 list_inBatch = [] | |
| 325 ### IF NO SUBMATCH WANTED !!!! => ONLY KEEP THE FIRST HIT OF "LISTING3": | |
| 326 if SUBMATCHEU == 0: # NO SUBMATCH WANTED !!!! | |
| 327 Listing4 = [] | |
| 328 Listing4.append(Listing3[-1]) # Remove this line if submatch wanted!!! | |
| 329 elif SUBMATCHEU == 1: | |
| 330 Listing4 = Listing3 | |
| 331 | |
| 332 | |
| 333 for l in Listing4: ## "listing3" contents | |
| 334 | |
| 335 NB_SUBSEK = NB_SUBSEK+1 | |
| 336 | |
| 337 ll1 = string.replace(l[0], " ", "") | |
| 338 ll2 = string.replace(l[1], " ", "") | |
| 339 ll3 = string.replace(l[2], " ", "") | |
| 340 PutInFastaName2 = ll1[:-1] + "||" + ll2[:-1] + "||" + ll3[:-1] # match information | |
| 341 | |
| 342 seq_query = "" | |
| 343 pos_query = [] | |
| 344 seq_match = "" | |
| 345 pos_match = [] | |
| 346 for line in l: | |
| 347 if "Query:" in line: | |
| 348 line = string.replace(line, " ", " ") # remove multiple spaces in line | |
| 349 line = string.replace(line, " ", " ") | |
| 350 line = string.replace(line, " ", " ") | |
| 351 | |
| 352 lll1 = string.split(line, " ") # split the line, 0: "Query=", 1:start, 2:seq, 3:end | |
| 353 | |
| 354 pos1 = lll1[1] | |
| 355 pos1 = string.atoi(pos1) | |
| 356 pos_query.append(pos1) | |
| 357 | |
| 358 pos2 = lll1[3][:-1] | |
| 359 pos2 = string.atoi(pos2) | |
| 360 pos_query.append(pos2) | |
| 361 | |
| 362 seq = lll1[2] | |
| 363 seq_query = seq_query + seq | |
| 364 | |
| 365 | |
| 366 if "Sbjct:" in line: | |
| 367 line = string.replace(line, " ", " ") # remove multiple spaces in line | |
| 368 line = string.replace(line, " ", " ") | |
| 369 line = string.replace(line, " ", " ") | |
| 370 | |
| 371 lll2 = string.split(line, " ") # split the line, 0: "Query=", 1:start, 2:seq, 3:end | |
| 372 | |
| 373 pos1 = lll2[1] | |
| 374 pos1 = string.atoi(pos1) | |
| 375 pos_match.append(pos1) | |
| 376 | |
| 377 pos2 = lll2[3][:-1] | |
| 378 pos2 = string.atoi(pos2) | |
| 379 pos_match.append(pos2) | |
| 380 | |
| 381 seq = lll2[2] | |
| 382 seq_match = seq_match + seq | |
| 383 | |
| 384 ## Get the query and matched sequences and the corresponding positions | |
| 385 | |
| 386 pos_query.sort() # rank small to big | |
| 387 pos_query_start = pos_query[0] # get the smaller | |
| 388 pos_query_end = pos_query[-1] # get the bigger | |
| 389 PutInFastaName3 = "%d...%d" %(pos_query_start, pos_query_end) | |
| 390 | |
| 391 | |
| 392 ###################################### | |
| 393 ### [D] FORMAT QUERY NAME 2nd STEP ### | |
| 394 | |
| 395 FINAL_fasta_Name_Query = ">" + query + "||"+ PutInFastaName3 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK) | |
| 396 | |
| 397 ### [D] END FORMAT QUERY NAME 2nd STEP ### | |
| 398 ########################################## | |
| 399 | |
| 400 | |
| 401 pos_match.sort() | |
| 402 pos_match_start = pos_match[0] | |
| 403 pos_match_end = pos_match[-1] | |
| 404 PutInFastaName4 = "%d...%d" %(pos_match_start, pos_match_end) | |
| 405 | |
| 406 | |
| 407 ###################################### | |
| 408 ### [E] FORMAT MATCH NAME 3rd STEP ### | |
| 409 | |
| 410 FINAL_fasta_Name_Match = ">" + PutInFastaName1 + "||" + PutInFastaName4 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK) | |
| 411 | |
| 412 ### [E] END FORMAT MATCH NAME 3rd STEP ### | |
| 413 ########################################## | |
| 414 | |
| 415 | |
| 416 | |
| 417 Pairwise = [FINAL_fasta_Name_Query , seq_query , FINAL_fasta_Name_Match , seq_match] # list with 4 members | |
| 418 list_Pairwise.append(Pairwise) | |
| 419 | |
| 420 ### Get informations about matches | |
| 421 list_info = get_information_on_matches(l) ### DEF3 ### | |
| 422 | |
| 423 | |
| 424 F8.close() | |
| 425 return(list_Pairwise, list_info) | |
| 426 ######################################### | |
| 427 | |
| 428 | |
| 429 ################### | |
| 430 ### RUN RUN RUN ### | |
| 431 ################### | |
| 432 import string, os, time, re, sys | |
| 433 | |
| 434 ## 1 ## INPUT/OUTPUT | |
| 435 SHORT_FILE = sys.argv[1] ## short-name-query_short-name-db | |
| 436 | |
| 437 path_in = "%s/11_outputBlast_%s.txt" %(SHORT_FILE, SHORT_FILE) | |
| 438 file_out = open("%s/13_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE),"w") | |
| 439 | |
| 440 ## 2 ## RUN | |
| 441 ## create Bash1 ## | |
| 442 bash1 = split_file(path_in, "TBLASTX") ### DEF1 ### | |
| 443 | |
| 444 ## detect and save match ## | |
| 445 list_hits =[] | |
| 446 list_no_hits = [] | |
| 447 j= 0 | |
| 448 k = 0 | |
| 449 lene = len(bash1.keys()) | |
| 450 for query in bash1.keys(): | |
| 451 j = j+1 | |
| 452 | |
| 453 ## 2.1. detect matches ## | |
| 454 list_match, list_match2, hit=detect_Matches(query, MATCH, SHORT_FILE) ### DEF2 ### | |
| 455 | |
| 456 if hit == 1: # match(es) | |
| 457 list_hits.append(query) | |
| 458 if hit == 0: # no match for that sequence | |
| 459 list_no_hits.append(query) | |
| 460 | |
| 461 ## 2.2. get sequences ## | |
| 462 if hit ==1: | |
| 463 list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, SHORT_FILE) ### DEF4 ### | |
| 464 | |
| 465 # divergencve | |
| 466 divergence = list_info[6] | |
| 467 # gap number | |
| 468 gap_number = list_info[7] | |
| 469 # real divergence (divergence without accounting INDELs) | |
| 470 real_divergence = list_info[8] | |
| 471 # length matched | |
| 472 length_matched = list_info[10] | |
| 473 | |
| 474 ### WRITE PAIRWISE ALIGNMENT IN OUTPUT FILES | |
| 475 for pairwise in list_pairwiseMatch: | |
| 476 k = k+1 | |
| 477 | |
| 478 query_name = pairwise[0] | |
| 479 query_seq = pairwise[1] | |
| 480 match_name = pairwise[2] | |
| 481 match_seq = pairwise[3] | |
| 482 | |
| 483 len_query_seq = len(query_seq) | |
| 484 | |
| 485 | |
| 486 # If NO CONTROL FOR LENGTH, USE THE FOLLOWING LINES INSTEAD: | |
| 487 | |
| 488 file_out.write("%s||%s||%s||%s||%s" %(query_name,divergence,gap_number,real_divergence,length_matched)) | |
| 489 file_out.write("\n") | |
| 490 file_out.write("%s" %query_seq) | |
| 491 file_out.write("\n") | |
| 492 | |
| 493 file_out.write("%s||%s||%s||%s||%s" %(match_name,divergence,gap_number,real_divergence,length_matched)) | |
| 494 file_out.write("\n") | |
| 495 file_out.write("%s" %match_seq) | |
| 496 file_out.write("\n") | |
| 497 | |
| 498 | |
| 499 file_out.close() |
