Mercurial > repos > abims-sbr > pairwise
comparison scripts/S05_script_extract_match_v20_blastx.py @ 4:6709645eff5d draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
| author | abims-sbr |
|---|---|
| date | Wed, 17 Jan 2018 08:53:53 -0500 |
| parents | c8af52875b0f |
| children |
comparison
equal
deleted
inserted
replaced
| 3:5f68b2fc02c1 | 4:6709645eff5d |
|---|---|
| 26 ## 1/ "TBLASTX" formatting => At start of "RUN RUN RUN" change the keyword | 26 ## 1/ "TBLASTX" formatting => At start of "RUN RUN RUN" change the keyword |
| 27 ## 2/ change line "if keyword in nextline:" in function "split_file" | 27 ## 2/ change line "if keyword in nextline:" in function "split_file" |
| 28 ## 3/ change "Strand" by "Frame" in function "get_information_on_matches" | 28 ## 3/ change "Strand" by "Frame" in function "get_information_on_matches" |
| 29 | 29 |
| 30 | 30 |
| 31 ######################################## | |
| 32 ### DEF 1. Split each "BLASTN" event ### | |
| 33 ######################################## | |
| 34 def split_file(path_in, keyword): | |
| 35 | |
| 36 file_in = open(path_in, "r") | |
| 37 RUN = '' | |
| 38 BASH1={} | |
| 39 | |
| 40 while 1: | |
| 41 nextline = file_in.readline() | |
| 42 | |
| 43 ################################## | |
| 44 ### [A] FORMATTING QUERY NAME ### | |
| 45 | |
| 46 ### Get query name ### | |
| 47 if nextline[0:6]=='Query=': | |
| 48 L1 = string.split(nextline, "||") | |
| 49 L2 = string.split(L1[0], " ") | |
| 50 query = L2[1] | |
| 51 if query[-1] == "\n": | |
| 52 query = query[:-1] | |
| 53 | |
| 54 ### [A] END FORMATTING QUERY NAME ### | |
| 55 ###################################### | |
| 56 | |
| 57 | |
| 58 ### split the file with keyword ### | |
| 59 if keyword in nextline: | |
| 60 | |
| 61 # Two cases here: | |
| 62 #1# If it is the first "RUN" in the block (i.e. the first occurence of "BLASTN" in the file), we have just to add the new lines in the "RUN" list ... 2nd , we have also to detect the 'key' of bash1, which is the "query" name ... and third we will have to save this "RUN" in the bash1, once we will have detected a new "RUN" (i.e. a new line beginning with "BLASTN". | |
| 63 #2# If it isn't the first run, we have the save the previous "RUN" in the "bash1", before to re-initialize the RUN list (RUN =[]), before to append lines to the new "RUN" | |
| 64 | |
| 65 if RUN == '': # case #1# | |
| 66 RUN = RUN + nextline # we just added the first line of the file | |
| 67 | |
| 68 else: # case #2# (there was a run before) | |
| 69 BASH1[query] = RUN # add the previous run to the bash | |
| 70 RUN = '' # re-initialize the "RUN" | |
| 71 RUN = RUN + nextline # add the line starting with the keyword ("BLASTN") (except the first line of the file (the first "RUN") | |
| 72 | |
| 73 else: # Treatment of the subsequent lines of the one starting with the keyword ("BLASTN") (which is not treated here but previously) | |
| 74 RUN = RUN + nextline | |
| 75 | |
| 76 | |
| 77 if not nextline: # when no more line, we should record the last "RUN" in the bash1 | |
| 78 BASH1[query] = RUN # add the last "RUN" | |
| 79 break | |
| 80 | |
| 81 file_in.close() | |
| 82 return(BASH1) | |
| 83 ######################################################### | |
| 84 | |
| 85 | |
| 86 ################################################ | |
| 87 ### DEF2 : Parse blast output for each query ### | |
| 88 ################################################ | |
| 89 ### detect matches (i.e. 'Sequences producing significant alignments:' ### | |
| 90 | |
| 91 def detect_Matches(query, MATCH, WORK_DIR): | |
| 92 F5 = open("%s/blastRun2.tmp" %WORK_DIR, 'w') | |
| 93 F5.write(bash1[query]) | |
| 94 F5.close() | |
| 95 | |
| 96 F6 = open("%s/blastRun2.tmp" %WORK_DIR, 'r') | |
| 97 list1 =[] | |
| 98 list2 =[] | |
| 99 | |
| 100 while 1: | |
| 101 nexteu = F6.readline() | |
| 102 if not nexteu : break | |
| 103 | |
| 104 if "***** No hits found ******" in nexteu : | |
| 105 hit = 0 | |
| 106 break | |
| 107 | |
| 108 if 'Sequences producing significant alignments:' in nexteu: | |
| 109 hit = 1 | |
| 110 F6.readline() # jump a line | |
| 111 | |
| 112 while 1: | |
| 113 nexteu2 = F6.readline() | |
| 114 if nexteu2[0]==">": break | |
| 115 | |
| 116 ###################################### | |
| 117 ### [B] FORMAT MATCH NAME 1st STEP ### | |
| 118 | |
| 119 if nexteu2 != '\n': | |
| 120 LL1 = string.split(nexteu2, " ") # specific NORTH database names !!!!!!! | |
| 121 match = LL1[0] #### SOUTH databank // NORTH will have "|" separators | |
| 122 list1.append(match) | |
| 123 | |
| 124 match2 = ">" + LL1[0] # more complete name // still specific NORTH database names !!!!!!! | |
| 125 list2.append(match2) #### SOUTH databank // NORTH will have "|" separators | |
| 126 | |
| 127 if MATCH == 0: ## Only read the 1rst line (i.e. the First Match) | |
| 128 break | |
| 129 else: ## Read the other lines (i.e. All the Matches) | |
| 130 continue | |
| 131 | |
| 132 ### [B] END FORMAT MATCH NAME 1st STEP ### | |
| 133 ########################################## | |
| 134 | |
| 135 break | |
| 136 | |
| 137 F6.close() | |
| 138 return(list1, list2, hit) # list1 = short name // list2 = more complete name | |
| 139 ####################################### | |
| 140 | |
| 141 | |
| 142 ######################################### | |
| 143 ### DEF3 : Get Information on matches ### | |
| 144 ######################################### | |
| 145 ### Function used in the next function (2.3.) | |
| 146 | |
| 147 def get_information_on_matches(list_of_line): | |
| 148 for line in list_of_line: | |
| 149 | |
| 150 ## Score and Expect | |
| 151 if "Score" in line: | |
| 152 line = line[:-1] # remove "\n" | |
| 153 S_line = string.split(line, " = ") | |
| 154 Expect = S_line[-1] ## ***** Expect | |
| 155 S_line2 = string.split(S_line[1], " bits ") | |
| 156 Score = string.atof(S_line2[0]) | |
| 157 | |
| 158 ## Identities/gaps/percent/divergence/length_matched | |
| 159 elif "Identities" in line: | |
| 160 line = line[:-1] # remove "\n" | |
| 161 g = 0 | |
| 162 | |
| 163 if "Gaps" in line: | |
| 164 pre_S_line = string.split(line, ",") | |
| 165 identity_line = pre_S_line[0] | |
| 166 gaps_line = pre_S_line[1] | |
| 167 g = 1 | |
| 168 else: | |
| 169 identity_line = line | |
| 170 g = 0 | |
| 171 | |
| 172 ## treat identity line | |
| 173 S_line = string.split(identity_line, " ") | |
| 174 | |
| 175 identities = S_line[-2] ## ***** identities | |
| 176 | |
| 177 S_line2 = string.split(identities, "/") | |
| 178 hits = string.atof(S_line2[0]) ## ***** hits | |
| 179 length_matched = string.atof(S_line2[1]) ## ***** length_matched | |
| 180 abs_nb_differences = length_matched - hits ## ***** abs_nb_differences | |
| 181 | |
| 182 | |
| 183 identity_percent = hits/length_matched * 100 ## ***** identity_percent | |
| 184 | |
| 185 divergence_percent = abs_nb_differences/length_matched*100 ## ***** divergence_percent | |
| 186 | |
| 187 ## treat gap line if any | |
| 188 if g ==1: # means there are gaps | |
| 189 S_line3 = string.split(gaps_line, " ") | |
| 190 gaps_part = S_line3[-2] | |
| 191 S_line4 = string.split(gaps_part, "/") | |
| 192 gaps_number = string.atoi(S_line4[0]) ## ***** gaps_number | |
| 193 | |
| 194 real_differences = abs_nb_differences - gaps_number ## ***** real_differences | |
| 195 real_divergence_percent = (real_differences/length_matched)*100 ## ***** real_divergence_percent | |
| 196 else: | |
| 197 gaps_number = 0 | |
| 198 real_differences = 0 | |
| 199 real_divergence_percent = divergence_percent | |
| 200 | |
| 201 ## Frame | |
| 202 elif "Frame" in line: | |
| 203 line = line[:-1] | |
| 204 S_line = string.split(line, " = ") | |
| 205 frame = S_line[1] | |
| 206 | |
| 207 list_informations=[length_matched, Expect, Score, identities, hits, identity_percent, divergence_percent,gaps_number, real_divergence_percent, frame, length_matched] | |
| 208 | |
| 209 return(list_informations) | |
| 210 ######################################## | |
| 211 | |
| 212 | |
| 213 ############################ | 31 ############################ |
| 214 ### DEF4 : get sequences ### | 32 ### DEF4 : get sequences ### |
| 215 ############################ | 33 ############################ |
| 216 ### [+ get informations from the function 2.2.] | 34 ### [+ get informations from the function 2.2.] |
| 217 | |
| 218 def get_sequences(query, list2, SUBMATCHEU,WORK_DIR): | 35 def get_sequences(query, list2, SUBMATCHEU,WORK_DIR): |
| 219 list_Pairwise = [] | 36 list_Pairwise = [] |
| 220 | 37 |
| 221 F7 = open("%s/blastRun3.tmp" %WORK_DIR, 'w') | 38 F7 = open("%s/blastRun3.tmp" %WORK_DIR, 'w') |
| 222 F7.write(bash1[query]) # bash1[query] ==> blast output for each query | 39 F7.write(bash1[query]) # bash1[query] ==> blast output for each query |
| 233 l = l+1 | 50 l = l+1 |
| 234 if name in n: | 51 if name in n: |
| 235 i = l | 52 i = l |
| 236 miniList.append(i) # content positions in the list "text1", of all begining of match (e.g. >gnl|UG|Apo#S51012099 [...]) | 53 miniList.append(i) # content positions in the list "text1", of all begining of match (e.g. >gnl|UG|Apo#S51012099 [...]) |
| 237 | 54 |
| 238 miniList.reverse() | 55 miniList.reverse() |
| 56 | |
| 239 if miniList != []: | 57 if miniList != []: |
| 240 length = len(miniList) | 58 length = len(miniList) |
| 59 | |
| 241 ii = 0 | 60 ii = 0 |
| 242 | |
| 243 Listing1 = [] | 61 Listing1 = [] |
| 244 while ii < length: | 62 while ii < length: |
| 245 iii = miniList[ii] | 63 iii = miniList[ii] |
| 246 entry = text1[iii:] | 64 entry = text1[iii:] |
| 247 text1 = text1[:iii] | 65 text1 = text1[:iii] |
| 277 ###################################### | 95 ###################################### |
| 278 ### [C] FORMAT MATCH NAME 2nd STEP ### | 96 ### [C] FORMAT MATCH NAME 2nd STEP ### |
| 279 | 97 |
| 280 BigFastaName = e1 ### LIST OF LINES <=> What is remaining after removing all the hit with "Score =", so all the text comprise between ">" and the first "Score =" ==> Include Match name & "Length & empty lines | 98 BigFastaName = e1 ### LIST OF LINES <=> What is remaining after removing all the hit with "Score =", so all the text comprise between ">" and the first "Score =" ==> Include Match name & "Length & empty lines |
| 281 | 99 |
| 282 SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME | 100 SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME |
| 101 | |
| 283 SmallFastaName = SmallFastaName[1:-1] ### remove ">" and "\n" | 102 SmallFastaName = SmallFastaName[1:-1] ### remove ">" and "\n" |
| 284 | 103 |
| 104 """ | |
| 105 3 lines below : only difference with S08 | |
| 106 """ | |
| 285 if SmallFastaName[-1] == " ": | 107 if SmallFastaName[-1] == " ": |
| 286 SmallFastaName = SmallFastaName[:-1] | 108 SmallFastaName = SmallFastaName[:-1] |
| 287 | |
| 288 PutInFastaName1 = SmallFastaName | 109 PutInFastaName1 = SmallFastaName |
| 289 | 110 |
| 290 ### [C] END FORMAT MATCH NAME 2nd STEP ### | 111 ### [C] END FORMAT MATCH NAME 2nd STEP ### |
| 291 ########################################## | 112 ########################################## |
| 292 | 113 |
| 387 F8.close() | 208 F8.close() |
| 388 return(list_Pairwise, list_info) | 209 return(list_Pairwise, list_info) |
| 389 ######################################### | 210 ######################################### |
| 390 | 211 |
| 391 | 212 |
| 392 ###################### | 213 ################### |
| 393 ### 2. RUN RUN RUN ### | 214 ### RUN RUN RUN ### |
| 394 ###################### | 215 ################### |
| 395 import string, os, time, re, sys | 216 import string, os, time, re, sys |
| 217 from functions import split_file, detect_Matches, get_information_on_matches | |
| 396 | 218 |
| 397 ## 1 ## INPUT/OUTPUT | 219 ## 1 ## INPUT/OUTPUT |
| 398 SHORT_FILE = sys.argv[1] #short-name-query_short-name-db | 220 SHORT_FILE = sys.argv[1] ## short-name-query_short-name-db |
| 399 | 221 |
| 222 """ | |
| 223 04 and 06 for S05, 11 and 13 for S08 | |
| 224 """ | |
| 400 path_in = "%s/04_outputBlast_%s.txt" %(SHORT_FILE, SHORT_FILE) | 225 path_in = "%s/04_outputBlast_%s.txt" %(SHORT_FILE, SHORT_FILE) |
| 401 file_out = open("%s/06_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE),"w") | 226 file_out = open("%s/06_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE),"w") |
| 402 | 227 |
| 403 ## 2 ## RUN | 228 ## 2 ## RUN |
| 404 ## create Bash1 ## | 229 ## create Bash1 ## |
| 405 bash1 = split_file(path_in, "TBLASTX") ## DEF1 ## | 230 bash1 = split_file(path_in, "TBLASTX") ### DEF1 ### |
| 406 | 231 |
| 407 ## detect and save match ## | 232 ## detect and save match ## |
| 408 list_hits =[] | 233 list_hits =[] |
| 409 list_no_hits = [] | 234 list_no_hits = [] |
| 410 j= 0 | 235 j= 0 |
| 412 lene = len(bash1.keys()) | 237 lene = len(bash1.keys()) |
| 413 for query in bash1.keys(): | 238 for query in bash1.keys(): |
| 414 j = j+1 | 239 j = j+1 |
| 415 | 240 |
| 416 ## 2.1. detect matches ## | 241 ## 2.1. detect matches ## |
| 417 list_match, list_match2, hit=detect_Matches(query, MATCH, SHORT_FILE) ### DEF2 ### | 242 list_match, list_match2, hit=detect_Matches(query, MATCH, SHORT_FILE, bash1) ### DEF2 ### |
| 418 | 243 |
| 419 if hit == 1: # match(es) | 244 if hit == 1: # match(es) |
| 420 list_hits.append(query) | 245 list_hits.append(query) |
| 421 if hit == 0: # no match for that sequence | 246 if hit == 0: # no match for that sequence |
| 422 list_no_hits.append(query) | 247 list_no_hits.append(query) |
| 423 | 248 |
| 424 ## 2.2. get sequences ## | 249 ## 2.2. get sequences ## |
| 425 if hit ==1: | 250 if hit ==1: |
| 426 list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, SHORT_FILE) ### FUNCTION ### | 251 list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, SHORT_FILE)# |
| 427 | 252 |
| 428 # divergencve | 253 # divergencve |
| 429 divergence = list_info[6] | 254 divergence = list_info[6] |
| 430 # gap number | 255 # gap number |
| 431 gap_number = list_info[7] | 256 gap_number = list_info[7] |
| 443 match_name = pairwise[2] | 268 match_name = pairwise[2] |
| 444 match_seq = pairwise[3] | 269 match_seq = pairwise[3] |
| 445 | 270 |
| 446 len_query_seq = len(query_seq) | 271 len_query_seq = len(query_seq) |
| 447 | 272 |
| 273 """ | |
| 274 4 lines below : only in S05 | |
| 275 """ | |
| 448 Lis1 = string.split(query_name, "||") | 276 Lis1 = string.split(query_name, "||") |
| 449 short_query_name = Lis1[0] | 277 short_query_name = Lis1[0] |
| 450 Lis2 = string.split(match_name, "||") | 278 Lis2 = string.split(match_name, "||") |
| 451 short_match_name = Lis2[0] | 279 short_match_name = Lis2[0] |
| 452 | 280 |
