Mercurial > repos > abims-sbr > pairwise
comparison scripts/functions.py @ 4:6709645eff5d draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
| author | abims-sbr |
|---|---|
| date | Wed, 17 Jan 2018 08:53:53 -0500 |
| parents | |
| children | 471ed956ff13 |
comparison
equal
deleted
inserted
replaced
| 3:5f68b2fc02c1 | 4:6709645eff5d |
|---|---|
| 1 import string | |
| 2 | |
| 3 # Used in S05 and | |
| 4 def split_file(path_in, keyword): | |
| 5 | |
| 6 file_in = open(path_in, "r") | |
| 7 RUN = '' | |
| 8 BASH1={} | |
| 9 | |
| 10 with open(path_in, "r") as file_in: | |
| 11 for nextline in file_in.readlines(): | |
| 12 | |
| 13 ################################## | |
| 14 ### [A] FORMATTING QUERY NAME ### | |
| 15 | |
| 16 # Get query name | |
| 17 if nextline[0:6]=='Query=': | |
| 18 L1 = string.split(nextline, "||") | |
| 19 L2 = string.split(L1[0], " ") | |
| 20 query = L2[1] | |
| 21 if query[-1] == "\n": | |
| 22 query = query[:-1] | |
| 23 | |
| 24 ### [A] END FORMATTING QUERY NAME ### | |
| 25 ###################################### | |
| 26 | |
| 27 | |
| 28 ### split the file with keyword ### | |
| 29 if keyword in nextline: | |
| 30 # Two cases here: | |
| 31 #1# If it is the first "RUN" in the block (i.e. the first occurence of "BLASTN" in the file), we have just to add the new lines in the "RUN" list ... 2nd , we have also to detect the 'key' of bash1, which is the "query" name ... and third we will have to save this "RUN" in the bash1, once we will have detected a new "RUN" (i.e. a new line beginning with "BLASTN". | |
| 32 #2# If it isn't the first run, we have the save the previous "RUN" in the "bash1", before to re-initialize the RUN list (RUN =[]), before to append lines to the new "RUN" | |
| 33 | |
| 34 if RUN == '': # case #1# | |
| 35 RUN = RUN + nextline # we just added the first line of the file | |
| 36 | |
| 37 else: # case #2# (there was a run before) | |
| 38 BASH1[query] = RUN # add the previous run to the bash | |
| 39 RUN = '' # re-initialize the "RUN" | |
| 40 RUN = RUN + nextline # add the line starting with the keyword ("BLASTN") (except the first line of the file (the first "RUN") | |
| 41 | |
| 42 else: # Treatment of the subsequent lines of the one starting with the keyword ("BLASTN") (which is not treated here but previously) | |
| 43 RUN = RUN + nextline | |
| 44 | |
| 45 | |
| 46 if RUN: | |
| 47 BASH1[query] = RUN # add the last "RUN" | |
| 48 | |
| 49 | |
| 50 return(BASH1) | |
| 51 | |
| 52 def detect_Matches(query, MATCH, WORK_DIR, bash1): | |
| 53 F5 = open("%s/blastRun2.tmp" %WORK_DIR, 'w') | |
| 54 F5.write(bash1[query]) | |
| 55 F5.close() | |
| 56 | |
| 57 F6 = open("%s/blastRun2.tmp" %WORK_DIR, 'r') | |
| 58 list1 =[] | |
| 59 list2 =[] | |
| 60 | |
| 61 while 1: | |
| 62 nexteu = F6.readline() | |
| 63 | |
| 64 if not nexteu : break | |
| 65 | |
| 66 if "***** No hits found ******" in nexteu : | |
| 67 hit = 0 | |
| 68 break | |
| 69 | |
| 70 if 'Sequences producing significant alignments:' in nexteu: | |
| 71 hit = 1 | |
| 72 F6.readline() # jump a line | |
| 73 | |
| 74 while 1: | |
| 75 nexteu2 = F6.readline() | |
| 76 | |
| 77 if nexteu2[0]==">": break | |
| 78 | |
| 79 ###################################### | |
| 80 ### [B] FORMAT MATCH NAME 1st STEP ### | |
| 81 | |
| 82 if nexteu2 != '\n': | |
| 83 LL1 = string.split(nexteu2, " ") # specific NORTH database names !!!!!!! | |
| 84 match = LL1[0] #### SOUTH databank // NORTH will have "|" separators | |
| 85 list1.append(match) | |
| 86 | |
| 87 match2 = ">" + LL1[0] # more complete name // still specific NORTH database names !!!!!!! | |
| 88 list2.append(match2) #### SOUTH databank // NORTH will have "|" separators | |
| 89 | |
| 90 if MATCH == 0: ## Only read the 1rst line (i.e. the First Match) | |
| 91 break | |
| 92 else: ## Read the other lines (i.e. All the Matches) | |
| 93 continue | |
| 94 | |
| 95 ### [B] END FORMAT MATCH NAME 1st STEP ### | |
| 96 ########################################## | |
| 97 | |
| 98 break | |
| 99 | |
| 100 F6.close() | |
| 101 return(list1, list2, hit) # list1 = short name // list2 = more complete name | |
| 102 | |
| 103 def get_information_on_matches(list_of_line): | |
| 104 | |
| 105 for line in list_of_line: | |
| 106 | |
| 107 ## Score and Expect | |
| 108 if "Score" in line: | |
| 109 line = line[:-1] # remove "\n" | |
| 110 S_line = string.split(line, " = ") | |
| 111 Expect = S_line[-1] ## ***** Expect | |
| 112 S_line2 = string.split(S_line[1], " bits ") | |
| 113 Score = string.atof(S_line2[0]) | |
| 114 | |
| 115 ## Identities/gaps/percent/divergence/length_matched | |
| 116 elif "Identities" in line: | |
| 117 line = line[:-1] # remove "\n" | |
| 118 g = 0 | |
| 119 | |
| 120 if "Gaps" in line: | |
| 121 pre_S_line = string.split(line, ",") | |
| 122 identity_line = pre_S_line[0] | |
| 123 gaps_line = pre_S_line[1] | |
| 124 g = 1 | |
| 125 else: | |
| 126 identity_line = line | |
| 127 g = 0 | |
| 128 | |
| 129 ## treat identity line | |
| 130 S_line = string.split(identity_line, " ") | |
| 131 | |
| 132 identities = S_line[-2] ## ***** identities | |
| 133 | |
| 134 S_line2 = string.split(identities, "/") | |
| 135 hits = string.atof(S_line2[0]) ## ***** hits | |
| 136 length_matched = string.atof(S_line2[1]) ## ***** length_matched | |
| 137 abs_nb_differences = length_matched - hits ## ***** abs_nb_differences | |
| 138 | |
| 139 identity_percent = hits/length_matched * 100 ## ***** identity_percent | |
| 140 | |
| 141 divergence_percent = abs_nb_differences/length_matched*100 ## ***** divergence_percent | |
| 142 | |
| 143 ## treat gap line if any | |
| 144 if g ==1: # means there are gaps | |
| 145 S_line3 = string.split(gaps_line, " ") | |
| 146 gaps_part = S_line3[-2] | |
| 147 S_line4 = string.split(gaps_part, "/") | |
| 148 gaps_number = string.atoi(S_line4[0]) ## ***** gaps_number | |
| 149 | |
| 150 real_differences = abs_nb_differences - gaps_number ## ***** real_differences | |
| 151 real_divergence_percent = (real_differences/length_matched)*100 ## ***** real_divergence_percent | |
| 152 else: | |
| 153 gaps_number = 0 | |
| 154 real_differences = 0 | |
| 155 real_divergence_percent = divergence_percent | |
| 156 | |
| 157 ## Frame | |
| 158 elif "Frame" in line: | |
| 159 line = line[:-1] # remove "\n" | |
| 160 S_line = string.split(line, " = ") | |
| 161 frame = S_line[1] | |
| 162 | |
| 163 list_informations=[length_matched, Expect, Score, identities, hits, identity_percent, divergence_percent,gaps_number, real_divergence_percent, frame, length_matched] | |
| 164 | |
| 165 return(list_informations) | |
| 166 | |
| 167 # Used in S06, S09, S11 | |
| 168 def get_pairs(fasta_file_path): | |
| 169 F2 = open(fasta_file_path, "r") | |
| 170 list_pairwises = [] | |
| 171 while 1: | |
| 172 next2 = F2.readline() | |
| 173 if not next2: | |
| 174 break | |
| 175 if next2[0] == ">": | |
| 176 fasta_name_query = next2[:-1] | |
| 177 next3 = F2.readline() | |
| 178 fasta_seq_query = next3[:-1] | |
| 179 next3 = F2.readline() ## jump one empty line (if any after the sequence) | |
| 180 fasta_name_match = next3[:-1] | |
| 181 next3 = F2.readline() | |
| 182 fasta_seq_match = next3[:-1] | |
| 183 pairwise = [fasta_name_query,fasta_seq_query,fasta_name_match,fasta_seq_match] | |
| 184 | |
| 185 ## ADD pairwise with condition | |
| 186 list_pairwises.append(pairwise) | |
| 187 F2.close() | |
| 188 | |
| 189 return(list_pairwises) | |
| 190 | |
| 191 def extract_length(length_string): # format length string = 57...902 | |
| 192 l3 = string.split(length_string, "...") | |
| 193 n1 = string.atoi(l3[0]) | |
| 194 n2 = string.atoi(l3[1]) | |
| 195 length = n2-n1 | |
| 196 return(length) | |
| 197 | |
| 198 def filter_redondancy(list_paireu, MIN_LENGTH): | |
| 199 | |
| 200 bash1 = {} | |
| 201 list_pairout = [] | |
| 202 | |
| 203 for pair in list_paireu: | |
| 204 query_name = pair[0] | |
| 205 query_seq = pair[1] | |
| 206 match_name = pair[2] | |
| 207 match_seq = pair[3] | |
| 208 | |
| 209 l1 = string.split(query_name, "||") | |
| 210 short_query_name = l1[0][1:] | |
| 211 length_matched = extract_length(l1[1]) | |
| 212 l2 = string.split(match_name, "||") | |
| 213 short_match_name = l2[0][1:] | |
| 214 binom = "%s_%s" %(short_query_name, short_match_name) | |
| 215 | |
| 216 if binom not in bash1.keys(): | |
| 217 bash1[binom] = [query_name, query_seq, match_name, match_seq, length_matched] | |
| 218 else: | |
| 219 old_length = bash1[binom][-1] | |
| 220 if length_matched > old_length: | |
| 221 bash1[binom] = [query_name, query_seq, match_name, match_seq, length_matched] | |
| 222 | |
| 223 | |
| 224 for bino in bash1.keys(): | |
| 225 length = bash1[bino][-1] | |
| 226 if length > MIN_LENGTH: | |
| 227 list_pairout.append(bash1[bino]) | |
| 228 | |
| 229 return(list_pairout) |
