comparison shm_csr.py @ 6:ea9d5fc4c001 draft default tip

"planemo upload commit 9ada186a78831ca2618ec817a23a77de6adf1a5d"
author rhpvorderman
date Wed, 22 Dec 2021 11:29:16 +0000
parents 64d74ba01a7c
children
comparison
equal deleted inserted replaced
5:495a521cf9f2 6:ea9d5fc4c001
1 import argparse 1 import argparse
2 import logging 2 import logging
3 import sys 3 import sys
4 import os 4 import os
5 import re 5 import typing
6 from typing import Optional
6 7
7 from collections import defaultdict 8 from collections import defaultdict
9
10 REGION_FILTERS = ("leader", "FR1", "CDR1", "FR2", "CDR2")
11
12
13 class Mutation(typing.NamedTuple):
14 """Represent a mutation type as a tuple"""
15 frm: str # 'from' is a reserved python keyword.
16 where: int
17 to: str
18 frmAA: Optional[str] = None
19 whereAA: Optional[int] = None
20 toAA: Optional[str] = None
21 thing: Optional[str] = None # '(---)' or '(+-+)' etc. No idea
22
23 @classmethod
24 def from_string(cls, string: str):
25 # Complete mutation example: a88>g,I30>V(+ - +)
26 # Only nucleotide example: g303>t
27 if ',' in string:
28 nucleotide_change, aa_change = string.split(',', maxsplit=1) # type: str, Optional[str]
29 else:
30 nucleotide_change = string
31 aa_change = None
32 frm_part, to = nucleotide_change.split('>', maxsplit=1)
33 frm = frm_part[0]
34 where = int(frm_part[1:])
35
36 if aa_change is None:
37 return cls(frm, where, to)
38
39 frmAA_part, toAA_part = aa_change.split('>', maxsplit=1) # type: str, str
40 frmAA = frmAA_part[0]
41 whereAA = int(frmAA_part[1:])
42 brace_start = toAA_part.index('(')
43 toAA = toAA_part[:brace_start]
44 thing = toAA_part[brace_start:]
45 return cls(frm, where, to, frmAA, whereAA, toAA, thing)
46
47
48 class Hotspot(typing.NamedTuple):
49 start: int
50 end: int
51 region: str
52
53 @classmethod
54 def from_string(cls, string):
55 # Example: aa,40-41(FR1)
56 sequence, rest = string.split(',') # type: str, str
57 brace_pos = rest.index('(')
58 numbers = rest[:brace_pos]
59 start, end = numbers.split('-')
60 region = rest[brace_pos + 1:-1] # Remove the braces
61 return cls(int(start), int(end), region)
62
8 63
9 def main(): 64 def main():
10 parser = argparse.ArgumentParser() 65 parser = argparse.ArgumentParser()
11 parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") 66 parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation")
12 parser.add_argument("--genes", help="The genes available in the 'best_match' column") 67 parser.add_argument("--genes", help="The genes available in the 'best_match' column")
13 parser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=['leader', 'FR1', 'CDR1', 'FR2']) 68 parser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=REGION_FILTERS)
14 parser.add_argument("--output", help="Output file") 69 parser.add_argument("--output", help="Output file")
15 70
16 args = parser.parse_args() 71 args = parser.parse_args()
17 72
18 infile = args.input 73 infile = args.input
21 outfile = args.output 76 outfile = args.output
22 77
23 genedic = dict() 78 genedic = dict()
24 79
25 mutationdic = dict() 80 mutationdic = dict()
26 mutationMatcher = re.compile("^(.)(\d+).(.),?[ ]?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
27 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?.?([A-Z])?(.*)?")
28 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
29 mutationMatcher = re.compile(r"^([nactg])(\d+).([nactg]),?[ ]?([A-Z*])?(\d+)?[>]?([A-Z*;])?(.*)?")
30 NAMatchResult = (None, None, None, None, None, None, '') 81 NAMatchResult = (None, None, None, None, None, None, '')
31 geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}
32 linecount = 0 82 linecount = 0
33 83
34 IDIndex = 0 84 IDIndex = 0
35 best_matchIndex = 0 85 best_matchIndex = 0
36 fr1Index = 0 86 fr1Index = 0
40 fr3Index = 0 90 fr3Index = 0
41 first = True 91 first = True
42 IDlist = [] 92 IDlist = []
43 mutationList = [] 93 mutationList = []
44 mutationListByID = {} 94 mutationListByID = {}
45 cdr1LengthDic = {} 95 cdr1AALengthDic = {}
46 cdr2LengthDic = {} 96 cdr2AALengthDic = {}
47 97
48 fr1LengthDict = {} 98 LengthDic = {}
49 fr2LengthDict = {}
50 fr3LengthDict = {}
51 99
52 cdr1LengthIndex = 0 100 cdr1LengthIndex = 0
53 cdr2LengthIndex = 0 101 cdr2LengthIndex = 0
54
55 fr1SeqIndex = 0
56 fr2SeqIndex = 0
57 fr3SeqIndex = 0
58 102
59 tandem_sum_by_class = defaultdict(int) 103 tandem_sum_by_class = defaultdict(int)
60 expected_tandem_sum_by_class = defaultdict(float) 104 expected_tandem_sum_by_class = defaultdict(float)
61 105
62 with open(infile, 'r') as i: 106 with open(infile, 'r') as i:
68 fr1Index = linesplt.index("FR1.IMGT") 112 fr1Index = linesplt.index("FR1.IMGT")
69 cdr1Index = linesplt.index("CDR1.IMGT") 113 cdr1Index = linesplt.index("CDR1.IMGT")
70 fr2Index = linesplt.index("FR2.IMGT") 114 fr2Index = linesplt.index("FR2.IMGT")
71 cdr2Index = linesplt.index("CDR2.IMGT") 115 cdr2Index = linesplt.index("CDR2.IMGT")
72 fr3Index = linesplt.index("FR3.IMGT") 116 fr3Index = linesplt.index("FR3.IMGT")
73 cdr1LengthIndex = linesplt.index("CDR1.IMGT.length") 117 fr1LengthIndex = linesplt.index("FR1.IMGT.Nb.of.nucleotides")
74 cdr2LengthIndex = linesplt.index("CDR2.IMGT.length") 118 fr2LengthIndex = linesplt.index("FR2.IMGT.Nb.of.nucleotides")
75 fr1SeqIndex = linesplt.index("FR1.IMGT.seq") 119 fr3LengthIndex = linesplt.index("FR3.IMGT.Nb.of.nucleotides")
76 fr2SeqIndex = linesplt.index("FR2.IMGT.seq") 120 cdr1LengthIndex = linesplt.index("CDR1.IMGT.Nb.of.nucleotides")
77 fr3SeqIndex = linesplt.index("FR3.IMGT.seq") 121 cdr2LengthIndex = linesplt.index("CDR2.IMGT.Nb.of.nucleotides")
122 cdr1AALengthIndex = linesplt.index("CDR1.IMGT.length")
123 cdr2AALengthIndex = linesplt.index("CDR2.IMGT.length")
78 first = False 124 first = False
79 continue 125 continue
80 linecount += 1 126 linecount += 1
81 linesplt = line.split("\t") 127 linesplt = line.split("\t")
82 ID = linesplt[IDIndex] 128 ID = linesplt[IDIndex]
83 genedic[ID] = linesplt[best_matchIndex] 129 genedic[ID] = linesplt[best_matchIndex]
84 130
85 mutationdic[ID + "_FR1"] = [] 131 mutationdic[ID + "_FR1"] = []
86 if len(linesplt[fr1Index]) > 5 and empty_region_filter == "leader": 132 if len(linesplt[fr1Index]) > 5 and empty_region_filter == "leader":
87 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] 133 mutationdic[ID + "_FR1"] = [Mutation.from_string(x) for x in linesplt[fr1Index].split("|") if x]
88 134
89 mutationdic[ID + "_CDR1"] = [] 135 mutationdic[ID + "_CDR1"] = []
90 if len(linesplt[cdr1Index]) > 5 and empty_region_filter in ["leader", "FR1"]: 136 if len(linesplt[cdr1Index]) > 5 and empty_region_filter in ["leader", "FR1"]:
91 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] 137 mutationdic[ID + "_CDR1"] = [Mutation.from_string(x) for x in linesplt[cdr1Index].split("|") if x]
92 138
93 mutationdic[ID + "_FR2"] = [] 139 mutationdic[ID + "_FR2"] = []
94 if len(linesplt[fr2Index]) > 5 and empty_region_filter in ["leader", "FR1", "CDR1"]: 140 if len(linesplt[fr2Index]) > 5 and empty_region_filter in ["leader", "FR1", "CDR1"]:
95 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] 141 mutationdic[ID + "_FR2"] = [Mutation.from_string(x) for x in linesplt[fr2Index].split("|") if x]
96 142
97 mutationdic[ID + "_CDR2"] = [] 143 mutationdic[ID + "_CDR2"] = []
98 if len(linesplt[cdr2Index]) > 5: 144 if len(linesplt[cdr2Index]) > 5:
99 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] 145 mutationdic[ID + "_CDR2"] = [Mutation.from_string(x) for x in linesplt[cdr2Index].split("|") if x]
100 146
101 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] 147 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
102 148
103 mutationdic[ID + "_FR3"] = [] 149 mutationdic[ID + "_FR3"] = []
104 if len(linesplt[fr3Index]) > 5: 150 if len(linesplt[fr3Index]) > 5:
105 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] 151 mutationdic[ID + "_FR3"] = [Mutation.from_string(x) for x in linesplt[fr3Index].split("|") if x]
106 152
107 mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] 153 mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
108 mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] 154 mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
109 155
110 try: 156 fr1Length = int(linesplt[fr1LengthIndex])
111 cdr1Length = int(linesplt[cdr1LengthIndex]) 157 fr2Length = int(linesplt[fr2LengthIndex])
112 except: 158 fr3Length = int(linesplt[fr3LengthIndex])
113 cdr1Length = 0 159 cdr1Length = int(linesplt[cdr1LengthIndex])
114 160 cdr2Length = int(linesplt[cdr2LengthIndex])
115 try: 161 LengthDic[ID] = (fr1Length, cdr1Length, fr2Length, cdr2Length, fr3Length)
116 cdr2Length = int(linesplt[cdr2LengthIndex]) 162
117 except: 163 cdr1AALengthDic[ID] = int(linesplt[cdr1AALengthIndex])
118 cdr2Length = 0 164 cdr2AALengthDic[ID] = int(linesplt[cdr2AALengthIndex])
119
120 #print linesplt[fr2SeqIndex]
121 fr1Length = len(linesplt[fr1SeqIndex]) if empty_region_filter == "leader" else 0
122 fr2Length = len(linesplt[fr2SeqIndex]) if empty_region_filter in ["leader", "FR1", "CDR1"] else 0
123 fr3Length = len(linesplt[fr3SeqIndex])
124
125 cdr1LengthDic[ID] = cdr1Length
126 cdr2LengthDic[ID] = cdr2Length
127
128 fr1LengthDict[ID] = fr1Length
129 fr2LengthDict[ID] = fr2Length
130 fr3LengthDict[ID] = fr3Length
131 165
132 IDlist += [ID] 166 IDlist += [ID]
133 print("len(mutationdic) =", len(mutationdic)) 167 print("len(mutationdic) =", len(mutationdic))
134 168
135 with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle: 169 with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle:
153 mutations_by_id_dic[splt[0]] = int(splt[1]) 187 mutations_by_id_dic[splt[0]] = int(splt[1])
154 188
155 tandem_file = os.path.join(os.path.dirname(outfile), "tandems_by_id.txt") 189 tandem_file = os.path.join(os.path.dirname(outfile), "tandems_by_id.txt")
156 with open(tandem_file, 'w') as o: 190 with open(tandem_file, 'w') as o:
157 highest_tandem_length = 0 191 highest_tandem_length = 0
192 # LengthDic stores length as a tuple
193 # (fr1Length, cdr1Length, fr2Length, cdr2Length, fr3Length)
194 # To get the total length, we can sum(region_lengths)
195 # To get the total length for leader:
196 # sum(region_lengths[0:]) (Equivalent to everything)
197 # sum(region_lengths[1:]) Gets everything except FR1 etc.
198 # We determine the position to start summing below.
199 # This returns 0 for leader, 1 for FR1 etc.
200 length_start_pos = REGION_FILTERS.index(empty_region_filter)
158 201
159 o.write("Sequence.ID\tnumber_of_mutations\tnumber_of_tandems\tregion_length\texpected_tandems\tlongest_tandem\ttandems\n") 202 o.write("Sequence.ID\tnumber_of_mutations\tnumber_of_tandems\tregion_length\texpected_tandems\tlongest_tandem\ttandems\n")
160 for ID in IDlist: 203 for ID in IDlist:
161 mutations = mutationListByID[ID] 204 mutations = mutationListByID[ID]
205 region_length = sum(LengthDic[ID][length_start_pos:])
162 if len(mutations) == 0: 206 if len(mutations) == 0:
163 continue 207 continue
164 last_mut = max(mutations, key=lambda x: int(x[1])) 208 last_mut = max(mutations, key=lambda x: int(x[1]))
165 209
166 last_mut_pos = int(last_mut[1]) 210 last_mut_pos = int(last_mut[1])
192 236
193 if len(tandem_muts) > 0: 237 if len(tandem_muts) > 0:
194 if highest_tandem_length < len(tandem_muts): 238 if highest_tandem_length < len(tandem_muts):
195 highest_tandem_length = len(tandem_muts) 239 highest_tandem_length = len(tandem_muts)
196 240
197 region_length = fr1LengthDict[ID] + cdr1LengthDic[ID] + fr2LengthDict[ID] + cdr2LengthDic[ID] + fr3LengthDict[ID]
198 longest_tandem = max(tandem_muts, key=lambda x: x[1]) if len(tandem_muts) else (0, 0) 241 longest_tandem = max(tandem_muts, key=lambda x: x[1]) if len(tandem_muts) else (0, 0)
199 num_mutations = mutations_by_id_dic[ID] # len(mutations) 242 num_mutations = mutations_by_id_dic[ID] # len(mutations)
200 f_num_mutations = float(num_mutations) 243 f_num_mutations = float(num_mutations)
201 num_tandem_muts = len(tandem_muts) 244 num_tandem_muts = len(tandem_muts)
202 expected_tandem_muts = f_num_mutations * (f_num_mutations - 1.0) / float(region_length) 245 expected_tandem_muts = f_num_mutations * (f_num_mutations - 1.0) / float(region_length)
203 o.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(ID, 246 # String format and round disagree slightly (see 3.605).
204 str(num_mutations), 247 # So round before formatting.
205 str(num_tandem_muts), 248 o.write(f"{ID}\t{num_mutations}\t{num_tandem_muts}\t{region_length}\t"
206 str(region_length), 249 f"{round(expected_tandem_muts, 2):.2f}\t"
207 str(round(expected_tandem_muts, 2)), 250 f"{longest_tandem[1]}\t{tandem_muts}\n")
208 str(longest_tandem[1]),
209 str(tandem_muts)))
210 gene = genedic[ID] 251 gene = genedic[ID]
211 if gene.find("unmatched") == -1: 252 if gene.find("unmatched") == -1:
212 tandem_sum_by_class[gene] += num_tandem_muts 253 tandem_sum_by_class[gene] += num_tandem_muts
213 expected_tandem_sum_by_class[gene] += expected_tandem_muts 254 expected_tandem_sum_by_class[gene] += expected_tandem_muts
214 255
299 absentAACDR2Dic[7] = list(range(59,62)) 340 absentAACDR2Dic[7] = list(range(59,62))
300 absentAACDR2Dic[8] = list(range(59,61)) 341 absentAACDR2Dic[8] = list(range(59,61))
301 absentAACDR2Dic[9] = [60] 342 absentAACDR2Dic[9] = [60]
302 343
303 absentAA = [len(IDlist)] * (AALength-1) 344 absentAA = [len(IDlist)] * (AALength-1)
304 for k, cdr1Length in cdr1LengthDic.items(): 345 for k, cdr1Length in cdr1AALengthDic.items():
305 for c in absentAACDR1Dic[cdr1Length]: 346 for c in absentAACDR1Dic[cdr1Length]:
306 absentAA[c] -= 1 347 absentAA[c] -= 1
307 348
308 for k, cdr2Length in cdr2LengthDic.items(): 349 for k, cdr2Length in cdr2AALengthDic.items():
309 for c in absentAACDR2Dic[cdr2Length]: 350 for c in absentAACDR2Dic[cdr2Length]:
310 absentAA[c] -= 1 351 absentAA[c] -= 1
311 352
312 353
313 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt" 354 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt"
314 with open(aa_mutations_by_id_file, 'w') as o: 355 with open(aa_mutations_by_id_file, 'w') as o:
315 o.write("ID\tcdr1length\tcdr2length\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n") 356 o.write("ID\tcdr1length\tcdr2length\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n")
316 for ID in IDlist: 357 for ID in IDlist:
317 absentAAbyID = [1] * (AALength-1) 358 absentAAbyID = [1] * (AALength-1)
318 cdr1Length = cdr1LengthDic[ID] 359 cdr1Length = cdr1AALengthDic[ID]
319 for c in absentAACDR1Dic[cdr1Length]: 360 for c in absentAACDR1Dic[cdr1Length]:
320 absentAAbyID[c] -= 1 361 absentAAbyID[c] -= 1
321 362
322 cdr2Length = cdr2LengthDic[ID] 363 cdr2Length = cdr2AALengthDic[ID]
323 for c in absentAACDR2Dic[cdr2Length]: 364 for c in absentAACDR2Dic[cdr2Length]:
324 absentAAbyID[c] -= 1 365 absentAAbyID[c] -= 1
325 o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n") 366 o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n")
326 367
327 if linecount == 0: 368 if linecount == 0:
331 o.write("WRCY (%)," + ("0,0,0\n" * len(genes))) 372 o.write("WRCY (%)," + ("0,0,0\n" * len(genes)))
332 o.write("WA (%)," + ("0,0,0\n" * len(genes))) 373 o.write("WA (%)," + ("0,0,0\n" * len(genes)))
333 o.write("TW (%)," + ("0,0,0\n" * len(genes))) 374 o.write("TW (%)," + ("0,0,0\n" * len(genes)))
334 sys.exit() 375 sys.exit()
335 376
336 hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)")
337 RGYWCount = {} 377 RGYWCount = {}
338 WRCYCount = {} 378 WRCYCount = {}
339 WACount = {} 379 WACount = {}
340 TWCount = {} 380 TWCount = {}
341 381
356 first = False 396 first = False
357 continue 397 continue
358 linesplt = line.split("\t") 398 linesplt = line.split("\t")
359 gene = linesplt[best_matchIndex] 399 gene = linesplt[best_matchIndex]
360 ID = linesplt[IDIndex] 400 ID = linesplt[IDIndex]
361 RGYW = [(int(x), int(y), z) for (x, y, z) in 401 RGYW = [Hotspot.from_string(x) for x in linesplt[aggctatIndex].split("|") if x]
362 [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] 402 WRCY = [Hotspot.from_string(x) for x in linesplt[atagcctIndex].split("|") if x]
363 WRCY = [(int(x), int(y), z) for (x, y, z) in 403 WA = [Hotspot.from_string(x) for x in linesplt[ataIndex].split("|") if x]
364 [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] 404 TW = [Hotspot.from_string(x) for x in linesplt[tatIndex].split("|") if x]
365 WA = [(int(x), int(y), z) for (x, y, z) in
366 [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]]
367 TW = [(int(x), int(y), z) for (x, y, z) in
368 [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]
369 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 405 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0
370 406
371 with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "RGYW.txt"), 'a') as out_handle: 407 with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "RGYW.txt"), 'a') as out_handle:
372 for hotspot in RGYW: 408 for hotspot in RGYW:
373 out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in hotspot]))) 409 out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in hotspot])))
415 for start, end, region in motif_dic[motif]: 451 for start, end, region in motif_dic[motif]:
416 if start <= int(where) <= end: 452 if start <= int(where) <= end:
417 out_handle.write("{0}\n".format( 453 out_handle.write("{0}\n".format(
418 "\t".join([ 454 "\t".join([
419 ID, 455 ID,
420 where, 456 str(where),
421 region, 457 region,
422 frm, 458 frm,
423 to, 459 to,
424 str(AAwhere), 460 str(AAwhere),
425 str(AAfrm), 461 str(AAfrm),
481 with open(foutfile, 'w') as o: 517 with open(foutfile, 'w') as o:
482 for typ in arr: 518 for typ in arr:
483 o.write(typ + " (%)") 519 o.write(typ + " (%)")
484 curr = dic[typ] 520 curr = dic[typ]
485 for gene in genes: 521 for gene in genes:
486 geneMatcher = geneMatchers[gene]
487 if valuedic[gene + "_" + fname] is 0: 522 if valuedic[gene + "_" + fname] is 0:
488 o.write(",0,0,0") 523 o.write(",0,0,0")
489 else: 524 else:
490 x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.items() if geneMatcher.match(z)]], gene, func, fname) 525 x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.items() if z.startswith(gene)]], gene, func, fname)
491 o.write("," + x + "," + y + "," + z) 526 o.write("," + x + "," + y + "," + z)
492 x, y, z = get_xyz([y for x, y in curr.items() if not genedic[x].startswith("unmatched")], "total", func, fname) 527 x, y, z = get_xyz([y for x, y in curr.items() if not genedic[x].startswith("unmatched")], "total", func, fname)
493 #x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname) 528 #x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname)
494 o.write("," + x + "," + y + "," + z + "\n") 529 o.write("," + x + "," + y + "," + z + "\n")
495 530