annotate assess_alignment.py @ 0:7e49c6b19f5e draft

planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
author sanbi-uwc
date Tue, 19 Jul 2022 11:47:08 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
1 #!/usr/bin/env python
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
2
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
3 import argparse
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
4 import json
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
5 import sys
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
6 from dataclasses import dataclass
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
7 from typing import TextIO
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
8
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
9
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
10 @dataclass
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
11 class Sample:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
12 name: str
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
13 reference: str
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
14 mismatches: int
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
15 quality: str
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
16
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
17
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
18 def load_json(json_file: TextIO) -> dict:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
19 data = json.load(json_file)
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
20 if "msa" not in data:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
21 raise ValueError("MSA missing from JSON, cannot proceed")
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
22 if "gappedTraces" not in data:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
23 raise ValueError("gappedTraces missing from JSON, cannot proceed")
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
24 return data
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
25
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
26
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
27 def analyse_mismatches(
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
28 json_file: TextIO,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
29 offset: int,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
30 length: int,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
31 vp1only: bool = True,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
32 sec_is_conflict: bool = False,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
33 ) -> list:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
34 data = load_json(json_file)
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
35 msas = [al for al in data["msa"] if not al["reference"]]
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
36 reference = None
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
37 for al in data["msa"]:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
38 if al["reference"]:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
39 if reference is None:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
40 reference = al
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
41 else:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
42 sys.exit(
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
43 "more than one reference found in JSON MSA list, cannot proceed"
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
44 )
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
45 min_start = min([int(al["leadingGaps"]) for al in msas])
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
46 max_end = max([int(al["leadingGaps"]) + len(al["align"]) for al in msas])
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
47 base_state = ["n"] * len(reference["align"])
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
48 mismatch_bases = {}
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
49 for i, base in enumerate(reference["align"]):
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
50 for k, al in enumerate(msas):
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
51 leading_gaps = int(al["leadingGaps"])
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
52 align_len = len(al["align"])
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
53 if leading_gaps < i and (leading_gaps + align_len) > i:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
54 vp1pos = i - offset
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
55 if vp1only and vp1pos < 0 or vp1pos > length:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
56 # skip positions outside of vp1 gene region
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
57 continue
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
58 al_base = al["align"][i - leading_gaps]
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
59 has_secondary_basecall = False
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
60 if sec_is_conflict:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
61 gappedTrace = data["gappedTraces"][k]
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
62 pos = i - int(gappedTrace["leadingGaps"])
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
63 # print(len(gappedTrace['basecallPos']), pos, k, len(gappedTrace['basecalls']), gappedTrace['basecallPos'][pos])
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
64 basecall_str = gappedTrace["basecalls"][
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
65 str(gappedTrace["basecallPos"][pos])
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
66 ]
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
67 if "|" in basecall_str:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
68 has_secondary_basecall = True
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
69 # set this position to conflicted
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
70 base_state[i] = "C"
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
71 if al_base != base:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
72 # let's deal with all the cases where the base state doesn't match the reference
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
73 if base_state[i] == "G":
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
74 # the base state was G (a trace matches reference) and now we see a mismatch
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
75 base_state[i] = "C"
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
76 elif base_state[i] == "C":
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
77 # already marked as conflicting - a mismatch doesn't change that
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
78 pass
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
79 elif base_state[i] == "n" or base_state[i] == "M":
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
80 # we never saw this before or its already marked as a mismatch
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
81 base_state[i] = "M"
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
82 mismatch_bases[i] = al_base
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
83 else:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
84 sys.exit("unexpected base state: " + base_state[i])
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
85 else:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
86 if base_state[i] == "G" or base_state[i] == "n":
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
87 # we saw this before and got a match or
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
88 # we never saw this before
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
89 base_state[i] = "G"
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
90 elif base_state[i] == "M":
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
91 # we saw this before but it was a mismatch - mark this as a conflict
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
92 base_state[i] = "C"
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
93 if i in mismatch_bases:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
94 del mismatch_bases[i]
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
95 elif base_state[i] == "C":
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
96 # we have seen a conflict here before
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
97 pass
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
98 else:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
99 sys.exit("unexpected base_state: " + base_state[i])
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
100 conflicts = base_state.count("C")
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
101 matches = base_state.count("G")
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
102 mismatches = base_state.count("M")
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
103 mismatch_list = []
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
104 for i, state in enumerate(base_state):
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
105 # i is in zero-based genome coordinates
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
106 if state == "M":
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
107 # for mismatch store [pos_in_genome, pos_in_vp1, reference_base, sequenced_base]
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
108 mismatch_list.append(
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
109 [i, i - offset, reference["align"][i], mismatch_bases[i]]
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
110 )
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
111 return [conflicts, matches, mismatches, mismatch_list]
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
112
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
113
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
114 def analyse_trace_quality(json_file: TextIO) -> float:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
115 data = load_json(json_file)
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
116
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
117 traces = data["gappedTraces"]
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
118 overall_avg = 0
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
119 for trace in traces:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
120 start = min(trace["basecallPos"])
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
121 end = max(trace["basecallPos"])
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
122 call_quality = {}
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
123 avg_ratio = 0
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
124 for base in ("A", "C", "G", "T"):
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
125 calls = trace["peak" + base][start : end + 1]
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
126 min_call = min(calls)
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
127 max_call = max(calls)
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
128 avg_call = sum(calls) / len(calls)
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
129 ratio = max_call / avg_call
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
130 call_quality["avg" + base] = avg_call
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
131 call_quality["min" + base] = min_call
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
132 call_quality["max" + base] = max_call
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
133 call_quality["ratio" + base] = ratio
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
134 avg_ratio += ratio
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
135 avg_ratio = avg_ratio / 4
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
136 overall_avg += avg_ratio
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
137 overall_avg = overall_avg / len(traces)
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
138 return overall_avg
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
139
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
140
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
141 def comma_split(args: str) -> list[str]:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
142 return args.split(",")
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
143
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
144
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
145 if __name__ == "__main__":
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
146 parser = argparse.ArgumentParser()
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
147 parser.add_argument("--output_filename", help="Path to output file")
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
148 parser.add_argument("--sample_name", help="Name of sample being analysed")
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
149 parser.add_argument(
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
150 "--dataset_names", type=comma_split, help="Comma separated names for datasets"
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
151 )
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
152 parser.add_argument("--datasets", nargs="+")
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
153 args = parser.parse_args()
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
154
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
155 offsets = {
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
156 "poliovirus1sabin": 2480,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
157 "poliovirus2sabin": 2482,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
158 "poliovirus3sabin": 2477,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
159 }
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
160
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
161 lengths = {
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
162 "poliovirus1sabin": 906,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
163 "poliovirus2sabin": 903,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
164 "poliovirus3sabin": 900,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
165 }
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
166
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
167 min_mismatches = None
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
168 for file_index, json_filename in enumerate(args.datasets):
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
169 dataset_name = args.dataset_names[file_index].replace(
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
170 ".json", ""
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
171 ) # take the name but remove any json suffix
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
172 offset = offsets[dataset_name]
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
173 length = lengths[dataset_name]
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
174 (conflicts, matches, mismatches, mismatch_list) = analyse_mismatches(
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
175 open(json_filename), offset, length
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
176 )
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
177 # analyse_mismatches(json_filename, True)
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
178 quality = analyse_trace_quality(open(json_filename))
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
179 if min_mismatches is None or mismatches < min_mismatches:
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
180 min_mismatches = mismatches
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
181 best_match_mismatch_list = mismatch_list
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
182 best_match_quality = quality
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
183 best_match_reference = dataset_name
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
184
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
185 info = {
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
186 "sample_name": args.sample_name,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
187 "best_reference": best_match_reference,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
188 "mismatches": min_mismatches,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
189 "mismatch_list": best_match_mismatch_list,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
190 "quality": best_match_quality,
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
191 }
7e49c6b19f5e planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff changeset
192 json.dump(info, open(args.output_filename, "w"))