Mercurial > repos > sanbi-uwc > summarize_poliovirus_alignment
annotate assess_alignment.py @ 0:7e49c6b19f5e draft
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
author | sanbi-uwc |
---|---|
date | Tue, 19 Jul 2022 11:47:08 +0000 |
parents | |
children |
rev | line source |
---|---|
0
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
1 #!/usr/bin/env python |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
2 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
3 import argparse |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
4 import json |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
5 import sys |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
6 from dataclasses import dataclass |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
7 from typing import TextIO |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
8 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
9 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
10 @dataclass |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
11 class Sample: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
12 name: str |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
13 reference: str |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
14 mismatches: int |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
15 quality: str |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
16 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
17 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
18 def load_json(json_file: TextIO) -> dict: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
19 data = json.load(json_file) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
20 if "msa" not in data: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
21 raise ValueError("MSA missing from JSON, cannot proceed") |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
22 if "gappedTraces" not in data: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
23 raise ValueError("gappedTraces missing from JSON, cannot proceed") |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
24 return data |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
25 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
26 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
27 def analyse_mismatches( |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
28 json_file: TextIO, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
29 offset: int, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
30 length: int, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
31 vp1only: bool = True, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
32 sec_is_conflict: bool = False, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
33 ) -> list: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
34 data = load_json(json_file) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
35 msas = [al for al in data["msa"] if not al["reference"]] |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
36 reference = None |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
37 for al in data["msa"]: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
38 if al["reference"]: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
39 if reference is None: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
40 reference = al |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
41 else: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
42 sys.exit( |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
43 "more than one reference found in JSON MSA list, cannot proceed" |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
44 ) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
45 min_start = min([int(al["leadingGaps"]) for al in msas]) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
46 max_end = max([int(al["leadingGaps"]) + len(al["align"]) for al in msas]) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
47 base_state = ["n"] * len(reference["align"]) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
48 mismatch_bases = {} |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
49 for i, base in enumerate(reference["align"]): |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
50 for k, al in enumerate(msas): |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
51 leading_gaps = int(al["leadingGaps"]) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
52 align_len = len(al["align"]) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
53 if leading_gaps < i and (leading_gaps + align_len) > i: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
54 vp1pos = i - offset |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
55 if vp1only and vp1pos < 0 or vp1pos > length: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
56 # skip positions outside of vp1 gene region |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
57 continue |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
58 al_base = al["align"][i - leading_gaps] |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
59 has_secondary_basecall = False |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
60 if sec_is_conflict: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
61 gappedTrace = data["gappedTraces"][k] |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
62 pos = i - int(gappedTrace["leadingGaps"]) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
63 # print(len(gappedTrace['basecallPos']), pos, k, len(gappedTrace['basecalls']), gappedTrace['basecallPos'][pos]) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
64 basecall_str = gappedTrace["basecalls"][ |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
65 str(gappedTrace["basecallPos"][pos]) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
66 ] |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
67 if "|" in basecall_str: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
68 has_secondary_basecall = True |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
69 # set this position to conflicted |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
70 base_state[i] = "C" |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
71 if al_base != base: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
72 # let's deal with all the cases where the base state doesn't match the reference |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
73 if base_state[i] == "G": |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
74 # the base state was G (a trace matches reference) and now we see a mismatch |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
75 base_state[i] = "C" |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
76 elif base_state[i] == "C": |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
77 # already marked as conflicting - a mismatch doesn't change that |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
78 pass |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
79 elif base_state[i] == "n" or base_state[i] == "M": |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
80 # we never saw this before or its already marked as a mismatch |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
81 base_state[i] = "M" |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
82 mismatch_bases[i] = al_base |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
83 else: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
84 sys.exit("unexpected base state: " + base_state[i]) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
85 else: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
86 if base_state[i] == "G" or base_state[i] == "n": |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
87 # we saw this before and got a match or |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
88 # we never saw this before |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
89 base_state[i] = "G" |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
90 elif base_state[i] == "M": |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
91 # we saw this before but it was a mismatch - mark this as a conflict |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
92 base_state[i] = "C" |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
93 if i in mismatch_bases: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
94 del mismatch_bases[i] |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
95 elif base_state[i] == "C": |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
96 # we have seen a conflict here before |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
97 pass |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
98 else: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
99 sys.exit("unexpected base_state: " + base_state[i]) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
100 conflicts = base_state.count("C") |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
101 matches = base_state.count("G") |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
102 mismatches = base_state.count("M") |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
103 mismatch_list = [] |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
104 for i, state in enumerate(base_state): |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
105 # i is in zero-based genome coordinates |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
106 if state == "M": |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
107 # for mismatch store [pos_in_genome, pos_in_vp1, reference_base, sequenced_base] |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
108 mismatch_list.append( |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
109 [i, i - offset, reference["align"][i], mismatch_bases[i]] |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
110 ) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
111 return [conflicts, matches, mismatches, mismatch_list] |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
112 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
113 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
114 def analyse_trace_quality(json_file: TextIO) -> float: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
115 data = load_json(json_file) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
116 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
117 traces = data["gappedTraces"] |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
118 overall_avg = 0 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
119 for trace in traces: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
120 start = min(trace["basecallPos"]) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
121 end = max(trace["basecallPos"]) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
122 call_quality = {} |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
123 avg_ratio = 0 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
124 for base in ("A", "C", "G", "T"): |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
125 calls = trace["peak" + base][start : end + 1] |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
126 min_call = min(calls) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
127 max_call = max(calls) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
128 avg_call = sum(calls) / len(calls) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
129 ratio = max_call / avg_call |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
130 call_quality["avg" + base] = avg_call |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
131 call_quality["min" + base] = min_call |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
132 call_quality["max" + base] = max_call |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
133 call_quality["ratio" + base] = ratio |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
134 avg_ratio += ratio |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
135 avg_ratio = avg_ratio / 4 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
136 overall_avg += avg_ratio |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
137 overall_avg = overall_avg / len(traces) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
138 return overall_avg |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
139 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
140 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
141 def comma_split(args: str) -> list[str]: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
142 return args.split(",") |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
143 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
144 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
145 if __name__ == "__main__": |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
146 parser = argparse.ArgumentParser() |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
147 parser.add_argument("--output_filename", help="Path to output file") |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
148 parser.add_argument("--sample_name", help="Name of sample being analysed") |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
149 parser.add_argument( |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
150 "--dataset_names", type=comma_split, help="Comma separated names for datasets" |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
151 ) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
152 parser.add_argument("--datasets", nargs="+") |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
153 args = parser.parse_args() |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
154 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
155 offsets = { |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
156 "poliovirus1sabin": 2480, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
157 "poliovirus2sabin": 2482, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
158 "poliovirus3sabin": 2477, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
159 } |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
160 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
161 lengths = { |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
162 "poliovirus1sabin": 906, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
163 "poliovirus2sabin": 903, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
164 "poliovirus3sabin": 900, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
165 } |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
166 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
167 min_mismatches = None |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
168 for file_index, json_filename in enumerate(args.datasets): |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
169 dataset_name = args.dataset_names[file_index].replace( |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
170 ".json", "" |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
171 ) # take the name but remove any json suffix |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
172 offset = offsets[dataset_name] |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
173 length = lengths[dataset_name] |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
174 (conflicts, matches, mismatches, mismatch_list) = analyse_mismatches( |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
175 open(json_filename), offset, length |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
176 ) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
177 # analyse_mismatches(json_filename, True) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
178 quality = analyse_trace_quality(open(json_filename)) |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
179 if min_mismatches is None or mismatches < min_mismatches: |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
180 min_mismatches = mismatches |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
181 best_match_mismatch_list = mismatch_list |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
182 best_match_quality = quality |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
183 best_match_reference = dataset_name |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
184 |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
185 info = { |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
186 "sample_name": args.sample_name, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
187 "best_reference": best_match_reference, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
188 "mismatches": min_mismatches, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
189 "mismatch_list": best_match_mismatch_list, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
190 "quality": best_match_quality, |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
191 } |
7e49c6b19f5e
planemo upload commit a99e10fec2fac5aae70974c977eb3b362a1a8429
sanbi-uwc
parents:
diff
changeset
|
192 json.dump(info, open(args.output_filename, "w")) |