Mercurial > repos > sanbi-uwc > assess_poliovirus_alignment
annotate assess_alignment.py @ 13:bf0bebcb6bb1 draft default tip
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
| author | sanbi-uwc |
|---|---|
| date | Fri, 11 Nov 2022 06:18:15 +0000 |
| parents | ee98dcd9aad4 |
| children |
| rev | line source |
|---|---|
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
1 #!/usr/bin/env python |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
2 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
3 import argparse |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
4 import json |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
5 import sys |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
6 from dataclasses import dataclass |
|
10
fb905d0f8201
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
sanbi-uwc
parents:
9
diff
changeset
|
7 from textwrap import fill |
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
8 from typing import TextIO |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
9 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
10 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
11 @dataclass |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
12 class Sample: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
13 name: str |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
14 reference: str |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
15 mismatches: int |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
16 quality: str |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
17 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
18 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
19 def load_json(json_file: TextIO) -> dict: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
20 data = json.load(json_file) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
21 if "msa" not in data: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
22 raise ValueError("MSA missing from JSON, cannot proceed") |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
23 if "gappedTraces" not in data: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
24 raise ValueError("gappedTraces missing from JSON, cannot proceed") |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
25 return data |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
26 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
27 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
28 def analyse_mismatches( |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
29 json_file: TextIO, |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
30 offset: int, |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
31 length: int, |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
32 vp1only: bool = True, |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
33 sec_is_conflict: bool = False, |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
34 ) -> list: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
35 data = load_json(json_file) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
36 msas = [al for al in data["msa"] if not al["reference"]] |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
37 reference = None |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
38 for al in data["msa"]: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
39 if al["reference"]: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
40 if reference is None: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
41 reference = al |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
42 else: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
43 sys.exit( |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
44 "more than one reference found in JSON MSA list, cannot proceed" |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
45 ) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
46 min_start = min([int(al["leadingGaps"]) for al in msas]) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
47 max_end = max([int(al["leadingGaps"]) + len(al["align"]) for al in msas]) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
48 base_state = ["n"] * len(reference["align"]) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
49 mismatch_bases = {} |
|
10
fb905d0f8201
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
sanbi-uwc
parents:
9
diff
changeset
|
50 base_start = 0 |
|
9
acaaf49e2747
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
sanbi-uwc
parents:
8
diff
changeset
|
51 for i, reference_base in enumerate(reference["align"]): |
|
10
fb905d0f8201
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
sanbi-uwc
parents:
9
diff
changeset
|
52 if vp1only and not base_start: |
|
fb905d0f8201
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
sanbi-uwc
parents:
9
diff
changeset
|
53 if data['gappedConsensus'][i] != '-': |
|
fb905d0f8201
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
sanbi-uwc
parents:
9
diff
changeset
|
54 base_start = i |
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
55 for k, al in enumerate(msas): |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
56 leading_gaps = int(al["leadingGaps"]) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
57 align_len = len(al["align"]) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
58 if leading_gaps < i and (leading_gaps + align_len) > i: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
59 vp1pos = i - offset |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
60 if vp1only and vp1pos < 0 or vp1pos > length: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
61 # skip positions outside of vp1 gene region |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
62 continue |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
63 al_base = al["align"][i - leading_gaps] |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
64 has_secondary_basecall = False |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
65 if sec_is_conflict: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
66 gappedTrace = data["gappedTraces"][k] |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
67 pos = i - int(gappedTrace["leadingGaps"]) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
68 # print(len(gappedTrace['basecallPos']), pos, k, len(gappedTrace['basecalls']), gappedTrace['basecallPos'][pos]) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
69 basecall_str = gappedTrace["basecalls"][ |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
70 str(gappedTrace["basecallPos"][pos]) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
71 ] |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
72 if "|" in basecall_str: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
73 has_secondary_basecall = True |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
74 # set this position to conflicted |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
75 base_state[i] = "C" |
|
9
acaaf49e2747
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
sanbi-uwc
parents:
8
diff
changeset
|
76 if al_base != reference_base: |
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
77 # let's deal with all the cases where the base state doesn't match the reference |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
78 if base_state[i] == "G": |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
79 # the base state was G (a trace matches reference) and now we see a mismatch |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
80 base_state[i] = "C" |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
81 elif base_state[i] == "C": |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
82 # already marked as conflicting - a mismatch doesn't change that |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
83 pass |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
84 elif base_state[i] == "n" or base_state[i] == "M": |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
85 # we never saw this before or its already marked as a mismatch |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
86 base_state[i] = "M" |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
87 mismatch_bases[i] = al_base |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
88 else: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
89 sys.exit("unexpected base state: " + base_state[i]) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
90 else: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
91 if base_state[i] == "G" or base_state[i] == "n": |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
92 # we saw this before and got a match or |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
93 # we never saw this before |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
94 base_state[i] = "G" |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
95 elif base_state[i] == "M": |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
96 # we saw this before but it was a mismatch - mark this as a conflict |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
97 base_state[i] = "C" |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
98 if i in mismatch_bases: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
99 del mismatch_bases[i] |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
100 elif base_state[i] == "C": |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
101 # we have seen a conflict here before |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
102 pass |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
103 else: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
104 sys.exit("unexpected base_state: " + base_state[i]) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
105 conflicts = base_state.count("C") |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
106 matches = base_state.count("G") |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
107 mismatches = base_state.count("M") |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
108 mismatch_list = [] |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
109 for i, state in enumerate(base_state): |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
110 # i is in zero-based genome coordinates |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
111 if state == "M": |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
112 # for mismatch store [pos_in_genome, pos_in_vp1, reference_base, sequenced_base] |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
113 mismatch_list.append( |
|
4
1897677e107c
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
2
diff
changeset
|
114 [i + 1, i - offset + 1, reference["align"][i], mismatch_bases[i]] |
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
115 ) |
|
10
fb905d0f8201
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
sanbi-uwc
parents:
9
diff
changeset
|
116 if vp1only: |
|
13
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
117 # a note on gap trimming: if the consensus is smaller than VP1, we might |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
118 # get gaps on either side. these will of course be trimmed. |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
119 # we don't expect gaps inside the consensus but in the rare event that |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
120 # these occur they are also discarded. |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
121 consensus = data['gappedConsensus'][offset:offset+length].replace('-','') |
|
10
fb905d0f8201
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
sanbi-uwc
parents:
9
diff
changeset
|
122 else: |
|
13
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
123 # TODO: sometimes there is "orphan" sequence from false calls far in front |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
124 # or behind the consensus region. these should be removed somehow |
|
10
fb905d0f8201
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
sanbi-uwc
parents:
9
diff
changeset
|
125 consensus = data['gapFreeConsensus'] |
|
9
acaaf49e2747
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
sanbi-uwc
parents:
8
diff
changeset
|
126 return [conflicts, matches, mismatches, mismatch_list, consensus] |
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
127 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
128 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
129 def analyse_trace_quality(json_file: TextIO) -> float: |
|
13
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
130 # TODO: consider clipping this to VP1 region or at least aligned region |
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
131 data = load_json(json_file) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
132 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
133 traces = data["gappedTraces"] |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
134 overall_avg = 0 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
135 for trace in traces: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
136 start = min(trace["basecallPos"]) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
137 end = max(trace["basecallPos"]) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
138 call_quality = {} |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
139 avg_ratio = 0 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
140 for base in ("A", "C", "G", "T"): |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
141 calls = trace["peak" + base][start : end + 1] |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
142 min_call = min(calls) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
143 max_call = max(calls) |
|
13
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
144 if len(calls) == 0: |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
145 # no calls for this base to deal with, skip it |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
146 continue |
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
147 avg_call = sum(calls) / len(calls) |
|
13
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
148 if avg_call == 0: |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
149 # zero average base quality! |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
150 ratio = 0 |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
151 else: |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
152 ratio = max_call / avg_call |
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
153 call_quality["avg" + base] = avg_call |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
154 call_quality["min" + base] = min_call |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
155 call_quality["max" + base] = max_call |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
156 call_quality["ratio" + base] = ratio |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
157 avg_ratio += ratio |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
158 avg_ratio = avg_ratio / 4 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
159 overall_avg += avg_ratio |
|
13
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
160 if len(traces) > 0: |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
161 overall_avg = overall_avg / len(traces) |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
162 else: |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
163 overall_avg = 0 |
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
164 return overall_avg |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
165 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
166 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
167 def comma_split(args: str) -> list[str]: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
168 return args.split(",") |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
169 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
170 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
171 if __name__ == "__main__": |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
172 parser = argparse.ArgumentParser() |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
173 parser.add_argument("--output_filename", help="Path to output file") |
|
5
0e556a3f85d6
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
4
diff
changeset
|
174 parser.add_argument("--consensus_output_filename", help="Path to output file for best consensus") |
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
175 parser.add_argument("--sample_name", help="Name of sample being analysed") |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
176 parser.add_argument( |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
177 "--dataset_names", type=comma_split, help="Comma separated names for datasets" |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
178 ) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
179 parser.add_argument("--datasets", nargs="+") |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
180 args = parser.parse_args() |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
181 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
182 offsets = { |
|
2
31ca16290d4f
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
0
diff
changeset
|
183 # these are in 0-based coordinates, so off-by-one from NCBI 1-based coordinates |
|
31ca16290d4f
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
0
diff
changeset
|
184 "poliovirus1sabin": 2479, # V01150 |
|
31ca16290d4f
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
0
diff
changeset
|
185 "poliovirus2sabin": 2481, # AY184220 |
|
31ca16290d4f
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
0
diff
changeset
|
186 "poliovirus3sabin": 2478, # X00925 |
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
187 } |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
188 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
189 lengths = { |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
190 "poliovirus1sabin": 906, |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
191 "poliovirus2sabin": 903, |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
192 "poliovirus3sabin": 900, |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
193 } |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
194 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
195 min_mismatches = None |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
196 for file_index, json_filename in enumerate(args.datasets): |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
197 dataset_name = args.dataset_names[file_index].replace( |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
198 ".json", "" |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
199 ) # take the name but remove any json suffix |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
200 offset = offsets[dataset_name] |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
201 length = lengths[dataset_name] |
|
9
acaaf49e2747
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
sanbi-uwc
parents:
8
diff
changeset
|
202 (conflicts, matches, mismatches, mismatch_list, consensus) = analyse_mismatches( |
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
203 open(json_filename), offset, length |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
204 ) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
205 # analyse_mismatches(json_filename, True) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
206 quality = analyse_trace_quality(open(json_filename)) |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
207 if min_mismatches is None or mismatches < min_mismatches: |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
208 min_mismatches = mismatches |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
209 best_match_mismatch_list = mismatch_list |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
210 best_match_quality = quality |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
211 best_match_reference = dataset_name |
|
9
acaaf49e2747
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
sanbi-uwc
parents:
8
diff
changeset
|
212 best_consensus = consensus |
|
13
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
213 percent_mismatches = round(min_mismatches / lengths[best_match_reference] * 100, 2), |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
214 vp1_coverage_perc = round(len(consensus) / lengths[best_match_reference] * 100) |
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
215 |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
216 info = { |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
217 "sample_name": args.sample_name, |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
218 "best_reference": best_match_reference, |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
219 "mismatches": min_mismatches, |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
220 "mismatch_list": best_match_mismatch_list, |
|
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
221 "quality": best_match_quality, |
|
5
0e556a3f85d6
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
4
diff
changeset
|
222 "perc_mismatches": percent_mismatches, |
|
13
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
223 "consensus": best_consensus, |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
224 "vp1_coverage_perc": vp1_coverage_perc |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
225 } |
|
0
8bc106442b1a
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
diff
changeset
|
226 json.dump(info, open(args.output_filename, "w")) |
|
5
0e556a3f85d6
planemo upload for repository https://github.com/pvanheus/polio_report commit a99e10fec2fac5aae70974c977eb3b362a1a8429-dirty
sanbi-uwc
parents:
4
diff
changeset
|
227 |
|
13
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
228 if len(best_consensus): |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
229 consensus_record = f'>{args.sample_name}\n' + fill(best_consensus) + '\n' |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
230 else: |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
231 # better to write an empty FASTA file than one with just a header |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
232 consensus_record = '' |
|
bf0bebcb6bb1
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
sanbi-uwc
parents:
12
diff
changeset
|
233 open(args.consensus_output_filename, "w").write(consensus_record) |
