Mercurial > repos > sanbi-uwc > assess_poliovirus_alignment

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/assess_alignment.py	Tue Jul 19 11:54:46 2022 +0000
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+
+import argparse
+import json
+import sys
+from dataclasses import dataclass
+from typing import TextIO
+
+
+@dataclass
+class Sample:
+    name: str
+    reference: str
+    mismatches: int
+    quality: str
+
+
+def load_json(json_file: TextIO) -> dict:
+    data = json.load(json_file)
+    if "msa" not in data:
+        raise ValueError("MSA missing from JSON, cannot proceed")
+    if "gappedTraces" not in data:
+        raise ValueError("gappedTraces missing from JSON, cannot proceed")
+    return data
+
+
+def analyse_mismatches(
+    json_file: TextIO,
+    offset: int,
+    length: int,
+    vp1only: bool = True,
+    sec_is_conflict: bool = False,
+) -> list:
+    data = load_json(json_file)
+    msas = [al for al in data["msa"] if not al["reference"]]
+    reference = None
+    for al in data["msa"]:
+        if al["reference"]:
+            if reference is None:
+                reference = al
+            else:
+                sys.exit(
+                    "more than one reference found in JSON MSA list, cannot proceed"
+                )
+    min_start = min([int(al["leadingGaps"]) for al in msas])
+    max_end = max([int(al["leadingGaps"]) + len(al["align"]) for al in msas])
+    base_state = ["n"] * len(reference["align"])
+    mismatch_bases = {}
+    for i, base in enumerate(reference["align"]):
+        for k, al in enumerate(msas):
+            leading_gaps = int(al["leadingGaps"])
+            align_len = len(al["align"])
+            if leading_gaps < i and (leading_gaps + align_len) > i:
+                vp1pos = i - offset
+                if vp1only and vp1pos < 0 or vp1pos > length:
+                    # skip positions outside of vp1 gene region
+                    continue
+                al_base = al["align"][i - leading_gaps]
+                has_secondary_basecall = False
+                if sec_is_conflict:
+                    gappedTrace = data["gappedTraces"][k]
+                    pos = i - int(gappedTrace["leadingGaps"])
+                    # print(len(gappedTrace['basecallPos']), pos, k, len(gappedTrace['basecalls']), gappedTrace['basecallPos'][pos])
+                    basecall_str = gappedTrace["basecalls"][
+                        str(gappedTrace["basecallPos"][pos])
+                    ]
+                    if "|" in basecall_str:
+                        has_secondary_basecall = True
+                        # set this position to conflicted
+                        base_state[i] = "C"
+                if al_base != base:
+                    # let's deal with all the cases where the base state doesn't match the reference
+                    if base_state[i] == "G":
+                        # the base state was G (a trace matches reference) and now we see a mismatch
+                        base_state[i] = "C"
+                    elif base_state[i] == "C":
+                        # already marked as conflicting - a mismatch doesn't change that
+                        pass
+                    elif base_state[i] == "n" or base_state[i] == "M":
+                        # we never saw this before or its already marked as a mismatch
+                        base_state[i] = "M"
+                        mismatch_bases[i] = al_base
+                    else:
+                        sys.exit("unexpected base state: " + base_state[i])
+                else:
+                    if base_state[i] == "G" or base_state[i] == "n":
+                        # we saw this before and got a match or
+                        # we never saw this before
+                        base_state[i] = "G"
+                    elif base_state[i] == "M":
+                        # we saw this before but it was a mismatch - mark this as a conflict
+                        base_state[i] = "C"
+                        if i in mismatch_bases:
+                            del mismatch_bases[i]
+                    elif base_state[i] == "C":
+                        # we have seen a conflict here before
+                        pass
+                    else:
+                        sys.exit("unexpected base_state: " + base_state[i])
+    conflicts = base_state.count("C")
+    matches = base_state.count("G")
+    mismatches = base_state.count("M")
+    mismatch_list = []
+    for i, state in enumerate(base_state):
+        # i is in zero-based genome coordinates
+        if state == "M":
+            # for mismatch store [pos_in_genome, pos_in_vp1, reference_base, sequenced_base]
+            mismatch_list.append(
+                [i, i - offset, reference["align"][i], mismatch_bases[i]]
+            )
+    return [conflicts, matches, mismatches, mismatch_list]
+
+
+def analyse_trace_quality(json_file: TextIO) -> float:
+    data = load_json(json_file)
+
+    traces = data["gappedTraces"]
+    overall_avg = 0
+    for trace in traces:
+        start = min(trace["basecallPos"])
+        end = max(trace["basecallPos"])
+        call_quality = {}
+        avg_ratio = 0
+        for base in ("A", "C", "G", "T"):
+            calls = trace["peak" + base][start : end + 1]
+            min_call = min(calls)
+            max_call = max(calls)
+            avg_call = sum(calls) / len(calls)
+            ratio = max_call / avg_call
+            call_quality["avg" + base] = avg_call
+            call_quality["min" + base] = min_call
+            call_quality["max" + base] = max_call
+            call_quality["ratio" + base] = ratio
+            avg_ratio += ratio
+        avg_ratio = avg_ratio / 4
+        overall_avg += avg_ratio
+    overall_avg = overall_avg / len(traces)
+    return overall_avg
+
+
+def comma_split(args: str) -> list[str]:
+    return args.split(",")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--output_filename", help="Path to output file")
+    parser.add_argument("--sample_name", help="Name of sample being analysed")
+    parser.add_argument(
+        "--dataset_names", type=comma_split, help="Comma separated names for datasets"
+    )
+    parser.add_argument("--datasets", nargs="+")
+    args = parser.parse_args()
+
+    offsets = {
+        "poliovirus1sabin": 2480,
+        "poliovirus2sabin": 2482,
+        "poliovirus3sabin": 2477,
+    }
+
+    lengths = {
+        "poliovirus1sabin": 906,
+        "poliovirus2sabin": 903,
+        "poliovirus3sabin": 900,
+    }
+
+    min_mismatches = None
+    for file_index, json_filename in enumerate(args.datasets):
+        dataset_name = args.dataset_names[file_index].replace(
+            ".json", ""
+        )  # take the name but remove any json suffix
+        offset = offsets[dataset_name]
+        length = lengths[dataset_name]
+        (conflicts, matches, mismatches, mismatch_list) = analyse_mismatches(
+            open(json_filename), offset, length
+        )
+        # analyse_mismatches(json_filename, True)
+        quality = analyse_trace_quality(open(json_filename))
+        if min_mismatches is None or mismatches < min_mismatches:
+            min_mismatches = mismatches
+            best_match_mismatch_list = mismatch_list
+            best_match_quality = quality
+            best_match_reference = dataset_name
+
+    info = {
+        "sample_name": args.sample_name,
+        "best_reference": best_match_reference,
+        "mismatches": min_mismatches,
+        "mismatch_list": best_match_mismatch_list,
+        "quality": best_match_quality,
+    }
+    json.dump(info, open(args.output_filename, "w"))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/assess_poliovirus_alignment.xml	Tue Jul 19 11:54:46 2022 +0000
@@ -0,0 +1,51 @@
+<tool id="assess_poliovirus_alignment" name="Assess poliovirus alignment" version="0.1.0+galaxy0" profile="21.05">
+    <requirements>
+        <requirement type="package" version="3.9">python</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        #set $dataset_names = ','.join([ dataset.name for dataset in $tracy_results ])
+        python $__tool_directory__/assess_alignment.py
+            --dataset_names '$dataset_names'
+            --output_filename '$output1'
+            --sample_name '$tracy_results.name'
+            --datasets
+        #for $dataset in $tracy_results
+             '$dataset'
+        #end for
+
+    ]]></command>
+    <inputs>
+        <!-- input is list of reports for poliovirus sabin 1, 2 and 3 -->
+        <param name="tracy_results" format="json" type="data_collection" collection_type="list" label="Tracy JSON reports" help="Input is a list of tracy assemble JSON reports for assembly guided by there reference sequences for poliovirus sabin 1, 2 and 3" />
+    </inputs>
+    <outputs>
+        <data name="output1" format="json" label="Poliovirus alignment assessment on ${on_string}" />
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="tracy_results" ftype="json">
+                <collection type="list">
+                    <element name="poliovirus1sabin" value="sample1/poliovirus1sabin.json" />
+                    <element name="poliovirus2sabin" value="sample1/poliovirus2sabin.json" />
+                    <element name="poliovirus3sabin" value="sample1/poliovirus3sabin.json" />
+                </collection>
+            </param>
+            <output name="output1" ftype="json" file="sample1_output.json" />
+        </test>
+    </tests>
+    <help><![CDATA[
+        Given the output of the poliovirus pipeline, make a JSON summarising the comparison against the three poliovirus Sabin reference genomes.
+    ]]></help>
+    <citations>
+        <citation type="bibtex"><![CDATA[
+            @software{van_Heusden_Poliovirus_variation_reporting_2022,
+                author = {van Heusden, Peter},
+                month = {7},
+                title = {{Poliovirus variation reporting scripts}},
+                url = {https://github.com/pvanheus/polio_report},
+                version = {0.1.0},
+                year = {2022}
+                }
+        ]]></citation>
+    </citations>
+</tool>
\ No newline at end of file