Mercurial > repos > sanbi-uwc > assess_poliovirus_alignment

diff assess_alignment.py @ 9:acaaf49e2747 draft
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
author: sanbi-uwc
date: Tue, 27 Sep 2022 05:52:53 +0000
parents: 852e76e7d22a
children: fb905d0f8201
--- a/assess_alignment.py	Wed Sep 14 11:25:43 2022 +0000
+++ b/assess_alignment.py	Tue Sep 27 05:52:53 2022 +0000
@@ -46,7 +46,8 @@
     max_end = max([int(al["leadingGaps"]) + len(al["align"]) for al in msas])
     base_state = ["n"] * len(reference["align"])
     mismatch_bases = {}
-    for i, base in enumerate(reference["align"]):
+    consensus = ''
+    for i, reference_base in enumerate(reference["align"]):
         for k, al in enumerate(msas):
             leading_gaps = int(al["leadingGaps"])
             align_len = len(al["align"])
@@ -56,6 +57,7 @@
                     # skip positions outside of vp1 gene region
                     continue
                 al_base = al["align"][i - leading_gaps]
+                consensus += al_base
                 has_secondary_basecall = False
                 if sec_is_conflict:
                     gappedTrace = data["gappedTraces"][k]
@@ -68,7 +70,7 @@
                         has_secondary_basecall = True
                         # set this position to conflicted
                         base_state[i] = "C"
-                if al_base != base:
+                if al_base != reference_base:
                     # let's deal with all the cases where the base state doesn't match the reference
                     if base_state[i] == "G":
                         # the base state was G (a trace matches reference) and now we see a mismatch
@@ -108,7 +110,7 @@
             mismatch_list.append(
                 [i + 1, i - offset + 1, reference["align"][i], mismatch_bases[i]]
             )
-    return [conflicts, matches, mismatches, mismatch_list]
+    return [conflicts, matches, mismatches, mismatch_list, consensus]
 
 
 def analyse_trace_quality(json_file: TextIO) -> float:
@@ -174,7 +176,7 @@
         )  # take the name but remove any json suffix
         offset = offsets[dataset_name]
         length = lengths[dataset_name]
-        (conflicts, matches, mismatches, mismatch_list) = analyse_mismatches(
+        (conflicts, matches, mismatches, mismatch_list, consensus) = analyse_mismatches(
             open(json_filename), offset, length
         )
         # analyse_mismatches(json_filename, True)
@@ -184,7 +186,7 @@
             best_match_mismatch_list = mismatch_list
             best_match_quality = quality
             best_match_reference = dataset_name
-            best_consensus = open(args.consensi[file_index]).read().replace('>Consensus', f'>{args.sample_name}')
+            best_consensus = consensus
             percent_mismatches = round(min_mismatches / lengths[best_match_reference] * 100, 2)
 
     info = {
@@ -194,6 +196,7 @@
         "mismatch_list": best_match_mismatch_list,
         "quality": best_match_quality,
         "perc_mismatches": percent_mismatches,
+        "consensus": best_consensus
     }
     json.dump(info, open(args.output_filename, "w"))
author	sanbi-uwc
date	Tue, 27 Sep 2022 05:52:53 +0000
parents	852e76e7d22a
children	fb905d0f8201