assess_poliovirus_alignment: assess_alignment.py comparison

comparison assess_alignment.py @ 9:acaaf49e2747 draft

planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty

author	sanbi-uwc
date	Tue, 27 Sep 2022 05:52:53 +0000
parents	852e76e7d22a
children	fb905d0f8201

comparison

Legend: equal | deleted | inserted | replaced

-:852e76e7d22a
+:acaaf49e2747
 )
 min_start = min([int(al["leadingGaps"]) for al in msas])
 max_end = max([int(al["leadingGaps"]) + len(al["align"]) for al in msas])
 base_state = ["n"] * len(reference["align"])
 mismatch_bases = {}
-for i, base in enumerate(reference["align"]):
+consensus = ''
+for i, reference_base in enumerate(reference["align"]):
 for k, al in enumerate(msas):
 leading_gaps = int(al["leadingGaps"])
 align_len = len(al["align"])
 if leading_gaps < i and (leading_gaps + align_len) > i:
 vp1pos = i - offset
 if vp1only and vp1pos < 0 or vp1pos > length:
 # skip positions outside of vp1 gene region
 continue
 al_base = al["align"][i - leading_gaps]
+consensus += al_base
 has_secondary_basecall = False
 if sec_is_conflict:
 gappedTrace = data["gappedTraces"][k]
 pos = i - int(gappedTrace["leadingGaps"])
 # print(len(gappedTrace['basecallPos']), pos, k, len(gappedTrace['basecalls']), gappedTrace['basecallPos'][pos])
 ]
 if "|" in basecall_str:
 has_secondary_basecall = True
 # set this position to conflicted
 base_state[i] = "C"
-if al_base != base:
+if al_base != reference_base:
 # let's deal with all the cases where the base state doesn't match the reference
 if base_state[i] == "G":
 # the base state was G (a trace matches reference) and now we see a mismatch
 base_state[i] = "C"
 elif base_state[i] == "C":
 if state == "M":
 # for mismatch store [pos_in_genome, pos_in_vp1, reference_base, sequenced_base]
 mismatch_list.append(
 [i + 1, i - offset + 1, reference["align"][i], mismatch_bases[i]]
 )
-return [conflicts, matches, mismatches, mismatch_list]
+return [conflicts, matches, mismatches, mismatch_list, consensus]
 def analyse_trace_quality(json_file: TextIO) -> float:
 data = load_json(json_file)
 dataset_name = args.dataset_names[file_index].replace(
 ".json", ""
 )  # take the name but remove any json suffix
 offset = offsets[dataset_name]
 length = lengths[dataset_name]
-(conflicts, matches, mismatches, mismatch_list) = analyse_mismatches(
+(conflicts, matches, mismatches, mismatch_list, consensus) = analyse_mismatches(
 open(json_filename), offset, length
 )
 # analyse_mismatches(json_filename, True)
 quality = analyse_trace_quality(open(json_filename))
 if min_mismatches is None or mismatches < min_mismatches:
 min_mismatches = mismatches
 best_match_mismatch_list = mismatch_list
 best_match_quality = quality
 best_match_reference = dataset_name
-best_consensus = open(args.consensi[file_index]).read().replace('>Consensus', f'>{args.sample_name}')
+best_consensus = consensus
 percent_mismatches = round(min_mismatches / lengths[best_match_reference] * 100, 2)
 info = {
 "sample_name": args.sample_name,
 "best_reference": best_match_reference,
 "mismatches": min_mismatches,
 "mismatch_list": best_match_mismatch_list,
 "quality": best_match_quality,
 "perc_mismatches": percent_mismatches,
+"consensus": best_consensus
 }
 json.dump(info, open(args.output_filename, "w"))
 open(args.consensus_output_filename, "w").write(best_consensus)

Mercurial > repos > sanbi-uwc > assess_poliovirus_alignment

comparison assess_alignment.py @ 9:acaaf49e2747 draft