Mercurial > repos > sanbi-uwc > assess_poliovirus_alignment
changeset 13:bf0bebcb6bb1 draft default tip
planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty
author | sanbi-uwc |
---|---|
date | Fri, 11 Nov 2022 06:18:15 +0000 |
parents | ee98dcd9aad4 |
children | |
files | assess_alignment.py assess_poliovirus_alignment.xml |
diffstat | 2 files changed, 32 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
--- a/assess_alignment.py Tue Sep 27 18:56:28 2022 +0000 +++ b/assess_alignment.py Fri Nov 11 06:18:15 2022 +0000 @@ -114,18 +114,20 @@ [i + 1, i - offset + 1, reference["align"][i], mismatch_bases[i]] ) if vp1only: - # we have trim consensus bases before vp1 region - if base_start <= offset: - cons_start = offset - base_start - else: - cons_start = 0 - consensus = data['gapFreeConsensus'][cons_start:cons_start + length] + # a note on gap trimming: if the consensus is smaller than VP1, we might + # get gaps on either side. these will of course be trimmed. + # we don't expect gaps inside the consensus but in the rare event that + # these occur they are also discarded. + consensus = data['gappedConsensus'][offset:offset+length].replace('-','') else: + # TODO: sometimes there is "orphan" sequence from false calls far in front + # or behind the consensus region. these should be removed somehow consensus = data['gapFreeConsensus'] return [conflicts, matches, mismatches, mismatch_list, consensus] def analyse_trace_quality(json_file: TextIO) -> float: + # TODO: consider clipping this to VP1 region or at least aligned region data = load_json(json_file) traces = data["gappedTraces"] @@ -139,8 +141,15 @@ calls = trace["peak" + base][start : end + 1] min_call = min(calls) max_call = max(calls) + if len(calls) == 0: + # no calls for this base to deal with, skip it + continue avg_call = sum(calls) / len(calls) - ratio = max_call / avg_call + if avg_call == 0: + # zero average base quality! + ratio = 0 + else: + ratio = max_call / avg_call call_quality["avg" + base] = avg_call call_quality["min" + base] = min_call call_quality["max" + base] = max_call @@ -148,7 +157,10 @@ avg_ratio += ratio avg_ratio = avg_ratio / 4 overall_avg += avg_ratio - overall_avg = overall_avg / len(traces) + if len(traces) > 0: + overall_avg = overall_avg / len(traces) + else: + overall_avg = 0 return overall_avg @@ -198,7 +210,8 @@ best_match_quality = quality best_match_reference = dataset_name best_consensus = consensus - percent_mismatches = round(min_mismatches / lengths[best_match_reference] * 100, 2) + percent_mismatches = round(min_mismatches / lengths[best_match_reference] * 100, 2), + vp1_coverage_perc = round(len(consensus) / lengths[best_match_reference] * 100) info = { "sample_name": args.sample_name, @@ -207,8 +220,14 @@ "mismatch_list": best_match_mismatch_list, "quality": best_match_quality, "perc_mismatches": percent_mismatches, - "consensus": best_consensus - } + "consensus": best_consensus, + "vp1_coverage_perc": vp1_coverage_perc + } json.dump(info, open(args.output_filename, "w")) - open(args.consensus_output_filename, "w").write(f'>{args.sample_name}\n' + fill(best_consensus) + '\n') + if len(best_consensus): + consensus_record = f'>{args.sample_name}\n' + fill(best_consensus) + '\n' + else: + # better to write an empty FASTA file than one with just a header + consensus_record = '' + open(args.consensus_output_filename, "w").write(consensus_record)
--- a/assess_poliovirus_alignment.xml Tue Sep 27 18:56:28 2022 +0000 +++ b/assess_poliovirus_alignment.xml Fri Nov 11 06:18:15 2022 +0000 @@ -1,4 +1,4 @@ -<tool id="assess_poliovirus_alignment" name="Assess poliovirus alignment" version="0.11.0+galaxy0" profile="21.05"> +<tool id="assess_poliovirus_alignment" name="Assess poliovirus alignment" version="0.12.0+galaxy0" profile="21.05"> <requirements> <requirement type="package" version="3.9">python</requirement> </requirements>