assess_poliovirus_alignment: assess_alignment.py comparison

comparison assess_alignment.py @ 13:bf0bebcb6bb1 draft default tip

planemo upload for repository https://github.com/pvanheus/polio_report commit 753aad311378b064f2152c8e99e7c8097c7f4321-dirty

author	sanbi-uwc
date	Fri, 11 Nov 2022 06:18:15 +0000
parents	ee98dcd9aad4
children

comparison

equal deleted inserted replaced

-:ee98dcd9aad4
+:bf0bebcb6bb1
 # for mismatch store [pos_in_genome, pos_in_vp1, reference_base, sequenced_base]
 mismatch_list.append(
 [i + 1, i - offset + 1, reference["align"][i], mismatch_bases[i]]
 )
 if vp1only:
-# we have trim consensus bases before vp1 region
+# a note on gap trimming: if the consensus is smaller than VP1, we might
-if base_start <= offset:
+# get gaps on either side. these will of course be trimmed.
-cons_start = offset - base_start
+# we don't expect gaps inside the consensus but in the rare event that
-else:
+# these occur they are also discarded.
-cons_start = 0
+consensus = data['gappedConsensus'][offset:offset+length].replace('-','')
-consensus = data['gapFreeConsensus'][cons_start:cons_start + length]
 else:
+# TODO: sometimes there is "orphan" sequence from false calls far in front
+# or behind the consensus region. these should be removed somehow
 consensus = data['gapFreeConsensus']
 return [conflicts, matches, mismatches, mismatch_list, consensus]
 def analyse_trace_quality(json_file: TextIO) -> float:
+# TODO: consider clipping this to VP1 region or at least aligned region
 data = load_json(json_file)
 traces = data["gappedTraces"]
 overall_avg = 0
 for trace in traces:
 avg_ratio = 0
 for base in ("A", "C", "G", "T"):
 calls = trace["peak" + base][start : end + 1]
 min_call = min(calls)
 max_call = max(calls)
+if len(calls) == 0:
+# no calls for this base to deal with, skip it
+continue
 avg_call = sum(calls) / len(calls)
-ratio = max_call / avg_call
+if avg_call == 0:
+# zero average base quality!
+ratio = 0
+else:
+ratio = max_call / avg_call
 call_quality["avg" + base] = avg_call
 call_quality["min" + base] = min_call
 call_quality["max" + base] = max_call
 call_quality["ratio" + base] = ratio
 avg_ratio += ratio
 avg_ratio = avg_ratio / 4
 overall_avg += avg_ratio
-overall_avg = overall_avg / len(traces)
+if len(traces) > 0:
+overall_avg = overall_avg / len(traces)
+else:
+overall_avg = 0
 return overall_avg
 def comma_split(args: str) -> list[str]:
     """Split a comma-separated string into a list of its fields.

     The string is split on every "," with no whitespace stripping, so
     "a, b" yields ["a", " b"] and an empty string yields [""].

     Args:
         args: the comma-separated string to split.

     Returns:
         The fields of ``args`` in their original order.
     """
     return args.split(",")
 min_mismatches = mismatches
 best_match_mismatch_list = mismatch_list
 best_match_quality = quality
 best_match_reference = dataset_name
 best_consensus = consensus
# Summarise the best-matching reference, then write the JSON report and the
# consensus FASTA file.
#
# percent_mismatches must be a plain number: a trailing comma after round()
# would turn it into a 1-tuple, which json.dump serialises as a list.
percent_mismatches = round(
    min_mismatches / lengths[best_match_reference] * 100, 2
)
# Coverage is measured on best_consensus (the consensus recorded together
# with best_match_reference), not on whatever `consensus` the last loop
# iteration happened to leave behind.
vp1_coverage_perc = round(
    len(best_consensus) / lengths[best_match_reference] * 100
)
info = {
    "sample_name": args.sample_name,
    "best_reference": best_match_reference,
    "mismatches": min_mismatches,
    "mismatch_list": best_match_mismatch_list,
    "quality": best_match_quality,
    "perc_mismatches": percent_mismatches,
    "consensus": best_consensus,
    "vp1_coverage_perc": vp1_coverage_perc,
}
# Context managers make sure both output files are flushed and closed.
with open(args.output_filename, "w") as report_file:
    json.dump(info, report_file)
if best_consensus:
    consensus_record = f'>{args.sample_name}\n' + fill(best_consensus) + '\n'
else:
    # better to write an empty FASTA file than one with just a header
    consensus_record = ''
with open(args.consensus_output_filename, "w") as consensus_file:
    consensus_file.write(consensus_record)

Mercurial > repos > sanbi-uwc > assess_poliovirus_alignment

comparison assess_alignment.py @ 13:bf0bebcb6bb1 draft default tip