comparison assess_alignment.py @ 10:fb905d0f8201 draft

planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
author sanbi-uwc
date Tue, 27 Sep 2022 08:44:35 +0000
parents acaaf49e2747
children fff48fa8a22b
comparison
equal deleted inserted replaced
9:acaaf49e2747 10:fb905d0f8201
2 2
3 import argparse 3 import argparse
4 import json 4 import json
5 import sys 5 import sys
6 from dataclasses import dataclass 6 from dataclasses import dataclass
7 from textwrap import fill
7 from typing import TextIO 8 from typing import TextIO
8 9
9 10
10 @dataclass 11 @dataclass
11 class Sample: 12 class Sample:
44 ) 45 )
45 min_start = min([int(al["leadingGaps"]) for al in msas]) 46 min_start = min([int(al["leadingGaps"]) for al in msas])
46 max_end = max([int(al["leadingGaps"]) + len(al["align"]) for al in msas]) 47 max_end = max([int(al["leadingGaps"]) + len(al["align"]) for al in msas])
47 base_state = ["n"] * len(reference["align"]) 48 base_state = ["n"] * len(reference["align"])
48 mismatch_bases = {} 49 mismatch_bases = {}
49 consensus = '' 50 base_start = 0
50 for i, reference_base in enumerate(reference["align"]): 51 for i, reference_base in enumerate(reference["align"]):
52 if vp1only and not base_start:
53 if data['gappedConsensus'][i] != '-':
54 base_start = i
51 for k, al in enumerate(msas): 55 for k, al in enumerate(msas):
52 leading_gaps = int(al["leadingGaps"]) 56 leading_gaps = int(al["leadingGaps"])
53 align_len = len(al["align"]) 57 align_len = len(al["align"])
54 if leading_gaps < i and (leading_gaps + align_len) > i: 58 if leading_gaps < i and (leading_gaps + align_len) > i:
55 vp1pos = i - offset 59 vp1pos = i - offset
56 if vp1only and vp1pos < 0 or vp1pos > length: 60 if vp1only and vp1pos < 0 or vp1pos > length:
57 # skip positions outside of vp1 gene region 61 # skip positions outside of vp1 gene region
58 continue 62 continue
59 al_base = al["align"][i - leading_gaps] 63 al_base = al["align"][i - leading_gaps]
60 consensus += al_base
61 has_secondary_basecall = False 64 has_secondary_basecall = False
62 if sec_is_conflict: 65 if sec_is_conflict:
63 gappedTrace = data["gappedTraces"][k] 66 gappedTrace = data["gappedTraces"][k]
64 pos = i - int(gappedTrace["leadingGaps"]) 67 pos = i - int(gappedTrace["leadingGaps"])
65 # print(len(gappedTrace['basecallPos']), pos, k, len(gappedTrace['basecalls']), gappedTrace['basecallPos'][pos]) 68 # print(len(gappedTrace['basecallPos']), pos, k, len(gappedTrace['basecalls']), gappedTrace['basecallPos'][pos])
108 if state == "M": 111 if state == "M":
109 # for mismatch store [pos_in_genome, pos_in_vp1, reference_base, sequenced_base] 112 # for mismatch store [pos_in_genome, pos_in_vp1, reference_base, sequenced_base]
110 mismatch_list.append( 113 mismatch_list.append(
111 [i + 1, i - offset + 1, reference["align"][i], mismatch_bases[i]] 114 [i + 1, i - offset + 1, reference["align"][i], mismatch_bases[i]]
112 ) 115 )
116 if vp1only:
117 # we have trim consensus bases before vp1 region
118 if base_start <= offset:
119 cons_start = offset - base_start
120 else:
121 cons_start = 0
122 consensus = data['gapFreeConsensus'][cons_start:cons_start + length]
123 else:
124 consensus = data['gapFreeConsensus']
113 return [conflicts, matches, mismatches, mismatch_list, consensus] 125 return [conflicts, matches, mismatches, mismatch_list, consensus]
114 126
115 127
116 def analyse_trace_quality(json_file: TextIO) -> float: 128 def analyse_trace_quality(json_file: TextIO) -> float:
117 data = load_json(json_file) 129 data = load_json(json_file)
198 "perc_mismatches": percent_mismatches, 210 "perc_mismatches": percent_mismatches,
199 "consensus": best_consensus 211 "consensus": best_consensus
200 } 212 }
201 json.dump(info, open(args.output_filename, "w")) 213 json.dump(info, open(args.output_filename, "w"))
202 214
203 open(args.consensus_output_filename, "w").write(best_consensus) 215 open(args.consensus_output_filename, "w").write(f'>{args.sample_name}\n' + fill(best_consensus))