comparison assess_alignment.py @ 9:acaaf49e2747 draft

planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
author sanbi-uwc
date Tue, 27 Sep 2022 05:52:53 +0000
parents 852e76e7d22a
children fb905d0f8201
comparison
equal deleted inserted replaced
8:852e76e7d22a 9:acaaf49e2747
44 ) 44 )
45 min_start = min([int(al["leadingGaps"]) for al in msas]) 45 min_start = min([int(al["leadingGaps"]) for al in msas])
46 max_end = max([int(al["leadingGaps"]) + len(al["align"]) for al in msas]) 46 max_end = max([int(al["leadingGaps"]) + len(al["align"]) for al in msas])
47 base_state = ["n"] * len(reference["align"]) 47 base_state = ["n"] * len(reference["align"])
48 mismatch_bases = {} 48 mismatch_bases = {}
49 for i, base in enumerate(reference["align"]): 49 consensus = ''
50 for i, reference_base in enumerate(reference["align"]):
50 for k, al in enumerate(msas): 51 for k, al in enumerate(msas):
51 leading_gaps = int(al["leadingGaps"]) 52 leading_gaps = int(al["leadingGaps"])
52 align_len = len(al["align"]) 53 align_len = len(al["align"])
53 if leading_gaps < i and (leading_gaps + align_len) > i: 54 if leading_gaps < i and (leading_gaps + align_len) > i:
54 vp1pos = i - offset 55 vp1pos = i - offset
55 if vp1only and vp1pos < 0 or vp1pos > length: 56 if vp1only and vp1pos < 0 or vp1pos > length:
56 # skip positions outside of vp1 gene region 57 # skip positions outside of vp1 gene region
57 continue 58 continue
58 al_base = al["align"][i - leading_gaps] 59 al_base = al["align"][i - leading_gaps]
60 consensus += al_base
59 has_secondary_basecall = False 61 has_secondary_basecall = False
60 if sec_is_conflict: 62 if sec_is_conflict:
61 gappedTrace = data["gappedTraces"][k] 63 gappedTrace = data["gappedTraces"][k]
62 pos = i - int(gappedTrace["leadingGaps"]) 64 pos = i - int(gappedTrace["leadingGaps"])
63 # print(len(gappedTrace['basecallPos']), pos, k, len(gappedTrace['basecalls']), gappedTrace['basecallPos'][pos]) 65 # print(len(gappedTrace['basecallPos']), pos, k, len(gappedTrace['basecalls']), gappedTrace['basecallPos'][pos])
66 ] 68 ]
67 if "|" in basecall_str: 69 if "|" in basecall_str:
68 has_secondary_basecall = True 70 has_secondary_basecall = True
69 # set this position to conflicted 71 # set this position to conflicted
70 base_state[i] = "C" 72 base_state[i] = "C"
71 if al_base != base: 73 if al_base != reference_base:
72 # let's deal with all the cases where the base state doesn't match the reference 74 # let's deal with all the cases where the base state doesn't match the reference
73 if base_state[i] == "G": 75 if base_state[i] == "G":
74 # the base state was G (a trace matches reference) and now we see a mismatch 76 # the base state was G (a trace matches reference) and now we see a mismatch
75 base_state[i] = "C" 77 base_state[i] = "C"
76 elif base_state[i] == "C": 78 elif base_state[i] == "C":
106 if state == "M": 108 if state == "M":
107 # for mismatch store [pos_in_genome, pos_in_vp1, reference_base, sequenced_base] 109 # for mismatch store [pos_in_genome, pos_in_vp1, reference_base, sequenced_base]
108 mismatch_list.append( 110 mismatch_list.append(
109 [i + 1, i - offset + 1, reference["align"][i], mismatch_bases[i]] 111 [i + 1, i - offset + 1, reference["align"][i], mismatch_bases[i]]
110 ) 112 )
111 return [conflicts, matches, mismatches, mismatch_list] 113 return [conflicts, matches, mismatches, mismatch_list, consensus]
112 114
113 115
114 def analyse_trace_quality(json_file: TextIO) -> float: 116 def analyse_trace_quality(json_file: TextIO) -> float:
115 data = load_json(json_file) 117 data = load_json(json_file)
116 118
172 dataset_name = args.dataset_names[file_index].replace( 174 dataset_name = args.dataset_names[file_index].replace(
173 ".json", "" 175 ".json", ""
174 ) # take the name but remove any json suffix 176 ) # take the name but remove any json suffix
175 offset = offsets[dataset_name] 177 offset = offsets[dataset_name]
176 length = lengths[dataset_name] 178 length = lengths[dataset_name]
177 (conflicts, matches, mismatches, mismatch_list) = analyse_mismatches( 179 (conflicts, matches, mismatches, mismatch_list, consensus) = analyse_mismatches(
178 open(json_filename), offset, length 180 open(json_filename), offset, length
179 ) 181 )
180 # analyse_mismatches(json_filename, True) 182 # analyse_mismatches(json_filename, True)
181 quality = analyse_trace_quality(open(json_filename)) 183 quality = analyse_trace_quality(open(json_filename))
182 if min_mismatches is None or mismatches < min_mismatches: 184 if min_mismatches is None or mismatches < min_mismatches:
183 min_mismatches = mismatches 185 min_mismatches = mismatches
184 best_match_mismatch_list = mismatch_list 186 best_match_mismatch_list = mismatch_list
185 best_match_quality = quality 187 best_match_quality = quality
186 best_match_reference = dataset_name 188 best_match_reference = dataset_name
187 best_consensus = open(args.consensi[file_index]).read().replace('>Consensus', f'>{args.sample_name}') 189 best_consensus = consensus
188 percent_mismatches = round(min_mismatches / lengths[best_match_reference] * 100, 2) 190 percent_mismatches = round(min_mismatches / lengths[best_match_reference] * 100, 2)
189 191
190 info = { 192 info = {
191 "sample_name": args.sample_name, 193 "sample_name": args.sample_name,
192 "best_reference": best_match_reference, 194 "best_reference": best_match_reference,
193 "mismatches": min_mismatches, 195 "mismatches": min_mismatches,
194 "mismatch_list": best_match_mismatch_list, 196 "mismatch_list": best_match_mismatch_list,
195 "quality": best_match_quality, 197 "quality": best_match_quality,
196 "perc_mismatches": percent_mismatches, 198 "perc_mismatches": percent_mismatches,
199 "consensus": best_consensus
197 } 200 }
198 json.dump(info, open(args.output_filename, "w")) 201 json.dump(info, open(args.output_filename, "w"))
199 202
200 open(args.consensus_output_filename, "w").write(best_consensus) 203 open(args.consensus_output_filename, "w").write(best_consensus)