assess_poliovirus_alignment (sanbi-uwc): comparison of assess_alignment.py @ 9:acaaf49e2747 (draft)

planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty

author:   sanbi-uwc
date:     Tue, 27 Sep 2022 05:52:53 +0000
parents:  852e76e7d22a
children: fb905d0f8201
--- assess_alignment.py (8:852e76e7d22a)
+++ assess_alignment.py (9:acaaf49e2747)
@@ -44 +44 @@
     )
     min_start = min([int(al["leadingGaps"]) for al in msas])
     max_end = max([int(al["leadingGaps"]) + len(al["align"]) for al in msas])
     base_state = ["n"] * len(reference["align"])
     mismatch_bases = {}
-    for i, base in enumerate(reference["align"]):
+    consensus = ''
+    for i, reference_base in enumerate(reference["align"]):
         for k, al in enumerate(msas):
             leading_gaps = int(al["leadingGaps"])
             align_len = len(al["align"])
             if leading_gaps < i and (leading_gaps + align_len) > i:
                 vp1pos = i - offset
                 if vp1only and vp1pos < 0 or vp1pos > length:
                     # skip positions outside of vp1 gene region
                     continue
                 al_base = al["align"][i - leading_gaps]
+                consensus += al_base
                 has_secondary_basecall = False
                 if sec_is_conflict:
                     gappedTrace = data["gappedTraces"][k]
                     pos = i - int(gappedTrace["leadingGaps"])
                     # print(len(gappedTrace['basecallPos']), pos, k, len(gappedTrace['basecalls']), gappedTrace['basecallPos'][pos])
@@ -66 +68 @@
                     ]
                     if "|" in basecall_str:
                         has_secondary_basecall = True
                         # set this position to conflicted
                         base_state[i] = "C"
-                if al_base != base:
+                if al_base != reference_base:
                     # let's deal with all the cases where the base state doesn't match the reference
                     if base_state[i] == "G":
                         # the base state was G (a trace matches reference) and now we see a mismatch
                         base_state[i] = "C"
                     elif base_state[i] == "C":
@@ -106 +108 @@
         if state == "M":
             # for mismatch store [pos_in_genome, pos_in_vp1, reference_base, sequenced_base]
             mismatch_list.append(
                 [i + 1, i - offset + 1, reference["align"][i], mismatch_bases[i]]
             )
-    return [conflicts, matches, mismatches, mismatch_list]
+    return [conflicts, matches, mismatches, mismatch_list, consensus]


 def analyse_trace_quality(json_file: TextIO) -> float:
     data = load_json(json_file)

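The hunks above make analyse_mismatches() accumulate a consensus string while it walks the reference columns and return it as a fifth element, instead of leaving consensus extraction to a separate file. The following is a minimal standalone sketch of that accumulation step, not repository code; build_consensus and the toy reference/msas dictionaries are hypothetical, with only the "leadingGaps" and "align" fields mirroring the alignment JSON used above.

from typing import Dict, List


def build_consensus(reference: Dict, msas: List[Dict]) -> str:
    # for every reference column that an aligned trace covers, append that
    # trace's base to the consensus (same loop shape as analyse_mismatches)
    consensus = ""
    for i in range(len(reference["align"])):
        for al in msas:
            leading_gaps = int(al["leadingGaps"])
            align_len = len(al["align"])
            if leading_gaps < i and (leading_gaps + align_len) > i:
                consensus += al["align"][i - leading_gaps]
    return consensus


# toy data: one trace starting one column into the reference
print(build_consensus({"align": "ACGTACGT"}, [{"leadingGaps": "1", "align": "CGTAC"}]))
# prints "GTAC" -- the strict "<" comparison, kept from the original code,
# skips the column at i == leadingGaps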
@@ -172 +174 @@
         dataset_name = args.dataset_names[file_index].replace(
             ".json", ""
         )  # take the name but remove any json suffix
         offset = offsets[dataset_name]
         length = lengths[dataset_name]
-        (conflicts, matches, mismatches, mismatch_list) = analyse_mismatches(
+        (conflicts, matches, mismatches, mismatch_list, consensus) = analyse_mismatches(
             open(json_filename), offset, length
         )
         # analyse_mismatches(json_filename, True)
         quality = analyse_trace_quality(open(json_filename))
         if min_mismatches is None or mismatches < min_mismatches:
             min_mismatches = mismatches
             best_match_mismatch_list = mismatch_list
             best_match_quality = quality
             best_match_reference = dataset_name
-            best_consensus = open(args.consensi[file_index]).read().replace('>Consensus', f'>{args.sample_name}')
+            best_consensus = consensus
     percent_mismatches = round(min_mismatches / lengths[best_match_reference] * 100, 2)

     info = {
         "sample_name": args.sample_name,
         "best_reference": best_match_reference,
         "mismatches": min_mismatches,
         "mismatch_list": best_match_mismatch_list,
         "quality": best_match_quality,
         "perc_mismatches": percent_mismatches,
+        "consensus": best_consensus
     }
     json.dump(info, open(args.output_filename, "w"))

     open(args.consensus_output_filename, "w").write(best_consensus)
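With this revision the best-matching consensus comes directly out of analyse_mismatches() rather than from the pre-built files passed as args.consensi, and it is stored both in the JSON report (under the "consensus" key) and in the consensus output file. A hedged sketch of the resulting calling pattern follows; the file names and the offset/length values are hypothetical placeholders, while the module name and the analyse_mismatches call shape come from the diff above.

import json

from assess_alignment import analyse_mismatches

# hypothetical per-reference VP1 offset and length, normally taken from the
# offsets/lengths dictionaries in the script
offset, length = 2480, 906
conflicts, matches, mismatches, mismatch_list, consensus = analyse_mismatches(
    open("sample1.json"), offset, length  # hypothetical alignment JSON file
)

# mirror of what the main block now does: the consensus string travels in the
# JSON report and is also written verbatim to the consensus output file
report = {"mismatches": mismatches, "consensus": consensus}
json.dump(report, open("sample1_report.json", "w"))
open("sample1_consensus.fa", "w").write(consensus)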