Mercurial > repos > sanbi-uwc > assess_poliovirus_alignment
comparison assess_alignment.py @ 9:acaaf49e2747 draft
planemo upload for repository https://github.com/pvanheus/polio_report commit aa90f911e6269aba792c9814c98a659e631400b2-dirty
| author | sanbi-uwc |
|---|---|
| date | Tue, 27 Sep 2022 05:52:53 +0000 |
| parents | 852e76e7d22a |
| children | fb905d0f8201 |
comparison
equal
deleted
inserted
replaced
| 8:852e76e7d22a | 9:acaaf49e2747 |
|---|---|
| 44 ) | 44 ) |
| 45 min_start = min([int(al["leadingGaps"]) for al in msas]) | 45 min_start = min([int(al["leadingGaps"]) for al in msas]) |
| 46 max_end = max([int(al["leadingGaps"]) + len(al["align"]) for al in msas]) | 46 max_end = max([int(al["leadingGaps"]) + len(al["align"]) for al in msas]) |
| 47 base_state = ["n"] * len(reference["align"]) | 47 base_state = ["n"] * len(reference["align"]) |
| 48 mismatch_bases = {} | 48 mismatch_bases = {} |
| 49 for i, base in enumerate(reference["align"]): | 49 consensus = '' |
| 50 for i, reference_base in enumerate(reference["align"]): | |
| 50 for k, al in enumerate(msas): | 51 for k, al in enumerate(msas): |
| 51 leading_gaps = int(al["leadingGaps"]) | 52 leading_gaps = int(al["leadingGaps"]) |
| 52 align_len = len(al["align"]) | 53 align_len = len(al["align"]) |
| 53 if leading_gaps < i and (leading_gaps + align_len) > i: | 54 if leading_gaps < i and (leading_gaps + align_len) > i: |
| 54 vp1pos = i - offset | 55 vp1pos = i - offset |
| 55 if vp1only and vp1pos < 0 or vp1pos > length: | 56 if vp1only and vp1pos < 0 or vp1pos > length: |
| 56 # skip positions outside of vp1 gene region | 57 # skip positions outside of vp1 gene region |
| 57 continue | 58 continue |
| 58 al_base = al["align"][i - leading_gaps] | 59 al_base = al["align"][i - leading_gaps] |
| 60 consensus += al_base | |
| 59 has_secondary_basecall = False | 61 has_secondary_basecall = False |
| 60 if sec_is_conflict: | 62 if sec_is_conflict: |
| 61 gappedTrace = data["gappedTraces"][k] | 63 gappedTrace = data["gappedTraces"][k] |
| 62 pos = i - int(gappedTrace["leadingGaps"]) | 64 pos = i - int(gappedTrace["leadingGaps"]) |
| 63 # print(len(gappedTrace['basecallPos']), pos, k, len(gappedTrace['basecalls']), gappedTrace['basecallPos'][pos]) | 65 # print(len(gappedTrace['basecallPos']), pos, k, len(gappedTrace['basecalls']), gappedTrace['basecallPos'][pos]) |
| 66 ] | 68 ] |
| 67 if "|" in basecall_str: | 69 if "|" in basecall_str: |
| 68 has_secondary_basecall = True | 70 has_secondary_basecall = True |
| 69 # set this position to conflicted | 71 # set this position to conflicted |
| 70 base_state[i] = "C" | 72 base_state[i] = "C" |
| 71 if al_base != base: | 73 if al_base != reference_base: |
| 72 # let's deal with all the cases where the base state doesn't match the reference | 74 # let's deal with all the cases where the base state doesn't match the reference |
| 73 if base_state[i] == "G": | 75 if base_state[i] == "G": |
| 74 # the base state was G (a trace matches reference) and now we see a mismatch | 76 # the base state was G (a trace matches reference) and now we see a mismatch |
| 75 base_state[i] = "C" | 77 base_state[i] = "C" |
| 76 elif base_state[i] == "C": | 78 elif base_state[i] == "C": |
| 106 if state == "M": | 108 if state == "M": |
| 107 # for mismatch store [pos_in_genome, pos_in_vp1, reference_base, sequenced_base] | 109 # for mismatch store [pos_in_genome, pos_in_vp1, reference_base, sequenced_base] |
| 108 mismatch_list.append( | 110 mismatch_list.append( |
| 109 [i + 1, i - offset + 1, reference["align"][i], mismatch_bases[i]] | 111 [i + 1, i - offset + 1, reference["align"][i], mismatch_bases[i]] |
| 110 ) | 112 ) |
| 111 return [conflicts, matches, mismatches, mismatch_list] | 113 return [conflicts, matches, mismatches, mismatch_list, consensus] |
| 112 | 114 |
| 113 | 115 |
| 114 def analyse_trace_quality(json_file: TextIO) -> float: | 116 def analyse_trace_quality(json_file: TextIO) -> float: |
| 115 data = load_json(json_file) | 117 data = load_json(json_file) |
| 116 | 118 |
| 172 dataset_name = args.dataset_names[file_index].replace( | 174 dataset_name = args.dataset_names[file_index].replace( |
| 173 ".json", "" | 175 ".json", "" |
| 174 ) # take the name but remove any json suffix | 176 ) # take the name but remove any json suffix |
| 175 offset = offsets[dataset_name] | 177 offset = offsets[dataset_name] |
| 176 length = lengths[dataset_name] | 178 length = lengths[dataset_name] |
| 177 (conflicts, matches, mismatches, mismatch_list) = analyse_mismatches( | 179 (conflicts, matches, mismatches, mismatch_list, consensus) = analyse_mismatches( |
| 178 open(json_filename), offset, length | 180 open(json_filename), offset, length |
| 179 ) | 181 ) |
| 180 # analyse_mismatches(json_filename, True) | 182 # analyse_mismatches(json_filename, True) |
| 181 quality = analyse_trace_quality(open(json_filename)) | 183 quality = analyse_trace_quality(open(json_filename)) |
| 182 if min_mismatches is None or mismatches < min_mismatches: | 184 if min_mismatches is None or mismatches < min_mismatches: |
| 183 min_mismatches = mismatches | 185 min_mismatches = mismatches |
| 184 best_match_mismatch_list = mismatch_list | 186 best_match_mismatch_list = mismatch_list |
| 185 best_match_quality = quality | 187 best_match_quality = quality |
| 186 best_match_reference = dataset_name | 188 best_match_reference = dataset_name |
| 187 best_consensus = open(args.consensi[file_index]).read().replace('>Consensus', f'>{args.sample_name}') | 189 best_consensus = consensus |
| 188 percent_mismatches = round(min_mismatches / lengths[best_match_reference] * 100, 2) | 190 percent_mismatches = round(min_mismatches / lengths[best_match_reference] * 100, 2) |
| 189 | 191 |
| 190 info = { | 192 info = { |
| 191 "sample_name": args.sample_name, | 193 "sample_name": args.sample_name, |
| 192 "best_reference": best_match_reference, | 194 "best_reference": best_match_reference, |
| 193 "mismatches": min_mismatches, | 195 "mismatches": min_mismatches, |
| 194 "mismatch_list": best_match_mismatch_list, | 196 "mismatch_list": best_match_mismatch_list, |
| 195 "quality": best_match_quality, | 197 "quality": best_match_quality, |
| 196 "perc_mismatches": percent_mismatches, | 198 "perc_mismatches": percent_mismatches, |
| 199 "consensus": best_consensus | |
| 197 } | 200 } |
| 198 json.dump(info, open(args.output_filename, "w")) | 201 json.dump(info, open(args.output_filename, "w")) |
| 199 | 202 |
| 200 open(args.consensus_output_filename, "w").write(best_consensus) | 203 open(args.consensus_output_filename, "w").write(best_consensus) |
