Mercurial > repos > nick > duplex
comparison utils/sim-check.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
| author | nick |
|---|---|
| date | Thu, 02 Feb 2017 18:44:31 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:836fa4fe9494 | 18:e4d75f9efb90 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 from __future__ import division | |
| 3 from __future__ import print_function | |
| 4 import re | |
| 5 import os | |
| 6 import sys | |
| 7 import argparse | |
| 8 import fastqreader | |
| 9 script_dir = os.path.dirname(os.path.realpath(__file__)) | |
| 10 sys.path.append(os.path.dirname(script_dir)) | |
| 11 import swalign | |
| 12 | |
| 13 CANON = 'ACGT-' | |
| 14 | |
| 15 WGSIM_ID_REGEX = r'^(.+)_\d+_\d+_\d+:\d+:\d+_\d+:\d+:\d+_([0-9a-f]+)/[12]$' | |
| 16 ARG_DEFAULTS = {'print_stats':True} | |
| 17 USAGE = "%(prog)s [options]" | |
| 18 DESCRIPTION = """Correlate (labeled) reads from duplex pipeline with truth from simulator input, | |
| 19 and print the number of errors.""" | |
| 20 | |
| 21 | |
| 22 def main(argv): | |
| 23 | |
| 24 parser = argparse.ArgumentParser(description=DESCRIPTION) | |
| 25 parser.set_defaults(**ARG_DEFAULTS) | |
| 26 | |
| 27 parser.add_argument('reads', type=argparse.FileType('r'), | |
| 28 help='Output from duplex pipeline. Should be the tsv produced by sim-label.py.') | |
| 29 parser.add_argument('frags', type=fastqreader.FastqReadGenerator, | |
| 30 help='--frag-file from sim.py.') | |
| 31 parser.add_argument('-i', '--ignore-ambiguous', action='store_true', | |
| 32 help='Don\'t consider ambiguous bases ("N", "R", etc.) in SNV errors. Specifically, it will ' | |
| 33 'ignore any mismatch between a non-gap base in the fragment and read base that isn\'t ' | |
| 34 'one of "'+CANON+'".') | |
| 35 parser.add_argument('-a', '--print-alignments', action='store_true', | |
| 36 help='Print the alignments of each read with each fragment. Mostly for debug purposes.') | |
| 37 parser.add_argument('-S', '--no-stats', dest='print_stats', action='store_false', | |
| 38 help='Don\'t print the normal output of statistics on differences.') | |
| 39 | |
| 40 args = parser.parse_args(argv[1:]) | |
| 41 | |
| 42 reads = iter(args.reads) | |
| 43 frags = iter(args.frags) | |
| 44 while True: | |
| 45 # Read in the next output read. | |
| 46 try: | |
| 47 read_line = next(reads) | |
| 48 except StopIteration: | |
| 49 break | |
| 50 fields = read_line.rstrip('\r\n').split('\t') | |
| 51 assert len(fields) == 7, fields | |
| 52 read = dict(zip(('chrom', 'frag_num', 'frag_id', 'bar', 'reads+', 'reads-', 'seq'), fields)) | |
| 53 # Read in fragments until we find the one corresponding to the output read. | |
| 54 frag_chrom = None | |
| 55 frag_frag_id = None | |
| 56 while not (read['chrom'] == frag_chrom and read['frag_id'] == frag_frag_id): | |
| 57 try: | |
| 58 frag = next(frags) | |
| 59 except StopIteration: | |
| 60 break | |
| 61 match = re.search(WGSIM_ID_REGEX, frag.id) | |
| 62 if match: | |
| 63 frag_chrom = match.group(1) | |
| 64 frag_frag_id = match.group(2) | |
| 65 else: | |
| 66 sys.stderr.write('Invalid wgsim read name: {}\n'.format(frag.id)) | |
| 67 if frag_chrom is None and frag_frag_id is None: | |
| 68 break | |
| 69 # Align the output read to the fragment. | |
| 70 align = swalign.smith_waterman_duplex(frag.seq, read['seq']) | |
| 71 assert len(align.target) == len(align.query) | |
| 72 if args.print_alignments: | |
| 73 print(align.target) | |
| 74 diffs = get_diffs(align.target, align.query, print_mid=args.print_alignments, | |
| 75 ignore_ambig=args.ignore_ambiguous) | |
| 76 if args.print_alignments: | |
| 77 print(align.query) | |
| 78 read_len = len(read['seq']) | |
| 79 snvs = ins = dels = 0 | |
| 80 for diff in diffs: | |
| 81 if diff['type'] == 'snv': | |
| 82 snvs += 1 | |
| 83 elif diff['type'] == 'ins': | |
| 84 ins += 1 | |
| 85 elif diff['type'] == 'del': | |
| 86 dels += 1 | |
| 87 match_rate = round(align.matches/read_len, 2) | |
| 88 if args.print_stats: | |
| 89 print(read['bar'], read['frag_id'], read['reads+'], read['reads-'], read_len, | |
| 90 read_len-align.matches, match_rate, len(diffs), snvs, ins, dels, sep='\t') | |
| 91 | |
| 92 | |
| 93 def get_diffs(target, query, print_mid=False, ignore_ambig=False): | |
| 94 diffs = [] | |
| 95 diff = None | |
| 96 coord = 0 | |
| 97 for base1, base2 in zip(target, query): | |
| 98 if base1 != '-': | |
| 99 coord += 1 | |
| 100 if base1 == base2: | |
| 101 # Finish ongoing indels and add them to the list. | |
| 102 if diff is not None: | |
| 103 # But omit the "indel" that's just the unaligned portion at the start. | |
| 104 if diff['coord'] > 1: | |
| 105 diffs.append(diff) | |
| 106 diff = None | |
| 107 if print_mid: | |
| 108 sys.stdout.write('|') | |
| 109 elif ignore_ambig and base1 != '-' and base2 not in CANON: | |
| 110 if print_mid: | |
| 111 sys.stdout.write(' ') | |
| 112 elif base1 == '-': | |
| 113 if diff is None: | |
| 114 diff = {'coord':coord, 'type':'ins', 'alt':base2} | |
| 115 else: | |
| 116 diff['alt'] += base2 | |
| 117 if print_mid: | |
| 118 sys.stdout.write(' ') | |
| 119 elif base2 == '-': | |
| 120 if diff is None: | |
| 121 diff = {'coord':coord-1, 'type':'del', 'alt':1} | |
| 122 else: | |
| 123 diff['alt'] += 1 | |
| 124 if print_mid: | |
| 125 sys.stdout.write(' ') | |
| 126 else: | |
| 127 diffs.append({'coord':coord, 'type':'snv', 'alt':base2}) | |
| 128 if print_mid: | |
| 129 sys.stdout.write(' ') | |
| 130 if diff is not None: | |
| 131 diffs.append(diff) | |
| 132 if print_mid: | |
| 133 print() | |
| 134 return diffs | |
| 135 | |
| 136 | |
| 137 def fail(message): | |
| 138 sys.stderr.write(message+"\n") | |
| 139 sys.exit(1) | |
| 140 | |
| 141 if __name__ == '__main__': | |
| 142 sys.exit(main(sys.argv)) |
