comparison utils/sim-check.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
comparison
equal deleted inserted replaced
17:836fa4fe9494 18:e4d75f9efb90
1 #!/usr/bin/env python
2 from __future__ import division
3 from __future__ import print_function
4 import re
5 import os
6 import sys
7 import argparse
8 import fastqreader
9 script_dir = os.path.dirname(os.path.realpath(__file__))
10 sys.path.append(os.path.dirname(script_dir))
11 import swalign
12
13 CANON = 'ACGT-'
14
15 WGSIM_ID_REGEX = r'^(.+)_\d+_\d+_\d+:\d+:\d+_\d+:\d+:\d+_([0-9a-f]+)/[12]$'
16 ARG_DEFAULTS = {'print_stats':True}
17 USAGE = "%(prog)s [options]"
18 DESCRIPTION = """Correlate (labeled) reads from duplex pipeline with truth from simulator input,
19 and print the number of errors."""
20
21
22 def main(argv):
23
24 parser = argparse.ArgumentParser(description=DESCRIPTION)
25 parser.set_defaults(**ARG_DEFAULTS)
26
27 parser.add_argument('reads', type=argparse.FileType('r'),
28 help='Output from duplex pipeline. Should be the tsv produced by sim-label.py.')
29 parser.add_argument('frags', type=fastqreader.FastqReadGenerator,
30 help='--frag-file from sim.py.')
31 parser.add_argument('-i', '--ignore-ambiguous', action='store_true',
32 help='Don\'t consider ambiguous bases ("N", "R", etc.) in SNV errors. Specifically, it will '
33 'ignore any mismatch between a non-gap base in the fragment and read base that isn\'t '
34 'one of "'+CANON+'".')
35 parser.add_argument('-a', '--print-alignments', action='store_true',
36 help='Print the alignments of each read with each fragment. Mostly for debug purposes.')
37 parser.add_argument('-S', '--no-stats', dest='print_stats', action='store_false',
38 help='Don\'t print the normal output of statistics on differences.')
39
40 args = parser.parse_args(argv[1:])
41
42 reads = iter(args.reads)
43 frags = iter(args.frags)
44 while True:
45 # Read in the next output read.
46 try:
47 read_line = next(reads)
48 except StopIteration:
49 break
50 fields = read_line.rstrip('\r\n').split('\t')
51 assert len(fields) == 7, fields
52 read = dict(zip(('chrom', 'frag_num', 'frag_id', 'bar', 'reads+', 'reads-', 'seq'), fields))
53 # Read in fragments until we find the one corresponding to the output read.
54 frag_chrom = None
55 frag_frag_id = None
56 while not (read['chrom'] == frag_chrom and read['frag_id'] == frag_frag_id):
57 try:
58 frag = next(frags)
59 except StopIteration:
60 break
61 match = re.search(WGSIM_ID_REGEX, frag.id)
62 if match:
63 frag_chrom = match.group(1)
64 frag_frag_id = match.group(2)
65 else:
66 sys.stderr.write('Invalid wgsim read name: {}\n'.format(frag.id))
67 if frag_chrom is None and frag_frag_id is None:
68 break
69 # Align the output read to the fragment.
70 align = swalign.smith_waterman_duplex(frag.seq, read['seq'])
71 assert len(align.target) == len(align.query)
72 if args.print_alignments:
73 print(align.target)
74 diffs = get_diffs(align.target, align.query, print_mid=args.print_alignments,
75 ignore_ambig=args.ignore_ambiguous)
76 if args.print_alignments:
77 print(align.query)
78 read_len = len(read['seq'])
79 snvs = ins = dels = 0
80 for diff in diffs:
81 if diff['type'] == 'snv':
82 snvs += 1
83 elif diff['type'] == 'ins':
84 ins += 1
85 elif diff['type'] == 'del':
86 dels += 1
87 match_rate = round(align.matches/read_len, 2)
88 if args.print_stats:
89 print(read['bar'], read['frag_id'], read['reads+'], read['reads-'], read_len,
90 read_len-align.matches, match_rate, len(diffs), snvs, ins, dels, sep='\t')
91
92
93 def get_diffs(target, query, print_mid=False, ignore_ambig=False):
94 diffs = []
95 diff = None
96 coord = 0
97 for base1, base2 in zip(target, query):
98 if base1 != '-':
99 coord += 1
100 if base1 == base2:
101 # Finish ongoing indels and add them to the list.
102 if diff is not None:
103 # But omit the "indel" that's just the unaligned portion at the start.
104 if diff['coord'] > 1:
105 diffs.append(diff)
106 diff = None
107 if print_mid:
108 sys.stdout.write('|')
109 elif ignore_ambig and base1 != '-' and base2 not in CANON:
110 if print_mid:
111 sys.stdout.write(' ')
112 elif base1 == '-':
113 if diff is None:
114 diff = {'coord':coord, 'type':'ins', 'alt':base2}
115 else:
116 diff['alt'] += base2
117 if print_mid:
118 sys.stdout.write(' ')
119 elif base2 == '-':
120 if diff is None:
121 diff = {'coord':coord-1, 'type':'del', 'alt':1}
122 else:
123 diff['alt'] += 1
124 if print_mid:
125 sys.stdout.write(' ')
126 else:
127 diffs.append({'coord':coord, 'type':'snv', 'alt':base2})
128 if print_mid:
129 sys.stdout.write(' ')
130 if diff is not None:
131 diffs.append(diff)
132 if print_mid:
133 print()
134 return diffs
135
136
137 def fail(message):
138 sys.stderr.write(message+"\n")
139 sys.exit(1)
140
141 if __name__ == '__main__':
142 sys.exit(main(sys.argv))