Mercurial > repos > nick > duplex
comparison utils/stats.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | af383638de66 |
children |
comparison
equal
deleted
inserted
replaced
17:836fa4fe9494 | 18:e4d75f9efb90 |
---|---|
1 #!/usr/bin/env python | |
2 from __future__ import division | |
3 import os | |
4 import sys | |
5 import math | |
6 import argparse | |
7 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) | |
8 import seqtools | |
9 import swalign | |
10 | |
11 INF = float('inf') | |
12 STATS = ('diffs', 'diffs-binned', 'seqlen', 'strand') | |
13 OPT_DEFAULTS = {'bins':10, 'probes':'', 'thres':0.75} | |
14 USAGE = "%(prog)s [options]" | |
15 DESCRIPTION = """""" | |
16 | |
17 | |
18 def main(argv): | |
19 | |
20 parser = argparse.ArgumentParser(description=DESCRIPTION) | |
21 parser.set_defaults(**OPT_DEFAULTS) | |
22 | |
23 parser.add_argument('stats', | |
24 help='The type of statistics to compute and print. Give a comma-separated list of stat names, ' | |
25 'choosing from "{}".'.format('", "'.join(STATS))) | |
26 parser.add_argument('infile', metavar='read-families.msa.tsv', nargs='?', | |
27 help='The --msa output of sscs.py. Will read from stdin if not provided.') | |
28 parser.add_argument('-b', '--bins', type=int, | |
29 help='The number of bins to segment reads into when doing "diffs-binned".') | |
30 parser.add_argument('-p', '--probes', | |
31 help='Sequence excerpts from the sense strand. Required for "strand" statistic. ' | |
32 'Comma-separated.') | |
33 parser.add_argument('-t', '--thres', type=int, | |
34 help='Alignment identity threshold (in fraction, not decimal). Default: %(default)s') | |
35 | |
36 args = parser.parse_args(argv[1:]) | |
37 | |
38 stats = args.stats.split(',') | |
39 for stat in stats: | |
40 if stat not in STATS: | |
41 fail('Error: invalid statistic "{}". Must choose one of "{}".'.format(stat, '", "'.join(STATS))) | |
42 if 'strand' in stats and not args.probes: | |
43 fail('Error: must provide a probe if requesting "strand" statistic.') | |
44 | |
45 if args.infile: | |
46 infile = open(args.infile) | |
47 else: | |
48 infile = sys.stdin | |
49 | |
50 family = [] | |
51 consensus = None | |
52 barcode = None | |
53 for line in infile: | |
54 fields = line.rstrip('\r\n').split('\t') | |
55 if len(fields) != 3: | |
56 continue | |
57 (this_barcode, name, seq) = fields | |
58 if fields[1] == 'CONSENSUS': | |
59 if family and consensus: | |
60 process_family(stats, barcode, consensus, family, args) | |
61 barcode = this_barcode | |
62 consensus = seq | |
63 family = [] | |
64 else: | |
65 family.append(seq) | |
66 if family and consensus: | |
67 process_family(stats, barcode, consensus, family, args) | |
68 | |
69 if infile is not sys.stdin: | |
70 infile.close() | |
71 | |
72 | |
73 #TODO: Maybe print the number of N's in the consensus? | |
74 def process_family(stats, barcode, consensus, family, args): | |
75 # Compute stats requiring the whole family at once. | |
76 for stat in stats: | |
77 if stat == 'diffs': | |
78 diffs = seqtools.get_diffs_frac_simple(consensus, family) | |
79 elif stat == 'diffs-binned': | |
80 diffs_binned = seqtools.get_diffs_frac_binned(consensus, family, args.bins) | |
81 elif stat == 'strand': | |
82 probes = args.probes.split(',') | |
83 strand = get_strand(consensus, probes, args.thres) | |
84 # Print the requested stats for each read. | |
85 # Columns: barcode, [stat columns], read sequence. | |
86 for (i, read) in enumerate(family): | |
87 sys.stdout.write(barcode+'\t') | |
88 for stat in stats: | |
89 if stat == 'diffs': | |
90 sys.stdout.write('{}\t'.format(round_sig_figs(diffs[i], 3))) | |
91 elif stat == 'diffs-binned': | |
92 if diffs_binned is None: | |
93 sys.stdout.write('\t' * args.bins) | |
94 else: | |
95 for diff in diffs_binned[i]: | |
96 sys.stdout.write(str(round_sig_figs(diff, 3))+'\t') | |
97 elif stat == 'seqlen': | |
98 sys.stdout.write('{}\t'.format(len(read))) | |
99 elif stat == 'strand': | |
100 sys.stdout.write('{}\t'.format(strand)) | |
101 print read.upper() | |
102 | |
103 | |
104 def get_strand(seq, probes, thres): | |
105 """Determine which strand the sequence comes from by trying to align probes from the sense strand. | |
106 Returns 'sense', 'anti', or None. | |
107 Algorithm: This tries each probe in both directions. | |
108 If at least one of the alignments has an identity above the threshold, a vote is cast for the | |
109 direction with a higher identity. | |
110 If the votes that were cast are unanimous for one direction, that strand is returned. | |
111 Else, return None.""" | |
112 votes = [] | |
113 for probe in probes: | |
114 alignment = swalign.smith_waterman(seq, probe) | |
115 sense_id = alignment.matches/len(probe) | |
116 alignment = swalign.smith_waterman(seq, seqtools.get_revcomp(probe)) | |
117 anti_id = alignment.matches/len(probe) | |
118 # print '{}: sense: {}, anti: {}'.format(probe, sense_id, anti_id) | |
119 if sense_id > thres or anti_id > thres: | |
120 if sense_id > anti_id: | |
121 votes.append('sense') | |
122 else: | |
123 votes.append('anti') | |
124 strand = None | |
125 for vote in votes: | |
126 if strand: | |
127 if strand != vote: | |
128 return None | |
129 else: | |
130 strand = vote | |
131 return strand | |
132 | |
133 | |
134 def round_sig_figs(n, figs): | |
135 if n == 0: | |
136 return n | |
137 elif n < 0: | |
138 n = -n | |
139 sign = -1 | |
140 elif n > 0: | |
141 sign = 1 | |
142 elif math.isnan(n) or n == INF: | |
143 return n | |
144 magnitude = int(math.floor(math.log10(n))) | |
145 return sign * round(n, figs - 1 - magnitude) | |
146 | |
147 | |
148 def fail(message): | |
149 sys.stderr.write(message+"\n") | |
150 sys.exit(1) | |
151 | |
152 | |
153 if __name__ == '__main__': | |
154 sys.exit(main(sys.argv)) |