comparison utils/stats.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents af383638de66
children
comparison
equal deleted inserted replaced
17:836fa4fe9494 18:e4d75f9efb90
1 #!/usr/bin/env python
2 from __future__ import division
3 import os
4 import sys
5 import math
6 import argparse
7 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
8 import seqtools
9 import swalign
10
11 INF = float('inf')
12 STATS = ('diffs', 'diffs-binned', 'seqlen', 'strand')
13 OPT_DEFAULTS = {'bins':10, 'probes':'', 'thres':0.75}
14 USAGE = "%(prog)s [options]"
15 DESCRIPTION = """"""
16
17
18 def main(argv):
19
20 parser = argparse.ArgumentParser(description=DESCRIPTION)
21 parser.set_defaults(**OPT_DEFAULTS)
22
23 parser.add_argument('stats',
24 help='The type of statistics to compute and print. Give a comma-separated list of stat names, '
25 'choosing from "{}".'.format('", "'.join(STATS)))
26 parser.add_argument('infile', metavar='read-families.msa.tsv', nargs='?',
27 help='The --msa output of sscs.py. Will read from stdin if not provided.')
28 parser.add_argument('-b', '--bins', type=int,
29 help='The number of bins to segment reads into when doing "diffs-binned".')
30 parser.add_argument('-p', '--probes',
31 help='Sequence excerpts from the sense strand. Required for "strand" statistic. '
32 'Comma-separated.')
33 parser.add_argument('-t', '--thres', type=int,
34 help='Alignment identity threshold (in fraction, not decimal). Default: %(default)s')
35
36 args = parser.parse_args(argv[1:])
37
38 stats = args.stats.split(',')
39 for stat in stats:
40 if stat not in STATS:
41 fail('Error: invalid statistic "{}". Must choose one of "{}".'.format(stat, '", "'.join(STATS)))
42 if 'strand' in stats and not args.probes:
43 fail('Error: must provide a probe if requesting "strand" statistic.')
44
45 if args.infile:
46 infile = open(args.infile)
47 else:
48 infile = sys.stdin
49
50 family = []
51 consensus = None
52 barcode = None
53 for line in infile:
54 fields = line.rstrip('\r\n').split('\t')
55 if len(fields) != 3:
56 continue
57 (this_barcode, name, seq) = fields
58 if fields[1] == 'CONSENSUS':
59 if family and consensus:
60 process_family(stats, barcode, consensus, family, args)
61 barcode = this_barcode
62 consensus = seq
63 family = []
64 else:
65 family.append(seq)
66 if family and consensus:
67 process_family(stats, barcode, consensus, family, args)
68
69 if infile is not sys.stdin:
70 infile.close()
71
72
73 #TODO: Maybe print the number of N's in the consensus?
74 def process_family(stats, barcode, consensus, family, args):
75 # Compute stats requiring the whole family at once.
76 for stat in stats:
77 if stat == 'diffs':
78 diffs = seqtools.get_diffs_frac_simple(consensus, family)
79 elif stat == 'diffs-binned':
80 diffs_binned = seqtools.get_diffs_frac_binned(consensus, family, args.bins)
81 elif stat == 'strand':
82 probes = args.probes.split(',')
83 strand = get_strand(consensus, probes, args.thres)
84 # Print the requested stats for each read.
85 # Columns: barcode, [stat columns], read sequence.
86 for (i, read) in enumerate(family):
87 sys.stdout.write(barcode+'\t')
88 for stat in stats:
89 if stat == 'diffs':
90 sys.stdout.write('{}\t'.format(round_sig_figs(diffs[i], 3)))
91 elif stat == 'diffs-binned':
92 if diffs_binned is None:
93 sys.stdout.write('\t' * args.bins)
94 else:
95 for diff in diffs_binned[i]:
96 sys.stdout.write(str(round_sig_figs(diff, 3))+'\t')
97 elif stat == 'seqlen':
98 sys.stdout.write('{}\t'.format(len(read)))
99 elif stat == 'strand':
100 sys.stdout.write('{}\t'.format(strand))
101 print read.upper()
102
103
104 def get_strand(seq, probes, thres):
105 """Determine which strand the sequence comes from by trying to align probes from the sense strand.
106 Returns 'sense', 'anti', or None.
107 Algorithm: This tries each probe in both directions.
108 If at least one of the alignments has an identity above the threshold, a vote is cast for the
109 direction with a higher identity.
110 If the votes that were cast are unanimous for one direction, that strand is returned.
111 Else, return None."""
112 votes = []
113 for probe in probes:
114 alignment = swalign.smith_waterman(seq, probe)
115 sense_id = alignment.matches/len(probe)
116 alignment = swalign.smith_waterman(seq, seqtools.get_revcomp(probe))
117 anti_id = alignment.matches/len(probe)
118 # print '{}: sense: {}, anti: {}'.format(probe, sense_id, anti_id)
119 if sense_id > thres or anti_id > thres:
120 if sense_id > anti_id:
121 votes.append('sense')
122 else:
123 votes.append('anti')
124 strand = None
125 for vote in votes:
126 if strand:
127 if strand != vote:
128 return None
129 else:
130 strand = vote
131 return strand
132
133
134 def round_sig_figs(n, figs):
135 if n == 0:
136 return n
137 elif n < 0:
138 n = -n
139 sign = -1
140 elif n > 0:
141 sign = 1
142 elif math.isnan(n) or n == INF:
143 return n
144 magnitude = int(math.floor(math.log10(n)))
145 return sign * round(n, figs - 1 - magnitude)
146
147
148 def fail(message):
149 sys.stderr.write(message+"\n")
150 sys.exit(1)
151
152
153 if __name__ == '__main__':
154 sys.exit(main(sys.argv))