Mercurial > repos > nick > duplex
comparison utils/stats.py @ 4:af383638de66 draft
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
| author | nick |
|---|---|
| date | Mon, 23 Nov 2015 18:44:23 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 3:13bcc2f459b0 | 4:af383638de66 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 from __future__ import division | |
| 3 import os | |
| 4 import sys | |
| 5 import math | |
| 6 import argparse | |
| 7 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) | |
| 8 import seqtools | |
| 9 import swalign | |
| 10 | |
| 11 INF = float('inf') | |
| 12 STATS = ('diffs', 'diffs-binned', 'seqlen', 'strand') | |
| 13 OPT_DEFAULTS = {'bins':10, 'probes':'', 'thres':0.75} | |
| 14 USAGE = "%(prog)s [options]" | |
| 15 DESCRIPTION = """""" | |
| 16 | |
| 17 | |
| 18 def main(argv): | |
| 19 | |
| 20 parser = argparse.ArgumentParser(description=DESCRIPTION) | |
| 21 parser.set_defaults(**OPT_DEFAULTS) | |
| 22 | |
| 23 parser.add_argument('stats', | |
| 24 help='The type of statistics to compute and print. Give a comma-separated list of stat names, ' | |
| 25 'choosing from "{}".'.format('", "'.join(STATS))) | |
| 26 parser.add_argument('infile', metavar='read-families.msa.tsv', nargs='?', | |
| 27 help='The --msa output of sscs.py. Will read from stdin if not provided.') | |
| 28 parser.add_argument('-b', '--bins', type=int, | |
| 29 help='The number of bins to segment reads into when doing "diffs-binned".') | |
| 30 parser.add_argument('-p', '--probes', | |
| 31 help='Sequence excerpts from the sense strand. Required for "strand" statistic. ' | |
| 32 'Comma-separated.') | |
| 33 parser.add_argument('-t', '--thres', type=int, | |
| 34 help='Alignment identity threshold (in fraction, not decimal). Default: %(default)s') | |
| 35 | |
| 36 args = parser.parse_args(argv[1:]) | |
| 37 | |
| 38 stats = args.stats.split(',') | |
| 39 for stat in stats: | |
| 40 if stat not in STATS: | |
| 41 fail('Error: invalid statistic "{}". Must choose one of "{}".'.format(stat, '", "'.join(STATS))) | |
| 42 if 'strand' in stats and not args.probes: | |
| 43 fail('Error: must provide a probe if requesting "strand" statistic.') | |
| 44 | |
| 45 if args.infile: | |
| 46 infile = open(args.infile) | |
| 47 else: | |
| 48 infile = sys.stdin | |
| 49 | |
| 50 family = [] | |
| 51 consensus = None | |
| 52 barcode = None | |
| 53 for line in infile: | |
| 54 fields = line.rstrip('\r\n').split('\t') | |
| 55 if len(fields) != 3: | |
| 56 continue | |
| 57 (this_barcode, name, seq) = fields | |
| 58 if fields[1] == 'CONSENSUS': | |
| 59 if family and consensus: | |
| 60 process_family(stats, barcode, consensus, family, args) | |
| 61 barcode = this_barcode | |
| 62 consensus = seq | |
| 63 family = [] | |
| 64 else: | |
| 65 family.append(seq) | |
| 66 if family and consensus: | |
| 67 process_family(stats, barcode, consensus, family, args) | |
| 68 | |
| 69 if infile is not sys.stdin: | |
| 70 infile.close() | |
| 71 | |
| 72 | |
| 73 #TODO: Maybe print the number of N's in the consensus? | |
| 74 def process_family(stats, barcode, consensus, family, args): | |
| 75 # Compute stats requiring the whole family at once. | |
| 76 for stat in stats: | |
| 77 if stat == 'diffs': | |
| 78 diffs = seqtools.get_diffs_frac_simple(consensus, family) | |
| 79 elif stat == 'diffs-binned': | |
| 80 diffs_binned = seqtools.get_diffs_frac_binned(consensus, family, args.bins) | |
| 81 elif stat == 'strand': | |
| 82 probes = args.probes.split(',') | |
| 83 strand = get_strand(consensus, probes, args.thres) | |
| 84 # Print the requested stats for each read. | |
| 85 # Columns: barcode, [stat columns], read sequence. | |
| 86 for (i, read) in enumerate(family): | |
| 87 sys.stdout.write(barcode+'\t') | |
| 88 for stat in stats: | |
| 89 if stat == 'diffs': | |
| 90 sys.stdout.write('{}\t'.format(round_sig_figs(diffs[i], 3))) | |
| 91 elif stat == 'diffs-binned': | |
| 92 if diffs_binned is None: | |
| 93 sys.stdout.write('\t' * args.bins) | |
| 94 else: | |
| 95 for diff in diffs_binned[i]: | |
| 96 sys.stdout.write(str(round_sig_figs(diff, 3))+'\t') | |
| 97 elif stat == 'seqlen': | |
| 98 sys.stdout.write('{}\t'.format(len(read))) | |
| 99 elif stat == 'strand': | |
| 100 sys.stdout.write('{}\t'.format(strand)) | |
| 101 print read.upper() | |
| 102 | |
| 103 | |
| 104 def get_strand(seq, probes, thres): | |
| 105 """Determine which strand the sequence comes from by trying to align probes from the sense strand. | |
| 106 Returns 'sense', 'anti', or None. | |
| 107 Algorithm: This tries each probe in both directions. | |
| 108 If at least one of the alignments has an identity above the threshold, a vote is cast for the | |
| 109 direction with a higher identity. | |
| 110 If the votes that were cast are unanimous for one direction, that strand is returned. | |
| 111 Else, return None.""" | |
| 112 votes = [] | |
| 113 for probe in probes: | |
| 114 alignment = swalign.smith_waterman(seq, probe) | |
| 115 sense_id = alignment.matches/len(probe) | |
| 116 alignment = swalign.smith_waterman(seq, seqtools.get_revcomp(probe)) | |
| 117 anti_id = alignment.matches/len(probe) | |
| 118 # print '{}: sense: {}, anti: {}'.format(probe, sense_id, anti_id) | |
| 119 if sense_id > thres or anti_id > thres: | |
| 120 if sense_id > anti_id: | |
| 121 votes.append('sense') | |
| 122 else: | |
| 123 votes.append('anti') | |
| 124 strand = None | |
| 125 for vote in votes: | |
| 126 if strand: | |
| 127 if strand != vote: | |
| 128 return None | |
| 129 else: | |
| 130 strand = vote | |
| 131 return strand | |
| 132 | |
| 133 | |
| 134 def round_sig_figs(n, figs): | |
| 135 if n == 0: | |
| 136 return n | |
| 137 elif n < 0: | |
| 138 n = -n | |
| 139 sign = -1 | |
| 140 elif n > 0: | |
| 141 sign = 1 | |
| 142 elif math.isnan(n) or n == INF: | |
| 143 return n | |
| 144 magnitude = int(math.floor(math.log10(n))) | |
| 145 return sign * round(n, figs - 1 - magnitude) | |
| 146 | |
| 147 | |
| 148 def fail(message): | |
| 149 sys.stderr.write(message+"\n") | |
| 150 sys.exit(1) | |
| 151 | |
| 152 | |
| 153 if __name__ == '__main__': | |
| 154 sys.exit(main(sys.argv)) |
