annotate utils/subsample.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents af383638de66
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
1 #!/usr/bin/env python
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
2 from __future__ import division
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
3 import sys
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
4 import random
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
5 import argparse
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
6
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
7 OPT_DEFAULTS = {'fraction':0.1, 'seed':1}
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
8 USAGE = "%(prog)s [options]"
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
9 DESCRIPTION = """"""
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
10
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
11 def main(argv):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
12
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
13 parser = argparse.ArgumentParser(description=DESCRIPTION)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
14 parser.set_defaults(**OPT_DEFAULTS)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
15
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
16 parser.add_argument('infile', metavar='read-families.tsv', nargs='?',
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
17 help='The input reads, sorted into families.')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
18 parser.add_argument('-f', '--fraction', type=float,
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
19 help='Fraction of families to output. Default: %(default)s')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
20 parser.add_argument('-s', '--seed', type=int,
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
21 help='Random number generator seed. Default: %(default)s')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
22
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
23 args = parser.parse_args(argv[1:])
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
24
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
25 random.seed(args.seed)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
26
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
27 if args.infile:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
28 infile = open(args.infile)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
29 else:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
30 infile = sys.stdin
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
31
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
32 family = []
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
33 last_barcode = None
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
34 for line in infile:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
35 fields = line.rstrip('\r\n').split('\t')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
36 if not fields:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
37 continue
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
38 barcode = fields[0]
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
39 if barcode != last_barcode:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
40 if random.random() <= args.fraction:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
41 sys.stdout.write(''.join(family))
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
42 family = []
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
43 family.append(line)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
44 last_barcode = barcode
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
45
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
46 if infile is not sys.stdin:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
47 infile.close()
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
48
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
49
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
50 def fail(message):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
51 sys.stderr.write(message+"\n")
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
52 sys.exit(1)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
53
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
54 if __name__ == '__main__':
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
55 sys.exit(main(sys.argv))