Mercurial > repos > nick > duplex
diff utils/subsample.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | af383638de66 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/subsample.py Thu Feb 02 18:44:31 2017 -0500 @@ -0,0 +1,55 @@ +#!/usr/bin/env python +from __future__ import division +import sys +import random +import argparse + +OPT_DEFAULTS = {'fraction':0.1, 'seed':1} +USAGE = "%(prog)s [options]" +DESCRIPTION = """""" + +def main(argv): + + parser = argparse.ArgumentParser(description=DESCRIPTION) + parser.set_defaults(**OPT_DEFAULTS) + + parser.add_argument('infile', metavar='read-families.tsv', nargs='?', + help='The input reads, sorted into families.') + parser.add_argument('-f', '--fraction', type=float, + help='Fraction of families to output. Default: %(default)s') + parser.add_argument('-s', '--seed', type=int, + help='Random number generator seed. Default: %(default)s') + + args = parser.parse_args(argv[1:]) + + random.seed(args.seed) + + if args.infile: + infile = open(args.infile) + else: + infile = sys.stdin + + family = [] + last_barcode = None + for line in infile: + fields = line.rstrip('\r\n').split('\t') + if not fields: + continue + barcode = fields[0] + if barcode != last_barcode: + if random.random() <= args.fraction: + sys.stdout.write(''.join(family)) + family = [] + family.append(line) + last_barcode = barcode + + if infile is not sys.stdin: + infile.close() + + +def fail(message): + sys.stderr.write(message+"\n") + sys.exit(1) + +if __name__ == '__main__': + sys.exit(main(sys.argv))