diff utils/subsample.py @ 4:af383638de66 draft

planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
author nick
date Mon, 23 Nov 2015 18:44:23 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/subsample.py	Mon Nov 23 18:44:23 2015 -0500
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+from __future__ import division
+import sys
+import random
+import argparse
+
+OPT_DEFAULTS = dict(fraction=0.1, seed=1)  # Defaults that main() hands to parser.set_defaults().
+USAGE = '%(prog)s [options]'  # argparse-style usage template; not referenced by the code shown here.
+DESCRIPTION = ''  # Passed as the ArgumentParser description in main(); currently empty.
+
+def main(argv):
+  """Write a random subsample of read families from the input to stdout.
+
+  Consecutive lines sharing a first tab-delimited field (the barcode) form one
+  family; each family is written whole when random.random() <= --fraction.
+  argv is the full argument list, program name included. Returns None."""
+  parser = argparse.ArgumentParser(description=DESCRIPTION)
+  parser.set_defaults(**OPT_DEFAULTS)
+  parser.add_argument('infile', metavar='read-families.tsv', nargs='?',
+    help='The input reads, sorted into families.')
+  parser.add_argument('-f', '--fraction', type=float,
+    help='Fraction of families to output. Default: %(default)s')
+  parser.add_argument('-s', '--seed', type=int,
+    help='Random number generator seed. Default: %(default)s')
+  args = parser.parse_args(argv[1:])
+  random.seed(args.seed)
+
+  infile = open(args.infile) if args.infile else sys.stdin
+  try:
+    family = []
+    last_barcode = None
+    for line in infile:
+      barcode = line.rstrip('\r\n').split('\t')[0]
+      if barcode != last_barcode:
+        # Also fires on the very first line, while family is still empty; that
+        # draw is kept so a given seed selects the same families as before.
+        if random.random() <= args.fraction:
+          sys.stdout.write(''.join(family))
+        family = []
+      family.append(line)
+      last_barcode = barcode
+    # No barcode change follows the final family, so it must be decided here.
+    if family and random.random() <= args.fraction:
+      sys.stdout.write(''.join(family))
+  finally:
+    if infile is not sys.stdin:
+      infile.close()
+
+
+def fail(message):  # Report a fatal error: message plus a newline to stderr, then exit status 1.
+  sys.stderr.write(message + "\n")
+  raise SystemExit(1)
+
+if '__main__' == __name__:  # Script entry point: main()'s return value becomes the exit status.
+  raise SystemExit(main(sys.argv))