diff misc/msa_sscs_matcher.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents af383638de66
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/misc/msa_sscs_matcher.py	Thu Feb 02 18:44:31 2017 -0500
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+from __future__ import division
+import sys
+import argparse
+
+OPT_DEFAULTS = {}  # Defaults handed to parser.set_defaults() in main(); currently none.
+USAGE = "gunzip -c families.msa.tsv.gz | %(prog)s sscs.set.fa"
+DESCRIPTION = """Find the input MSA's which produced a given set of SSCS's. Pipe the full set of
+MSA's to stdin and it will filter them to the matching MSA's on stdout."""
+
+
+def main(argv):
+  """Filter MSA rows on stdin down to the families named in the SSCS FASTA file given in argv.
+  A row's name is "barcode.order.mate", joined from its first three tab-separated columns."""
+  parser = argparse.ArgumentParser(usage=USAGE, description=DESCRIPTION)
+  parser.set_defaults(**OPT_DEFAULTS)
+  parser.add_argument('sscs', metavar='sscs.set.fa',
+    help='A set of SSCS\'s, as output from the duplex.py script with the --sscs-file option.')
+  args = parser.parse_args(argv[1:])
+
+  sscs = set()
+  with open(args.sscs) as sscs_file:
+    for line in sscs_file:
+      # The name is the first word after the '>'; a bare '>' header has no name, so skip it.
+      words = line.lstrip('>').split() if line.startswith('>') else None
+      if words:
+        sscs.add(words[0])
+
+  for line in sys.stdin:
+    # Only the first three columns are needed; blank or short rows cannot match, so skip them.
+    fields = line.rstrip('\r\n').split('\t')
+    if len(fields) >= 3 and '.'.join(fields[:3]) in sscs:
+      sys.stdout.write(line)
+
+
+if __name__ == '__main__':  # Run only when executed as a script, not when imported.
+  sys.exit(main(sys.argv))