view misc/msa_sscs_matcher.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents af383638de66
children
line wrap: on
line source

#!/usr/bin/env python
from __future__ import division
import sys
import argparse

# Defaults handed to parser.set_defaults(); empty because no option currently needs one.
OPT_DEFAULTS = {}
USAGE = "gunzip -c families.msa.tsv.gz | %(prog)s sscs.set.fa"
DESCRIPTION = """Find the input MSA's which produced a given set of SSCS's. Pipe the full set of
MSA's to stdin and it will filter them to the matching MSA's on stdout."""


def main(argv):
  """Print the MSA lines from stdin whose family name appears in the SSCS FASTA file.

  argv is the complete argument list, program name included (argv[0] is skipped). Its
  single positional argument is the path to the SSCS FASTA file.

  Every header line (one starting with '>') of that file contributes its first
  whitespace-delimited word as a name. Each stdin line must hold six tab-delimited
  fields (barcode, order, mate, rname, seq, qual); it is echoed unchanged to stdout
  when "barcode.order.mate" is one of the collected names. Blank stdin lines are ignored.

  Returns None. Raises ValueError for a non-blank stdin line that does not have exactly
  six fields.
  """

  parser = argparse.ArgumentParser(usage=USAGE, description=DESCRIPTION)
  parser.set_defaults(**OPT_DEFAULTS)

  parser.add_argument('sscs', metavar='sscs.set.fa',
    help='A set of SSCS\'s, as output from the duplex.py script with the --sscs-file option.')

  args = parser.parse_args(argv[1:])

  # Gather the names of all the SSCS's we are looking for.
  sscs = set()
  with open(args.sscs) as sscs_file:
    for line in sscs_file:
      if line.startswith('>'):
        header_words = line.lstrip('>').split()
        # A header with no name (e.g. a bare ">") cannot match any MSA line, so skip it
        # rather than dying with an IndexError.
        if header_words:
          sscs.add(header_words[0])

  for line_num, line in enumerate(sys.stdin, 1):
    fields = line.rstrip('\r\n').split('\t')
    if fields == ['']:
      # Blank line (commonly a trailing newline at the end of the stream).
      continue
    if len(fields) != 6:
      raise ValueError('Line {} of the MSA input has {} tab-delimited fields instead of 6.'
                       .format(line_num, len(fields)))
    # Only the first three columns (barcode, order, mate) identify the family.
    name = '.'.join(fields[:3])
    if name in sscs:
      # Write the original line so its line ending is preserved exactly.
      sys.stdout.write(line)


# Script entry point: run main() on the real command line and hand its return value to
# sys.exit() as the process exit status.
if __name__ == '__main__':
  exit_status = main(sys.argv)
  sys.exit(exit_status)