Mercurial > repos > nick > duplex
diff misc/msa_sscs_matcher.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | af383638de66 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/msa_sscs_matcher.py Thu Feb 02 18:44:31 2017 -0500 @@ -0,0 +1,37 @@ +#!/usr/bin/env python +from __future__ import division +import sys +import argparse + +OPT_DEFAULTS = {} +USAGE = "gunzip -c families.msa.tsv.gz | %(prog)s sscs.set.fa" +DESCRIPTION = """Find the input MSA's which produced a given set of SSCS's. Pipe the full set of +MSA's to stdin and it will filter them to the matching MSA's on stdout.""" + + +def main(argv): + + parser = argparse.ArgumentParser(usage=USAGE, description=DESCRIPTION) + parser.set_defaults(**OPT_DEFAULTS) + + parser.add_argument('sscs', metavar='sscs.set.fa', + help='A set of SSCS\'s, as output from the duplex.py script with the --sscs-file option.') + + args = parser.parse_args(argv[1:]) + + sscs = set() + with open(args.sscs) as sscs_file: + for line in sscs_file: + if line.startswith('>'): + name = line.lstrip('>').split()[0] + sscs.add(name) + + for line in sys.stdin: + barcode, order, mate, rname, seq, qual = line.rstrip('\r\n').split('\t') + name = '.'.join((barcode, order, mate)) + if name in sscs: + sys.stdout.write(line) + + +if __name__ == '__main__': + sys.exit(main(sys.argv))