Mercurial > repos > nick > duplex
diff misc/sscs_diff.py @ 4:af383638de66 draft
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
| author | nick |
|---|---|
| date | Mon, 23 Nov 2015 18:44:23 -0500 |
| parents | |
| children | |
line wrap: on
line diff
#!/usr/bin/env python
# Reconstructed from the hgweb diff view of misc/sscs_diff.py (changeset 4:af383638de66).
from __future__ import division
import sys
import argparse
import subprocess

OPT_DEFAULTS = {}
USAGE = "%(prog)s [options]"
DESCRIPTION = """Find differences between the SSCS produced by one version of the pipeline and
another, when working on the same input MSA's."""
EPILOG = """Note: This requires the external commands cat, gunzip, paste, and sort. File names are
passed to them directly as arguments, never through a shell.
"""


def main(argv):
    """Write out the SSCSs whose sequence differs between two pipeline runs.

    Both inputs are two-line-per-record FASTA files (optionally gzipped, as
    judged by a ".gz" suffix). Records are matched on the first
    whitespace-delimited word of the header line. For every name present in
    both files whose sequences differ, the "before" sequence is written to the
    --before file and the "after" sequence to the --after file; both copies
    use the header found in the "after" input. Output follows the order in
    which `sort` emits the "after" records.

    Args:
        argv: Full argument vector, including the program name at index 0.

    Returns:
        None, so `sys.exit(main(...))` exits with status 0 on success.

    Raises:
        SystemExit: On bad arguments (argparse), malformed input, or when one
            of the external commands exits with a nonzero status.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG)
    parser.set_defaults(**OPT_DEFAULTS)

    parser.add_argument('sscs_before', metavar='sscs.all.before.fa',
                        help='SSCSs from earlier version (can be gzipped).')
    parser.add_argument('sscs_after', metavar='sscs.all.after.fa',
                        help='SSCSs from later version (can be gzipped).')
    parser.add_argument('-b', '--before', metavar='sscs.all.before.diffs.fa', required=True,
                        help='Output SSCSs from earlier version that differ from the SSCS in the later version here.')
    parser.add_argument('-a', '--after', metavar='sscs.all.after.diffs.fa', required=True,
                        help='Output SSCSs from later version that differ from the SSCS in the earlier version here.')

    args = parser.parse_args(argv[1:])

    # Load the whole "before" set first, so that a failure while reading it
    # happens before any output file is created.
    sscs_before = {}
    for header_fields, seq in _read_sorted_records(args.sscs_before):
        sscs_before[header_fields[0]] = seq

    # Everything is handled as bytes end to end: the data is only compared and
    # copied, so no decoding is needed (and this behaves the same on Python 2
    # and Python 3).
    with open(args.before, 'wb') as before_fh, open(args.after, 'wb') as after_fh:
        for header_fields, seq_after in _read_sorted_records(args.sscs_after):
            seq_before = sscs_before.get(header_fields[0])
            if seq_before is None or seq_before == seq_after:
                continue
            header_line = b'>' + b' '.join(header_fields) + b'\n'
            before_fh.write(header_line)
            before_fh.write(seq_before + b'\n')
            after_fh.write(header_line)
            after_fh.write(seq_after + b'\n')


def _read_sorted_records(path):
    """Yield (header_fields, sequence) for each record of a FASTA file.

    Runs the pipeline `cat|gunzip -c  ->  paste - -  ->  sort` on `path`
    without involving a shell, so the file name cannot be interpreted as shell
    syntax. `header_fields` is the header line (minus leading ">") split on
    whitespace, as a non-empty list of bytes; `sequence` is bytes. Records with
    an empty header (e.g. from trailing blank lines) are skipped.
    """
    if path.endswith('.gz'):
        reader = ['gunzip', '-c', '--', path]
    else:
        reader = ['cat', '--', path]
    # "--" keeps a file name starting with "-" from being read as an option.
    commands = [reader, ['paste', '-', '-'], ['sort']]
    procs = []
    try:
        upstream = None
        for command in commands:
            proc = subprocess.Popen(command, stdin=upstream, stdout=subprocess.PIPE)
            procs.append(proc)
            if upstream is not None:
                # The child holds its own copy; closing ours lets the upstream
                # process notice if the downstream one exits early.
                upstream.close()
            upstream = proc.stdout
        for line in upstream:
            fields = line.rstrip(b'\r\n').split(b'\t')
            if len(fields) < 2:
                fail('Error: Malformed record (no tab-separated sequence) in {}'.format(path))
            header_fields = fields[0].lstrip(b'>').split()
            if not header_fields:
                continue
            yield header_fields, fields[1]
    finally:
        # Always release the pipe and reap the children, even if the consumer
        # stops early or an error occurs.
        if procs:
            procs[-1].stdout.close()
        for proc in procs:
            proc.wait()
    # Only reached when the output was read to the end: report any stage that
    # failed (e.g. a missing or corrupt input file) instead of silently
    # treating the input as empty.
    for command, proc in zip(commands, procs):
        if proc.returncode != 0:
            fail('Error: Command "{}" exited with status {}.'
                 .format(' '.join(command), proc.returncode))


def fail(message):
    """Print `message` to stderr and exit the program with status 1."""
    sys.stderr.write(message+"\n")
    sys.exit(1)


if __name__ == '__main__':
    sys.exit(main(sys.argv))
