diff misc/sscs_diff.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents af383638de66
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/misc/sscs_diff.py	Thu Feb 02 18:44:31 2017 -0500
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+from __future__ import division
+import sys
+import argparse
+import subprocess
+
+OPT_DEFAULTS = {}
+USAGE = "%(prog)s [options]"
+DESCRIPTION = """Find differences between the SSCS produced by one version of the pipeline and
+another, when working on the same input MSA's."""
+EPILOG = """Requires the cat, gunzip, paste, and sort commands to be available on the PATH.
+"""
+
+
+def main(argv):
+  """Write out the SSCSs whose sequences differ between two pipeline versions.
+
+  Both inputs are FASTA files with one header line and one sequence line per
+  record. Records are matched by name (the first whitespace-delimited word of
+  the header). For every name present in both inputs whose sequences differ,
+  the earlier record is written to the --before file and the later record to
+  the --after file, both under the header taken from the later input.
+
+  Args:
+    argv: The full argument vector, including the program name at argv[0].
+
+  Returns:
+    None, which the caller passes to sys.exit() as a success status.
+  """
+
+  parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG)
+  parser.set_defaults(**OPT_DEFAULTS)
+
+  parser.add_argument('sscs_before', metavar='sscs.all.before.fa',
+    help='SSCSs from earlier version (can be gzipped).')
+  parser.add_argument('sscs_after', metavar='sscs.all.after.fa',
+    help='SSCSs from later version (can be gzipped).')
+  parser.add_argument('-b', '--before', metavar='sscs.all.before.diffs.fa', required=True,
+    help='Output SSCSs from earlier version that differ from the SSCS in the later version here.')
+  parser.add_argument('-a', '--after', metavar='sscs.all.after.diffs.fa', required=True,
+    help='Output SSCSs from later version that differ from the SSCS in the earlier version here.')
+
+  args = parser.parse_args(argv[1:])
+
+  # Map each SSCS name from the earlier version to its sequence. If a name
+  # occurs more than once, the record that sorts last wins.
+  sscs_before = {}
+  for header_fields, seq in _read_records(args.sscs_before):
+    sscs_before[header_fields[0]] = seq
+
+  # The context manager closes both outputs even if reading the input fails.
+  with open(args.before, 'w') as before_fh, open(args.after, 'w') as after_fh:
+    for header_fields, seq_after in _read_records(args.sscs_after):
+      name = header_fields[0]
+      if name not in sscs_before:
+        continue
+      seq_before = sscs_before[name]
+      if seq_before == seq_after:
+        continue
+      # Re-join the header words with single spaces. For the usual
+      # "name family_size" header this is exactly ">name family_size".
+      header_line = '>' + ' '.join(header_fields) + '\n'
+      before_fh.write(header_line)
+      before_fh.write(seq_before+'\n')
+      after_fh.write(header_line)
+      after_fh.write(seq_after+'\n')
+
+
+def _read_records(path):
+  """Yield (header_fields, sequence) for each record of a FASTA file, in sorted order.
+
+  Runs the equivalent of "cat path | paste - - | sort" (or "gunzip -c" instead
+  of "cat" when the path ends in ".gz"). Each command is started with an
+  argument list and the processes are connected directly by pipes, so the path
+  is never interpreted by a shell.
+
+  Args:
+    path: Path to the FASTA file. A ".gz" suffix selects gunzip.
+
+  Yields:
+    Tuples of (header_fields, sequence), where header_fields is the header line
+    with its leading ">" removed and split on whitespace. Records with an empty
+    header (e.g. produced by blank lines in the input) are skipped.
+
+  Exits via fail() if any command in the pipeline returns a non-zero status.
+  """
+  # "--" stops option parsing so a path starting with "-" is treated as a file.
+  if path.endswith('.gz'):
+    reader_cmd = ['gunzip', '-c', '--', path]
+  else:
+    reader_cmd = ['cat', '--', path]
+  paste_cmd = ['paste', '-', '-']
+  sort_cmd = ['sort']
+
+  reader = subprocess.Popen(reader_cmd, stdout=subprocess.PIPE)
+  paster = subprocess.Popen(paste_cmd, stdin=reader.stdout, stdout=subprocess.PIPE)
+  # Drop this process's copy of each intermediate pipe so an upstream command
+  # gets SIGPIPE if the command downstream of it exits early.
+  reader.stdout.close()
+  sorter = subprocess.Popen(sort_cmd, stdin=paster.stdout, stdout=subprocess.PIPE)
+  paster.stdout.close()
+
+  for raw_line in sorter.stdout:
+    # On Python 3 the pipe yields bytes; on Python 2 it already yields str.
+    if not isinstance(raw_line, str):
+      raw_line = raw_line.decode('utf-8')
+    fields = raw_line.rstrip('\r\n').split('\t')
+    header_fields = fields[0].lstrip('>').split()
+    if not header_fields:
+      continue
+    yield header_fields, fields[1]
+  sorter.stdout.close()
+
+  # Reap every child and surface failures (e.g. a missing or corrupt input)
+  # instead of silently producing empty output.
+  for cmd, proc in ((reader_cmd, reader), (paste_cmd, paster), (sort_cmd, sorter)):
+    status = proc.wait()
+    if status != 0:
+      fail('Error: "{}" exited with status {} while reading "{}".'.format(cmd[0], status, path))
+
+
+def fail(message):
+  """Write message and a trailing newline to stderr, then exit with status 1."""
+  error_line = message+"\n"
+  sys.stderr.write(error_line)
+  raise SystemExit(1)
+
+
+if __name__ == '__main__':
+  # Run only when executed as a script; main()'s return value becomes the
+  # process exit status.
+  exit_status = main(sys.argv)
+  sys.exit(exit_status)