annotate misc/sscs_diff.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents af383638de66
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
1 #!/usr/bin/env python
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
2 from __future__ import division
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
3 import sys
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
4 import argparse
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
5 import subprocess
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
6
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
OPT_DEFAULTS = {}
USAGE = "%(prog)s [options]"
DESCRIPTION = """Find differences between the SSCS produced by one version of the pipeline and
another, when working on the same input MSA's."""
EPILOG = """Requires the `paste` and `sort` utilities (plus `gunzip` for gzipped input) on the PATH.
Input filenames are never passed through a shell.
"""


def _sorted_records(fasta_path):
    """Yield (header, sequence) byte-string pairs from a two-line-per-record FASTA file.

    Runs the equivalent of `cat FILE | paste - - | sort` (with `gunzip -c` as
    the first stage when the path ends in '.gz'), but without a shell: the
    input file is opened here and handed to the first process as its stdin,
    so the filename never appears on any command line.

    The header has its leading '>' characters stripped. Raises
    subprocess.CalledProcessError if any stage exits with a non-zero status,
    and IOError/OSError if the file or one of the utilities cannot be opened.
    """
    pipeline = [['paste', '-', '-'], ['sort']]
    if fasta_path.endswith('.gz'):
        pipeline.insert(0, ['gunzip', '-c'])
    stages = []
    with open(fasta_path, 'rb') as raw_fh:
        try:
            stream = raw_fh
            for command in pipeline:
                process = subprocess.Popen(command, stdin=stream, stdout=subprocess.PIPE)
                if stages:
                    # Drop our copy of the upstream pipe so the upstream process
                    # sees a broken pipe if the downstream one exits early.
                    stream.close()
                stages.append((command, process))
                stream = process.stdout
            # Lines are handled as bytes so the same code runs on Python 2 and 3.
            for line in stream:
                header, _, rest = line.rstrip(b'\r\n').partition(b'\t')
                yield header.lstrip(b'>'), rest.partition(b'\t')[0]
        finally:
            # Close every pipe before waiting so no stage can block on a write.
            for _, process in stages:
                process.stdout.close()
            for _, process in stages:
                process.wait()
    for command, process in stages:
        if process.returncode != 0:
            raise subprocess.CalledProcessError(process.returncode, command)


def main(argv):
    """Write out the SSCSs whose sequence differs between two pipeline runs.

    argv is the full argument vector, including the program name at index 0.
    Every read name present in both inputs whose sequences differ is written
    to the --before file (with the earlier sequence) and to the --after file
    (with the later sequence). Both records carry the header from the
    "after" input, with its whitespace-separated fields joined by single
    spaces. Records are emitted in the order `sort` returns them.

    Returns None on success. If an input cannot be read or a pipeline stage
    fails, prints a message to stderr and returns 1.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG)
    parser.set_defaults(**OPT_DEFAULTS)

    parser.add_argument('sscs_before', metavar='sscs.all.before.fa',
        help='SSCSs from earlier version (can be gzipped).')
    parser.add_argument('sscs_after', metavar='sscs.all.after.fa',
        help='SSCSs from later version (can be gzipped).')
    parser.add_argument('-b', '--before', metavar='sscs.all.before.diffs.fa', required=True,
        help='Output SSCSs from earlier version that differ from the SSCS in the later version here.')
    parser.add_argument('-a', '--after', metavar='sscs.all.after.diffs.fa', required=True,
        help='Output SSCSs from later version that differ from the SSCS in the earlier version here.')

    args = parser.parse_args(argv[1:])

    try:
        # Map each read name (first header token) to its earlier sequence.
        sscs_before = {}
        for header, seq in _sorted_records(args.sscs_before):
            tokens = header.split()
            if tokens:
                sscs_before[tokens[0]] = seq

        with open(args.before, 'wb') as before_fh, open(args.after, 'wb') as after_fh:
            for header, seq_after in _sorted_records(args.sscs_after):
                tokens = header.split()
                if not tokens:
                    # Blank header line: there is no name to look up.
                    continue
                seq_before = sscs_before.get(tokens[0])
                if seq_before is None or seq_before == seq_after:
                    continue
                # Joining all tokens reproduces ">name fam_size" for ordinary
                # two-field headers and tolerates headers with fewer or more.
                label = b'>' + b' '.join(tokens) + b'\n'
                before_fh.write(label + seq_before + b'\n')
                after_fh.write(label + seq_after + b'\n')
    except (IOError, OSError, subprocess.CalledProcessError) as error:
        sys.stderr.write('Error: {}\n'.format(error))
        return 1
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
66
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
67
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
def fail(message):
    """Write *message* plus a trailing newline to stderr, then exit with status 1."""
    terminated = "".join((message, "\n"))
    sys.stderr.write(terminated)
    raise SystemExit(1)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
71
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
72
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
# Script entry point: run main() on the process arguments and use its return
# value as the exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)