diff utils/outconv.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/outconv.py	Thu Feb 02 18:44:31 2017 -0500
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+from __future__ import division
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import unicode_literals
+import sys
+import argparse
+
+ARG_DEFAULTS = {'input':sys.stdin}
+USAGE = "%(prog)s [options]"
+DESCRIPTION = """Split interleaved outputs of dunovo.py into pairs of paired-end files."""
+
+
+def main(argv):
+
+  parser = argparse.ArgumentParser(description=DESCRIPTION)
+  parser.set_defaults(**ARG_DEFAULTS)
+
+  parser.add_argument('input', metavar='consensuses.fa', type=argparse.FileType('r'),
+    help='Interleaved consensus reads (DCS or SSCS) from dunovo.py.')
+  parser.add_argument('-1', '--out1', metavar='out_1.fa', type=argparse.FileType('w'), required=True,
+    help='Output filename for mate 1 reads. CAUTION: Will overwrite any existing file.')
+  parser.add_argument('-2', '--out2', metavar='out_2.fa', type=argparse.FileType('w'), required=True,
+    help='Output filename for mate 2 reads. CAUTION: Will overwrite any existing file.')
+
+  args = parser.parse_args(argv[1:])
+
+  """
+SSCS:
+>AAAAAAAAAAAACTAAAATACAAA.ab.1 15
+ACTGATGAAAAGGCTGTTATTGTATCTGATGTGTAGTGTATGGCTAAGAAAAGACCTGTAATGATTTGGACTATTAGGCAGACTCCTAGAAGGGACCCAA
+>AAAAAAAAAAAACTAAAATACAAA.ab.2 15
+ATCCAAACACAACCAACATCCCCCCTAAATAAATTAAAAAAACTATTAAACCTAAAAACGATCCACCAAACCCTAAAACCATTAAACAACCAACAAACCC
+>AAAAAAAAAAAAGACCACGTTTCT.ab.1 15
+TGGAATTCAGCCTACTAGCAATTATCCCCATACTAATCAACAAAAAAAACCCACGATCAACTGAAGCAGCAACAAAATACTTCGTCACACAAGCAACAGC
+>AAAAAAAAAAAAGACCACGTTTCT.ba.2 18
+TGGAATTCAGCCTACTAGCAATTATCCCCATACTAATCAACAAAAAAAACCCACGATCAACTGAAGCAGCAACAAAATACTTCGTCACACAAGCAACAGC
+>AAAAAAAAAAAAGACCACGTTTCT.ab.2 15
+GTAGAGTTGAGTAGCGGGTAAATTTGAATTAAAATTGATAGGGGAGCAATTTTTTGTCATGTAAGAAGAATAAGTCCTATGTGCAGTGGGATCCCTTGAG
+>AAAAAAAAAAAAGACCACGTTTCT.ba.1 18
+GTAGAGTTGAGTAGCGGGTAAATTTGAATTAAAATTGATAGGGGAGCAATTTTTTGTCATGTAAGAAGAATAAGTCCTATGTGCAGTGGGATCCCTTGAG
+
+DCS:
+>AAAAAAAACACCAAATACGCCTAC.1 28-39
+GCTATGAATATAGGGGCTGTAAGAATAATATAGATTATGAGGTTGAGTAGAGTGAGGGATGGGTTGTAAGGAAGAATTGCTAATATTCATCCTATGTGGG
+>AAAAAAAACACCAAATACGCCTAC.2 28-39
+TACTTCGTCACACAAGCAACAGCCTCAATAATTATCCTCCTGGCCATCGTACTCAACTATAAACAACTAGGAACATGAATATTTCAACAACAAACAAACG
+  """
+
+  intype = None
+  line_num = 0
+  sscs_buffer = {}
+  for line_raw in args.input:
+    line_num += 1
+    line = line_raw.rstrip('\r\n')
+    if line.startswith('>'):
+      # We're in a header line.
+      barcode, strand, mate, famsizes = parse_header(line)
+      if intype is None:
+        if strand is None:
+          intype = 'DCS'
+        else:
+          intype = 'SSCS'
+    else:
+      # We're in a sequence line.
+      seq = line
+      if intype == 'DCS':
+        if mate == '1':
+          args.out1.write('>{} {}\n{}\n'.format(barcode, famsizes, seq))
+        elif mate == '2':
+          args.out2.write('>{} {}\n{}\n'.format(barcode, famsizes, seq))
+        else:
+          fail('Error: Invalid mate "{}" on line {}.'.format(mate, line_num))
+      elif intype == 'SSCS':
+        # Need to get SSCS in correct order.
+        # Collect reads until they're properly paired, then print both.
+        if (barcode, strand) in sscs_buffer:
+          stored_read = sscs_buffer[(barcode, strand)]
+          new_read = {'mate':mate, 'famsizes':famsizes, 'seq':seq}
+          if stored_read['mate'] == '1' and mate == '2':
+            read1, read2 = stored_read, new_read
+          elif stored_read['mate'] == '2' and mate == '1':
+            read1, read2 = new_read, stored_read
+          args.out1.write('>{}.{} {famsizes}\n{seq}\n'.format(barcode, strand, **read1))
+          args.out2.write('>{}.{} {famsizes}\n{seq}\n'.format(barcode, strand, **read2))
+          del sscs_buffer[(barcode, strand)]
+        else:
+          sscs_buffer[(barcode, strand)] = {'mate':mate, 'famsizes':famsizes, 'seq':seq}
+
+
+def parse_header(header):
+  read_id, famsizes = header.split()
+  id_fields = read_id[1:].split('.')
+  barcode = id_fields[0]
+  strand = None
+  mate = id_fields[-1]
+  if len(id_fields) == 3:
+    strand = id_fields[1]
+  return barcode, strand, mate, famsizes
+
+
+def fail(message):
+  sys.stderr.write(message+"\n")
+  sys.exit(1)
+
+if __name__ == '__main__':
+  sys.exit(main(sys.argv))