diff utils/correct-simple.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/correct-simple.py	Thu Feb 02 18:44:31 2017 -0500
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+from __future__ import division
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import unicode_literals
+import sys
+import errno
+import logging
+import argparse
+import subprocess
+
+ARG_DEFAULTS = {'nbarcodes':20, 'mapq_thres':25, 'log':sys.stderr, 'volume':logging.ERROR}
+DESCRIPTION = """"""
+
+
+def main(argv):
+
+  parser = argparse.ArgumentParser(description=DESCRIPTION)
+  parser.set_defaults(**ARG_DEFAULTS)
+
+  parser.add_argument('nbarcodes', metavar='barcodes to try', type=int, nargs='?',
+    help='')
+  parser.add_argument('-s', '--summary', action='store_true',
+    help='Only print the summary of how many families were rescued.')
+  parser.add_argument('-m', '--mapq', type=int)
+  parser.add_argument('-r', '--random', action='store_true')
+  parser.add_argument('-l', '--log', type=argparse.FileType('w'),
+    help='Print log messages to this file instead of to stderr. Warning: Will overwrite the file.')
+  parser.add_argument('-q', '--quiet', dest='volume', action='store_const', const=logging.CRITICAL)
+  parser.add_argument('-v', '--verbose', dest='volume', action='store_const', const=logging.INFO)
+  parser.add_argument('-D', '--debug', dest='volume', action='store_const', const=logging.DEBUG)
+
+  args = parser.parse_args(argv[1:])
+
+  logging.basicConfig(stream=args.log, level=args.volume, format='%(message)s')
+  tone_down_logger()
+
+  logging.info('Reading random barcodes from border-families.txt..')
+  rand_arg = '--random-source=border-families.txt'
+  if args.random:
+    rand_arg = ''
+  pipeline = 'cat border-families.txt | paste - - | shuf {} | head -n {}'.format(rand_arg,
+                                                                                 args.nbarcodes)
+  commands = [cmd.split() for cmd in pipeline.split('|')]
+  process = make_pipeline(*commands)
+  families_by_barcode = {}
+  for line_raw in process.stdout:
+    line = line_raw.rstrip('\r\n')
+    fields = line.split()
+    family = {}
+    count1, barcode, order1, count2, barcode2, order2 = fields
+    assert barcode == barcode2, (barcode, barcode2)
+    if order1 == 'ab':
+      assert order2 == 'ba', barcode
+    elif order1 == 'ba':
+      assert order2 == 'ab', barcode
+      count1, count2 = count2, count1
+    else:
+      fail(order1, order2, barcode)
+    family['count1'] = int(count1)
+    family['count2'] = int(count2)
+    family['barcode'] = barcode
+    families_by_barcode[barcode] = family
+
+  logging.info('Reading barcodes.fq to find read names..')
+  hits = 0
+  families_by_read_name = {}
+  line_num = 0
+  with open('barcodes.fq', 'rU') as barcodes_fq:
+    for line in barcodes_fq:
+      line_num += 1
+      if line_num % 4 == 1:
+        read_name = line[1:].rstrip('\r\n')
+      elif line_num % 4 == 2:
+        seq = line.rstrip('\r\n')
+        family = families_by_barcode.get(seq)
+        if family:
+          hits += 1
+          family['read_name'] = read_name
+          families_by_read_name[read_name] = family
+  logging.info('hits: {}'.format(hits))
+
+  logging.info('Reading barcodes.bam to find similar barcodes..')
+  hits = 0
+  neighbors_by_read_name = {}
+  # samtools view -f 256 barcodes.bam | awk '$1 == '$read_name' && $5 > 25 {print $3}'
+  process = subprocess.Popen(('samtools', 'view', '-f', '256', 'barcodes.bam'), stdout=subprocess.PIPE)
+  for line in process.stdout:
+    fields = line.split()
+    mapq = int(fields[4])
+    if mapq >= args.mapq_thres:
+      read_name = fields[0]
+      family = families_by_read_name.get(read_name)
+      if family:
+        hits += 1
+        read_name2 = fields[2]
+        neighbor = {'read_name':read_name2}
+        neighbors = family.get('neighbors', [])
+        neighbors.append(neighbor)
+        family['neighbors'] = neighbors
+        neighbors_by_read_name[read_name2] = neighbor
+  logging.info('hits: {}'.format(hits))
+
+  logging.info('Reading barcodes.fq to find sequences of neighbors..')
+  hits = 0
+  line_num = 0
+  neighbors_by_barcode = {}
+  with open('barcodes.fq', 'rU') as barcodes_fq:
+    for line in barcodes_fq:
+      line_num += 1
+      if line_num % 4 == 1:
+        read_name = line[1:].rstrip('\r\n')
+        neighbor = neighbors_by_read_name.get(read_name)
+      if line_num % 4 == 2 and neighbor:
+        seq = line.rstrip('\r\n')
+        neighbor['barcode'] = seq
+        neighbors_by_barcode[seq] = neighbor
+  logging.info('hits: {}'.format(hits))
+
+  logging.info('Reading families.uniq.txt to get counts of neighbors..')
+  hits = 0
+  with open('families.uniq.txt', 'rU') as families_uniq:
+    for line in families_uniq:
+      fields = line.split()
+      barcode = fields[1]
+      neighbor = neighbors_by_barcode.get(barcode)
+      if neighbor:
+        hits += 1
+        count = int(fields[0])
+        order = fields[2].rstrip('\r\n')
+        alpha = barcode[:len(seq)//2]
+        beta  = barcode[len(seq)//2:]
+        swap = alpha >= beta
+        if (not swap and order == 'ab') or (swap and order == 'ba'):
+          neighbor['count1'] = count
+          neighbor['count2'] = 0
+        elif (not swap and order == 'ba') or (swap and order == 'ab'):
+          neighbor['count1'] = 0
+          neighbor['count2'] = count
+        else:
+          fail(order, barcode, swap)
+  logging.info('hits: {}'.format(hits))
+
+  logging.info('Printing results..')
+  total = 0
+  passing = 0
+  for family in families_by_barcode.values():
+    total += 1
+    count1 = family['count1']
+    count2 = family['count2']
+    if not args.summary:
+      print('{barcode}\t{count1}\t{count2}\t{read_name}'.format(**family))
+    neighbors = family.get('neighbors')
+    if neighbors:
+      for neighbor in neighbors:
+        if not args.summary:
+          print('{barcode}\t{count1}\t{count2}\t{read_name}'.format(**neighbor))
+        count1 += neighbor['count1']
+        count2 += neighbor['count2']
+    if count1 >= 3 and count2 >= 3:
+      if not args.summary:
+        print('PASS!')
+      passing += 1
+    elif not args.summary:
+      print('fail')
+
+  print('{} families rescued out of {} ({:0.2f}%)'.format(passing, total, 100*passing/total))
+
+
+def make_pipeline(*commands):
+  processes = []
+  for command in commands:
+    if not processes:
+      processes.append(subprocess.Popen(command, stdout=subprocess.PIPE))
+    else:
+      processes.append(subprocess.Popen(command, stdin=processes[-1].stdout, stdout=subprocess.PIPE))
+  processes[0].stdout.close()
+  return processes[-1]
+
+
+def tone_down_logger():
+  """Change the logging level names from all-caps to capitalized lowercase.
+  E.g. "WARNING" -> "Warning" (turn down the volume a bit in your log files)"""
+  for level in (logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG):
+    level_name = logging.getLevelName(level)
+    logging.addLevelName(level, level_name.capitalize())
+
+
+def fail(message):
+  sys.stderr.write(message+"\n")
+  sys.exit(1)
+
+
+if __name__ == '__main__':
+  try:
+    sys.exit(main(sys.argv))
+  except IOError as ioe:
+    if ioe.errno != errno.EPIPE:
+      raise