diff utils/precheck.py @ 4:af383638de66 draft

planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
author nick
date Mon, 23 Nov 2015 18:44:23 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/precheck.py	Mon Nov 23 18:44:23 2015 -0500
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+from __future__ import division
+import sys
+import argparse
+import getreads
+
+OPT_DEFAULTS = {'tag_len':12, 'const_len':5, 'min_reads':3, 'human':True}
+USAGE = "%(prog)s [options]"
+DESCRIPTION = """Print statistics on the raw duplex sequencing reads."""
+EPILOG = """Warning: This tracks all barcodes in a dict, so it can take a lot of memory. A guideline
+is about 200 bytes per (12bp) tag. For example, it took about 800MB for a 10GB, 32 million read
+dataset with an average of 4 pairs per barcode."""
+
+
+def main(argv):
+
+  parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG)
+  parser.set_defaults(**OPT_DEFAULTS)
+
+  parser.add_argument('infile1', metavar='reads_1.fq',
+    help='The first mates in the read pairs.')
+  parser.add_argument('infile2', metavar='reads_2.fq',
+    help='The second mates in the read pairs.')
+  parser.add_argument('-t', '--tag-length', dest='tag_len', type=int)
+  parser.add_argument('-c', '--constant-length', dest='const_len', type=int)
+  parser.add_argument('-C', '--computer', dest='human', action='store_false',
+    help='Print results in computer-readable format. This will be a tab-delimited version of the '
+         'output, in the same order, but with two columns: stat name and value.')
+  parser.add_argument('-m', '--min-reads', type=int,
+    help='The minimum number of reads required in each single-stranded family. Default: '
+         '%(default)s')
+  parser.add_argument('-v', '--validate', action='store_true',
+    help='Check the id\'s of the reads to make sure the correct reads are mated into pairs (the '
+         'id\'s of mates must be identical).')
+
+  args = parser.parse_args(argv[1:])
+
+  with open(args.infile1) as infileh1:
+    with open(args.infile2) as infileh2:
+      barcodes = read_files(infileh1, infileh2, tag_len=args.tag_len, validate=args.validate)
+
+  stats = get_stats(barcodes, tag_len=args.tag_len, min_reads=args.min_reads)
+  print_stats(stats, min_reads=args.min_reads, human=args.human)
+
+
+def read_files(infileh1, infileh2, tag_len=12, validate=False):
+  reader1 = getreads.getparser(infileh1, filetype='fastq').parser()
+  reader2 = getreads.getparser(infileh2, filetype='fastq').parser()
+  barcodes = {}
+  while True:
+    try:
+      read1 = reader1.next()
+      read2 = reader2.next()
+    except StopIteration:
+      break
+    if validate and read1.id != read2.id:
+      raise getreads.FormatError('Read pair mismatch: "{}" and "{}"'.format(read1.id, read2.id))
+    alpha = read1.seq[:tag_len]
+    beta  = read2.seq[:tag_len]
+    barcode = alpha + beta
+    if barcode in barcodes:
+      barcodes[barcode] += 1
+    else:
+      barcodes[barcode] = 1
+  return barcodes
+
+
+def get_stats(barcodes, tag_len=12, min_reads=3):
+  passed_sscs = 0
+  duplexes = 0
+  passed_duplexes = 0
+  singletons = 0
+  total_pairs = 0
+  for barcode, count in barcodes.items():
+    total_pairs += count
+    if count == 1:
+      singletons += 1
+    if count >= min_reads:
+      passed_sscs += 1
+    alpha = barcode[:tag_len]
+    beta  = barcode[tag_len:]
+    reverse = beta + alpha
+    if reverse in barcodes:
+      duplexes += 1
+      if count >= min_reads and barcodes[reverse] >= min_reads:
+        passed_duplexes += 1
+  # Each full duplex ends up being counted twice. Halve it to get the real total.
+  stats = {
+    'pairs':total_pairs,
+    'barcodes':len(barcodes),
+    'avg_pairs':total_pairs/len(barcodes),
+    'singletons':singletons,
+    'duplexes':duplexes//2,
+    'passed_sscs':passed_sscs*2,
+    'passed_duplexes':passed_duplexes,
+  }
+  return stats
+
+
+def print_stats(stats, min_reads=3, human=True):
+  all_stats = stats.copy()
+  all_stats.update({'min_reads':min_reads})
+  if human:
+    print """Total read pairs:\t{pairs}
+Unique barcodes:\t{barcodes}
+Avg # of read pairs per barcode:\t{avg_pairs}
+Singletons:\t{singletons}
+Barcodes with reverse (other strand) present:\t{duplexes}
+Passing threshold of {min_reads} reads per single-strand consensus:
+\tSingle-strand consensus sequences:\t{passed_sscs}
+\tDuplex consensus sequences:\t{passed_duplexes}""".format(**all_stats)
+  else:
+    for stat in ('pairs', 'barcodes', 'avg_pairs', 'singletons', 'duplexes', 'min_reads',
+                 'passed_sscs', 'passed_duplexes'):
+      print '{}\t{}'.format(stat, all_stats[stat])
+
+
+def fail(message):
+  sys.stderr.write(message+"\n")
+  sys.exit(1)
+
+if __name__ == '__main__':
+  sys.exit(main(sys.argv))