Mercurial > repos > nick > duplex
comparison utils/precheck.py @ 4:af383638de66 draft
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
| author | nick |
|---|---|
| date | Mon, 23 Nov 2015 18:44:23 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 3:13bcc2f459b0 | 4:af383638de66 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 from __future__ import division | |
| 3 import sys | |
| 4 import argparse | |
| 5 import getreads | |
| 6 | |
| 7 OPT_DEFAULTS = {'tag_len':12, 'const_len':5, 'min_reads':3, 'human':True} | |
| 8 USAGE = "%(prog)s [options]" | |
| 9 DESCRIPTION = """Print statistics on the raw duplex sequencing reads.""" | |
| 10 EPILOG = """Warning: This tracks all barcodes in a dict, so it can take a lot of memory. A guideline | |
| 11 is about 200 bytes per (12bp) tag. For example, it took about 800MB for a 10GB, 32 million read | |
| 12 dataset with an average of 4 pairs per barcode.""" | |
| 13 | |
| 14 | |
| 15 def main(argv): | |
| 16 | |
| 17 parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG) | |
| 18 parser.set_defaults(**OPT_DEFAULTS) | |
| 19 | |
| 20 parser.add_argument('infile1', metavar='reads_1.fq', | |
| 21 help='The first mates in the read pairs.') | |
| 22 parser.add_argument('infile2', metavar='reads_2.fq', | |
| 23 help='The second mates in the read pairs.') | |
| 24 parser.add_argument('-t', '--tag-length', dest='tag_len', type=int) | |
| 25 parser.add_argument('-c', '--constant-length', dest='const_len', type=int) | |
| 26 parser.add_argument('-C', '--computer', dest='human', action='store_false', | |
| 27 help='Print results in computer-readable format. This will be a tab-delimited version of the ' | |
| 28 'output, in the same order, but with two columns: stat name and value.') | |
| 29 parser.add_argument('-m', '--min-reads', type=int, | |
| 30 help='The minimum number of reads required in each single-stranded family. Default: ' | |
| 31 '%(default)s') | |
| 32 parser.add_argument('-v', '--validate', action='store_true', | |
| 33 help='Check the id\'s of the reads to make sure the correct reads are mated into pairs (the ' | |
| 34 'id\'s of mates must be identical).') | |
| 35 | |
| 36 args = parser.parse_args(argv[1:]) | |
| 37 | |
| 38 with open(args.infile1) as infileh1: | |
| 39 with open(args.infile2) as infileh2: | |
| 40 barcodes = read_files(infileh1, infileh2, tag_len=args.tag_len, validate=args.validate) | |
| 41 | |
| 42 stats = get_stats(barcodes, tag_len=args.tag_len, min_reads=args.min_reads) | |
| 43 print_stats(stats, min_reads=args.min_reads, human=args.human) | |
| 44 | |
| 45 | |
| 46 def read_files(infileh1, infileh2, tag_len=12, validate=False): | |
| 47 reader1 = getreads.getparser(infileh1, filetype='fastq').parser() | |
| 48 reader2 = getreads.getparser(infileh2, filetype='fastq').parser() | |
| 49 barcodes = {} | |
| 50 while True: | |
| 51 try: | |
| 52 read1 = reader1.next() | |
| 53 read2 = reader2.next() | |
| 54 except StopIteration: | |
| 55 break | |
| 56 if validate and read1.id != read2.id: | |
| 57 raise getreads.FormatError('Read pair mismatch: "{}" and "{}"'.format(read1.id, read2.id)) | |
| 58 alpha = read1.seq[:tag_len] | |
| 59 beta = read2.seq[:tag_len] | |
| 60 barcode = alpha + beta | |
| 61 if barcode in barcodes: | |
| 62 barcodes[barcode] += 1 | |
| 63 else: | |
| 64 barcodes[barcode] = 1 | |
| 65 return barcodes | |
| 66 | |
| 67 | |
| 68 def get_stats(barcodes, tag_len=12, min_reads=3): | |
| 69 passed_sscs = 0 | |
| 70 duplexes = 0 | |
| 71 passed_duplexes = 0 | |
| 72 singletons = 0 | |
| 73 total_pairs = 0 | |
| 74 for barcode, count in barcodes.items(): | |
| 75 total_pairs += count | |
| 76 if count == 1: | |
| 77 singletons += 1 | |
| 78 if count >= min_reads: | |
| 79 passed_sscs += 1 | |
| 80 alpha = barcode[:tag_len] | |
| 81 beta = barcode[tag_len:] | |
| 82 reverse = beta + alpha | |
| 83 if reverse in barcodes: | |
| 84 duplexes += 1 | |
| 85 if count >= min_reads and barcodes[reverse] >= min_reads: | |
| 86 passed_duplexes += 1 | |
| 87 # Each full duplex ends up being counted twice. Halve it to get the real total. | |
| 88 stats = { | |
| 89 'pairs':total_pairs, | |
| 90 'barcodes':len(barcodes), | |
| 91 'avg_pairs':total_pairs/len(barcodes), | |
| 92 'singletons':singletons, | |
| 93 'duplexes':duplexes//2, | |
| 94 'passed_sscs':passed_sscs*2, | |
| 95 'passed_duplexes':passed_duplexes, | |
| 96 } | |
| 97 return stats | |
| 98 | |
| 99 | |
| 100 def print_stats(stats, min_reads=3, human=True): | |
| 101 all_stats = stats.copy() | |
| 102 all_stats.update({'min_reads':min_reads}) | |
| 103 if human: | |
| 104 print """Total read pairs:\t{pairs} | |
| 105 Unique barcodes:\t{barcodes} | |
| 106 Avg # of read pairs per barcode:\t{avg_pairs} | |
| 107 Singletons:\t{singletons} | |
| 108 Barcodes with reverse (other strand) present:\t{duplexes} | |
| 109 Passing threshold of {min_reads} reads per single-strand consensus: | |
| 110 \tSingle-strand consensus sequences:\t{passed_sscs} | |
| 111 \tDuplex consensus sequences:\t{passed_duplexes}""".format(**all_stats) | |
| 112 else: | |
| 113 for stat in ('pairs', 'barcodes', 'avg_pairs', 'singletons', 'duplexes', 'min_reads', | |
| 114 'passed_sscs', 'passed_duplexes'): | |
| 115 print '{}\t{}'.format(stat, all_stats[stat]) | |
| 116 | |
| 117 | |
| 118 def fail(message): | |
| 119 sys.stderr.write(message+"\n") | |
| 120 sys.exit(1) | |
| 121 | |
| 122 if __name__ == '__main__': | |
| 123 sys.exit(main(sys.argv)) |
