Mercurial > repos > nick > duplex
annotate utils/precheck.py @ 4:af383638de66 draft
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
| author | nick |
|---|---|
| date | Mon, 23 Nov 2015 18:44:23 -0500 |
| parents | |
| children |
| rev | line source |
|---|---|
|
4
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
1 #!/usr/bin/env python |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
2 from __future__ import division |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
3 import sys |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
4 import argparse |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
5 import getreads |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
6 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
7 OPT_DEFAULTS = {'tag_len':12, 'const_len':5, 'min_reads':3, 'human':True} |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
8 USAGE = "%(prog)s [options]" |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
9 DESCRIPTION = """Print statistics on the raw duplex sequencing reads.""" |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
10 EPILOG = """Warning: This tracks all barcodes in a dict, so it can take a lot of memory. A guideline |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
11 is about 200 bytes per (12bp) tag. For example, it took about 800MB for a 10GB, 32 million read |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
12 dataset with an average of 4 pairs per barcode.""" |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
13 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
14 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
15 def main(argv): |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
16 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
17 parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG) |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
18 parser.set_defaults(**OPT_DEFAULTS) |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
19 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
20 parser.add_argument('infile1', metavar='reads_1.fq', |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
21 help='The first mates in the read pairs.') |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
22 parser.add_argument('infile2', metavar='reads_2.fq', |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
23 help='The second mates in the read pairs.') |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
24 parser.add_argument('-t', '--tag-length', dest='tag_len', type=int) |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
25 parser.add_argument('-c', '--constant-length', dest='const_len', type=int) |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
26 parser.add_argument('-C', '--computer', dest='human', action='store_false', |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
27 help='Print results in computer-readable format. This will be a tab-delimited version of the ' |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
28 'output, in the same order, but with two columns: stat name and value.') |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
29 parser.add_argument('-m', '--min-reads', type=int, |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
30 help='The minimum number of reads required in each single-stranded family. Default: ' |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
31 '%(default)s') |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
32 parser.add_argument('-v', '--validate', action='store_true', |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
33 help='Check the id\'s of the reads to make sure the correct reads are mated into pairs (the ' |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
34 'id\'s of mates must be identical).') |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
35 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
36 args = parser.parse_args(argv[1:]) |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
37 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
38 with open(args.infile1) as infileh1: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
39 with open(args.infile2) as infileh2: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
40 barcodes = read_files(infileh1, infileh2, tag_len=args.tag_len, validate=args.validate) |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
41 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
42 stats = get_stats(barcodes, tag_len=args.tag_len, min_reads=args.min_reads) |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
43 print_stats(stats, min_reads=args.min_reads, human=args.human) |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
44 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
45 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
46 def read_files(infileh1, infileh2, tag_len=12, validate=False): |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
47 reader1 = getreads.getparser(infileh1, filetype='fastq').parser() |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
48 reader2 = getreads.getparser(infileh2, filetype='fastq').parser() |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
49 barcodes = {} |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
50 while True: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
51 try: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
52 read1 = reader1.next() |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
53 read2 = reader2.next() |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
54 except StopIteration: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
55 break |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
56 if validate and read1.id != read2.id: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
57 raise getreads.FormatError('Read pair mismatch: "{}" and "{}"'.format(read1.id, read2.id)) |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
58 alpha = read1.seq[:tag_len] |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
59 beta = read2.seq[:tag_len] |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
60 barcode = alpha + beta |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
61 if barcode in barcodes: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
62 barcodes[barcode] += 1 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
63 else: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
64 barcodes[barcode] = 1 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
65 return barcodes |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
66 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
67 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
68 def get_stats(barcodes, tag_len=12, min_reads=3): |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
69 passed_sscs = 0 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
70 duplexes = 0 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
71 passed_duplexes = 0 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
72 singletons = 0 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
73 total_pairs = 0 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
74 for barcode, count in barcodes.items(): |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
75 total_pairs += count |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
76 if count == 1: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
77 singletons += 1 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
78 if count >= min_reads: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
79 passed_sscs += 1 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
80 alpha = barcode[:tag_len] |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
81 beta = barcode[tag_len:] |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
82 reverse = beta + alpha |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
83 if reverse in barcodes: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
84 duplexes += 1 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
85 if count >= min_reads and barcodes[reverse] >= min_reads: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
86 passed_duplexes += 1 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
87 # Each full duplex ends up being counted twice. Halve it to get the real total. |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
88 stats = { |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
89 'pairs':total_pairs, |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
90 'barcodes':len(barcodes), |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
91 'avg_pairs':total_pairs/len(barcodes), |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
92 'singletons':singletons, |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
93 'duplexes':duplexes//2, |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
94 'passed_sscs':passed_sscs*2, |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
95 'passed_duplexes':passed_duplexes, |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
96 } |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
97 return stats |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
98 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
99 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
100 def print_stats(stats, min_reads=3, human=True): |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
101 all_stats = stats.copy() |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
102 all_stats.update({'min_reads':min_reads}) |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
103 if human: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
104 print """Total read pairs:\t{pairs} |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
105 Unique barcodes:\t{barcodes} |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
106 Avg # of read pairs per barcode:\t{avg_pairs} |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
107 Singletons:\t{singletons} |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
108 Barcodes with reverse (other strand) present:\t{duplexes} |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
109 Passing threshold of {min_reads} reads per single-strand consensus: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
110 \tSingle-strand consensus sequences:\t{passed_sscs} |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
111 \tDuplex consensus sequences:\t{passed_duplexes}""".format(**all_stats) |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
112 else: |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
113 for stat in ('pairs', 'barcodes', 'avg_pairs', 'singletons', 'duplexes', 'min_reads', |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
114 'passed_sscs', 'passed_duplexes'): |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
115 print '{}\t{}'.format(stat, all_stats[stat]) |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
116 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
117 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
118 def fail(message): |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
119 sys.stderr.write(message+"\n") |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
120 sys.exit(1) |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
121 |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
122 if __name__ == '__main__': |
|
af383638de66
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff
changeset
|
123 sys.exit(main(sys.argv)) |
