annotate allele-counts.py @ 4:900d91d653cb

Add '#' back to header line
author nick
date Tue, 28 May 2013 16:56:43 -0400
parents 4627d99aa105
children f6d9742c1da0
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
1 #!/usr/bin/python
57e85db3f13c Uploaded script
nick
parents:
diff changeset
2 # This parses the output of Dan's "Naive Variant Detector" (previously,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py".
57e85db3f13c Uploaded script
nick
parents:
diff changeset
4 #
3
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
5 # New in this version:
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
6 # Made header line customizable
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
7 # - separate from internal column labels, which are used as dict keys
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
8 #
57e85db3f13c Uploaded script
nick
parents:
diff changeset
9 # TODO:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
10 # - test handling of -c 0 (and -f 0?)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
11 # - should it technically handle data lines that start with a '#'?
57e85db3f13c Uploaded script
nick
parents:
diff changeset
12 import os
57e85db3f13c Uploaded script
nick
parents:
diff changeset
13 import sys
57e85db3f13c Uploaded script
nick
parents:
diff changeset
14 from optparse import OptionParser
57e85db3f13c Uploaded script
nick
parents:
diff changeset
15
3
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
16 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias']
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
17 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
18 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
57e85db3f13c Uploaded script
nick
parents:
diff changeset
19 USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv
57e85db3f13c Uploaded script
nick
parents:
diff changeset
20 %prog [options] -i variants.vcf -o alleles.csv"""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
21 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
22 'print_header':False, 'stdin':False}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
23 DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
24 EPILOG = """Requirements:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
25 The input VCF must report the variants for each strand.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
26 The variants should be case-sensitive (e.g. all capital base letters).
57e85db3f13c Uploaded script
nick
parents:
diff changeset
27 Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
28
57e85db3f13c Uploaded script
nick
parents:
diff changeset
29
57e85db3f13c Uploaded script
nick
parents:
diff changeset
30 def get_options(defaults, usage, description='', epilog=''):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
31 """Get options, print usage text."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
32
57e85db3f13c Uploaded script
nick
parents:
diff changeset
33 parser = OptionParser(usage=usage, description=description, epilog=epilog)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
34
57e85db3f13c Uploaded script
nick
parents:
diff changeset
35 parser.add_option('-i', '--infile', dest='infile',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
36 default=defaults.get('infile'),
57e85db3f13c Uploaded script
nick
parents:
diff changeset
37 help='Read input VCF data from this file instead of stdin.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
38 parser.add_option('-o', '--outfile', dest='outfile',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
39 default=defaults.get('outfile'),
57e85db3f13c Uploaded script
nick
parents:
diff changeset
40 help='Print output data to this file instead of stdout.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
41 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
42 default=defaults.get('freq_thres'),
57e85db3f13c Uploaded script
nick
parents:
diff changeset
43 help='Frequency threshold for counting alleles, given in percentage: -f 1 = 1% frequency. Default is %default%.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
44 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
45 default=defaults.get('covg_thres'),
57e85db3f13c Uploaded script
nick
parents:
diff changeset
46 help='Coverage threshold. Each site must be supported by at least this many reads on each strand. Otherwise the site will not be printed in the output. The default is %default reads per strand.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
47 parser.add_option('-H', '--header', dest='print_header', action='store_const',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
48 const=not(defaults.get('print_header')), default=defaults.get('print_header'),
57e85db3f13c Uploaded script
nick
parents:
diff changeset
49 help='Print header line. This is a #-commented line with the column labels. Off by default.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
50 parser.add_option('-d', '--debug', dest='debug', action='store_true',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
51 default=False,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
52 help='Turn on debug mode. You must also specify a single site to process in a final argument using UCSC coordinate format.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
53
57e85db3f13c Uploaded script
nick
parents:
diff changeset
54 (options, args) = parser.parse_args()
57e85db3f13c Uploaded script
nick
parents:
diff changeset
55
57e85db3f13c Uploaded script
nick
parents:
diff changeset
56 # read in positional arguments
57e85db3f13c Uploaded script
nick
parents:
diff changeset
57 arguments = {}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
58 if options.debug:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
59 if len(args) >= 1:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
60 arguments['print_loc'] = args[0]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
61 args.remove(args[0])
57e85db3f13c Uploaded script
nick
parents:
diff changeset
62
57e85db3f13c Uploaded script
nick
parents:
diff changeset
63 return (options, arguments)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
64
57e85db3f13c Uploaded script
nick
parents:
diff changeset
65
57e85db3f13c Uploaded script
nick
parents:
diff changeset
66 def main():
57e85db3f13c Uploaded script
nick
parents:
diff changeset
67
57e85db3f13c Uploaded script
nick
parents:
diff changeset
68 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
69
57e85db3f13c Uploaded script
nick
parents:
diff changeset
70 infile = options.infile
57e85db3f13c Uploaded script
nick
parents:
diff changeset
71 outfile = options.outfile
57e85db3f13c Uploaded script
nick
parents:
diff changeset
72 print_header = options.print_header
57e85db3f13c Uploaded script
nick
parents:
diff changeset
73 freq_thres = options.freq_thres / 100.0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
74 covg_thres = options.covg_thres
57e85db3f13c Uploaded script
nick
parents:
diff changeset
75 debug = options.debug
57e85db3f13c Uploaded script
nick
parents:
diff changeset
76
57e85db3f13c Uploaded script
nick
parents:
diff changeset
77 if debug:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
78 print_loc = args.get('print_loc')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
79 if print_loc:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
80 if ':' in print_loc:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
81 (print_chr, print_pos) = print_loc.split(':')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
82 else:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
83 print_pos = print_loc
57e85db3f13c Uploaded script
nick
parents:
diff changeset
84 else:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
85 sys.stderr.write("Warning: No site coordinate found in arguments. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
86 +"Turning off debug mode.\n")
57e85db3f13c Uploaded script
nick
parents:
diff changeset
87 debug = False
57e85db3f13c Uploaded script
nick
parents:
diff changeset
88
57e85db3f13c Uploaded script
nick
parents:
diff changeset
89 # set infile_handle to either stdin or the input file
57e85db3f13c Uploaded script
nick
parents:
diff changeset
90 if infile == OPT_DEFAULTS.get('infile'):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
91 infile_handle = sys.stdin
57e85db3f13c Uploaded script
nick
parents:
diff changeset
92 sys.stderr.write("Reading from standard input..\n")
57e85db3f13c Uploaded script
nick
parents:
diff changeset
93 else:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
94 if os.path.exists(infile):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
95 infile_handle = open(infile, 'r')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
96 else:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
97 fail('Error: Input VCF file '+infile+' not found.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
98
57e85db3f13c Uploaded script
nick
parents:
diff changeset
99 # set outfile_handle to either stdout or the output file
57e85db3f13c Uploaded script
nick
parents:
diff changeset
100 if outfile == OPT_DEFAULTS.get('outfile'):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
101 outfile_handle = sys.stdout
57e85db3f13c Uploaded script
nick
parents:
diff changeset
102 else:
2
d83368b907f7 No error on existing output file
nick
parents: 0
diff changeset
103 try:
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
104 outfile_handle = open(outfile, 'w')
2
d83368b907f7 No error on existing output file
nick
parents: 0
diff changeset
105 except IOError, e:
d83368b907f7 No error on existing output file
nick
parents: 0
diff changeset
106 fail('Error: The given output filename '+outfile+' could not be opened.')
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
107
3
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
108 if len(COLUMNS) != len(COLUMN_LABELS):
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
109 fail('Error: Internal column names do not match column labels.')
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
110 if print_header:
4
900d91d653cb Add '#' back to header line
nick
parents: 3
diff changeset
111 outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n")
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
112
57e85db3f13c Uploaded script
nick
parents:
diff changeset
113 # main loop: process and print one line at a time
57e85db3f13c Uploaded script
nick
parents:
diff changeset
114 sample_names = []
57e85db3f13c Uploaded script
nick
parents:
diff changeset
115 for line in infile_handle:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
116 line = line.rstrip('\r\n')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
117
57e85db3f13c Uploaded script
nick
parents:
diff changeset
118 # header lines
57e85db3f13c Uploaded script
nick
parents:
diff changeset
119 if line[0] == '#':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
120 if line[0:6].upper() == '#CHROM':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
121 sample_names = line.split('\t')[9:]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
122 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
123
57e85db3f13c Uploaded script
nick
parents:
diff changeset
124 if not sample_names:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
125 fail("Error in input VCF: Data line encountered before header line. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
126 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
127
57e85db3f13c Uploaded script
nick
parents:
diff changeset
128 site_data = read_site(line, sample_names, CANONICAL_VARIANTS)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
129
57e85db3f13c Uploaded script
nick
parents:
diff changeset
130 if debug:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
131 if site_data['pos'] != print_pos:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
132 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
133 try:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
134 if site_data['chr'] != print_chr:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
135 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
136 except NameError, e:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
137 pass # No chr specified. Just go ahead and print the line.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
138
57e85db3f13c Uploaded script
nick
parents:
diff changeset
139 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
140 freq_thres, covg_thres, debug=debug)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
141
57e85db3f13c Uploaded script
nick
parents:
diff changeset
142 if debug and site_summary[0]['print']:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
143 print line.split('\t')[9].split(':')[-1]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
144
57e85db3f13c Uploaded script
nick
parents:
diff changeset
145 print_site(outfile_handle, site_summary, COLUMNS)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
146
57e85db3f13c Uploaded script
nick
parents:
diff changeset
147 # close any open filehandles
57e85db3f13c Uploaded script
nick
parents:
diff changeset
148 if infile_handle is not sys.stdin:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
149 infile_handle.close()
57e85db3f13c Uploaded script
nick
parents:
diff changeset
150 if outfile_handle is not sys.stdout:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
151 outfile_handle.close()
57e85db3f13c Uploaded script
nick
parents:
diff changeset
152
57e85db3f13c Uploaded script
nick
parents:
diff changeset
153 # keeps Galaxy from giving an error if there were messages on stderr
57e85db3f13c Uploaded script
nick
parents:
diff changeset
154 sys.exit(0)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
155
57e85db3f13c Uploaded script
nick
parents:
diff changeset
156
57e85db3f13c Uploaded script
nick
parents:
diff changeset
157
57e85db3f13c Uploaded script
nick
parents:
diff changeset
158 def read_site(line, sample_names, canonical):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
159 """Read in a line, parse the variants into a data structure, and return it.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
160 The line should be actual site data, not a header line, so check beforehand.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
161 Notes:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
162 - The line is assumed to have been chomped."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
163
57e85db3f13c Uploaded script
nick
parents:
diff changeset
164 site = {}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
165 fields = line.split('\t')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
166
57e85db3f13c Uploaded script
nick
parents:
diff changeset
167 if len(fields) < 9:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
168 fail("Error in input VCF: wrong number of fields in data line. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
169 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
170
57e85db3f13c Uploaded script
nick
parents:
diff changeset
171 site['chr'] = fields[0]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
172 site['pos'] = fields[1]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
173 samples = fields[9:]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
174
57e85db3f13c Uploaded script
nick
parents:
diff changeset
175 if len(samples) < len(sample_names):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
176 fail("Error in input VCF: missing sample fields in data line. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
177 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
178 elif len(samples) > len(sample_names):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
179 fail("Error in input VCF: more sample fields in data line than in header. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
180 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
181
57e85db3f13c Uploaded script
nick
parents:
diff changeset
182 sample_counts = {}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
183 for i in range(len(samples)):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
184
57e85db3f13c Uploaded script
nick
parents:
diff changeset
185 variant_counts = {}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
186 counts = samples[i].split(':')[-1]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
187 counts = counts.split(',')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
188
57e85db3f13c Uploaded script
nick
parents:
diff changeset
189 for count in counts:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
190 if not count:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
191 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
192 fields = count.split('=')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
193 if len(fields) != 2:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
194 fail("Error in input VCF: Incorrect variant data format (must contain "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
195 +"a single '='). Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
196 (variant, reads) = fields
57e85db3f13c Uploaded script
nick
parents:
diff changeset
197 if variant[1:] not in canonical:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
198 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
199 if variant[0] != '-' and variant[0] != '+':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
200 fail("Error in input VCF: variant data not strand-specific. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
201 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
202 try:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
203 variant_counts[variant] = int(reads)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
204 except ValueError, e:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
205 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
206
57e85db3f13c Uploaded script
nick
parents:
diff changeset
207 sample_counts[sample_names[i]] = variant_counts
57e85db3f13c Uploaded script
nick
parents:
diff changeset
208
57e85db3f13c Uploaded script
nick
parents:
diff changeset
209 site['samples'] = sample_counts
57e85db3f13c Uploaded script
nick
parents:
diff changeset
210
57e85db3f13c Uploaded script
nick
parents:
diff changeset
211 return site
57e85db3f13c Uploaded script
nick
parents:
diff changeset
212
57e85db3f13c Uploaded script
nick
parents:
diff changeset
213
57e85db3f13c Uploaded script
nick
parents:
diff changeset
214 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
215 debug=False):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
216 """Take the raw data from the VCF line and transform it into the summary data
57e85db3f13c Uploaded script
nick
parents:
diff changeset
217 to be printed in the output format."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
218
57e85db3f13c Uploaded script
nick
parents:
diff changeset
219 site_summary = []
57e85db3f13c Uploaded script
nick
parents:
diff changeset
220 for sample_name in sample_names:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
221
57e85db3f13c Uploaded script
nick
parents:
diff changeset
222 sample = {'print':False}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
223 variants = site['samples'].get(sample_name)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
224 if not variants:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
225 site_summary.append(sample)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
226 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
227
57e85db3f13c Uploaded script
nick
parents:
diff changeset
228 sample['sample'] = sample_name
57e85db3f13c Uploaded script
nick
parents:
diff changeset
229 sample['chr'] = site['chr']
57e85db3f13c Uploaded script
nick
parents:
diff changeset
230 sample['pos'] = site['pos']
57e85db3f13c Uploaded script
nick
parents:
diff changeset
231
57e85db3f13c Uploaded script
nick
parents:
diff changeset
232 coverage = sum(variants.values())
57e85db3f13c Uploaded script
nick
parents:
diff changeset
233
57e85db3f13c Uploaded script
nick
parents:
diff changeset
234 # get stranded coverage
57e85db3f13c Uploaded script
nick
parents:
diff changeset
235 covg_plus = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
236 covg_minus = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
237 for variant in variants:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
238 if variant[0] == '+':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
239 covg_plus += variants[variant]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
240 elif variant[0] == '-':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
241 covg_minus += variants[variant]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
242 # stranded coverage threshold
57e85db3f13c Uploaded script
nick
parents:
diff changeset
243 if coverage <= 0 or covg_plus < covg_thres or covg_minus < covg_thres:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
244 site_summary.append(sample)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
245 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
246 else:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
247 sample['print'] = True
57e85db3f13c Uploaded script
nick
parents:
diff changeset
248
57e85db3f13c Uploaded script
nick
parents:
diff changeset
249 # get an ordered list of read counts for all variants (either strand)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
250 ranked_bases = get_read_counts(variants, 0, strands='+-', debug=debug)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
251
57e85db3f13c Uploaded script
nick
parents:
diff changeset
252 # record read counts into dict for this sample
57e85db3f13c Uploaded script
nick
parents:
diff changeset
253 for base in ranked_bases:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
254 sample[base[0]] = base[1]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
255 # fill in any zeros
57e85db3f13c Uploaded script
nick
parents:
diff changeset
256 for variant in canonical:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
257 if not sample.has_key(variant):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
258 sample[variant] = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
259
57e85db3f13c Uploaded script
nick
parents:
diff changeset
260 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
261
57e85db3f13c Uploaded script
nick
parents:
diff changeset
262 # set minor allele to N if there's a tie for 2nd
57e85db3f13c Uploaded script
nick
parents:
diff changeset
263 if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
264 ranked_bases[1] = ('N', 0)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
265 sample['alleles'] = 1 if sample['alleles'] else 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
266
57e85db3f13c Uploaded script
nick
parents:
diff changeset
267 if debug: print ranked_bases
57e85db3f13c Uploaded script
nick
parents:
diff changeset
268
57e85db3f13c Uploaded script
nick
parents:
diff changeset
269 sample['coverage'] = coverage
57e85db3f13c Uploaded script
nick
parents:
diff changeset
270 try:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
271 sample['major'] = ranked_bases[0][0]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
272 except IndexError, e:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
273 sample['major'] = '.'
57e85db3f13c Uploaded script
nick
parents:
diff changeset
274 try:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
275 sample['minor'] = ranked_bases[1][0]
3
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
276 sample['freq'] = round(ranked_bases[1][1]/float(coverage), 5)
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
277 except IndexError, e:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
278 sample['minor'] = '.'
57e85db3f13c Uploaded script
nick
parents:
diff changeset
279 sample['freq'] = 0.0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
280
57e85db3f13c Uploaded script
nick
parents:
diff changeset
281 site_summary.append(sample)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
282
57e85db3f13c Uploaded script
nick
parents:
diff changeset
283 return site_summary
57e85db3f13c Uploaded script
nick
parents:
diff changeset
284
57e85db3f13c Uploaded script
nick
parents:
diff changeset
285
57e85db3f13c Uploaded script
nick
parents:
diff changeset
286 def print_site(filehandle, site, columns):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
287 """Print the output lines for one site (one per sample).
57e85db3f13c Uploaded script
nick
parents:
diff changeset
288 filehandle must be open."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
289 for sample in site:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
290 if sample['print']:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
291 fields = [str(sample.get(column)) for column in columns]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
292 filehandle.write('\t'.join(fields)+"\n")
57e85db3f13c Uploaded script
nick
parents:
diff changeset
293
57e85db3f13c Uploaded script
nick
parents:
diff changeset
294
57e85db3f13c Uploaded script
nick
parents:
diff changeset
295 def get_read_counts(variant_counts, freq_thres, strands='+-', debug=False):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
296 """Count the number of reads for each base, and create a ranked list of
57e85db3f13c Uploaded script
nick
parents:
diff changeset
297 alleles passing the frequency threshold.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
298 Arguments:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
299 variant_counts: Dict of the stranded variants (keys) and their read counts (values).
57e85db3f13c Uploaded script
nick
parents:
diff changeset
300 freq_thres: The frequency threshold each allele needs to pass to be included.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
301 strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default).
57e85db3f13c Uploaded script
nick
parents:
diff changeset
302 variants: A list of the variants of interest. Other types of variants will not
57e85db3f13c Uploaded script
nick
parents:
diff changeset
303 be included in the returned list. If no list is given, all variants found in
57e85db3f13c Uploaded script
nick
parents:
diff changeset
304 the variant_counts will be used.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
305 Return value:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
306 ranked_bases: A list of the alleles and their read counts. The elements are
57e85db3f13c Uploaded script
nick
parents:
diff changeset
307 tuples (base, read count). The alleles are listed in descending order of
57e85db3f13c Uploaded script
nick
parents:
diff changeset
308 frequency, and only those passing the threshold are included."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
309
57e85db3f13c Uploaded script
nick
parents:
diff changeset
310 # Get list of all variants from variant_counts list
57e85db3f13c Uploaded script
nick
parents:
diff changeset
311 variants = [variant[1:] for variant in variant_counts]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
312 # deduplicate via a dict
57e85db3f13c Uploaded script
nick
parents:
diff changeset
313 variant_dict = dict((variant, 1) for variant in variants)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
314 variants = variant_dict.keys()
57e85db3f13c Uploaded script
nick
parents:
diff changeset
315
57e85db3f13c Uploaded script
nick
parents:
diff changeset
316 ranked_bases = []
57e85db3f13c Uploaded script
nick
parents:
diff changeset
317 for variant in variants:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
318 reads = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
319 for strand in strands:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
320 reads += variant_counts.get(strand+variant, 0)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
321 ranked_bases.append((variant, reads))
57e85db3f13c Uploaded script
nick
parents:
diff changeset
322
57e85db3f13c Uploaded script
nick
parents:
diff changeset
323 # get coverage for the specified strands
57e85db3f13c Uploaded script
nick
parents:
diff changeset
324 coverage = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
325 for variant in variant_counts:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
326 if variant[0] in strands:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
327 coverage += variant_counts.get(variant, 0)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
328 # if debug: print "strands: "+strands+', covg: '+str(coverage)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
329
57e85db3f13c Uploaded script
nick
parents:
diff changeset
330 if coverage < 1:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
331 return []
57e85db3f13c Uploaded script
nick
parents:
diff changeset
332
57e85db3f13c Uploaded script
nick
parents:
diff changeset
333 # sort the list of alleles by read count
57e85db3f13c Uploaded script
nick
parents:
diff changeset
334 ranked_bases.sort(reverse=True, key=lambda base: base[1])
57e85db3f13c Uploaded script
nick
parents:
diff changeset
335
57e85db3f13c Uploaded script
nick
parents:
diff changeset
336 if debug:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
337 print strands+' coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
338 for base in ranked_bases:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
339 print (base[0]+': '+str(base[1])+'/'+str(float(coverage))+' = '+
57e85db3f13c Uploaded script
nick
parents:
diff changeset
340 str(base[1]/float(coverage)))
57e85db3f13c Uploaded script
nick
parents:
diff changeset
341
57e85db3f13c Uploaded script
nick
parents:
diff changeset
342 # remove bases below the frequency threshold
57e85db3f13c Uploaded script
nick
parents:
diff changeset
343 ranked_bases = [base for base in ranked_bases
57e85db3f13c Uploaded script
nick
parents:
diff changeset
344 if base[1]/float(coverage) >= freq_thres]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
345
57e85db3f13c Uploaded script
nick
parents:
diff changeset
346 return ranked_bases
57e85db3f13c Uploaded script
nick
parents:
diff changeset
347
57e85db3f13c Uploaded script
nick
parents:
diff changeset
348
57e85db3f13c Uploaded script
nick
parents:
diff changeset
349 def count_alleles(variant_counts, freq_thres, debug=False):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
350 """Determine how many alleles to report, based on filtering rules.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
351 The current rule determines which bases pass the frequency threshold on each
57e85db3f13c Uploaded script
nick
parents:
diff changeset
352 strand individually, then compares the two sets of bases. If they are the same
57e85db3f13c Uploaded script
nick
parents:
diff changeset
353 (regardless of order), the allele count is the number of bases. Otherwise it
57e85db3f13c Uploaded script
nick
parents:
diff changeset
354 is zero."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
355 allele_count = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
356
57e85db3f13c Uploaded script
nick
parents:
diff changeset
357 alleles_plus = get_read_counts(variant_counts, freq_thres, debug=debug,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
358 strands='+')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
359 alleles_minus = get_read_counts(variant_counts, freq_thres, debug=debug,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
360 strands='-')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
361
57e85db3f13c Uploaded script
nick
parents:
diff changeset
362 if debug:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
363 print '+ '+str(alleles_plus)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
364 print '- '+str(alleles_minus)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
365
57e85db3f13c Uploaded script
nick
parents:
diff changeset
366 # check if each strand reports the same set of alleles
57e85db3f13c Uploaded script
nick
parents:
diff changeset
367 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]])
57e85db3f13c Uploaded script
nick
parents:
diff changeset
368 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]])
57e85db3f13c Uploaded script
nick
parents:
diff changeset
369 if alleles_plus_sorted == alleles_minus_sorted:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
370 allele_count = len(alleles_plus)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
371
57e85db3f13c Uploaded script
nick
parents:
diff changeset
372 return allele_count
57e85db3f13c Uploaded script
nick
parents:
diff changeset
373
57e85db3f13c Uploaded script
nick
parents:
diff changeset
374
57e85db3f13c Uploaded script
nick
parents:
diff changeset
375 def fail(message):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
376 sys.stderr.write(message+'\n')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
377 sys.exit(1)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
378
57e85db3f13c Uploaded script
nick
parents:
diff changeset
379 if __name__ == "__main__":
57e85db3f13c Uploaded script
nick
parents:
diff changeset
380 main()