annotate allele-counts.py @ 2:d83368b907f7

No error on existing output file
author nick
date Fri, 24 May 2013 10:54:13 -0400
parents 57e85db3f13c
children 4627d99aa105
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
1 #!/usr/bin/python
57e85db3f13c Uploaded script
nick
parents:
diff changeset
2 # This parses the output of Dan's "Naive Variant Detector" (previously,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py".
57e85db3f13c Uploaded script
nick
parents:
diff changeset
4 #
57e85db3f13c Uploaded script
nick
parents:
diff changeset
5 # New in this version: default to stdin and stdout, override by using -i and -o
57e85db3f13c Uploaded script
nick
parents:
diff changeset
6 # to specify filenames
57e85db3f13c Uploaded script
nick
parents:
diff changeset
7 #
57e85db3f13c Uploaded script
nick
parents:
diff changeset
8 # TODO:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
9 # - test handling of -c 0 (and -f 0?)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
10 # - should it technically handle data lines that start with a '#'?
57e85db3f13c Uploaded script
nick
parents:
diff changeset
11 import os
57e85db3f13c Uploaded script
nick
parents:
diff changeset
12 import sys
57e85db3f13c Uploaded script
nick
parents:
diff changeset
13 from optparse import OptionParser
57e85db3f13c Uploaded script
nick
parents:
diff changeset
14
57e85db3f13c Uploaded script
nick
parents:
diff changeset
15 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
16 'major', 'minor', 'freq'] #, 'bias']
57e85db3f13c Uploaded script
nick
parents:
diff changeset
17 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
57e85db3f13c Uploaded script
nick
parents:
diff changeset
18 USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv
57e85db3f13c Uploaded script
nick
parents:
diff changeset
19 %prog [options] -i variants.vcf -o alleles.csv"""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
20 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
21 'print_header':False, 'stdin':False}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
22 DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
23 EPILOG = """Requirements:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
24 The input VCF must report the variants for each strand.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
25 The variants should be case-sensitive (e.g. all capital base letters).
57e85db3f13c Uploaded script
nick
parents:
diff changeset
26 Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
27
57e85db3f13c Uploaded script
nick
parents:
diff changeset
28
57e85db3f13c Uploaded script
nick
parents:
diff changeset
29 def get_options(defaults, usage, description='', epilog=''):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
30 """Get options, print usage text."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
31
57e85db3f13c Uploaded script
nick
parents:
diff changeset
32 parser = OptionParser(usage=usage, description=description, epilog=epilog)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
33
57e85db3f13c Uploaded script
nick
parents:
diff changeset
34 parser.add_option('-i', '--infile', dest='infile',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
35 default=defaults.get('infile'),
57e85db3f13c Uploaded script
nick
parents:
diff changeset
36 help='Read input VCF data from this file instead of stdin.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
37 parser.add_option('-o', '--outfile', dest='outfile',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
38 default=defaults.get('outfile'),
57e85db3f13c Uploaded script
nick
parents:
diff changeset
39 help='Print output data to this file instead of stdout.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
40 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
41 default=defaults.get('freq_thres'),
57e85db3f13c Uploaded script
nick
parents:
diff changeset
42 help='Frequency threshold for counting alleles, given in percentage: -f 1 = 1% frequency. Default is %default%.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
43 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
44 default=defaults.get('covg_thres'),
57e85db3f13c Uploaded script
nick
parents:
diff changeset
45 help='Coverage threshold. Each site must be supported by at least this many reads on each strand. Otherwise the site will not be printed in the output. The default is %default reads per strand.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
46 parser.add_option('-H', '--header', dest='print_header', action='store_const',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
47 const=not(defaults.get('print_header')), default=defaults.get('print_header'),
57e85db3f13c Uploaded script
nick
parents:
diff changeset
48 help='Print header line. This is a #-commented line with the column labels. Off by default.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
49 parser.add_option('-d', '--debug', dest='debug', action='store_true',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
50 default=False,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
51 help='Turn on debug mode. You must also specify a single site to process in a final argument using UCSC coordinate format.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
52
57e85db3f13c Uploaded script
nick
parents:
diff changeset
53 (options, args) = parser.parse_args()
57e85db3f13c Uploaded script
nick
parents:
diff changeset
54
57e85db3f13c Uploaded script
nick
parents:
diff changeset
55 # read in positional arguments
57e85db3f13c Uploaded script
nick
parents:
diff changeset
56 arguments = {}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
57 if options.debug:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
58 if len(args) >= 1:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
59 arguments['print_loc'] = args[0]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
60 args.remove(args[0])
57e85db3f13c Uploaded script
nick
parents:
diff changeset
61
57e85db3f13c Uploaded script
nick
parents:
diff changeset
62 return (options, arguments)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
63
57e85db3f13c Uploaded script
nick
parents:
diff changeset
64
57e85db3f13c Uploaded script
nick
parents:
diff changeset
65 def main():
57e85db3f13c Uploaded script
nick
parents:
diff changeset
66
57e85db3f13c Uploaded script
nick
parents:
diff changeset
67 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
68
57e85db3f13c Uploaded script
nick
parents:
diff changeset
69 infile = options.infile
57e85db3f13c Uploaded script
nick
parents:
diff changeset
70 outfile = options.outfile
57e85db3f13c Uploaded script
nick
parents:
diff changeset
71 print_header = options.print_header
57e85db3f13c Uploaded script
nick
parents:
diff changeset
72 freq_thres = options.freq_thres / 100.0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
73 covg_thres = options.covg_thres
57e85db3f13c Uploaded script
nick
parents:
diff changeset
74 debug = options.debug
57e85db3f13c Uploaded script
nick
parents:
diff changeset
75
57e85db3f13c Uploaded script
nick
parents:
diff changeset
76 if debug:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
77 print_loc = args.get('print_loc')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
78 if print_loc:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
79 if ':' in print_loc:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
80 (print_chr, print_pos) = print_loc.split(':')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
81 else:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
82 print_pos = print_loc
57e85db3f13c Uploaded script
nick
parents:
diff changeset
83 else:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
84 sys.stderr.write("Warning: No site coordinate found in arguments. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
85 +"Turning off debug mode.\n")
57e85db3f13c Uploaded script
nick
parents:
diff changeset
86 debug = False
57e85db3f13c Uploaded script
nick
parents:
diff changeset
87
57e85db3f13c Uploaded script
nick
parents:
diff changeset
88 # set infile_handle to either stdin or the input file
57e85db3f13c Uploaded script
nick
parents:
diff changeset
89 if infile == OPT_DEFAULTS.get('infile'):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
90 infile_handle = sys.stdin
57e85db3f13c Uploaded script
nick
parents:
diff changeset
91 sys.stderr.write("Reading from standard input..\n")
57e85db3f13c Uploaded script
nick
parents:
diff changeset
92 else:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
93 if os.path.exists(infile):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
94 infile_handle = open(infile, 'r')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
95 else:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
96 fail('Error: Input VCF file '+infile+' not found.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
97
57e85db3f13c Uploaded script
nick
parents:
diff changeset
98 # set outfile_handle to either stdout or the output file
57e85db3f13c Uploaded script
nick
parents:
diff changeset
99 if outfile == OPT_DEFAULTS.get('outfile'):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
100 outfile_handle = sys.stdout
57e85db3f13c Uploaded script
nick
parents:
diff changeset
101 else:
2
d83368b907f7 No error on existing output file
nick
parents: 0
diff changeset
102 try:
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
103 outfile_handle = open(outfile, 'w')
2
d83368b907f7 No error on existing output file
nick
parents: 0
diff changeset
104 except IOError, e:
d83368b907f7 No error on existing output file
nick
parents: 0
diff changeset
105 fail('Error: The given output filename '+outfile+' could not be opened.')
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
106
57e85db3f13c Uploaded script
nick
parents:
diff changeset
107 if print_header:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
108 outfile_handle.write('#'+'\t'.join(COLUMNS)+"\n")
57e85db3f13c Uploaded script
nick
parents:
diff changeset
109
57e85db3f13c Uploaded script
nick
parents:
diff changeset
110 # main loop: process and print one line at a time
57e85db3f13c Uploaded script
nick
parents:
diff changeset
111 sample_names = []
57e85db3f13c Uploaded script
nick
parents:
diff changeset
112 for line in infile_handle:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
113 line = line.rstrip('\r\n')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
114
57e85db3f13c Uploaded script
nick
parents:
diff changeset
115 # header lines
57e85db3f13c Uploaded script
nick
parents:
diff changeset
116 if line[0] == '#':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
117 if line[0:6].upper() == '#CHROM':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
118 sample_names = line.split('\t')[9:]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
119 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
120
57e85db3f13c Uploaded script
nick
parents:
diff changeset
121 if not sample_names:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
122 fail("Error in input VCF: Data line encountered before header line. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
123 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
124
57e85db3f13c Uploaded script
nick
parents:
diff changeset
125 site_data = read_site(line, sample_names, CANONICAL_VARIANTS)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
126
57e85db3f13c Uploaded script
nick
parents:
diff changeset
127 if debug:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
128 if site_data['pos'] != print_pos:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
129 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
130 try:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
131 if site_data['chr'] != print_chr:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
132 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
133 except NameError, e:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
134 pass # No chr specified. Just go ahead and print the line.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
135
57e85db3f13c Uploaded script
nick
parents:
diff changeset
136 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
137 freq_thres, covg_thres, debug=debug)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
138
57e85db3f13c Uploaded script
nick
parents:
diff changeset
139 if debug and site_summary[0]['print']:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
140 print line.split('\t')[9].split(':')[-1]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
141
57e85db3f13c Uploaded script
nick
parents:
diff changeset
142 print_site(outfile_handle, site_summary, COLUMNS)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
143
57e85db3f13c Uploaded script
nick
parents:
diff changeset
144 # close any open filehandles
57e85db3f13c Uploaded script
nick
parents:
diff changeset
145 if infile_handle is not sys.stdin:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
146 infile_handle.close()
57e85db3f13c Uploaded script
nick
parents:
diff changeset
147 if outfile_handle is not sys.stdout:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
148 outfile_handle.close()
57e85db3f13c Uploaded script
nick
parents:
diff changeset
149
57e85db3f13c Uploaded script
nick
parents:
diff changeset
150 # keeps Galaxy from giving an error if there were messages on stderr
57e85db3f13c Uploaded script
nick
parents:
diff changeset
151 sys.exit(0)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
152
57e85db3f13c Uploaded script
nick
parents:
diff changeset
153
57e85db3f13c Uploaded script
nick
parents:
diff changeset
154
57e85db3f13c Uploaded script
nick
parents:
diff changeset
155 def read_site(line, sample_names, canonical):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
156 """Read in a line, parse the variants into a data structure, and return it.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
157 The line should be actual site data, not a header line, so check beforehand.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
158 Notes:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
159 - The line is assumed to have been chomped."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
160
57e85db3f13c Uploaded script
nick
parents:
diff changeset
161 site = {}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
162 fields = line.split('\t')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
163
57e85db3f13c Uploaded script
nick
parents:
diff changeset
164 if len(fields) < 9:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
165 fail("Error in input VCF: wrong number of fields in data line. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
166 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
167
57e85db3f13c Uploaded script
nick
parents:
diff changeset
168 site['chr'] = fields[0]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
169 site['pos'] = fields[1]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
170 samples = fields[9:]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
171
57e85db3f13c Uploaded script
nick
parents:
diff changeset
172 if len(samples) < len(sample_names):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
173 fail("Error in input VCF: missing sample fields in data line. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
174 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
175 elif len(samples) > len(sample_names):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
176 fail("Error in input VCF: more sample fields in data line than in header. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
177 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
178
57e85db3f13c Uploaded script
nick
parents:
diff changeset
179 sample_counts = {}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
180 for i in range(len(samples)):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
181
57e85db3f13c Uploaded script
nick
parents:
diff changeset
182 variant_counts = {}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
183 counts = samples[i].split(':')[-1]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
184 counts = counts.split(',')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
185
57e85db3f13c Uploaded script
nick
parents:
diff changeset
186 for count in counts:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
187 if not count:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
188 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
189 fields = count.split('=')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
190 if len(fields) != 2:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
191 fail("Error in input VCF: Incorrect variant data format (must contain "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
192 +"a single '='). Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
193 (variant, reads) = fields
57e85db3f13c Uploaded script
nick
parents:
diff changeset
194 if variant[1:] not in canonical:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
195 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
196 if variant[0] != '-' and variant[0] != '+':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
197 fail("Error in input VCF: variant data not strand-specific. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
198 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
199 try:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
200 variant_counts[variant] = int(reads)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
201 except ValueError, e:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
202 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
203
57e85db3f13c Uploaded script
nick
parents:
diff changeset
204 sample_counts[sample_names[i]] = variant_counts
57e85db3f13c Uploaded script
nick
parents:
diff changeset
205
57e85db3f13c Uploaded script
nick
parents:
diff changeset
206 site['samples'] = sample_counts
57e85db3f13c Uploaded script
nick
parents:
diff changeset
207
57e85db3f13c Uploaded script
nick
parents:
diff changeset
208 return site
57e85db3f13c Uploaded script
nick
parents:
diff changeset
209
57e85db3f13c Uploaded script
nick
parents:
diff changeset
210
57e85db3f13c Uploaded script
nick
parents:
diff changeset
211 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
212 debug=False):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
213 """Take the raw data from the VCF line and transform it into the summary data
57e85db3f13c Uploaded script
nick
parents:
diff changeset
214 to be printed in the output format."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
215
57e85db3f13c Uploaded script
nick
parents:
diff changeset
216 site_summary = []
57e85db3f13c Uploaded script
nick
parents:
diff changeset
217 for sample_name in sample_names:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
218
57e85db3f13c Uploaded script
nick
parents:
diff changeset
219 sample = {'print':False}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
220 variants = site['samples'].get(sample_name)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
221 if not variants:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
222 site_summary.append(sample)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
223 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
224
57e85db3f13c Uploaded script
nick
parents:
diff changeset
225 sample['sample'] = sample_name
57e85db3f13c Uploaded script
nick
parents:
diff changeset
226 sample['chr'] = site['chr']
57e85db3f13c Uploaded script
nick
parents:
diff changeset
227 sample['pos'] = site['pos']
57e85db3f13c Uploaded script
nick
parents:
diff changeset
228
57e85db3f13c Uploaded script
nick
parents:
diff changeset
229 coverage = sum(variants.values())
57e85db3f13c Uploaded script
nick
parents:
diff changeset
230
57e85db3f13c Uploaded script
nick
parents:
diff changeset
231 # get stranded coverage
57e85db3f13c Uploaded script
nick
parents:
diff changeset
232 covg_plus = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
233 covg_minus = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
234 for variant in variants:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
235 if variant[0] == '+':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
236 covg_plus += variants[variant]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
237 elif variant[0] == '-':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
238 covg_minus += variants[variant]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
239 # stranded coverage threshold
57e85db3f13c Uploaded script
nick
parents:
diff changeset
240 if coverage <= 0 or covg_plus < covg_thres or covg_minus < covg_thres:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
241 site_summary.append(sample)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
242 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
243 else:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
244 sample['print'] = True
57e85db3f13c Uploaded script
nick
parents:
diff changeset
245
57e85db3f13c Uploaded script
nick
parents:
diff changeset
246 # get an ordered list of read counts for all variants (either strand)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
247 ranked_bases = get_read_counts(variants, 0, strands='+-', debug=debug)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
248
57e85db3f13c Uploaded script
nick
parents:
diff changeset
249 # record read counts into dict for this sample
57e85db3f13c Uploaded script
nick
parents:
diff changeset
250 for base in ranked_bases:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
251 sample[base[0]] = base[1]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
252 # fill in any zeros
57e85db3f13c Uploaded script
nick
parents:
diff changeset
253 for variant in canonical:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
254 if not sample.has_key(variant):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
255 sample[variant] = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
256
57e85db3f13c Uploaded script
nick
parents:
diff changeset
257 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
258
57e85db3f13c Uploaded script
nick
parents:
diff changeset
259 # set minor allele to N if there's a tie for 2nd
57e85db3f13c Uploaded script
nick
parents:
diff changeset
260 if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
261 ranked_bases[1] = ('N', 0)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
262 sample['alleles'] = 1 if sample['alleles'] else 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
263
57e85db3f13c Uploaded script
nick
parents:
diff changeset
264 if debug: print ranked_bases
57e85db3f13c Uploaded script
nick
parents:
diff changeset
265
57e85db3f13c Uploaded script
nick
parents:
diff changeset
266 sample['coverage'] = coverage
57e85db3f13c Uploaded script
nick
parents:
diff changeset
267 try:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
268 sample['major'] = ranked_bases[0][0]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
269 except IndexError, e:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
270 sample['major'] = '.'
57e85db3f13c Uploaded script
nick
parents:
diff changeset
271 try:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
272 sample['minor'] = ranked_bases[1][0]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
273 sample['freq'] = ranked_bases[1][1] / float(coverage)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
274 except IndexError, e:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
275 sample['minor'] = '.'
57e85db3f13c Uploaded script
nick
parents:
diff changeset
276 sample['freq'] = 0.0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
277
57e85db3f13c Uploaded script
nick
parents:
diff changeset
278 site_summary.append(sample)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
279
57e85db3f13c Uploaded script
nick
parents:
diff changeset
280 return site_summary
57e85db3f13c Uploaded script
nick
parents:
diff changeset
281
57e85db3f13c Uploaded script
nick
parents:
diff changeset
282
57e85db3f13c Uploaded script
nick
parents:
diff changeset
283 def print_site(filehandle, site, columns):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
284 """Print the output lines for one site (one per sample).
57e85db3f13c Uploaded script
nick
parents:
diff changeset
285 filehandle must be open."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
286 for sample in site:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
287 if sample['print']:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
288 fields = [str(sample.get(column)) for column in columns]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
289 filehandle.write('\t'.join(fields)+"\n")
57e85db3f13c Uploaded script
nick
parents:
diff changeset
290
57e85db3f13c Uploaded script
nick
parents:
diff changeset
291
57e85db3f13c Uploaded script
nick
parents:
diff changeset
292 def get_read_counts(variant_counts, freq_thres, strands='+-', debug=False):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
293 """Count the number of reads for each base, and create a ranked list of
57e85db3f13c Uploaded script
nick
parents:
diff changeset
294 alleles passing the frequency threshold.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
295 Arguments:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
296 variant_counts: Dict of the stranded variants (keys) and their read counts (values).
57e85db3f13c Uploaded script
nick
parents:
diff changeset
297 freq_thres: The frequency threshold each allele needs to pass to be included.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
298 strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default).
57e85db3f13c Uploaded script
nick
parents:
diff changeset
299 variants: A list of the variants of interest. Other types of variants will not
57e85db3f13c Uploaded script
nick
parents:
diff changeset
300 be included in the returned list. If no list is given, all variants found in
57e85db3f13c Uploaded script
nick
parents:
diff changeset
301 the variant_counts will be used.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
302 Return value:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
303 ranked_bases: A list of the alleles and their read counts. The elements are
57e85db3f13c Uploaded script
nick
parents:
diff changeset
304 tuples (base, read count). The alleles are listed in descending order of
57e85db3f13c Uploaded script
nick
parents:
diff changeset
305 frequency, and only those passing the threshold are included."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
306
57e85db3f13c Uploaded script
nick
parents:
diff changeset
307 # Get list of all variants from variant_counts list
57e85db3f13c Uploaded script
nick
parents:
diff changeset
308 variants = [variant[1:] for variant in variant_counts]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
309 # deduplicate via a dict
57e85db3f13c Uploaded script
nick
parents:
diff changeset
310 variant_dict = dict((variant, 1) for variant in variants)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
311 variants = variant_dict.keys()
57e85db3f13c Uploaded script
nick
parents:
diff changeset
312
57e85db3f13c Uploaded script
nick
parents:
diff changeset
313 ranked_bases = []
57e85db3f13c Uploaded script
nick
parents:
diff changeset
314 for variant in variants:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
315 reads = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
316 for strand in strands:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
317 reads += variant_counts.get(strand+variant, 0)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
318 ranked_bases.append((variant, reads))
57e85db3f13c Uploaded script
nick
parents:
diff changeset
319
57e85db3f13c Uploaded script
nick
parents:
diff changeset
320 # get coverage for the specified strands
57e85db3f13c Uploaded script
nick
parents:
diff changeset
321 coverage = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
322 for variant in variant_counts:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
323 if variant[0] in strands:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
324 coverage += variant_counts.get(variant, 0)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
325 # if debug: print "strands: "+strands+', covg: '+str(coverage)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
326
57e85db3f13c Uploaded script
nick
parents:
diff changeset
327 if coverage < 1:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
328 return []
57e85db3f13c Uploaded script
nick
parents:
diff changeset
329
57e85db3f13c Uploaded script
nick
parents:
diff changeset
330 # sort the list of alleles by read count
57e85db3f13c Uploaded script
nick
parents:
diff changeset
331 ranked_bases.sort(reverse=True, key=lambda base: base[1])
57e85db3f13c Uploaded script
nick
parents:
diff changeset
332
57e85db3f13c Uploaded script
nick
parents:
diff changeset
333 if debug:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
334 print strands+' coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
335 for base in ranked_bases:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
336 print (base[0]+': '+str(base[1])+'/'+str(float(coverage))+' = '+
57e85db3f13c Uploaded script
nick
parents:
diff changeset
337 str(base[1]/float(coverage)))
57e85db3f13c Uploaded script
nick
parents:
diff changeset
338
57e85db3f13c Uploaded script
nick
parents:
diff changeset
339 # remove bases below the frequency threshold
57e85db3f13c Uploaded script
nick
parents:
diff changeset
340 ranked_bases = [base for base in ranked_bases
57e85db3f13c Uploaded script
nick
parents:
diff changeset
341 if base[1]/float(coverage) >= freq_thres]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
342
57e85db3f13c Uploaded script
nick
parents:
diff changeset
343 return ranked_bases
57e85db3f13c Uploaded script
nick
parents:
diff changeset
344
57e85db3f13c Uploaded script
nick
parents:
diff changeset
345
57e85db3f13c Uploaded script
nick
parents:
diff changeset
346 def count_alleles(variant_counts, freq_thres, debug=False):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
347 """Determine how many alleles to report, based on filtering rules.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
348 The current rule determines which bases pass the frequency threshold on each
57e85db3f13c Uploaded script
nick
parents:
diff changeset
349 strand individually, then compares the two sets of bases. If they are the same
57e85db3f13c Uploaded script
nick
parents:
diff changeset
350 (regardless of order), the allele count is the number of bases. Otherwise it
57e85db3f13c Uploaded script
nick
parents:
diff changeset
351 is zero."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
352 allele_count = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
353
57e85db3f13c Uploaded script
nick
parents:
diff changeset
354 alleles_plus = get_read_counts(variant_counts, freq_thres, debug=debug,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
355 strands='+')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
356 alleles_minus = get_read_counts(variant_counts, freq_thres, debug=debug,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
357 strands='-')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
358
57e85db3f13c Uploaded script
nick
parents:
diff changeset
359 if debug:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
360 print '+ '+str(alleles_plus)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
361 print '- '+str(alleles_minus)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
362
57e85db3f13c Uploaded script
nick
parents:
diff changeset
363 # check if each strand reports the same set of alleles
57e85db3f13c Uploaded script
nick
parents:
diff changeset
364 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]])
57e85db3f13c Uploaded script
nick
parents:
diff changeset
365 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]])
57e85db3f13c Uploaded script
nick
parents:
diff changeset
366 if alleles_plus_sorted == alleles_minus_sorted:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
367 allele_count = len(alleles_plus)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
368
57e85db3f13c Uploaded script
nick
parents:
diff changeset
369 return allele_count
57e85db3f13c Uploaded script
nick
parents:
diff changeset
370
57e85db3f13c Uploaded script
nick
parents:
diff changeset
371
57e85db3f13c Uploaded script
nick
parents:
diff changeset
372 def fail(message):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
373 sys.stderr.write(message+'\n')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
374 sys.exit(1)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
375
57e85db3f13c Uploaded script
nick
parents:
diff changeset
376 if __name__ == "__main__":
57e85db3f13c Uploaded script
nick
parents:
diff changeset
377 main()