annotate allele-counts.py @ 10:db6f217dc45a

Uploaded tarball. Version 1.1. - Added stranded output. - Added, tested no-filter mode. - now prints all sites. - Changed handling of minor allele ties. - Added test datasets. - Revised help text.
author nick
date Fri, 06 Sep 2013 20:37:07 -0400
parents 34892857567b
children df1fb577db0d
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
1 #!/usr/bin/python
57e85db3f13c Uploaded script
nick
parents:
diff changeset
2 # This parses the output of Dan's "Naive Variant Detector" (previously,
57e85db3f13c Uploaded script
nick
parents:
diff changeset
3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py".
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
4 # Or, to put it briefly,
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
5 # cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
6 #
3
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
7 # New in this version:
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
8 # Made header line customizable
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
9 # - separate from internal column labels, which are used as dict keys
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
10 import os
57e85db3f13c Uploaded script
nick
parents:
diff changeset
11 import sys
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
12 import random
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
13 from optparse import OptionParser
57e85db3f13c Uploaded script
nick
parents:
diff changeset
14
3
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
15 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias']
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
16 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
17 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
18 USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.csv
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
19 cat variants.vcf | %prog [options] > alleles.csv"""
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
20 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
21 'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False,
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
22 'debug_loc':'', 'seed':''}
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
23 DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
24 EPILOG = """Requirements:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
25 The input VCF must report the variants for each strand.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
26 The variants should be case-sensitive (e.g. all capital base letters).
57e85db3f13c Uploaded script
nick
parents:
diff changeset
27 Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
28
57e85db3f13c Uploaded script
nick
parents:
diff changeset
29
57e85db3f13c Uploaded script
nick
parents:
diff changeset
30 def get_options(defaults, usage, description='', epilog=''):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
31 """Get options, print usage text."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
32
57e85db3f13c Uploaded script
nick
parents:
diff changeset
33 parser = OptionParser(usage=usage, description=description, epilog=epilog)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
34
57e85db3f13c Uploaded script
nick
parents:
diff changeset
35 parser.add_option('-i', '--infile', dest='infile',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
36 default=defaults.get('infile'),
57e85db3f13c Uploaded script
nick
parents:
diff changeset
37 help='Read input VCF data from this file instead of stdin.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
38 parser.add_option('-o', '--outfile', dest='outfile',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
39 default=defaults.get('outfile'),
57e85db3f13c Uploaded script
nick
parents:
diff changeset
40 help='Print output data to this file instead of stdout.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
41 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
42 default=defaults.get('freq_thres'),
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
43 help=('Frequency threshold for counting alleles, given in percentage: -f 1 '
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
44 +'= 1% frequency. Default is %default%.'))
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
45 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
46 default=defaults.get('covg_thres'),
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
47 help=('Coverage threshold. Each site must be supported by at least this '
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
48 +'many reads on each strand. Otherwise the site will not be printed in '
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
49 +'the output. The default is %default reads per strand.'))
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
50 parser.add_option('-n', '--no-filter', dest='no_filter', action='store_const',
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
51 const=not(defaults.get('no_filter')), default=defaults.get('no_filter'),
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
52 help=('Operate without a frequency threshold or coverage threshold. '
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
53 +'Equivalent to "-c 0 -f 0".'))
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
54 parser.add_option('-H', '--header', dest='print_header', action='store_const',
57e85db3f13c Uploaded script
nick
parents:
diff changeset
55 const=not(defaults.get('print_header')), default=defaults.get('print_header'),
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
56 help=('Print header line. This is a #-commented line with the column '
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
57 +'labels. Off by default.'))
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
58 parser.add_option('-s', '--stranded', dest='stranded', action='store_const',
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
59 const=not(defaults.get('stranded')), default=defaults.get('stranded'),
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
60 help='Report variant counts by strand, in separate columns. Off by default.')
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
61 parser.add_option('-r', '--rand-seed', dest='seed',
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
62 default=defaults.get('seed'), help=('Seed for random number generator.'))
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
63 parser.add_option('-d', '--debug', dest='debug_loc',
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
64 default=defaults.get('debug_loc'),
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
65 help=('Turn on debug mode and specify a single site to process using UCSC '
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
66 +'coordinate format. You can also append a sample ID after another ":" '
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
67 +'to restrict it further.'))
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
68
57e85db3f13c Uploaded script
nick
parents:
diff changeset
69 (options, args) = parser.parse_args()
57e85db3f13c Uploaded script
nick
parents:
diff changeset
70
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
71 return (options, args)
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
72
57e85db3f13c Uploaded script
nick
parents:
diff changeset
73
57e85db3f13c Uploaded script
nick
parents:
diff changeset
74 def main():
57e85db3f13c Uploaded script
nick
parents:
diff changeset
75
57e85db3f13c Uploaded script
nick
parents:
diff changeset
76 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
77
57e85db3f13c Uploaded script
nick
parents:
diff changeset
78 infile = options.infile
57e85db3f13c Uploaded script
nick
parents:
diff changeset
79 outfile = options.outfile
57e85db3f13c Uploaded script
nick
parents:
diff changeset
80 print_header = options.print_header
57e85db3f13c Uploaded script
nick
parents:
diff changeset
81 freq_thres = options.freq_thres / 100.0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
82 covg_thres = options.covg_thres
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
83 stranded = options.stranded
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
84 debug_loc = options.debug_loc
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
85 seed = options.seed
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
86
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
87 if options.no_filter:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
88 freq_thres = 0
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
89 covg_thres = 0
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
90
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
91 if seed:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
92 random.seed(seed)
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
93
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
94 debug = False
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
95 print_sample = ''
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
96 if debug_loc:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
97 debug = True
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
98 coords = debug_loc.split(':')
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
99 print_chr = coords[0]
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
100 print_pos = ''
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
101 if len(coords) > 1: print_pos = coords[1]
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
102 if len(coords) > 2: print_sample = coords[2]
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
103
57e85db3f13c Uploaded script
nick
parents:
diff changeset
104 # set infile_handle to either stdin or the input file
57e85db3f13c Uploaded script
nick
parents:
diff changeset
105 if infile == OPT_DEFAULTS.get('infile'):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
106 infile_handle = sys.stdin
57e85db3f13c Uploaded script
nick
parents:
diff changeset
107 sys.stderr.write("Reading from standard input..\n")
57e85db3f13c Uploaded script
nick
parents:
diff changeset
108 else:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
109 if os.path.exists(infile):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
110 infile_handle = open(infile, 'r')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
111 else:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
112 fail('Error: Input VCF file '+infile+' not found.')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
113
57e85db3f13c Uploaded script
nick
parents:
diff changeset
114 # set outfile_handle to either stdout or the output file
57e85db3f13c Uploaded script
nick
parents:
diff changeset
115 if outfile == OPT_DEFAULTS.get('outfile'):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
116 outfile_handle = sys.stdout
57e85db3f13c Uploaded script
nick
parents:
diff changeset
117 else:
2
d83368b907f7 No error on existing output file
nick
parents: 0
diff changeset
118 try:
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
119 outfile_handle = open(outfile, 'w')
2
d83368b907f7 No error on existing output file
nick
parents: 0
diff changeset
120 except IOError, e:
d83368b907f7 No error on existing output file
nick
parents: 0
diff changeset
121 fail('Error: The given output filename '+outfile+' could not be opened.')
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
122
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
123 # Take care of column names, print header
3
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
124 if len(COLUMNS) != len(COLUMN_LABELS):
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
125 fail('Error: Internal column names list do not match column labels list.')
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
126 if stranded:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
127 COLUMNS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T']
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
128 COLUMN_LABELS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T']
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
129 if print_header:
4
900d91d653cb Add '#' back to header line
nick
parents: 3
diff changeset
130 outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n")
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
131
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
132 # main loop
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
133 # each iteration processes one VCF line and prints one or more output lines
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
134 # one VCF line = one site, one or more samples
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
135 # one output line = one site, one sample
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
136 sample_names = []
57e85db3f13c Uploaded script
nick
parents:
diff changeset
137 for line in infile_handle:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
138 line = line.rstrip('\r\n')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
139
57e85db3f13c Uploaded script
nick
parents:
diff changeset
140 # header lines
57e85db3f13c Uploaded script
nick
parents:
diff changeset
141 if line[0] == '#':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
142 if line[0:6].upper() == '#CHROM':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
143 sample_names = line.split('\t')[9:]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
144 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
145
57e85db3f13c Uploaded script
nick
parents:
diff changeset
146 if not sample_names:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
147 fail("Error in input VCF: Data line encountered before header line. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
148 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
149
57e85db3f13c Uploaded script
nick
parents:
diff changeset
150 site_data = read_site(line, sample_names, CANONICAL_VARIANTS)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
151
57e85db3f13c Uploaded script
nick
parents:
diff changeset
152 if debug:
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
153 if print_pos != site_data['pos']:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
154 continue
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
155 if print_chr != site_data['chr'] and print_chr != '':
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
156 continue
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
157 if print_sample != '':
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
158 for sample in site_data['samples'].keys():
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
159 if sample.lower() != print_sample.lower():
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
160 site_data['samples'].pop(sample, None)
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
161 if len(site_data['samples']) == 0:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
162 sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n")
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
163 sys.exit(1)
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
164
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
165
57e85db3f13c Uploaded script
nick
parents:
diff changeset
166 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS,
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
167 freq_thres, covg_thres, stranded, debug=debug)
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
168
57e85db3f13c Uploaded script
nick
parents:
diff changeset
169 if debug and site_summary[0]['print']:
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
170 print line.split('\t')[9].split(':')[-1]
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
171
57e85db3f13c Uploaded script
nick
parents:
diff changeset
172 print_site(outfile_handle, site_summary, COLUMNS)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
173
57e85db3f13c Uploaded script
nick
parents:
diff changeset
174 # close any open filehandles
57e85db3f13c Uploaded script
nick
parents:
diff changeset
175 if infile_handle is not sys.stdin:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
176 infile_handle.close()
57e85db3f13c Uploaded script
nick
parents:
diff changeset
177 if outfile_handle is not sys.stdout:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
178 outfile_handle.close()
57e85db3f13c Uploaded script
nick
parents:
diff changeset
179
57e85db3f13c Uploaded script
nick
parents:
diff changeset
180 # keeps Galaxy from giving an error if there were messages on stderr
57e85db3f13c Uploaded script
nick
parents:
diff changeset
181 sys.exit(0)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
182
57e85db3f13c Uploaded script
nick
parents:
diff changeset
183
57e85db3f13c Uploaded script
nick
parents:
diff changeset
184
57e85db3f13c Uploaded script
nick
parents:
diff changeset
185 def read_site(line, sample_names, canonical):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
186 """Read in a line, parse the variants into a data structure, and return it.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
187 The line should be actual site data, not a header line, so check beforehand.
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
188 Only the variants in 'canonical' will be read; all others are ignored.
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
189 Note: the line is assumed to have been chomped.
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
190 The returned data is stored in a dict, with the following structure:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
191 {
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
192 'chr': 'chr1',
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
193 'pos': '2617',
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
194 'samples': {
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
195 'THYROID': {
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
196 '+A': 32,
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
197 '-A': 45,
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
198 '-G': 1,
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
199 },
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
200 'BLOOD': {
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
201 '+A': 2,
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
202 '-C': 1,
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
203 '+G': 37,
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
204 '-G': 42,
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
205 },
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
206 },
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
207 }
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
208 """
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
209
57e85db3f13c Uploaded script
nick
parents:
diff changeset
210 site = {}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
211 fields = line.split('\t')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
212
57e85db3f13c Uploaded script
nick
parents:
diff changeset
213 if len(fields) < 9:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
214 fail("Error in input VCF: wrong number of fields in data line. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
215 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
216
57e85db3f13c Uploaded script
nick
parents:
diff changeset
217 site['chr'] = fields[0]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
218 site['pos'] = fields[1]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
219 samples = fields[9:]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
220
57e85db3f13c Uploaded script
nick
parents:
diff changeset
221 if len(samples) < len(sample_names):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
222 fail("Error in input VCF: missing sample fields in data line. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
223 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
224 elif len(samples) > len(sample_names):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
225 fail("Error in input VCF: more sample fields in data line than in header. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
226 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
227
57e85db3f13c Uploaded script
nick
parents:
diff changeset
228 sample_counts = {}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
229 for i in range(len(samples)):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
230
57e85db3f13c Uploaded script
nick
parents:
diff changeset
231 variant_counts = {}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
232 counts = samples[i].split(':')[-1]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
233 counts = counts.split(',')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
234
57e85db3f13c Uploaded script
nick
parents:
diff changeset
235 for count in counts:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
236 if not count:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
237 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
238 fields = count.split('=')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
239 if len(fields) != 2:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
240 fail("Error in input VCF: Incorrect variant data format (must contain "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
241 +"a single '='). Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
242 (variant, reads) = fields
57e85db3f13c Uploaded script
nick
parents:
diff changeset
243 if variant[1:] not in canonical:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
244 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
245 if variant[0] != '-' and variant[0] != '+':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
246 fail("Error in input VCF: variant data not strand-specific. "
57e85db3f13c Uploaded script
nick
parents:
diff changeset
247 +"Failed on line:\n"+line)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
248 try:
6
f6d9742c1da0 fixed error on decimal variant counts
nick
parents: 4
diff changeset
249 variant_counts[variant] = int(float(reads))
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
250 except ValueError, e:
7
34892857567b Ready for release - script
nick
parents: 6
diff changeset
251 fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line)
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
252
57e85db3f13c Uploaded script
nick
parents:
diff changeset
253 sample_counts[sample_names[i]] = variant_counts
57e85db3f13c Uploaded script
nick
parents:
diff changeset
254
57e85db3f13c Uploaded script
nick
parents:
diff changeset
255 site['samples'] = sample_counts
57e85db3f13c Uploaded script
nick
parents:
diff changeset
256
57e85db3f13c Uploaded script
nick
parents:
diff changeset
257 return site
57e85db3f13c Uploaded script
nick
parents:
diff changeset
258
57e85db3f13c Uploaded script
nick
parents:
diff changeset
259
57e85db3f13c Uploaded script
nick
parents:
diff changeset
260 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres,
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
261 stranded=False, debug=False):
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
262 """Take the raw data from the VCF line and transform it into the summary data
57e85db3f13c Uploaded script
nick
parents:
diff changeset
263 to be printed in the output format."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
264
57e85db3f13c Uploaded script
nick
parents:
diff changeset
265 site_summary = []
57e85db3f13c Uploaded script
nick
parents:
diff changeset
266 for sample_name in sample_names:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
267
57e85db3f13c Uploaded script
nick
parents:
diff changeset
268 sample = {'print':False}
57e85db3f13c Uploaded script
nick
parents:
diff changeset
269 variants = site['samples'].get(sample_name)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
270
57e85db3f13c Uploaded script
nick
parents:
diff changeset
271 sample['sample'] = sample_name
57e85db3f13c Uploaded script
nick
parents:
diff changeset
272 sample['chr'] = site['chr']
57e85db3f13c Uploaded script
nick
parents:
diff changeset
273 sample['pos'] = site['pos']
57e85db3f13c Uploaded script
nick
parents:
diff changeset
274
57e85db3f13c Uploaded script
nick
parents:
diff changeset
275 coverage = sum(variants.values())
57e85db3f13c Uploaded script
nick
parents:
diff changeset
276
57e85db3f13c Uploaded script
nick
parents:
diff changeset
277 # get stranded coverage
57e85db3f13c Uploaded script
nick
parents:
diff changeset
278 covg_plus = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
279 covg_minus = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
280 for variant in variants:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
281 if variant[0] == '+':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
282 covg_plus += variants[variant]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
283 elif variant[0] == '-':
57e85db3f13c Uploaded script
nick
parents:
diff changeset
284 covg_minus += variants[variant]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
285 # stranded coverage threshold
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
286 if covg_plus < covg_thres or covg_minus < covg_thres:
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
287 site_summary.append(sample)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
288 continue
57e85db3f13c Uploaded script
nick
parents:
diff changeset
289 else:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
290 sample['print'] = True
57e85db3f13c Uploaded script
nick
parents:
diff changeset
291
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
292 # get an ordered list of read counts for all variants (both strands)
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
293 bases = get_read_counts(variants, '+-')
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
294 ranked_bases = process_read_counts(bases, sort=True, debug=debug)
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
295
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
296 # prepare stranded or unstranded lists of base counts
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
297 base_count_lists = []
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
298 if stranded:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
299 strands = ['+', '-']
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
300 base_count_lists.append(get_read_counts(variants, '+'))
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
301 base_count_lists.append(get_read_counts(variants, '-'))
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
302 else:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
303 strands = ['']
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
304 base_count_lists.append(ranked_bases)
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
305
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
306 # record read counts into output dict
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
307 # If stranded, this will loop twice, once for each strand, and prepend '+'
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
308 # or '-' to the base name. If not stranded, it will loop once, and prepend
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
309 # nothing ('').
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
310 for (strand, base_count_list) in zip(strands, base_count_lists):
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
311 for base_count in base_count_list:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
312 sample[strand+base_count[0]] = base_count[1]
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
313 # fill in any zeros
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
314 for base in canonical:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
315 if not sample.has_key(strand+base):
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
316 sample[strand+base] = 0
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
317
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
318 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
319
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
320 # If there's a tie for 2nd, randomly choose one to be 2nd
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
321 if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]:
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
322 swap = random.choice([True,False])
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
323 if swap:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
324 tmp_base = ranked_bases[1]
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
325 ranked_bases[1] = ranked_bases[2]
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
326 ranked_bases[2] = tmp_base
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
327
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
328 if debug: print "ranked +-: "+str(ranked_bases)
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
329
57e85db3f13c Uploaded script
nick
parents:
diff changeset
330 sample['coverage'] = coverage
57e85db3f13c Uploaded script
nick
parents:
diff changeset
331 try:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
332 sample['major'] = ranked_bases[0][0]
57e85db3f13c Uploaded script
nick
parents:
diff changeset
333 except IndexError, e:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
334 sample['major'] = '.'
57e85db3f13c Uploaded script
nick
parents:
diff changeset
335 try:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
336 sample['minor'] = ranked_bases[1][0]
3
4627d99aa105 New script version - change in header
nick
parents: 2
diff changeset
337 sample['freq'] = round(ranked_bases[1][1]/float(coverage), 5)
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
338 except IndexError, e:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
339 sample['minor'] = '.'
57e85db3f13c Uploaded script
nick
parents:
diff changeset
340 sample['freq'] = 0.0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
341
57e85db3f13c Uploaded script
nick
parents:
diff changeset
342 site_summary.append(sample)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
343
57e85db3f13c Uploaded script
nick
parents:
diff changeset
344 return site_summary
57e85db3f13c Uploaded script
nick
parents:
diff changeset
345
57e85db3f13c Uploaded script
nick
parents:
diff changeset
346
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
347 def get_read_counts(stranded_counts, strands='+-'):
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
348 """Do a simple sum of the read counts per variant, on the specified strands.
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
349 Arguments:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
350 stranded_counts: Dict of the stranded variants (keys) and their read counts
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
351 (values).
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
352 strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default).
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
353 Return value:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
354 summed_counts: A list of the alleles and their read counts. The elements are
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
355 tuples (variant, read count)."""
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
356
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
357 variants = stranded_counts.keys()
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
358
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
359 summed_counts = {}
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
360 for variant in variants:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
361 strand = variant[0]
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
362 base = variant[1:]
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
363 if strand in strands:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
364 summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0)
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
365
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
366 return summed_counts.items()
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
367
57e85db3f13c Uploaded script
nick
parents:
diff changeset
368
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
369 def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False):
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
370 """Process a list of read counts by frequency filtering and/or sorting.
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
371 Arguments:
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
372 variant_counts: List of the non-stranded variants and their read counts. The
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
373 elements are tuples (variant, read count).
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
374 freq_thres: The frequency threshold each allele needs to pass to be included.
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
375 sort: Whether to sort the list in descending order of read counts.
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
376 Return value:
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
377 variant_counts: A list of the processed alleles and their read counts. The
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
378 elements are tuples (variant, read count)."""
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
379
57e85db3f13c Uploaded script
nick
parents:
diff changeset
380 # get coverage for the specified strands
57e85db3f13c Uploaded script
nick
parents:
diff changeset
381 coverage = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
382 for variant in variant_counts:
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
383 coverage += variant[1]
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
384
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
385 if coverage <= 0:
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
386 return []
57e85db3f13c Uploaded script
nick
parents:
diff changeset
387
57e85db3f13c Uploaded script
nick
parents:
diff changeset
388 # sort the list of alleles by read count
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
389 if sort:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
390 variant_counts.sort(reverse=True, key=lambda variant: variant[1])
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
391
57e85db3f13c Uploaded script
nick
parents:
diff changeset
392 if debug:
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
393 print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
394 for variant in variant_counts:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
395 print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
396 str(variant[1]/float(coverage)))
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
397
57e85db3f13c Uploaded script
nick
parents:
diff changeset
398 # remove bases below the frequency threshold
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
399 if freq_thres > 0:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
400 variant_counts = [variant for variant in variant_counts
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
401 if variant[1]/float(coverage) >= freq_thres]
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
402
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
403 return variant_counts
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
404
57e85db3f13c Uploaded script
nick
parents:
diff changeset
405
57e85db3f13c Uploaded script
nick
parents:
diff changeset
406 def count_alleles(variant_counts, freq_thres, debug=False):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
407 """Determine how many alleles to report, based on filtering rules.
57e85db3f13c Uploaded script
nick
parents:
diff changeset
408 The current rule determines which bases pass the frequency threshold on each
57e85db3f13c Uploaded script
nick
parents:
diff changeset
409 strand individually, then compares the two sets of bases. If they are the same
57e85db3f13c Uploaded script
nick
parents:
diff changeset
410 (regardless of order), the allele count is the number of bases. Otherwise it
57e85db3f13c Uploaded script
nick
parents:
diff changeset
411 is zero."""
57e85db3f13c Uploaded script
nick
parents:
diff changeset
412 allele_count = 0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
413
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
414 alleles_plus = get_read_counts(variant_counts, '+')
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
415 alleles_plus = process_read_counts(alleles_plus, freq_thres=freq_thres,
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
416 sort=False, debug=debug)
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
417 alleles_minus = get_read_counts(variant_counts, '-')
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
418 alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres,
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
419 sort=False, debug=debug)
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
420
57e85db3f13c Uploaded script
nick
parents:
diff changeset
421 if debug:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
422 print '+ '+str(alleles_plus)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
423 print '- '+str(alleles_minus)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
424
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
425 # Check if each strand reports the same set of alleles.
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
426 # Sorting by base is to compare lists without regard to order (as sets).
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
427 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]])
57e85db3f13c Uploaded script
nick
parents:
diff changeset
428 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]])
57e85db3f13c Uploaded script
nick
parents:
diff changeset
429 if alleles_plus_sorted == alleles_minus_sorted:
57e85db3f13c Uploaded script
nick
parents:
diff changeset
430 allele_count = len(alleles_plus)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
431
57e85db3f13c Uploaded script
nick
parents:
diff changeset
432 return allele_count
57e85db3f13c Uploaded script
nick
parents:
diff changeset
433
57e85db3f13c Uploaded script
nick
parents:
diff changeset
434
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
435 def print_site(filehandle, site, columns):
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
436 """Print the output lines for one site (one per sample).
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
437 filehandle must be open."""
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
438 for sample in site:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
439 if sample['print']:
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
440 fields = [str(sample.get(column)) for column in columns]
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
441 filehandle.write('\t'.join(fields)+"\n")
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
442
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
443
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
444 def fail(message):
57e85db3f13c Uploaded script
nick
parents:
diff changeset
445 sys.stderr.write(message+'\n')
57e85db3f13c Uploaded script
nick
parents:
diff changeset
446 sys.exit(1)
57e85db3f13c Uploaded script
nick
parents:
diff changeset
447
10
db6f217dc45a Uploaded tarball.
nick
parents: 7
diff changeset
448
0
57e85db3f13c Uploaded script
nick
parents:
diff changeset
449 if __name__ == "__main__":
57e85db3f13c Uploaded script
nick
parents:
diff changeset
450 main()