Mercurial > repos > nick > allele_counts_1
annotate allele-counts.py @ 17:44c3abd1b767 draft
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
| author | nick |
|---|---|
| date | Tue, 31 Mar 2020 09:00:51 +0000 |
| parents | e5d39283fe4d |
| children |
| rev | line source |
|---|---|
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
1 #!/usr/bin/python3 |
|
13
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
2 """ |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
3 Run with -h option or see DESCRIPTION for description. |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
4 This script's functionality is being obsoleted by the new, and much more sanely |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
5 written, nvc-filter.py. |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
6 |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
7 New in this version: |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
8 - Calculate strand bias |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
9 - Rename MINOR.FREQ.PERC to MAF |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
10 |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
11 Naive Variant Caller variant count parsing one-liner: |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
12 $ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:' |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
13 """ |
| 0 | 14 import os |
| 15 import sys | |
|
13
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
16 import errno |
| 10 | 17 import random |
| 0 | 18 from optparse import OptionParser |
| 19 | |
|
13
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
20 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
21 'major', 'minor', 'freq', 'bias'] |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
22 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
23 'MAJOR', 'MINOR', 'MAF', 'BIAS'] |
| 0 | 24 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T'] |
|
14
e5d39283fe4d
allele-counts.py: Document output columns.
nicksto <nmapsy@gmail.com>
parents:
13
diff
changeset
|
25 USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.tsv |
|
e5d39283fe4d
allele-counts.py: Document output columns.
nicksto <nmapsy@gmail.com>
parents:
13
diff
changeset
|
26 cat variants.vcf | %prog [options] > alleles.tsv""" |
| 0 | 27 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100, |
| 10 | 28 'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False, |
| 29 'debug_loc':'', 'seed':''} | |
|
13
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
30 DESCRIPTION = """This will parse the VCF output of the "Naive Variant Caller" |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
31 (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
32 number of reads of each base, determines the major allele, minor allele (second |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
33 most frequent variant), and number of alleles above a threshold. So currently |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
34 it only considers SNVs (ACGT), including in the coverage figure. By default it |
|
14
e5d39283fe4d
allele-counts.py: Document output columns.
nicksto <nmapsy@gmail.com>
parents:
13
diff
changeset
|
35 reads from stdin and prints to stdout. |
|
e5d39283fe4d
allele-counts.py: Document output columns.
nicksto <nmapsy@gmail.com>
parents:
13
diff
changeset
|
36 Prints a tab-delimited set of statistics to stdout. |
|
e5d39283fe4d
allele-counts.py: Document output columns.
nicksto <nmapsy@gmail.com>
parents:
13
diff
changeset
|
37 To print output column labels, run "$ echo -n | ./allele-counts.py -H". |
|
e5d39283fe4d
allele-counts.py: Document output columns.
nicksto <nmapsy@gmail.com>
parents:
13
diff
changeset
|
38 The columns are: 1:SAMPLE 2:CHR 3:POS 4:A 5:C 6:G 7:T 8:CVRG 9:ALLELES 10:MAJOR |
|
e5d39283fe4d
allele-counts.py: Document output columns.
nicksto <nmapsy@gmail.com>
parents:
13
diff
changeset
|
39 11:MINOR 12:MAF 13:BIAS, |
|
e5d39283fe4d
allele-counts.py: Document output columns.
nicksto <nmapsy@gmail.com>
parents:
13
diff
changeset
|
40 unless the --stranded option is used, in which case they are: |
|
e5d39283fe4d
allele-counts.py: Document output columns.
nicksto <nmapsy@gmail.com>
parents:
13
diff
changeset
|
41 1:SAMPLE 2:CHR 3:POS 4:+A 5:+C 6:+G 7:+T 8:-A 9:-C 10:-G 11:-T 12:CVRG |
|
e5d39283fe4d
allele-counts.py: Document output columns.
nicksto <nmapsy@gmail.com>
parents:
13
diff
changeset
|
42 13:ALLELES 14:MAJOR 15:MINOR 16:MAF 17:BIAS. |
|
e5d39283fe4d
allele-counts.py: Document output columns.
nicksto <nmapsy@gmail.com>
parents:
13
diff
changeset
|
43 """ |
| 0 | 44 EPILOG = """Requirements: |
| 45 The input VCF must report the variants for each strand. | |
| 46 The variants should be case-sensitive (e.g. all capital base letters). | |
|
13
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
47 Strand bias: Both strands must show the same bases passing the frequency |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
48 threshold (but not necessarily in the same order). If the site fails this test, |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
49 the number of alleles is reported as 0.""" |
| 0 | 50 |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
51 |
| 0 | 52 def get_options(defaults, usage, description='', epilog=''): |
| 53 """Get options, print usage text.""" | |
| 54 | |
| 55 parser = OptionParser(usage=usage, description=description, epilog=epilog) | |
| 56 | |
| 57 parser.add_option('-i', '--infile', dest='infile', | |
| 58 default=defaults.get('infile'), | |
| 59 help='Read input VCF data from this file instead of stdin.') | |
| 60 parser.add_option('-o', '--outfile', dest='outfile', | |
| 61 default=defaults.get('outfile'), | |
| 62 help='Print output data to this file instead of stdout.') | |
| 63 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float', | |
| 64 default=defaults.get('freq_thres'), | |
| 10 | 65 help=('Frequency threshold for counting alleles, given in percentage: -f 1 ' |
| 66 +'= 1% frequency. Default is %default%.')) | |
| 0 | 67 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int', |
| 68 default=defaults.get('covg_thres'), | |
| 10 | 69 help=('Coverage threshold. Each site must be supported by at least this ' |
| 70 +'many reads on each strand. Otherwise the site will not be printed in ' | |
| 71 +'the output. The default is %default reads per strand.')) | |
| 72 parser.add_option('-n', '--no-filter', dest='no_filter', action='store_const', | |
| 73 const=not(defaults.get('no_filter')), default=defaults.get('no_filter'), | |
| 74 help=('Operate without a frequency threshold or coverage threshold. ' | |
| 75 +'Equivalent to "-c 0 -f 0".')) | |
| 0 | 76 parser.add_option('-H', '--header', dest='print_header', action='store_const', |
| 77 const=not(defaults.get('print_header')), default=defaults.get('print_header'), | |
| 10 | 78 help=('Print header line. This is a #-commented line with the column ' |
| 79 +'labels. Off by default.')) | |
| 80 parser.add_option('-s', '--stranded', dest='stranded', action='store_const', | |
| 81 const=not(defaults.get('stranded')), default=defaults.get('stranded'), | |
| 82 help='Report variant counts by strand, in separate columns. Off by default.') | |
| 83 parser.add_option('-r', '--rand-seed', dest='seed', | |
| 84 default=defaults.get('seed'), help=('Seed for random number generator.')) | |
| 85 parser.add_option('-d', '--debug', dest='debug_loc', | |
| 86 default=defaults.get('debug_loc'), | |
| 87 help=('Turn on debug mode and specify a single site to process using UCSC ' | |
| 88 +'coordinate format. You can also append a sample ID after another ":" ' | |
| 89 +'to restrict it further.')) | |
| 0 | 90 |
| 91 (options, args) = parser.parse_args() | |
| 92 | |
| 10 | 93 return (options, args) |
| 0 | 94 |
| 95 | |
| 96 def main(): | |
| 97 | |
| 98 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG) | |
| 99 | |
| 100 infile = options.infile | |
| 101 outfile = options.outfile | |
| 102 print_header = options.print_header | |
|
13
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
103 freq_thres = options.freq_thres / 100 |
| 0 | 104 covg_thres = options.covg_thres |
| 10 | 105 stranded = options.stranded |
| 106 debug_loc = options.debug_loc | |
| 107 seed = options.seed | |
| 108 | |
| 109 if options.no_filter: | |
| 110 freq_thres = 0 | |
| 111 covg_thres = 0 | |
| 0 | 112 |
| 10 | 113 if seed: |
| 114 random.seed(seed) | |
| 115 | |
| 116 debug = False | |
| 117 print_sample = '' | |
| 118 if debug_loc: | |
| 119 debug = True | |
| 120 coords = debug_loc.split(':') | |
| 121 print_chr = coords[0] | |
| 122 print_pos = '' | |
| 123 if len(coords) > 1: print_pos = coords[1] | |
| 124 if len(coords) > 2: print_sample = coords[2] | |
| 0 | 125 |
| 126 # set infile_handle to either stdin or the input file | |
| 127 if infile == OPT_DEFAULTS.get('infile'): | |
| 128 infile_handle = sys.stdin | |
| 129 sys.stderr.write("Reading from standard input..\n") | |
| 130 else: | |
| 131 if os.path.exists(infile): | |
| 132 infile_handle = open(infile, 'r') | |
| 133 else: | |
| 134 fail('Error: Input VCF file '+infile+' not found.') | |
| 135 | |
| 136 # set outfile_handle to either stdout or the output file | |
| 137 if outfile == OPT_DEFAULTS.get('outfile'): | |
| 138 outfile_handle = sys.stdout | |
| 139 else: | |
| 2 | 140 try: |
| 0 | 141 outfile_handle = open(outfile, 'w') |
|
13
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
142 except IOError: |
| 2 | 143 fail('Error: The given output filename '+outfile+' could not be opened.') |
| 0 | 144 |
| 10 | 145 # Take care of column names, print header |
| 3 | 146 if len(COLUMNS) != len(COLUMN_LABELS): |
| 10 | 147 fail('Error: Internal column names list do not match column labels list.') |
| 148 if stranded: | |
| 149 COLUMNS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T'] | |
| 150 COLUMN_LABELS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T'] | |
| 0 | 151 if print_header: |
| 4 | 152 outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n") |
| 0 | 153 |
| 10 | 154 # main loop |
| 155 # each iteration processes one VCF line and prints one or more output lines | |
| 156 # one VCF line = one site, one or more samples | |
| 157 # one output line = one site, one sample | |
| 0 | 158 sample_names = [] |
| 159 for line in infile_handle: | |
| 160 line = line.rstrip('\r\n') | |
| 161 | |
| 162 # header lines | |
| 163 if line[0] == '#': | |
| 164 if line[0:6].upper() == '#CHROM': | |
| 165 sample_names = line.split('\t')[9:] | |
| 166 continue | |
| 167 | |
| 168 if not sample_names: | |
| 169 fail("Error in input VCF: Data line encountered before header line. " | |
| 170 +"Failed on line:\n"+line) | |
| 171 | |
| 172 site_data = read_site(line, sample_names, CANONICAL_VARIANTS) | |
| 173 | |
| 174 if debug: | |
| 10 | 175 if print_pos != site_data['pos']: |
| 176 continue | |
| 177 if print_chr != site_data['chr'] and print_chr != '': | |
| 0 | 178 continue |
| 10 | 179 if print_sample != '': |
| 180 for sample in site_data['samples'].keys(): | |
| 181 if sample.lower() != print_sample.lower(): | |
| 182 site_data['samples'].pop(sample, None) | |
| 183 if len(site_data['samples']) == 0: | |
| 184 sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n") | |
| 185 sys.exit(1) | |
| 186 | |
| 0 | 187 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS, |
| 10 | 188 freq_thres, covg_thres, stranded, debug=debug) |
| 0 | 189 |
| 190 if debug and site_summary[0]['print']: | |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
191 print(line.split('\t')[9].split(':')[-1]) |
| 0 | 192 |
|
13
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
193 try: |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
194 print_site(outfile_handle, site_summary, COLUMNS) |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
195 except IOError as ioe: |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
196 if ioe.errno == errno.EPIPE: |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
197 sys.exit(0) |
| 0 | 198 |
| 199 # keeps Galaxy from giving an error if there were messages on stderr | |
| 200 sys.exit(0) | |
| 201 | |
| 202 | |
| 203 | |
| 204 def read_site(line, sample_names, canonical): | |
| 205 """Read in a line, parse the variants into a data structure, and return it. | |
| 206 The line should be actual site data, not a header line, so check beforehand. | |
| 10 | 207 Only the variants in 'canonical' will be read; all others are ignored. |
| 208 Note: the line is assumed to have been chomped. | |
| 209 The returned data is stored in a dict, with the following structure: | |
| 210 { | |
| 211 'chr': 'chr1', | |
| 212 'pos': '2617', | |
| 213 'samples': { | |
| 214 'THYROID': { | |
| 215 '+A': 32, | |
| 216 '-A': 45, | |
| 217 '-G': 1, | |
| 218 }, | |
| 219 'BLOOD': { | |
| 220 '+A': 2, | |
| 221 '-C': 1, | |
| 222 '+G': 37, | |
| 223 '-G': 42, | |
| 224 }, | |
| 225 }, | |
| 226 } | |
| 227 """ | |
| 0 | 228 |
| 229 site = {} | |
| 230 fields = line.split('\t') | |
| 231 | |
| 232 if len(fields) < 9: | |
| 233 fail("Error in input VCF: wrong number of fields in data line. " | |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
234 "Failed on line:\n"+line) |
| 0 | 235 |
| 236 site['chr'] = fields[0] | |
| 237 site['pos'] = fields[1] | |
| 238 samples = fields[9:] | |
| 239 | |
| 240 if len(samples) < len(sample_names): | |
| 241 fail("Error in input VCF: missing sample fields in data line. " | |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
242 "Failed on line:\n"+line) |
| 0 | 243 elif len(samples) > len(sample_names): |
| 244 fail("Error in input VCF: more sample fields in data line than in header. " | |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
245 "Failed on line:\n"+line) |
| 0 | 246 |
| 247 sample_counts = {} | |
| 248 for i in range(len(samples)): | |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
249 |
| 0 | 250 variant_counts = {} |
| 251 counts = samples[i].split(':')[-1] | |
| 252 counts = counts.split(',') | |
| 253 | |
| 254 for count in counts: | |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
255 if not count or count == '.': |
| 0 | 256 continue |
| 257 fields = count.split('=') | |
| 258 if len(fields) != 2: | |
| 259 fail("Error in input VCF: Incorrect variant data format (must contain " | |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
260 "a single '='). Failed on data \"{}\" in line:\n{}" |
|
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
261 .format(count, line)) |
| 0 | 262 (variant, reads) = fields |
| 263 if variant[1:] not in canonical: | |
| 264 continue | |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
265 if not variant.startswith('-') and not variant.startswith('+'): |
|
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
266 fail("Error in input VCF: variant data not strand-specific. Failed on " |
|
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
267 "data \"{}\" on line:\n{}".format(variant, line)) |
| 0 | 268 try: |
| 6 | 269 variant_counts[variant] = int(float(reads)) |
|
13
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
270 except ValueError: |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
271 fail("Error in input VCF: Variant count not a valid number. Failed on " |
|
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
272 "variant count string \"{}\"\nIn the following line:\n{}" |
|
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
273 .format(reads, line)) |
| 0 | 274 |
| 275 sample_counts[sample_names[i]] = variant_counts | |
| 276 | |
| 277 site['samples'] = sample_counts | |
| 278 | |
| 279 return site | |
| 280 | |
| 281 | |
| 282 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres, | |
| 10 | 283 stranded=False, debug=False): |
| 0 | 284 """Take the raw data from the VCF line and transform it into the summary data |
| 285 to be printed in the output format.""" | |
| 286 | |
| 287 site_summary = [] | |
| 288 for sample_name in sample_names: | |
| 289 | |
| 290 sample = {'print':False} | |
| 291 variants = site['samples'].get(sample_name) | |
| 292 | |
| 293 sample['sample'] = sample_name | |
| 294 sample['chr'] = site['chr'] | |
| 295 sample['pos'] = site['pos'] | |
| 296 | |
| 297 coverage = sum(variants.values()) | |
| 298 | |
| 299 # get stranded coverage | |
| 300 covg_plus = 0 | |
| 301 covg_minus = 0 | |
| 302 for variant in variants: | |
| 303 if variant[0] == '+': | |
| 304 covg_plus += variants[variant] | |
| 305 elif variant[0] == '-': | |
| 306 covg_minus += variants[variant] | |
| 307 # stranded coverage threshold | |
| 10 | 308 if covg_plus < covg_thres or covg_minus < covg_thres: |
| 0 | 309 site_summary.append(sample) |
| 310 continue | |
| 311 else: | |
| 312 sample['print'] = True | |
| 313 | |
| 10 | 314 # get an ordered list of read counts for all variants (both strands) |
| 315 bases = get_read_counts(variants, '+-') | |
| 316 ranked_bases = process_read_counts(bases, sort=True, debug=debug) | |
| 317 | |
| 318 # prepare stranded or unstranded lists of base counts | |
| 319 base_count_lists = [] | |
| 320 if stranded: | |
| 321 strands = ['+', '-'] | |
| 322 base_count_lists.append(get_read_counts(variants, '+')) | |
| 323 base_count_lists.append(get_read_counts(variants, '-')) | |
| 324 else: | |
| 325 strands = [''] | |
| 326 base_count_lists.append(ranked_bases) | |
| 0 | 327 |
| 10 | 328 # record read counts into output dict |
| 329 # If stranded, this will loop twice, once for each strand, and prepend '+' | |
| 330 # or '-' to the base name. If not stranded, it will loop once, and prepend | |
| 331 # nothing (''). | |
| 332 for (strand, base_count_list) in zip(strands, base_count_lists): | |
| 333 for base_count in base_count_list: | |
| 334 sample[strand+base_count[0]] = base_count[1] | |
| 335 # fill in any zeros | |
| 336 for base in canonical: | |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
337 if strand+base not in sample: |
| 10 | 338 sample[strand+base] = 0 |
| 0 | 339 |
| 10 | 340 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug) |
| 0 | 341 |
| 10 | 342 # If there's a tie for 2nd, randomly choose one to be 2nd |
| 0 | 343 if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]: |
| 10 | 344 swap = random.choice([True,False]) |
| 345 if swap: | |
| 346 tmp_base = ranked_bases[1] | |
| 347 ranked_bases[1] = ranked_bases[2] | |
| 348 ranked_bases[2] = tmp_base | |
| 0 | 349 |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
350 if debug: print("ranked +-: "+str(ranked_bases)) |
| 0 | 351 |
| 352 sample['coverage'] = coverage | |
| 353 try: | |
| 354 sample['major'] = ranked_bases[0][0] | |
|
13
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
355 except IndexError: |
| 0 | 356 sample['major'] = '.' |
| 357 try: | |
| 358 sample['minor'] = ranked_bases[1][0] | |
|
13
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
359 sample['freq'] = round(ranked_bases[1][1]/coverage, 5) |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
360 except IndexError: |
| 0 | 361 sample['minor'] = '.' |
| 362 sample['freq'] = 0.0 | |
| 363 | |
|
13
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
364 # Now that we've decided major and minor, we can calculate strand bias |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
365 bias = get_strand_bias(sample, variants) |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
366 if bias is None: |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
367 sample['bias'] = '.' |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
368 else: |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
369 sample['bias'] = round(bias, 5) |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
370 |
| 0 | 371 site_summary.append(sample) |
| 372 | |
| 373 return site_summary | |
| 374 | |
| 375 | |
| 10 | 376 def get_read_counts(stranded_counts, strands='+-'): |
| 377 """Do a simple sum of the read counts per variant, on the specified strands. | |
| 378 Arguments: | |
| 379 stranded_counts: Dict of the stranded variants (keys) and their read counts | |
| 380 (values). | |
| 381 strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default). | |
| 382 Return value: | |
| 383 summed_counts: A list of the alleles and their read counts. The elements are | |
| 384 tuples (variant, read count).""" | |
| 385 | |
| 386 variants = stranded_counts.keys() | |
| 387 | |
| 388 summed_counts = {} | |
| 389 for variant in variants: | |
| 390 strand = variant[0] | |
| 391 base = variant[1:] | |
| 392 if strand in strands: | |
| 393 summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0) | |
| 394 | |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
395 return list(summed_counts.items()) |
| 0 | 396 |
| 397 | |
| 10 | 398 def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False): |
| 399 """Process a list of read counts by frequency filtering and/or sorting. | |
| 0 | 400 Arguments: |
| 10 | 401 variant_counts: List of the non-stranded variants and their read counts. The |
| 402 elements are tuples (variant, read count). | |
| 0 | 403 freq_thres: The frequency threshold each allele needs to pass to be included. |
| 10 | 404 sort: Whether to sort the list in descending order of read counts. |
| 0 | 405 Return value: |
| 10 | 406 variant_counts: A list of the processed alleles and their read counts. The |
| 407 elements are tuples (variant, read count).""" | |
| 0 | 408 |
| 409 # get coverage for the specified strands | |
| 410 coverage = 0 | |
| 411 for variant in variant_counts: | |
| 10 | 412 coverage += variant[1] |
| 0 | 413 |
| 10 | 414 if coverage <= 0: |
| 0 | 415 return [] |
| 416 | |
| 417 # sort the list of alleles by read count | |
| 10 | 418 if sort: |
| 419 variant_counts.sort(reverse=True, key=lambda variant: variant[1]) | |
| 0 | 420 |
| 421 if debug: | |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
422 print('coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)) |
| 10 | 423 for variant in variant_counts: |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
424 print((variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+ |
|
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
425 str(variant[1]/coverage))) |
| 0 | 426 |
| 427 # remove bases below the frequency threshold | |
| 10 | 428 if freq_thres > 0: |
| 429 variant_counts = [variant for variant in variant_counts | |
|
13
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
430 if variant[1]/coverage >= freq_thres] |
| 0 | 431 |
| 10 | 432 return variant_counts |
| 0 | 433 |
| 434 | |
| 435 def count_alleles(variant_counts, freq_thres, debug=False): | |
| 436 """Determine how many alleles to report, based on filtering rules. | |
| 437 The current rule determines which bases pass the frequency threshold on each | |
| 438 strand individually, then compares the two sets of bases. If they are the same | |
| 439 (regardless of order), the allele count is the number of bases. Otherwise it | |
| 440 is zero.""" | |
| 441 allele_count = 0 | |
| 442 | |
| 10 | 443 alleles_plus = get_read_counts(variant_counts, '+') |
| 444 alleles_plus = process_read_counts(alleles_plus, freq_thres=freq_thres, | |
| 445 sort=False, debug=debug) | |
| 446 alleles_minus = get_read_counts(variant_counts, '-') | |
| 447 alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres, | |
| 448 sort=False, debug=debug) | |
| 0 | 449 |
| 450 if debug: | |
|
17
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
451 print('+ '+str(alleles_plus)) |
|
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
14
diff
changeset
|
452 print('- '+str(alleles_minus)) |
| 0 | 453 |
| 10 | 454 # Check if each strand reports the same set of alleles. |
| 455 # Sorting by base is to compare lists without regard to order (as sets). | |
| 0 | 456 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]]) |
| 457 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]]) | |
| 458 if alleles_plus_sorted == alleles_minus_sorted: | |
| 459 allele_count = len(alleles_plus) | |
| 460 | |
| 461 return allele_count | |
| 462 | |
| 463 | |
|
13
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
464 def get_strand_bias(sample, variants): |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
465 """Based on method 1 (SB) of Guo et al., 2012 |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
466 If there a denominator would be 0, there is no valid result and this will |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
467 return None. This occurs when there are no reads on one of the strands, or |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
468 when there are no minor allele reads.""" |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
469 # using same variable names as in paper |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
470 a = variants.get('+'+sample['major'], 0) # forward major allele |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
471 b = variants.get('+'+sample['minor'], 0) # forward minor allele |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
472 c = variants.get('-'+sample['major'], 0) # reverse major allele |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
473 d = variants.get('-'+sample['minor'], 0) # reverse minor allele |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
474 # no minor allele reads |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
475 try: |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
476 return abs(b/(a+b) - d/(c+d)) / ((b+d) / (a+b+c+d)) |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
477 except ZeroDivisionError: |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
478 return None |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
479 |
|
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
me <nmapsy@gmail.com>
parents:
10
diff
changeset
|
480 |
| 10 | 481 def print_site(filehandle, site, columns): |
| 482 """Print the output lines for one site (one per sample). | |
| 483 filehandle must be open.""" | |
| 484 for sample in site: | |
| 485 if sample['print']: | |
| 486 fields = [str(sample.get(column)) for column in columns] | |
| 487 filehandle.write('\t'.join(fields)+"\n") | |
| 488 | |
| 489 | |
| 0 | 490 def fail(message): |
| 491 sys.stderr.write(message+'\n') | |
| 492 sys.exit(1) | |
| 493 | |
| 10 | 494 |
| 0 | 495 if __name__ == "__main__": |
| 496 main() |
