Mercurial > repos > nick > allele_counts_1
annotate allele-counts.py @ 21:1fc7bef38c5f draft default tip
"planemo upload for repository https://github.com/galaxyproject/dunovo commit ac9577f0047ad984b307fe2c4b40e2eb45a0e6e2-dirty"
| author | nick | 
|---|---|
| date | Fri, 03 Apr 2020 19:47:34 +0000 | 
| parents | 44c3abd1b767 | 
| children | 
| rev | line source | 
|---|---|
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
1 #!/usr/bin/python3 | 
| 
13
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
2 """ | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
3 Run with -h option or see DESCRIPTION for description. | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
4 This script's functionality is being obsoleted by the new, and much more sanely | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
5 written, nvc-filter.py. | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
6 | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
7 New in this version: | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
8 - Calculate strand bias | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
9 - Rename MINOR.FREQ.PERC to MAF | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
10 | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
11 Naive Variant Caller variant count parsing one-liner: | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
12 $ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:' | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
13 """ | 
| 0 | 14 import os | 
| 15 import sys | |
| 
13
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
16 import errno | 
| 10 | 17 import random | 
| 0 | 18 from optparse import OptionParser | 
| 19 | |
| 
13
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
20 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
21 'major', 'minor', 'freq', 'bias'] | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
22 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
23 'MAJOR', 'MINOR', 'MAF', 'BIAS'] | 
| 0 | 24 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T'] | 
| 
14
 
e5d39283fe4d
allele-counts.py: Document output columns.
 
nicksto <nmapsy@gmail.com> 
parents: 
13 
diff
changeset
 | 
25 USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.tsv | 
| 
 
e5d39283fe4d
allele-counts.py: Document output columns.
 
nicksto <nmapsy@gmail.com> 
parents: 
13 
diff
changeset
 | 
26 cat variants.vcf | %prog [options] > alleles.tsv""" | 
| 0 | 27 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100, | 
| 10 | 28 'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False, | 
| 29 'debug_loc':'', 'seed':''} | |
| 
13
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
30 DESCRIPTION = """This will parse the VCF output of the "Naive Variant Caller" | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
31 (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
32 number of reads of each base, determines the major allele, minor allele (second | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
33 most frequent variant), and number of alleles above a threshold. So currently | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
34 it only considers SNVs (ACGT), including in the coverage figure. By default it | 
| 
14
 
e5d39283fe4d
allele-counts.py: Document output columns.
 
nicksto <nmapsy@gmail.com> 
parents: 
13 
diff
changeset
 | 
35 reads from stdin and prints to stdout. | 
| 
 
e5d39283fe4d
allele-counts.py: Document output columns.
 
nicksto <nmapsy@gmail.com> 
parents: 
13 
diff
changeset
 | 
36 Prints a tab-delimited set of statistics to stdout. | 
| 
 
e5d39283fe4d
allele-counts.py: Document output columns.
 
nicksto <nmapsy@gmail.com> 
parents: 
13 
diff
changeset
 | 
37 To print output column labels, run "$ echo -n | ./allele-counts.py -H". | 
| 
 
e5d39283fe4d
allele-counts.py: Document output columns.
 
nicksto <nmapsy@gmail.com> 
parents: 
13 
diff
changeset
 | 
38 The columns are: 1:SAMPLE 2:CHR 3:POS 4:A 5:C 6:G 7:T 8:CVRG 9:ALLELES 10:MAJOR | 
| 
 
e5d39283fe4d
allele-counts.py: Document output columns.
 
nicksto <nmapsy@gmail.com> 
parents: 
13 
diff
changeset
 | 
39 11:MINOR 12:MAF 13:BIAS, | 
| 
 
e5d39283fe4d
allele-counts.py: Document output columns.
 
nicksto <nmapsy@gmail.com> 
parents: 
13 
diff
changeset
 | 
40 unless the --stranded option is used, in which case they are: | 
| 
 
e5d39283fe4d
allele-counts.py: Document output columns.
 
nicksto <nmapsy@gmail.com> 
parents: 
13 
diff
changeset
 | 
41 1:SAMPLE 2:CHR 3:POS 4:+A 5:+C 6:+G 7:+T 8:-A 9:-C 10:-G 11:-T 12:CVRG | 
| 
 
e5d39283fe4d
allele-counts.py: Document output columns.
 
nicksto <nmapsy@gmail.com> 
parents: 
13 
diff
changeset
 | 
42 13:ALLELES 14:MAJOR 15:MINOR 16:MAF 17:BIAS. | 
| 
 
e5d39283fe4d
allele-counts.py: Document output columns.
 
nicksto <nmapsy@gmail.com> 
parents: 
13 
diff
changeset
 | 
43 """ | 
| 0 | 44 EPILOG = """Requirements: | 
| 45 The input VCF must report the variants for each strand. | |
| 46 The variants should be case-sensitive (e.g. all capital base letters). | |
| 
13
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
47 Strand bias: Both strands must show the same bases passing the frequency | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
48 threshold (but not necessarily in the same order). If the site fails this test, | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
49 the number of alleles is reported as 0.""" | 
| 0 | 50 | 
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
51 | 
| 0 | 52 def get_options(defaults, usage, description='', epilog=''): | 
| 53 """Get options, print usage text.""" | |
| 54 | |
| 55 parser = OptionParser(usage=usage, description=description, epilog=epilog) | |
| 56 | |
| 57 parser.add_option('-i', '--infile', dest='infile', | |
| 58 default=defaults.get('infile'), | |
| 59 help='Read input VCF data from this file instead of stdin.') | |
| 60 parser.add_option('-o', '--outfile', dest='outfile', | |
| 61 default=defaults.get('outfile'), | |
| 62 help='Print output data to this file instead of stdout.') | |
| 63 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float', | |
| 64 default=defaults.get('freq_thres'), | |
| 10 | 65 help=('Frequency threshold for counting alleles, given in percentage: -f 1 ' | 
| 66 +'= 1% frequency. Default is %default%.')) | |
| 0 | 67 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int', | 
| 68 default=defaults.get('covg_thres'), | |
| 10 | 69 help=('Coverage threshold. Each site must be supported by at least this ' | 
| 70 +'many reads on each strand. Otherwise the site will not be printed in ' | |
| 71 +'the output. The default is %default reads per strand.')) | |
| 72 parser.add_option('-n', '--no-filter', dest='no_filter', action='store_const', | |
| 73 const=not(defaults.get('no_filter')), default=defaults.get('no_filter'), | |
| 74 help=('Operate without a frequency threshold or coverage threshold. ' | |
| 75 +'Equivalent to "-c 0 -f 0".')) | |
| 0 | 76 parser.add_option('-H', '--header', dest='print_header', action='store_const', | 
| 77 const=not(defaults.get('print_header')), default=defaults.get('print_header'), | |
| 10 | 78 help=('Print header line. This is a #-commented line with the column ' | 
| 79 +'labels. Off by default.')) | |
| 80 parser.add_option('-s', '--stranded', dest='stranded', action='store_const', | |
| 81 const=not(defaults.get('stranded')), default=defaults.get('stranded'), | |
| 82 help='Report variant counts by strand, in separate columns. Off by default.') | |
| 83 parser.add_option('-r', '--rand-seed', dest='seed', | |
| 84 default=defaults.get('seed'), help=('Seed for random number generator.')) | |
| 85 parser.add_option('-d', '--debug', dest='debug_loc', | |
| 86 default=defaults.get('debug_loc'), | |
| 87 help=('Turn on debug mode and specify a single site to process using UCSC ' | |
| 88 +'coordinate format. You can also append a sample ID after another ":" ' | |
| 89 +'to restrict it further.')) | |
| 0 | 90 | 
| 91 (options, args) = parser.parse_args() | |
| 92 | |
| 10 | 93 return (options, args) | 
| 0 | 94 | 
| 95 | |
| 96 def main(): | |
| 97 | |
| 98 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG) | |
| 99 | |
| 100 infile = options.infile | |
| 101 outfile = options.outfile | |
| 102 print_header = options.print_header | |
| 
13
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
103 freq_thres = options.freq_thres / 100 | 
| 0 | 104 covg_thres = options.covg_thres | 
| 10 | 105 stranded = options.stranded | 
| 106 debug_loc = options.debug_loc | |
| 107 seed = options.seed | |
| 108 | |
| 109 if options.no_filter: | |
| 110 freq_thres = 0 | |
| 111 covg_thres = 0 | |
| 0 | 112 | 
| 10 | 113 if seed: | 
| 114 random.seed(seed) | |
| 115 | |
| 116 debug = False | |
| 117 print_sample = '' | |
| 118 if debug_loc: | |
| 119 debug = True | |
| 120 coords = debug_loc.split(':') | |
| 121 print_chr = coords[0] | |
| 122 print_pos = '' | |
| 123 if len(coords) > 1: print_pos = coords[1] | |
| 124 if len(coords) > 2: print_sample = coords[2] | |
| 0 | 125 | 
| 126 # set infile_handle to either stdin or the input file | |
| 127 if infile == OPT_DEFAULTS.get('infile'): | |
| 128 infile_handle = sys.stdin | |
| 129 sys.stderr.write("Reading from standard input..\n") | |
| 130 else: | |
| 131 if os.path.exists(infile): | |
| 132 infile_handle = open(infile, 'r') | |
| 133 else: | |
| 134 fail('Error: Input VCF file '+infile+' not found.') | |
| 135 | |
| 136 # set outfile_handle to either stdout or the output file | |
| 137 if outfile == OPT_DEFAULTS.get('outfile'): | |
| 138 outfile_handle = sys.stdout | |
| 139 else: | |
| 2 | 140 try: | 
| 0 | 141 outfile_handle = open(outfile, 'w') | 
| 
13
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
142 except IOError: | 
| 2 | 143 fail('Error: The given output filename '+outfile+' could not be opened.') | 
| 0 | 144 | 
| 10 | 145 # Take care of column names, print header | 
| 3 | 146 if len(COLUMNS) != len(COLUMN_LABELS): | 
| 10 | 147 fail('Error: Internal column names list do not match column labels list.') | 
| 148 if stranded: | |
| 149 COLUMNS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T'] | |
| 150 COLUMN_LABELS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T'] | |
| 0 | 151 if print_header: | 
| 4 | 152 outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n") | 
| 0 | 153 | 
| 10 | 154 # main loop | 
| 155 # each iteration processes one VCF line and prints one or more output lines | |
| 156 # one VCF line = one site, one or more samples | |
| 157 # one output line = one site, one sample | |
| 0 | 158 sample_names = [] | 
| 159 for line in infile_handle: | |
| 160 line = line.rstrip('\r\n') | |
| 161 | |
| 162 # header lines | |
| 163 if line[0] == '#': | |
| 164 if line[0:6].upper() == '#CHROM': | |
| 165 sample_names = line.split('\t')[9:] | |
| 166 continue | |
| 167 | |
| 168 if not sample_names: | |
| 169 fail("Error in input VCF: Data line encountered before header line. " | |
| 170 +"Failed on line:\n"+line) | |
| 171 | |
| 172 site_data = read_site(line, sample_names, CANONICAL_VARIANTS) | |
| 173 | |
| 174 if debug: | |
| 10 | 175 if print_pos != site_data['pos']: | 
| 176 continue | |
| 177 if print_chr != site_data['chr'] and print_chr != '': | |
| 0 | 178 continue | 
| 10 | 179 if print_sample != '': | 
| 180 for sample in site_data['samples'].keys(): | |
| 181 if sample.lower() != print_sample.lower(): | |
| 182 site_data['samples'].pop(sample, None) | |
| 183 if len(site_data['samples']) == 0: | |
| 184 sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n") | |
| 185 sys.exit(1) | |
| 186 | |
| 0 | 187 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS, | 
| 10 | 188 freq_thres, covg_thres, stranded, debug=debug) | 
| 0 | 189 | 
| 190 if debug and site_summary[0]['print']: | |
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
191 print(line.split('\t')[9].split(':')[-1]) | 
| 0 | 192 | 
| 
13
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
193 try: | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
194 print_site(outfile_handle, site_summary, COLUMNS) | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
195 except IOError as ioe: | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
196 if ioe.errno == errno.EPIPE: | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
197 sys.exit(0) | 
| 0 | 198 | 
| 199 # keeps Galaxy from giving an error if there were messages on stderr | |
| 200 sys.exit(0) | |
| 201 | |
| 202 | |
| 203 | |
| 204 def read_site(line, sample_names, canonical): | |
| 205 """Read in a line, parse the variants into a data structure, and return it. | |
| 206 The line should be actual site data, not a header line, so check beforehand. | |
| 10 | 207 Only the variants in 'canonical' will be read; all others are ignored. | 
| 208 Note: the line is assumed to have been chomped. | |
| 209 The returned data is stored in a dict, with the following structure: | |
| 210 { | |
| 211 'chr': 'chr1', | |
| 212 'pos': '2617', | |
| 213 'samples': { | |
| 214 'THYROID': { | |
| 215 '+A': 32, | |
| 216 '-A': 45, | |
| 217 '-G': 1, | |
| 218 }, | |
| 219 'BLOOD': { | |
| 220 '+A': 2, | |
| 221 '-C': 1, | |
| 222 '+G': 37, | |
| 223 '-G': 42, | |
| 224 }, | |
| 225 }, | |
| 226 } | |
| 227 """ | |
| 0 | 228 | 
| 229 site = {} | |
| 230 fields = line.split('\t') | |
| 231 | |
| 232 if len(fields) < 9: | |
| 233 fail("Error in input VCF: wrong number of fields in data line. " | |
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
234 "Failed on line:\n"+line) | 
| 0 | 235 | 
| 236 site['chr'] = fields[0] | |
| 237 site['pos'] = fields[1] | |
| 238 samples = fields[9:] | |
| 239 | |
| 240 if len(samples) < len(sample_names): | |
| 241 fail("Error in input VCF: missing sample fields in data line. " | |
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
242 "Failed on line:\n"+line) | 
| 0 | 243 elif len(samples) > len(sample_names): | 
| 244 fail("Error in input VCF: more sample fields in data line than in header. " | |
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
245 "Failed on line:\n"+line) | 
| 0 | 246 | 
| 247 sample_counts = {} | |
| 248 for i in range(len(samples)): | |
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
249 | 
| 0 | 250 variant_counts = {} | 
| 251 counts = samples[i].split(':')[-1] | |
| 252 counts = counts.split(',') | |
| 253 | |
| 254 for count in counts: | |
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
255 if not count or count == '.': | 
| 0 | 256 continue | 
| 257 fields = count.split('=') | |
| 258 if len(fields) != 2: | |
| 259 fail("Error in input VCF: Incorrect variant data format (must contain " | |
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
260 "a single '='). Failed on data \"{}\" in line:\n{}" | 
| 
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
261 .format(count, line)) | 
| 0 | 262 (variant, reads) = fields | 
| 263 if variant[1:] not in canonical: | |
| 264 continue | |
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
265 if not variant.startswith('-') and not variant.startswith('+'): | 
| 
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
266 fail("Error in input VCF: variant data not strand-specific. Failed on " | 
| 
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
267 "data \"{}\" on line:\n{}".format(variant, line)) | 
| 0 | 268 try: | 
| 6 | 269 variant_counts[variant] = int(float(reads)) | 
| 
13
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
270 except ValueError: | 
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
271 fail("Error in input VCF: Variant count not a valid number. Failed on " | 
| 
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
272 "variant count string \"{}\"\nIn the following line:\n{}" | 
| 
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
273 .format(reads, line)) | 
| 0 | 274 | 
| 275 sample_counts[sample_names[i]] = variant_counts | |
| 276 | |
| 277 site['samples'] = sample_counts | |
| 278 | |
| 279 return site | |
| 280 | |
| 281 | |
| 282 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres, | |
| 10 | 283 stranded=False, debug=False): | 
| 0 | 284 """Take the raw data from the VCF line and transform it into the summary data | 
| 285 to be printed in the output format.""" | |
| 286 | |
| 287 site_summary = [] | |
| 288 for sample_name in sample_names: | |
| 289 | |
| 290 sample = {'print':False} | |
| 291 variants = site['samples'].get(sample_name) | |
| 292 | |
| 293 sample['sample'] = sample_name | |
| 294 sample['chr'] = site['chr'] | |
| 295 sample['pos'] = site['pos'] | |
| 296 | |
| 297 coverage = sum(variants.values()) | |
| 298 | |
| 299 # get stranded coverage | |
| 300 covg_plus = 0 | |
| 301 covg_minus = 0 | |
| 302 for variant in variants: | |
| 303 if variant[0] == '+': | |
| 304 covg_plus += variants[variant] | |
| 305 elif variant[0] == '-': | |
| 306 covg_minus += variants[variant] | |
| 307 # stranded coverage threshold | |
| 10 | 308 if covg_plus < covg_thres or covg_minus < covg_thres: | 
| 0 | 309 site_summary.append(sample) | 
| 310 continue | |
| 311 else: | |
| 312 sample['print'] = True | |
| 313 | |
| 10 | 314 # get an ordered list of read counts for all variants (both strands) | 
| 315 bases = get_read_counts(variants, '+-') | |
| 316 ranked_bases = process_read_counts(bases, sort=True, debug=debug) | |
| 317 | |
| 318 # prepare stranded or unstranded lists of base counts | |
| 319 base_count_lists = [] | |
| 320 if stranded: | |
| 321 strands = ['+', '-'] | |
| 322 base_count_lists.append(get_read_counts(variants, '+')) | |
| 323 base_count_lists.append(get_read_counts(variants, '-')) | |
| 324 else: | |
| 325 strands = [''] | |
| 326 base_count_lists.append(ranked_bases) | |
| 0 | 327 | 
| 10 | 328 # record read counts into output dict | 
| 329 # If stranded, this will loop twice, once for each strand, and prepend '+' | |
| 330 # or '-' to the base name. If not stranded, it will loop once, and prepend | |
| 331 # nothing (''). | |
| 332 for (strand, base_count_list) in zip(strands, base_count_lists): | |
| 333 for base_count in base_count_list: | |
| 334 sample[strand+base_count[0]] = base_count[1] | |
| 335 # fill in any zeros | |
| 336 for base in canonical: | |
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
337 if strand+base not in sample: | 
| 10 | 338 sample[strand+base] = 0 | 
| 0 | 339 | 
| 10 | 340 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug) | 
| 0 | 341 | 
| 10 | 342 # If there's a tie for 2nd, randomly choose one to be 2nd | 
| 0 | 343 if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]: | 
| 10 | 344 swap = random.choice([True,False]) | 
| 345 if swap: | |
| 346 tmp_base = ranked_bases[1] | |
| 347 ranked_bases[1] = ranked_bases[2] | |
| 348 ranked_bases[2] = tmp_base | |
| 0 | 349 | 
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
350 if debug: print("ranked +-: "+str(ranked_bases)) | 
| 0 | 351 | 
| 352 sample['coverage'] = coverage | |
| 353 try: | |
| 354 sample['major'] = ranked_bases[0][0] | |
| 
13
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
355 except IndexError: | 
| 0 | 356 sample['major'] = '.' | 
| 357 try: | |
| 358 sample['minor'] = ranked_bases[1][0] | |
| 
13
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
359 sample['freq'] = round(ranked_bases[1][1]/coverage, 5) | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
360 except IndexError: | 
| 0 | 361 sample['minor'] = '.' | 
| 362 sample['freq'] = 0.0 | |
| 363 | |
| 
13
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
364 # Now that we've decided major and minor, we can calculate strand bias | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
365 bias = get_strand_bias(sample, variants) | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
366 if bias is None: | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
367 sample['bias'] = '.' | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
368 else: | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
369 sample['bias'] = round(bias, 5) | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
370 | 
| 0 | 371 site_summary.append(sample) | 
| 372 | |
| 373 return site_summary | |
| 374 | |
| 375 | |
| 10 | 376 def get_read_counts(stranded_counts, strands='+-'): | 
| 377 """Do a simple sum of the read counts per variant, on the specified strands. | |
| 378 Arguments: | |
| 379 stranded_counts: Dict of the stranded variants (keys) and their read counts | |
| 380 (values). | |
| 381 strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default). | |
| 382 Return value: | |
| 383 summed_counts: A list of the alleles and their read counts. The elements are | |
| 384 tuples (variant, read count).""" | |
| 385 | |
| 386 variants = stranded_counts.keys() | |
| 387 | |
| 388 summed_counts = {} | |
| 389 for variant in variants: | |
| 390 strand = variant[0] | |
| 391 base = variant[1:] | |
| 392 if strand in strands: | |
| 393 summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0) | |
| 394 | |
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
395 return list(summed_counts.items()) | 
| 0 | 396 | 
| 397 | |
| 10 | 398 def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False): | 
| 399 """Process a list of read counts by frequency filtering and/or sorting. | |
| 0 | 400 Arguments: | 
| 10 | 401 variant_counts: List of the non-stranded variants and their read counts. The | 
| 402 elements are tuples (variant, read count). | |
| 0 | 403 freq_thres: The frequency threshold each allele needs to pass to be included. | 
| 10 | 404 sort: Whether to sort the list in descending order of read counts. | 
| 0 | 405 Return value: | 
| 10 | 406 variant_counts: A list of the processed alleles and their read counts. The | 
| 407 elements are tuples (variant, read count).""" | |
| 0 | 408 | 
| 409 # get coverage for the specified strands | |
| 410 coverage = 0 | |
| 411 for variant in variant_counts: | |
| 10 | 412 coverage += variant[1] | 
| 0 | 413 | 
| 10 | 414 if coverage <= 0: | 
| 0 | 415 return [] | 
| 416 | |
| 417 # sort the list of alleles by read count | |
| 10 | 418 if sort: | 
| 419 variant_counts.sort(reverse=True, key=lambda variant: variant[1]) | |
| 0 | 420 | 
| 421 if debug: | |
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
422 print('coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)) | 
| 10 | 423 for variant in variant_counts: | 
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
424 print((variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+ | 
| 
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
425 str(variant[1]/coverage))) | 
| 0 | 426 | 
| 427 # remove bases below the frequency threshold | |
| 10 | 428 if freq_thres > 0: | 
| 429 variant_counts = [variant for variant in variant_counts | |
| 
13
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
430 if variant[1]/coverage >= freq_thres] | 
| 0 | 431 | 
| 10 | 432 return variant_counts | 
| 0 | 433 | 
| 434 | |
| 435 def count_alleles(variant_counts, freq_thres, debug=False): | |
| 436 """Determine how many alleles to report, based on filtering rules. | |
| 437 The current rule determines which bases pass the frequency threshold on each | |
| 438 strand individually, then compares the two sets of bases. If they are the same | |
| 439 (regardless of order), the allele count is the number of bases. Otherwise it | |
| 440 is zero.""" | |
| 441 allele_count = 0 | |
| 442 | |
| 10 | 443 alleles_plus = get_read_counts(variant_counts, '+') | 
| 444 alleles_plus = process_read_counts(alleles_plus, freq_thres=freq_thres, | |
| 445 sort=False, debug=debug) | |
| 446 alleles_minus = get_read_counts(variant_counts, '-') | |
| 447 alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres, | |
| 448 sort=False, debug=debug) | |
| 0 | 449 | 
| 450 if debug: | |
| 
17
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
451 print('+ '+str(alleles_plus)) | 
| 
 
44c3abd1b767
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
 
nick 
parents: 
14 
diff
changeset
 | 
452 print('- '+str(alleles_minus)) | 
| 0 | 453 | 
| 10 | 454 # Check if each strand reports the same set of alleles. | 
| 455 # Sorting by base is to compare lists without regard to order (as sets). | |
| 0 | 456 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]]) | 
| 457 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]]) | |
| 458 if alleles_plus_sorted == alleles_minus_sorted: | |
| 459 allele_count = len(alleles_plus) | |
| 460 | |
| 461 return allele_count | |
| 462 | |
| 463 | |
| 
13
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
464 def get_strand_bias(sample, variants): | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
465 """Based on method 1 (SB) of Guo et al., 2012 | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
466 If there a denominator would be 0, there is no valid result and this will | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
467 return None. This occurs when there are no reads on one of the strands, or | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
468 when there are no minor allele reads.""" | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
469 # using same variable names as in paper | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
470 a = variants.get('+'+sample['major'], 0) # forward major allele | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
471 b = variants.get('+'+sample['minor'], 0) # forward minor allele | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
472 c = variants.get('-'+sample['major'], 0) # reverse major allele | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
473 d = variants.get('-'+sample['minor'], 0) # reverse minor allele | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
474 # no minor allele reads | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
475 try: | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
476 return abs(b/(a+b) - d/(c+d)) / ((b+d) / (a+b+c+d)) | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
477 except ZeroDivisionError: | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
478 return None | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
479 | 
| 
 
df1fb577db0d
Version 1.2: Add strand bias column, rename minor allele frequency column.
 
me <nmapsy@gmail.com> 
parents: 
10 
diff
changeset
 | 
480 | 
| 10 | 481 def print_site(filehandle, site, columns): | 
| 482 """Print the output lines for one site (one per sample). | |
| 483 filehandle must be open.""" | |
| 484 for sample in site: | |
| 485 if sample['print']: | |
| 486 fields = [str(sample.get(column)) for column in columns] | |
| 487 filehandle.write('\t'.join(fields)+"\n") | |
| 488 | |
| 489 | |
| 0 | 490 def fail(message): | 
| 491 sys.stderr.write(message+'\n') | |
| 492 sys.exit(1) | |
| 493 | |
| 10 | 494 | 
| 0 | 495 if __name__ == "__main__": | 
| 496 main() | 
