Mercurial > repos > nick > allele_counts_1
comparison tests/compute-site.py @ 10:db6f217dc45a
Uploaded tarball.
Version 1.1.
- Added stranded output.
- Added, tested no-filter mode.
- now prints all sites.
- Changed handling of minor allele ties.
- Added test datasets.
- Revised help text.
| author | nick |
|---|---|
| date | Fri, 06 Sep 2013 20:37:07 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 9:eca1ea054d0d | 10:db6f217dc45a |
|---|---|
| 1 #!/usr/bin/python | |
| 2 import os | |
| 3 import sys | |
| 4 from optparse import OptionParser | |
| 5 | |
| 6 OPT_DEFAULTS = {'freq_thres':0, 'covg_thres':0} | |
| 7 BASES = ['A', 'C', 'G', 'T'] | |
| 8 USAGE = ("Usage: %prog (options) 'variant_str' ('variant_str2' (etc))\n" | |
| 9 +" cat variants.txt > %prog (options)") | |
| 10 | |
| 11 def main(): | |
| 12 | |
| 13 parser = OptionParser(usage=USAGE) | |
| 14 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float', | |
| 15 default=OPT_DEFAULTS.get('freq_thres'), | |
| 16 help=('Frequency threshold for counting alleles, given in percentage: -f 1 ' | |
| 17 +'= 1% frequency. Default is %default%.')) | |
| 18 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int', | |
| 19 default=OPT_DEFAULTS.get('covg_thres'), | |
| 20 help=('Coverage threshold. Each site must be supported by at least this ' | |
| 21 +'many reads on each strand. Otherwise the site will not be printed in ' | |
| 22 +'the output. The default is %default reads per strand.')) | |
| 23 (options, args) = parser.parse_args() | |
| 24 freq_thres = options.freq_thres | |
| 25 covg_thres = options.covg_thres | |
| 26 | |
| 27 if len(sys.argv) > 1 and '-h' in sys.argv[1][0:3]: | |
| 28 script_name = os.path.basename(sys.argv[0]) | |
| 29 print """USAGE: | |
| 30 $ """+script_name+""" [sample column text] | |
| 31 $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,' | |
| 32 $ """+script_name+""" '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,' | |
| 33 $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,' '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,' | |
| 34 Or invoke with no arguments to use interactively. It will read from stdin, so | |
| 35 just paste one sample per line.""" | |
| 36 sys.exit(0) | |
| 37 | |
| 38 if len(args) > 0: | |
| 39 stdin = False | |
| 40 samples = args | |
| 41 else: | |
| 42 stdin = True | |
| 43 samples = sys.stdin | |
| 44 print "Reading from standard input.." | |
| 45 | |
| 46 for sample in samples: | |
| 47 print '' | |
| 48 sample = sample.split(':')[-1] | |
| 49 if not sample: | |
| 50 continue | |
| 51 print sample | |
| 52 counts_dict = parse_counts(sample) | |
| 53 compute_stats(counts_dict, freq_thres, covg_thres) | |
| 54 | |
| 55 | |
| 56 def parse_counts(sample_str): | |
| 57 counts_dict = {} | |
| 58 counts = sample_str.split(',') | |
| 59 for count in counts: | |
| 60 if '=' not in count: | |
| 61 continue | |
| 62 (var, reads) = count.split('=') | |
| 63 if var[1:] in BASES: | |
| 64 counts_dict[var] = int(reads) | |
| 65 return counts_dict | |
| 66 | |
| 67 | |
| 68 def compute_stats(counts_dict, freq_thres, covg_thres): | |
| 69 # totals for A, C, G, T | |
| 70 counts_unstranded = {} | |
| 71 for base in BASES: | |
| 72 counts_unstranded[base] = 0 | |
| 73 for strand in '+-': | |
| 74 counts_unstranded[base] += counts_dict.get(strand+base, 0) | |
| 75 print '+- '+str(counts_unstranded) | |
| 76 | |
| 77 # get counts for each strand | |
| 78 plus_counts = get_stranded_counts(counts_dict, '+') | |
| 79 minus_counts = get_stranded_counts(counts_dict, '-') | |
| 80 counts_lists = {'+':plus_counts, '-':minus_counts} | |
| 81 print ' + '+str(plus_counts) | |
| 82 print ' - '+str(minus_counts) | |
| 83 | |
| 84 # stranded coverage threshold | |
| 85 coverages = {} | |
| 86 failing_strands = {} | |
| 87 for strand in '+-': | |
| 88 coverages[strand] = 0 | |
| 89 for count in counts_lists[strand].values(): | |
| 90 coverages[strand] += count | |
| 91 if coverages[strand] < covg_thres: | |
| 92 failing_strands[strand] = coverages[strand] | |
| 93 sys.stdout.write(strand+'coverage: '+str(coverages[strand])+"\t") | |
| 94 coverages['+-'] = coverages['+'] + coverages['-'] | |
| 95 sys.stdout.write("+-coverage: "+str(coverages['+-'])+"\n") | |
| 96 | |
| 97 if failing_strands: | |
| 98 for strand in failing_strands: | |
| 99 print ('coverage on '+strand+' strand too low (' | |
| 100 +str(failing_strands[strand])+' < '+str(covg_thres)+")") | |
| 101 return | |
| 102 | |
| 103 # apply frequency threshold | |
| 104 for strand in counts_lists: | |
| 105 strand_counts = counts_lists[strand] | |
| 106 for variant in strand_counts.keys(): | |
| 107 # print (variant+" freq: "+str(strand_counts[variant])+"/" | |
| 108 # +str(coverages[strand])+" = " | |
| 109 # +str(strand_counts[variant]/float(coverages[strand]))) | |
| 110 if strand_counts[variant]/float(coverages[strand]) < freq_thres: | |
| 111 strand_counts.pop(variant) | |
| 112 plus_variants = sorted(plus_counts.keys()) | |
| 113 minus_variants = sorted(minus_counts.keys()) | |
| 114 if plus_variants == minus_variants: | |
| 115 strand_bias = False | |
| 116 print "no strand bias: +"+str(plus_variants)+" == -"+str(minus_variants) | |
| 117 sys.stdout.write("alleles: "+str(len(plus_variants))+"\t") | |
| 118 else: | |
| 119 strand_bias = True | |
| 120 print " strand bias: +"+str(plus_variants)+" != -"+str(minus_variants) | |
| 121 sys.stdout.write("alleles: 0\t") | |
| 122 | |
| 123 variants_sorted = sort_variants(counts_unstranded) | |
| 124 if len(variants_sorted) >= 1: | |
| 125 sys.stdout.write("major: "+variants_sorted[0]+"\t") | |
| 126 minor = '.' | |
| 127 if len(variants_sorted) == 2: | |
| 128 minor = variants_sorted[1] | |
| 129 elif len(variants_sorted) > 2: | |
| 130 if (counts_unstranded.get(variants_sorted[1]) == | |
| 131 counts_unstranded.get(variants_sorted[2])): | |
| 132 minor = 'N' | |
| 133 else: | |
| 134 minor = variants_sorted[1] | |
| 135 | |
| 136 sys.stdout.write("minor: "+minor+"\tfreq: ") | |
| 137 print round(float(counts_unstranded.get(minor, 0))/coverages['+-'], 5) | |
| 138 | |
| 139 | |
| 140 def get_stranded_counts(unstranded_counts, strand): | |
| 141 stranded_counts = {} | |
| 142 for variant in unstranded_counts: | |
| 143 if variant[0] == strand: | |
| 144 stranded_counts[variant[1:]] = unstranded_counts[variant] | |
| 145 return stranded_counts | |
| 146 | |
| 147 def sort_variants(variant_counts): | |
| 148 """Sort the list of variants based on their counts. Returns a list of just | |
| 149 the variants, no counts.""" | |
| 150 variants = variant_counts.keys() | |
| 151 var_del = [] | |
| 152 for variant in variants: | |
| 153 if variant_counts.get(variant) == 0: | |
| 154 var_del.append(variant) | |
| 155 for variant in var_del: | |
| 156 variants.remove(variant) | |
| 157 variants.sort(reverse=True, key=lambda variant: variant_counts.get(variant,0)) | |
| 158 return variants | |
| 159 | |
| 160 if __name__ == "__main__": | |
| 161 main() |
