| 10 | 1 #!/usr/bin/python | 
|  | 2 import os | 
|  | 3 import sys | 
|  | 4 from optparse import OptionParser | 
|  | 5 | 
|  | 6 OPT_DEFAULTS = {'freq_thres':0, 'covg_thres':0} | 
|  | 7 BASES = ['A', 'C', 'G', 'T'] | 
|  | 8 USAGE = ("Usage: %prog (options) 'variant_str' ('variant_str2' (etc))\n" | 
|  | 9     +"       cat variants.txt > %prog (options)") | 
|  | 10 | 
|  | 11 def main(): | 
|  | 12 | 
|  | 13   parser = OptionParser(usage=USAGE) | 
|  | 14   parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float', | 
|  | 15     default=OPT_DEFAULTS.get('freq_thres'), | 
|  | 16     help=('Frequency threshold for counting alleles, given in percentage: -f 1 ' | 
|  | 17       +'= 1% frequency. Default is %default%.')) | 
|  | 18   parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int', | 
|  | 19     default=OPT_DEFAULTS.get('covg_thres'), | 
|  | 20     help=('Coverage threshold. Each site must be supported by at least this ' | 
|  | 21       +'many reads on each strand. Otherwise the site will not be printed in ' | 
|  | 22       +'the output. The default is %default reads per strand.')) | 
|  | 23   (options, args) = parser.parse_args() | 
|  | 24   freq_thres = options.freq_thres | 
|  | 25   covg_thres = options.covg_thres | 
|  | 26 | 
|  | 27   if len(sys.argv) > 1 and '-h' in sys.argv[1][0:3]: | 
|  | 28     script_name = os.path.basename(sys.argv[0]) | 
|  | 29     print """USAGE: | 
|  | 30   $ """+script_name+""" [sample column text] | 
|  | 31   $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,' | 
|  | 32   $ """+script_name+""" '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,' | 
|  | 33   $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,' '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,' | 
|  | 34 Or invoke with no arguments to use interactively. It will read from stdin, so | 
|  | 35 just paste one sample per line.""" | 
|  | 36     sys.exit(0) | 
|  | 37 | 
|  | 38   if len(args) > 0: | 
|  | 39     stdin = False | 
|  | 40     samples = args | 
|  | 41   else: | 
|  | 42     stdin = True | 
|  | 43     samples = sys.stdin | 
|  | 44     print "Reading from standard input.." | 
|  | 45 | 
|  | 46   for sample in samples: | 
|  | 47     print '' | 
|  | 48     sample = sample.split(':')[-1] | 
|  | 49     if not sample: | 
|  | 50       continue | 
|  | 51     print sample | 
|  | 52     counts_dict = parse_counts(sample) | 
|  | 53     compute_stats(counts_dict, freq_thres, covg_thres) | 
|  | 54 | 
|  | 55 | 
|  | 56 def parse_counts(sample_str): | 
|  | 57   counts_dict = {} | 
|  | 58   counts = sample_str.split(',') | 
|  | 59   for count in counts: | 
|  | 60     if '=' not in count: | 
|  | 61       continue | 
|  | 62     (var, reads) = count.split('=') | 
|  | 63     if var[1:] in BASES: | 
|  | 64       counts_dict[var] = int(reads) | 
|  | 65   return counts_dict | 
|  | 66 | 
|  | 67 | 
|  | 68 def compute_stats(counts_dict, freq_thres, covg_thres): | 
|  | 69   # totals for A, C, G, T | 
|  | 70   counts_unstranded = {} | 
|  | 71   for base in BASES: | 
|  | 72     counts_unstranded[base] = 0 | 
|  | 73     for strand in '+-': | 
|  | 74       counts_unstranded[base] += counts_dict.get(strand+base, 0) | 
|  | 75   print '+- '+str(counts_unstranded) | 
|  | 76 | 
|  | 77   # get counts for each strand | 
|  | 78   plus_counts  = get_stranded_counts(counts_dict, '+') | 
|  | 79   minus_counts = get_stranded_counts(counts_dict, '-') | 
|  | 80   counts_lists = {'+':plus_counts, '-':minus_counts} | 
|  | 81   print ' + '+str(plus_counts) | 
|  | 82   print ' - '+str(minus_counts) | 
|  | 83 | 
|  | 84   # stranded coverage threshold | 
|  | 85   coverages = {} | 
|  | 86   failing_strands = {} | 
|  | 87   for strand in '+-': | 
|  | 88     coverages[strand] = 0 | 
|  | 89     for count in counts_lists[strand].values(): | 
|  | 90       coverages[strand] += count | 
|  | 91     if coverages[strand] < covg_thres: | 
|  | 92       failing_strands[strand] = coverages[strand] | 
|  | 93     sys.stdout.write(strand+'coverage: '+str(coverages[strand])+"\t") | 
|  | 94   coverages['+-'] = coverages['+'] + coverages['-'] | 
|  | 95   sys.stdout.write("+-coverage: "+str(coverages['+-'])+"\n") | 
|  | 96 | 
|  | 97   if failing_strands: | 
|  | 98     for strand in failing_strands: | 
|  | 99       print ('coverage on '+strand+' strand too low (' | 
|  | 100         +str(failing_strands[strand])+' < '+str(covg_thres)+")") | 
|  | 101     return | 
|  | 102 | 
|  | 103   # apply frequency threshold | 
|  | 104   for strand in counts_lists: | 
|  | 105     strand_counts = counts_lists[strand] | 
|  | 106     for variant in strand_counts.keys(): | 
|  | 107       # print (variant+" freq: "+str(strand_counts[variant])+"/" | 
|  | 108       #   +str(coverages[strand])+" = " | 
|  | 109       #   +str(strand_counts[variant]/float(coverages[strand]))) | 
|  | 110       if strand_counts[variant]/float(coverages[strand]) < freq_thres: | 
|  | 111         strand_counts.pop(variant) | 
|  | 112   plus_variants  = sorted(plus_counts.keys()) | 
|  | 113   minus_variants = sorted(minus_counts.keys()) | 
|  | 114   if plus_variants == minus_variants: | 
|  | 115     strand_bias = False | 
|  | 116     print "no strand bias: +"+str(plus_variants)+" == -"+str(minus_variants) | 
|  | 117     sys.stdout.write("alleles: "+str(len(plus_variants))+"\t") | 
|  | 118   else: | 
|  | 119     strand_bias = True | 
|  | 120     print "   strand bias: +"+str(plus_variants)+" != -"+str(minus_variants) | 
|  | 121     sys.stdout.write("alleles: 0\t") | 
|  | 122 | 
|  | 123   variants_sorted = sort_variants(counts_unstranded) | 
|  | 124   if len(variants_sorted) >= 1: | 
|  | 125     sys.stdout.write("major: "+variants_sorted[0]+"\t") | 
|  | 126   minor = '.' | 
|  | 127   if len(variants_sorted) == 2: | 
|  | 128     minor = variants_sorted[1] | 
|  | 129   elif len(variants_sorted) > 2: | 
|  | 130     if (counts_unstranded.get(variants_sorted[1]) == | 
|  | 131         counts_unstranded.get(variants_sorted[2])): | 
|  | 132       minor = 'N' | 
|  | 133     else: | 
|  | 134       minor = variants_sorted[1] | 
|  | 135 | 
|  | 136   sys.stdout.write("minor: "+minor+"\tfreq: ") | 
|  | 137   print round(float(counts_unstranded.get(minor, 0))/coverages['+-'], 5) | 
|  | 138 | 
|  | 139 | 
|  | 140 def get_stranded_counts(unstranded_counts, strand): | 
|  | 141   stranded_counts = {} | 
|  | 142   for variant in unstranded_counts: | 
|  | 143     if variant[0] == strand: | 
|  | 144       stranded_counts[variant[1:]] = unstranded_counts[variant] | 
|  | 145   return stranded_counts | 
|  | 146 | 
|  | 147 def sort_variants(variant_counts): | 
|  | 148   """Sort the list of variants based on their counts. Returns a list of just | 
|  | 149   the variants, no counts.""" | 
|  | 150   variants = variant_counts.keys() | 
|  | 151   var_del = [] | 
|  | 152   for variant in variants: | 
|  | 153     if variant_counts.get(variant) == 0: | 
|  | 154       var_del.append(variant) | 
|  | 155   for variant in var_del: | 
|  | 156     variants.remove(variant) | 
|  | 157   variants.sort(reverse=True, key=lambda variant: variant_counts.get(variant,0)) | 
|  | 158   return variants | 
|  | 159 | 
|  | 160 if __name__ == "__main__": | 
|  | 161   main() |