Mercurial > repos > boris > hetbox
comparison hetbox.py @ 0:153d1a6e8c5e draft
Uploaded script
author | boris |
---|---|
date | Thu, 13 Jun 2013 15:09:48 -0400 |
parents | |
children | 57c5ea9c3c5c |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:153d1a6e8c5e |
---|---|
1 #!/usr/bin/env python | |
2 # Code by Boris Rebolledo-Jaramillo | |
3 # (boris-at-bx.psu.edu) | |
4 # Edited by Nick Stoler | |
5 # (nick-at-bx.psu.edu) | |
6 # New in this version: | |
7 # - Add in proper header line if not present | |
8 | |
9 import os | |
10 import sys | |
11 import numpy | |
12 from rpy2.robjects import Formula | |
13 from rpy2.robjects.packages import importr | |
14 from rpy2 import robjects | |
15 | |
16 def fail(message): | |
17 sys.stderr.write(message+'\n') | |
18 sys.exit(1) | |
19 | |
20 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', | |
21 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] | |
22 | |
23 args = sys.argv[1:] | |
24 if len(args) >= 1: | |
25 infile = args[0] | |
26 else: | |
27 fail('Error: No input filename provided (as argument 1).') | |
28 if len(args) >= 2: | |
29 outfile = args[1] | |
30 else: | |
31 fail('Error: No output filename provided (as argument 2).') | |
32 if len(args) >= 3: | |
33 report = args[2] | |
34 else: | |
35 report = '' | |
36 | |
37 # Check input file | |
38 add_header = False | |
39 if not os.path.exists(infile): | |
40 fail('Error: Input file '+infile+' could not be found.') | |
41 with open(infile, 'r') as lines: | |
42 line = lines.readline() | |
43 if not line: | |
44 fail('Error: Input file seems to be empty') | |
45 line = line.strip().lstrip('#') # rm whitespace, comment chars | |
46 labels = line.split("\t") | |
47 if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.': | |
48 sys.stderr.write("Error: Input file does not seem to have a proper header " | |
49 +"line.\nAdding an artificial header..") | |
50 add_header = True | |
51 | |
52 base = importr('base') | |
53 utils = importr('utils') | |
54 stats = importr('stats') | |
55 rprint = robjects.globalenv.get("print") | |
56 graphics = importr('graphics') | |
57 grdevices = importr('grDevices') | |
58 grdevices.png(file=outfile, width=1024, height=768) | |
59 | |
60 # Read file into a data frame | |
61 if add_header: | |
62 # add header line manually if not present | |
63 DATA = utils.read_delim(infile, header=False) | |
64 labels = robjects.r.names(DATA) | |
65 for i in range(len(labels)): | |
66 try: | |
67 labels[i] = COLUMN_LABELS[i] | |
68 except IndexError, e: | |
69 fail("Error in input file: Too many columns (does not match hardcoded " | |
70 +"column labels).") | |
71 else: | |
72 DATA = utils.read_delim(infile) | |
73 # Remove comment from header, if present | |
74 labels = robjects.r.names(DATA) | |
75 if labels[0][0:2] == 'X.': | |
76 labels[0] = labels[0][2:] | |
77 | |
78 # Multiply minor allele frequencies by 100 to get percentage | |
79 # .rx2() looks up a column by its label and returns it as a vector | |
80 # .ro turns the returned object into one that can be operated on per-element | |
81 minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100 | |
82 samples = DATA.rx2('SAMPLE') | |
83 | |
84 # Formula() creates a Python object representing the R object returned by x ~ y | |
85 formula = Formula('minor_freq ~ samples') | |
86 # The "environment" in .getenvironment() is the entire R workspace in which the | |
87 # Formula object exists. The R workspace meaning all the defined variables. | |
88 # Here, the .getenvironment() method is being used to set some variables in the | |
89 | |
90 # R workspace | |
91 formula.getenvironment()['minor_freq'] = minor_freq | |
92 formula.getenvironment()['samples'] = samples | |
93 | |
94 # create boxplot - fill kwargs1 with the options for the boxplot function | |
95 kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", | |
96 'outpch':"*",'main':"Distribution of minor allele frequencies", | |
97 'cex.lab':"1.5"} | |
98 p = graphics.boxplot(formula, **kwargs1) | |
99 table = base.table(DATA.rx2('SAMPLE')) | |
100 graphics.text(0.5, 1, 'N:', font=2) | |
101 for i in range(1, base.length(table)[0]+1, 1): | |
102 graphics.text(i, 1, table[i-1], font=2) | |
103 | |
104 graphlabels = base.names(table) | |
105 kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"} | |
106 graphics.axis(1, at=range(1, len(graphlabels)+1, 1), labels=graphlabels, **kwargs3) | |
107 grdevices.dev_off() | |
108 | |
109 if not report: | |
110 sys.exit(0) | |
111 | |
112 | |
113 #################################### | |
114 # GENERATE REPORT | |
115 # report should be something like: | |
116 # SAMPLE NoHET MEDIAN MAD TEST | |
117 # s1 7 10% n p/w/f | |
118 # n <= 5 pass | |
119 # 6 <= n <=10 warn | |
120 # n >= 11 fail | |
121 # MAD <= 2.0 fail | |
122 # MAD > 2.0 pass | |
123 ################################### | |
124 | |
125 SAMPLES=[] | |
126 for i in range(len(table)): | |
127 SAMPLES.append(base.names(table)[i]) | |
128 | |
129 def boxstats(data,sample): | |
130 VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample] | |
131 NoHET = len(VALUES) | |
132 MEDIAN = numpy.median(VALUES) | |
133 MAD = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic) | |
134 return [NoHET,MEDIAN, MAD] | |
135 | |
136 boxreport = open(report, "w+") | |
137 boxreport.write("SAMPLE\tTOTAL.SITES\tMEDIAN.FREQ.\tMAD.FREQ\tEVAL\n") | |
138 for sample in SAMPLES: | |
139 ENTRY = [sample] + boxstats(infile,sample) | |
140 if ENTRY[1] <= 5: | |
141 ENTRY.append('pass') | |
142 elif 6 <= ENTRY[1] <=10: | |
143 ENTRY.append('warn') | |
144 elif ENTRY[1] >= 11: | |
145 ENTRY.append('fail') | |
146 if ENTRY[3] <=2.0: | |
147 ENTRY.append('fail') | |
148 elif ENTRY[3] >2.0: | |
149 ENTRY.append('pass') | |
150 if len(set(ENTRY[4:6])) == 2: | |
151 ENTRY.append('warn') | |
152 else: | |
153 ENTRY.append(list(set(ENTRY[4:6]))[0]) | |
154 boxreport.write ('%s\t%d\t%.1f\t%.1f\t%s\n' % tuple([ENTRY[i] for i in [0,1,2,3,6]])) | |
155 | |
156 boxreport.close() | |
157 | |
158 | |
159 | |
160 |