annotate hetbox.py @ 2:57c5ea9c3c5c draft

Uploaded
author boris
date Fri, 21 Jun 2013 16:05:46 -0400
parents 153d1a6e8c5e
children 1c21e9f3104b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
1 #!/usr/bin/env python
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
2 # Code by Boris Rebolledo-Jaramillo
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
3 # (boris-at-bx.psu.edu)
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
4 # Edited by Nick Stoler
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
5 # (nick-at-bx.psu.edu)
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
6 # New in this version:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
7 # - Add in proper header line if not present
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
8
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
9 import os
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
10 import sys
2
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
11 import array
0
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
12 import numpy
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
13 from rpy2.robjects import Formula
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
14 from rpy2.robjects.packages import importr
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
15 from rpy2 import robjects
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
16
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
17 def fail(message):
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
18 sys.stderr.write(message+'\n')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
19 sys.exit(1)
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
20
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
21 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG',
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
22 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
23
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
24 args = sys.argv[1:]
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
25 if len(args) >= 1:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
26 infile = args[0]
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
27 else:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
28 fail('Error: No input filename provided (as argument 1).')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
29 if len(args) >= 2:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
30 outfile = args[1]
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
31 else:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
32 fail('Error: No output filename provided (as argument 2).')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
33 if len(args) >= 3:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
34 report = args[2]
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
35 else:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
36 report = ''
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
37
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
38 # Check input file
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
39 add_header = False
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
40 if not os.path.exists(infile):
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
41 fail('Error: Input file '+infile+' could not be found.')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
42 with open(infile, 'r') as lines:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
43 line = lines.readline()
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
44 if not line:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
45 fail('Error: Input file seems to be empty')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
46 line = line.strip().lstrip('#') # rm whitespace, comment chars
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
47 labels = line.split("\t")
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
48 if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.':
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
49 sys.stderr.write("Error: Input file does not seem to have a proper header "
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
50 +"line.\nAdding an artificial header..")
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
51 add_header = True
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
52
2
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
53 r = robjects.r
0
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
54 base = importr('base')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
55 utils = importr('utils')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
56 stats = importr('stats')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
57 rprint = robjects.globalenv.get("print")
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
58 graphics = importr('graphics')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
59 grdevices = importr('grDevices')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
60 grdevices.png(file=outfile, width=1024, height=768)
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
61
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
62 # Read file into a data frame
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
63 if add_header:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
64 # add header line manually if not present
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
65 DATA = utils.read_delim(infile, header=False)
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
66 labels = robjects.r.names(DATA)
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
67 for i in range(len(labels)):
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
68 try:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
69 labels[i] = COLUMN_LABELS[i]
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
70 except IndexError, e:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
71 fail("Error in input file: Too many columns (does not match hardcoded "
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
72 +"column labels).")
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
73 else:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
74 DATA = utils.read_delim(infile)
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
75 # Remove comment from header, if present
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
76 labels = robjects.r.names(DATA)
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
77 if labels[0][0:2] == 'X.':
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
78 labels[0] = labels[0][2:]
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
79
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
80 # Multiply minor allele frequencies by 100 to get percentage
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
81 # .rx2() looks up a column by its label and returns it as a vector
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
82 # .ro turns the returned object into one that can be operated on per-element
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
83 minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
84 samples = DATA.rx2('SAMPLE')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
85
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
86 # Formula() creates a Python object representing the R object returned by x ~ y
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
87 formula = Formula('minor_freq ~ samples')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
88 # The "environment" in .getenvironment() is the entire R workspace in which the
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
89 # Formula object exists. The R workspace meaning all the defined variables.
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
90 # Here, the .getenvironment() method is being used to set some variables in the
2
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
91 # R workspace
0
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
92
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
93 formula.getenvironment()['minor_freq'] = minor_freq
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
94 formula.getenvironment()['samples'] = samples
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
95
2
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
96
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
97 r.par(oma=array.array('i', [0,0,0,0]))
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
98 r.par(mar=array.array('i', [10,4,4,2]))
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
99 ylimit = array.array('i',[-5,50])
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
100
0
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
101 # create boxplot - fill kwargs1 with the options for the boxplot function
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
102 kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n",
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
103 'outpch':"*",'main':"Distribution of minor allele frequencies",
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
104 'cex.lab':"1.5"}
2
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
105 p = graphics.boxplot(formula, axes=0,ylim=ylimit, lty=1,**kwargs1)
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
106
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
107 table = base.table(DATA.rx2('SAMPLE'))
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
108 #graphics.text(-1, 1, 'N:', font=2)
0
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
109 for i in range(1, base.length(table)[0]+1, 1):
2
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
110 graphics.text(i, -1, table[i-1], font=2)
0
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
111
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
112 graphlabels = base.names(table)
2
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
113 kwargs3 = {'pos':"-2", 'las':"2", 'cex.axis':"1"}
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
114 graphics.axis(1, at=range(1, len(graphlabels)+1, 1),labels=graphlabels, **kwargs3)
57c5ea9c3c5c Uploaded
boris
parents: 0
diff changeset
115 graphics.axis(2,at=(range(0,60,10)),pos=0,font=2)
0
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
116 grdevices.dev_off()
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
117
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
118 if not report:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
119 sys.exit(0)
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
120
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
121
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
122 ####################################
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
123 # GENERATE REPORT
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
124 # report should be something like:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
125 # SAMPLE NoHET MEDIAN MAD TEST
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
126 # s1 7 10% n p/w/f
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
127 # n <= 5 pass
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
128 # 6 <= n <=10 warn
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
129 # n >= 11 fail
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
130 # MAD <= 2.0 fail
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
131 # MAD > 2.0 pass
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
132 ###################################
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
133
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
134 SAMPLES=[]
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
135 for i in range(len(table)):
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
136 SAMPLES.append(base.names(table)[i])
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
137
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
138 def boxstats(data,sample):
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
139 VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample]
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
140 NoHET = len(VALUES)
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
141 MEDIAN = numpy.median(VALUES)
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
142 MAD = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic)
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
143 return [NoHET,MEDIAN, MAD]
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
144
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
145 boxreport = open(report, "w+")
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
146 boxreport.write("SAMPLE\tTOTAL.SITES\tMEDIAN.FREQ.\tMAD.FREQ\tEVAL\n")
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
147 for sample in SAMPLES:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
148 ENTRY = [sample] + boxstats(infile,sample)
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
149 if ENTRY[1] <= 5:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
150 ENTRY.append('pass')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
151 elif 6 <= ENTRY[1] <=10:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
152 ENTRY.append('warn')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
153 elif ENTRY[1] >= 11:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
154 ENTRY.append('fail')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
155 if ENTRY[3] <=2.0:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
156 ENTRY.append('fail')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
157 elif ENTRY[3] >2.0:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
158 ENTRY.append('pass')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
159 if len(set(ENTRY[4:6])) == 2:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
160 ENTRY.append('warn')
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
161 else:
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
162 ENTRY.append(list(set(ENTRY[4:6]))[0])
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
163 boxreport.write ('%s\t%d\t%.1f\t%.1f\t%s\n' % tuple([ENTRY[i] for i in [0,1,2,3,6]]))
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
164
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
165 boxreport.close()
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
166
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
167
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
168
153d1a6e8c5e Uploaded script
boris
parents:
diff changeset
169