annotate hetbox.py @ 2:3a1ce69571e5 draft

Allow commented lines, de-kludge data frame handling
author nick
date Tue, 28 May 2013 16:55:39 -0400
parents 128db16c9399
children dfa2e75da6aa
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
128db16c9399 Uploaded script
nick
parents:
diff changeset
1 #!/usr/bin/env python
2
3a1ce69571e5 Allow commented lines, de-kludge data frame handling
nick
parents: 0
diff changeset
2 # New in this version:
3a1ce69571e5 Allow commented lines, de-kludge data frame handling
nick
parents: 0
diff changeset
3 # - handle commented-out header lines
3a1ce69571e5 Allow commented lines, de-kludge data frame handling
nick
parents: 0
diff changeset
4 # - did everything through the Rpy2 interface instead of inline R code
0
128db16c9399 Uploaded script
nick
parents:
diff changeset
5
128db16c9399 Uploaded script
nick
parents:
diff changeset
6 import os,sys,numpy
128db16c9399 Uploaded script
nick
parents:
diff changeset
7 from rpy2.robjects import Formula
128db16c9399 Uploaded script
nick
parents:
diff changeset
8 from rpy2.robjects.packages import importr
128db16c9399 Uploaded script
nick
parents:
diff changeset
9 from rpy2 import robjects
128db16c9399 Uploaded script
nick
parents:
diff changeset
10
128db16c9399 Uploaded script
nick
parents:
diff changeset
def fail(message):
    """Report a fatal error and stop the script.

    Writes ``message`` followed by a newline to standard error, then
    terminates the interpreter with exit status 1.
    """
    text = message + '\n'
    sys.stderr.write(text)
    raise SystemExit(1)
128db16c9399 Uploaded script
nick
parents:
diff changeset
14
128db16c9399 Uploaded script
nick
parents:
diff changeset
# Command line: INFILE OUTFILE [REPORT] [-r]
#   INFILE   input table that is checked and plotted below
#   OUTFILE  file the boxplot image is written to
#   REPORT   per-sample report file; required when -r is given
#   -r       also generate the report
make_report = '-r' in sys.argv[1:]
# Drop the flag from the positional list so it can never be mistaken for one
# of the filenames, wherever on the command line it was given.
args = [arg for arg in sys.argv[1:] if arg != '-r']
if len(args) >= 1:
    infile = args[0]
else:
    fail('Error: No input filename provided (as argument 1).')
if len(args) >= 2:
    outfile = args[1]
else:
    fail('Error: No output filename provided (as argument 2).')
if len(args) >= 3:
    report = args[2]
elif make_report:
    # A report was requested but there is nowhere to write it.
    fail('Error: No output report filename provided (as argument 3).')
128db16c9399 Uploaded script
nick
parents:
diff changeset
32
2
3a1ce69571e5 Allow commented lines, de-kludge data frame handling
nick
parents: 0
diff changeset
# Check input file: it must exist, be non-empty, and start with a header line
# that has a SAMPLE column and MINOR.FREQ.PERC. as its 12th column.
if not os.path.exists(infile):
    fail('Error: Input file '+infile+' could not be found.')
with open(infile, 'r') as lines:
    line = lines.readline()
if not line:
    fail('Error: Input file seems to be empty')
# Remove surrounding whitespace, then any leading '#', so that a
# commented-out header line is accepted as well.
line = line.strip().lstrip('#')
labels = line.split("\t")
# The length test keeps a too-short header from raising IndexError on
# labels[11]; it is reported as a malformed header instead.
if ('SAMPLE' not in labels or len(labels) < 12
        or labels[11] != 'MINOR.FREQ.PERC.'):
    fail('Error: Input file does not seem to have a proper header line.')
0
128db16c9399 Uploaded script
nick
parents:
diff changeset
44
128db16c9399 Uploaded script
nick
parents:
diff changeset
45
128db16c9399 Uploaded script
nick
parents:
diff changeset
# R packages used below, loaded through rpy2
utils = importr('utils')
graphics = importr('graphics')
base = importr('base')
stats = importr('stats')
rprint = robjects.globalenv.get("print")
grdevices = importr('grDevices')
# Open the PNG device on the output file; it is closed by dev_off() below
grdevices.png(file=outfile, width=1024, height=768)

# Read file into a data frame
DATA = utils.read_delim(infile)
# Remove the comment marker from the header, if present.
# NOTE(review): this assumes R turns a leading '#' in the first column name
# into the prefix 'X.' when reading the file -- confirm against read.delim.
labels = robjects.r.names(DATA)
if labels[0][0:2] == 'X.':
    labels[0] = labels[0][2:]
    # Write the cleaned names back explicitly, so the column lookups below do
    # not depend on `labels` sharing storage with the data frame's names.
    DATA.names = labels

# Multiply minor allele frequencies by 100 to get percentage
# .rx2() looks up a column by its label and returns it as a vector
# .ro turns the returned object into one that can be operated on per-element
minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100
samples = DATA.rx2('SAMPLE')
# Formula() creates a Python object representing the R object returned by x ~ y
formula = Formula('minor_freq ~ samples')
# The "environment" in .getenvironment() is the entire R workspace in which the
# Formula object exists. The R workspace meaning all the defined variables.
# Here, the .getenvironment() method is being used to set some variables in the
# R workspace
formula.getenvironment()['minor_freq'] = minor_freq
formula.getenvironment()['samples'] = samples

# create boxplot - fill kwargs1 with the options for the boxplot function
kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"}
p = graphics.boxplot(formula, **kwargs1)

# Per-sample row counts, written along y = 1 at each box's x position
table = base.table(DATA.rx2('SAMPLE'))
graphics.text(0.5, 1, 'N:', font=2)
for i in range(1, base.length(table)[0]+1, 1):
    graphics.text(i, 1, table[i-1], font=2)

# The boxplot was drawn with xaxt="n", so add the x axis by hand, labelled
# with the sample names
graphlabels = base.names(table)
kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"}
graphics.axis(1, at=range(1, len(graphlabels)+1, 1), labels=graphlabels, **kwargs3)
grdevices.dev_off()

# Without -r the plot is the only output
if not make_report:
    sys.exit(0)
128db16c9399 Uploaded script
nick
parents:
diff changeset
94
2
3a1ce69571e5 Allow commented lines, de-kludge data frame handling
nick
parents: 0
diff changeset
95
0
128db16c9399 Uploaded script
nick
parents:
diff changeset
96 ####################################
128db16c9399 Uploaded script
nick
parents:
diff changeset
97 # GENERATE REPORT
128db16c9399 Uploaded script
nick
parents:
diff changeset
98 # report should be something like:
128db16c9399 Uploaded script
nick
parents:
diff changeset
99 # SAMPLE NoHET MEDIAN MAD TEST
128db16c9399 Uploaded script
nick
parents:
diff changeset
100 # s1 7 10% n p/w/f
128db16c9399 Uploaded script
nick
parents:
diff changeset
101 # n <= 5 pass
128db16c9399 Uploaded script
nick
parents:
diff changeset
102 # 6 <= n <=10 warn
128db16c9399 Uploaded script
nick
parents:
diff changeset
103 # n >= 11 fail
128db16c9399 Uploaded script
nick
parents:
diff changeset
104 # MAD <= 2.0 fail
128db16c9399 Uploaded script
nick
parents:
diff changeset
105 # MAD > 2.0 pass
128db16c9399 Uploaded script
nick
parents:
diff changeset
106 ###################################
128db16c9399 Uploaded script
nick
parents:
diff changeset
107
128db16c9399 Uploaded script
nick
parents:
diff changeset
# Sample labels taken from the per-sample counts table built for the plot.
# The table is bound to the name `table`; `tab` is not defined in this script.
sample_names = base.names(table)
SAMPLES = [sample_names[i] for i in range(len(sample_names))]
128db16c9399 Uploaded script
nick
parents:
diff changeset
111
128db16c9399 Uploaded script
nick
parents:
diff changeset
def boxstats(data,sample):
    """Summarize the minor allele frequencies recorded for one sample.

    Reads the tab-delimited file at path ``data`` and keeps every line whose
    first field equals ``sample`` exactly. The 12th field of each kept line
    is converted to float and multiplied by 100.

    Returns a list ``[count, median, mad]``: the number of kept lines, the
    median of their scaled values, and the median absolute deviation from
    that median (a robust spread statistic).

    A kept line with fewer than 12 fields raises IndexError, and one with a
    non-numeric 12th field raises ValueError. A sample with no matching
    lines is not handled specially: the medians are whatever numpy.median
    gives for an empty list.
    """
    values = []
    # The context manager closes the file even if a line fails to parse
    with open(data) as rows:
        for row in rows:
            fields = row.strip().split('\t')
            if fields[0] == sample:
                values.append(100*float(fields[11]))
    median = numpy.median(values)
    mad = numpy.median([abs(value - median) for value in values])
    return [len(values), median, mad]
128db16c9399 Uploaded script
nick
parents:
diff changeset
118
128db16c9399 Uploaded script
nick
parents:
diff changeset
def _evaluate(total_sites, mad):
    """Combine the site-count check and the MAD check into one verdict.

    Site count: <= 5 is 'pass', 6 to 10 is 'warn', 11 or more is 'fail'.
    MAD: greater than 2.0 is 'pass', anything else is 'fail'.
    If the two checks agree, their shared result is returned; otherwise the
    verdict is 'warn'.
    """
    if total_sites <= 5:
        count_eval = 'pass'
    elif total_sites <= 10:
        count_eval = 'warn'
    else:
        count_eval = 'fail'
    # Tested as "not greater than 2.0" so that a nan MAD (a sample for which
    # no values were found) still gets a verdict instead of matching neither
    # comparison.
    mad_eval = 'pass' if mad > 2.0 else 'fail'
    if count_eval == mad_eval:
        return count_eval
    return 'warn'

# Write one report line per sample; the context manager closes the report
# even if computing a sample's statistics fails part-way through.
with open(report, "w") as boxreport:
    boxreport.write("SAMPLE\tTOTAL.SITES\tMEDIAN.FREQ.\tMAD.FREQ\tEVAL\n")
    for sample in SAMPLES:
        total_sites, median, mad = boxstats(infile, sample)
        verdict = _evaluate(total_sites, mad)
        boxreport.write('%s\t%d\t%.1f\t%.1f\t%s\n'
                        % (sample, total_sites, median, mad, verdict))
128db16c9399 Uploaded script
nick
parents:
diff changeset
140
128db16c9399 Uploaded script
nick
parents:
diff changeset
141
128db16c9399 Uploaded script
nick
parents:
diff changeset
142
128db16c9399 Uploaded script
nick
parents:
diff changeset
143
128db16c9399 Uploaded script
nick
parents:
diff changeset
144
128db16c9399 Uploaded script
nick
parents:
diff changeset
145
128db16c9399 Uploaded script
nick
parents:
diff changeset
146 #####################################
128db16c9399 Uploaded script
nick
parents:
diff changeset
147 #STUFF
128db16c9399 Uploaded script
nick
parents:
diff changeset
148 #
128db16c9399 Uploaded script
nick
parents:
diff changeset
149 #kwargs = {'ylab':"Minor allele frequency (sites >= 2%)",'col':"blue", 'axes':"FALSE", 'outpch':"*", 'ylim':"c(-0.03,0.5)", 'main':"Minor allele frequencies run020 at 2%"}
128db16c9399 Uploaded script
nick
parents:
diff changeset
150 #boxplot(freq~id,data=run020[run020$freq>=0.02,], col="blue", axes=FALSE, outpch="*", ylab="Minor allele frequency (sites >= 2%)", ylim=c(-0.03,0.5), main="Minor allele frequencies run020 at 2%")
128db16c9399 Uploaded script
nick
parents:
diff changeset
151 #biglabels2=sort(as.vector(unique(run020$id)))
128db16c9399 Uploaded script
nick
parents:
diff changeset
152 #biglabels=apply(as.matrix(biglabels2), 1,function(x) substr(x,1,5))
128db16c9399 Uploaded script
nick
parents:
diff changeset
153 #axis(1, at=c(1:length(biglabels)),labels=biglabels, pos=-0.02, las=2, cex.axis=0.9)
128db16c9399 Uploaded script
nick
parents:
diff changeset
154 #axis(2, at=c(seq(0,50,5)/100), pos=0,cex.axis=0.9)
128db16c9399 Uploaded script
nick
parents:
diff changeset
155 #nbig=as.vector(table(run020[run020$freq>=0.02,1]))
128db16c9399 Uploaded script
nick
parents:
diff changeset
156 #for (i in 1:length(nbig)){text(i,-0.01,nbig[i],cex=0.9, font=2)}
128db16c9399 Uploaded script
nick
parents:
diff changeset
157