comparison hetbox.py @ 0:128db16c9399 draft

Uploaded script
author nick
date Tue, 28 May 2013 12:47:51 -0400
parents
children 3a1ce69571e5
comparison
equal deleted inserted replaced
-1:000000000000 0:128db16c9399
1 #!/usr/bin/env python
2
3 import os,sys,numpy
4 from rpy2.robjects import Formula
5 from rpy2.robjects.packages import importr
6 from rpy2 import robjects
7
8 def fail(message):
9 sys.stderr.write(message+'\n')
10 sys.exit(1)
11
12 args = sys.argv[1:]
13 if '-r' in args:
14 make_report = True
15 else:
16 make_report = False
17 if len(args) >= 1:
18 infile = args[0]
19 else:
20 fail('Error: No input filename provided (as argument 1).')
21 if len(args) >= 2:
22 outfile = args[1]
23 else:
24 fail('Error: No output filename provided (as argument 2).')
25 if len(args) >= 3:
26 report = args[2]
27 elif make_report:
28 fail('Error: No output report filename provided (as argument 3).')
29
30 if not os.path.exists(infile):
31 fail('Error: Input file '+infile+' could not be found.')
32
33
34 utils = importr('utils')
35 graphics = importr('graphics')
36 base = importr('base')
37 stats = importr('stats')
38 rprint = robjects.globalenv.get("print")
39 grdevices = importr('grDevices')
40 grdevices.png(file=outfile, width=1024, height=768)
41
42 # Read file into a data frame
43 DATA = utils.read_delim(infile)
44 # Multiply minor allele frequencies by 100 to get percentage
45 # Doing this in R, since I can't find a way to modify a column in a data frame
46 # in rpy2
47 robjects.r.assign('data', DATA)
48 robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100')
49 DATA = robjects.r('data')
50 #index = data.names.index('MINOR.FREQ.PERC.')
51 ## .ro turns the returned object into one that can be operated on per-element
52 #DATA.rx2('MINOR.FREQ.PERC.') = DATA.rx('MINOR.FREQ.PERC.').ro * 100
53 # Formula() creates a Python object representing the R object returned by x ~ y
54 formula = Formula('MINOR.FREQ.PERC.~SAMPLE')
55 # The "environment" in .getenvironment() is the entire R workspace in which the
56 # Formula object exists. The R workspace meaning all the defined variables.
57 # Here, the .getenvironment() method is being used to set some variables in the
58 # R workspace
59 # Note: .rx2() looks up a column by its label and returns it as a vector
60 formula.getenvironment()['MINOR.FREQ.PERC.'] = DATA.rx2('MINOR.FREQ.PERC.')
61 formula.getenvironment()['SAMPLE'] = DATA.rx2('SAMPLE')
62
63 # create boxplot - fill kwargs1 with the options for the boxplot function
64 kwargs1 = {'ylab':"Minor allele frequency (%)",'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"}
65 p = graphics.boxplot(formula, **kwargs1)
66
67 tab = base.table(DATA.rx2('SAMPLE'))
68 graphics.text(0.5,1,'N:',font=2)
69 for i in range(1,base.length(tab)[0]+1,1):
70 graphics.text(i,1,tab[i-1], font=2)
71
72 graphlabels = base.names(tab)
73 kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"}
74 graphics.axis(1, at=range(1,len(graphlabels)+1,1),labels=graphlabels, **kwargs3)
75 grdevices.dev_off()
76
77 if not make_report:
78 sys.exit(0)
79
80 ####################################
81 # GENERATE REPORT
82 # report should be something like:
83 # SAMPLE NoHET MEDIAN MAD TEST
84 # s1 7 10% n p/w/f
85 # n <= 5 pass
86 # 6 <= n <=10 warn
87 # n >= 11 fail
88 # MAD <= 2.0 fail
89 # MAD > 2.0 pass
90 ###################################
91
92 SAMPLES=[]
93 for i in range(len(tab)):
94 SAMPLES.append(base.names(tab)[i])
95
96 def boxstats(data,sample):
97 VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample]
98 NoHET = len(VALUES)
99 MEDIAN = numpy.median(VALUES)
100 MAD = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic)
101 return [NoHET,MEDIAN, MAD]
102
103 boxreport = open(report, "w+")
104 boxreport.write("SAMPLE\tTOTAL.SITES\tMEDIAN.FREQ.\tMAD.FREQ\tEVAL\n")
105 for sample in SAMPLES:
106 ENTRY = [sample] + boxstats(infile,sample)
107 if ENTRY[1] <= 5:
108 ENTRY.append('pass')
109 elif 6 <= ENTRY[1] <=10:
110 ENTRY.append('warn')
111 elif ENTRY[1] >= 11:
112 ENTRY.append('fail')
113 if ENTRY[3] <=2.0:
114 ENTRY.append('fail')
115 elif ENTRY[3] >2.0:
116 ENTRY.append('pass')
117 if len(set(ENTRY[4:6])) == 2:
118 ENTRY.append('warn')
119 else:
120 ENTRY.append(list(set(ENTRY[4:6]))[0])
121 boxreport.write ('%s\t%d\t%.1f\t%.1f\t%s\n' % tuple([ENTRY[i] for i in [0,1,2,3,6]]))
122
123 boxreport.close()
124
125
126
127
128
129
130 #####################################
131 #STUFF
132 #
133 #kwargs = {'ylab':"Minor allele frequency (sites >= 2%)",'col':"blue", 'axes':"FALSE", 'outpch':"*", 'ylim':"c(-0.03,0.5)", 'main':"Minor allele frequencies run020 at 2%"}
134 #boxplot(freq~id,data=run020[run020$freq>=0.02,], col="blue", axes=FALSE, outpch="*", ylab="Minor allele frequency (sites >= 2%)", ylim=c(-0.03,0.5), main="Minor allele frequencies run020 at 2%")
135 #biglabels2=sort(as.vector(unique(run020$id)))
136 #biglabels=apply(as.matrix(biglabels2), 1,function(x) substr(x,1,5))
137 #axis(1, at=c(1:length(biglabels)),labels=biglabels, pos=-0.02, las=2, cex.axis=0.9)
138 #axis(2, at=c(seq(0,50,5)/100), pos=0,cex.axis=0.9)
139 #nbig=as.vector(table(run020[run020$freq>=0.02,1]))
140 #for (i in 1:length(nbig)){text(i,-0.01,nbig[i],cex=0.9, font=2)}
141