0
|
1 #!/usr/bin/env python
|
2
|
2 # New in this version:
|
4
|
3 # - option to generate report is triggered simply by including report filename
|
|
4 # as third argument
|
0
|
5
|
|
6 import os,sys,numpy
|
|
7 from rpy2.robjects import Formula
|
|
8 from rpy2.robjects.packages import importr
|
|
9 from rpy2 import robjects
|
|
10
|
|
def fail(message):
    """Report a fatal error on stderr and terminate the script.

    message: text to print; a newline is appended. Values that are not
             strings (for example an exception object) are formatted
             with %s instead of raising TypeError on concatenation.

    Does not return: sys.exit(1) raises SystemExit with exit status 1.
    """
    sys.stderr.write('%s\n' % (message,))
    sys.exit(1)
|
|
14
|
|
# Positional command-line arguments:
#   1. input file    (required)
#   2. output image  (required)
#   3. report file   (optional; supplying it turns report generation on)
args = sys.argv[1:]
if len(args) < 1:
    fail('Error: No input filename provided (as argument 1).')
if len(args) < 2:
    fail('Error: No output filename provided (as argument 2).')
infile = args[0]
outfile = args[1]
# An empty string means "no report requested".
report = args[2] if len(args) >= 3 else ''
|
0
|
28
|
2
|
# Check input file: it must exist, be non-empty, and begin with a
# tab-separated header line that contains a SAMPLE column and has
# MINOR.FREQ.PERC. as its 12th column.
if not os.path.exists(infile):
    fail('Error: Input file '+infile+' could not be found.')
with open(infile, 'r') as lines:
    line = lines.readline()
if not line:
    fail('Error: Input file seems to be empty')
line = line.strip().lstrip('#')  # rm whitespace, comment chars
labels = line.split("\t")
# The length is tested before labels[11] is read so that a header with too
# few columns is reported as a bad header instead of raising IndexError.
if ('SAMPLE' not in labels or len(labels) < 12
        or labels[11] != 'MINOR.FREQ.PERC.'):
    fail('Error: Input file does not seem to have a proper header line.')
|
0
|
40
|
|
41
|
|
# R packages used for plotting, loaded through rpy2.
# NOTE(review): `stats` and `rprint` are not used anywhere in the visible
# script; they are kept in case something else relies on them.
utils = importr('utils')
graphics = importr('graphics')
base = importr('base')
stats = importr('stats')
rprint = robjects.globalenv.get("print")
grdevices = importr('grDevices')

# Open the PNG device that receives the plot. Everything drawn afterwards
# goes to `outfile`; the finally clause closes the device even when one of
# the R calls below raises, so the device is never left open.
grdevices.png(file=outfile, width=1024, height=768)
try:
    # Read file into a data frame
    DATA = utils.read_delim(infile)

    # Remove the comment marker from the header, if present. A leading '#'
    # on the header line is expected to reach us as an 'X.' prefix on the
    # first column name, which is stripped here.
    labels = robjects.r.names(DATA)
    if labels[0][0:2] == 'X.':
        labels[0] = labels[0][2:]
        # Write the corrected names back explicitly so that the column
        # lookups below do not depend on `labels` sharing storage with
        # the data frame's names.
        DATA.names = labels

    # Multiply minor allele frequencies by 100 to get percentage
    # .rx2() looks up a column by its label and returns it as a vector
    # .ro turns the returned object into one that can be operated on
    # per-element
    minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100
    samples = DATA.rx2('SAMPLE')

    # Formula() creates a Python object representing the R object returned
    # by x ~ y. The variables named in the formula are looked up in the
    # formula's environment, so they are assigned there.
    formula = Formula('minor_freq ~ samples')
    env = formula.getenvironment()
    env['minor_freq'] = minor_freq
    env['samples'] = samples

    # Create boxplot - kwargs1 holds the options for the boxplot function.
    # A dict is used because 'cex.lab' is not a valid Python keyword name.
    kwargs1 = {
        'ylab': "Minor allele frequency (%)",
        'col': "gray",
        'xaxt': "n",  # x axis is drawn separately below
        'outpch': "*",
        'main': "Distribution of minor allele frequencies >= 2%",
        'cex.lab': 1.5,
    }
    p = graphics.boxplot(formula, **kwargs1)

    # Print the number of rows per sample along y = 1, one count per box,
    # preceded by an 'N:' label to the left of the first box.
    table = base.table(DATA.rx2('SAMPLE'))
    graphics.text(0.5, 1, 'N:', font=2)
    for position, count in enumerate(table, 1):
        graphics.text(position, 1, count, font=2)

    # Draw the x axis with one tick per sample, labelled with the sample
    # names. The tick positions are passed as an explicit IntVector so they
    # reach R as an integer vector regardless of how rpy2 would convert a
    # Python range or list.
    graphlabels = base.names(table)
    kwargs3 = {'pos': 0, 'las': 2, 'cex.axis': 1}
    tick_positions = robjects.IntVector(range(1, len(graphlabels) + 1))
    graphics.axis(1, at=tick_positions, labels=graphlabels, **kwargs3)
finally:
    grdevices.dev_off()
|
|
87
|
4
|
# No report filename was given (third argument), so the plot is the only
# output: stop here with a success status.
if not report:
    sys.exit(0)
|
|
90
|
2
|
91
|
0
|
####################################
# GENERATE REPORT
# The report should look something like:
#   SAMPLE  NoHET  MEDIAN  MAD  TEST
#   s1      7      10%     n    p/w/f
#
# Verdict from the number of sites per sample (NoHET):
#   NoHET <= 5          pass
#   6 <= NoHET <= 10    warn
#   NoHET >= 11         fail
# Verdict from the MAD of the frequencies:
#   MAD <= 2.0          fail
#   MAD > 2.0           pass
# If the two verdicts differ, the reported result is warn.
###################################
|
|
103
|
|
# Sample names in the order of the per-sample count table. The names vector
# is fetched from R once instead of once per loop iteration.
sample_names = base.names(table)
SAMPLES = [sample_names[i] for i in range(len(table))]
|
0
|
107
|
|
def boxstats(data, sample):
    """Summarise the minor allele frequencies of one sample.

    Reads the tab-separated file `data` and selects every line whose first
    field (after stripping surrounding whitespace from the line) equals
    `sample`. The 12th field of each selected line is converted to float
    and multiplied by 100.

    data:   path of the tab-separated input file.
    sample: sample name to select.

    Returns [count, median, mad]:
      count  - number of selected lines,
      median - median of the scaled values,
      mad    - median absolute deviation from that median (a robust
               spread statistic).
    If no line matches, returns [0, nan, nan].

    Raises IndexError if a matching line has fewer than 12 fields and
    ValueError if its 12th field is not a number.
    """
    values = []
    # The with-block closes the file; each line is split only once.
    with open(data) as handle:
        for line in handle:
            fields = line.strip().split('\t')
            if fields[0] == sample:
                values.append(100 * float(fields[11]))

    if not values:
        # numpy.median([]) would yield nan too, but with a RuntimeWarning.
        return [0, float('nan'), float('nan')]

    median = numpy.median(values)
    mad = numpy.median(numpy.abs(numpy.asarray(values) - median))
    return [len(values), median, mad]
|
|
114
|
|
def _evaluate(total_sites, mad):
    """Combine the site-count and MAD checks into 'pass', 'warn' or 'fail'.

    Site count: <= 5 is 'pass', 6 to 10 is 'warn', >= 11 is 'fail'.
    MAD:        > 2.0 is 'pass', anything else is 'fail'. A NaN MAD (no
                values for the sample) therefore counts as 'fail' instead
                of leaving the MAD verdict undefined.
    When the two verdicts differ the result is 'warn'; otherwise it is the
    verdict they share.
    """
    if total_sites <= 5:
        by_count = 'pass'
    elif total_sites <= 10:
        by_count = 'warn'
    else:
        by_count = 'fail'
    by_mad = 'pass' if mad > 2.0 else 'fail'
    return by_count if by_count == by_mad else 'warn'


# Write one tab-separated line per sample. The with-block closes the report
# even if computing or formatting a row raises.
with open(report, "w+") as boxreport:
    boxreport.write("SAMPLE\tTOTAL.SITES\tMEDIAN.FREQ.\tMAD.FREQ\tEVAL\n")
    for sample in SAMPLES:
        total_sites, median, mad = boxstats(infile, sample)
        verdict = _evaluate(total_sites, mad)
        boxreport.write('%s\t%d\t%.1f\t%.1f\t%s\n'
                        % (sample, total_sites, median, mad, verdict))
|
|
136
|
|
137
|
|
138
|
|
139
|
|
140
|
|
141
|
|
142 #####################################
|
|
143 #STUFF
|
|
144 #
|
|
145 #kwargs = {'ylab':"Minor allele frequency (sites >= 2%)",'col':"blue", 'axes':"FALSE", 'outpch':"*", 'ylim':"c(-0.03,0.5)", 'main':"Minor allele frequencies run020 at 2%"}
|
|
146 #boxplot(freq~id,data=run020[run020$freq>=0.02,], col="blue", axes=FALSE, outpch="*", ylab="Minor allele frequency (sites >= 2%)", ylim=c(-0.03,0.5), main="Minor allele frequencies run020 at 2%")
|
|
147 #biglabels2=sort(as.vector(unique(run020$id)))
|
|
148 #biglabels=apply(as.matrix(biglabels2), 1,function(x) substr(x,1,5))
|
|
149 #axis(1, at=c(1:length(biglabels)),labels=biglabels, pos=-0.02, las=2, cex.axis=0.9)
|
|
150 #axis(2, at=c(seq(0,50,5)/100), pos=0,cex.axis=0.9)
|
|
151 #nbig=as.vector(table(run020[run020$freq>=0.02,1]))
|
|
152 #for (i in 1:length(nbig)){text(i,-0.01,nbig[i],cex=0.9, font=2)}
|
|
153
|