comparison cca.py @ 80:c4a3a8999945 draft

Uploaded
author bernhardlutz
date Mon, 20 Jan 2014 14:39:43 -0500
parents
children babf8ab95495
comparison
equal deleted inserted replaced
79:dc82017052ac 80:c4a3a8999945
1 #!/usr/bin/env python
2
3 from galaxy import eggs
4 import sys, string
5 #from rpy import *
6 import rpy2.robjects as robjects
7 import rpy2.rlike.container as rlc
8 from rpy2.robjects.packages import importr
9 r = robjects.r
10 grdevices = importr('grDevices')
11 import numpy
12
13
def stop_err(msg):
    """Report a fatal error and terminate the script.

    Writes *msg* to standard error exactly as given (no newline is added)
    and exits with status 1 so that the failure is also visible through the
    process exit code, not only through the stderr text.

    Raises:
        SystemExit: always.
    """
    sys.stderr.write(msg)
    sys.exit(1)
17
def _r_scale_center(mode):
    """Map a preprocessing mode to R logical literals.

    Returns a ``(scale, center)`` pair of the strings "TRUE"/"FALSE":
    'both' enables both, 'center' or 'scale' enables only that one, and any
    other value enables neither.
    """
    scale = "TRUE" if mode in ('both', 'scale') else "FALSE"
    center = "TRUE" if mode in ('both', 'center') else "FALSE"
    return scale, center


# Positional command-line arguments:
#   1: input file            2: comma-separated X columns
#   3: comma-separated Y columns
#   4: X preprocessing mode  5: Y preprocessing mode
#   6: "yes" to standardize scores
#   7: text output file      8: second output file (used for the PDF plots)
infile = sys.argv[1]
x_cols = sys.argv[2].split(',')
y_cols = sys.argv[3].split(',')

x_scale, x_center = _r_scale_center(sys.argv[4])
y_scale, y_center = _r_scale_center(sys.argv[5])

# Kept as R literal text; it is evaluated through r(...) when cca is called.
std_scores = "TRUE" if sys.argv[6] == "yes" else "FALSE"

outfile = sys.argv[7]
outfile2 = sys.argv[8]
44
# The report file is opened (and truncated) before the input is validated;
# it stays open because the reporting code further down writes to it.
fout = open(outfile, 'w')

# Look for the first non-empty, non-comment line to confirm that the input
# holds tab-separated data at all. Only the first 31 lines are inspected.
elems = []
with open(infile) as sniff_fh:
    for i, line in enumerate(sniff_fh):
        line = line.rstrip('\r\n')
        if line and not line.startswith('#'):
            elems = line.split('\t')
            break
        if i == 30:
            break  # Hopefully we'll never get here...

if not elems:
    stop_err("The data in your input dataset is either missing or not formatted properly.")
57
# Flat accumulators for the numeric cells of the X and Y column sets; they
# are filled while the input rows are read.
x_vals = []
y_vals = []

# Column numbers arrive as 1-based strings; convert them to 0-based ints.
x_cols = [int(col) - 1 for col in x_cols]
y_cols = [int(col) - 1 for col in y_cols]
69
# Read the data rows. A row is used only when every selected X and Y column
# parses as a float; otherwise the whole row is counted in `skipped` and
# contributes nothing, so x_vals and y_vals always hold complete rows.
#
# The parsed value is deliberately not truth-tested: a cell containing 0 is
# a valid number and must not cause its row to be dropped.
skipped = 0
with open(infile) as data_fh:
    for line in data_fh:
        if not line or line.startswith('#'):
            continue
        fields = line.strip().split("\t")
        try:
            row_x = [float(fields[col]) for col in x_cols]
            row_y = [float(fields[col]) for col in y_cols]
        except (ValueError, IndexError):
            # Non-numeric cell, or the row has fewer columns than requested.
            skipped += 1
            continue
        # Values are appended row by row (row-major order).
        x_vals.extend(row_x)
        y_vals.extend(row_y)

if not x_vals or not y_vals:
    stop_err("No rows with numeric values in all of the selected columns were found in the input dataset.")
100
def _as_r_matrix(values, n_columns):
    """Wrap a flat, row-major list of floats as an R matrix.

    The matrix has *n_columns* columns and is filled by row, so each
    consecutive run of *n_columns* values becomes one matrix row.
    """
    return r['matrix'](robjects.FloatVector(values), ncol=n_columns, byrow=True)


x_dat = _as_r_matrix(x_vals, len(x_cols))
y_dat = _as_r_matrix(y_vals, len(y_cols))

# Load the R package that provides the CCA routines used below.
try:
    r.suppressWarnings(r.library("yacca"))
except Exception:
    stop_err("Missing R library yacca.")
114
# Label the variables c<N>, using the 1-based column numbers of the input.
xcolnames = ["c%d" % (el + 1) for el in x_cols]
ycolnames = ["c%d" % (el + 1) for el in y_cols]

# Run the canonical correlation analysis and its F test in R.
# 'standardize.scores' contains a dot, so it cannot be written as a Python
# keyword argument and is passed through a dict instead.
# Any failure raised while calling into R is reported via stop_err. The
# handler must not name a class from the legacy `rpy` API (that import is
# commented out in this file), otherwise the handler itself would fail.
try:
    cc = r.cca(
        x=x_dat,
        y=y_dat,
        xlab=xcolnames,
        ylab=ycolnames,
        xcenter=r(x_center),
        ycenter=r(y_center),
        xscale=r(x_scale),
        yscale=r(y_scale),
        **{'standardize.scores': r(std_scores)}
    )
    ftest = r['F.test.cca'](cc)
except Exception as rex:
    stop_err("Encountered error while performing CCA on the input data: %s" % (rex,))
125
# Summarise the fitted model; the fields below are fetched from the summary
# object by name.
summary = r.summary(cc)

corr = summary.rx2('corr')
ncomps = len(corr)
# Joining and re-splitting the names yields a plain Python list of strings.
comps = ','.join(corr.names).split(',')
xlab = summary.rx2('xlab')
ylab = summary.rx2('ylab')

# Store the n-th correlation at the position of the entry named "CV <n>".
for position in range(ncomps):
    slot = comps.index('CV %s' % (position + 1))
    corr[slot] = summary.rx2('corr')[position]
144
def _write_table(out, title, matrix, row_labels, n_components):
    """Write one titled table of the report as tab-separated text.

    The header line is "#<title>" followed by the component numbers
    1..n_components. Each following line holds a row label and that row's
    values formatted with "%.4g".

    Args:
        out: writable text file object.
        title: section name, written after a leading '#'.
        matrix: object exposing ``nrow``, ``ncol`` and 1-based
            ``rx2(row, col)`` access whose result is indexed with ``[0]``.
        row_labels: sequence whose i-th item yields the row label at ``[0]``.
        n_components: number of component columns named in the header.
    """
    header = "\t".join(["%s" % el for el in range(1, n_components + 1)])
    out.write("#%s\t%s\n" % (title, header))
    for row in range(matrix.nrow):
        cells = ["%.4g" % matrix.rx2(row + 1, col + 1)[0] for col in range(matrix.ncol)]
        out.write("%s\t%s\n" % (row_labels[row][0], "\t".join(cells)))


# Per-component overview: component number, correlation, F statistic, p-value.
fout.write("#Component\t%s\n" % "\t".join(["%s" % el for el in range(1, ncomps + 1)]))
fout.write("#Correlation\t%s\n" % "\t".join(["%.4g" % el for el in corr]))
fout.write("#F-statistic\t%s\n" % "\t".join(["%.4g" % el for el in ftest.rx2('statistic')]))
fout.write("#p-value\t%s\n" % "\t".join(["%.4g" % el for el in ftest.rx2('p.value')]))

# One table per summary field; X tables use the X labels, Y tables the Y labels.
for section_title, summary_key, labels in (
    ("X-Coefficients", 'xcoef', xlab),
    ("Y-Coefficients", 'ycoef', ylab),
    ("X-Loadings", 'xstructcorr', xlab),
    ("Y-Loadings", 'ystructcorr', ylab),
    ("X-CrossLoadings", 'xcrosscorr', xlab),
    ("Y-CrossLoadings", 'ycrosscorr', ylab),
):
    _write_table(fout, section_title, summary.rx2(summary_key), labels, ncomps)

# Nothing after this point writes to the report, so flush and release it now.
fout.close()
213
# Open a PDF graphics device on the second output file; the two 8s are the
# second and third positional arguments of R's pdf() (presumably width and
# height — confirm against the R documentation).
r.pdf(outfile2, 8, 8)

# Draw one helio plot of type "variance" per canonical variate.
helio_plot = r['helio.plot']
for cv_number in range(1, ncomps + 1):
    plot_title = r.paste("Explained Variance for CV", cv_number)
    helio_plot(cc, cv=cv_number, main=plot_title, type="variance")

# Close the graphics device.
grdevices.dev_off()
220