comparison cca.py @ 0:ffcdde989859 draft

Uploaded
author iuc
date Tue, 29 Jul 2014 06:30:45 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ffcdde989859
1 #!/usr/bin/env python
2
3 import sys, string
4 #from rpy import *
5 import rpy2.robjects as robjects
6 import rpy2.rlike.container as rlc
7 from rpy2.robjects.packages import importr
8 r = robjects.r
9 grdevices = importr('grDevices')
10 import numpy
11
12
13 def stop_err(msg):
14 sys.stderr.write(msg)
15 sys.exit()
16
17 infile = sys.argv[1]
18 x_cols = sys.argv[2].split(',')
19 y_cols = sys.argv[3].split(',')
20
21 x_scale = x_center = "FALSE"
22 if sys.argv[4] == 'both':
23 x_scale = x_center = "TRUE"
24 elif sys.argv[4] == 'center':
25 x_center = "TRUE"
26 elif sys.argv[4] == 'scale':
27 x_scale = "TRUE"
28
29 y_scale = y_center = "FALSE"
30 if sys.argv[5] == 'both':
31 y_scale = y_center = "TRUE"
32 elif sys.argv[5] == 'center':
33 y_center = "TRUE"
34 elif sys.argv[5] == 'scale':
35 y_scale = "TRUE"
36
37 std_scores = "FALSE"
38 if sys.argv[6] == "yes":
39 std_scores = "TRUE"
40
41 outfile = sys.argv[7]
42 outfile2 = sys.argv[8]
43
44 fout = open(outfile,'w')
45 elems = []
46 for i, line in enumerate( file ( infile )):
47 line = line.rstrip('\r\n')
48 if len( line )>0 and not line.startswith( '#' ):
49 elems = line.split( '\t' )
50 break
51 if i == 30:
52 break # Hopefully we'll never get here...
53
54 if len( elems )<1:
55 stop_err( "The data in your input dataset is either missing or not formatted properly." )
56
57 x_vals = []
58
59 for k,col in enumerate(x_cols):
60 x_cols[k] = int(col)-1
61 #x_vals.append([])
62
63 y_vals = []
64
65 for k,col in enumerate(y_cols):
66 y_cols[k] = int(col)-1
67 #y_vals.append([])
68
69 skipped = 0
70 for ind,line in enumerate( file( infile )):
71 if line and not line.startswith( '#' ):
72 try:
73 fields = line.strip().split("\t")
74 valid_line = True
75 for col in x_cols+y_cols:
76 try:
77 assert float(fields[col])
78 except:
79 skipped += 1
80 valid_line = False
81 break
82 if valid_line:
83 for k,col in enumerate(x_cols):
84 try:
85 xval = float(fields[col])
86 except:
87 xval = NaN#
88 #x_vals[k].append(xval)
89 x_vals.append(xval)
90 for k,col in enumerate(y_cols):
91 try:
92 yval = float(fields[col])
93 except:
94 yval = NaN#
95 #y_vals[k].append(yval)
96 y_vals.append(yval)
97 except:
98 skipped += 1
99
100 #x_vals1 = numpy.asarray(x_vals).transpose()
101 #y_vals1 = numpy.asarray(y_vals).transpose()
102
103 #x_dat= r.list(array(x_vals1))
104 #y_dat= r.list(array(y_vals1))
105
106 x_dat = r['matrix'](robjects.FloatVector(x_vals),ncol=len(x_cols),byrow=True)
107 y_dat = r['matrix'](robjects.FloatVector(y_vals),ncol=len(y_cols),byrow=True)
108
109 try:
110 r.suppressWarnings(r.library("yacca"))
111 except:
112 stop_err("Missing R library yacca.")
113
114 #set_default_mode(NO_CONVERSION)
115 try:
116 xcolnames = ["c%d" %(el+1) for el in x_cols]
117 ycolnames = ["c%d" %(el+1) for el in y_cols]
118 #cc = r.cca(x=x_dat, y=y_dat, xlab=xcolnames, ylab=ycolnames, xcenter=r(x_center), ycenter=r(y_center), xscale=r(x_scale), yscale=r(y_scale), standardize_scores=r(std_scores))
119 cc = r.cca(x=x_dat, y=y_dat, xlab=xcolnames, ylab=ycolnames, xcenter=r(x_center), ycenter=r(y_center), xscale=r(x_scale), yscale=r(y_scale), **{'standardize.scores':r(std_scores)})
120 #ftest = r.F_test_cca(cc)
121 ftest = r['F.test.cca'](cc)
122 except RException, rex:
123 stop_err("Encountered error while performing CCA on the input data: %s" %(rex))
124
125 #set_default_mode(BASIC_CONVERSION)
126 summary = r.summary(cc)
127
128 #ncomps = len(summary['corr'])
129 ncomps = len(summary.rx2('corr'))
130 #comps = summary['corr'].keys()
131 #comps = summary.rx2('corr').names
132 comps = (','.join(summary.rx2('corr').names)).split(',')
133 #corr = summary['corr'].values()
134 corr = summary.rx2('corr')
135 #xlab = summary['xlab']
136 xlab = summary.rx2('xlab')
137 #ylab = summary['ylab']
138 ylab = summary.rx2('ylab')
139
140 for i in range(ncomps):
141 corr[comps.index('CV %s' %(i+1))] = summary.rx2('corr')[i]
142 #corr[comps.index('CV %s' %(i+1))] = summary['corr'].values()[i]
143
144 #ftest=ftest.as_py()
145 print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
146 print >>fout, "#Correlation\t%s" %("\t".join(["%.4g" % el for el in corr]))
147 #print >>fout, "#F-statistic\t%s" %("\t".join(["%.4g" % el for el in ftest['statistic']]))
148 print >>fout, "#F-statistic\t%s" %("\t".join(["%.4g" % el for el in ftest.rx2('statistic')]))
149 #print >>fout, "#p-value\t%s" %("\t".join(["%.4g" % el for el in ftest['p.value']]))
150 print >>fout, "#p-value\t%s" %("\t".join(["%.4g" % el for el in ftest.rx2('p.value')]))
151
152
153 print >>fout, "#X-Coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
154 #for i,val in enumerate(summary['xcoef']):
155 # print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val]))
156 vm = summary.rx2('xcoef')
157 for i in range(vm.nrow):
158 vals = []
159 for j in range(vm.ncol):
160 vals.append("%.4g" % vm.rx2(i+1,j+1)[0])
161 print >>fout, "%s\t%s" %(xlab[i][0], "\t".join(vals))
162
163 print >>fout, "#Y-Coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
164 #for i,val in enumerate(summary['ycoef']):
165 # print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val]))
166 vm = summary.rx2('ycoef')
167 for i in range(vm.nrow):
168 vals = []
169 for j in range(vm.ncol):
170 vals.append("%.4g" % vm.rx2(i+1,j+1)[0])
171 print >>fout, "%s\t%s" %(ylab[i][0], "\t".join(vals))
172
173 print >>fout, "#X-Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
174 #for i,val in enumerate(summary['xstructcorr']):
175 # print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val]))
176 vm = summary.rx2('xstructcorr')
177 for i in range(vm.nrow):
178 vals = []
179 for j in range(vm.ncol):
180 vals.append("%.4g" % vm.rx2(i+1,j+1)[0])
181 print >>fout, "%s\t%s" %(xlab[i][0], "\t".join(vals))
182
183 print >>fout, "#Y-Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
184 #for i,val in enumerate(summary['ystructcorr']):
185 # print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val]))
186 vm = summary.rx2('ystructcorr')
187 for i in range(vm.nrow):
188 vals = []
189 for j in range(vm.ncol):
190 vals.append("%.4g" % vm.rx2(i+1,j+1)[0])
191 print >>fout, "%s\t%s" %(ylab[i][0], "\t".join(vals))
192
193 print >>fout, "#X-CrossLoadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
194 #for i,val in enumerate(summary['xcrosscorr']):
195 # print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val]))
196 vm = summary.rx2('xcrosscorr')
197 for i in range(vm.nrow):
198 vals = []
199 for j in range(vm.ncol):
200 vals.append("%.4g" % vm.rx2(i+1,j+1)[0])
201 print >>fout, "%s\t%s" %(xlab[i][0], "\t".join(vals))
202
203 print >>fout, "#Y-CrossLoadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
204 #for i,val in enumerate(summary['ycrosscorr']):
205 # print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val]))
206 vm = summary.rx2('ycrosscorr')
207 for i in range(vm.nrow):
208 vals = []
209 for j in range(vm.ncol):
210 vals.append("%.4g" % vm.rx2(i+1,j+1)[0])
211 print >>fout, "%s\t%s" %(ylab[i][0], "\t".join(vals))
212
213 r.pdf( outfile2, 8, 8 )
214 #r.plot(cc)
215 for i in range(ncomps):
216 r['helio.plot'](cc, cv = i+1, main = r.paste("Explained Variance for CV",i+1), type = "variance")
217 #r.dev_off()
218 grdevices.dev_off()
219