comparison pca.py @ 80:c4a3a8999945 draft

Uploaded
author bernhardlutz
date Mon, 20 Jan 2014 14:39:43 -0500
parents
children babf8ab95495
comparison
equal deleted inserted replaced
79:dc82017052ac 80:c4a3a8999945
1 #!/usr/bin/env python
2
3 from galaxy import eggs
4 import sys, string
5 #from rpy import *
6 import rpy2.robjects as robjects
7 import rpy2.rlike.container as rlc
8 from rpy2.robjects.packages import importr
9 r = robjects.r
10 grdevices = importr('grDevices')
11 import numpy
12
13 def stop_err(msg):
14 sys.stderr.write(msg)
15 sys.exit()
16
17 infile = sys.argv[1]
18 x_cols = sys.argv[2].split(',')
19 method = sys.argv[3]
20 outfile = sys.argv[4]
21 outfile2 = sys.argv[5]
22
23 if method == 'svd':
24 scale = center = "FALSE"
25 if sys.argv[6] == 'both':
26 scale = center = "TRUE"
27 elif sys.argv[6] == 'center':
28 center = "TRUE"
29 elif sys.argv[6] == 'scale':
30 scale = "TRUE"
31
32 fout = open(outfile,'w')
33 elems = []
34 for i, line in enumerate( file ( infile )):
35 line = line.rstrip('\r\n')
36 if len( line )>0 and not line.startswith( '#' ):
37 elems = line.split( '\t' )
38 break
39 if i == 30:
40 break # Hopefully we'll never get here...
41
42 if len( elems )<1:
43 stop_err( "The data in your input dataset is either missing or not formatted properly." )
44
45 x_vals = []
46
47 for k,col in enumerate(x_cols):
48 x_cols[k] = int(col)-1
49 # x_vals.append([])
50
51 NA = 'NA'
52 skipped = 0
53 for ind,line in enumerate( file( infile )):
54 if line and not line.startswith( '#' ):
55 try:
56 fields = line.strip().split("\t")
57 valid_line = True
58 for k,col in enumerate(x_cols):
59 try:
60 xval = float(fields[col])
61 except:
62 skipped += 1
63 valid_line = False
64 break
65 if valid_line:
66 for k,col in enumerate(x_cols):
67 xval = float(fields[col])
68 #x_vals[k].append(xval)
69 x_vals.append(xval)
70 except:
71 skipped += 1
72
73 #x_vals1 = numpy.asarray(x_vals).transpose()
74 #dat= r.list(array(x_vals1))
75 dat = r['matrix'](robjects.FloatVector(x_vals),ncol=len(x_cols),byrow=True)
76
77 #set_default_mode(NO_CONVERSION)
78 try:
79 if method == "cor":
80 #pc = r.princomp(r.na_exclude(dat), cor = r("TRUE"))
81 pc = r.princomp(r['na.exclude'](dat), cor = r("TRUE"))
82 elif method == "cov":
83 #pc = r.princomp(r.na_exclude(dat), cor = r("FALSE"))
84 pc = r.princomp(r['na.exclude'](dat), cor = r("FALSE"))
85 elif method=="svd":
86 #pc = r.prcomp(r.na_exclude(dat), center = r(center), scale = r(scale))
87 pc = r.prcomp(r['na.exclude'](dat), center = r(center), scale = r(scale))
88 #except RException, rex:
89 except Exception, rex: # need to find rpy2 RException
90 stop_err("Encountered error while performing PCA on the input data: %s" %(rex))
91
92 #set_default_mode(BASIC_CONVERSION)
93 summary = r.summary(pc, loadings="TRUE")
94 #ncomps = len(summary['sdev'])
95 ncomps = len(summary.rx2('sdev'))
96
97 #if type(summary['sdev']) == type({}):
98 # comps_unsorted = summary['sdev'].keys()
99 # comps=[]
100 # sd = summary['sdev'].values()
101 # for i in range(ncomps):
102 # sd[i] = summary['sdev'].values()[comps_unsorted.index('Comp.%s' %(i+1))]
103 # comps.append('Comp.%s' %(i+1))
104 #elif type(summary['sdev']) == type([]):
105 # comps=[]
106 # for i in range(ncomps):
107 # comps.append('Comp.%s' %(i+1))
108 # sd = summary['sdev']
109
110 comps=[]
111 for i in range(ncomps):
112 comps.append('Comp.%s' %(i+1))
113 sd = summary.rx2('sdev')
114
115 print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
116 #print >>fout, "#Std. deviation\t%s" %("\t".join(["%.4g" % el for el in sd]))
117 print >>fout, "#Std. deviation\t%s" %("\t".join(["%.4g" % el for el in sd]))
118 total_var = 0
119 vars = []
120 for s in sd:
121 var = s*s
122 total_var += var
123 vars.append(var)
124 for i,var in enumerate(vars):
125 vars[i] = vars[i]/total_var
126
127 print >>fout, "#Proportion of variance explained\t%s" %("\t".join(["%.4g" % el for el in vars]))
128
129 print >>fout, "#Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
130 xcolnames = ["c%d" %(el+1) for el in x_cols]
131 #if 'loadings' in summary: #in case of princomp
132 if 'loadings' in summary.names: #in case of princomp
133 loadings = 'loadings'
134 #elif 'rotation' in summary: #in case of prcomp
135 elif 'rotation' in summary.names: #in case of prcomp
136 loadings = 'rotation'
137 #for i,val in enumerate(summary[loadings]):
138 # print >>fout, "%s\t%s" %(xcolnames[i], "\t".join(["%.4g" % el for el in val]))
139 vm = summary.rx2(loadings)
140 for i in range(vm.nrow):
141 vals = []
142 for j in range(vm.ncol):
143 vals.append("%.4g" % vm.rx2(i+1,j+1)[0])
144 print >>fout, "%s\t%s" %(xcolnames[i], "\t".join(vals))
145
146 print >>fout, "#Scores\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
147 #if 'scores' in summary: #in case of princomp
148 if 'scores' in summary.names: #in case of princomp
149 scores = 'scores'
150 #elif 'x' in summary: #in case of prcomp
151 elif 'x' in summary.names: #in case of prcomp
152 scores = 'x'
153 #for obs,sc in enumerate(summary[scores]):
154 # print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in sc]))
155 vm = summary.rx2(scores)
156 for i in range(vm.nrow):
157 vals = []
158 for j in range(vm.ncol):
159 vals.append("%.4g" % vm.rx2(i+1,j+1)[0])
160 print >>fout, "%s\t%s" %(i+1, "\t".join(vals))
161 r.pdf( outfile2, 8, 8 )
162 r.biplot(pc)
163 #r.dev_off()
164 grdevices.dev_off()
165