1
|
1 <tool id="lda_analy1" name="Perform LDA" version="1.0.1">
|
|
2 <description>Linear Discriminant Analysis</description>
|
|
3 <requirements>
|
|
4 <requirement type="package" version="2.11.0">R</requirement>
|
|
5 </requirements>
|
|
6 <command interpreter="sh">r_wrapper.sh $script_file</command>
|
|
7 <inputs>
|
|
8 <param format="tabular" name="input" type="data" label="Source file"/>
|
|
9 <param name="cond" size="30" type="integer" value="3" label="Number of principal components" help="See TIP below">
|
|
10 <validator type="empty_field" message="Enter a valid number of principal components, see syntax below for examples"/>
|
|
11 </param>
|
|
12
|
|
13 </inputs>
|
|
14 <outputs>
|
|
15 <data format="txt" name="output" />
|
|
16 </outputs>
|
|
17
|
|
18 <tests>
|
|
19 <test>
|
|
20 <param name="input" value="matrix_generator_for_pc_and_lda_output.tabular"/>
|
|
21 <output name="output" file="lda_analy_output.txt"/>
|
|
22 <param name="cond" value="2"/>
|
|
23
|
|
24 </test>
|
|
25 </tests>
|
|
26
|
|
27 <configfiles>
|
|
28 <configfile name="script_file">
|
|
29
|
|
30 rm(list = objects() )
|
|
31
|
|
32 ############# FORMAT X DATA #########################
|
|
33 format<-function(data) {
|
|
34 ind=NULL
|
|
35 for(i in 1 : ncol(data)){
|
|
36 if (is.na(data[nrow(data),i])) {
|
|
37 ind<-c(ind,i)
|
|
38 }
|
|
39 }
|
|
40 #print(is.null(ind))
|
|
41 if (!is.null(ind)) {
|
|
42 data<-data[,-c(ind)]
|
|
43 }
|
|
44
|
|
45 data
|
|
46 }
|
|
47
|
|
48 ########GET RESPONSES ###############################
|
|
49 get_resp<- function(data) {
|
|
50 resp1<-as.vector(data[,ncol(data)])
|
|
51 resp=numeric(length(resp1))
|
|
52 for (i in 1:length(resp1)) {
|
|
53 if (resp1[i]=="Y ") {
|
|
54 resp[i] = 0
|
|
55 }
|
|
56 if (resp1[i]=="X ") {
|
|
57 resp[i] = 1
|
|
58 }
|
|
59 }
|
|
60 return(resp)
|
|
61 }
|
|
62
|
|
63 ######## CHARS TO NUMBERS ###########################
|
|
64 f_to_numbers<- function(F) {
|
|
65 ind<-NULL
|
|
66 G<-matrix(0,nrow(F), ncol(F))
|
|
67 for (i in 1:nrow(F)) {
|
|
68 for (j in 1:ncol(F)) {
|
|
69 G[i,j]<-as.integer(F[i,j])
|
|
70 }
|
|
71 }
|
|
72 return(G)
|
|
73 }
|
|
74
|
|
75 ###################NORMALIZING#########################
|
|
76 norm <- function(M, a=NULL, b=NULL) {
|
|
77 C<-NULL
|
|
78 ind<-NULL
|
|
79
|
|
80 for (i in 1: ncol(M)) {
|
|
81 if (sd(M[,i])!=0) {
|
|
82 M[,i]<-(M[,i]-mean(M[,i]))/sd(M[,i])
|
|
83 }
|
|
84 # else {print(mean(M[,i]))}
|
|
85 }
|
|
86 return(M)
|
|
87 }
|
|
88
|
|
89 ##### LDA DIRECTIONS #################################
|
|
90 lda_dec <- function(data, k){
|
|
91 priors=numeric(k)
|
|
92 grandmean<-numeric(ncol(data)-1)
|
|
93 means=matrix(0,k,ncol(data)-1)
|
|
94 B = matrix(0, ncol(data)-1, ncol(data)-1)
|
|
95 N=nrow(data)
|
|
96 for (i in 1:k){
|
|
97 priors[i]=sum(data[,1]==i)/N
|
|
98 grp=subset(data,data\$group==i)
|
|
99 means[i,]=mean(grp[,2:ncol(data)])
|
|
100 #print(means[i,])
|
|
101 #print(priors[i])
|
|
102 #print(priors[i]*means[i,])
|
|
103 grandmean = priors[i]*means[i,] + grandmean
|
|
104 }
|
|
105
|
|
106 for (i in 1:k) {
|
|
107 B= B + priors[i]*((means[i,]-grandmean)%*%t(means[i,]-grandmean))
|
|
108 }
|
|
109
|
|
110 W = var(data[,2:ncol(data)])
|
|
111 svdW = svd(W)
|
|
112 inv_sqrtW =solve(svdW\$v %*% diag(sqrt(svdW\$d)) %*% t(svdW\$v))
|
|
113 B_star= t(inv_sqrtW)%*%B%*%inv_sqrtW
|
|
114 B_star_decomp = svd(B_star)
|
|
115 directions = inv_sqrtW%*%B_star_decomp\$v
|
|
116 return( list(directions, B_star_decomp\$d) )
|
|
117 }
|
|
118
|
|
119 ################ NAIVE BAYES FOR 1D SIR OR LDA ##############
|
|
120 naive_bayes_classifier <- function(resp, tr_data, test_data, k=2, tau) {
|
|
121 tr_data=data.frame(resp=resp, dir=tr_data)
|
|
122 means=numeric(k)
|
|
123 #print(k)
|
|
124 cl=numeric(k)
|
|
125 predclass=numeric(length(test_data))
|
|
126 for (i in 1:k) {
|
|
127 grp = subset(tr_data, resp==i)
|
|
128 means[i] = mean(grp\$dir)
|
|
129 #print(i, means[i])
|
|
130 }
|
|
131 cutoff = tau*means[1]+(1-tau)*means[2]
|
|
132 #print(tau)
|
|
133 #print(means)
|
|
134 #print(cutoff)
|
|
135 if (cutoff>means[1]) {
|
|
136 cl[1]=1
|
|
137 cl[2]=2
|
|
138 }
|
|
139 else {
|
|
140 cl[1]=2
|
|
141 cl[2]=1
|
|
142 }
|
|
143
|
|
144 for (i in 1:length(test_data)) {
|
|
145
|
|
146 if (test_data[i] <= cutoff) {
|
|
147 predclass[i] = cl[1]
|
|
148 }
|
|
149 else {
|
|
150 predclass[i] = cl[2]
|
|
151 }
|
|
152 }
|
|
153 #print(means)
|
|
154 #print(mean(means))
|
|
155 #X11()
|
|
156 #plot(test_data,pch=predclass, col=resp)
|
|
157 predclass
|
|
158 }
|
|
159
|
|
160 ################# EXTENDED ERROR RATES #################
|
|
161 ext_error_rate <- function(predclass, actualclass,msg=c("you forgot the message"), pr=1) {
|
|
162 er=sum(predclass != actualclass)/length(predclass)
|
|
163
|
|
164 matr<-data.frame(predclass=predclass,actualclass=actualclass)
|
|
165 escapes = subset(matr, actualclass==1)
|
|
166 subjects = subset(matr, actualclass==2)
|
|
167 er_esc=sum(escapes\$predclass != escapes\$actualclass)/length(escapes\$predclass)
|
|
168 er_subj=sum(subjects\$predclass != subjects\$actualclass)/length(subjects\$predclass)
|
|
169
|
|
170 if (pr==1) {
|
|
171 # print(paste(c(msg, 'overall : ', (1-er)*100, "%."),collapse=" "))
|
|
172 # print(paste(c(msg, 'within escapes : ', (1-er_esc)*100, "%."),collapse=" "))
|
|
173 # print(paste(c(msg, 'within subjects: ', (1-er_subj)*100, "%."),collapse=" "))
|
|
174 }
|
|
175 return(c((1-er)*100, (1-er_esc)*100, (1-er_subj)*100))
|
|
176 }
|
|
177
|
|
178 ## Main Function ##
|
|
179
|
|
180 files<-matrix("${input}", 1,1, byrow=T)
|
|
181
|
|
182 d<-"${cond}" # Number of PC
|
|
183
|
|
184 tau<-seq(0,1, by=0.005)
|
|
185 #tau<-seq(0,1, by=0.1)
|
|
186 for_curve=matrix(-10, 3,length(tau))
|
|
187
|
|
188 ##############################################################
|
|
189
|
|
190 test_data_whole_X <-read.delim(files[1,1], row.names=1)
|
|
191
|
|
192 #### FORMAT TRAINING DATA ####################################
|
|
193 # get only necessary columns
|
|
194
|
|
195 test_data_whole_X<-format(test_data_whole_X)
|
|
196 oligo_labels<-test_data_whole_X[1:(nrow(test_data_whole_X)-1),ncol(test_data_whole_X)]
|
|
197 test_data_whole_X<-test_data_whole_X[,1:(ncol(test_data_whole_X)-1)]
|
|
198
|
|
199 X_names<-colnames(test_data_whole_X)[1:ncol(test_data_whole_X)]
|
|
200 test_data_whole_X<-t(test_data_whole_X)
|
|
201 resp<-get_resp(test_data_whole_X)
|
|
202 ldaqda_resp = resp + 1
|
|
203 a<-sum(resp) # Number of Subject
|
|
204 b<-length(resp) - a # Number of Escape
|
|
205 ## FREQUENCIES #################################################
|
|
206 F<-test_data_whole_X[,1:(ncol(test_data_whole_X)-1)]
|
|
207 F<-f_to_numbers(F)
|
|
208 FN<-norm(F, a, b)
|
|
209 ss<-svd(FN)
|
|
210 eigvar<-NULL
|
|
211 eig<-ss\$d^2
|
|
212
|
|
213 for ( i in 1:length(ss\$d)) {
|
|
214 eigvar[i]<-sum(eig[1:i])/sum(eig)
|
|
215 }
|
|
216
|
|
217 #print(paste(c("Variance explained : ", eigvar[d]*100, "%"), collapse=""))
|
|
218
|
|
219 Z<-F%*%ss\$v
|
|
220
|
|
221 ldaqda_data <- data.frame(group=ldaqda_resp,Z[,1:d])
|
|
222 lda_dir<-lda_dec(ldaqda_data,2)
|
|
223 train_lda_pred <-Z[,1:d]%*%lda_dir[[1]]
|
|
224
|
|
225 ############# NAIVE BAYES CROSS-VALIDATION #############
|
|
226 ### LDA #####
|
|
227
|
|
228 y<-ldaqda_resp
|
|
229 X<-F
|
|
230 cv<-matrix(c(rep('NA',nrow(test_data_whole_X))), nrow(test_data_whole_X), length(tau))
|
|
231 for (i in 1:nrow(test_data_whole_X)) {
|
|
232 # print(i)
|
|
233 resp<-y[-i]
|
|
234 p<-matrix(X[-i,], dim(X)[1]-1, dim(X)[2])
|
|
235 testdata<-matrix(X[i,],1,dim(X)[2])
|
|
236 p1<-norm(p)
|
|
237 sss<-svd(p1)
|
|
238 pred<-(p%*%sss\$v)[,1:d]
|
|
239 test<- (testdata%*%sss\$v)[,1:d]
|
|
240 lda <- lda_dec(data.frame(group=resp,pred),2)
|
|
241 pred <- pred[,1:d]%*%lda[[1]][,1]
|
|
242 test <- test%*%lda[[1]][,1]
|
|
243 test<-matrix(test, 1, length(test))
|
|
244 for (t in 1:length(tau)) {
|
|
245 cv[i, t] <- naive_bayes_classifier (resp, pred, test,k=2, tau[t])
|
|
246 }
|
|
247 }
|
|
248
|
|
249 for (t in 1:length(tau)) {
|
|
250 tr_err<-ext_error_rate(cv[,t], ldaqda_resp , c("CV"), 1)
|
|
251 for_curve[1:3,t]<-tr_err
|
|
252 }
|
|
253
|
|
254 dput(for_curve, file="${output}")
|
|
255
|
|
256
|
|
257 </configfile>
|
|
258 </configfiles>
|
|
259
|
|
260 <help>
|
|
261
|
|
262 .. class:: infomark
|
|
263
|
|
264 **TIP:** If you want to perform Principal Component Analysis (PCA) on the give numeric input data (which corresponds to the "Source file First in "Generate A Matrix" tool), please use *Multivariate Analysis/Principal Component Analysis*
|
|
265
|
|
266 -----
|
|
267
|
|
268 .. class:: infomark
|
|
269
|
|
270 **What it does**
|
|
271
|
|
272 This tool consists of the module to perform the Linear Discriminant Analysis as described in Carrel et al., 2006 (PMID: 17009873)
|
|
273
|
|
274 *Carrel L, Park C, Tyekucheva S, Dunn J, Chiaromonte F, et al. (2006) Genomic Environment Predicts Expression Patterns on the Human Inactive X Chromosome. PLoS Genet 2(9): e151. doi:10.1371/journal.pgen.0020151*
|
|
275
|
|
276 -----
|
|
277
|
|
278 .. class:: warningmark
|
|
279
|
|
280 **Note**
|
|
281
|
|
282 - Output from "Generate A Matrix" tool is used as input file for this tool
|
|
283 - Output of this tool contains LDA classification success rates for different values of the turning parameter tau (from 0 to 1 with 0.005 interval). This output file will be used to establish the ROC plot, and you can obtain more detail information from this plot.
|
|
284
|
|
285
|
|
286 </help>
|
|
287
|
|
288 </tool>
|