1
+ − 1 <tool id="lda_analy1" name="Perform LDA" version="1.0.1">
+ − 2 <description>Linear Discriminant Analysis</description>
+ − 3 <requirements>
+ − 4 <requirement type="package" version="2.11.0">R</requirement>
+ − 5 </requirements>
+ − 6 <command interpreter="sh">r_wrapper.sh $script_file</command>
+ − 7 <inputs>
+ − 8 <param format="tabular" name="input" type="data" label="Source file"/>
+ − 9 <param name="cond" size="30" type="integer" value="3" label="Number of principal components" help="See TIP below">
+ − 10 <validator type="empty_field" message="Enter a valid number of principal components, see syntax below for examples"/>
+ − 11 </param>
+ − 12
+ − 13 </inputs>
+ − 14 <outputs>
+ − 15 <data format="txt" name="output" />
+ − 16 </outputs>
+ − 17
+ − 18 <tests>
+ − 19 <test>
+ − 20 <param name="input" value="matrix_generator_for_pc_and_lda_output.tabular"/>
+ − 21 <output name="output" file="lda_analy_output.txt"/>
+ − 22 <param name="cond" value="2"/>
+ − 23
+ − 24 </test>
+ − 25 </tests>
+ − 26
+ − 27 <configfiles>
+ − 28 <configfile name="script_file">
+ − 29
+ − 30 rm(list = objects() )
+ − 31
+ − 32 ############# FORMAT X DATA #########################
+ − 33 format<-function(data) {
+ − 34 ind=NULL
+ − 35 for(i in 1 : ncol(data)){
+ − 36 if (is.na(data[nrow(data),i])) {
+ − 37 ind<-c(ind,i)
+ − 38 }
+ − 39 }
+ − 40 #print(is.null(ind))
+ − 41 if (!is.null(ind)) {
+ − 42 data<-data[,-c(ind)]
+ − 43 }
+ − 44
+ − 45 data
+ − 46 }
+ − 47
+ − 48 ########GET RESPONSES ###############################
+ − 49 get_resp<- function(data) {
+ − 50 resp1<-as.vector(data[,ncol(data)])
+ − 51 resp=numeric(length(resp1))
+ − 52 for (i in 1:length(resp1)) {
+ − 53 if (resp1[i]=="Y ") {
+ − 54 resp[i] = 0
+ − 55 }
+ − 56 if (resp1[i]=="X ") {
+ − 57 resp[i] = 1
+ − 58 }
+ − 59 }
+ − 60 return(resp)
+ − 61 }
+ − 62
+ − 63 ######## CHARS TO NUMBERS ###########################
+ − 64 f_to_numbers<- function(F) {
+ − 65 ind<-NULL
+ − 66 G<-matrix(0,nrow(F), ncol(F))
+ − 67 for (i in 1:nrow(F)) {
+ − 68 for (j in 1:ncol(F)) {
+ − 69 G[i,j]<-as.integer(F[i,j])
+ − 70 }
+ − 71 }
+ − 72 return(G)
+ − 73 }
+ − 74
+ − 75 ###################NORMALIZING#########################
+ − 76 norm <- function(M, a=NULL, b=NULL) {
+ − 77 C<-NULL
+ − 78 ind<-NULL
+ − 79
+ − 80 for (i in 1: ncol(M)) {
+ − 81 if (sd(M[,i])!=0) {
+ − 82 M[,i]<-(M[,i]-mean(M[,i]))/sd(M[,i])
+ − 83 }
+ − 84 # else {print(mean(M[,i]))}
+ − 85 }
+ − 86 return(M)
+ − 87 }
+ − 88
+ − 89 ##### LDA DIRECTIONS #################################
+ − 90 lda_dec <- function(data, k){
+ − 91 priors=numeric(k)
+ − 92 grandmean<-numeric(ncol(data)-1)
+ − 93 means=matrix(0,k,ncol(data)-1)
+ − 94 B = matrix(0, ncol(data)-1, ncol(data)-1)
+ − 95 N=nrow(data)
+ − 96 for (i in 1:k){
+ − 97 priors[i]=sum(data[,1]==i)/N
+ − 98 grp=subset(data,data\$group==i)
+ − 99 means[i,]=mean(grp[,2:ncol(data)])
+ − 100 #print(means[i,])
+ − 101 #print(priors[i])
+ − 102 #print(priors[i]*means[i,])
+ − 103 grandmean = priors[i]*means[i,] + grandmean
+ − 104 }
+ − 105
+ − 106 for (i in 1:k) {
+ − 107 B= B + priors[i]*((means[i,]-grandmean)%*%t(means[i,]-grandmean))
+ − 108 }
+ − 109
+ − 110 W = var(data[,2:ncol(data)])
+ − 111 svdW = svd(W)
+ − 112 inv_sqrtW =solve(svdW\$v %*% diag(sqrt(svdW\$d)) %*% t(svdW\$v))
+ − 113 B_star= t(inv_sqrtW)%*%B%*%inv_sqrtW
+ − 114 B_star_decomp = svd(B_star)
+ − 115 directions = inv_sqrtW%*%B_star_decomp\$v
+ − 116 return( list(directions, B_star_decomp\$d) )
+ − 117 }
+ − 118
+ − 119 ################ NAIVE BAYES FOR 1D SIR OR LDA ##############
+ − 120 naive_bayes_classifier <- function(resp, tr_data, test_data, k=2, tau) {
+ − 121 tr_data=data.frame(resp=resp, dir=tr_data)
+ − 122 means=numeric(k)
+ − 123 #print(k)
+ − 124 cl=numeric(k)
+ − 125 predclass=numeric(length(test_data))
+ − 126 for (i in 1:k) {
+ − 127 grp = subset(tr_data, resp==i)
+ − 128 means[i] = mean(grp\$dir)
+ − 129 #print(i, means[i])
+ − 130 }
+ − 131 cutoff = tau*means[1]+(1-tau)*means[2]
+ − 132 #print(tau)
+ − 133 #print(means)
+ − 134 #print(cutoff)
+ − 135 if (cutoff>means[1]) {
+ − 136 cl[1]=1
+ − 137 cl[2]=2
+ − 138 }
+ − 139 else {
+ − 140 cl[1]=2
+ − 141 cl[2]=1
+ − 142 }
+ − 143
+ − 144 for (i in 1:length(test_data)) {
+ − 145
+ − 146 if (test_data[i] <= cutoff) {
+ − 147 predclass[i] = cl[1]
+ − 148 }
+ − 149 else {
+ − 150 predclass[i] = cl[2]
+ − 151 }
+ − 152 }
+ − 153 #print(means)
+ − 154 #print(mean(means))
+ − 155 #X11()
+ − 156 #plot(test_data,pch=predclass, col=resp)
+ − 157 predclass
+ − 158 }
+ − 159
+ − 160 ################# EXTENDED ERROR RATES #################
+ − 161 ext_error_rate <- function(predclass, actualclass,msg=c("you forgot the message"), pr=1) {
+ − 162 er=sum(predclass != actualclass)/length(predclass)
+ − 163
+ − 164 matr<-data.frame(predclass=predclass,actualclass=actualclass)
+ − 165 escapes = subset(matr, actualclass==1)
+ − 166 subjects = subset(matr, actualclass==2)
+ − 167 er_esc=sum(escapes\$predclass != escapes\$actualclass)/length(escapes\$predclass)
+ − 168 er_subj=sum(subjects\$predclass != subjects\$actualclass)/length(subjects\$predclass)
+ − 169
+ − 170 if (pr==1) {
+ − 171 # print(paste(c(msg, 'overall : ', (1-er)*100, "%."),collapse=" "))
+ − 172 # print(paste(c(msg, 'within escapes : ', (1-er_esc)*100, "%."),collapse=" "))
+ − 173 # print(paste(c(msg, 'within subjects: ', (1-er_subj)*100, "%."),collapse=" "))
+ − 174 }
+ − 175 return(c((1-er)*100, (1-er_esc)*100, (1-er_subj)*100))
+ − 176 }
+ − 177
+ − 178 ## Main Function ##
+ − 179
+ − 180 files<-matrix("${input}", 1,1, byrow=T)
+ − 181
+ − 182 d<-"${cond}" # Number of PC
+ − 183
+ − 184 tau<-seq(0,1, by=0.005)
+ − 185 #tau<-seq(0,1, by=0.1)
+ − 186 for_curve=matrix(-10, 3,length(tau))
+ − 187
+ − 188 ##############################################################
+ − 189
+ − 190 test_data_whole_X <-read.delim(files[1,1], row.names=1)
+ − 191
+ − 192 #### FORMAT TRAINING DATA ####################################
+ − 193 # get only necessary columns
+ − 194
+ − 195 test_data_whole_X<-format(test_data_whole_X)
+ − 196 oligo_labels<-test_data_whole_X[1:(nrow(test_data_whole_X)-1),ncol(test_data_whole_X)]
+ − 197 test_data_whole_X<-test_data_whole_X[,1:(ncol(test_data_whole_X)-1)]
+ − 198
+ − 199 X_names<-colnames(test_data_whole_X)[1:ncol(test_data_whole_X)]
+ − 200 test_data_whole_X<-t(test_data_whole_X)
+ − 201 resp<-get_resp(test_data_whole_X)
+ − 202 ldaqda_resp = resp + 1
+ − 203 a<-sum(resp) # Number of Subject
+ − 204 b<-length(resp) - a # Number of Escape
+ − 205 ## FREQUENCIES #################################################
+ − 206 F<-test_data_whole_X[,1:(ncol(test_data_whole_X)-1)]
+ − 207 F<-f_to_numbers(F)
+ − 208 FN<-norm(F, a, b)
+ − 209 ss<-svd(FN)
+ − 210 eigvar<-NULL
+ − 211 eig<-ss\$d^2
+ − 212
+ − 213 for ( i in 1:length(ss\$d)) {
+ − 214 eigvar[i]<-sum(eig[1:i])/sum(eig)
+ − 215 }
+ − 216
+ − 217 #print(paste(c("Variance explained : ", eigvar[d]*100, "%"), collapse=""))
+ − 218
+ − 219 Z<-F%*%ss\$v
+ − 220
+ − 221 ldaqda_data <- data.frame(group=ldaqda_resp,Z[,1:d])
+ − 222 lda_dir<-lda_dec(ldaqda_data,2)
+ − 223 train_lda_pred <-Z[,1:d]%*%lda_dir[[1]]
+ − 224
+ − 225 ############# NAIVE BAYES CROSS-VALIDATION #############
+ − 226 ### LDA #####
+ − 227
+ − 228 y<-ldaqda_resp
+ − 229 X<-F
+ − 230 cv<-matrix(c(rep('NA',nrow(test_data_whole_X))), nrow(test_data_whole_X), length(tau))
+ − 231 for (i in 1:nrow(test_data_whole_X)) {
+ − 232 # print(i)
+ − 233 resp<-y[-i]
+ − 234 p<-matrix(X[-i,], dim(X)[1]-1, dim(X)[2])
+ − 235 testdata<-matrix(X[i,],1,dim(X)[2])
+ − 236 p1<-norm(p)
+ − 237 sss<-svd(p1)
+ − 238 pred<-(p%*%sss\$v)[,1:d]
+ − 239 test<- (testdata%*%sss\$v)[,1:d]
+ − 240 lda <- lda_dec(data.frame(group=resp,pred),2)
+ − 241 pred <- pred[,1:d]%*%lda[[1]][,1]
+ − 242 test <- test%*%lda[[1]][,1]
+ − 243 test<-matrix(test, 1, length(test))
+ − 244 for (t in 1:length(tau)) {
+ − 245 cv[i, t] <- naive_bayes_classifier (resp, pred, test,k=2, tau[t])
+ − 246 }
+ − 247 }
+ − 248
+ − 249 for (t in 1:length(tau)) {
+ − 250 tr_err<-ext_error_rate(cv[,t], ldaqda_resp , c("CV"), 1)
+ − 251 for_curve[1:3,t]<-tr_err
+ − 252 }
+ − 253
+ − 254 dput(for_curve, file="${output}")
+ − 255
+ − 256
+ − 257 </configfile>
+ − 258 </configfiles>
+ − 259
+ − 260 <help>
+ − 261
+ − 262 .. class:: infomark
+ − 263
+ − 264 **TIP:** If you want to perform Principal Component Analysis (PCA) on the give numeric input data (which corresponds to the "Source file First in "Generate A Matrix" tool), please use *Multivariate Analysis/Principal Component Analysis*
+ − 265
+ − 266 -----
+ − 267
+ − 268 .. class:: infomark
+ − 269
+ − 270 **What it does**
+ − 271
+ − 272 This tool consists of the module to perform the Linear Discriminant Analysis as described in Carrel et al., 2006 (PMID: 17009873)
+ − 273
+ − 274 *Carrel L, Park C, Tyekucheva S, Dunn J, Chiaromonte F, et al. (2006) Genomic Environment Predicts Expression Patterns on the Human Inactive X Chromosome. PLoS Genet 2(9): e151. doi:10.1371/journal.pgen.0020151*
+ − 275
+ − 276 -----
+ − 277
+ − 278 .. class:: warningmark
+ − 279
+ − 280 **Note**
+ − 281
+ − 282 - Output from "Generate A Matrix" tool is used as input file for this tool
+ − 283 - Output of this tool contains LDA classification success rates for different values of the turning parameter tau (from 0 to 1 with 0.005 interval). This output file will be used to establish the ROC plot, and you can obtain more detail information from this plot.
+ − 284
+ − 285
+ − 286 </help>
+ − 287
+ − 288 </tool>