annotate feature_selection.R @ 1:69b8598d9338 draft

Uploaded
author deepakjadmin
date Wed, 23 Mar 2016 04:53:29 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
69b8598d9338 Uploaded
deepakjadmin
parents:
diff changeset
# Read the eight positional command-line arguments of the script.
# How each one is used further down in this file:
#   arg1  path of the .RData file passed to load()
#   arg2  output path for the saved rfe `Profile` object
#   arg3  output path for the saved, reduced dataX / dataY
#   arg4  name of the caret rfe function set ("lmFuncs", "rfFuncs",
#         "treebagFuncs"; any other value selects nbFuncs)
#   arg5  `method` handed to rfeControl()
#   arg6  `repeats` handed to rfeControl() (converted with as.numeric)
#   arg7  `number` handed to rfeControl() (converted with as.numeric)
#   arg8  correlation cutoff handed to findCorrelation()
args <- commandArgs(trailingOnly = TRUE)

# Fail early with a clear message: a missing argument would otherwise turn
# into NA and only surface later as an obscure error.
if (length(args) < 8) {
  stop("expected 8 command-line arguments, got ", length(args), call. = FALSE)
}

arg1 <- args[1]
arg2 <- args[2]
arg3 <- args[3]
arg4 <- args[4]
arg5 <- args[5]
arg6 <- args[6]
arg7 <- args[7]
arg8 <- args[8]
69b8598d9338 Uploaded
deepakjadmin
parents:
diff changeset
11
69b8598d9338 Uploaded
deepakjadmin
parents:
diff changeset
12 library(caret)
69b8598d9338 Uploaded
deepakjadmin
parents:
diff changeset
# Load the input workspace; the code below reads `dataX` (predictors) and
# `dataY` (outcome) from it.
load(arg1)

# RAWDATA keeps all predictors plus the outcome as the last column;
# rawData starts out as the predictors only.
RAWDATA <- dataX
RAWDATA$outcome <- dataY
rawData <- dataX
predictorNames <- names(rawData)

# Test every predictor column on its own. apply() would first coerce the data
# frame to a single-typed matrix, so the check would not look at the
# individual columns.
isNum <- vapply(rawData[, predictorNames, drop = FALSE], is.numeric, logical(1))
if (any(!isNum)) stop("all predictors in rawData should be numeric")

# Drop predictors in which more than 1% of the values are missing.
colRate <- colMeans(is.na(rawData[, predictorNames, drop = FALSE]))
colExclude <- colRate > 0.01
if (any(colExclude)) {
  predictorNames <- predictorNames[!colExclude]
  rawData <- RAWDATA[, c(predictorNames, "outcome")]
} else {
  rawData <- RAWDATA
}

# Fraction of missing predictor values in every row.
rowRate <- rowMeans(is.na(rawData[, predictorNames, drop = FALSE]))

# Allowed missing fraction per row, by data set size:
# 1% up to 1000 rows, 2% up to 5000 rows, 4% above that.
rowno <- nrow(rawData)
if (rowno <= 1000) {
  cutoff <- 0.01
} else if (rowno <= 5000) {
  cutoff <- 0.02
} else {
  cutoff <- 0.04
}

rowExclude <- rowRate > cutoff
if (any(rowExclude)) {
  # NOTE(review): rows at or below the cutoff are kept here even if they still
  # contain NA, whereas the branch below removes every incomplete row.
  # Confirm this asymmetry is intended.
  rawData <- rawData[!rowExclude, ]
} else {
  rawData <- rawData[complete.cases(rawData), ]
}
69b8598d9338 Uploaded
deepakjadmin
parents:
diff changeset
57
69b8598d9338 Uploaded
deepakjadmin
parents:
diff changeset
# Fix the RNG seed for everything that follows.
set.seed(2)

#print(dim(dataX))
#print(dim(rawData))
#print(length(dataY))

# Remove near-zero-variance predictors. The outcome is the last column of
# rawData (it is appended last when rawData is built), so every column before
# it is a predictor and the positions returned by nearZeroVar() are also valid
# positions in rawData.
nzv <- nearZeroVar(rawData[, seq_len(ncol(rawData) - 1), drop = FALSE])
if (length(nzv) > 0) {
  rawData <- rawData[, -nzv, drop = FALSE]
}

predictorNames <- names(rawData)[names(rawData) != "outcome"]

# Split into predictors (dx) and outcome (dy). seq_len() replaces the original
# `1:length(rawData)-1`, which parses as (1:n) - 1 and only worked because a
# zero index is ignored.
dx <- rawData[, seq_len(ncol(rawData) - 1), drop = FALSE]
dy <- rawData[, ncol(rawData)]

# Drop predictors whose pairwise correlation exceeds the requested cutoff.
corrThresh <- as.numeric(arg8)
highCorr <- integer(0)
if (ncol(dx) > 1) {
  # A correlation filter needs at least two predictors.
  highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"), corrThresh)
}
# Guard the removal: indexing with -integer(0) selects NO columns, so an
# unguarded `dx[, -highCorr]` throws away every predictor whenever nothing
# exceeds the cutoff.
if (length(highCorr) > 0) {
  dx <- dx[, -highCorr, drop = FALSE]
}

# Candidate subset sizes for rfe: 1, 6, 11, ... up to the predictor count.
subsets <- seq(1, ncol(dx), by = 5)

# Apply caret's default preProcess() transformation to the predictors.
normalization <- preProcess(dx)
dx <- predict(normalization, dx)
dx <- as.data.frame(dx)
69b8598d9338 Uploaded
deepakjadmin
parents:
diff changeset
82
69b8598d9338 Uploaded
deepakjadmin
parents:
diff changeset
# Choose the caret helper-function set named by arg4. Any value other than
# "lmFuncs", "rfFuncs" or "treebagFuncs" falls through to nbFuncs.
selectedFuncs <- if (arg4 == "lmFuncs") {
  lmFuncs
} else if (arg4 == "rfFuncs") {
  rfFuncs
} else if (arg4 == "treebagFuncs") {
  treebagFuncs
} else {
  nbFuncs
}

# The resampling settings are the same for every function set:
# arg5 = method, arg6 = repeats, arg7 = number.
ctrl1 <- rfeControl(
  functions = selectedFuncs,
  method = arg5,
  repeats = as.numeric(arg6),
  number = as.numeric(arg7),
  verbose = FALSE
)
69b8598d9338 Uploaded
deepakjadmin
parents:
diff changeset
109
69b8598d9338 Uploaded
deepakjadmin
parents:
diff changeset
110
69b8598d9338 Uploaded
deepakjadmin
parents:
diff changeset
111
69b8598d9338 Uploaded
deepakjadmin
parents:
diff changeset
112
69b8598d9338 Uploaded
deepakjadmin
parents:
diff changeset
# Run recursive feature elimination over the candidate subset sizes.
Profile <- rfe(dx, dy, sizes = subsets, rfeControl = ctrl1)

# Names of the predictors retained by rfe.
pred11 <- predictors(Profile)
save(Profile, file = arg2)

# Write the reduced data set. The columns are taken from rawData, not from the
# preprocessed dx, so the saved values are the un-normalised ones.
# drop = FALSE keeps dataX a data frame even when rfe retains a single
# predictor; without it the result collapses to a bare vector.
dataX <- rawData[, pred11, drop = FALSE]
dataY <- rawData$outcome

save(dataX, dataY, file = arg3)
rm(dataX, dataY)
69b8598d9338 Uploaded
deepakjadmin
parents:
diff changeset
123