comparison feature_selection.R @ 5:016c69bfb2a1 draft

Uploaded
author deepakjadmin
date Tue, 03 Jan 2017 02:26:17 -0500
parents
children d56ce97c4985
comparison
equal deleted inserted replaced
4:5364cf43a8c1 5:016c69bfb2a1
# Command-line interface (all arguments are positional):
#   1: .RData file to load; the rest of the script reads `dataX`
#      (predictors) and `dataY` (outcome) from it
#   2: output file for the fitted rfe object (`Profile`)
#   3: output file for the reduced `dataX` / `dataY`
#   4: caret rfe helper set: "lmFuncs", "rfFuncs" or "treebagFuncs";
#      any other value selects nbFuncs
#   5: resampling `method` passed to rfeControl()
#   6: `repeats` passed to rfeControl()
#   7: `number` passed to rfeControl()
#   8: correlation cutoff passed to findCorrelation()
#   9: class balancing: "downsampling", "upsampling", anything else = none
args <- commandArgs(trailingOnly = TRUE)

# Fail early with a readable message; a missing argument would otherwise
# surface later as an NA inside an `if` condition.
if (length(args) < 9) {
  stop("expected 9 command-line arguments, got ", length(args),
       call. = FALSE)
}

arg1 <- args[1]
arg2 <- args[2]
arg3 <- args[3]
arg4 <- args[4]
arg5 <- args[5]
arg6 <- args[6]
arg7 <- args[7]
arg8 <- args[8]
arg9 <- args[9]
library(caret)
load(arg1)

# Everything below reads these two objects from the loaded workspace.
if (!exists("dataX") || !exists("dataY")) {
  stop("'", arg1, "' must define both 'dataX' and 'dataY'", call. = FALSE)
}
14
15 #RAWDATA <- dataX
16 #RAWDATA$outcome <- dataY
17
18
19 ###########################
# Optional class balancing, selected by the ninth command-line argument.
# Afterwards `dataX`/`dataY` hold the (possibly resampled) data and RAWDATA
# is `dataX` with the outcome appended as a column named "outcome".
Smpling <- arg9

if (Smpling == "downsampling") {
  balanced <- downSample(dataX, dataY)
} else if (Smpling == "upsampling") {
  balanced <- upSample(dataX, dataY)
} else {
  balanced <- NULL
}

if (!is.null(balanced)) {
  # The resampled outcome is taken from the last column and the predictors
  # from all columns before it. seq_len() replaces `1:length(x)-1`, which
  # parses as `(1:length(x)) - 1` and only worked because index 0 is ignored.
  # drop = FALSE keeps a single predictor as a data frame.
  outcomeCol <- ncol(balanced)
  dataX <- balanced[, seq_len(outcomeCol - 1L), drop = FALSE]
  dataY <- balanced[, outcomeCol]
  rm(outcomeCol)
}
rm(balanced)

RAWDATA <- dataX
RAWDATA$outcome <- dataY
41
42
43
44
45 ##########################
46
47
# Missing-value handling. Produces `rawData`: RAWDATA restricted to the
# predictors with at most 10% missing values (plus "outcome"), and to rows
# with no missing value in any remaining column.
predictorNames <- names(dataX)

# apply() coerces the data frame to a single matrix first, so this is a
# whole-frame check (all TRUE or all FALSE); kept as-is so the set of
# accepted inputs does not change.
isNum <- apply(dataX[, predictorNames, drop = FALSE], 2, is.numeric)
if (any(!isNum)) stop("all predictors in rawData should be numeric")

# Fraction of missing values per predictor.
colRate <- vapply(dataX[, predictorNames, drop = FALSE],
                  function(x) mean(is.na(x)), numeric(1))
colExclude <- colRate > 0.1
if (any(colExclude)) {
  predictorNames <- predictorNames[!colExclude]
  rawData <- RAWDATA[, c(predictorNames, "outcome"), drop = FALSE]
} else {
  rawData <- RAWDATA
}

# Drop every row with a missing predictor or a missing outcome. Previously
# rows with a missing outcome were removed only when no predictor had NAs.
rawData <- rawData[complete.cases(rawData), , drop = FALSE]
81
# Predictor filtering and normalization. Produces:
#   rawData - input rows with near-zero-variance predictors removed
#   dx      - predictors after the correlation filter, passed through
#             preProcess() with its default settings
#   dy      - the outcome column
#   subsets - candidate subset sizes for rfe(): 1, 6, 11, ...
set.seed(2)

predictorNames <- names(rawData)[names(rawData) != "outcome"]

# nearZeroVar() returns positions within the predictor subset; translate
# them to names so that the outcome column can never be hit.
nzv <- nearZeroVar(rawData[, predictorNames, drop = FALSE])
if (length(nzv) > 0) {
  nzvNames <- predictorNames[nzv]
  rawData <- rawData[, !(names(rawData) %in% nzvNames), drop = FALSE]
}

predictorNames <- names(rawData)[names(rawData) != "outcome"]

dx <- rawData[, predictorNames, drop = FALSE]
dy <- rawData[["outcome"]]
if (ncol(dx) == 0) {
  stop("no predictors left after missing-value and near-zero-variance filtering",
       call. = FALSE)
}

corrThresh <- as.numeric(arg8)
# A correlation filter needs at least two predictors.
if (ncol(dx) > 1) {
  highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),
                              corrThresh)
} else {
  highCorr <- integer(0)
}
# Negative indexing with an empty vector selects zero columns, so only
# subset when there is something to remove.
if (length(highCorr) > 0) {
  dx <- dx[, -highCorr, drop = FALSE]
}

subsets <- seq(1, length(dx), by = 5)
normalization <- preProcess(dx)
dx <- predict(normalization, dx)
dx <- as.data.frame(dx)
106
# Choose the caret helper-function set named by the fourth argument
# (any unrecognised name falls back to nbFuncs), then build the rfe
# control object once from arguments five to seven.
if (arg4 == "lmFuncs") {
  rfeFunctions <- lmFuncs
} else if (arg4 == "rfFuncs") {
  rfeFunctions <- rfFuncs
} else if (arg4 == "treebagFuncs") {
  rfeFunctions <- treebagFuncs
} else {
  rfeFunctions <- nbFuncs
}

ctrl1 <- rfeControl(
  functions = rfeFunctions,
  method = arg5,
  repeats = as.numeric(arg6),
  number = as.numeric(arg7),
  verbose = FALSE
)
rm(rfeFunctions)
133
134
135
136
# Run recursive feature elimination on the normalized predictors and
# save the fitted object to the second command-line argument.
Profile <- rfe(dx, dy, sizes = subsets, rfeControl = ctrl1)

pred11 <- predictors(Profile)
save(Profile, file = arg2)

# Save the selected predictors, taken from `rawData` (not the normalized
# `dx`), together with the outcome. drop = FALSE keeps `dataX` a data frame
# even when a single predictor is selected.
dataX <- rawData[, pred11, drop = FALSE]
dataY <- rawData$outcome

save(dataX, dataY, file = arg3)
rm(dataX, dataY)
147