Mercurial > repos > deepakjadmin > feature_selection_test1
comparison feature_selection.R @ 5:016c69bfb2a1 draft
Uploaded
author | deepakjadmin |
---|---|
date | Tue, 03 Jan 2017 02:26:17 -0500 |
parents | |
children | d56ce97c4985 |
comparison
equal
deleted
inserted
replaced
4:5364cf43a8c1 | 5:016c69bfb2a1 |
---|---|
# Command-line interface (positional arguments, as used further down):
#   1  path of an .RData file; the script reads `dataX` and `dataY` from it
#   2  output path for the saved rfe profile
#   3  output path for the reduced `dataX` / `dataY`
#   4  caret rfe helper set: "lmFuncs", "rfFuncs", "treebagFuncs";
#      any other value selects nbFuncs
#   5  resampling method handed to rfeControl()
#   6  `repeats` for rfeControl()
#   7  `number` for rfeControl()
#   8  correlation cutoff for findCorrelation()
#   9  "downsampling", "upsampling", or anything else for no re-balancing
args <- commandArgs(trailingOnly = TRUE)

# Fail early with a clear message instead of propagating NA arguments.
if (length(args) < 9) {
  stop("expected 9 command-line arguments, got ", length(args), call. = FALSE)
}

arg1 <- args[1]
arg2 <- args[2]
arg3 <- args[3]
arg4 <- args[4]
arg5 <- args[5]
arg6 <- args[6]
arg7 <- args[7]
arg8 <- args[8]
arg9 <- args[9]
library(caret)
load(arg1)

# Everything below relies on these two objects coming from the loaded file.
if (!exists("dataX") || !exists("dataY")) {
  stop("the file given as argument 1 must define 'dataX' and 'dataY'",
       call. = FALSE)
}
14 | |
15 #RAWDATA <- dataX | |
16 #RAWDATA$outcome <- dataY | |
17 | |
18 | |
19 ########################### | |
# Optional class re-balancing, selected by the ninth command-line argument.
# In every case the section leaves behind:
#   RAWDATA  predictors plus a trailing `outcome` column
#   dataX    predictors only
#   dataY    class labels
Smpling <- arg9

if (Smpling == "downsampling" || Smpling == "upsampling") {
  balanced <- if (Smpling == "downsampling") {
    downSample(dataX, dataY)
  } else {
    upSample(dataX, dataY)
  }
  # The last column of the resampled frame is taken as the class labels and
  # everything before it as predictors. seq_len() avoids the
  # `1:length(x)-1` precedence trap, and drop = FALSE keeps a data frame
  # even when there is only one predictor.
  nPredictors <- ncol(balanced) - 1
  RAWDATA <- balanced[, seq_len(nPredictors), drop = FALSE]
  RAWDATA$outcome <- balanced[[ncol(balanced)]]
  dataX <- RAWDATA[, seq_len(nPredictors), drop = FALSE]
  dataY <- RAWDATA[["outcome"]]
  rm(balanced, nPredictors)
} else {
  # No re-balancing: just attach the labels to the predictors.
  RAWDATA <- dataX
  RAWDATA$outcome <- dataY
}
41 | |
42 | |
43 | |
44 | |
45 ########################## | |
46 | |
47 | |
# Type check and missing-value handling.
# Result: `rawData` holds the retained predictors plus `outcome`, with no
# missing values; `predictorNames` lists the retained predictor columns.
rawData <- dataX
predictorNames <- names(rawData)

# Check each column on its own. apply() would first coerce the whole frame
# to a matrix, turning the test into an all-or-nothing answer.
isNum <- vapply(rawData[, predictorNames, drop = FALSE], is.numeric, logical(1))
if (any(!isNum)) stop("all predictors in rawData should be numeric")

# Drop predictors with more than 10% missing values.
colRate <- colMeans(is.na(rawData[, predictorNames, drop = FALSE]))
colExclude <- colRate > 0.1
if (any(colExclude)) {
  predictorNames <- predictorNames[!colExclude]
  rawData <- RAWDATA[, c(predictorNames, "outcome"), drop = FALSE]
} else {
  rawData <- RAWDATA
}

# Keep only rows that are complete in the retained predictors and in the
# outcome. Previously rows with a missing outcome were removed only when no
# predictor row had a missing value; the rule is now the same in both cases.
rawData <- rawData[complete.cases(rawData), , drop = FALSE]
81 | |
# Predictor filtering and normalisation ahead of recursive feature
# elimination. Produces:
#   rawData  with near-zero-variance predictors removed
#   dx, dy   filtered + normalised predictors and the outcome vector
#   subsets  candidate subset sizes for rfe()
set.seed(2)

# Remove near-zero-variance predictors. nearZeroVar() returns positions
# within the predictor subset, so they are mapped back to column names.
predictorNames <- names(rawData)[names(rawData) != "outcome"]
nzv <- nearZeroVar(rawData[, predictorNames, drop = FALSE])
if (length(nzv) > 0) {
  nzvNames <- predictorNames[nzv]
  rawData <- rawData[, !(names(rawData) %in% nzvNames), drop = FALSE]
}

predictorNames <- names(rawData)[names(rawData) != "outcome"]

dx <- rawData[, predictorNames, drop = FALSE]
dy <- rawData[["outcome"]]

# Remove highly correlated predictors. The removal is guarded because
# `dx[, -integer(0)]` would select zero columns when nothing is flagged.
corrThresh <- as.numeric(arg8)
highCorr <- integer(0)
if (ncol(dx) > 1) {
  highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),
                              corrThresh)
}
if (length(highCorr) > 0) {
  dx <- dx[, -highCorr, drop = FALSE]
}

if (ncol(dx) == 0) {
  stop("no predictors left after near-zero-variance and correlation filtering")
}

# Candidate subset sizes: 1, 6, 11, ... up to the number of predictors.
subsets <- seq(1, ncol(dx), by = 5)

# preProcess() with its default settings, applied to the predictors.
normalization <- preProcess(dx)
dx <- predict(normalization, dx)
dx <- as.data.frame(dx)
106 | |
# Build the rfe control object. The fourth command-line argument picks the
# caret helper-function set; any value other than the three named ones
# falls back to nbFuncs. All remaining settings are shared.
rfeFunctions <- if (arg4 == "lmFuncs") {
  lmFuncs
} else if (arg4 == "rfFuncs") {
  rfFuncs
} else if (arg4 == "treebagFuncs") {
  treebagFuncs
} else {
  nbFuncs
}

ctrl1 <- rfeControl(
  functions = rfeFunctions,
  method = arg5,
  repeats = as.numeric(arg6),
  number = as.numeric(arg7),
  verbose = FALSE
)
133 | |
134 | |
135 | |
136 | |
# Run recursive feature elimination on the normalised predictors and write
# the two outputs:
#   arg2  the fitted rfe object (`Profile`)
#   arg3  `dataX` (selected predictor columns) and `dataY` (outcome)
Profile <- rfe(dx, dy, sizes = subsets, rfeControl = ctrl1)

pred11 <- predictors(Profile)
save(Profile, file = arg2)

# The selected columns are taken from `rawData`, i.e. the values before
# normalisation. drop = FALSE keeps `dataX` a data frame even when only a
# single predictor was selected.
dataX <- rawData[, pred11, drop = FALSE]
dataY <- rawData$outcome

save(dataX, dataY, file = arg3)
rm(dataX)
rm(dataY)
147 |