Mercurial > repos > deepakjadmin > r_caret_test1
comparison featureselect/feature_selection.R @ 0:a4a2ad5a214e draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Thu, 05 Nov 2015 02:37:56 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a4a2ad5a214e |
---|---|
# Command-line interface: seven positional arguments, all required.
#   1  path of the .RData file to load (read by load() further down)
#   2  output path for the saved rfe `Profile` object
#   3  output path for the .RData file holding the reduced dataX / dataY
#   4  caret rfe function set: "lmFuncs", "rfFuncs" or "treebagFuncs";
#      any other value selects nbFuncs
#   5  passed to rfeControl(method = )
#   6  passed to rfeControl(repeats = ) after as.numeric()
#   7  passed to rfeControl(number = ) after as.numeric()
# `trailingOnly = TRUE` is spelled out: `T` is an ordinary variable that can
# be reassigned, and the argument was previously matched by position only.
args <- commandArgs(trailingOnly = TRUE)

# Fail early with a clear message instead of letting NA arguments surface
# later as obscure errors in load(), save() or rfeControl().
if (length(args) < 7) {
  stop("expected 7 command-line arguments, got ", length(args),
       call. = FALSE)
}

arg1 <- args[1]
arg2 <- args[2]
arg3 <- args[3]
arg4 <- args[4]
arg5 <- args[5]
arg6 <- args[6]
arg7 <- args[7]
10 | |
11 library(caret) | |
# Restore the input objects from the .RData file named by the first argument.
# `dataX` and `dataY` are not created anywhere else in this script, so they
# are expected to come from that file.
# NOTE(review): load() restores every object stored in the file into the
# calling environment, so a file that also contains e.g. `arg2` would
# overwrite the value parsed from the command line -- confirm inputs are
# trusted.
load(arg1)
print("data loaded")

# Working copy of the predictors; this is what the filtering steps modify.
rawData <- dataX
predictorNames <- names(rawData)

# Untouched copy of the predictors with the outcome appended as the last
# column; used again at the end to build the saved output.
RAWDATA <- dataX
RAWDATA$outcome <- dataY

# Abort unless every predictor column is reported as numeric.
isNum <- apply(rawData[, predictorNames, drop = FALSE],
               MARGIN = 2, FUN = is.numeric)
if (!all(isNum)) {
  stop("all predictors in rawData should be numeric")
}
21 | |
# ---- Missing-value filtering ----

# Share of missing values per predictor; predictors with more than 0.1%
# missing values are dropped.
colRate <- colMeans(is.na(rawData[, predictorNames, drop = FALSE]))
colExclude <- colRate > 0.001
if (any(colExclude)) {
  predictorNames <- predictorNames[!colExclude]
  # drop = FALSE keeps a data frame even when a single predictor remains.
  rawData <- rawData[, predictorNames, drop = FALSE]
}

# Share of missing values per row among the remaining predictors.
# This must be computed unconditionally: it used to be assigned only inside
# the `if` above, so the script stopped with "object 'rowRate' not found"
# whenever no column had to be excluded.
rowRate <- rowMeans(is.na(rawData[, predictorNames, drop = FALSE]))

# The threshold is effectively zero: any row with a missing predictor goes.
rowExclude <- rowRate > 0.00000001
if (any(rowExclude)) {
  rawData <- rawData[!rowExclude, , drop = FALSE]

  # Keep only the outcome values belonging to the surviving rows. The outcome
  # is laid out as a one-row matrix whose columns carry the row names of
  # dataX, so the kept values can be selected by row name.
  outcomeIsFactor <- is.factor(dataY)
  if (outcomeIsFactor) {
    dataY <- as.vector(dataY)
  }
  dataY <- t(dataY)
  colnames(dataY) <- rownames(dataX)
  names11 <- rownames(rawData)
  dataY <- dataY[, names11]
  if (outcomeIsFactor) {
    # After the subsetting above dataY is a plain named vector, so the names
    # are stripped with unname(); the former `colnames(dataY) <- NULL` fails
    # on an object without dimensions.
    dataY <- as.factor(unname(dataY))
  }
} else {
  # NOTE(review): this branch rebuilds everything from RAWDATA, so predictors
  # excluded by the column filter above are back in dataX / rawData, and rows
  # with a missing value in ANY original column (outcome included) are
  # dropped -- confirm that this is intended.
  rawData <- RAWDATA[complete.cases(RAWDATA), , drop = FALSE]
  # The outcome is the last column of RAWDATA; everything before it is a
  # predictor. (Previously `1:lenght(rawData)-1`: a misspelled length() that
  # made this branch fail, plus a `:` / `-` precedence accident.)
  dataX <- rawData[, seq_len(length(rawData) - 1), drop = FALSE]
  dataY <- rawData[, length(rawData)]

  print(dim(dataX))
  print(dim(rawData))
  rawData <- dataX

  print(dim(rawData))
}
71 | |
# Fixed seed for the random-number-dependent steps that follow.
set.seed(2)

# Diagnostic output: dimensions after missing-value filtering.
print(dim(dataX))
print(dim(rawData))
print(length(dataY))

# Snapshot of the filtered data, written to the working directory.
save(rawData, dataY, file = "check.RData")

# ---- Near-zero-variance filtering ----
# nearZeroVar() returns the column positions flagged for removal.
nzv <- nearZeroVar(rawData)
if (length(nzv) > 0) {
  nzvVars <- names(rawData)[nzv]
  # drop = FALSE: without it a single surviving predictor collapses to a
  # plain vector and the `$outcome` assignment below no longer yields a
  # data frame.
  rawData <- rawData[, -nzv, drop = FALSE]
}

# Append the outcome as the last column (the same in both cases above).
rawData$outcome <- dataY
88 | |
# ---- Correlation filtering and normalisation ----

predictorNames <- names(rawData)[names(rawData) != "outcome"]

# Split predictors and outcome by name rather than by column position.
dx <- rawData[, predictorNames, drop = FALSE]
dy <- rawData$outcome

# Column positions that findCorrelation() recommends removing at the
# 0.90 cutoff.
corrThresh <- 0.90
highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),
                            cutoff = corrThresh)

# Only subset when something was flagged: `dx[, -integer(0)]` selects ZERO
# columns, so an empty result used to discard every predictor.
if (length(highCorr) > 0) {
  dx <- dx[, -highCorr, drop = FALSE]
}
if (length(dx) == 0) {
  stop("no predictors left after filtering", call. = FALSE)
}

# Candidate subset sizes for rfe: 1, 6, 11, ... up to the predictor count.
subsets <- seq(1, length(dx), by = 5)

# Apply preProcess() with its default settings to the predictors.
normalization <- preProcess(dx)
dx <- predict(normalization, dx)
dx <- as.data.frame(dx)
100 | |
# Pick the caret rfe helper-function set named by the fourth argument.
# Anything other than the three names tested below falls through to nbFuncs.
rfeFunctions <- if (arg4 == "lmFuncs") {
  lmFuncs
} else if (arg4 == "rfFuncs") {
  rfFuncs
} else if (arg4 == "treebagFuncs") {
  treebagFuncs
} else {
  nbFuncs
}

# Control object for rfe(); method, repeats and number come from the
# command line (arguments 5, 6 and 7).
ctrl1 <- rfeControl(functions = rfeFunctions,
                    method = arg5,
                    repeats = as.numeric(arg6),
                    number = as.numeric(arg7),
                    verbose = FALSE)
127 | |
128 | |
129 | |
130 | |
# ---- Recursive feature elimination and output ----

# Run rfe over the candidate subset sizes with the control object built above.
Profile <- rfe(dx, dy,
               sizes = subsets,
               rfeControl = ctrl1)

# Names of the predictors retained by rfe.
pred11 <- predictors(Profile)
save(Profile, file = arg2)

# Rebuild the output from the original, unfiltered RAWDATA, restricted to the
# selected predictors.
# drop = FALSE keeps a data frame when exactly one predictor is selected;
# without it the subset collapses to a vector and the `$outcome` assignment
# and the column subsetting below fail.
rawData <- RAWDATA[, pred11, drop = FALSE]
rawData$outcome <- RAWDATA$outcome
dataX <- rawData[, seq_len(length(rawData) - 1), drop = FALSE]
dataY <- rawData[, length(rawData)]
save(dataX, dataY, file = arg3)
rm(dataX, dataY)
144 |