Mercurial > repos > deepakjadmin > r_caret_test
comparison featureselect/feature_selection (copy).R @ 0:68300206e90d draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Thu, 05 Nov 2015 02:41:30 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:68300206e90d |
---|---|
# Recursive feature elimination (RFE) driver built on caret.
#
# Positional command-line arguments:
#   1  path to an .RData file; it must define `dataX` (predictors, used here
#      as a data frame) and `dataY` (outcome, one value per row of dataX)
#   2  output path for the fitted rfe object (`Profile`)
#   3  output path for the reduced `dataX` / `dataY`
#   4  caret helper set: "lmFuncs", "rfFuncs" or "treebagFuncs"; any other
#      value falls back to nbFuncs
#   5  resampling method, forwarded to rfeControl(method = )
#   6  forwarded to rfeControl(repeats = ) after as.numeric()
#   7  forwarded to rfeControl(number = ) after as.numeric()
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 7) {
  stop("expected 7 command-line arguments, got ", length(args), call. = FALSE)
}

arg1 <- args[1]
arg2 <- args[2]
arg3 <- args[3]
arg4 <- args[4]
arg5 <- args[5]
arg6 <- args[6]
arg7 <- args[7]

library(caret)

# NOTE(review): load() restores arbitrary objects into this environment;
# only point arg1 at trusted files.
load(arg1)
if (!exists("dataX") || !exists("dataY")) {
  stop("input file must define 'dataX' and 'dataY'", call. = FALSE)
}

# Untouched copy of the input; the final output is subset from this, so it
# keeps every original row.
RAWDATA <- dataX
RAWDATA$outcome <- dataY

rawData <- dataX
predictorNames <- names(rawData)

# Column-wise type check. Logical columns are accepted alongside numeric ones
# because the previous matrix-based check let them through as well.
isNum <- vapply(
  rawData[, predictorNames, drop = FALSE],
  function(col) is.numeric(col) || is.logical(col),
  logical(1)
)
if (any(!isNum)) stop("all predictors in rawData should be numeric")

# Drop predictors whose fraction of missing values exceeds 0.1%.
colRate <- colMeans(is.na(rawData[, predictorNames, drop = FALSE]))
colExclude <- colRate > 0.001
if (any(colExclude)) {
  predictorNames <- predictorNames[!colExclude]
  rawData <- rawData[, predictorNames, drop = FALSE]
}

# The row-wise missing rate must be computed whether or not a column was
# dropped above; it is read unconditionally just below.
rowRate <- rowMeans(is.na(rawData[, predictorNames, drop = FALSE]))
rowExclude <- rowRate > 0.001

# Row filter, same two cases as before: if any row exceeds the threshold,
# drop exactly those rows; otherwise drop every row that has any NA.
if (any(rowExclude)) {
  keepRows <- !rowExclude
} else {
  keepRows <- complete.cases(rawData)
}
rawData <- rawData[keepRows, , drop = FALSE]
# Keep the outcome aligned with the surviving predictor rows.
outcome <- dataY[keepRows]

set.seed(2)

# Remove near-zero-variance predictors.
nzv <- nearZeroVar(rawData)
if (length(nzv) > 0) {
  rawData <- rawData[, -nzv, drop = FALSE]
}

predictorNames <- names(rawData)

dx <- rawData
dy <- outcome

# Remove highly correlated predictors. The guard matters: with an empty
# `highCorr`, `dx[, -highCorr]` would select zero columns instead of all.
corrThresh <- 0.90
highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"), corrThresh)
if (length(highCorr) > 0) {
  dx <- dx[, -highCorr, drop = FALSE]
}
if (ncol(dx) == 0) {
  stop("no predictors left after filtering", call. = FALSE)
}

# Candidate subset sizes for RFE: 1, 3, 5, ... up to the predictor count.
subsets <- seq(1, ncol(dx), by = 2)

# Apply preProcess() with its default settings to the predictors.
normalization <- preProcess(dx)
dx <- as.data.frame(predict(normalization, dx))

# Pick the caret helper-function set; unknown names fall back to nbFuncs.
rfeFuncs <- switch(arg4,
  lmFuncs = lmFuncs,
  rfFuncs = rfFuncs,
  treebagFuncs = treebagFuncs,
  nbFuncs
)

ctrl1 <- rfeControl(
  functions = rfeFuncs,
  method = arg5,
  repeats = as.numeric(arg6),
  number = as.numeric(arg7),
  verbose = FALSE
)

Profile <- rfe(dx, dy,
  sizes = subsets,
  rfeControl = ctrl1
)

pred11 <- predictors(Profile)
save(Profile, file = arg2)

# Write the selected predictors with the outcome, taken from the untouched
# input. drop = FALSE keeps dataX a data frame even with a single predictor.
dataX <- RAWDATA[, pred11, drop = FALSE]
dataY <- RAWDATA$outcome
save(dataX, dataY, file = arg3)
rm(dataX)
rm(dataY)