comparison featureselect/feature_selection (copy).R @ 0:68300206e90d draft default tip

Uploaded
author deepakjadmin
date Thu, 05 Nov 2015 02:41:30 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:68300206e90d
# Positional command-line arguments (trailing arguments only).
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change what commandArgs() returns.
args <- commandArgs(trailingOnly = TRUE)

arg1 <- args[1]  # .RData file passed to load(); must define dataX and dataY
arg2 <- args[2]  # output file for the fitted rfe object (Profile)
arg3 <- args[3]  # output file for the reduced dataX / dataY
arg4 <- args[4]  # name of the caret function set (lmFuncs, rfFuncs, ...)
arg5 <- args[5]  # resampling method forwarded to rfeControl(method = )
arg6 <- args[6]  # forwarded to rfeControl(repeats = ) after as.numeric()
arg7 <- args[7]  # forwarded to rfeControl(number = ) after as.numeric()
10
11 library(caret)
# Load the input workspace; the rest of the script relies on it defining
# `dataX` (predictor data frame) and `dataY` (outcome).
load(arg1)

# Keep an untouched copy of the input (predictors plus outcome) so the final
# output can be cut from the original, unfiltered data.
RAWDATA <- dataX
RAWDATA$outcome <- dataY

# Working copy that the filtering steps below modify.
rawData <- dataX
predictorNames <- names(rawData)

# Check each column individually. apply() on a data frame would first coerce
# it to a single-typed matrix, so it cannot report per-column types.
isNum <- vapply(rawData[, predictorNames, drop = FALSE], is.numeric, logical(1))
if (any(!isNum)) stop("all predictors in rawData should be numeric")
20
# ---- Missing-value filtering ------------------------------------------------

# Drop predictors whose fraction of missing values exceeds 0.1%.
colRate <- colMeans(is.na(rawData[, predictorNames, drop = FALSE]))
colExclude <- colRate > 0.001
if (any(colExclude)) {
  predictorNames <- predictorNames[!colExclude]
  # drop = FALSE keeps a data frame even when a single predictor survives.
  rawData <- rawData[, predictorNames, drop = FALSE]
}

# The per-row missing rate must be computed whether or not any column was
# removed; otherwise `rowRate` is undefined when no column is excluded.
rowRate <- rowMeans(is.na(rawData[, predictorNames, drop = FALSE]))
rowExclude <- rowRate > 0.001

if (any(rowExclude)) {
  # Remove rows above the missing-rate threshold.
  keepRows <- !rowExclude
  rawData <- rawData[keepRows, , drop = FALSE]
  # Logical flag per remaining row: does it still contain any NA?
  hasMissing <- rowSums(is.na(rawData[, predictorNames, drop = FALSE])) > 0
} else {
  # No row is above the threshold: flag rows with any NA, then keep only
  # complete rows.
  hasMissing <- rowSums(is.na(rawData[, predictorNames, drop = FALSE])) > 0
  keepRows <- complete.cases(rawData)
  rawData <- rawData[keepRows, , drop = FALSE]
}

# Keep the outcome aligned with the surviving rows. Without this, attaching
# dataY to rawData later fails with a length mismatch as soon as a row is
# dropped. RAWDATA already holds the full, unfiltered outcome.
if (!all(keepRows)) {
  dataY <- dataY[keepRows]
}
42
# Fixed seed so the steps that follow are reproducible between runs.
set.seed(2)

# ---- Near-zero-variance filter ----------------------------------------------
nzv <- nearZeroVar(rawData)
if (length(nzv) > 0) {
  nzvVars <- names(rawData)[nzv]
  # drop = FALSE: keep a data frame even if only one column remains.
  rawData <- rawData[, -nzv, drop = FALSE]
}
rawData$outcome <- dataY

predictorNames <- names(rawData)[names(rawData) != "outcome"]

# Split predictors and outcome by name. The previous `1:length(rawData)-1`
# parsed as `(1:n) - 1` and only worked because index 0 is silently ignored.
dx <- rawData[, predictorNames, drop = FALSE]
dy <- rawData[["outcome"]]

# ---- Correlation filter -----------------------------------------------------
corrThresh <- 0.90
highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"), corrThresh)
# findCorrelation() returns integer(0) when nothing exceeds the threshold;
# negative indexing with an empty vector would remove *every* column.
if (length(highCorr) > 0) {
  dx <- dx[, -highCorr, drop = FALSE]
}

# Candidate subset sizes for rfe(): 1, 3, 5, ... up to the predictor count.
subsets <- seq(1, length(dx), by = 2)

# ---- Normalisation ----------------------------------------------------------
# preProcess() is called with its default settings and applied to dx.
normalization <- preProcess(dx)
dx <- predict(normalization, dx)
dx <- as.data.frame(dx)
66
# Build the rfe control object. arg4 names the caret function set to use;
# any value other than "lmFuncs", "rfFuncs" or "treebagFuncs" falls back to
# nbFuncs. The resampling settings come from arg5, arg6 and arg7.
ctrl1 <- rfeControl(
  functions = if (arg4 == "lmFuncs") {
    lmFuncs
  } else if (arg4 == "rfFuncs") {
    rfFuncs
  } else if (arg4 == "treebagFuncs") {
    treebagFuncs
  } else {
    nbFuncs
  },
  method = arg5,
  repeats = as.numeric(arg6),
  number = as.numeric(arg7),
  verbose = FALSE
)
93
94
95
96
# Run recursive feature elimination over the candidate subset sizes.
Profile <- rfe(dx, dy,
               sizes = subsets,
               rfeControl = ctrl1)

# Names of the predictors retained by the selected model.
pred11 <- predictors(Profile)
save(Profile, file = arg2)

# Cut the selected predictors out of the original input (RAWDATA), not the
# filtered and normalised working copy. drop = FALSE keeps a data frame when
# only one predictor is selected; without it the result collapses to a
# vector and the `$outcome` assignment and column indexing below break.
rawData <- RAWDATA[, pred11, drop = FALSE]
rawData$outcome <- RAWDATA$outcome

# Select by name rather than by `1:length(rawData)-1`, which parses as
# `(1:n) - 1` and only worked because index 0 is ignored.
dataX <- rawData[, pred11, drop = FALSE]
dataY <- rawData[["outcome"]]
save(dataX, dataY, file = arg3)
rm(dataX)
rm(dataY)
110