comparison featureselect/feature_selection.R @ 0:a4a2ad5a214e draft default tip

Uploaded
author deepakjadmin
date Thu, 05 Nov 2015 02:37:56 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a4a2ad5a214e
# ---- Command-line arguments ----
# Positional arguments, as they are used further down in this script:
#   1: path of the .RData file passed to load()
#   2: output path for the saved RFE `Profile` object
#   3: output path for the reduced `dataX` / `dataY` .RData file
#   4: name of the caret RFE function set ("lmFuncs", "rfFuncs",
#      "treebagFuncs"; anything else selects nbFuncs)
#   5: value for rfeControl(method = ...)
#   6: value for rfeControl(repeats = ...), converted with as.numeric()
#   7: value for rfeControl(number = ...), converted with as.numeric()
args <- commandArgs(trailingOnly = TRUE)

# Fail early with a readable message instead of propagating NA values into
# the caret calls below.
if (length(args) < 7L) {
  stop(
    "expected 7 command-line arguments but received ", length(args),
    call. = FALSE
  )
}

arg1 <- args[1]
arg2 <- args[2]
arg3 <- args[3]
arg4 <- args[4]
arg5 <- args[5]
arg6 <- args[6]
arg7 <- args[7]
10
11 library(caret)
# ---- Load input data ----
# The input workspace is expected to define `dataX` (predictor data frame)
# and `dataY` (outcome); both are used directly below.
load(arg1)
if (!exists("dataX") || !exists("dataY")) {
  stop("input file must define 'dataX' and 'dataY': ", arg1, call. = FALSE)
}
print("data loaded")

# RAWDATA keeps an unfiltered copy (predictors plus outcome column); rawData
# is the working copy that the filtering steps below modify.
RAWDATA <- dataX
RAWDATA$outcome <- dataY
rawData <- dataX
predictorNames <- names(rawData)

# Check each column on its own. apply() would first coerce the data frame to
# a matrix, so a single non-numeric column would change the type of all of
# them and the per-column result would be meaningless.
isNum <- vapply(
  rawData[, predictorNames, drop = FALSE],
  is.numeric,
  logical(1)
)
if (any(!isNum)) stop("all predictors in rawData should be numeric")
21
# ---- Missing-value handling ----
# Columns with more than 0.1% missing values are dropped first; afterwards
# any row that still has a missing value in the retained columns is dropped
# (the row threshold is small enough to act as "any NA").
maxColNaRate <- 0.001
maxRowNaRate <- 0.00000001

colRate <- colMeans(is.na(rawData[, predictorNames, drop = FALSE]))
colExclude <- colRate > maxColNaRate
if (any(colExclude)) {
  predictorNames <- predictorNames[!colExclude]
  rawData <- rawData[, predictorNames, drop = FALSE]
}

# Computed unconditionally: it used to be assigned only when a column had
# been excluded, which left `rowRate` undefined for every other input.
rowRate <- rowMeans(is.na(rawData[, predictorNames, drop = FALSE]))
rowExclude <- rowRate > maxRowNaRate

if (any(rowExclude)) {
  rawData <- rawData[!rowExclude, , drop = FALSE]

  # Keep the outcome aligned with the surviving rows.
  # Assumes dataY has one element per row of dataX, in the same order.
  if (is.factor(dataY)) {
    # Rebuilding the factor from the surviving labels drops emptied levels.
    dataY <- factor(as.vector(dataY)[!rowExclude])
  } else {
    dataY <- as.vector(dataY)[!rowExclude]
  }
} else {
  # No row needs to be removed because of the retained predictors; fall back
  # to the complete cases of the unfiltered copy (predictors plus outcome).
  # NOTE(review): RAWDATA still contains any column excluded above, so those
  # columns re-enter the data on this path -- confirm this is intended.
  rawData <- RAWDATA[complete.cases(RAWDATA), , drop = FALSE]
  outcomeIdx <- ncol(rawData)
  dataX <- rawData[, seq_len(outcomeIdx - 1L), drop = FALSE]
  dataY <- rawData[, outcomeIdx]

  print(dim(dataX))
  print(dim(rawData))
  rawData <- dataX

  print(dim(rawData))
}
71
# ---- Near-zero-variance filter ----
set.seed(2)

# Diagnostic output of the data dimensions before filtering.
print(dim(dataX))
print(dim(rawData))
print(length(dataY))

# Checkpoint of the cleaned data, written to the working directory.
save(rawData, dataY, file = "check.RData")

nzv <- nearZeroVar(rawData)
if (length(nzv) > 0) {
  nzvVars <- names(rawData)[nzv]
  # drop = FALSE keeps a data frame even when a single predictor survives.
  rawData <- rawData[, -nzv, drop = FALSE]
}
if (ncol(rawData) == 0) {
  stop("no predictors left after near-zero-variance filtering", call. = FALSE)
}

# The outcome is appended as the last column; later steps rely on that
# position.
rawData$outcome <- dataY

predictorNames <- names(rawData)[names(rawData) != "outcome"]
90
# ---- Correlation filter and normalization ----
# The outcome is the last column of rawData; everything before it is a
# predictor. drop = FALSE keeps dx a data frame with a single predictor.
outcomeCol <- ncol(rawData)
dx <- rawData[, seq_len(outcomeCol - 1L), drop = FALSE]
dy <- rawData[, outcomeCol]
if (ncol(dx) == 0) {
  stop("no predictors available for feature selection", call. = FALSE)
}

corrThresh <- 0.90
highCorr <- integer(0)
if (ncol(dx) > 1L) {
  # A correlation filter needs at least two predictors.
  highCorr <- findCorrelation(
    cor(dx, use = "pairwise.complete.obs"),
    corrThresh
  )
}
# Negative indexing with an empty vector selects zero columns, so only drop
# columns when something was actually flagged.
if (length(highCorr) > 0) {
  dx <- dx[, -highCorr, drop = FALSE]
}

# Candidate subset sizes for RFE: 1, 6, 11, ... up to the predictor count.
subsets <- seq(1, length(dx), by = 5)

# preProcess() with its default settings, applied to the predictors.
normalization <- preProcess(dx)
dx <- predict(normalization, dx)
dx <- as.data.frame(dx)
100
# ---- RFE control object ----
# arg4 names the caret helper-function set; any value other than the three
# listed here selects nbFuncs. The remaining settings are the same for every
# function set.
ctrl1 <- rfeControl(
  functions = if (arg4 == "lmFuncs") {
    lmFuncs
  } else if (arg4 == "rfFuncs") {
    rfFuncs
  } else if (arg4 == "treebagFuncs") {
    treebagFuncs
  } else {
    nbFuncs
  },
  method = arg5,
  repeats = as.numeric(arg6),
  number = as.numeric(arg7),
  verbose = FALSE
)
127
128
129
130
# ---- Run RFE and write results ----
Profile <- rfe(
  dx, dy,
  sizes = subsets,
  rfeControl = ctrl1
)

pred11 <- predictors(Profile)
save(Profile, file = arg2)

# Reduce the unfiltered copy to the selected predictors. drop = FALSE keeps a
# data frame when only one predictor was selected; without it the result is a
# bare vector and the following steps fail.
dataX <- RAWDATA[, pred11, drop = FALSE]
dataY <- RAWDATA$outcome

# rawData mirrors the saved data: selected predictors plus outcome last.
rawData <- dataX
rawData$outcome <- dataY

save(dataX, dataY, file = arg3)
rm(dataX, dataY)
144