# featureselect/feature_selection (copy).R
# Source: Mercurial annotate view at revision 0:a4a2ad5a214e (draft, default, tip).
# Changeset message: "Uploaded"; author: deepakjadmin;
# date: Thu, 05 Nov 2015 02:37:56 -0500.
# Command-line interface. All seven positional arguments are required:
#   arg1  path passed to load(); the file is expected to provide dataX and dataY
#   arg2  output path for the saved rfe result object (Profile)
#   arg3  output path for the reduced dataX / dataY pair
#   arg4  name of the caret function set: "lmFuncs", "rfFuncs" or
#         "treebagFuncs"; any other value selects nbFuncs
#   arg5  resampling method handed to rfeControl(method = )
#   arg6  value for rfeControl(repeats = ), converted with as.numeric()
#   arg7  value for rfeControl(number = ), converted with as.numeric()
args <- commandArgs(TRUE)

# Fail early with a clear message; otherwise a missing argument becomes NA
# and only surfaces later as an unrelated error.
if (length(args) < 7) {
  stop("expected 7 arguments: <input.RData> <profile.out> <data.out> ",
       "<functions> <method> <repeats> <number>; got ", length(args),
       call. = FALSE)
}

arg1 <- args[1]
arg2 <- args[2]
arg3 <- args[3]
arg4 <- args[4]
arg5 <- args[5]
arg6 <- args[6]
arg7 <- args[7]

library(caret)

# Restore the objects stored in the input file. The code below relies on the
# file defining `dataX` (predictor data frame) and `dataY` (outcome).
load(arg1)

# Working copy that the filtering steps below shrink step by step.
rawData <- dataX
predictorNames <- names(dataX)

# Untouched copy of the input with the outcome attached; the final reduced
# data set is cut from this copy once the predictors have been selected.
RAWDATA <- dataX
RAWDATA$outcome <- dataY

# Every predictor has to be numeric. The check runs column by column on the
# data frame itself: apply() would first coerce the whole frame to a single
# matrix type, so it could not tell individual columns apart.
isNum <- vapply(rawData[, predictorNames, drop = FALSE], is.numeric, logical(1))
if (!all(isNum)) stop("all predictors in rawData should be numeric")

# Remove predictors, then samples, whose share of missing values exceeds
# the threshold (0.1 %).
naThreshold <- 0.001

# Fraction of NA values per predictor column.
colRate <- vapply(rawData[, predictorNames, drop = FALSE],
                  function(x) mean(is.na(x)), numeric(1))
colExclude <- colRate > naThreshold
if (any(colExclude)) {
  predictorNames <- predictorNames[!colExclude]
  # drop = FALSE keeps a data frame even when a single predictor survives.
  rawData <- rawData[, predictorNames, drop = FALSE]
}

if (length(predictorNames) == 0) {
  stop("no predictors left after removing columns with missing values")
}

# Fraction of NA values per sample row. This is computed unconditionally:
# previously it was only assigned when a column had been excluded, so clean
# input failed with "object 'rowRate' not found".
rowRate <- rowMeans(is.na(rawData[, predictorNames, drop = FALSE]))
rowExclude <- rowRate > naThreshold
if (any(rowExclude)) {
  rawData <- rawData[!rowExclude, , drop = FALSE]
} else {
  # No row is above the threshold; drop any row that still contains an NA.
  rawData <- rawData[complete.cases(rawData), , drop = FALSE]
}

# Fixed seed so the later resampling steps are reproducible.
set.seed(2)

# Drop near-zero-variance predictors flagged by caret.
nzv <- nearZeroVar(rawData)
if (length(nzv) > 0) {
  # drop = FALSE keeps a data frame even when a single column remains.
  rawData <- rawData[, -nzv, drop = FALSE]
}

# Attach the outcome. Rows may have been removed from rawData by the
# missing-value filtering, so dataY is aligned through the row names that
# rawData inherited from dataX rather than assigned at full length.
# NOTE(review): assumes dataY is a vector or factor with one entry per row
# of dataX - confirm against the producer of the input file.
keptRows <- match(rownames(rawData), rownames(dataX))
rawData$outcome <- dataY[keptRows]

predictorNames <- names(rawData)[names(rawData) != "outcome"]

# Split into predictors and outcome by name. The previous positional form
# `1:length(rawData)-1` parsed as `(1:n) - 1` and only worked because a
# zero index is silently ignored.
dx <- rawData[, predictorNames, drop = FALSE]
dy <- rawData[["outcome"]]

# Remove predictors that are highly correlated with another predictor.
corrThresh <- 0.90
highCorr <- integer(0)
if (ncol(dx) > 1) {
  # A correlation filter needs at least two predictors to compare.
  highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),
                              corrThresh)
}
if (length(highCorr) > 0) {
  # Guarded on purpose: with an empty index, dx[, -highCorr] would select
  # zero columns and discard every predictor.
  dx <- dx[, -highCorr, drop = FALSE]
}

# Candidate subset sizes for rfe: 1, 3, 5, ... up to the predictor count.
subsets <- seq(1, length(dx), by = 2)

# Apply caret's default preProcess transformation to the predictors.
# NOTE(review): the defaults are presumably centering and scaling - confirm
# for the installed caret version.
normalization <- preProcess(dx)
dx <- predict(normalization, dx)
dx <- as.data.frame(dx)

# Choose the caret helper-function set named by arg4; any value other than
# the three listed here falls back to nbFuncs.
rankingFuncs <-
  if (arg4 == "lmFuncs") {
    lmFuncs
  } else if (arg4 == "rfFuncs") {
    rfFuncs
  } else if (arg4 == "treebagFuncs") {
    treebagFuncs
  } else {
    nbFuncs
  }

# The resampling settings are the same for every function set, so the
# control object is built once from the chosen set.
ctrl1 <- rfeControl(
  functions = rankingFuncs,
  method = arg5,
  repeats = as.numeric(arg6),
  number = as.numeric(arg7),
  verbose = FALSE
)

# Run recursive feature elimination over the candidate subset sizes.
Profile <- rfe(dx, dy,
               sizes = subsets,
               rfeControl = ctrl1)

# Names of the predictors retained by the selected model.
pred11 <- predictors(Profile)
save(Profile, file = arg2)

# Cut the reduced data set from the untouched input copy. drop = FALSE keeps
# a data frame even when only one predictor was selected; without it the
# subset collapsed to a plain vector and the following steps failed.
rawData <- RAWDATA[, pred11, drop = FALSE]
rawData$outcome <- RAWDATA$outcome

# Select by name instead of the positional `1:length(rawData)-1`, which
# parsed as `(1:n) - 1` and only worked because index 0 is ignored.
dataX <- rawData[, pred11, drop = FALSE]
dataY <- rawData[["outcome"]]
save(dataX, dataY, file = arg3)
rm(dataX, dataY)