Mercurial > repos > deepakjadmin > r_caret_test
comparison featureselect/feature_selection.R @ 0:68300206e90d draft default tip
Uploaded
| author | deepakjadmin |
|---|---|
| date | Thu, 05 Nov 2015 02:41:30 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:68300206e90d |
|---|---|
# Recursive feature elimination with caret::rfe, driven from the command line.
#
# Positional arguments (read with commandArgs):
#   1  path to an .RData file; loading it is expected to define `dataX`
#      (predictor data frame) and `dataY` (outcome vector or factor)
#   2  output path for the fitted rfe object, saved under the name `Profile`
#   3  output path for the reduced data set, saved as `dataX` and `dataY`
#   4  rfe helper set: "lmFuncs", "rfFuncs" or "treebagFuncs";
#      any other value selects nbFuncs
#   5  resampling method, passed to rfeControl(method = )
#   6  value for rfeControl(repeats = )
#   7  value for rfeControl(number = )
#
# Side effects: writes "check.RData" in the working directory plus the two
# output files named by arguments 2 and 3.

args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 7L) {
  stop("expected 7 arguments: input.RData profile.out data.out ",
       "functions method repeats number", call. = FALSE)
}

inputFile   <- args[1]
profileFile <- args[2]
outputFile  <- args[3]
funcsName   <- args[4]
resampling  <- args[5]
nRepeats    <- as.numeric(args[6])
nResamples  <- as.numeric(args[7])

library(caret)
load(inputFile)
print("data loaded")

# Untouched copy of the input (predictors + outcome); the final output is
# cut from this copy, not from the filtered / normalised working data.
RAWDATA <- dataX
RAWDATA$outcome <- dataY
rawData <- dataX
predictorNames <- names(rawData)

# Check each column on its own: apply() would first coerce the data frame to
# a matrix and test the matrix type instead of the individual columns.
isNum <- vapply(rawData[predictorNames], is.numeric, logical(1))
if (any(!isNum)) stop("all predictors in rawData should be numeric")

# Drop predictors whose share of missing values exceeds 0.1 %.
colRate <- colMeans(is.na(rawData[, predictorNames, drop = FALSE]))
colExclude <- colRate > 0.001
if (any(colExclude)) {
  predictorNames <- predictorNames[!colExclude]
  rawData <- rawData[, predictorNames, drop = FALSE]
}

# The per-row missing rate must be computed whether or not a column was
# dropped; it is needed unconditionally just below.
rowRate <- rowMeans(is.na(rawData[, predictorNames, drop = FALSE]))
rowExclude <- rowRate > 0.00000001

if (any(rowExclude)) {
  # Remove rows that still contain missing predictors and keep dataY aligned
  # with the surviving rows by matching on the row names of dataX.
  rawData <- rawData[!rowExclude, , drop = FALSE]
  keptRows <- rownames(rawData)
  outcomeByRow <- setNames(as.vector(dataY), rownames(dataX))[keptRows]
  if (is.factor(dataY)) {
    # Rebuilding the factor from its labels drops levels that no longer occur.
    dataY <- as.factor(outcomeByRow)
  } else {
    dataY <- outcomeByRow
  }
} else {
  # No predictor is missing in any row; only keep complete cases of the
  # retained predictors plus the outcome. Restricting to `predictorNames`
  # keeps the column exclusion above in force.
  retained <- RAWDATA[, c(predictorNames, "outcome"), drop = FALSE]
  retained <- retained[complete.cases(retained), , drop = FALSE]
  dataX <- retained[, predictorNames, drop = FALSE]
  dataY <- retained[["outcome"]]

  print(dim(dataX))
  print(dim(retained))
  rawData <- dataX

  print(dim(rawData))
}

set.seed(2)

print(dim(dataX))
print(dim(rawData))
print(length(dataY))

# Snapshot of the cleaned data; object names are part of the file contents.
save(rawData, dataY, file = "check.RData")

# Remove near-zero-variance predictors, if any.
nzv <- nearZeroVar(rawData)
if (length(nzv) > 0) {
  rawData <- rawData[, -nzv, drop = FALSE]
}
rawData$outcome <- dataY

predictorNames <- names(rawData)[names(rawData) != "outcome"]

dx <- rawData[, predictorNames, drop = FALSE]
dy <- rawData[["outcome"]]

# Remove predictors flagged as highly correlated. findCorrelation() can
# return an empty vector, and negative indexing with an empty vector would
# drop every column, so only subset when something was flagged.
corrThresh <- 0.90
highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"), corrThresh)
if (length(highCorr) > 0) {
  dx <- dx[, -highCorr, drop = FALSE]
}

# Candidate subset sizes for rfe: 1, 6, 11, ... up to the predictor count.
subsets <- seq(1, length(dx), by = 5)

# preProcess() is called with its default settings.
normalization <- preProcess(dx)
dx <- as.data.frame(predict(normalization, dx))

# Pick the rfe helper functions; unrecognised names fall back to nbFuncs.
rfeFunctions <- switch(funcsName,
  lmFuncs      = lmFuncs,
  rfFuncs      = rfFuncs,
  treebagFuncs = treebagFuncs,
  nbFuncs
)

ctrl1 <- rfeControl(functions = rfeFunctions,
                    method = resampling,
                    repeats = nRepeats,
                    number = nResamples,
                    verbose = FALSE)

Profile <- rfe(dx, dy,
               sizes = subsets,
               rfeControl = ctrl1)

pred11 <- predictors(Profile)
save(Profile, file = profileFile)

# Write the selected predictors (taken from the untouched input copy)
# together with the outcome. drop = FALSE keeps a data frame even when a
# single predictor is selected.
rawData <- RAWDATA[, pred11, drop = FALSE]
rawData$outcome <- RAWDATA$outcome
dataX <- rawData[, pred11, drop = FALSE]
dataY <- rawData[["outcome"]]
save(dataX, dataY, file = outputFile)
rm(dataX, dataY)
