Mercurial > repos > deepakjadmin > r_caret_test
view featureselect/feature_selection (copy).R @ 0:68300206e90d draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Thu, 05 Nov 2015 02:41:30 -0500 |
parents | |
children |
line wrap: on
line source
args <- commandArgs(T) arg1 <- args[1] arg2 <- args[2] arg3 <- args[3] arg4 <- args[4] arg5 <- args[5] arg6 <- args[6] arg7 <- args[7] library(caret) load(arg1) RAWDATA <- dataX RAWDATA$outcome <- dataY rawData <- dataX predictorNames <- names(rawData) isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric) if(any(!isNum)) stop("all predictors in rawData should be numeric") colRate <- apply(rawData[, predictorNames, drop = FALSE], 2, function(x) mean(is.na(x))) colExclude <- colRate > 0.001 if(any(colExclude)){ predictorNames <- predictorNames[!colExclude] rawData <- rawData[, predictorNames] rowRate <- apply(rawData[, predictorNames, drop = FALSE], 1, function(x) mean(is.na(x))) } rowExclude <- rowRate > 0.001 if(any(rowExclude)){ rawData <- rawData[!rowExclude, ] hasMissing <- apply(rawData[, predictorNames, drop = FALSE], 1, function(x) mean(is.na(x))) } else { hasMissing <- apply(rawData[, predictorNames, drop = FALSE], 1, function(x) any(is.na(x))) rawData <- rawData[complete.cases(rawData),] } set.seed(2) nzv <- nearZeroVar(rawData) if(length(nzv) > 0) { nzvVars <- names(rawData)[nzv] rawData <- rawData[,-nzv] rawData$outcome <- dataY } else { rawData <- rawData rawData$outcome <- dataY } predictorNames <- names(rawData)[names(rawData) != "outcome"] dx <- rawData[,1:length(rawData)-1] dy <- rawData[,length(rawData)] corrThresh <- 0.90 highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),corrThresh) dx <- dx[, -highCorr] subsets <- seq(1,length(dx),by=2) normalization <- preProcess(dx) dx <- predict(normalization, dx) dx <- as.data.frame(dx) if (arg4 == "lmFuncs"){ ctrl1 <- rfeControl(functions = lmFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) } else if(arg4 == "rfFuncs"){ ctrl1 <- rfeControl(functions = rfFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) }else if (arg4 == "treebagFuncs"){ ctrl1 <- rfeControl(functions = treebagFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) }else { ctrl1 <- rfeControl(functions = nbFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) } Profile <- rfe(dx, dy, sizes = subsets, rfeControl = ctrl1) pred11 <- predictors(Profile) save(Profile,file=arg2) rawData <- RAWDATA[,pred11] rawData$outcome <- RAWDATA$outcome dataX <- rawData[,1:length(rawData)-1] dataY <- rawData[,length(rawData)] save(dataX,dataY,file=arg3) rm(dataX) rm(dataY)