Mercurial > repos > deepakjadmin > feature_selection_test1
view featureselect/feature_selection.R @ 4:5364cf43a8c1 draft
Uploaded
author | deepakjadmin |
---|---|
date | Sun, 02 Oct 2016 05:36:30 -0400 |
parents | 91c141c5efa6 |
children |
line wrap: on
line source
args <- commandArgs(T) arg1 <- args[1] arg2 <- args[2] arg3 <- args[3] arg4 <- args[4] arg5 <- args[5] arg6 <- args[6] arg7 <- args[7] arg8 <- args[8] library(caret) load(arg1) RAWDATA <- dataX RAWDATA$outcome <- dataY rawData <- dataX predictorNames <- names(rawData) isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric) if(any(!isNum)) stop("all predictors in rawData should be numeric") colRate <- apply(rawData[, predictorNames, drop = FALSE], 2, function(x) mean(is.na(x))) colExclude <- colRate > 0.1 if(any(colExclude)){ predictorNames <- predictorNames[-which(colExclude)] rawData <- RAWDATA[, c(predictorNames,"outcome")] } else { rawData <- RAWDATA } rowRate <- apply(rawData[, predictorNames, drop = FALSE], 1, function(x) mean(is.na(x))) rowExclude <- rowRate > 0 if(any(rowExclude)){ rawData <- rawData[!rowExclude, ] ##hasMissing <- apply(rawData[, predictorNames, drop = FALSE], ##1, function(x) mean(is.na(x))) ############################################################################ ############################################################################### } else { rawData <- rawData[complete.cases(rawData),] } set.seed(2) #print(dim(dataX)) #print(dim(rawData)) #print(length(dataY)) nzv <- nearZeroVar(rawData[,1:(length(rawData) - 1)]) if(length(nzv) > 0) { #nzvVars <- names(rawData)[nzv] rawData <- rawData[,-nzv] #rawData$outcome <- dataY } predictorNames <- names(rawData)[names(rawData) != "outcome"] dx <- rawData[,1:length(rawData)-1] dy <- rawData[,length(rawData)] corrThresh <- as.numeric(arg8) highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),corrThresh) dx <- dx[, -highCorr] subsets <- seq(1,length(dx),by=5) normalization <- preProcess(dx) dx <- predict(normalization, dx) dx <- as.data.frame(dx) if (arg4 == "lmFuncs"){ ctrl1 <- rfeControl(functions = lmFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) } else if(arg4 == "rfFuncs"){ ctrl1 <- rfeControl(functions = rfFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) }else if (arg4 == "treebagFuncs"){ ctrl1 <- rfeControl(functions = treebagFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) }else { ctrl1 <- rfeControl(functions = nbFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) } Profile <- rfe(dx, dy,sizes = subsets,rfeControl = ctrl1) pred11 <- predictors(Profile) save(Profile,file=arg2) dataX <- rawData[,pred11] dataY <- rawData$outcome save(dataX,dataY,file=arg3) rm(dataX) rm(dataY)