Mercurial > repos > deepakjadmin > feature_selection_test1
view featureselect/feature_selection.R @ 3:91c141c5efa6 draft
Uploaded
author | deepakjadmin |
---|---|
date | Fri, 25 Mar 2016 06:48:10 -0400 |
parents | |
children | 5364cf43a8c1 |
line wrap: on
line source
args <- commandArgs(T) arg1 <- args[1] arg2 <- args[2] arg3 <- args[3] arg4 <- args[4] arg5 <- args[5] arg6 <- args[6] arg7 <- args[7] arg8 <- args[8] library(caret) load(arg1) RAWDATA <- dataX RAWDATA$outcome <- dataY rawData <- dataX predictorNames <- names(rawData) isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric) if(any(!isNum)) stop("all predictors in rawData should be numeric") colRate <- apply(rawData[, predictorNames, drop = FALSE], 2, function(x) mean(is.na(x))) colExclude <- colRate > 0.01 if(any(colExclude)){ predictorNames <- predictorNames[-which(colExclude)] rawData <- RAWDATA[, c(predictorNames,"outcome")] } else { rawData <- RAWDATA } rowRate <- apply(rawData[, predictorNames, drop = FALSE], 1, function(x) mean(is.na(x))) rowno <- dim(rawData)[1] if (rowno <= 1000){ cutoff <- rowno / (rowno * 100) } else if (rowno > 1000 & rowno <= 5000) { cutoff <- rowno / (rowno * 100 * 0.5 ) } else { cutoff <- rowno / (rowno * 100 * 0.5 * 0.5) } rowExclude <- rowRate > cutoff if(any(rowExclude)){ rawData <- rawData[!rowExclude, ] ##hasMissing <- apply(rawData[, predictorNames, drop = FALSE], ##1, function(x) mean(is.na(x))) ############################################################################ ############################################################################### } else { rawData <- rawData[complete.cases(rawData),] } set.seed(1234) #print(dim(dataX)) #print(dim(rawData)) #print(length(dataY)) nzv <- nearZeroVar(rawData[,1:(length(rawData) - 1)]) if(length(nzv) > 0) { #nzvVars <- names(rawData)[nzv] rawData <- rawData[,-nzv] #rawData$outcome <- dataY } predictorNames <- names(rawData)[names(rawData) != "outcome"] dx <- rawData[,1:length(rawData)-1] dy <- rawData[,length(rawData)] corrThresh <- as.numeric(arg8) highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),corrThresh) dx <- dx[, -highCorr] subsets <- seq(1,length(dx),by=5) normalization <- preProcess(dx) dx <- predict(normalization, dx) dx <- as.data.frame(dx) if (arg4 == "lmFuncs"){ ctrl1 <- rfeControl(functions = lmFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) } else if(arg4 == "rfFuncs"){ ctrl1 <- rfeControl(functions = rfFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) }else if (arg4 == "treebagFuncs"){ ctrl1 <- rfeControl(functions = treebagFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) }else { ctrl1 <- rfeControl(functions = nbFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) } Profile <- rfe(dx, dy,sizes = subsets,rfeControl = ctrl1) pred11 <- predictors(Profile) save(Profile,file=arg2) dataX <- rawData[,pred11] dataY <- rawData$outcome save(dataX,dataY,file=arg3) rm(dataX) rm(dataY)