Mercurial > repos > deepakjadmin > feature_selection_test1

args <- commandArgs(T)

arg1 <- args[1]
arg2 <- args[2]
arg3 <- args[3]
arg4 <- args[4]
arg5 <- args[5]
arg6 <- args[6]
arg7 <- args[7]
arg8 <- args[8]

library(caret)
load(arg1)

RAWDATA <- dataX
RAWDATA$outcome <- dataY
rawData <- dataX
predictorNames <- names(rawData)

isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric)
if(any(!isNum)) stop("all predictors in rawData should be numeric")

colRate <- apply(rawData[, predictorNames, drop = FALSE],
                 2, function(x) mean(is.na(x)))
colExclude <- colRate > 0.01
	if(any(colExclude)){
				predictorNames <- predictorNames[-which(colExclude)]
				rawData <- RAWDATA[, c(predictorNames,"outcome")]
				 } else {
	                        rawData <- RAWDATA
						}
                		rowRate <- apply(rawData[, predictorNames, drop = FALSE],
                 		1, function(x) mean(is.na(x)))

rowno <- dim(rawData)[1]
if (rowno <= 1000){
cutoff <- rowno / (rowno * 100)
} else if (rowno > 1000 & rowno <= 5000) {
cutoff <- rowno / (rowno * 100 * 0.5 )
} else {
cutoff <- rowno / (rowno * 100 * 0.5 * 0.5)
}
rowExclude <- rowRate > cutoff
	if(any(rowExclude)){
  				rawData <- rawData[!rowExclude, ]
    				##hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
                        	##1, function(x) mean(is.na(x)))

############################################################################


###############################################################################
                    } else {
                    		rawData <- rawData[complete.cases(rawData),]

                    		}

set.seed(1234)

#print(dim(dataX))
#print(dim(rawData))
#print(length(dataY))

nzv <- nearZeroVar(rawData[,1:(length(rawData) - 1)])
	  if(length(nzv) > 0)  {
    				#nzvVars <- names(rawData)[nzv]
    				rawData <- rawData[,-nzv]
   				#rawData$outcome <- dataY
    				}

predictorNames <- names(rawData)[names(rawData) != "outcome"]

dx <- rawData[,1:length(rawData)-1]
dy <- rawData[,length(rawData)]
corrThresh <- as.numeric(arg8)
highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),corrThresh)
dx <- dx[, -highCorr]
subsets <- seq(1,length(dx),by=5)
normalization <- preProcess(dx)
dx <- predict(normalization, dx)
dx <- as.data.frame(dx)

if (arg4 == "lmFuncs"){
ctrl1 <- rfeControl(functions = lmFuncs,
                   method = arg5 ,
                   repeats = as.numeric(arg6),
                   number = as.numeric(arg7),
                   verbose = FALSE)
} else if(arg4 == "rfFuncs"){
ctrl1 <- rfeControl(functions = rfFuncs,
                   method = arg5 ,
                   repeats = as.numeric(arg6),
                   number = as.numeric(arg7),
                   verbose = FALSE)
}else if (arg4 == "treebagFuncs"){
ctrl1 <- rfeControl(functions = treebagFuncs,
                   method = arg5 ,
                   repeats = as.numeric(arg6),
                   number = as.numeric(arg7),
                   verbose = FALSE)
}else {

ctrl1 <- rfeControl(functions = nbFuncs,
                   method = arg5 ,
                   repeats = as.numeric(arg6),
                   number = as.numeric(arg7),
                   verbose = FALSE)
}


Profile <- rfe(dx, dy,sizes = subsets,rfeControl = ctrl1)

pred11 <- predictors(Profile)
save(Profile,file=arg2)
dataX <- rawData[,pred11]
dataY <- rawData$outcome

save(dataX,dataY,file=arg3)
rm(dataX)
rm(dataY)
author	deepakjadmin
date	Fri, 25 Mar 2016 06:48:10 -0400
parents
children	5364cf43a8c1