# view featureselect/feature_selection (copy).R @ 0:a4a2ad5a214e draft default tip
#
# Uploaded
# author deepakjadmin
# date Thu, 05 Nov 2015 02:37:56 -0500
# parents
# children
# line wrap: on
# line source

# Command-line arguments (trailing arguments only, i.e. those after --args):
#   arg1  path of the .RData file read by load(); it has to provide `dataX`
#         (predictors) and `dataY` (outcome)
#   arg2  output path for the saved rfe `Profile` object
#   arg3  output path for the reduced `dataX` / `dataY`
#   arg4  name of the caret helper set used by rfe: "lmFuncs", "rfFuncs" or
#         "treebagFuncs"; any other value selects nbFuncs
#   arg5  resampling method, passed to rfeControl(method = )
#   arg6  passed to rfeControl(repeats = ) after as.numeric()
#   arg7  passed to rfeControl(number = ) after as.numeric()
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
args <- commandArgs(trailingOnly = TRUE)

arg1 <- args[1]
arg2 <- args[2]
arg3 <- args[3]
arg4 <- args[4]
arg5 <- args[5]
arg6 <- args[6]
arg7 <- args[7]

library(caret)
# Load the input workspace. The rest of the script reads `dataX` (predictors)
# and `dataY` (outcome) from it, so fail early with a clear message if either
# object is absent from the file.
loadedObjects <- load(arg1)
missingObjects <- setdiff(c("dataX", "dataY"), loadedObjects)
if (length(missingObjects) > 0) {
  stop("input file does not contain: ",
       paste(missingObjects, collapse = ", "))
}

# RAWDATA keeps the untouched predictors plus the outcome; the final step of
# the script subsets it to the selected columns.
RAWDATA <- dataX
RAWDATA$outcome <- dataY
# rawData is the working copy that the filtering steps modify.
rawData <- dataX
predictorNames <- names(rawData)

# Test each column on its own. apply() would first coerce the data frame to a
# single matrix type, so it cannot tell the types of individual columns apart
# (e.g. a logical column next to numeric ones would be reported as numeric).
isNum <- vapply(rawData[, predictorNames, drop = FALSE], is.numeric, logical(1))
if (any(!isNum)) stop("all predictors in rawData should be numeric")

# Maximum tolerated fraction of missing values, per column and per row.
naThreshold <- 0.001

# --- Drop predictors with too many missing values ---------------------------
colRate <- colMeans(is.na(rawData[, predictorNames, drop = FALSE]))
colExclude <- colRate > naThreshold
if (any(colExclude)) {
  predictorNames <- predictorNames[!colExclude]
  # drop = FALSE keeps a data frame even when a single predictor survives.
  rawData <- rawData[, predictorNames, drop = FALSE]
}
if (length(predictorNames) == 0L) {
  stop("no predictors left after removing columns with missing values")
}

# --- Drop rows with too many missing values ---------------------------------
# The row rate is computed unconditionally. It used to be assigned only when
# at least one column had been excluded, so inputs without such columns failed
# with "object 'rowRate' not found".
rowRate <- rowMeans(is.na(rawData[, predictorNames, drop = FALSE]))
rowExclude <- rowRate > naThreshold

# keepRows marks, relative to the rows of the loaded data, which rows survive,
# so that the outcome can be subset in exactly the same way further down.
if (any(rowExclude)) {
  keepRows <- !rowExclude
  rawData <- rawData[keepRows, , drop = FALSE]
  # Remaining per-row missing rate (kept for parity with the original script).
  # NOTE(review): rows whose missing rate is above 0 but within the threshold
  # are kept in this branch, while the other branch drops every incomplete
  # row -- confirm this asymmetry is intended.
  hasMissing <- rowMeans(is.na(rawData[, predictorNames, drop = FALSE]))
} else {
  hasMissing <- rowSums(is.na(rawData[, predictorNames, drop = FALSE])) > 0
  keepRows <- complete.cases(rawData)
  rawData <- rawData[keepRows, , drop = FALSE]
}

set.seed(2)

# --- Drop near-zero-variance predictors -------------------------------------
nzv <- nearZeroVar(rawData)
if (length(nzv) > 0) {
  nzvVars <- names(rawData)[nzv]
  rawData <- rawData[, -nzv, drop = FALSE]
}

# Attach the outcome for the surviving rows only. Assigning the full dataY
# failed with a length mismatch whenever rows had been removed above.
# NOTE(review): assumes dataY is a vector or factor with one entry per row of
# dataX -- confirm against the producer of the input file.
rawData$outcome <- dataY[keepRows]

predictorNames <- names(rawData)[names(rawData) != "outcome"]
   
# Split the working data into predictors (every column but the last) and the
# outcome (the last column).
nPredictors <- ncol(rawData) - 1L
if (nPredictors < 1L) stop("no predictors left for feature selection")
# seq_len() replaces `1:length(rawData)-1`, which parses as
# `(1:length(rawData)) - 1` and only worked because index 0 is ignored.
dx <- rawData[, seq_len(nPredictors), drop = FALSE]
dy <- rawData[[ncol(rawData)]]

# Remove predictors that are highly correlated with another predictor.
corrThresh <- 0.90
if (ncol(dx) > 1L) {
  highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"), corrThresh)
} else {
  # A correlation filter has nothing to compare with a single predictor.
  highCorr <- integer(0)
}
# Index only when something was flagged: -integer(0) is still integer(0), so
# dx[, -highCorr] would select zero columns and discard every predictor.
if (length(highCorr) > 0) {
  dx <- dx[, -highCorr, drop = FALSE]
}

# Candidate subset sizes handed to rfe: 1, 3, 5, ... up to the number of
# remaining predictors.
subsets <- seq(1, ncol(dx), by = 2)

# Estimate the transformation with preProcess() defaults and apply it to dx.
normalization <- preProcess(dx)
dx <- predict(normalization, dx)
dx <- as.data.frame(dx)

# Pick the caret helper-function set named by arg4; anything other than the
# three recognised names falls through to nbFuncs.
rfeFunctions <- if (arg4 == "lmFuncs") {
  lmFuncs
} else if (arg4 == "rfFuncs") {
  rfFuncs
} else if (arg4 == "treebagFuncs") {
  treebagFuncs
} else {
  nbFuncs
}

# The resampling settings are the same whichever function set was chosen.
ctrl1 <- rfeControl(
  functions = rfeFunctions,
  method = arg5,
  repeats = as.numeric(arg6),
  number = as.numeric(arg7),
  verbose = FALSE
)



 
# Run recursive feature elimination over the candidate subset sizes.
Profile <- rfe(dx, dy,
               sizes = subsets,
               rfeControl = ctrl1)

# Variable names extracted from the rfe result; the full result is saved too.
pred11 <- predictors(Profile)
save(Profile, file = arg2)

# Reduce the originally loaded data (all rows) to the selected columns.
# drop = FALSE matters when only one predictor is selected: plain `[, pred11]`
# would return a bare vector, and the `$outcome<-` and column indexing that
# follow would no longer operate on a data frame.
rawData <- RAWDATA[, pred11, drop = FALSE]
rawData$outcome <- RAWDATA$outcome

# Predictors are every column but the last; the outcome is the last column.
# seq_len() replaces `1:length(rawData)-1`, which parses as
# `(1:length(rawData)) - 1` and only worked because index 0 is ignored.
nSelected <- ncol(rawData) - 1L
dataX <- rawData[, seq_len(nSelected), drop = FALSE]
dataY <- rawData[[ncol(rawData)]]
save(dataX, dataY, file = arg3)
rm(dataX, dataY)