Mercurial > repos > deepakjadmin > r_caret_test
diff featureselect/feature_selection.R @ 0:68300206e90d draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Thu, 05 Nov 2015 02:41:30 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/featureselect/feature_selection.R Thu Nov 05 02:41:30 2015 -0500 @@ -0,0 +1,144 @@ +args <- commandArgs(T) + +arg1 <- args[1] +arg2 <- args[2] +arg3 <- args[3] +arg4 <- args[4] +arg5 <- args[5] +arg6 <- args[6] +arg7 <- args[7] + +library(caret) +load(arg1) +print("data loaded") +RAWDATA <- dataX +RAWDATA$outcome <- dataY +rawData <- dataX +predictorNames <- names(rawData) + +isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric) +if(any(!isNum)) stop("all predictors in rawData should be numeric") + +colRate <- apply(rawData[, predictorNames, drop = FALSE], + 2, function(x) mean(is.na(x))) +colExclude <- colRate > 0.001 + if(any(colExclude)){ + predictorNames <- predictorNames[!colExclude] + rawData <- rawData[, predictorNames] + rowRate <- apply(rawData[, predictorNames, drop = FALSE], + 1, function(x) mean(is.na(x))) + } + +rowExclude <- rowRate > 0.00000001 + if(any(rowExclude)){ + rawData <- rawData[!rowExclude, ] + ##hasMissing <- apply(rawData[, predictorNames, drop = FALSE], + ##1, function(x) mean(is.na(x))) + +############################################################################ + if(is.factor(dataY)){ + dataY <- as.vector(dataY) + dataY <- t(dataY) + colName1 <- rownames(dataX) + colnames(dataY) <- colName1 + names11 <- rownames(rawData) + dataY <- dataY[,names11] + #dataY <- t(dataY) + + colnames(dataY) <- NULL + dataY <- as.factor(dataY) + } else { + dataY <- t(dataY) + colnames(dataY) <- rownames(dataX) + names11 <- rownames(rawData) + dataY <- dataY[,names11] + + } + + +############################################################################### + } else { + rawData <- RAWDATA[complete.cases(RAWDATA),] + dataX <- rawData[,1:lenght(rawData)-1] + dataY <- rawData[,length(rawData)] + + print(dim(dataX)) + print(dim(rawData)) + rawData <- dataX + + print(dim(rawData)) + } + +set.seed(2) + +print(dim(dataX)) +print(dim(rawData)) +print(length(dataY)) + +save(rawData,dataY,file="check.RData") +nzv <- nearZeroVar(rawData) + if(length(nzv) > 0) { + nzvVars <- names(rawData)[nzv] + rawData <- rawData[,-nzv] + rawData$outcome <- dataY + } else { + rawData <- rawData + rawData$outcome <- dataY + } + +predictorNames <- names(rawData)[names(rawData) != "outcome"] + +dx <- rawData[,1:length(rawData)-1] +dy <- rawData[,length(rawData)] +corrThresh <- 0.90 +highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),corrThresh) +dx <- dx[, -highCorr] +subsets <- seq(1,length(dx),by=5) +normalization <- preProcess(dx) +dx <- predict(normalization, dx) +dx <- as.data.frame(dx) + +if (arg4 == "lmFuncs"){ +ctrl1 <- rfeControl(functions = lmFuncs, + method = arg5 , + repeats = as.numeric(arg6), + number = as.numeric(arg7), + verbose = FALSE) +} else if(arg4 == "rfFuncs"){ +ctrl1 <- rfeControl(functions = rfFuncs, + method = arg5 , + repeats = as.numeric(arg6), + number = as.numeric(arg7), + verbose = FALSE) +}else if (arg4 == "treebagFuncs"){ +ctrl1 <- rfeControl(functions = treebagFuncs, + method = arg5 , + repeats = as.numeric(arg6), + number = as.numeric(arg7), + verbose = FALSE) +}else { + +ctrl1 <- rfeControl(functions = nbFuncs, + method = arg5 , + repeats = as.numeric(arg6), + number = as.numeric(arg7), + verbose = FALSE) +} + + + + +Profile <- rfe(dx, dy, + sizes = subsets, + rfeControl = ctrl1) + +pred11 <- predictors(Profile) +save(Profile,file=arg2) +rawData <- RAWDATA[,pred11] +rawData$outcome <- RAWDATA$outcome +dataX <- rawData[,1:length(rawData)-1] +dataY <- rawData[,length(rawData)] +save(dataX,dataY,file=arg3) +rm(dataX) +rm(dataY) +