diff featureselect/feature_selection.R @ 0:68300206e90d draft default tip

Uploaded
author deepakjadmin
date Thu, 05 Nov 2015 02:41:30 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/featureselect/feature_selection.R	Thu Nov 05 02:41:30 2015 -0500
@@ -0,0 +1,144 @@
+args <- commandArgs(T)
+
+arg1 <- args[1]
+arg2 <- args[2]
+arg3 <- args[3]
+arg4 <- args[4]
+arg5 <- args[5]
+arg6 <- args[6]
+arg7 <- args[7]
+
+library(caret)
+load(arg1)
+print("data loaded")
+RAWDATA <- dataX
+RAWDATA$outcome <- dataY
+rawData <- dataX
+predictorNames <- names(rawData)
+
+isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric)
+if(any(!isNum)) stop("all predictors in rawData should be numeric")
+
+colRate <- apply(rawData[, predictorNames, drop = FALSE],
+                 2, function(x) mean(is.na(x)))
+colExclude <- colRate > 0.001
+	if(any(colExclude)){
+				predictorNames <- predictorNames[!colExclude]
+				rawData <- rawData[, predictorNames]
+				rowRate <- apply(rawData[, predictorNames, drop = FALSE],
+                 		1, function(x) mean(is.na(x)))
+			}
+
+rowExclude <- rowRate > 0.00000001
+	if(any(rowExclude)){
+  				rawData <- rawData[!rowExclude, ]
+    				##hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
+                        	##1, function(x) mean(is.na(x)))
+                   
+############################################################################
+                               if(is.factor(dataY)){
+                                       dataY <- as.vector(dataY)
+                                       dataY <- t(dataY)
+                                       colName1 <- rownames(dataX)
+                                       colnames(dataY) <- colName1
+                                       names11 <- rownames(rawData) 
+                                       dataY <- dataY[,names11] 
+                                       #dataY <- t(dataY)
+                                    
+                                       colnames(dataY) <- NULL            
+                                       dataY <- as.factor(dataY)
+                                        } else {
+							dataY <- t(dataY)
+							colnames(dataY) <- rownames(dataX)
+                                                        names11 <- rownames(rawData)
+						        dataY <- dataY[,names11]
+								
+								}
+                                          
+            
+###############################################################################                        	
+                    } else {  
+                    		rawData <- RAWDATA[complete.cases(RAWDATA),]
+                                dataX <- rawData[,1:lenght(rawData)-1]
+				dataY <- rawData[,length(rawData)]
+                              
+                                print(dim(dataX))
+                                print(dim(rawData))
+                                rawData <- dataX
+                                
+                                print(dim(rawData))
+                    		} 
+                    
+set.seed(2)
+
+print(dim(dataX))
+print(dim(rawData))
+print(length(dataY))
+
+save(rawData,dataY,file="check.RData")
+nzv <- nearZeroVar(rawData)
+	  if(length(nzv) > 0)  {
+    				nzvVars <- names(rawData)[nzv]
+    				rawData <- rawData[,-nzv]
+   				rawData$outcome <- dataY
+    				} else {
+        				rawData <- rawData
+					rawData$outcome <- dataY
+    					}
+    
+predictorNames <- names(rawData)[names(rawData) != "outcome"]
+   
+dx <- rawData[,1:length(rawData)-1]
+dy <- rawData[,length(rawData)]
+corrThresh <- 0.90
+highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),corrThresh)
+dx <- dx[, -highCorr]
+subsets <- seq(1,length(dx),by=5)
+normalization <- preProcess(dx)
+dx <- predict(normalization, dx)
+dx <- as.data.frame(dx)
+
+if (arg4 == "lmFuncs"){
+ctrl1 <- rfeControl(functions = lmFuncs,
+                   method = arg5 ,
+                   repeats = as.numeric(arg6),
+                   number = as.numeric(arg7),
+                   verbose = FALSE)
+} else if(arg4 == "rfFuncs"){
+ctrl1 <- rfeControl(functions = rfFuncs,
+                   method = arg5 ,
+                   repeats = as.numeric(arg6),
+                   number = as.numeric(arg7),
+                   verbose = FALSE)
+}else if (arg4 == "treebagFuncs"){
+ctrl1 <- rfeControl(functions = treebagFuncs,
+                   method = arg5 ,
+                   repeats = as.numeric(arg6),
+                   number = as.numeric(arg7),
+                   verbose = FALSE)
+}else {
+
+ctrl1 <- rfeControl(functions = nbFuncs,
+                   method = arg5 ,
+                   repeats = as.numeric(arg6),
+                   number = as.numeric(arg7),
+                   verbose = FALSE)
+}
+
+
+
+ 
+Profile <- rfe(dx, dy,
+                 sizes = subsets,
+                 rfeControl = ctrl1)
+
+pred11 <- predictors(Profile)
+save(Profile,file=arg2)
+rawData <- RAWDATA[,pred11]
+rawData$outcome <- RAWDATA$outcome
+dataX <- rawData[,1:length(rawData)-1]
+dataY <- rawData[,length(rawData)]
+save(dataX,dataY,file=arg3)
+rm(dataX)
+rm(dataY)
+