# HG changeset patch
# User deepakjadmin
# Date 1483428377 18000
# Node ID 016c69bfb2a147e1f887c83ef3d8ea43f2cf3cae
# Parent 5364cf43a8c15ba64554d28a22f1d4f4dd47f09b
Uploaded
diff -r 5364cf43a8c1 -r 016c69bfb2a1 feature_selection.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/feature_selection.R Tue Jan 03 02:26:17 2017 -0500
@@ -0,0 +1,147 @@
+# Positional command-line arguments:
+#   1 input .RData file (expected to define dataX and dataY)
+#   2 output file for the rfe profile, 3 output file for the selected data
+#   4 rfe function set, 5 resampling method, 6 repeats, 7 resample count
+#   8 correlation cutoff, 9 class sampling mode
+args <- commandArgs(TRUE)  # TRUE, not T: T can be reassigned
+arg1 <- args[1]
+arg2 <- args[2]
+arg3 <- args[3]
+arg4 <- args[4]
+arg5 <- args[5]
+arg6 <- args[6]
+arg7 <- args[7]
+arg8 <- args[8]
+arg9 <- args[9]
+library(caret)
+load(arg1)
+
+###########################
+Smpling <- arg9
+# arg9 is NA when the script is called with only eight arguments; treat that
+# as "no resampling" instead of failing on an NA comparison inside if().
+if (is.na(Smpling)) Smpling <- "none"
+
+if (Smpling == "downsampling") {
+  balanced <- downSample(dataX, dataY)
+} else if (Smpling == "upsampling") {
+  balanced <- upSample(dataX, dataY)
+} else {
+  balanced <- NULL
+}
+
+if (!is.null(balanced)) {
+  # The last column of the sampled frame is taken as the class and everything
+  # before it as predictors. seq_len() avoids the `1:n-1` form, which parses
+  # as (1:n) - 1 and only worked because index 0 is ignored.
+  nPredictors <- length(balanced) - 1
+  dataX <- balanced[, seq_len(nPredictors), drop = FALSE]
+  dataY <- balanced[, nPredictors + 1]
+}
+remove("balanced")
+
+# RAWDATA is the predictor table with the response attached as `outcome`.
+RAWDATA <- dataX
+RAWDATA$outcome <- dataY
+
+##########################
+# ---- Missing-value filtering ----
+# dataX holds the predictors; RAWDATA holds the same columns plus `outcome`.
+rawData <- dataX
+predictorNames <- names(rawData)
+
+# Abort early when a non-numeric predictor is present; the correlation and
+# scaling steps further down need numeric columns.
+isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric)
+if(any(!isNum)) stop("all predictors in rawData should be numeric")
+
+# Fraction of missing values per predictor; predictors above 10% are dropped.
+colRate <- colMeans(is.na(rawData[, predictorNames, drop = FALSE]))
+colExclude <- colRate > 0.1
+if (any(colExclude)) {
+  predictorNames <- predictorNames[!colExclude]
+  rawData <- RAWDATA[, c(predictorNames, "outcome"), drop = FALSE]
+} else {
+  rawData <- RAWDATA
+}
+
+# Drop every row that is incomplete in a remaining predictor or in the
+# outcome. The earlier two-branch version removed rows with a missing outcome
+# only when no predictor value was missing anywhere, so whether such a row
+# survived depended on unrelated rows.
+rowExclude <- !complete.cases(rawData)
+if (any(rowExclude)) {
+  rawData <- rawData[!rowExclude, , drop = FALSE]
+}
+
+# Fail with a clear message when the filters leave nothing to model.
+if (nrow(rawData) == 0 || length(predictorNames) == 0) {
+  stop("no rows or predictors remain after removing missing values")
+}
+
+set.seed(2)  # fixed seed so later random steps are repeatable
+
+# ---- Near-zero-variance filter (the outcome is the last column) ----
+nzv <- nearZeroVar(rawData[, seq_len(length(rawData) - 1), drop = FALSE])
+if (length(nzv) > 0) {
+  rawData <- rawData[, -nzv, drop = FALSE]
+}
+predictorNames <- names(rawData)[names(rawData) != "outcome"]
+
+# seq_len() replaces `1:length(rawData)-1`, which parses as (1:n) - 1.
+dx <- rawData[, seq_len(length(rawData) - 1), drop = FALSE]
+dy <- rawData[, length(rawData)]
+
+# ---- Correlation filter ----
+corrThresh <- as.numeric(arg8)
+highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"), corrThresh)
+# findCorrelation() returns integer(0) when no pair exceeds the cutoff, and
+# dx[, -integer(0)] would select zero columns, so only subset when non-empty.
+if (length(highCorr) > 0) {
+  dx <- dx[, -highCorr, drop = FALSE]
+}
+subsets <- seq(1, length(dx), by = 5)
+normalization <- preProcess(dx)
+dx <- as.data.frame(predict(normalization, dx))
+
+# arg4 names the caret helper-function set; any other value selects nbFuncs.
+if (arg4 == "lmFuncs") {
+  rankingFuncs <- lmFuncs
+} else if (arg4 == "rfFuncs") {
+  rankingFuncs <- rfFuncs
+} else if (arg4 == "treebagFuncs") {
+  rankingFuncs <- treebagFuncs
+} else {
+  rankingFuncs <- nbFuncs
+}
+
+# Every function set shares the same resampling options, so the control
+# object is built once:
+#   arg5 - resampling method
+#   arg6 - repeats
+#   arg7 - number of resamples
+resampleMethod <- arg5
+nRepeats <- as.numeric(arg6)
+nResamples <- as.numeric(arg7)
+
+ctrl1 <- rfeControl(
+  functions = rankingFuncs,
+  method = resampleMethod,
+  repeats = nRepeats,
+  number = nResamples,
+  verbose = FALSE
+)
+
+
+
+# Run recursive feature elimination and store the fitted profile in arg2.
+Profile <- rfe(dx, dy, sizes = subsets, rfeControl = ctrl1)
+save(Profile, file = arg2)
+pred11 <- predictors(Profile)
+# Selected predictors are taken from rawData (i.e. before scaling);
+# drop = FALSE keeps dataX a data frame when only one predictor is selected.
+dataX <- rawData[, pred11, drop = FALSE]
+dataY <- rawData$outcome
+save(dataX, dataY, file = arg3)
+rm(dataX, dataY)
+
diff -r 5364cf43a8c1 -r 016c69bfb2a1 featureselect/feature_selection.R
--- a/featureselect/feature_selection.R Sun Oct 02 05:36:30 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,116 +0,0 @@
-args <- commandArgs(T)
-
-arg1 <- args[1]
-arg2 <- args[2]
-arg3 <- args[3]
-arg4 <- args[4]
-arg5 <- args[5]
-arg6 <- args[6]
-arg7 <- args[7]
-arg8 <- args[8]
-
-library(caret)
-load(arg1)
-
-RAWDATA <- dataX
-RAWDATA$outcome <- dataY
-rawData <- dataX
-predictorNames <- names(rawData)
-
-isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric)
-if(any(!isNum)) stop("all predictors in rawData should be numeric")
-
-colRate <- apply(rawData[, predictorNames, drop = FALSE],
- 2, function(x) mean(is.na(x)))
-colExclude <- colRate > 0.1
- if(any(colExclude)){
- predictorNames <- predictorNames[-which(colExclude)]
- rawData <- RAWDATA[, c(predictorNames,"outcome")]
- } else {
- rawData <- RAWDATA
- }
- rowRate <- apply(rawData[, predictorNames, drop = FALSE],
- 1, function(x) mean(is.na(x)))
-
-
-rowExclude <- rowRate > 0
- if(any(rowExclude)){
- rawData <- rawData[!rowExclude, ]
- ##hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
- ##1, function(x) mean(is.na(x)))
-
-############################################################################
-
-
-###############################################################################
- } else {
- rawData <- rawData[complete.cases(rawData),]
-
- }
-
-set.seed(2)
-
-#print(dim(dataX))
-#print(dim(rawData))
-#print(length(dataY))
-
-nzv <- nearZeroVar(rawData[,1:(length(rawData) - 1)])
- if(length(nzv) > 0) {
- #nzvVars <- names(rawData)[nzv]
- rawData <- rawData[,-nzv]
- #rawData$outcome <- dataY
- }
-
-predictorNames <- names(rawData)[names(rawData) != "outcome"]
-
-dx <- rawData[,1:length(rawData)-1]
-dy <- rawData[,length(rawData)]
-corrThresh <- as.numeric(arg8)
-highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),corrThresh)
-dx <- dx[, -highCorr]
-subsets <- seq(1,length(dx),by=5)
-normalization <- preProcess(dx)
-dx <- predict(normalization, dx)
-dx <- as.data.frame(dx)
-
-if (arg4 == "lmFuncs"){
-ctrl1 <- rfeControl(functions = lmFuncs,
- method = arg5 ,
- repeats = as.numeric(arg6),
- number = as.numeric(arg7),
- verbose = FALSE)
-} else if(arg4 == "rfFuncs"){
-ctrl1 <- rfeControl(functions = rfFuncs,
- method = arg5 ,
- repeats = as.numeric(arg6),
- number = as.numeric(arg7),
- verbose = FALSE)
-}else if (arg4 == "treebagFuncs"){
-ctrl1 <- rfeControl(functions = treebagFuncs,
- method = arg5 ,
- repeats = as.numeric(arg6),
- number = as.numeric(arg7),
- verbose = FALSE)
-}else {
-
-ctrl1 <- rfeControl(functions = nbFuncs,
- method = arg5 ,
- repeats = as.numeric(arg6),
- number = as.numeric(arg7),
- verbose = FALSE)
-}
-
-
-
-
-Profile <- rfe(dx, dy,sizes = subsets,rfeControl = ctrl1)
-
-pred11 <- predictors(Profile)
-save(Profile,file=arg2)
-dataX <- rawData[,pred11]
-dataY <- rawData$outcome
-
-save(dataX,dataY,file=arg3)
-rm(dataX)
-rm(dataY)
-
diff -r 5364cf43a8c1 -r 016c69bfb2a1 featureselect/tool_dependencies.xml
--- a/featureselect/tool_dependencies.xml Sun Oct 02 05:36:30 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-
-
-
-
- $REPOSITORY_INSTALL_DIR
-
-
-
-
-
-
-
-
diff -r 5364cf43a8c1 -r 016c69bfb2a1 featureselect/toolrfe.xml
--- a/featureselect/toolrfe.xml Sun Oct 02 05:36:30 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,87 +0,0 @@
-
-
- This tool used for extract best feature subsets cantaining input data for model building.
-
-
-
- FEATURE_SELECTION_R
- R_ROOT_DIR
- R
- caret-tools
-
-feature_selection.R $input $profile $finalset $function1 $resampling $repeat $number $corcutoff > /dev/null 2>&1
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-**RFE based feature selection for classification and regression**
-
-Input file must be RData file obtained by converting csv file in to RData.
-
-output "Selected_feature.RData" file used for model building purpose.While profile
-
-represents feature selection model.
-
-Correlation cutoff value is desired for choosing independent variables For example
-
-Cutoff value = 0.8 removes all descriptors sharing equal or highet correlation values.
-
-User may choose varous resampling methods in combination with repeats and times of resample.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r 5364cf43a8c1 -r 016c69bfb2a1 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Tue Jan 03 02:26:17 2017 -0500
@@ -0,0 +1,13 @@
+
+
+
+
+ $REPOSITORY_INSTALL_DIR
+
+
+
+
+
+
+
+
diff -r 5364cf43a8c1 -r 016c69bfb2a1 toolrfe.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/toolrfe.xml Tue Jan 03 02:26:17 2017 -0500
@@ -0,0 +1,92 @@
+
+
+ This tool is used to extract the best feature subsets containing input data for model building.
+
+
+
+ FEATURE_SELECTION_R
+ R_ROOT_DIR
+ R
+ caret-tools
+
+feature_selection.R $input $profile $finalset $function1 $resampling $repeat $number $corcutoff $SAMPLING> /dev/null 2>&1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**RFE based feature selection for classification and regression**
+
+Input file must be an RData file obtained by converting a csv file into RData.
+
+The output "Selected_feature.RData" file is used for model building, while the profile
+
+represents the feature selection model.
+
+A correlation cutoff value is required for choosing independent variables. For example:
+
+Cutoff value = 0.8 removes all descriptors sharing equal or higher correlation values.
+
+The user may choose various resampling methods in combination with the number of repeats and resamples.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+