changeset 5:016c69bfb2a1 draft

Uploaded
author deepakjadmin
date Tue, 03 Jan 2017 02:26:17 -0500
parents 5364cf43a8c1
children b84589b7c014
files feature_selection.R featureselect/feature_selection.R featureselect/tool_dependencies.xml featureselect/toolrfe.xml tool_dependencies.xml toolrfe.xml
diffstat 6 files changed, 252 insertions(+), 216 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/feature_selection.R	Tue Jan 03 02:26:17 2017 -0500
@@ -0,0 +1,147 @@
+# Positional arguments, in the order passed by the <command> line of toolrfe.xml.
+args <- commandArgs(TRUE)
+if (length(args) < 8) {
+  stop("expected at least 8 command-line arguments", call. = FALSE)
+}
+
+arg1 <- args[1]  # .RData input; the script uses dataX and dataY from it
+arg2 <- args[2]  # output file for the rfe Profile object
+arg3 <- args[3]  # output file for the selected dataX and dataY
+arg4 <- args[4]  # rfe function set: lmFuncs, rfFuncs, treebagFuncs, otherwise nbFuncs
+arg5 <- args[5]  # rfeControl method
+arg6 <- args[6]  # rfeControl repeats
+arg7 <- args[7]  # rfeControl number
+arg8 <- args[8]  # correlation cutoff for findCorrelation
+arg9 <- args[9]  # sampling mode; NA when the caller passes only 8 arguments
+
+library(caret)
+load(arg1)
+###########################
+# Optional class re-balancing selected by the 9th argument. Any other value,
+# including a missing argument (NA), leaves dataX and dataY untouched.
+Smpling <- arg9
+if (Smpling %in% c("downsampling", "upsampling")) {
+  sampler <- if (Smpling == "downsampling") {
+    downSample
+  } else {
+    upSample
+  }
+  balanced <- sampler(dataX, dataY)
+  # The outcome is taken from the last column of the resampled frame.
+  # seq_len() sidesteps the 1:n-1 precedence trap, which is really 0:(n-1).
+  outcomeCol <- length(balanced)
+  dataX <- balanced[, seq_len(outcomeCol - 1), drop = FALSE]
+  dataY <- balanced[, outcomeCol]
+  remove("sampler", "balanced", "outcomeCol")
+}
+
+# RAWDATA is the predictor frame with the outcome appended as the last column.
+RAWDATA <- dataX
+RAWDATA$outcome <- dataY
+
+
+
+
+##########################
+
+
+# Missing-value filtering: reject non-numeric predictors, drop predictors
+# with more than 10% NA, then drop every row that still contains an NA.
+rawData <- dataX
+predictorNames <- names(rawData)
+
+# vapply() tests each column's own type; apply() would first coerce the
+# whole data frame to a matrix and test the coerced type instead.
+isNum <- vapply(rawData[, predictorNames, drop = FALSE], is.numeric,
+                logical(1))
+if (any(!isNum)) {
+  stop("all predictors in rawData should be numeric")
+}
+
+# Fraction of missing values per predictor.
+colRate <- vapply(rawData[, predictorNames, drop = FALSE],
+                  function(x) mean(is.na(x)), numeric(1))
+colExclude <- colRate > 0.1
+if (any(colExclude)) {
+  # Rebuild from RAWDATA so the outcome column travels with the predictors.
+  predictorNames <- predictorNames[!colExclude]
+  rawData <- RAWDATA[, c(predictorNames, "outcome"), drop = FALSE]
+} else {
+  rawData <- RAWDATA
+}
+
+# Fraction of missing predictor values per row.
+rowRate <- rowMeans(is.na(rawData[, predictorNames, drop = FALSE]))
+rowExclude <- rowRate > 0
+
+# Keep only rows with no missing predictor and no missing outcome. Checking
+# the outcome unconditionally keeps NA outcomes out of the later rfe() call
+# regardless of whether any predictor value was missing.
+rawData <- rawData[!rowExclude & complete.cases(rawData), ]
+                    
+# Fixed seed so the random steps that follow are repeatable between runs.
+set.seed(2)
+
+# Drop near-zero-variance predictors; the outcome column is never tested.
+predictorNames <- names(rawData)[names(rawData) != "outcome"]
+nzv <- nearZeroVar(rawData[, predictorNames, drop = FALSE])
+if (length(nzv) > 0) {
+  predictorNames <- predictorNames[-nzv]
+  rawData <- rawData[, c(predictorNames, "outcome"), drop = FALSE]
+}
+
+dx <- rawData[, predictorNames, drop = FALSE]
+dy <- rawData[["outcome"]]
+corrThresh <- as.numeric(arg8)
+highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"), corrThresh)
+# When no pair exceeds the cutoff there is nothing to remove, and an empty
+# negative index (dx[, -highCorr]) would select zero columns instead of all.
+if (length(highCorr) > 0) {
+  dx <- dx[, -highCorr, drop = FALSE]
+}
+subsets <- seq(1, length(dx), by = 5)
+normalization <- preProcess(dx)
+dx <- predict(normalization, dx)
+dx <- as.data.frame(dx)
+
+# Map the requested name (arg4) to one of caret's rfe helper-function sets;
+# any unrecognised name falls back to nbFuncs.
+rfeFunctionSets <- list(
+  lmFuncs = lmFuncs,
+  rfFuncs = rfFuncs,
+  treebagFuncs = treebagFuncs
+)
+
+selectedFuncs <- if (arg4 %in% names(rfeFunctionSets)) {
+  rfeFunctionSets[[arg4]]
+} else {
+  nbFuncs
+}
+
+# One rfeControl() call serves every function set; only `functions` varies.
+# method, repeats and number are forwarded from arg5, arg6 and arg7; the two
+# counts arrive as strings from the command line and are converted here.
+ctrl1 <- rfeControl(
+  functions = selectedFuncs,
+  method = arg5,
+  repeats = as.numeric(arg6),
+  number = as.numeric(arg7),
+  verbose = FALSE
+)
+
+remove("rfeFunctionSets", "selectedFuncs")
+
+
+
+ 
+# Run recursive feature elimination over the candidate subset sizes.
+Profile <- rfe(dx, dy, sizes = subsets, rfeControl = ctrl1)
+save(Profile, file = arg2)
+
+# Save the selected predictors from rawData (not the preProcess()-transformed
+# dx); drop = FALSE keeps dataX a data frame when one predictor is selected.
+pred11 <- predictors(Profile)
+dataX <- rawData[, pred11, drop = FALSE]
+dataY <- rawData$outcome
+save(dataX, dataY, file = arg3)
+rm(dataX, dataY)
--- a/featureselect/feature_selection.R	Sun Oct 02 05:36:30 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,116 +0,0 @@
-args <- commandArgs(T)
-
-arg1 <- args[1]
-arg2 <- args[2]
-arg3 <- args[3]
-arg4 <- args[4]
-arg5 <- args[5]
-arg6 <- args[6]
-arg7 <- args[7]
-arg8 <- args[8]
-
-library(caret)
-load(arg1)
-
-RAWDATA <- dataX
-RAWDATA$outcome <- dataY
-rawData <- dataX
-predictorNames <- names(rawData)
-
-isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric)
-if(any(!isNum)) stop("all predictors in rawData should be numeric")
-
-colRate <- apply(rawData[, predictorNames, drop = FALSE],
-                 2, function(x) mean(is.na(x)))
-colExclude <- colRate > 0.1
-	if(any(colExclude)){
-				predictorNames <- predictorNames[-which(colExclude)]
-				rawData <- RAWDATA[, c(predictorNames,"outcome")]
-				 } else {
-	                        rawData <- RAWDATA 
-						}  
-                		rowRate <- apply(rawData[, predictorNames, drop = FALSE],
-                 		1, function(x) mean(is.na(x)))
-			
-
-rowExclude <- rowRate > 0
-	if(any(rowExclude)){
-  				rawData <- rawData[!rowExclude, ]
-    				##hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
-                        	##1, function(x) mean(is.na(x)))
-                   
-############################################################################
-                                                                      
-            
-###############################################################################                        	
-                    } else {  
-                    		rawData <- rawData[complete.cases(rawData),]
-
-                    		} 
-                    
-set.seed(2)
-
-#print(dim(dataX))
-#print(dim(rawData))
-#print(length(dataY))
-
-nzv <- nearZeroVar(rawData[,1:(length(rawData) - 1)])
-	  if(length(nzv) > 0)  {
-    				#nzvVars <- names(rawData)[nzv]
-    				rawData <- rawData[,-nzv]
-   				#rawData$outcome <- dataY
-    				} 
-    
-predictorNames <- names(rawData)[names(rawData) != "outcome"]
-   
-dx <- rawData[,1:length(rawData)-1]
-dy <- rawData[,length(rawData)]
-corrThresh <- as.numeric(arg8)
-highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),corrThresh)
-dx <- dx[, -highCorr]
-subsets <- seq(1,length(dx),by=5)
-normalization <- preProcess(dx)
-dx <- predict(normalization, dx)
-dx <- as.data.frame(dx)
-
-if (arg4 == "lmFuncs"){
-ctrl1 <- rfeControl(functions = lmFuncs,
-                   method = arg5 ,
-                   repeats = as.numeric(arg6),
-                   number = as.numeric(arg7),
-                   verbose = FALSE)
-} else if(arg4 == "rfFuncs"){
-ctrl1 <- rfeControl(functions = rfFuncs,
-                   method = arg5 ,
-                   repeats = as.numeric(arg6),
-                   number = as.numeric(arg7),
-                   verbose = FALSE)
-}else if (arg4 == "treebagFuncs"){
-ctrl1 <- rfeControl(functions = treebagFuncs,
-                   method = arg5 ,
-                   repeats = as.numeric(arg6),
-                   number = as.numeric(arg7),
-                   verbose = FALSE)
-}else {
-
-ctrl1 <- rfeControl(functions = nbFuncs,
-                   method = arg5 ,
-                   repeats = as.numeric(arg6),
-                   number = as.numeric(arg7),
-                   verbose = FALSE)
-}
-
-
-
- 
-Profile <- rfe(dx, dy,sizes = subsets,rfeControl = ctrl1)
-
-pred11 <- predictors(Profile)
-save(Profile,file=arg2)
-dataX <- rawData[,pred11]
-dataY <- rawData$outcome
-
-save(dataX,dataY,file=arg3)
-rm(dataX)
-rm(dataY)
-
--- a/featureselect/tool_dependencies.xml	Sun Oct 02 05:36:30 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-
-<set_environment version="1.0">
-        <environment_variable name="FEATURE_SELECTION_R" action="set_to">$REPOSITORY_INSTALL_DIR</environment_variable>   
-        </set_environment>
-    <package name="R" version="3.2.0">
-                        <repository changeset_revision="7833b0ebf8d6" name="package_r_3_2_0" owner="iuc" prior_installation_required="True" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-                        </package>
-    <package name="caret-tools" version="1.0.0">
-                        <repository changeset_revision="e5faefaf1037" name="caret_tool_test1" owner="deepakjadmin" prior_installation_required="True" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-                        </package>
-</tool_dependency>
--- a/featureselect/toolrfe.xml	Sun Oct 02 05:36:30 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,87 +0,0 @@
-<tool id="featureSelectR" name="Feature Selection" >
-<description>
- This tool used for extract best feature subsets cantaining input data for model building.   
-</description>
-<!--command interpreter="bash">step3run.sh $file1 $model $output1  2>/dev/null </command-->
-<requirements>
-        <requirement type="set_environment">FEATURE_SELECTION_R</requirement>
-                 <requirement type="set_environment">R_ROOT_DIR</requirement>
-    <requirement type="package" version="3.2.0">R</requirement>
-    <requirement type="package" version="1.0.0">caret-tools</requirement>
-</requirements>
-<command interpreter="Rscript">feature_selection.R $input $profile $finalset $function1 $resampling $repeat $number $corcutoff > /dev/null 2>&amp;1 </command>
-
-<inputs>
-<param name="input"  type="data" label="Select input data file" help="input .RData file" />
-<param name="function1" type="select" display="radio" label="Select appropriate function for algorithm"  >
-                <option value="rfFuncs" selected="true">random forest based function </option>
-                <option value="lmFuncs">linear model based function</option>
-                <option value="treebagFuncs">treebag(CART) based function</option>
-                <option value="nbFuncs">neive bayes based function</option>
-</param>
-
-<param name="corcutoff"  type="float" value= "0.8" min="0.0" max = "1.0" label="Select correlation cutoff" help="values bewteen 0-1. fileds above cufoff value removed from data " />
-<param name="resampling" type="select" label="Select appropriate resampling method"  >
-                <option value="repeatedcv" selected="true">repeatedcv </option>
-                <option value="boot">boot</option>
-                <option value="cv">cv</option>
-                <option value="boot632">boot632</option>
-</param>
- <param name="repeat" type="select" label="Set Number of times to repeat" help="default is 3 ">
-               <option value="3" selected="true">3</option>
-                <option value="1">1</option>
-                <option value="5">5</option>
-                <option value="10">10</option>
-        </param>
-<param name="number" type="select" label="Set Number of times Resample" help="default is 10">
-                <option value="10" selected="true">10</option>
-                <option value="5">5</option>
-                <option value="15">15</option>
-                <option value="20">20</option>
-                <option value="25">25</option>
-        </param>
-
-</inputs>
-<outputs>
-<data type="data" format="data" name="profile" label="$function1-profile" />
-<data type= "data" format="data" name="finalset" label="Selected_feature.RData "/>
-</outputs>
-<help>
-.. class:: infomark
-
-**RFE based feature selection for classification and regression**
-
-Input file must be  RData file obtained by converting csv file in to RData.
-
-output  "Selected_feature.RData"  file used for model building purpose.While profile
-
-represents feature selection model.
-
-Correlation cutoff value is desired for choosing independent variables For example
-
-Cutoff value = 0.8 removes all descriptors sharing equal or highet correlation values.
-
-User may choose varous resampling methods in combination with repeats and times of resample.
-
-
-
-</help>
-
-
-<tests>
-<test>
-          <param name="input" value="testinput.RData"/>
-          <param name="function1"  value="rfFuncs" />
-          <param name="corcutoff"  value="0.6" />
-          <param name="resampling"  value="repeatedcv" />
-          <param name="repeat"  value="1" />
-          <param name="number"  value="5" />
-
-
-          <output name="profile" file="rfprofile.RData" compare="sim_size" delta="2000000" />
-          <output name="finalset" file="selected_fet.RData" compare="sim_size" delta="2000000"/>
-    </test>
-</tests>
-
-
-</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Tue Jan 03 02:26:17 2017 -0500
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<tool_dependency>
+
+<set_environment version="1.0">
+        <environment_variable name="FEATURE_SELECTION_R" action="set_to">$REPOSITORY_INSTALL_DIR</environment_variable>   
+        </set_environment>
+    <package name="R" version="3.2.0">
+                        <repository changeset_revision="7833b0ebf8d6" name="package_r_3_2_0" owner="iuc" prior_installation_required="True" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+                        </package>
+    <package name="caret-tools" version="1.0.0">
+                        <repository changeset_revision="e5faefaf1037" name="caret_tool_test1" owner="deepakjadmin" prior_installation_required="True" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+                        </package>
+</tool_dependency>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/toolrfe.xml	Tue Jan 03 02:26:17 2017 -0500
@@ -0,0 +1,92 @@
+<tool id="featureSelectR" name="Feature Selection" >
+<description>
+ This tool extracts the best feature subsets from the input data for model building.
+</description>
+<!--command interpreter="bash">step3run.sh $file1 $model $output1  2>/dev/null </command-->
+<requirements>
+        <requirement type="set_environment">FEATURE_SELECTION_R</requirement>
+                 <requirement type="set_environment">R_ROOT_DIR</requirement>
+    <requirement type="package" version="3.2.0">R</requirement>
+    <requirement type="package" version="1.0.0">caret-tools</requirement>
+</requirements>
+<command interpreter="Rscript">feature_selection.R $input $profile $finalset $function1 $resampling $repeat $number $corcutoff $SAMPLING> /dev/null 2>&amp;1 </command>
+
+<inputs>
+<param name="input"  type="data" label="Select input data file" help="input .RData file" />
+<param name="SAMPLING" type="select"  label="3(i). Select Sampling Method for imbalanced data" help="Default is no sampling. You may choose downsample or upsample" >
+                <option value="garBage" selected="true">No Sampling</option>
+                <option value="downsampling">downsample</option>
+                <option value="upsampling">upsample</option>
+        </param>
+<param name="function1" type="select" display="radio" label="Select appropriate function for algorithm"  >
+                <option value="rfFuncs" selected="true">random forest based function </option>
+                <option value="lmFuncs">linear model based function</option>
+                <option value="treebagFuncs">treebag(CART) based function</option>
+                <option value="nbFuncs">naive Bayes based function</option>
+</param>
+
+<param name="corcutoff"  type="float" value= "0.8" min="0.0" max = "1.0" label="Select correlation cutoff" help="Values between 0 and 1. Fields above the cutoff value are removed from the data." />
+<param name="resampling" type="select" label="Select appropriate resampling method"  >
+                <option value="repeatedcv" selected="true">repeatedcv </option>
+                <option value="boot">boot</option>
+                <option value="cv">cv</option>
+                <option value="boot632">boot632</option>
+</param>
+ <param name="repeat" type="select" label="Set Number of times to repeat" help="default is 3 ">
+               <option value="3" selected="true">3</option>
+                <option value="1">1</option>
+                <option value="5">5</option>
+                <option value="10">10</option>
+        </param>
+<param name="number" type="select" label="Set Number of times Resample" help="default is 10">
+                <option value="10" selected="true">10</option>
+                <option value="5">5</option>
+                <option value="15">15</option>
+                <option value="20">20</option>
+                <option value="25">25</option>
+        </param>
+
+</inputs>
+<outputs>
+<data type="data" format="data" name="profile" label="$function1-profile" />
+<data type= "data" format="data" name="finalset" label="Selected_feature.RData "/>
+</outputs>
+<help>
+.. class:: infomark
+
+**RFE based feature selection for classification and regression**
+
+The input must be an RData file obtained by converting a CSV file to RData.
+
+The output "Selected_feature.RData" file is used for model building, while the profile
+
+represents the feature selection model.
+
+The correlation cutoff is used to select independent variables. For example:
+
+a cutoff value of 0.8 removes all descriptors sharing equal or higher correlation values.
+
+The user may choose various resampling methods in combination with repeats and the number of resamples.
+
+
+
+</help>
+
+
+<tests>
+<test>
+          <param name="input" value="testinput.RData"/>
+          <param name="function1"  value="rfFuncs" />
+          <param name="corcutoff"  value="0.6" />
+          <param name="resampling"  value="repeatedcv" />
+          <param name="repeat"  value="1" />
+          <param name="number"  value="5" />
+          <!-- Must equal a declared SAMPLING option value; "garBage" is the "No Sampling" option. -->
+          <param name="SAMPLING"  value="garBage" />
+          <output name="profile" file="rfprofile.RData" compare="sim_size" delta="2000000" />
+          <output name="finalset" file="selected_fet.RData" compare="sim_size" delta="2000000"/>
+    </test>
+</tests>
+
+
+</tool>