Mercurial > repos > deepakjadmin > feature_selection_test1
changeset 5:016c69bfb2a1 draft
Uploaded
author | deepakjadmin |
---|---|
date | Tue, 03 Jan 2017 02:26:17 -0500 |
parents | 5364cf43a8c1 |
children | b84589b7c014 |
files | feature_selection.R featureselect/feature_selection.R featureselect/tool_dependencies.xml featureselect/toolrfe.xml tool_dependencies.xml toolrfe.xml |
diffstat | 6 files changed, 252 insertions(+), 216 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/feature_selection.R Tue Jan 03 02:26:17 2017 -0500 @@ -0,0 +1,147 @@ +args <- commandArgs(T) + +arg1 <- args[1] +arg2 <- args[2] +arg3 <- args[3] +arg4 <- args[4] +arg5 <- args[5] +arg6 <- args[6] +arg7 <- args[7] +arg8 <- args[8] +arg9 <- args[9] +library(caret) +load(arg1) + +#RAWDATA <- dataX +#RAWDATA$outcome <- dataY + + +########################### +Smpling <- arg9 + +if(Smpling=="downsampling") +{ +dwnsmpl <- downSample(dataX,dataY) +RAWDATA <- dwnsmpl[,1:length(dwnsmpl)-1] +RAWDATA$outcome <- dwnsmpl[,length(dwnsmpl)] +dataX <- RAWDATA[,1:length(dwnsmpl)-1] +dataY <- RAWDATA[,"outcome"] +remove("dwnsmpl") +}else if(Smpling=="upsampling"){ +upsmpl <- upSample(dataX,dataY) +RAWDATA <- upsmpl[,1:length(upsmpl)-1] +RAWDATA$outcome <- upsmpl[,length(upsmpl)] +dataX <- RAWDATA[,1:length(upsmpl)-1] +dataY <- RAWDATA[,"outcome"] +remove("upsmpl") +}else { +RAWDATA <- dataX +RAWDATA$outcome <- dataY +} + + + + +########################## + + +rawData <- dataX +predictorNames <- names(rawData) + +isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric) +if(any(!isNum)) stop("all predictors in rawData should be numeric") + +colRate <- apply(rawData[, predictorNames, drop = FALSE], + 2, function(x) mean(is.na(x))) +colExclude <- colRate > 0.1 + if(any(colExclude)){ + predictorNames <- predictorNames[-which(colExclude)] + rawData <- RAWDATA[, c(predictorNames,"outcome")] + } else { + rawData <- RAWDATA + } + rowRate <- apply(rawData[, predictorNames, drop = FALSE], + 1, function(x) mean(is.na(x))) + + +rowExclude <- rowRate > 0 + if(any(rowExclude)){ + rawData <- rawData[!rowExclude, ] + ##hasMissing <- apply(rawData[, predictorNames, drop = FALSE], + ##1, function(x) mean(is.na(x))) + +############################################################################ + + +############################################################################### + } else { + rawData <- rawData[complete.cases(rawData),] + + } + +set.seed(2) + +#print(dim(dataX)) +#print(dim(rawData)) +#print(length(dataY)) + +nzv <- nearZeroVar(rawData[,1:(length(rawData) - 1)]) + if(length(nzv) > 0) { + #nzvVars <- names(rawData)[nzv] + rawData <- rawData[,-nzv] + #rawData$outcome <- dataY + } + +predictorNames <- names(rawData)[names(rawData) != "outcome"] + +dx <- rawData[,1:length(rawData)-1] +dy <- rawData[,length(rawData)] +corrThresh <- as.numeric(arg8) +highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),corrThresh) +dx <- dx[, -highCorr] +subsets <- seq(1,length(dx),by=5) +normalization <- preProcess(dx) +dx <- predict(normalization, dx) +dx <- as.data.frame(dx) + +if (arg4 == "lmFuncs"){ +ctrl1 <- rfeControl(functions = lmFuncs, + method = arg5 , + repeats = as.numeric(arg6), + number = as.numeric(arg7), + verbose = FALSE) +} else if(arg4 == "rfFuncs"){ +ctrl1 <- rfeControl(functions = rfFuncs, + method = arg5 , + repeats = as.numeric(arg6), + number = as.numeric(arg7), + verbose = FALSE) +}else if (arg4 == "treebagFuncs"){ +ctrl1 <- rfeControl(functions = treebagFuncs, + method = arg5 , + repeats = as.numeric(arg6), + number = as.numeric(arg7), + verbose = FALSE) +}else { + +ctrl1 <- rfeControl(functions = nbFuncs, + method = arg5 , + repeats = as.numeric(arg6), + number = as.numeric(arg7), + verbose = FALSE) +} + + + + +Profile <- rfe(dx, dy,sizes = subsets,rfeControl = ctrl1) + +pred11 <- predictors(Profile) +save(Profile,file=arg2) +dataX <- rawData[,pred11] +dataY <- rawData$outcome + +save(dataX,dataY,file=arg3) +rm(dataX) +rm(dataY) +
--- a/featureselect/feature_selection.R Sun Oct 02 05:36:30 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,116 +0,0 @@ -args <- commandArgs(T) - -arg1 <- args[1] -arg2 <- args[2] -arg3 <- args[3] -arg4 <- args[4] -arg5 <- args[5] -arg6 <- args[6] -arg7 <- args[7] -arg8 <- args[8] - -library(caret) -load(arg1) - -RAWDATA <- dataX -RAWDATA$outcome <- dataY -rawData <- dataX -predictorNames <- names(rawData) - -isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric) -if(any(!isNum)) stop("all predictors in rawData should be numeric") - -colRate <- apply(rawData[, predictorNames, drop = FALSE], - 2, function(x) mean(is.na(x))) -colExclude <- colRate > 0.1 - if(any(colExclude)){ - predictorNames <- predictorNames[-which(colExclude)] - rawData <- RAWDATA[, c(predictorNames,"outcome")] - } else { - rawData <- RAWDATA - } - rowRate <- apply(rawData[, predictorNames, drop = FALSE], - 1, function(x) mean(is.na(x))) - - -rowExclude <- rowRate > 0 - if(any(rowExclude)){ - rawData <- rawData[!rowExclude, ] - ##hasMissing <- apply(rawData[, predictorNames, drop = FALSE], - ##1, function(x) mean(is.na(x))) - -############################################################################ - - -############################################################################### - } else { - rawData <- rawData[complete.cases(rawData),] - - } - -set.seed(2) - -#print(dim(dataX)) -#print(dim(rawData)) -#print(length(dataY)) - -nzv <- nearZeroVar(rawData[,1:(length(rawData) - 1)]) - if(length(nzv) > 0) { - #nzvVars <- names(rawData)[nzv] - rawData <- rawData[,-nzv] - #rawData$outcome <- dataY - } - -predictorNames <- names(rawData)[names(rawData) != "outcome"] - -dx <- rawData[,1:length(rawData)-1] -dy <- rawData[,length(rawData)] -corrThresh <- as.numeric(arg8) -highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),corrThresh) -dx <- dx[, -highCorr] -subsets <- seq(1,length(dx),by=5) -normalization <- preProcess(dx) -dx <- predict(normalization, dx) -dx <- as.data.frame(dx) - -if (arg4 == "lmFuncs"){ -ctrl1 <- rfeControl(functions = lmFuncs, - method = arg5 , - repeats = as.numeric(arg6), - number = as.numeric(arg7), - verbose = FALSE) -} else if(arg4 == "rfFuncs"){ -ctrl1 <- rfeControl(functions = rfFuncs, - method = arg5 , - repeats = as.numeric(arg6), - number = as.numeric(arg7), - verbose = FALSE) -}else if (arg4 == "treebagFuncs"){ -ctrl1 <- rfeControl(functions = treebagFuncs, - method = arg5 , - repeats = as.numeric(arg6), - number = as.numeric(arg7), - verbose = FALSE) -}else { - -ctrl1 <- rfeControl(functions = nbFuncs, - method = arg5 , - repeats = as.numeric(arg6), - number = as.numeric(arg7), - verbose = FALSE) -} - - - - -Profile <- rfe(dx, dy,sizes = subsets,rfeControl = ctrl1) - -pred11 <- predictors(Profile) -save(Profile,file=arg2) -dataX <- rawData[,pred11] -dataY <- rawData$outcome - -save(dataX,dataY,file=arg3) -rm(dataX) -rm(dataY) -
--- a/featureselect/tool_dependencies.xml Sun Oct 02 05:36:30 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - -<set_environment version="1.0"> - <environment_variable name="FEATURE_SELECTION_R" action="set_to">$REPOSITORY_INSTALL_DIR</environment_variable> - </set_environment> - <package name="R" version="3.2.0"> - <repository changeset_revision="7833b0ebf8d6" name="package_r_3_2_0" owner="iuc" prior_installation_required="True" toolshed="https://testtoolshed.g2.bx.psu.edu" /> - </package> - <package name="caret-tools" version="1.0.0"> - <repository changeset_revision="e5faefaf1037" name="caret_tool_test1" owner="deepakjadmin" prior_installation_required="True" toolshed="https://testtoolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency>
--- a/featureselect/toolrfe.xml Sun Oct 02 05:36:30 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ -<tool id="featureSelectR" name="Feature Selection" > -<description> - This tool used for extract best feature subsets cantaining input data for model building. -</description> -<!--command interpreter="bash">step3run.sh $file1 $model $output1 2>/dev/null </command--> -<requirements> - <requirement type="set_environment">FEATURE_SELECTION_R</requirement> - <requirement type="set_environment">R_ROOT_DIR</requirement> - <requirement type="package" version="3.2.0">R</requirement> - <requirement type="package" version="1.0.0">caret-tools</requirement> -</requirements> -<command interpreter="Rscript">feature_selection.R $input $profile $finalset $function1 $resampling $repeat $number $corcutoff > /dev/null 2>&1 </command> - -<inputs> -<param name="input" type="data" label="Select input data file" help="input .RData file" /> -<param name="function1" type="select" display="radio" label="Select appropriate function for algorithm" > - <option value="rfFuncs" selected="true">random forest based function </option> - <option value="lmFuncs">linear model based function</option> - <option value="treebagFuncs">treebag(CART) based function</option> - <option value="nbFuncs">neive bayes based function</option> -</param> - -<param name="corcutoff" type="float" value= "0.8" min="0.0" max = "1.0" label="Select correlation cutoff" help="values bewteen 0-1. fileds above cufoff value removed from data " /> -<param name="resampling" type="select" label="Select appropriate resampling method" > - <option value="repeatedcv" selected="true">repeatedcv </option> - <option value="boot">boot</option> - <option value="cv">cv</option> - <option value="boot632">boot632</option> -</param> - <param name="repeat" type="select" label="Set Number of times to repeat" help="default is 3 "> - <option value="3" selected="true">3</option> - <option value="1">1</option> - <option value="5">5</option> - <option value="10">10</option> - </param> -<param name="number" type="select" label="Set Number of times Resample" help="default is 10"> - <option value="10" selected="true">10</option> - <option value="5">5</option> - <option value="15">15</option> - <option value="20">20</option> - <option value="25">25</option> - </param> - -</inputs> -<outputs> -<data type="data" format="data" name="profile" label="$function1-profile" /> -<data type= "data" format="data" name="finalset" label="Selected_feature.RData "/> -</outputs> -<help> -.. class:: infomark - -**RFE based feature selection for classification and regression** - -Input file must be RData file obtained by converting csv file in to RData. - -output "Selected_feature.RData" file used for model building purpose.While profile - -represents feature selection model. - -Correlation cutoff value is desired for choosing independent variables For example - -Cutoff value = 0.8 removes all descriptors sharing equal or highet correlation values. - -User may choose varous resampling methods in combination with repeats and times of resample. - - - -</help> - - -<tests> -<test> - <param name="input" value="testinput.RData"/> - <param name="function1" value="rfFuncs" /> - <param name="corcutoff" value="0.6" /> - <param name="resampling" value="repeatedcv" /> - <param name="repeat" value="1" /> - <param name="number" value="5" /> - - - <output name="profile" file="rfprofile.RData" compare="sim_size" delta="2000000" /> - <output name="finalset" file="selected_fet.RData" compare="sim_size" delta="2000000"/> - </test> -</tests> - - -</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Tue Jan 03 02:26:17 2017 -0500 @@ -0,0 +1,13 @@ +<?xml version="1.0"?> +<tool_dependency> + +<set_environment version="1.0"> + <environment_variable name="FEATURE_SELECTION_R" action="set_to">$REPOSITORY_INSTALL_DIR</environment_variable> + </set_environment> + <package name="R" version="3.2.0"> + <repository changeset_revision="7833b0ebf8d6" name="package_r_3_2_0" owner="iuc" prior_installation_required="True" toolshed="https://testtoolshed.g2.bx.psu.edu" /> + </package> + <package name="caret-tools" version="1.0.0"> + <repository changeset_revision="e5faefaf1037" name="caret_tool_test1" owner="deepakjadmin" prior_installation_required="True" toolshed="https://testtoolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/toolrfe.xml Tue Jan 03 02:26:17 2017 -0500 @@ -0,0 +1,92 @@ +<tool id="featureSelectR" name="Feature Selection" > +<description> + This tool used for extract best feature subsets cantaining input data for model building. +</description> +<!--command interpreter="bash">step3run.sh $file1 $model $output1 2>/dev/null </command--> +<requirements> + <requirement type="set_environment">FEATURE_SELECTION_R</requirement> + <requirement type="set_environment">R_ROOT_DIR</requirement> + <requirement type="package" version="3.2.0">R</requirement> + <requirement type="package" version="1.0.0">caret-tools</requirement> +</requirements> +<command interpreter="Rscript">feature_selection.R $input $profile $finalset $function1 $resampling $repeat $number $corcutoff $SAMPLING> /dev/null 2>&1 </command> + +<inputs> +<param name="input" type="data" label="Select input data file" help="input .RData file" /> +<param name="SAMPLING" type="select" label="3(i). Select Sampling Method for imbalanced data" help="Defualt is with No sampling. you may choose downsample or upsample" > + <option value="garBage" selected="true">No Sampling</option> + <option value="downsampling">downsample</option> + <option value="upsampling">upsample</option> + </param> +<param name="function1" type="select" display="radio" label="Select appropriate function for algorithm" > + <option value="rfFuncs" selected="true">random forest based function </option> + <option value="lmFuncs">linear model based function</option> + <option value="treebagFuncs">treebag(CART) based function</option> + <option value="nbFuncs">neive bayes based function</option> +</param> + +<param name="corcutoff" type="float" value= "0.8" min="0.0" max = "1.0" label="Select correlation cutoff" help="values bewteen 0-1. fileds above cufoff value removed from data " /> +<param name="resampling" type="select" label="Select appropriate resampling method" > + <option value="repeatedcv" selected="true">repeatedcv </option> + <option value="boot">boot</option> + <option value="cv">cv</option> + <option value="boot632">boot632</option> +</param> + <param name="repeat" type="select" label="Set Number of times to repeat" help="default is 3 "> + <option value="3" selected="true">3</option> + <option value="1">1</option> + <option value="5">5</option> + <option value="10">10</option> + </param> +<param name="number" type="select" label="Set Number of times Resample" help="default is 10"> + <option value="10" selected="true">10</option> + <option value="5">5</option> + <option value="15">15</option> + <option value="20">20</option> + <option value="25">25</option> + </param> + +</inputs> +<outputs> +<data type="data" format="data" name="profile" label="$function1-profile" /> +<data type= "data" format="data" name="finalset" label="Selected_feature.RData "/> +</outputs> +<help> +.. class:: infomark + +**RFE based feature selection for classification and regression** + +Input file must be RData file obtained by converting csv file in to RData. + +output "Selected_feature.RData" file used for model building purpose.While profile + +represents feature selection model. + +Correlation cutoff value is desired for choosing independent variables For example + +Cutoff value = 0.8 removes all descriptors sharing equal or highet correlation values. + +User may choose varous resampling methods in combination with repeats and times of resample. + + + +</help> + + +<tests> +<test> + <param name="input" value="testinput.RData"/> + <param name="function1" value="rfFuncs" /> + <param name="corcutoff" value="0.6" /> + <param name="resampling" value="repeatedcv" /> + <param name="repeat" value="1" /> + <param name="number" value="5" /> + <param name="SAMPLING" value="garb" /> + + <output name="profile" file="rfprofile.RData" compare="sim_size" delta="2000000" /> + <output name="finalset" file="selected_fet.RData" compare="sim_size" delta="2000000"/> + </test> +</tests> + + +</tool>