# Command-line interface: ten positional arguments, read in order.
#   arg1  path to an .RData file that is load()ed; the script then reads
#         `dataX` and `dataY` from it
#   arg2  output path for the fitted rfe `Profile` object
#   arg3  output path for the reduced `dataX` / `dataY`
#   arg4  rfe helper set: "lmFuncs", "rfFuncs" or "treebagFuncs"; any other
#         value selects nbFuncs
#   arg5  `method` passed to rfeControl()
#   arg6  `repeats` passed to rfeControl() (converted with as.numeric)
#   arg7  `number` passed to rfeControl() (converted with as.numeric)
#   arg8  correlation cutoff passed to findCorrelation()
#   arg9  class balancing: "downsampling" or "upsampling"; any other value
#         leaves the data as loaded
#   arg10 number of cores (1 = no parallel backend, > 1 = doMC backend)
args <- commandArgs(trailingOnly = TRUE)

# Every argument is dereferenced further down, so a short argument list would
# otherwise only surface later as an obscure NA-related error.
if (length(args) < 10) {
  stop("expected 10 command-line arguments, got ", length(args),
       call. = FALSE)
}

arg1 <- args[1]
arg2 <- args[2]
arg3 <- args[3]
arg4 <- args[4]
arg5 <- args[5]
arg6 <- args[6]
arg7 <- args[7]
arg8 <- args[8]
arg9 <- args[9]
arg10 <- args[10]
library(caret)
library(doMC)

# Load the input workspace. The rest of the script reads `dataX` (the
# predictors; used as a data frame below) and `dataY` (the outcome) from it,
# so fail early with a clear message when the file does not define them.
loadedNames <- load(arg1)
missingInputs <- setdiff(c("dataX", "dataY"), loadedNames)
if (length(missingInputs) > 0) {
  stop("input file '", arg1, "' does not define: ",
       paste(missingInputs, collapse = ", "), call. = FALSE)
}
rm(loadedNames, missingInputs)
###########################
# Optional class balancing, selected by arg9.
#
# After this section:
#   RAWDATA  predictors plus a trailing `outcome` column
#   dataX    predictors only
#   dataY    outcome only
Smpling <- arg9

if (Smpling == "downsampling" || Smpling == "upsampling") {
  resampled <- if (Smpling == "downsampling") {
    downSample(dataX, dataY)
  } else {
    upSample(dataX, dataY)
  }

  # The class column is the last column of the resampled frame. Index the
  # predictors explicitly: the previous `1:length(x)-1` parsed as
  # `(1:length(x)) - 1` and only worked because index 0 is silently ignored.
  # `drop = FALSE` keeps a data frame even when there is a single predictor.
  nPredictors <- ncol(resampled) - 1
  RAWDATA <- resampled[, seq_len(nPredictors), drop = FALSE]
  RAWDATA$outcome <- resampled[[ncol(resampled)]]
  dataX <- RAWDATA[, seq_len(nPredictors), drop = FALSE]
  dataY <- RAWDATA[["outcome"]]
  rm(resampled, nPredictors)
} else {
  # No resampling requested: use the data exactly as loaded.
  RAWDATA <- dataX
  RAWDATA$outcome <- dataY
}
##########################
# Type check and missing-value filtering.
#
# After this section `rawData` holds the retained predictors plus `outcome`
# and contains no missing values.

rawData <- dataX
predictorNames <- names(rawData)

# Test each column on its own. apply() would first coerce the data frame to a
# single-typed matrix, so the check would no longer be per column.
isNum <- vapply(rawData[, predictorNames, drop = FALSE], is.numeric,
                logical(1))
if (any(!isNum)) stop("all predictors in rawData should be numeric")

# Drop predictors with more than 10% missing values.
colRate <- vapply(rawData[, predictorNames, drop = FALSE],
                  function(x) mean(is.na(x)), numeric(1))
colExclude <- colRate > 0.1
if (any(colExclude)) {
  predictorNames <- predictorNames[!colExclude]
  rawData <- RAWDATA[, c(predictorNames, "outcome")]
} else {
  rawData <- RAWDATA
}

# Drop every row that still has a missing value in a retained predictor or in
# the outcome. Previously rows with a missing outcome were removed only when
# no predictor value was missing anywhere; both cases are now treated alike.
rawData <- rawData[complete.cases(rawData), ]
# Seed the random number generator before the steps that follow.
set.seed(2)

# Look for near-zero-variance predictors. The last column (the outcome) is
# left out of the check, so the positions returned by nearZeroVar() index
# the columns of rawData directly.
nzv <- nearZeroVar(rawData[, 1:(ncol(rawData) - 1)])
if (length(nzv) >= 1) {
  keptColumns <- setdiff(seq_along(rawData), nzv)
  rawData <- rawData[, keptColumns]
  rm(keptColumns)
}
predictorNames <- names(rawData)[names(rawData) != "outcome"]

# Split into predictors (dx) and outcome (dy).
dx <- rawData[, predictorNames, drop = FALSE]
dy <- rawData[["outcome"]]

corrThresh <- as.numeric(arg8)
if (is.na(corrThresh)) {
  stop("correlation cutoff (arg8) must be numeric, got '", arg8, "'",
       call. = FALSE)
}

# Remove predictors whose pairwise correlation exceeds the cutoff.
# findCorrelation() returns integer(0) when nothing exceeds it, and
# `dx[, -integer(0)]` would select zero columns, i.e. silently discard
# every predictor, so only subset when there is something to remove.
highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),
                            cutoff = corrThresh)
if (length(highCorr) > 0) {
  dx <- dx[, -highCorr, drop = FALSE]
}

# Candidate subset sizes for rfe(): 1, 6, 11, ... up to the predictor count.
subsets <- seq(1, length(dx), by = 5)

# Apply preProcess() with its default settings to the predictors.
normalization <- preProcess(dx)
dx <- predict(normalization, dx)
dx <- as.data.frame(dx)
# Choose the rfe helper-function set named by arg4; any value other than the
# three listed here falls back to nbFuncs.
rfeFunctions <- if (arg4 == "lmFuncs") {
  lmFuncs
} else if (arg4 == "rfFuncs") {
  rfFuncs
} else if (arg4 == "treebagFuncs") {
  treebagFuncs
} else {
  nbFuncs
}

# The control object is otherwise the same for every helper set: resampling
# method from arg5, repeats from arg6, number from arg7.
ctrl1 <- rfeControl(
  functions = rfeFunctions,
  method = arg5,
  repeats = as.numeric(arg6),
  number = as.numeric(arg7),
  verbose = FALSE
)
rm(rfeFunctions)
# Run recursive feature elimination, then save the fitted profile to arg2 and
# the data restricted to the selected predictors to arg3.
#
# arg10 is the number of cores: 1 runs without registering a parallel
# backend, anything above 1 registers a doMC backend with that many cores.
# A value that is not a number, or is below 1, is rejected up front instead
# of failing inside `if` with a missing-value error.
nCores <- as.numeric(arg10)
if (is.na(nCores) || nCores < 1) {
  stop("something went wrong. please see the parameters")
}
if (nCores > 1) {
  registerDoMC(cores = nCores)
}

Profile <- rfe(dx, dy, sizes = subsets, rfeControl = ctrl1)

pred11 <- predictors(Profile)
save(Profile, file = arg2)

# Saved predictors come from rawData, not from the preprocessed dx.
# drop = FALSE keeps dataX a data frame when a single predictor is selected.
dataX <- rawData[, pred11, drop = FALSE]
dataY <- rawData$outcome

save(dataX, dataY, file = arg3)
rm(dataX, dataY)