# Positional command-line arguments (trailing arguments only):
#   arg1  path handed to load() to read the input workspace
#   arg2  file the fitted rfe profile is saved to
#   arg3  file the reduced dataX/dataY pair is saved to
#   arg4  name of the caret rfe function set ("lmFuncs", "rfFuncs",
#         "treebagFuncs"; any other value selects nbFuncs)
#   arg5  resampling method passed to rfeControl()
#   arg6  value for rfeControl(repeats = ...), converted with as.numeric()
#   arg7  value for rfeControl(number = ...), converted with as.numeric()
args <- commandArgs(trailingOnly = TRUE)

# arg1-arg4 are used unconditionally further down. Without this check a
# missing output path would only surface after rfe() has already been run.
if (length(args) < 4L) {
  stop("expected at least 4 arguments: <input.RData> <profile.out> ",
       "<data.out> <rfeFuncs> [method] [repeats] [number]",
       call. = FALSE)
}

arg1 <- args[1]
arg2 <- args[2]
arg3 <- args[3]
arg4 <- args[4]
arg5 <- args[5]
arg6 <- args[6]
arg7 <- args[7]

library(caret)

# The workspace file is expected to provide `dataX` and `dataY`, which are
# read right below -- NOTE(review): confirm against whatever writes arg1.
load(arg1)
print("data loaded")

# rawData is the working copy of the predictors that later steps filter;
# RAWDATA keeps the predictors as loaded, plus the outcome as an extra column.
rawData <- dataX
predictorNames <- names(rawData)

RAWDATA <- dataX
RAWDATA$outcome <- dataY

# Abort unless every predictor column is reported as numeric.
isNum <- apply(X = rawData[, predictorNames, drop = FALSE],
               MARGIN = 2,
               FUN = is.numeric)
if (!all(isNum)) {
  stop("all predictors in rawData should be numeric")
}

# ---- Missing-data filter ----------------------------------------------------

# Step 1: drop predictors whose fraction of NA values exceeds 0.001.
colRate <- colMeans(is.na(rawData[, predictorNames, drop = FALSE]))
colExclude <- colRate > 0.001
if (any(colExclude)) {
  predictorNames <- predictorNames[!colExclude]
  rawData <- rawData[, predictorNames, drop = FALSE]
}

# Step 2: per-row fraction of NA values over the retained predictors.
# This is computed unconditionally: it used to be assigned only when a column
# had been dropped, which left `rowRate` undefined in the other case.
rowRate <- rowMeans(is.na(rawData[, predictorNames, drop = FALSE]))
rowExclude <- rowRate > 0.00000001

if (any(rowExclude)) {
  # Some rows still contain NA values: drop them, then pick the outcome
  # values of the surviving rows by row name.
  # drop = FALSE keeps a data frame even when a single predictor is left.
  rawData <- rawData[!rowExclude, , drop = FALSE]
  names11 <- rownames(rawData)

  if (is.factor(dataY)) {
    # Go through the plain values so t() yields a one-row matrix whose
    # columns can be labelled with the original row names; the factor is
    # rebuilt from the values that remain.
    dataY <- t(as.vector(dataY))
    colnames(dataY) <- rownames(dataX)
    dataY <- as.factor(dataY[, names11])
  } else {
    dataY <- t(dataY)
    colnames(dataY) <- rownames(dataX)
    dataY <- dataY[, names11]
  }
} else {
  # No NA values among the retained predictors. Keep only rows that are
  # complete across the retained predictors and the outcome, so that the
  # columns excluded in step 1 stay excluded (the earlier version rebuilt the
  # data from every RAWDATA column and called the misspelled `lenght()`).
  rawData <- RAWDATA[, c(predictorNames, "outcome"), drop = FALSE]
  rawData <- rawData[complete.cases(rawData), , drop = FALSE]
  dataX <- rawData[, predictorNames, drop = FALSE]
  dataY <- rawData[["outcome"]]

  print(dim(dataX))
  print(dim(rawData))
  rawData <- dataX

  print(dim(rawData))
}

set.seed(2)

# Diagnostics: sizes of the predictor table, the filtered table and the outcome.
print(dim(dataX))
print(dim(rawData))
print(length(dataY))

# Snapshot of the filtered data, written to the current working directory.
save(rawData, dataY, file = "check.RData")

# Remove the predictors flagged by caret's nearZeroVar().
nzv <- nearZeroVar(rawData)
if (length(nzv) > 0) {
  nzvVars <- names(rawData)[nzv]
  # drop = FALSE: if only one predictor survives, rawData must stay a data
  # frame so that the `$outcome` assignment below still adds a column.
  rawData <- rawData[, -nzv, drop = FALSE]
}

# Attach the outcome as the last column.
rawData$outcome <- dataY

predictorNames <- names(rawData)[names(rawData) != "outcome"]

# Split into predictors (every column but the last) and outcome (last column).
# A negative index with drop = FALSE replaces `1:length(rawData)-1`, which only
# worked because the 0 produced by `(1:n) - 1` is ignored when indexing, and
# which collapsed to a vector when a single predictor was present.
dx <- rawData[, -ncol(rawData), drop = FALSE]
dy <- rawData[, ncol(rawData)]

# Remove predictors that findCorrelation() flags at the 0.90 cutoff.
corrThresh <- 0.90
highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"), corrThresh)
# Guard the empty case: when nothing is flagged, `dx[, -highCorr]` would be
# `dx[, integer(0)]` and silently select no columns at all.
if (length(highCorr) > 0) {
  dx <- dx[, -highCorr, drop = FALSE]
}

# Candidate subset sizes for rfe(): 1, 6, 11, ... up to the number of predictors.
subsets <- seq(1, length(dx), by = 5)

# Apply caret's preProcess() with its default settings to the predictors.
normalization <- preProcess(dx)
dx <- predict(normalization, dx)
dx <- as.data.frame(dx)

# Build the rfe control object. Only the function set depends on arg4; the
# resampling settings come from arg5-arg7 in every case. Any arg4 value other
# than the three named ones selects nbFuncs.
ctrl1 <- rfeControl(
  functions =
    if (arg4 == "lmFuncs") {
      lmFuncs
    } else if (arg4 == "rfFuncs") {
      rfFuncs
    } else if (arg4 == "treebagFuncs") {
      treebagFuncs
    } else {
      nbFuncs
    },
  method = arg5,
  repeats = as.numeric(arg6),
  number = as.numeric(arg7),
  verbose = FALSE
)

# Run recursive feature elimination over the candidate subset sizes.
Profile <- rfe(dx, dy,
               sizes = subsets,
               rfeControl = ctrl1)

# Names of the predictors selected by the profile; the profile goes to arg2.
pred11 <- predictors(Profile)
save(Profile, file = arg2)

# Rebuild the data set from RAWDATA (the copy taken right after loading),
# restricted to the selected predictors.
# drop = FALSE keeps a data frame when exactly one predictor was selected;
# without it the `$outcome` assignment and the column indexing below fail.
rawData <- RAWDATA[, pred11, drop = FALSE]
rawData$outcome <- RAWDATA$outcome
dataX <- rawData[, pred11, drop = FALSE]
dataY <- rawData[["outcome"]]

# The reduced predictor/outcome pair goes to arg3, then both are removed
# from the workspace.
save(dataX, dataY, file = arg3)
rm(dataX, dataY)