def __template4Rnw():

    template4Rnw = r'''%% Regression Modeling Script
%% Max Kuhn (max.kuhn@pfizer.com, mxkuhn@gmail.com)
%% Version: 1.00
%% Created on: 2010/10/02
%%
%% Lynn Group
%% Version: 2.00
%% Created on: 2014/11/15

%% This is an Sweave template for building and describing
%% regression models. It mixes R and LaTeX code. The document can
%% be processed using R's Sweave function to produce a .tex file.
%%
%% The inputs are:
%% - the initial data set in a data frame called 'rawData'
%% - a numeric column in the data set called 'outcome'. This should be the
%%   outcome variable
%% - all other columns in rawData should be predictor variables
%% - the type of model should be in a variable called 'modName'.
%%
%% The script attempts to make some intelligent choices based on the
%% model being used. For example, if modName is "pls", the script will
%% automatically center and scale the predictor data. There are
%% situations where these choices can (and should) be changed.
%%
%% There are other options that may make sense to change. For example,
%% the user may want to adjust the type of resampling. To find these
%% parts of the script, search on the string 'OPTION'. These parts of
%% the code will document the options.
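%%
%% A minimal sketch (an assumed workflow, not part of the original notes) of
%% how the inputs can be prepared: the startup chunk below loads an .RData
%% file that must provide 'dataX' (a data frame of numeric predictors) and
%% 'dataY' (the numeric outcome). The file name here is hypothetical:
%%
%%   dataX <- myDescriptors                         # numeric predictor columns
%%   dataY <- myResponse                            # numeric outcome vector
%%   save(dataX, dataY, file = "descriptors.RData")
%%
%% Once the placeholders are filled in, the generated .Rnw file is processed
%% with Sweave() and the resulting .tex file compiled with pdflatex.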

\documentclass[12pt]{report}
\usepackage{amsmath}
\usepackage[pdftex]{graphicx}
\usepackage{color}
\usepackage{ctable}
\usepackage{xspace}
\usepackage{fancyvrb}
\usepackage{fancyhdr}
\usepackage{lastpage}
\usepackage{longtable}
\usepackage{algorithm2e}
\usepackage[
  colorlinks=true,
  linkcolor=blue,
  citecolor=blue,
  urlcolor=blue]
  {hyperref}
\usepackage{lscape}

\usepackage{Sweave}
\SweaveOpts{keep.source = TRUE}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% define new colors for use
\definecolor{darkgreen}{rgb}{0,0.6,0}
\definecolor{darkred}{rgb}{0.6,0.0,0}
\definecolor{lightbrown}{rgb}{1,0.9,0.8}
\definecolor{brown}{rgb}{0.6,0.3,0.3}
\definecolor{darkblue}{rgb}{0,0,0.8}
\definecolor{darkmagenta}{rgb}{0.5,0,0.5}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newcommand{\bld}[1]{\mbox{\boldmath $#1$}}
\newcommand{\shell}[1]{\mbox{$#1$}}
\renewcommand{\vec}[1]{\mbox{\bf {#1}}}

\newcommand{\ReallySmallSpacing}{\renewcommand{\baselinestretch}{.6}\Large\normalsize}
\newcommand{\SmallSpacing}{\renewcommand{\baselinestretch}{1.1}\Large\normalsize}

\newcommand{\halfs}{\frac{1}{2}}

\setlength{\oddsidemargin}{-.25 truein}
\setlength{\evensidemargin}{0truein}
\setlength{\topmargin}{-0.2truein}
\setlength{\textwidth}{7 truein}
\setlength{\textheight}{8.5 truein}
\setlength{\parindent}{0.20truein}
\setlength{\parskip}{0.10truein}

\setcounter{LTchunksize}{50}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\pagestyle{fancy}
\lhead{}
%% OPTION Report header name
\chead{Regression Model Script}
\rhead{}
\lfoot{}
\cfoot{}
\rfoot{\thepage\ of \pageref{LastPage}}
\renewcommand{\headrulewidth}{1pt}
\renewcommand{\footrulewidth}{1pt}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% OPTION Report title and modeler name
\title{Regression Model Script using $METHOD}
\author{M. Kuhn and Lynn Group, SCIS, JNU, New Delhi}

\begin{document}

\maketitle

\thispagestyle{empty}

<<startup, eval = TRUE, results = hide, echo = FALSE>>=
library(Hmisc)
library(caret)
versionTest <- compareVersion(packageDescription("caret")$Version,
                              "4.65")
if(versionTest < 0) stop("caret version 4.65 or later is required")

library(RColorBrewer)


listString <- function (x, period = FALSE, verbose = FALSE)
{
  if (verbose) cat("\n entering listString\n")
  flush.console()
  if (!is.character(x))
    x <- as.character(x)
  numElements <- length(x)
  out <- if (length(x) > 0) {
    switch(min(numElements, 3), x, paste(x, collapse = " and "),
           {
             x <- paste(x, c(rep(",", numElements - 2), " and", ""), sep = "")
             paste(x, collapse = " ")
           })
  }
  else ""
  if (period) out <- paste(out, ".", sep = "")
  if (verbose) cat(" leaving listString\n\n")
  flush.console()
  out
}

resampleStats <- function(x, digits = 3)
{
  bestPerf <- x$bestTune
  colnames(bestPerf) <- gsub("^\\.", "", colnames(bestPerf))
  out <- merge(x$results, bestPerf)
  out <- out[, colnames(out) %in% x$perfNames]
  names(out) <- gsub("ROC", "area under the ROC curve", names(out), fixed = TRUE)
  names(out) <- gsub("Sens", "sensitivity", names(out), fixed = TRUE)
  names(out) <- gsub("Spec", "specificity", names(out), fixed = TRUE)
  names(out) <- gsub("Accuracy", "overall accuracy", names(out), fixed = TRUE)
  names(out) <- gsub("Kappa", "Kappa statistics", names(out), fixed = TRUE)
  names(out) <- gsub("RMSE", "root mean squared error", names(out), fixed = TRUE)
  names(out) <- gsub("Rsquared", "$R^2$", names(out), fixed = TRUE)

  out <- format(out, digits = digits)
  listString(paste(names(out), "was", out))
}

latticeBubble <- function(x, y, z, offset = .5, splits = 10,
                          pal = colorRampPalette(brewer.pal(9,"YlOrRd")[-(1:2)]),
                          ...)
{
  cexValues <- rank(z)/length(z) + offset
  splits <- unique(quantile(z, probs = seq(0, 1, length = splits)))
  splitup <- cut(z, breaks = splits, include.lowest = TRUE)
  cols <- pal(length(levels(splitup)))
  colValues <- cols[as.numeric(splitup)]
  if(is.data.frame(x))
  {
    out <- splom(~x, col = colValues, cex = cexValues, ...)

  } else out <- xyplot(y~x, col = colValues, cex = cexValues, ...)
  out

}


##OPTION: model name: see ?train for more values/models
modName <- "$METHOD"
load("$RDATA")
rawData <- dataX
rawData$$outcome <- dataY



@


\section*{Data Sets}\label{S:data}

%% OPTION: provide some background on the problem, the experimental
%% data, how the compounds were selected etc


<<getDataInfo, eval = $GETDATAINFOEVAL, echo = $GETDATAINFOECHO, results = $GETDATAINFORESULT>>=
if(!any(names(rawData) == "outcome")) stop("a variable called outcome should be in the data set")
if(!is.numeric(rawData$outcome)) stop("the outcome should be a numeric vector")

numSamples <- nrow(rawData)
numPredictors <- ncol(rawData) - 1
predictorNames <- names(rawData)[names(rawData) != "outcome"]

isNum <- apply(rawData[, predictorNames, drop = FALSE], 2, is.numeric)
if(any(!isNum)) stop("all predictors in rawData should be numeric")

@


<<missingFilter, eval = $MISSINGFILTEREVAL, echo = $MISSINGFILTERECHO, results = $MISSINGFILTERRESULT>>=

colRate <- apply(rawData[, predictorNames, drop = FALSE],
                 2, function(x) mean(is.na(x)))

##OPTION thresholds can be changed
colExclude <- colRate > $MISSINGFILTERTHRESHC

missingText <- ""

if(any(colExclude))
{
  missingText <- paste(missingText,
                       ifelse(sum(colExclude) > 1,
                              " There were ",
                              " There was "),
                       sum(colExclude),
                       ifelse(sum(colExclude) > 1,
                              " predictors ",
                              " predictor "),
                       "with an excessive amount of ",
                       "missing data. ",
                       ifelse(sum(colExclude) > 1,
                              " These were excluded. ",
                              " This was excluded. "))
  predictorNames <- predictorNames[!colExclude]
  rawData <- rawData[, names(rawData) %in% c("outcome", predictorNames), drop = FALSE]
}


rowRate <- apply(rawData[, predictorNames, drop = FALSE],
                 1, function(x) mean(is.na(x)))

rowExclude <- rowRate > $MISSINGFILTERTHRESHR


if(any(rowExclude))
{
  missingText <- paste(missingText,
                       ifelse(sum(rowExclude) > 1,
                              " There were ",
                              " There was "),
                       sum(rowExclude),
                       ifelse(sum(rowExclude) > 1,
                              " samples ",
                              " sample "),
                       "with an excessive amount of ",
                       "missing data. ",
                       ifelse(sum(rowExclude) > 1,
                              " These were excluded. ",
                              " This was excluded. "),
                       "After filtering, ",
                       sum(!rowExclude),
                       " samples remained.")
  rawData <- rawData[!rowExclude, ]
  hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
                      1, function(x) mean(is.na(x)))
} else {
  hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
                      1, function(x) any(is.na(x)))
  missingText <- paste(missingText,
                       ifelse(missingText == "",
                              "There ",
                              "Subsequently, there "),
                       ifelse(sum(hasMissing) == 1,
                              "was ",
                              "were "),
                       ifelse(sum(hasMissing) > 0,
                              sum(hasMissing),
                              "no"),
                       ifelse(sum(hasMissing) == 1,
                              "sample ",
                              "samples "),
                       "with missing values.")
  rawData <- rawData[complete.cases(rawData),]
}

dataDist <- summary(rawData$outcome)
dataSD <- sd(rawData$outcome, na.rm = TRUE)
dataText <- paste("The average outcome value was ",
                  dataDist["Mean"],
                  " with a standard deviation of ",
                  dataSD, ". The minimum and maximum values were ",
                  dataDist["Min."], " and ", dataDist["Max."],
                  ", respectively. Figure \\ref{F:dens} shows a ",
                  " density plot (i.e. a smooth histogram) of the response.",
                  sep = "")

rawData1 <- rawData[, 1:(length(rawData) - 1)]
rawData2 <- rawData[, length(rawData)]

set.seed(222)
nzv1 <- nearZeroVar(rawData1)
if(length(nzv1) > 0)
{
  nzvVars1 <- names(rawData1)[nzv1]
  rawData <- rawData1[, -nzv1]
  rawData$outcome <- rawData2
  nzvText1 <- paste("There were ",
                    length(nzv1),
                    " predictors that were removed from the original data due to",
                    " severely unbalanced distributions that",
                    " could negatively affect the model fit",
                    ifelse(length(nzv1) > 10,
                           ".",
                           paste(": ",
                                 listString(nzvVars1),
                                 ".",
                                 sep = "")),
                    sep = "")

} else {
  rawData <- rawData1
  rawData$outcome <- rawData2
  nzvText1 <- ""

}

remove("rawData1")
remove("rawData2")

@

The initial data set consisted of \Sexpr{numSamples} samples and
\Sexpr{numPredictors} predictor variables. \Sexpr{dataText} \Sexpr{missingText}
\Sexpr{nzvText1}

\setkeys{Gin}{width = 0.8\textwidth}
\begin{figure}[b]
\begin{center}

<<densityplot, echo = FALSE, results = hide, fig = TRUE, width = 8, height = 4.5>>=
trellis.par.set(caretTheme(), warn = TRUE)
print(densityplot(~rawData$outcome, pch = "|", adjust = 1.25, xlab = ""))
@
\caption[Data Density]{A density plot of the response. The marks
along the $x$--axis show the locations of the data points.}
\label{F:dens}
\end{center}
\end{figure}


<<pca, eval = $PCAEVAL, echo = $PCAECHO, results = $PCARESULT>>=
predictorNames <- names(rawData)[names(rawData) != "outcome"]
numPredictors <- length(predictorNames)

predictors <- rawData[, predictorNames, drop = FALSE]
## PCA will fail with predictors having fewer than 2 unique values
isZeroVar <- apply(predictors, 2,
                   function(x) length(unique(x)) < 2)
if(any(isZeroVar)) predictors <- predictors[, !isZeroVar, drop = FALSE]
## For whatever reason, only the formula interface to prcomp
## handles missing values
pcaForm <- as.formula(
  paste("~",
        paste(names(predictors), collapse = "+")))
pca <- prcomp(pcaForm,
              data = predictors,
              center = TRUE,
              scale. = TRUE,
              na.action = na.omit)
## OPTION: the number of components plotted/discussed can be set
numPCAcomp <- $PCACOMP
pctVar <- pca$sdev^2/sum(pca$sdev^2)*100
pcaText <- paste(round(pctVar[1:numPCAcomp], 1),
                 "\\%",
                 sep = "")
pcaText <- listString(pcaText)
@

To get an initial assessment of how the response varies across the predictor space,
principal component analysis (PCA) was used to distill the
\Sexpr{numPredictors} predictors down into \Sexpr{numPCAcomp}
surrogate variables (i.e. the principal components) in a manner that
attempts to maximize the amount of information preserved from the
original predictor set. Figure \ref{F:inititalPCA} contains plots of
the first \Sexpr{numPCAcomp} components, which accounted for
\Sexpr{pcaText} of the variability in the original predictors,
respectively.

%% OPTION: remark on how well (or poorly) the outcome tracks the components

\setkeys{Gin}{width = 0.8\textwidth}
\begin{figure}[p]
\begin{center}

<<pcaPlot, eval = $PCAPLOTEVAL, echo = $PCAPLOTECHO, results = $PCAPLOTRESULT, fig = $PCAPLOTFIG, width = 8, height = 8>>=
trellis.par.set(caretTheme(), warn = TRUE)
if(numPCAcomp == 2)
{
  axisRange <- extendrange(pca$x[, 1:2])
  print(
    latticeBubble(x = as.data.frame(pca$x)$PC1,
                  y = as.data.frame(pca$x)$PC2,
                  z = rawData$outcome,
                  type = c("p", "g"),
                  xlab = "PC1", ylab = "PC2",
                  xlim = axisRange,
                  ylim = axisRange))

} else {
  axisRange <- extendrange(pca$x[, 1:numPCAcomp])
  print(
    latticeBubble(x = as.data.frame(pca$x)[, 1:numPCAcomp],

                  z = rawData$outcome,
                  type = c("p", "g"),
                  xlab = "PC1", ylab = "PC2",
                  xlim = axisRange,
                  ylim = axisRange))


}
@
\caption[PCA Plot]{A plot of the first \Sexpr{numPCAcomp}
principal components for the original data set. Smaller, lighter
points indicate smaller values of the response while darker,
larger points correspond to larger values of the outcome.}
\label{F:inititalPCA}
\end{center}
\end{figure}


<<initialDataSplit, eval = $INITIALDATASPLITEVAL, echo = $INITIALDATASPLITECHO, results = $INITIALDATASPLITRESULT>>=

## OPTION: with small sample sizes, you may not want to set aside a
## test set and instead focus on the resampling results.
numSamples <- nrow(rawData)

predictorNames <- names(rawData)[names(rawData) != "outcome"]
numPredictors <- length(predictorNames)

# pctTrain <- .15
pctTrain <- $PERCENT

if(pctTrain < 1)
{
  ## OPTION: seed number can be changed
  set.seed(1)
  inTrain <- createDataPartition(rawData$outcome,
                                 p = pctTrain,
                                 list = FALSE)
  trainX <- rawData[ inTrain, predictorNames]
  testX <- rawData[-inTrain, predictorNames]
  trainY <- rawData[ inTrain, "outcome"]
  testY <- rawData[-inTrain, "outcome"]
  splitText <- paste("The original data were split into ",
                     "a training set ($n$=",
                     nrow(trainX),
                     ") and a test set ($n$=",
                     nrow(testX),
                     ") in a manner that preserved the ",
                     "distribution of the response.",
                     sep = "")
  isZeroVar <- apply(trainX, 2,
                     function(x) length(unique(x)) < 2)
  if(any(isZeroVar))
  {
    trainX <- trainX[, !isZeroVar, drop = FALSE]
    testX <- testX[, !isZeroVar, drop = FALSE]
  }

} else {
  trainX <- rawData[, predictorNames]
  testX <- NULL
  trainY <- rawData[, "outcome"]
  testY <- NULL
  splitText <- "The entire data set was used as the training set."
}

remove("rawData")

@

\Sexpr{splitText}
The data set for model building consisted of \Sexpr{numSamples} samples and
\Sexpr{numPredictors} predictor variables.



<<nzv, eval = $NZVEVAL, results = $NZVRESULT, echo = $NZVECHO>>=
## OPTION: other pre-processing steps can be used
ppSteps <- caret:::suggestions(modName)

set.seed(2)
if(ppSteps["nzv"])
{
  nzv <- nearZeroVar(trainX)
  if(length(nzv) > 0)
  {
    nzvVars <- names(trainX)[nzv]
    trainX <- trainX[, -nzv]
    nzvText <- paste("There were ",
                     length(nzv),
                     " predictors that were removed due to",
                     " severely unbalanced distributions that",
                     " could negatively affect the model fit",
                     ifelse(length(nzv) > 10,
                            ".",
                            paste(": ",
                                  listString(nzvVars),
                                  ".",
                                  sep = "")),
                     sep = "")
    testX <- testX[, -nzv]
  } else nzvText <- ""
} else nzvText <- ""
@

\Sexpr{nzvText}

<<corrFilter, eval = $CORRFILTEREVAL, results = $CORRFILTERRESULT, echo = $CORRFILTERECHO>>=

if(ppSteps["corr"])
{
  ## OPTION:
  ##corrThresh <- .75
  corrThresh <- $THRESHHOLDCOR
  highCorr <- findCorrelation(cor(trainX, use = "pairwise.complete.obs"),
                              corrThresh)
  if(length(highCorr) > 0)
  {
    corrVars <- names(trainX)[highCorr]
    trainX <- trainX[, -highCorr]
    corrText <- paste("There were ",
                      length(highCorr),
                      " predictors that were removed due to",
                      " large between--predictor correlations that",
                      " could negatively affect the model fit",
                      ifelse(length(highCorr) > 10,
                             ".",
                             paste(": ",
                                   listString(corrVars),
                                   ".",
                                   sep = "")),
                      " Removing these predictors forced",
                      " all pair--wise correlations to be",
                      " less than ",
                      corrThresh,
                      ".",
                      sep = "")
    testX <- testX[, -highCorr]
  } else corrText <- ""
} else corrText <- ""
@

\Sexpr{corrText}

<<preProc, eval = $PREPROCEVAL, echo = $PREPROCECHO, results = $PREPROCRESULT>>=

ppMethods <- NULL
if(ppSteps["center"]) ppMethods <- c(ppMethods, "center")
if(ppSteps["scale"]) ppMethods <- c(ppMethods, "scale")
if(any(hasMissing > 0)) ppMethods <- c(ppMethods, "knnImpute")
##OPTION other methods, such as spatial sign, can be added to this list

if(length(ppMethods) > 0)
{
  ppInfo <- preProcess(trainX, method = ppMethods)
  trainX <- predict(ppInfo, trainX)
  if(pctTrain < 1) testX <- predict(ppInfo, testX)
  ppText <- paste("The following pre--processing methods were",
                  " applied to the training",
                  ifelse(pctTrain < 1, " and test", ""),
                  " data: ",
                  listString(ppMethods),
                  ".",
                  sep = "")
  ppText <- gsub("center", "mean centering", ppText)
  ppText <- gsub("scale", "scaling to unit variance", ppText)
  ppText <- gsub("knnImpute",
                 paste(ppInfo$k, "--nearest neighbor imputation", sep = ""),
                 ppText)
  ppText <- gsub("spatialSign", "the spatial sign transformation", ppText)
  ppText <- gsub("pca", "principal component feature extraction", ppText)
  ppText <- gsub("ica", "independent component feature extraction", ppText)
} else {
  ppInfo <- NULL
  ppText <- ""
}

predictorNames <- names(trainX)
if(nzvText != "" | corrText != "" | ppText != "")
{
  varText <- paste("After pre--processing,",
                   ncol(trainX),
                   "predictors remained for modeling.")
} else varText <- ""

@

\Sexpr{ppText} \Sexpr{varText}

\clearpage
\section*{Model Building}

<<setupWorkers, eval = TRUE, echo = $SETUPWORKERSECHO, results = $SETUPWORKERSRESULT>>=

numWorkers <- $NUMWORKERS
##OPTION: turn up numWorkers to use MPI
if(numWorkers > 1)
{
  mpiCalcs <- function(X, FUN, ...)
  {
    theDots <- list(...)
    parLapply(theDots$cl, X, FUN)
  }

  library(snow)
  cl <- makeCluster(numWorkers, "MPI")
}
@

<<setupResampling, echo = $SETUPRESAMPLINGECHO, results = $SETUPRESAMPLINGRESULT>>=
##<<setupResampling, echo = FALSE, results = hide>>=
##OPTION: the resampling options can be changed. See
## ?trainControl for details
resampName <- "repeatedcv"
resampNumber <- $RESAMPLENUMBER
numRepeat <- 3
resampP <- $RESAMPLENUMBERPERCENT

modelInfo <- modelLookup(modName)

set.seed(3)
ctlObj <- trainControl(method = resampName,
                       number = resampNumber,
                       repeats = numRepeat,
                       p = resampP)


##OPTION select other performance metrics as needed
optMetric <- "RMSE"

if(numWorkers > 1)
{
  ctlObj$workers <- numWorkers
  ctlObj$computeFunction <- mpiCalcs
  ctlObj$computeArgs <- list(cl = cl)
}
@

<<setupGrid, results = $SETUPGRIDRESULT, echo = $SETUPGRIDECHO>>=

##OPTION expand or contract these grids as needed (or
## add more models)

gridSize <- $SETUPGRIDSIZE

if(modName %in% c("svmPoly", "svmRadial", "svmLinear", "ctree2", "ctree")) gridSize <- 5
if(modName %in% c("earth")) gridSize <- 7
if(modName %in% c("knn", "glmboost", "rf", "nodeHarvest")) gridSize <- 10

if(modName %in% c("rpart")) gridSize <- 15
if(modName %in% c("pls", "lars2", "lars")) gridSize <- min(20, ncol(trainX))

if(modName == "gbm")
{
  tGrid <- expand.grid(.interaction.depth = -1 + (1:5)*2,
                       .n.trees = (1:10)*20,
                       .shrinkage = .1)
}

if(modName == "nnet")
{
  tGrid <- expand.grid(.size = -1 + (1:5)*2,
                       .decay = c(0, .001, .01, .1))
}

@


<<fitModel, results = $FITMODELRESULT, echo = $FITMODELECHO, eval = $FITMODELEVAL>>=

##OPTION alter as needed

set.seed(4)
modelFit <- switch(modName,
                   gbm =
                   {
                     mix <- sample(seq(along = trainY))
                     train(
                       trainX[mix,], trainY[mix], modName,
                       verbose = FALSE,
                       bag.fraction = .9,
                       metric = optMetric,
                       trControl = ctlObj,
                       tuneGrid = tGrid)
                   },

                   nnet =
                   {
                     train(
                       trainX, trainY, modName,
                       metric = optMetric,
                       linout = TRUE,
                       trace = FALSE,
                       maxit = 1000,
                       MaxNWts = 5000,
                       trControl = ctlObj,
                       tuneGrid = tGrid)

                   },

                   svmRadial =, svmPoly =, svmLinear =
                   {
                     train(
                       trainX, trainY, modName,
                       metric = optMetric,
                       scaled = TRUE,
                       trControl = ctlObj,
                       tuneLength = gridSize)
                   },
                   {
                     train(trainX, trainY, modName,
                           trControl = ctlObj,
                           metric = optMetric,
                           tuneLength = gridSize)
                   })

@

<<modelDescr, echo = $MODELDESCRECHO, results = $MODELDESCRRESULT>>=


summaryText <- ""

resampleName <- switch(tolower(modelFit$control$method),
                       boot = paste("the bootstrap (", length(modelFit$control$index), " reps)", sep = ""),
                       boot632 = paste("the bootstrap 632 rule (", length(modelFit$control$index), " reps)", sep = ""),
                       cv = paste("cross-validation (", modelFit$control$number, " fold)", sep = ""),
                       repeatedcv = paste("cross-validation (", modelFit$control$number, " fold, repeated ",
                                          modelFit$control$repeats, " times)", sep = ""),
                       lgocv = paste("repeated train/test splits (", length(modelFit$control$index), " reps, ",
                                     round(modelFit$control$p, 2), "$\\%$)", sep = ""))

tuneVars <- latexTranslate(tolower(modelInfo$label))
tuneVars <- gsub("\\#", "the number of ", tuneVars, fixed = TRUE)
if(ncol(modelFit$bestTune) == 1 && colnames(modelFit$bestTune) == ".parameter")
{
  summaryText <- paste(summaryText,
                       "\n\n",
                       "There are no tuning parameters associated with this model.",
                       "To characterize the model performance on the training set,",
                       resampleName,
                       "was used.",
                       "Table \\ref{T:resamps} and Figure \\ref{F:profile}",
                       "show summaries of the resampling results. ")

} else {
  summaryText <- paste("There",
                       ifelse(nrow(modelInfo) > 1, "are", "is"),
                       nrow(modelInfo),
                       ifelse(nrow(modelInfo) > 1, "tuning parameters", "tuning parameter"),
                       "associated with this model:",
                       listString(tuneVars, period = TRUE))



  paramNames <- gsub(".", "", names(modelFit$$bestTune), fixed = TRUE)
  ##for(i in seq(along = paramNames))
  ##  {
  ##    check <- modelInfo$parameter %in% paramNames[i]
  ##    if(any(check))
  ##      {
  ##        paramNames[i] <- modelInfo$label[which(check)]
  ##      }
  ##  }

  paramNames <- gsub("#", "the number of ", paramNames, fixed = TRUE)
  ## Check to see if there was only one combination fit
  summaryText <- paste(summaryText,
                       "To choose",
                       ifelse(nrow(modelInfo) > 1,
                              "appropriate values of the tuning parameters,",
                              "an appropriate value of the tuning parameter,"),
                       resampleName,
                       "was used to generate a profile of performance across the",
                       nrow(modelFit$results),
                       ifelse(nrow(modelInfo) > 1,
                              "combinations of the tuning parameters.",
                              "candidate values."),

                       "Table \\ref{T:resamps} and Figure \\ref{F:profile} show",
                       "summaries of the resampling profile. ", "The final model fitted to the entire training set was:",
                       listString(paste(latexTranslate(tolower(paramNames)), "=", modelFit$bestTune[1,]), period = TRUE))

}
@

\Sexpr{summaryText}

<<resampTable, echo = $RESAMPTABLEECHO, results = $RESAMPTABLERESULT>>=

tableData <- modelFit$results

if(all(modelInfo$parameter == "parameter"))
{
  tableData <- tableData[, -1, drop = FALSE]
  colNums <- c(length(modelFit$perfNames), length(modelFit$perfNames))
  colLabels <- c("Mean", "Standard Deviation")
  constString <- ""
  isConst <- NULL
} else {

  isConst <- apply(tableData[, modelInfo$parameter, drop = FALSE],
                   2,
                   function(x) length(unique(x)) == 1)

  numParamInTable <- sum(!isConst)

  if(any(isConst))
  {
    constParam <- modelInfo$parameter[isConst]
    constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1, , drop = FALSE]
    tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
    constString <- paste("The tuning",
                         ifelse(sum(isConst) > 1,
                                "parameters",
                                "parameter"),
                         listString(paste("``", names(constValues), "''", sep = "")),
                         ifelse(sum(isConst) > 1,
                                "were",
                                "was"),
                         "held constant at",
                         ifelse(sum(isConst) > 1,
                                "values of",
                                "a value of"),
                         listString(constValues[1,]))

  } else constString <- ""

  cn <- colnames(tableData)
  for(i in seq(along = cn))
  {
    check <- modelInfo$parameter %in% cn[i]
    if(any(check))
    {
      cn[i] <- modelInfo$label[which(check)]
    }
  }
  colnames(tableData) <- cn

  colNums <- c(numParamInTable,
               length(modelFit$perfNames),
               length(modelFit$perfNames))
  colLabels <- c("", "Mean", "Standard Deviation")
}

colnames(tableData) <- gsub("SD$", "", colnames(tableData))
colnames(tableData) <- latexTranslate(colnames(tableData))
rownames(tableData) <- latexTranslate(rownames(tableData))

latex(tableData,
      rowname = NULL,
      file = "",
      cgroup = colLabels,
      n.cgroup = colNums,
      where = "h!",
      digits = 4,
      longtable = nrow(tableData) > 30,
      caption = paste(resampleName, "results from the model fit.", constString),
      label = "T:resamps")
@

\setkeys{Gin}{width = 0.9\textwidth}
\begin{figure}[b]
\begin{center}

<<profilePlot, echo = $PROFILEPLOTECHO, fig = $PROFILEPLOTFIG, width = 8, height = 6>>=

trellis.par.set(caretTheme(), warn = TRUE)
if(all(modelInfo$parameter == "parameter") | all(isConst) | modName == "nb")
{
  resultsPlot <- resampleHist(modelFit)
  plotCaption <- paste("Distributions of model performance from the ",
                       "training set estimated using ",
                       resampleName)
} else {
  if(modName %in% c("svmPoly", "svmRadial", "svmLinear"))
  {
    resultsPlot <- plot(modelFit,
                        metric = optMetric,
                        xTrans = function(x) log10(x))
    resultsPlot <- update(resultsPlot,
                          type = c("g", "p", "l"),
                          ylab = paste(optMetric, " (", resampleName, ")", sep = ""))

  } else {
    resultsPlot <- plot(modelFit,
                        metric = optMetric)
    resultsPlot <- update(resultsPlot,
                          type = c("g", "p", "l"),
                          ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
  }
  plotCaption <- paste("A plot of the estimates of the",
                       optMetric,
                       "values calculated using",
                       resampleName)
}
print(resultsPlot)
@
\caption[Performance Plot]{\Sexpr{plotCaption}.}
\label{F:profile}
\end{center}
\end{figure}

<<stopWorkers, echo = $STOPWORKERSECHO, results = $STOPWORKERSRESULT>>=
##<<stopWorkers, echo = FALSE, results = hide>>=
if(numWorkers > 1) stopCluster(cl)
@

<<testPred, results = $TESTPREDRESULT, echo = $TESTPREDECHO>>=


if(pctTrain < 1)
{
  cat("\\clearpage\n\\section*{Test Set Results}\n\n")

  testPreds <- extractPrediction(list(fit = modelFit),
                                 testX = testX, testY = testY)
  testPreds_all <- testPreds
  testPreds <- subset(testPreds, dataType == "Test")
  testPreds_test <- testPreds
  values <- modelFit$control$summaryFunction(testPreds)
  names(values) <- gsub("RMSE", "root mean squared error", names(values), fixed = TRUE)
  names(values) <- gsub("Rsquared", "$R^2$", names(values), fixed = TRUE)
  values <- format(values, digits = 3)

  testString <- paste("Based on the test set of",
                      nrow(testX),
                      "samples,",
                      listString(paste(names(values), "was", values), period = TRUE),
                      " A plot of the observed and predicted outcomes for the test set ",
                      "is given in Figure \\ref{F:obsPred}.")
  testString <- paste(testString,
                      " Using ", resampleName,
                      ", the training set estimates were ",
                      resampleStats(modelFit),
                      ".",
                      sep = "")

  axisRange <- extendrange(testPreds[, c("obs", "pred")])
  obsPred <- xyplot(obs ~ pred,
                    data = testPreds,
                    xlim = axisRange,
                    ylim = axisRange,
                    panel = function(x, y)
                    {
                      panel.abline(0, 1, col = "darkgrey", lty = 2)
                      panel.xyplot(x, y, type = c("p", "g"))
                      panel.loess(x, y, col = "darkred", lwd = 2)


                    },
                    ylab = "Observed Response",
                    xlab = "Predicted Response")

  pdf("obsPred.pdf", height = 8, width = 8)
  trellis.par.set(caretTheme())
  print(obsPred)
  dev.off()

} else testString <- ""
@
\Sexpr{testString}


<<classProbsTex, results = $CLASSPROBSTEXRESULT, echo = $CLASSPROBSTEXECHO>>=


if(pctTrain < 1)
{
  cat(
    paste("\\begin{figure}[p]\n",
          "\\begin{center}\n",
          "\\includegraphics{obsPred}",
          "\\caption[Observed vs. Fitted Values]{",
          "The observed and predicted responses. ",
          "The grey line is the line of identity while the",
          "solid red line is a smoothed trend line.}\n",
          "\\label{F:obsPred}\n",
          "\\end{center}\n",
          "\\end{figure}"))
}

@

\section*{Versions}

<<versions, echo = FALSE, results = tex>>=
toLatex(sessionInfo())

@

<<save-data, echo = $SAVEDATAECHO, results = $SAVEDATARESULT>>=
## the fitted model is stored under the generic name 'Fit'; rename to reflect modName if desired
Fit <- modelFit
save(Fit, testPreds_all, testPreds_test, file = "$METHOD-Fit.RData")
@
The model was built using $METHOD and is saved as $METHOD-Fit.RData for reuse. This file contains the fitted model object Fit along with the test set predictions testPreds\_all and testPreds\_test.
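
%% A brief illustrative sketch (assumed usage, kept as a comment) of how the
%% saved objects could be reloaded in a later R session; note that any new
%% samples would need the same pre--processing applied before calling
%% predict() on the stored model:
%%
%%   load("$METHOD-Fit.RData")   # restores Fit, testPreds_all and testPreds_test
%%   print(Fit)                  # resampling summary of the stored train object
%%   head(testPreds_test)        # observed and predicted values for the test set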


\end{document}'''
    return template4Rnw
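

# A minimal usage sketch (not part of the original module). The $NAME tokens in
# the template, together with the "$$" escapes used above (e.g. "$$outcome",
# "$$bestTune"), match Python's string.Template syntax, so the template is
# presumably filled that way; the model code, file names and option values
# below are purely illustrative.
def _exampleFillTemplate(method="pls", rdata="descriptors.RData"):
    """Return the Rnw text with a few of the placeholders filled in."""
    from string import Template

    values = {
        "METHOD": method,                 # any model code understood by caret::train
        "RDATA": rdata,                   # .RData file that provides dataX and dataY
        "PERCENT": "0.8",                 # fraction of samples used for training
        "RESAMPLENUMBER": "10",           # folds for repeated cross-validation
        "RESAMPLENUMBERPERCENT": "0.75",  # p passed to trainControl()
        "NUMWORKERS": "1",                # > 1 switches on the MPI/snow code path
        "SETUPGRIDSIZE": "10",            # default tuning grid size
        "THRESHHOLDCOR": "0.90",          # pair-wise correlation filter threshold
        "MISSINGFILTERTHRESHC": "0.2",    # column-wise missing data threshold
        "MISSINGFILTERTHRESHR": "0.2",    # row-wise missing data threshold
        "PCACOMP": "2",                   # number of principal components to show
    }
    # safe_substitute() leaves the remaining $CHUNKOPTION tokens (and R's own
    # "$" accesses) untouched; they would be filled in the same way before the
    # generated file is run through Sweave.
    return Template(__template4Rnw()).safe_substitute(values)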