def __template4Rnw():

    template4Rnw = r'''%% Classification Modeling Script
%% Max Kuhn (max.kuhn@pfizer.com, mxkuhn@gmail.com)
%% Version: 1.00
%% Created on: 2010/10/02
%%
%% This is an Sweave template for building and describing
%% classification models. It mixes R and LaTeX code. The document can
%% be processed using R's Sweave function to produce a tex file.
%%
%% The inputs are:
%% - the initial data set in a data frame called 'rawData'
%% - a factor column in the data set called 'outcome'. This should be the
%%   outcome variable
%% - all other columns in rawData should be predictor variables
%% - the type of model should be in a variable called 'modName'.
%%
%% The script attempts to make some intelligent choices based on the
%% model being used. For example, if modName is "pls", the script will
%% automatically center and scale the predictor data. There are
%% situations where these choices can (and should) be changed.
%%
%% There are other options that may make sense to change. For example,
%% the user may want to adjust the type of resampling. To find these
%% parts of the script, search on the string 'OPTION'. These parts of
%% the code will document the options.
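%%
%% For example (hypothetical, not created by this script): the companion
%% driver code is expected to save the predictors and the outcome factor
%% as 'dataX' and 'dataY' in the RData file named by the RDATA
%% placeholder, which the startup chunk below loads, e.g.
%%     dataX <- subset(myData, select = -outcome)
%%     dataY <- myData$$outcome
%%     save(dataX, dataY, file = "myData.RData")
%% 'myData' and the file name are illustrative only.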

\documentclass[14pt]{report}
\usepackage{amsmath}
\usepackage[pdftex]{graphicx}
\usepackage{color}
\usepackage{ctable}
\usepackage{xspace}
\usepackage{fancyvrb}
\usepackage{fancyhdr}
\usepackage{lastpage}
\usepackage{longtable}
\usepackage{algorithm2e}
\usepackage[
  colorlinks=true,
  linkcolor=blue,
  citecolor=blue,
  urlcolor=blue]
  {hyperref}
\usepackage{lscape}
\usepackage{Sweave}
\SweaveOpts{keep.source = TRUE}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% define new colors for use
\definecolor{darkgreen}{rgb}{0,0.6,0}
\definecolor{darkred}{rgb}{0.6,0.0,0}
\definecolor{lightbrown}{rgb}{1,0.9,0.8}
\definecolor{brown}{rgb}{0.6,0.3,0.3}
\definecolor{darkblue}{rgb}{0,0,0.8}
\definecolor{darkmagenta}{rgb}{0.5,0,0.5}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newcommand{\bld}[1]{\mbox{\boldmath $$#1$$}}
\newcommand{\shell}[1]{\mbox{$$#1$$}}
\renewcommand{\vec}[1]{\mbox{\bf {#1}}}

\newcommand{\ReallySmallSpacing}{\renewcommand{\baselinestretch}{.6}\Large\normalsize}
\newcommand{\SmallSpacing}{\renewcommand{\baselinestretch}{1.1}\Large\normalsize}

\newcommand{\halfs}{\frac{1}{2}}

\setlength{\oddsidemargin}{-.25 truein}
\setlength{\evensidemargin}{0truein}
\setlength{\topmargin}{-0.2truein}
\setlength{\textwidth}{7 truein}
\setlength{\textheight}{8.5 truein}
\setlength{\parindent}{0.20truein}
\setlength{\parskip}{0.10truein}

\setcounter{LTchunksize}{50}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\pagestyle{fancy}
\lhead{}
%% OPTION Report header name
\chead{Classification Model Script}
\rhead{}
\lfoot{}
\cfoot{}
\rfoot{\thepage\ of \pageref{LastPage}}
\renewcommand{\headrulewidth}{1pt}
\renewcommand{\footrulewidth}{1pt}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% OPTION Report title and modeler name
\title{Classification Model Script using $METHOD}
\author{Lynn Group with M. Kuhn, SCIS, JNU, New Delhi}

\begin{document}

\maketitle

\thispagestyle{empty}
<<dummy, eval=TRUE, echo=FALSE, results=hide>>=
# Set default values for variables used later in the document so that
# \Sexpr references do not fail when Sweave parses the template.
numSamples=''
classDistString=''
missingText=''
numPredictors=''
numPCAcomp=''
pcaText=''
nzvText=''
corrText=''
ppText=''
varText=''
splitText="Dummy Text"
nirText="Dummy Text"
# pctTrain is initialised in the data splitting chunk and reused later in testPred
pctTrain=0.8
Smpling=''
nzvText1=''
classDistString1=''
dwnsmpl=''
upsmpl=''

@
<<startup, eval = TRUE, results = hide, echo = FALSE>>=
library(Hmisc)
library(caret)
library(pROC)
versionTest <- compareVersion(packageDescription("caret")$$Version,
                              "4.65")
if(versionTest < 0) stop("caret version 4.65 or later is required")

library(RColorBrewer)


listString <- function (x, period = FALSE, verbose = FALSE)
{
  if (verbose) cat("\n entering listString\n")
  flush.console()
  if (!is.character(x))
    x <- as.character(x)
  numElements <- length(x)
  out <- if (length(x) > 0) {
    switch(min(numElements, 3), x, paste(x, collapse = " and "),
           {
             x <- paste(x, c(rep(",", numElements - 2), " and", ""), sep = "")
             paste(x, collapse = " ")
           })
  }
  else ""
  if (period) out <- paste(out, ".", sep = "")
  if (verbose) cat(" leaving listString\n\n")
  flush.console()
  out
}
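
## listString() turns a character vector into prose, e.g. (illustrative):
##   listString("a")                  returns "a"
##   listString(c("a", "b"))          returns "a and b"
##   listString(c("a", "b", "c"))     returns "a, b and c"
##   listString(c("a", "b"), period = TRUE) appends a trailing period.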

resampleStats <- function(x, digits = 3)
{
  bestPerf <- x$$bestTune
  colnames(bestPerf) <- gsub("^\\.", "", colnames(bestPerf))
  out <- merge(x$$results, bestPerf)
  out <- out[, colnames(out) %in% x$$perfNames]
  names(out) <- gsub("ROC", "area under the ROC curve", names(out), fixed = TRUE)
  names(out) <- gsub("Sens", "sensitivity", names(out), fixed = TRUE)
  names(out) <- gsub("Spec", "specificity", names(out), fixed = TRUE)
  names(out) <- gsub("Accuracy", "overall accuracy", names(out), fixed = TRUE)
  names(out) <- gsub("Kappa", "the Kappa statistic", names(out), fixed = TRUE)

  out <- format(out, digits = digits)
  listString(paste(names(out), "was", out))
}
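
## resampleStats() is used later to report the resampled performance of the
## final tuning parameters as prose, e.g. text of the form "area under the
## ROC curve was 0.87, sensitivity was 0.80 and specificity was 0.79"
## (the values shown here are illustrative only).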

twoClassNoProbs <- function (data, lev = NULL, model = NULL)
{
  out <- c(sensitivity(data[, "pred"], data[, "obs"], lev[1]),
           specificity(data[, "pred"], data[, "obs"], lev[2]),
           confusionMatrix(data[, "pred"], data[, "obs"])$$overall["Kappa"])

  names(out) <- c("Sens", "Spec", "Kappa")
  out
}
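
## twoClassNoProbs() mirrors caret's twoClassSummary() for two-class models
## that cannot produce class probabilities: it reports sensitivity,
## specificity and the Kappa statistic instead of ROC-based summaries.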



##OPTION: model name: see ?train for more values/models
modName <- "$METHOD"


load("$RDATA")
rawData <- dataX
rawData$$outcome <- dataY

@


\section*{Data Sets}\label{S:data}

%% OPTION: provide some background on the problem, the experimental
%% data, how the compounds were selected etc

<<getDataInfo, eval = $GETDATAINFOEVAL, echo = $GETDATAINFOECHO, results = $GETDATAINFORESULT>>=
if(!any(names(rawData) == "outcome")) stop("a variable called outcome should be in the data set")
if(!is.factor(rawData$$outcome)) stop("the outcome should be a factor vector")

## OPTION: when there are only two classes, the first level of the
## factor is used as the "positive" or "event" for calculating
## sensitivity and specificity. Adjust the outcome factor accordingly.
numClasses <- length(levels(rawData$$outcome))
numSamples <- nrow(rawData)
numPredictors <- ncol(rawData) - 1
predictorNames <- names(rawData)[names(rawData) != "outcome"]

isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric)
if(any(!isNum)) stop("all predictors in rawData should be numeric")

classTextCheck <- all.equal(levels(rawData$$outcome), make.names(levels(rawData$$outcome)))
if(!isTRUE(classTextCheck)) warning("the class levels are not valid R variable names; this may cause errors")

## Get the class distribution
classDist <- table(rawData$$outcome)
classDistString <- paste("``",
                         names(classDist),
                         "'' ($$n$$=",
                         classDist,
                         ")",
                         sep = "")
classDistString <- listString(classDistString)
@

<<missingFilter, eval = $MISSINGFILTEREVAL, echo = $MISSINGFILTERECHO, results = $MISSINGFILTERRESULT>>=
colRate <- apply(rawData[, predictorNames, drop = FALSE],
                 2, function(x) mean(is.na(x)))

##OPTION thresholds can be changed
colExclude <- colRate > $MISSINGFILTERTHRESHC

missingText <- ""

if(any(colExclude))
  {
    missingText <- paste(missingText,
                         ifelse(sum(colExclude) > 1,
                                " There were ",
                                " There was "),
                         sum(colExclude),
                         ifelse(sum(colExclude) > 1,
                                " predictors ",
                                " predictor "),
                         "with an excessive amount of ",
                         "missing data. ",
                         ifelse(sum(colExclude) > 1,
                                " These were excluded. ",
                                " This was excluded. "))
    predictorNames <- predictorNames[!colExclude]
    rawData <- rawData[, names(rawData) %in% c("outcome", predictorNames), drop = FALSE]
  }


rowRate <- apply(rawData[, predictorNames, drop = FALSE],
                 1, function(x) mean(is.na(x)))

rowExclude <- rowRate > $MISSINGFILTERTHRESHR


if(any(rowExclude)) {
  missingText <- paste(missingText,
                       ifelse(sum(rowExclude) > 1,
                              " There were ",
                              " There was "),
                       sum(rowExclude),
                       ifelse(sum(rowExclude) > 1,
                              " samples ",
                              " sample "),
                       "with an excessive amount of ",
                       "missing data. ",
                       ifelse(sum(rowExclude) > 1,
                              " These were excluded. ",
                              " This was excluded. "),
                       "After filtering, ",
                       sum(!rowExclude),
                       " samples remained.")
  rawData <- rawData[!rowExclude, ]
  hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
                      1, function(x) mean(is.na(x)))
} else {
  hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
                      1, function(x) any(is.na(x)))
  missingText <- paste(missingText,
                       ifelse(missingText == "",
                              "There ",
                              "Subsequently, there "),
                       ifelse(sum(hasMissing) == 1,
                              "was ",
                              "were "),
                       ifelse(sum(hasMissing) > 0,
                              sum(hasMissing),
                              "no"),
                       ifelse(sum(hasMissing) == 1,
                              "sample ",
                              "samples "),
                       "with missing values.")

  rawData <- rawData[complete.cases(rawData),]

}

rawData1 <- rawData[, 1:(length(rawData) - 1)]
rawData2 <- rawData[, length(rawData)]

set.seed(222)
nzv1 <- nearZeroVar(rawData1)
if(length(nzv1) > 0)
  {
    nzvVars1 <- names(rawData1)[nzv1]
    rawData <- rawData1[, -nzv1]
    rawData$$outcome <- rawData2
    nzvText1 <- paste("There were ",
                      length(nzv1),
                      " predictors that were removed from the original data due to",
                      " severely unbalanced distributions that",
                      " could negatively affect the model fit",
                      ifelse(length(nzv1) > 10,
                             ".",
                             paste(": ",
                                   listString(nzvVars1),
                                   ".",
                                   sep = "")),
                      sep = "")

  } else {
    rawData <- rawData1
    rawData$$outcome <- rawData2
    nzvText1 <- ""

  }

remove("rawData1")
remove("rawData2")

@

The initial data set consisted of \Sexpr{numSamples} samples and
\Sexpr{numPredictors} predictor variables. The breakdown of the
outcome data classes was: \Sexpr{classDistString}.

\Sexpr{missingText}

\Sexpr{nzvText1}

<<pca, eval = $PCAEVAL, echo = $PCAECHO, results = $PCARESULT>>=

predictorNames <- names(rawData)[names(rawData) != "outcome"]
numPredictors <- length(predictorNames)
predictors <- rawData[, predictorNames, drop = FALSE]
## PCA will fail with predictors having less than 2 unique values
isZeroVar <- apply(predictors, 2,
                   function(x) length(unique(x)) < 2)
if(any(isZeroVar)) predictors <- predictors[, !isZeroVar, drop = FALSE]
## For whatever reason, only the formula interface to prcomp
## handles missing values
pcaForm <- as.formula(
  paste("~",
        paste(names(predictors), collapse = "+")))
pca <- prcomp(pcaForm,
              data = predictors,
              center = TRUE,
              scale. = TRUE,
              na.action = na.omit)
## OPTION: the number of components plotted/discussed can be set
numPCAcomp <- $PCACOMP
pctVar <- pca$$sdev^2/sum(pca$$sdev^2)*100
pcaText <- paste(round(pctVar[1:numPCAcomp], 1),
                 "\\\\%",
                 sep = "")
pcaText <- listString(pcaText)
@

To get an initial assessment of the separability of the classes,
principal component analysis (PCA) was used to distill the
\Sexpr{numPredictors} predictors down into \Sexpr{numPCAcomp}
surrogate variables (i.e. the principal components) in a manner that
attempts to maximize the amount of information preserved from the
original predictor set. Figure \ref{F:initialPCA} contains plots of
the first \Sexpr{numPCAcomp} components, which accounted for
\Sexpr{pcaText} of the variability in the original predictors
(respectively).


%% OPTION: remark on how well (or poorly) the data separated

\setkeys{Gin}{width = 0.8\textwidth}
\begin{figure}[p]
\begin{center}

<<pcaPlot, eval = $PCAPLOTEVAL, echo = $PCAPLOTECHO, results = $PCAPLOTRESULT, fig = $PCAPLOTFIG, width = 8, height = 8>>=
trellis.par.set(caretTheme(), warn = TRUE)
if(numPCAcomp == 2)
  {
    axisRange <- extendrange(pca$$x[, 1:2])
    print(
      xyplot(PC1 ~ PC2,
             data = as.data.frame(pca$$x),
             type = c("p", "g"),
             groups = rawData$$outcome,
             auto.key = list(columns = 2),
             xlim = axisRange,
             ylim = axisRange))
  } else {
    axisRange <- extendrange(pca$$x[, 1:numPCAcomp])
    print(
      splom(~as.data.frame(pca$$x)[, 1:numPCAcomp],
            type = c("p", "g"),
            groups = rawData$$outcome,
            auto.key = list(columns = 2),
            as.table = TRUE,
            prepanel.limits = function(x) axisRange
            ))

  }

@

\caption[PCA Plot]{A plot of the first \Sexpr{numPCAcomp}
  principal components for the original data set.}
\label{F:initialPCA}
\end{center}
\end{figure}



<<initialDataSplit, eval = $INITIALDATASPLITEVAL, echo = $INITIALDATASPLITECHO, results = $INITIALDATASPLITRESULT>>=

## OPTION: in small sample sizes, you may not want to set aside a
## training set and focus on the resampling results.

set.seed(1234)
dataX <- rawData[, 1:(length(rawData) - 1)]
dataY <- rawData[, length(rawData)]

Smpling <- "$SAAMPLING"

if(Smpling == "downsampling")
  {
    dwnsmpl <- downSample(dataX, dataY)
    rawData <- dwnsmpl[, 1:(length(dwnsmpl) - 1)]
    rawData$$outcome <- dwnsmpl[, length(dwnsmpl)]
    remove("dwnsmpl")
    remove("dataX")
    remove("dataY")
  } else if(Smpling == "upsampling") {
    upsmpl <- upSample(dataX, dataY)
    rawData <- upsmpl[, 1:(length(upsmpl) - 1)]
    rawData$$outcome <- upsmpl[, length(upsmpl)]
    remove("upsmpl")
    remove("dataX")
    remove("dataY")
  } else {
    remove("dataX")
    remove("dataY")
  }



numSamples <- nrow(rawData)

predictorNames <- names(rawData)[names(rawData) != "outcome"]
numPredictors <- length(predictorNames)


classDist1 <- table(rawData$$outcome)
classDistString1 <- paste("``",
                          names(classDist1),
                          "'' ($$n$$=",
                          classDist1,
                          ")",
                          sep = "")
classDistString1 <- listString(classDistString1)

pctTrain <- $PERCENT

if(pctTrain < 1)
  {
    ## OPTION: seed number can be changed
    set.seed(1)
    inTrain <- createDataPartition(rawData$$outcome,
                                   p = pctTrain,
                                   list = FALSE)
    trainX <- rawData[ inTrain, predictorNames]
    testX <- rawData[-inTrain, predictorNames]
    trainY <- rawData[ inTrain, "outcome"]
    testY <- rawData[-inTrain, "outcome"]
    splitText <- paste("The original data were split into ",
                       "a training set ($$n$$=",
                       nrow(trainX),
                       ") and a test set ($$n$$=",
                       nrow(testX),
                       ") in a manner that preserved the ",
                       "distribution of the classes.",
                       sep = "")
    isZeroVar <- apply(trainX, 2,
                       function(x) length(unique(x)) < 2)
    if(any(isZeroVar))
      {
        trainX <- trainX[, !isZeroVar, drop = FALSE]
        testX <- testX[, !isZeroVar, drop = FALSE]
      }

  } else {
    trainX <- rawData[, predictorNames]
    testX <- NULL
    trainY <- rawData[, "outcome"]
    testY <- NULL
    splitText <- "The entire data set was used as the training set."
  }
trainDist <- table(trainY)
nir <- max(trainDist)/length(trainY)*100
niClass <- names(trainDist)[which.max(trainDist)]
nirText <- paste("The no--information rate is the accuracy that can be ",
                 "achieved by predicting all samples using the most ",
                 "dominant class. For these data, the rate is ",
                 round(nir, 2), "\\\\% using the ``",
                 niClass,
                 "'' class.",
                 sep = "")

remove("rawData")

if((!is.null(testX)) && (!is.null(testY))){
  #save(trainX,trainY,testX,testY,file="datasets.RData")
} else {
  save(trainX,trainY,file="datasets.RData")
}

@

\Sexpr{splitText}

\Sexpr{nirText}

The data set for model building consisted of \Sexpr{numSamples} samples and
\Sexpr{numPredictors} predictor variables. The breakdown of the
outcome data classes was: \Sexpr{classDistString1}.

<<nzv, eval = $NZVEVAL, results = $NZVRESULT, echo = $NZVECHO>>=
## OPTION: other pre-processing steps can be used
ppSteps <- caret:::suggestions(modName)

set.seed(2)
if(ppSteps["nzv"])
  {
    nzv <- nearZeroVar(trainX)
    if(length(nzv) > 0)
      {
        nzvVars <- names(trainX)[nzv]
        trainX <- trainX[, -nzv]
        nzvText <- paste("There were ",
                         length(nzv),
                         " predictors that were removed from the training set due to",
                         " severely unbalanced distributions that",
                         " could negatively affect the model",
                         ifelse(length(nzv) > 10,
                                ".",
                                paste(": ",
                                      listString(nzvVars),
                                      ".",
                                      sep = "")),
                         sep = "")
        testX <- testX[, -nzv]
      } else nzvText <- ""
  } else nzvText <- ""
@

\Sexpr{nzvText}


<<corrFilter, eval = $CORRFILTEREVAL, results = $CORRFILTERRESULT, echo = $CORRFILTERECHO>>=
if(ppSteps["corr"])
  {
    ## OPTION:
    corrThresh <- $THRESHHOLDCOR
    highCorr <- findCorrelation(cor(trainX, use = "pairwise.complete.obs"),
                                corrThresh)
    if(length(highCorr) > 0)
      {
        corrVars <- names(trainX)[highCorr]
        trainX <- trainX[, -highCorr]
        corrText <- paste("There were ",
                          length(highCorr),
                          " predictors that were removed due to",
                          " large between--predictor correlations that",
                          " could negatively affect the model fit",
                          ifelse(length(highCorr) > 10,
                                 ".",
                                 paste(": ",
                                       listString(corrVars),
                                       ".",
                                       sep = "")),
                          " Removing these predictors forced",
                          " all pair--wise correlations to be",
                          " less than ",
                          corrThresh,
                          ".",
                          sep = "")
        testX <- testX[, -highCorr]
      } else corrText <- "No between--predictor correlations exceeded the threshold, so no predictors were removed."
  } else corrText <- ""
@

\Sexpr{corrText}

<<preProc, eval = $PREPROCEVAL, echo = $PREPROCECHO, results = $PREPROCRESULT>>=
ppMethods <- NULL
if(ppSteps["center"]) ppMethods <- c(ppMethods, "center")
if(ppSteps["scale"]) ppMethods <- c(ppMethods, "scale")
if(any(hasMissing) > 0) ppMethods <- c(ppMethods, "knnImpute")
##OPTION other methods, such as spatial sign, can be added to this list

if(length(ppMethods) > 0)
  {
    ppInfo <- preProcess(trainX, method = ppMethods)
    trainX <- predict(ppInfo, trainX)
    if(pctTrain < 1) testX <- predict(ppInfo, testX)
    ppText <- paste("The following pre--processing methods were",
                    " applied to the training",
                    ifelse(pctTrain < 1, " and test", ""),
                    " data: ",
                    listString(ppMethods),
                    ".",
                    sep = "")
    ppText <- gsub("center", "mean centering", ppText)
    ppText <- gsub("scale", "scaling to unit variance", ppText)
    ppText <- gsub("knnImpute",
                   paste(ppInfo$$k, "--nearest neighbor imputation", sep = ""),
                   ppText)
    ppText <- gsub("spatialSign", "the spatial sign transformation", ppText)
    ppText <- gsub("pca", "principal component feature extraction", ppText)
    ppText <- gsub("ica", "independent component feature extraction", ppText)
  } else {
    ppInfo <- NULL
    ppText <- ""
  }

predictorNames <- names(trainX)
if(nzvText != "" | corrText != "" | ppText != "")
  {
    varText <- paste("After pre--processing,",
                     ncol(trainX),
                     "predictors remained for modeling.")
  } else varText <- ""

@

\Sexpr{ppText}
\Sexpr{varText}

\clearpage

\section*{Model Building}

<<setupWorkers, eval = TRUE, echo = $SETUPWORKERSECHO, results = $SETUPWORKERSRESULT>>=
numWorkers <- $NUMWORKERS
##OPTION: turn up numWorkers to use MPI
if(numWorkers > 1)
  {
    mpiCalcs <- function(X, FUN, ...)
      {
        theDots <- list(...)
        parLapply(theDots$$cl, X, FUN)
      }

    library(snow)
    cl <- makeCluster(numWorkers, "MPI")
  }
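
## NOTE: creating a snow cluster of type "MPI" also requires the Rmpi
## package and a working MPI installation; adjust numWorkers accordingly.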
@

<<setupResampling, echo = $SETUPRESAMPLINGECHO, results = $SETUPRESAMPLINGRESULT>>=
##OPTION: the resampling options can be changed. See
## ?trainControl for details

resampName <- "$RESAMPNAME"
resampNumber <- $RESAMPLENUMBER
numRepeat <- $NUMREPEAT
resampP <- $RESAMPLENUMBERPERCENT
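
## For example (illustrative values only), 5 repeats of 10-fold
## cross-validation would correspond to:
##   resampName <- "repeatedcv"; resampNumber <- 10; numRepeat <- 5
## resampP (the training proportion) is only used for method = "lgocv".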

modelInfo <- modelLookup(modName)

if(numClasses == 2)
  {
    foo <- if(any(modelInfo$$probModel)) twoClassSummary else twoClassNoProbs
  } else foo <- defaultSummary

set.seed(3)
ctlObj <- trainControl(method = resampName,
                       number = resampNumber,
                       repeats = numRepeat,
                       p = resampP,
                       classProbs = any(modelInfo$$probModel),
                       summaryFunction = foo)


##OPTION select other performance metrics as needed
optMetric <- if(numClasses == 2 & any(modelInfo$$probModel)) "ROC" else "Kappa"

if(numWorkers > 1)
  {
    ctlObj$$workers <- numWorkers
    ctlObj$$computeFunction <- mpiCalcs
    ctlObj$$computeArgs <- list(cl = cl)
  }
@

<<setupGrid, results = $SETUPGRIDRESULT, echo = $SETUPGRIDECHO>>=
##OPTION expand or contract these grids as needed (or
## add more models)

gridSize <- $SETUPGRIDSIZE

if(modName %in% c("svmPoly", "svmRadial", "svmLinear", "lvq", "ctree2", "ctree")) gridSize <- 5
if(modName %in% c("earth", "fda")) gridSize <- 7
if(modName %in% c("knn", "rocc", "glmboost", "rf", "nodeHarvest")) gridSize <- 10

if(modName %in% c("nb")) gridSize <- 2
if(modName %in% c("pam", "rpart")) gridSize <- 15
if(modName %in% c("pls")) gridSize <- min(20, ncol(trainX))

if(modName == "gbm")
  {
    tGrid <- expand.grid(.interaction.depth = -1 + (1:5)*2,
                         .n.trees = (1:10)*20,
                         .shrinkage = .1)
  }

if(modName == "nnet")
  {
    tGrid <- expand.grid(.size = -1 + (1:5)*2,
                         .decay = c(0, .001, .01, .1))
  }

if(modName == "ada")
  {
    tGrid <- expand.grid(.maxdepth = 1, .iter = c(100,200,300,400), .nu = 1)

  }


@

<<fitModel, results = $FITMODELRESULT, echo = $FITMODELECHO, eval = $FITMODELEVAL>>=
##OPTION alter as needed

set.seed(4)
modelFit <- switch(modName,
                   gbm =
                   {
                     mix <- sample(seq(along = trainY))
                     train(
                       trainX[mix,], trainY[mix], modName,
                       verbose = FALSE,
                       bag.fraction = .9,
                       metric = optMetric,
                       trControl = ctlObj,
                       tuneGrid = tGrid)
                   },

                   multinom =
                   {
                     train(
                       trainX, trainY, modName,
                       trace = FALSE,
                       metric = optMetric,
                       maxit = 1000,
                       MaxNWts = 5000,
                       trControl = ctlObj,
                       tuneLength = gridSize)
                   },

                   nnet =
                   {
                     train(
                       trainX, trainY, modName,
                       metric = optMetric,
                       linout = FALSE,
                       trace = FALSE,
                       maxit = 1000,
                       MaxNWts = 5000,
                       trControl = ctlObj,
                       tuneGrid = tGrid)

                   },

                   svmRadial =, svmPoly =, svmLinear =
                   {
                     train(
                       trainX, trainY, modName,
                       metric = optMetric,
                       scaled = TRUE,
                       trControl = ctlObj,
                       tuneLength = gridSize)
                   },
                   {
                     train(trainX, trainY, modName,
                           trControl = ctlObj,
                           metric = optMetric,
                           tuneLength = gridSize)
                   })

@

<<modelDescr, echo = $MODELDESCRECHO, results = $MODELDESCRRESULT>>=
summaryText <- ""

resampleName <- switch(tolower(modelFit$$control$$method),
                       boot = paste("the bootstrap (", length(modelFit$$control$$index), " reps)", sep = ""),
                       boot632 = paste("the bootstrap 632 rule (", length(modelFit$$control$$index), " reps)", sep = ""),
                       cv = paste("cross-validation (", modelFit$$control$$number, " fold)", sep = ""),
                       repeatedcv = paste("cross-validation (", modelFit$$control$$number, " fold, repeated ",
                                          modelFit$$control$$repeats, " times)", sep = ""),
                       lgocv = paste("repeated train/test splits (", length(modelFit$$control$$index), " reps, ",
                                     round(modelFit$$control$$p, 2), "$$\\%$$)", sep = ""))

tuneVars <- latexTranslate(tolower(modelInfo$$label))
tuneVars <- gsub("\\#", "the number of ", tuneVars, fixed = TRUE)
if(ncol(modelFit$$bestTune) == 1 && colnames(modelFit$$bestTune) == ".parameter")
  {
    summaryText <- paste(summaryText,
                         "\n\n",
                         "There are no tuning parameters associated with this model.",
                         "To characterize the model performance on the training set,",
                         resampleName,
                         "was used.",
                         "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile}",
                         "show summaries of the resampling results. ")

  } else {
    summaryText <- paste("There",
                         ifelse(nrow(modelInfo) > 1, "are", "is"),
                         nrow(modelInfo),
                         ifelse(nrow(modelInfo) > 1, "tuning parameters", "tuning parameter"),
                         "associated with this model:",
                         listString(tuneVars, period = TRUE))



    paramNames <- gsub(".", "", names(modelFit$$bestTune), fixed = TRUE)
    ## (i in seq(along = paramNames))
    ## {
    ##   check <- modelInfo$$parameter %in% paramNames[i]
    ##   if(any(check))
    ##   {
    ##     paramNames[i] <- modelInfo$$label[which(check)]
    ##   }
    ## }

    paramNames <- gsub("#", "the number of ", paramNames, fixed = TRUE)
    ## Check to see if there was only one combination fit
    summaryText <- paste(summaryText,
                         "To choose",
                         ifelse(nrow(modelInfo) > 1,
                                "appropriate values of the tuning parameters,",
                                "an appropriate value of the tuning parameter,"),
                         resampleName,
                         "was used to generate a profile of performance across the",
                         nrow(modelFit$$results),
                         ifelse(nrow(modelInfo) > 1,
                                "combinations of the tuning parameters.",
                                "candidate values."),

                         "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile} show",
                         "summaries of the resampling profile. ", "The final model fitted to the entire training set was:",
                         listString(paste(latexTranslate(tolower(paramNames)), "=", modelFit$$bestTune[1,]), period = TRUE))

  }
@

\Sexpr{summaryText}

<<resampTable, echo = $RESAMPTABLEECHO, results = $RESAMPTABLERESULT>>=
tableData <- modelFit$$results

if(all(modelInfo$$parameter == "parameter") && resampName == "boot632")
  {
    tableData <- tableData[,-1, drop = FALSE]
    colNums <- c(length(modelFit$$perfNames), length(modelFit$$perfNames), length(modelFit$$perfNames))
    colLabels <- c("Mean", "Standard Deviation", "Apparent")
    constString <- ""
    isConst <- NULL
  } else if (all(modelInfo$$parameter == "parameter") && (resampName == "boot" | resampName == "cv" | resampName == "repeatedcv")){
    tableData <- tableData[,-1, drop = FALSE]
    colNums <- c(length(modelFit$$perfNames), length(modelFit$$perfNames))
    colLabels <- c("Mean", "Standard Deviation")
    constString <- ""
    isConst <- NULL
  } else if (all(modelInfo$$parameter == "parameter") && resampName == "LOOCV"){
    tableData <- tableData[,-1, drop = FALSE]
    colNums <- length(modelFit$$perfNames)
    colLabels <- c("Measures")
    constString <- ""
    isConst <- NULL
  } else {
    if (all(modelInfo$$parameter != "parameter") && resampName == "boot632"){
      isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE],
                       2,
                       function(x) length(unique(x)) == 1)

      numParamInTable <- sum(!isConst)

      if(any(isConst))
        {
          constParam <- modelInfo$$parameter[isConst]
          constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
          tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
          constString <- paste("The tuning",
                               ifelse(sum(isConst) > 1,
                                      "parameters",
                                      "parameter"),
                               listString(paste("``", names(constValues), "''", sep = "")),
                               ifelse(sum(isConst) > 1,
                                      "were",
                                      "was"),
                               "held constant at",
                               ifelse(sum(isConst) > 1,
                                      "values of",
                                      "a value of"),
                               listString(constValues[1,]))

        } else constString <- ""

      cn <- colnames(tableData)
      ## for(i in seq(along = cn))
      ## {
      ##   check <- modelInfo$$parameter %in% cn[i]
      ##   if(any(check))
      ##   {
      ##     cn[i] <- modelInfo$$label[which(check)]
      ##   }
      ## }
      ## colnames(tableData) <- cn

      colNums <- c(numParamInTable,
                   length(modelFit$$perfNames),
                   length(modelFit$$perfNames),
                   length(modelFit$$perfNames))
      colLabels <- c("", "Mean", "Standard Deviation", "Apparent")

    } else if (all(modelInfo$$parameter != "parameter") && (resampName == "boot" | resampName == "repeatedcv" | resampName == "cv")){
      isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE],
                       2,
                       function(x) length(unique(x)) == 1)

      numParamInTable <- sum(!isConst)

      if(any(isConst))
        {
          constParam <- modelInfo$$parameter[isConst]
          constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
          tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
          constString <- paste("The tuning",
                               ifelse(sum(isConst) > 1,
                                      "parameters",
                                      "parameter"),
                               listString(paste("``", names(constValues), "''", sep = "")),
                               ifelse(sum(isConst) > 1,
                                      "were",
                                      "was"),
                               "held constant at",
                               ifelse(sum(isConst) > 1,
                                      "values of",
                                      "a value of"),
                               listString(constValues[1,]))

        } else constString <- ""

      cn <- colnames(tableData)
      ## for(i in seq(along = cn))
      ## {
      ##   check <- modelInfo$$parameter %in% cn[i]
      ##   if(any(check))
      ##   {
      ##     cn[i] <- modelInfo$$label[which(check)]
      ##   }
      ## }
      ## colnames(tableData) <- cn

      colNums <- c(numParamInTable,
                   length(modelFit$$perfNames),
                   length(modelFit$$perfNames))
      colLabels <- c("", "Mean", "Standard Deviation")

    } else if (all(modelInfo$$parameter != "parameter") && resampName == "LOOCV"){
      isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE],
                       2,
                       function(x) length(unique(x)) == 1)

      numParamInTable <- sum(!isConst)

      if(any(isConst))
        {
          constParam <- modelInfo$$parameter[isConst]
          constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
          tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
          constString <- paste("The tuning",
                               ifelse(sum(isConst) > 1,
                                      "parameters",
                                      "parameter"),
                               listString(paste("``", names(constValues), "''", sep = "")),
                               ifelse(sum(isConst) > 1,
                                      "were",
                                      "was"),
                               "held constant at",
                               ifelse(sum(isConst) > 1,
                                      "values of",
                                      "a value of"),
                               listString(constValues[1,]))

        } else constString <- ""

      cn <- colnames(tableData)
      ## for(i in seq(along = cn))
      ## {
      ##   check <- modelInfo$$parameter %in% cn[i]
      ##   if(any(check))
      ##   {
      ##     cn[i] <- modelInfo$$label[which(check)]
      ##   }
      ## }
      ## colnames(tableData) <- cn

      colNums <- c(numParamInTable,
                   length(modelFit$$perfNames))
      colLabels <- c("", "Measures")

    }

  }



colnames(tableData) <- gsub("SD$$", "", colnames(tableData))
colnames(tableData) <- gsub("Apparent$$", "", colnames(tableData))
colnames(tableData) <- latexTranslate(colnames(tableData))
rownames(tableData) <- latexTranslate(rownames(tableData))

latex(tableData,
      rowname = NULL,
      file = "",
      cgroup = colLabels,
      n.cgroup = colNums,
      where = "h!",
      digits = 4,
      longtable = nrow(tableData) > 30,
      caption = paste(resampleName, "results from the model fit.", constString),
      label = "T:resamps")
@

\setkeys{Gin}{ width = 0.9\textwidth}
\begin{figure}[b]
\begin{center}

<<profilePlot, echo = $PROFILEPLOTECHO, fig = $PROFILEPLOTFIG, width = 8, height = 6>>=
trellis.par.set(caretTheme(), warn = TRUE)
if(all(modelInfo$$parameter == "parameter") | all(isConst) | modName == "nb")
  {
    resultsPlot <- resampleHist(modelFit)
    plotCaption <- paste("Distributions of model performance from the ",
                         "training set estimated using ",
                         resampleName)
  } else {
    if(modName %in% c("svmPoly", "svmRadial", "svmLinear"))
      {
        resultsPlot <- plot(modelFit,
                            metric = optMetric,
                            xTrans = function(x) log10(x))
        resultsPlot <- update(resultsPlot,
                              type = c("g", "p", "l"),
                              ylab = paste(optMetric, " (", resampleName, ")", sep = ""))

      } else {
        resultsPlot <- plot(modelFit,
                            metric = optMetric)
        resultsPlot <- update(resultsPlot,
                              type = c("g", "p", "l"),
                              ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
      }
    plotCaption <- paste("A plot of the estimates of the",
                         optMetric,
                         "values calculated using",
                         resampleName)
  }
print(resultsPlot)
@
\caption[Performance Plot]{\Sexpr{plotCaption}.}
\label{F:profile}
\end{center}
\end{figure}


<<stopWorkers, echo = $STOPWORKERSECHO, results = $STOPWORKERSRESULT>>=
if(numWorkers > 1) stopCluster(cl)
@

<<testPred, results = $TESTPREDRESULT, echo = $TESTPREDECHO>>=
if((!is.null(testX)) && (!is.null(testY))){
  save(trainX,trainY,testX,testY,file="datasets.RData")
} else {
  save(trainX,trainY,file="datasets.RData")
}

if(pctTrain < 1)
  {
    cat("\\clearpage\n\\section*{Test Set Results}\n\n")
    classPred <- predict(modelFit, testX)
    cm <- confusionMatrix(classPred, testY)
    values <- cm$$overall[c("Accuracy", "Kappa", "AccuracyPValue", "McnemarPValue")]

    values <- values[!is.na(values) & !is.nan(values)]
    values <- c(format(values[1:2], digits = 3),
                format.pval(values[-(1:2)], digits = 5))
    nms <- c("the overall accuracy", "the Kappa statistic",
             "the $$p$$--value that accuracy is greater than the no--information rate",
             "the $$p$$--value of concordance from McNemar's test")
    nms <- nms[seq(along = values)]
    names(values) <- nms

    if(any(modelInfo$$probModel))
      {
        classProbs <- extractProb(list(fit = modelFit),
                                  testX = testX,
                                  testY = testY)
        classProbs <- subset(classProbs, dataType == "Test")
        if(numClasses == 2)
          {
            tmp <- twoClassSummary(classProbs, lev = levels(classProbs$$obs))
            tmp <- c(format(tmp, digits = 3))
            names(tmp) <- c("the area under the ROC curve", "the sensitivity", "the specificity")

            values <- c(values, tmp)

          }
        probPlot <- plotClassProbs(classProbs)
      }
    testString <- paste("Based on the test set of",
                        nrow(testX),
                        "samples,",
                        listString(paste(names(values), "was", values), period = TRUE),
                        "The confusion matrix for the test set is shown in Table",
                        "\\\\ref{T:cm}.")
    testString <- paste(testString,
                        " Using ", resampleName,
                        ", the training set estimates were ",
                        resampleStats(modelFit),
                        ".",
                        sep = "")

    if(any(modelInfo$$probModel)) testString <- paste(testString,
                                                      "Histograms of the class probabilities",
                                                      "for the test set samples are shown in",
                                                      "Figure \\\\ref{F:probs}",
                                                      ifelse(numClasses == 2,
                                                             " and the test set ROC curve is in Figure \\\\ref{F:roc}.",
                                                             "."))



    latex(cm$$table,
          title = "",
          file = "",
          where = "h",
          cgroup = "Observed Values",
          n.cgroup = numClasses,
          caption = "The confusion matrix for the test set",
          label = "T:cm")

  } else testString <- ""
@
\Sexpr{testString}


<<classProbsTex, results = $CLASSPROBSTEXRESULT, echo = $CLASSPROBSTEXECHO>>=
if(any(modelInfo$$probModel) && pctTrain < 1) {
  cat(
    paste("\\begin{figure}[p]\n",
          "\\begin{center}\n",
          "\\includegraphics{classProbs}",
          "\\caption[Class Probabilities]{Class probabilities",
          "for the test set. Each panel contains a ",
          "separate class}\n",
          "\\label{F:probs}\n",
          "\\end{center}\n",
          "\\end{figure}"))
}
if(any(modelInfo$$probModel) & numClasses == 2 & pctTrain < 1)
  {
    cat(
      paste("\\begin{figure}[p]\n",
            "\\begin{center}\n",
            "\\includegraphics[clip, width = .8\\textwidth]{roc}",
            "\\caption[ROC Plot]{ROC Curve",
            "for the test set.}\n",
            "\\label{F:roc}\n",
            "\\end{center}\n",
            "\\end{figure}"))
  } else {
    cat(paste(""))
  }

@
<<classProbsTex, results = $CLASSPROBSTEXRESULT1, echo = $CLASSPROBSTEXECHO1>>=
if(any(modelInfo$$probModel) && pctTrain < 1) {
  pdf("classProbs.pdf", height = 7, width = 7)
  trellis.par.set(caretTheme(), warn = FALSE)
  print(probPlot)
  dev.off()
}
if(any(modelInfo$$probModel) & numClasses == 2 & pctTrain < 1) {
  resPonse <- testY
  preDictor <- classProbs[, levels(trainY)[1]]
  pdf("roc.pdf", height = 8, width = 8)
  # from the pROC example at http://web.expasy.org/pROC/screenshots.htm
  plot.roc(resPonse, preDictor, # data
           percent=TRUE, # show all values in percent
           partial.auc=c(100, 90), partial.auc.correct=TRUE, # define a partial AUC (pAUC)
           print.auc=TRUE, # display pAUC value on the plot with following options:
           print.auc.pattern="Corrected pAUC (100-90%% SP):\n%.1f%%", print.auc.col="#1c61b6",
           auc.polygon=TRUE, auc.polygon.col="#1c61b6", # show pAUC as a polygon
           max.auc.polygon=TRUE, max.auc.polygon.col="#1c61b622", # also show the 100% polygon
           main="Partial AUC (pAUC)")
  plot.roc(resPonse, preDictor,
           percent=TRUE, add=TRUE, type="n", # add to plot, but don't re-add the ROC itself
           partial.auc=c(100, 90), partial.auc.correct=TRUE,
           partial.auc.focus="se", # focus pAUC on the sensitivity
           print.auc=TRUE, print.auc.pattern="Corrected pAUC (100-90%% SE):\n%.1f%%", print.auc.col="#008600",
           print.auc.y=40, # do not print auc over the previous one
           auc.polygon=TRUE, auc.polygon.col="#008600",
           max.auc.polygon=TRUE, max.auc.polygon.col="#00860022")
  dev.off()
} else {
  cat("")
}

@

\section*{Versions}

<<versions, echo = FALSE, results = tex>>=
toLatex(sessionInfo())

@

<<save-data, echo = $SAVEDATAECHO, results = $SAVEDATARESULT>>=
## change this to the name of modName....
Fit <- modelFit

if(exists('ppInfo') && !is.null(ppInfo)){
  save(Fit, ppInfo, file="$METHOD-Fit.RData")
} else {save(Fit, file="$METHOD-Fit.RData")}
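
## The saved object can later be restored with load("$METHOD-Fit.RData");
## when ppInfo was saved, new samples should first be transformed with
## predict(ppInfo, newdata) and then scored with predict(Fit, newdata).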




@
The model was built using $METHOD and is saved in the file $METHOD-Fit.RData
for reuse. This file contains the variable Fit (and ppInfo when
pre--processing was applied).

\end{document}'''

    return template4Rnw
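

# A minimal sketch (not part of the original module) of how the template above
# might be rendered. The $NAME tokens appear to follow Python's string.Template
# syntax, with "$$" producing a literal "$" in the generated .Rnw file. The
# helper name, file names and placeholder values below are hypothetical
# examples; the real driver code must supply every $PLACEHOLDER used in the
# template.
def __renderTemplate4Rnw(values, path="model.Rnw"):
    from string import Template
    # safe_substitute() leaves unknown placeholders untouched instead of
    # raising KeyError, which makes partial value dictionaries easier to debug.
    rnw = Template(__template4Rnw()).safe_substitute(values)
    with open(path, "w") as handle:
        handle.write(rnw)
    return path

# Hypothetical usage:
#   __renderTemplate4Rnw({"METHOD": "rf", "RDATA": "myData.RData",
#                         "PERCENT": "0.8", "RESAMPNAME": "repeatedcv"})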