comparison caret_future/tool2/TEST.Rnw @ 0:68300206e90d draft default tip

Uploaded
author deepakjadmin
date Thu, 05 Nov 2015 02:41:30 -0500
1 %% Classification Modeling Script
2 %% Max Kuhn (max.kuhn@pfizer.com, mxkuhn@gmail.com)
3 %% Version: 1.00
4 %% Created on: 2010/10/02
5 %%
6 %% This is an Sweave template for building and describing
7 %% classification models. It mixes R and LaTeX code. The document can
8 %% be processed using R's Sweave function to produce a TeX file.
9 %%
10 %% The inputs are:
11 %% - the initial data set in a data frame called 'rawData'
12 %% - a factor column in the data set called 'outcome'; this should be the
13 %% outcome variable
14 %% - all other columns in rawData should be predictor variables
15 %% - the type of model should be in a variable called 'modName'.
16 %%
17 %% The script attempts to make some intelligent choices based on the
18 %% model being used. For example, if modName is "pls", the script will
19 %% automatically center and scale the predictor data. There are
20 %% situations where these choices can (and should) be changed.
21 %%
22 %% There are other options that may make sense to change. For example,
23 %% the user may want to adjust the type of resampling. To find these
24 %% parts of the script, search on the string 'OPTION'. These parts of
25 %% the code will document the options.
26
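%%
%% As an illustration, a minimal (hypothetical) setup that satisfies these
%% input requirements could be run before Sweave'ing this file. The objects
%% 'myDescriptors' and 'myLabels' are placeholders, not part of this script:
%%
%%   rawData <- myDescriptors             # data frame of numeric predictors
%%   rawData$outcome <- factor(myLabels)  # the outcome factor, named 'outcome'
%%   modName <- "pls"                     # any model supported by caret::train
%%   Sweave("TEST.Rnw")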
27 \documentclass[14pt]{report}
28 \usepackage{amsmath}
29 \usepackage[pdftex]{graphicx}
30 \usepackage{color}
31 \usepackage{ctable}
32 \usepackage{xspace}
33 \usepackage{fancyvrb}
34 \usepackage{fancyhdr}
35 \usepackage{lastpage}
36 \usepackage{longtable}
37 \usepackage{algorithm2e}
38 \usepackage[
39 colorlinks=true,
40 linkcolor=blue,
41 citecolor=blue,
42 urlcolor=blue]
43 {hyperref}
44 \usepackage{lscape}
45 \usepackage{Sweave}
46 \SweaveOpts{keep.source = TRUE}
47
48 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
49
50 % define new colors for use
51 \definecolor{darkgreen}{rgb}{0,0.6,0}
52 \definecolor{darkred}{rgb}{0.6,0.0,0}
53 \definecolor{lightbrown}{rgb}{1,0.9,0.8}
54 \definecolor{brown}{rgb}{0.6,0.3,0.3}
55 \definecolor{darkblue}{rgb}{0,0,0.8}
56 \definecolor{darkmagenta}{rgb}{0.5,0,0.5}
57
58 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
59
60 \newcommand{\bld}[1]{\mbox{\boldmath $#1$}}
61 \newcommand{\shell}[1]{\mbox{$#1$}}
62 \renewcommand{\vec}[1]{\mbox{\bf {#1}}}
63
64 \newcommand{\ReallySmallSpacing}{\renewcommand{\baselinestretch}{.6}\Large\normalsize}
65 \newcommand{\SmallSpacing}{\renewcommand{\baselinestretch}{1.1}\Large\normalsize}
66
67 \newcommand{\halfs}{\frac{1}{2}}
68
69 \setlength{\oddsidemargin}{-.25 truein}
70 \setlength{\evensidemargin}{0truein}
71 \setlength{\topmargin}{-0.2truein}
72 \setlength{\textwidth}{7 truein}
73 \setlength{\textheight}{8.5 truein}
74 \setlength{\parindent}{0.20truein}
75 \setlength{\parskip}{0.10truein}
76
77 \setcounter{LTchunksize}{50}
78
79 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
80 \pagestyle{fancy}
81 \lhead{}
82 %% OPTION Report header name
83 \chead{Classification Model Script}
84 \rhead{}
85 \lfoot{}
86 \cfoot{}
87 \rfoot{\thepage\ of \pageref{LastPage}}
88 \renewcommand{\headrulewidth}{1pt}
89 \renewcommand{\footrulewidth}{1pt}
90 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
91
92 %% OPTION Report title and modeler name
93 \title{Classification Model Script using svmRadial}
94 \author{Lynn Group with M. Kuhn, SCIS, JNU, New Delhi}
95
96 \begin{document}
97
98 \maketitle
99
100 \thispagestyle{empty}
101 <<dummy, eval=TRUE, echo=FALSE, results=hide>>=
102 # sets values for variables used later in the program to prevent the \Sexpr error on parsing with Sweave
103 numSamples=''
104 classDistString=''
105 missingText=''
106 numPredictors=''
107 numPCAcomp=''
108 pcaText=''
109 nzvText=''
110 corrText=''
111 ppText=''
112 varText=''
113 splitText="Dummy Text"
114 nirText="Dummy Text"
115 # pctTrain is a variable that is initialised in Data splitting, and reused later in testPred
116 pctTrain=0.8
117 Smpling=''
118 nzvText1=''
119 classDistString1=''
120 dwnsmpl=''
121 upsmpl=''
122
123 @
124 <<startup, eval= TRUE, results = hide, echo = FALSE>>=
125 library(Hmisc)
126 library(caret)
127 library(pROC)
128 versionTest <- compareVersion(packageDescription("caret")$Version,
129 "4.65")
130 if(versionTest < 0) stop("caret version 4.65 or later is required")
131
132 library(RColorBrewer)
133
134
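## listString(): collapse a character vector into a readable English list,
## e.g. c("a", "b", "c") becomes "a, b and c"; used to build sentences below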
135 listString <- function (x, period = FALSE, verbose = FALSE)
136 {
137 if (verbose) cat("\n entering listString\n")
138 flush.console()
139 if (!is.character(x))
140 x <- as.character(x)
141 numElements <- length(x)
142 out <- if (length(x) > 0) {
143 switch(min(numElements, 3), x, paste(x, collapse = " and "),
144 {
145 x <- paste(x, c(rep(",", numElements - 2), " and", ""), sep = "")
146 paste(x, collapse = " ")
147 })
148 }
149 else ""
150 if (period) out <- paste(out, ".", sep = "")
151 if (verbose) cat(" leaving listString\n\n")
152 flush.console()
153 out
154 }
155
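## resampleStats(): extract the resampled performance values for the final
## tuning parameters and phrase them as a sentence for the report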
156 resampleStats <- function(x, digits = 3)
157 {
158 bestPerf <- x$bestTune
159 colnames(bestPerf) <- gsub("^\\.", "", colnames(bestPerf))
160 out <- merge(x$results, bestPerf)
161 out <- out[, colnames(out) %in% x$perfNames]
162 names(out) <- gsub("ROC", "area under the ROC curve", names(out), fixed = TRUE)
163 names(out) <- gsub("Sens", "sensitivity", names(out), fixed = TRUE)
164 names(out) <- gsub("Spec", "specificity", names(out), fixed = TRUE)
165 names(out) <- gsub("Accuracy", "overall accuracy", names(out), fixed = TRUE)
166 names(out) <- gsub("Kappa", "Kappa statistics", names(out), fixed = TRUE)
167
168 out <- format(out, digits = digits)
169 listString(paste(names(out), "was", out))
170 }
171
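## twoClassNoProbs(): resampling summary for two-class models that cannot
## produce class probabilities; reports sensitivity, specificity and Kappa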
172 twoClassNoProbs <- function (data, lev = NULL, model = NULL)
173 {
174 out <- c(sensitivity(data[, "pred"], data[, "obs"], lev[1]),
175 specificity(data[, "pred"], data[, "obs"], lev[2]),
176 confusionMatrix(data[, "pred"], data[, "obs"])$overall["Kappa"])
177
178 names(out) <- c("Sens", "Spec", "Kappa")
179 out
180 }
181
182
183
184 ##OPTION: model name: see ?train for more values/models
185 modName <- "svmRadial"
186
187
188 load("/home/galaxy/galaxy-dist/database/files/000/dataset_521.dat")
189 rawData <- dataX
190 rawData$outcome <- dataY
191
192 @
193
194
195 \section*{Data Sets}\label{S:data}
196
197 %% OPTION: provide some background on the problem, the experimental
198 %% data, how the compounds were selected etc
199
200 <<getDataInfo, eval = TRUE, echo = FALSE, results = hide>>=
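## Basic checks on the input data: a factor column named 'outcome' must be
## present and all predictors must be numeric; also tabulate the class sizes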
201 if(!any(names(rawData) == "outcome")) stop("a variable called outcome should be in the data set")
202 if(!is.factor(rawData$outcome)) stop("the outcome should be a factor vector")
203
204 ## OPTION: when there are only two classes, the first level of the
205 ## factor is used as the "positive" or "event" for calculating
206 ## sensitivity and specificity. Adjust the outcome factor accordingly.
207 numClasses <- length(levels(rawData$outcome))
208 numSamples <- nrow(rawData)
209 numPredictors <- ncol(rawData) - 1
210 predictorNames <- names(rawData)[names(rawData) != "outcome"]
211
212 isNum <- sapply(rawData[, predictorNames, drop = FALSE], is.numeric)
213 if(any(!isNum)) stop("all predictors in rawData should be numeric")
214
215 classTextCheck <- all.equal(levels(rawData$outcome), make.names(levels(rawData$outcome)))
216 if(!isTRUE(classTextCheck)) warning("the class levels are not valid R variable names; this may cause errors")
217
218 ## Get the class distribution
219 classDist <- table(rawData$outcome)
220 classDistString <- paste("``",
221 names(classDist),
222 "'' ($n$=",
223 classDist,
224 ")",
225 sep = "")
226 classDistString <- listString(classDistString)
227 @
228
229 <<missingFilter, eval = TRUE, echo = FALSE, results = tex>>=
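## Filter out predictors (columns) and samples (rows) with more than 20%
## missing values and describe what was removed in 'missingText'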
230 colRate <- apply(rawData[, predictorNames, drop = FALSE],
231 2, function(x) mean(is.na(x)))
232
233 ##OPTION thresholds can be changed
234 colExclude <- colRate > 0.2
235
236 missingText <- ""
237
238 if(any(colExclude))
239 {
240 missingText <- paste(missingText,
241 ifelse(sum(colExclude) > 1,
242 " There were ",
243 " There was "),
244 sum(colExclude),
245 ifelse(sum(colExclude) > 1,
246 " predictors ",
247 " predictor "),
248 "with an excessive number of ",
249 "missing data. ",
250 ifelse(sum(colExclude) > 1,
251 " These were excluded. ",
252 " This was excluded. "))
253 predictorNames <- predictorNames[!colExclude]
254 rawData <- rawData[, names(rawData) %in% c("outcome", predictorNames), drop = FALSE]
255 }
256
257
258 rowRate <- apply(rawData[, predictorNames, drop = FALSE],
259 1, function(x) mean(is.na(x)))
260
261 rowExclude <- rowRate > 0.2
262
263
264 if(any(rowExclude)) {
265 missingText <- paste(missingText,
266 ifelse(sum(rowExclude) > 1,
267 " There were ",
268 " There was "),
269 sum(rowExclude),
270 ifelse(sum(rowExclude) > 1,
271 " samples ",
272 " sample "),
273 "with an excessive number of ",
274 "missing data. ",
275 ifelse(sum(rowExclude) > 1,
276 " These were excluded. ",
277 " This was excluded. "),
278 "After filtering, ",
279 sum(!rowExclude),
280 " samples remained.")
281 rawData <- rawData[!rowExclude, ]
282 hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
283 1, function(x) any(is.na(x)))
284 } else {
285 hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
286 1, function(x) any(is.na(x)))
287 missingText <- paste(missingText,
288 ifelse(missingText == "",
289 "There ",
290 "Subsequently, there "),
291 ifelse(sum(hasMissing) == 1,
292 "was ",
293 "were "),
294 ifelse(sum(hasMissing) > 0,
295 sum(hasMissing),
296 "no"),
297 ifelse(sum(hasMissing) == 1,
298 "sample ",
299 "samples "),
300 "with missing values.")
301
302 rawData <- rawData[complete.cases(rawData),]
303
304 }
305
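## Separate the predictors from the outcome, then drop near-zero-variance
## predictors from the full data set before any train/test partitioning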
306 rawData1 <- rawData[, 1:(length(rawData) - 1)]
307 rawData2 <- rawData[, length(rawData)]
308
309 set.seed(222)
310 nzv1 <- nearZeroVar(rawData1)
311 if(length(nzv1) > 0)
312 {
313 nzvVars1 <- names(rawData1)[nzv1]
314 rawData <- rawData1[, -nzv1]
315 rawData$outcome <- rawData2
316 nzvText1 <- paste("There were ",
317 length(nzv1),
318 " predictors that were removed from original data due to",
319 " severely unbalanced distributions that",
320 " could negatively affect the model fit",
321 ifelse(length(nzv1) > 10,
322 ".",
323 paste(": ",
324 listString(nzvVars1),
325 ".",
326 sep = "")),
327 sep = "")
328
329 } else {
330 rawData <- rawData1
331 rawData$outcome <- rawData2
332 nzvText1 <- ""
333
334 }
335
336 remove("rawData1")
337 remove("rawData2")
338
339 @
340
341 The initial data set consisted of \Sexpr{numSamples} samples and
342 \Sexpr{numPredictors} predictor variables. The breakdown of the
343 outcome data classes was: \Sexpr{classDistString}.
344
345 \Sexpr{missingText}
346
347 \Sexpr{nzvText1}
348
349 <<pca, eval= TRUE, echo = FALSE, results = hide>>=
350
351 predictorNames <- names(rawData)[names(rawData) != "outcome"]
352 numPredictors <- length(predictorNames)
353 predictors <- rawData[, predictorNames, drop = FALSE]
354 ## PCA will fail with predictors having fewer than 2 unique values
355 isZeroVar <- apply(predictors, 2,
356 function(x) length(unique(x)) < 2)
357 if(any(isZeroVar)) predictors <- predictors[, !isZeroVar, drop = FALSE]
358 ## For whatever reason, only the formula interface to prcomp
359 ## handles missing values
360 pcaForm <- as.formula(
361 paste("~",
362 paste(names(predictors), collapse = "+")))
363 pca <- prcomp(pcaForm,
364 data = predictors,
365 center = TRUE,
366 scale. = TRUE,
367 na.action = na.omit)
368 ## OPTION: the number of components plotted/discussed can be set
369 numPCAcomp <- 3
370 pctVar <- pca$sdev^2/sum(pca$sdev^2)*100
371 pcaText <- paste(round(pctVar[1:numPCAcomp], 1),
372 "\\\\%",
373 sep = "")
374 pcaText <- listString(pcaText)
375 @
376
377 To get an initial assessment of the separability of the classes,
378 principal component analysis (PCA) was used to distill the
379 \Sexpr{numPredictors} predictors down into \Sexpr{numPCAcomp}
380 surrogate variables (i.e. the principal components) in a manner that
381 attempts to maximize the amount of information preserved from the
382 original predictor set. Figure \ref{F:inititalPCA} contains plots of
383 the first \Sexpr{numPCAcomp} components, which accounted for
384 \Sexpr{pcaText} percent of the variability in the original predictors,
385 respectively.
386
387
388 %% OPTION: remark on how well (or poorly) the data separated
389
390 \setkeys{Gin}{width = 0.8\textwidth}
391 \begin{figure}[p]
392 \begin{center}
393
394 <<pcaPlot, eval = TRUE, echo = FALSE, results = tex, fig = TRUE, width = 8, height = 8>>=
395 trellis.par.set(caretTheme(), warn = TRUE)
396 if(numPCAcomp == 2)
397 {
398 axisRange <- extendrange(pca$x[, 1:2])
399 print(
400 xyplot(PC1 ~ PC2,
401 data = as.data.frame(pca$x),
402 type = c("p", "g"),
403 groups = rawData$outcome,
404 auto.key = list(columns = 2),
405 xlim = axisRange,
406 ylim = axisRange))
407 } else {
408 axisRange <- extendrange(pca$x[, 1:numPCAcomp])
409 print(
410 splom(~as.data.frame(pca$x)[, 1:numPCAcomp],
411 type = c("p", "g"),
412 groups = rawData$outcome,
413 auto.key = list(columns = 2),
414 as.table = TRUE,
415 prepanel.limits = function(x) axisRange
416 ))
417
418 }
419
420 @
421
422 \caption[PCA Plot]{A plot of the first \Sexpr{numPCAcomp}
423 principal components for the original data set.}
424 \label{F:inititalPCA}
425 \end{center}
426 \end{figure}
427
428
429
430 <<initialDataSplit, eval = TRUE, echo = FALSE, results = tex>>=
431
432 ## OPTION: in small samples sizes, you may not want to set aside a
433 ## training set and focus on the resampling results.
434
435 set.seed(1234)
436 dataX <- rawData[, 1:(length(rawData) - 1)]
437 dataY <- rawData[, length(rawData)]
438
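## OPTION: set Smpling to "downsampling" or "upsampling" to re-balance the
## classes before splitting; any other value leaves the data unchanged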
439 Smpling <- "garBage"
440
441 if(Smpling=="downsampling")
442 {
443 dwnsmpl <- downSample(dataX,dataY)
444 rawData <- dwnsmpl[, 1:(length(dwnsmpl) - 1)]
445 rawData$outcome <- dwnsmpl[, length(dwnsmpl)]
446 remove("dwnsmpl")
447 remove("dataX")
448 remove("dataY")
449 }else if(Smpling=="upsampling"){
450 upsmpl <- upSample(dataX,dataY)
451 rawData <- upsmpl[, 1:(length(upsmpl) - 1)]
452 rawData$outcome <- upsmpl[, length(upsmpl)]
453 remove("upsmpl")
454 remove("dataX")
455 remove("dataY")
456 }else{remove("dataX")
457 remove("dataY")
458 }
459
460
461
462 numSamples <- nrow(rawData)
463
464 predictorNames <- names(rawData)[names(rawData) != "outcome"]
465 numPredictors <- length(predictorNames)
466
467
468 classDist1 <- table(rawData$outcome)
469 classDistString1 <- paste("``",
470 names(classDist1),
471 "'' ($n$=",
472 classDist1,
473 ")",
474 sep = "")
475 classDistString1 <- listString(classDistString1)
476
477 pctTrain <- 0.8
478
479 if(pctTrain < 1)
480 {
481 ## OPTION: seed number can be changed
482 set.seed(1)
483 inTrain <- createDataPartition(rawData$outcome,
484 p = pctTrain,
485 list = FALSE)
486 trainX <- rawData[ inTrain, predictorNames]
487 testX <- rawData[-inTrain, predictorNames]
488 trainY <- rawData[ inTrain, "outcome"]
489 testY <- rawData[-inTrain, "outcome"]
490 splitText <- paste("The original data were split into ",
491 "a training set ($n$=",
492 nrow(trainX),
493 ") and a test set ($n$=",
494 nrow(testX),
495 ") in a manner that preserved the ",
496 "distribution of the classes.",
497 sep = "")
498 isZeroVar <- apply(trainX, 2,
499 function(x) length(unique(x)) < 2)
500 if(any(isZeroVar))
501 {
502 trainX <- trainX[, !isZeroVar, drop = FALSE]
503 testX <- testX[, !isZeroVar, drop = FALSE]
504 }
505
506 } else {
507 trainX <- rawData[, predictorNames]
508 testX <- NULL
509 trainY <- rawData[, "outcome"]
510 testY <- NULL
511 splitText <- "The entire data set was used as the training set."
512 }
513 trainDist <- table(trainY)
514 nir <- max(trainDist)/length(trainY)*100
515 niClass <- names(trainDist)[which.max(trainDist)]
516 nirText <- paste("The no--information rate is the accuracy that can be ",
517 "achieved by predicting all samples as the most ",
518 "dominant class. For these data, the rate is ",
519 round(nir, 2), "\\\\% using the ``",
520 niClass,
521 "'' class.",
522 sep = "")
523
524 remove("rawData")
525
526 @
527
528 \Sexpr{splitText}
529
530 \Sexpr{nirText}
531
532 The data set for model building consisted of \Sexpr{numSamples} samples and
533 \Sexpr{numPredictors} predictor variables. The breakdown of the
534 outcome data classes was: \Sexpr{classDistString1}.
535
536 <<nzv, eval= TRUE, results = tex, echo = FALSE>>=
537 ## OPTION: other pre-processing steps can be used
538 ppSteps <- caret:::suggestions(modName)
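## suggestions() returns a named logical vector of recommended pre-processing
## steps for this model type; the "nzv", "corr", "center" and "scale" elements
## are used below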
539
540 set.seed(2)
541 if(ppSteps["nzv"])
542 {
543 nzv <- nearZeroVar(trainX)
544 if(length(nzv) > 0)
545 {
546 nzvVars <- names(trainX)[nzv]
547 trainX <- trainX[, -nzv]
548 nzvText <- paste("There were ",
549 length(nzv),
550 " predictors that were removed from train set due to",
551 " severely unbalanced distributions that",
552 " could negatively affect the model",
553 ifelse(length(nzv) > 10,
554 ".",
555 paste(": ",
556 listString(nzvVars),
557 ".",
558 sep = "")),
559 sep = "")
560 testX <- testX[, -nzv]
561 } else nzvText <- ""
562 } else nzvText <- ""
563 @
564
565 \Sexpr{nzvText}
566
567
568 <<corrFilter, eval = TRUE, results = tex, echo = FALSE>>=
569 if(ppSteps["corr"])
570 {
571 ## OPTION:
572 corrThresh <- 0.75
573 highCorr <- findCorrelation(cor(trainX, use = "pairwise.complete.obs"),
574 corrThresh)
575 if(length(highCorr) > 0)
576 {
577 corrVars <- names(trainX)[highCorr]
578 trainX <- trainX[, -highCorr]
579 corrText <- paste("There were ",
580 length(highCorr),
581 " predictors that were removed due to",
582 " large between--predictor correlations that",
583 " could negatively affect the model fit",
584 ifelse(length(highCorr) > 10,
585 ".",
586 paste(": ",
587 listString(corrVars),
588 ".",
589 sep = "")),
590 " Removing these predictors forced",
591 " all pair--wise correlations to be",
592 " less than ",
593 corrThresh,
594 ".",
595 sep = "")
596 testX <- testX[, -highCorr]
597 } else corrText <- "No between--predictor correlations exceeded the given threshold, so no predictors were removed."
598 }else corrText <- ""
599 @
600
601 \Sexpr{corrText}
602
603 <<preProc, eval = TRUE, echo = FALSE, results = tex>>=
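## Assemble the pre-processing methods suggested for this model and, when any
## missing values remain, add k-nearest-neighbor imputation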
604 ppMethods <- NULL
605 if(ppSteps["center"]) ppMethods <- c(ppMethods, "center")
606 if(ppSteps["scale"]) ppMethods <- c(ppMethods, "scale")
607 if(any(hasMissing) > 0) ppMethods <- c(ppMethods, "knnImpute")
608 ##OPTION other methods, such as spatial sign, can be added to this list
609
610 if(length(ppMethods) > 0)
611 {
612 ppInfo <- preProcess(trainX, method = ppMethods)
613 trainX <- predict(ppInfo, trainX)
614 if(pctTrain < 1) testX <- predict(ppInfo, testX)
615 ppText <- paste("The following pre--processing methods were",
616 " applied to the training",
617 ifelse(pctTrain < 1, " and test", ""),
618 " data: ",
619 listString(ppMethods),
620 ".",
621 sep = "")
622 ppText <- gsub("center", "mean centering", ppText)
623 ppText <- gsub("scale", "scaling to unit variance", ppText)
624 ppText <- gsub("knnImpute",
625 paste(ppInfo$k, "--nearest neighbor imputation", sep = ""),
626 ppText)
627 ppText <- gsub("spatialSign", "the spatial sign transformation", ppText)
628 ppText <- gsub("pca", "principal component feature extraction", ppText)
629 ppText <- gsub("ica", "independent component feature extraction", ppText)
630 } else {
631 ppInfo <- NULL
632 ppText <- ""
633 }
634
635 predictorNames <- names(trainX)
636 if(nzvText != "" | corrText != "" | ppText != "")
637 {
638 varText <- paste("After pre--processing, ",
639 ncol(trainX),
640 "predictors remained for modeling.")
641 } else varText <- ""
642
643 @
644
645 \Sexpr{ppText}
646 \Sexpr{varText}
647
648 \clearpage
649
650 \section*{Model Building}
651
652 <<setupWorkers, eval = TRUE, echo = FALSE, results = tex>>=
653 numWorkers <- 1
654 ##OPTION: turn up numWorkers to use MPI
655 if(numWorkers > 1)
656 {
657 mpiCalcs <- function(X, FUN, ...)
658 {
659 theDots <- list(...)
660 parLapply(theDots$cl, X, FUN)
661 }
662
663 library(snow)
664 cl <- makeCluster(numWorkers, "MPI")
665 }
666 @
667
668 <<setupResampling, echo = FALSE, results = hide>>=
669 ##OPTION: the resampling options can be changed. See
670 ## ?trainControl for details
671
672 resampName <- "boot"
673 resampNumber <- 3
674 numRepeat <- 1
675 resampP <- 0.75
676
677 modelInfo <- modelLookup(modName)
678
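## Choose a resampling summary function: twoClassSummary (ROC, sensitivity,
## specificity) for two-class probability models, twoClassNoProbs for
## two-class models without probabilities, and defaultSummary otherwise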
679 if(numClasses == 2)
680 {
681 foo <- if(any(modelInfo$probModel)) twoClassSummary else twoClassNoProbs
682 } else foo <- defaultSummary
683
684 set.seed(3)
685 ctlObj <- trainControl(method = resampName,
686 number = resampNumber,
687 repeats = numRepeat,
688 p = resampP,
689 classProbs = any(modelInfo$probModel),
690 summaryFunction = foo)
691
692
693 ##OPTION select other performance metrics as needed
694 optMetric <- if(numClasses == 2 & any(modelInfo$probModel)) "ROC" else "Kappa"
695
696 if(numWorkers > 1)
697 {
698 ctlObj$workers <- numWorkers
699 ctlObj$computeFunction <- mpiCalcs
700 ctlObj$computeArgs <- list(cl = cl)
701 }
702 @
703
704 <<setupGrid, results = hide, echo = FALSE>>=
705 ##OPTION expand or contract these grids as needed (or
706 ## add more models
707
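## Default number of candidate values per tuning parameter; adjusted below for
## specific models, with explicit tuning grids for gbm, nnet and ada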
708 gridSize <- 3
709
710 if(modName %in% c("svmPoly", "svmRadial", "svmLinear", "lvq", "ctree2", "ctree")) gridSize <- 5
711 if(modName %in% c("earth", "fda")) gridSize <- 7
712 if(modName %in% c("knn", "rocc", "glmboost", "rf", "nodeHarvest")) gridSize <- 10
713
714 if(modName %in% c("nb")) gridSize <- 2
715 if(modName %in% c("pam", "rpart")) gridSize <- 15
716 if(modName %in% c("pls")) gridSize <- min(20, ncol(trainX))
717
718 if(modName == "gbm")
719 {
720 tGrid <- expand.grid(.interaction.depth = -1 + (1:5)*2 ,
721 .n.trees = (1:10)*20,
722 .shrinkage = .1)
723 }
724
725 if(modName == "nnet")
726 {
727 tGrid <- expand.grid(.size = -1 + (1:5)*2 ,
728 .decay = c(0, .001, .01, .1))
729 }
730
731 if(modName == "ada")
732 {
733 tGrid <- expand.grid(.maxdepth = 1, .iter = c(100,200,300,400), .nu = 1 )
734
735 }
736
737
738 @
739
740 <<fitModel, results = tex, echo = FALSE, eval = TRUE>>=
741 ##OPTION alter as needed
742
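## Fit the model with caret::train(); models that need special arguments are
## handled by the switch() below, and the final unnamed branch is the default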
743 set.seed(4)
744 modelFit <- switch(modName,
745 gbm =
746 {
747 mix <- sample(seq(along = trainY))
748 train(
749 trainX[mix,], trainY[mix], modName,
750 verbose = FALSE,
751 bag.fraction = .9,
752 metric = optMetric,
753 trControl = ctlObj,
754 tuneGrid = tGrid)
755 },
756
757 multinom =
758 {
759 train(
760 trainX, trainY, modName,
761 trace = FALSE,
762 metric = optMetric,
763 maxit = 1000,
764 MaxNWts = 5000,
765 trControl = ctlObj,
766 tuneLength = gridSize)
767 },
768
769 nnet =
770 {
771 train(
772 trainX, trainY, modName,
773 metric = optMetric,
774 linout = FALSE,
775 trace = FALSE,
776 maxit = 1000,
777 MaxNWts = 5000,
778 trControl = ctlObj,
779 tuneGrid = tGrid)
780
781 },
782
783 svmRadial =, svmPoly =, svmLinear =
784 {
785 train(
786 trainX, trainY, modName,
787 metric = optMetric,
788 scaled = TRUE,
789 trControl = ctlObj,
790 tuneLength = gridSize)
791 },
792 {
793 train(trainX, trainY, modName,
794 trControl = ctlObj,
795 metric = optMetric,
796 tuneLength = gridSize)
797 })
798
799 @
800
801 <<modelDescr, echo = FALSE, results = tex>>=
802 summaryText <- ""
803
804 resampleName <- switch(tolower(modelFit$control$method),
805 boot = paste("the bootstrap (", length(modelFit$control$index), " reps)", sep = ""),
806 boot632 = paste("the bootstrap 632 rule (", length(modelFit$control$index), " reps)", sep = ""),
807 cv = paste("cross-validation (", modelFit$control$number, " fold)", sep = ""),
808 repeatedcv = paste("cross-validation (", modelFit$control$number, " fold, repeated ",
809 modelFit$control$repeats, " times)", sep = ""),
810 lgocv = paste("repeated train/test splits (", length(modelFit$control$index), " reps, ",
811 round(modelFit$control$p, 2), "$\\%$)", sep = ""), loocv = "leave--one--out cross--validation")
812
813 tuneVars <- latexTranslate(tolower(modelInfo$label))
814 tuneVars <- gsub("#", "the number of ", tuneVars, fixed = TRUE)
815 if(ncol(modelFit$bestTune) == 1 && colnames(modelFit$bestTune) == ".parameter")
816 {
817 summaryText <- paste(summaryText,
818 "\n\n",
819 "There are no tuning parameters associated with this model.",
820 "To characterize the model performance on the training set,",
821 resampleName,
822 "was used.",
823 "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile}",
824 "show summaries of the resampling results. ")
825
826 } else {
827 summaryText <- paste("There",
828 ifelse(nrow(modelInfo) > 1, "are", "is"),
829 nrow(modelInfo),
830 ifelse(nrow(modelInfo) > 1, "tuning parameters", "tuning parameter"),
831 "associated with this model:",
832 listString(tuneVars, period = TRUE))
833
834
835
836 paramNames <- gsub(".", "", names(modelFit$bestTune), fixed = TRUE)
837 for(i in seq(along = paramNames))
838 {
839 check <- modelInfo$parameter %in% paramNames[i]
840 if(any(check))
841 {
842 paramNames[i] <- modelInfo$label[which(check)]
843 }
844 }
845
846 paramNames <- gsub("#", "the number of ", paramNames, fixed = TRUE)
847 ## Check to see if there was only one combination fit
848 summaryText <- paste(summaryText,
849 "To choose",
850 ifelse(nrow(modelInfo) > 1,
851 "appropriate values of the tuning parameters,",
852 "an appropriate value of the tuning parameter,"),
853 resampleName,
854 "was used to generated a profile of performance across the",
855 nrow(modelFit$results),
856 ifelse(nrow(modelInfo) > 1,
857 "combinations of the tuning parameters.",
858 "candidate values."),
859
860 "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile} show",
861 "summaries of the resampling profile. ", "The final model fitted to the entire training set was:",
862 listString(paste(latexTranslate(tolower(paramNames)), "=", modelFit$bestTune[1,]), period = TRUE))
863
864 }
865 @
866
867 \Sexpr{summaryText}
868
869 <<resampTable, echo = FALSE, results = tex>>=
870 tableData <- modelFit$results
871
872 if(all(modelInfo$parameter == "parameter") && resampName == "boot632")
873 {
874 tableData <- tableData[,-1, drop = FALSE]
875 colNums <- c( length(modelFit$perfNames), length(modelFit$perfNames), length(modelFit$perfNames))
876 colLabels <- c("Mean", "Standard Deviation", "Apparent")
877 constString <- ""
878 isConst <- NULL
879 } else if (all(modelInfo$parameter == "parameter") && (resampName == "boot" | resampName == "cv" | resampName == "repeatedcv" )){
880 tableData <- tableData[,-1, drop = FALSE]
881 colNums <- c(length(modelFit$perfNames), length(modelFit$perfNames))
882 colLabels <- c("Mean", "Standard Deviation")
883 constString <- ""
884 isConst <- NULL
885 } else if (all(modelInfo$parameter == "parameter") && resampName == "LOOCV" ){
886 tableData <- tableData[,-1, drop = FALSE]
887 colNums <- length(modelFit$perfNames)
888 colLabels <- c("Measures")
889 constString <- ""
890 isConst <- NULL
891 } else {
892 if (all(modelInfo$parameter != "parameter") && resampName == "boot632" ){
893 isConst <- apply(tableData[, modelInfo$parameter, drop = FALSE],
894 2,
895 function(x) length(unique(x)) == 1)
896
897 numParamInTable <- sum(!isConst)
898
899 if(any(isConst))
900 {
901 constParam <- modelInfo$parameter[isConst]
902 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
903 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
904 constString <- paste("The tuning",
905 ifelse(sum(isConst) > 1,
906 "parmeters",
907 "parameter"),
908 listString(paste("``", names(constValues), "''", sep = "")),
909 ifelse(sum(isConst) > 1,
910 "were",
911 "was"),
912 "held constant at",
913 ifelse(sum(isConst) > 1,
914 "a value of",
915 "values of"),
916 listString(constValues[1,]))
917
918 } else constString <- ""
919
920 cn <- colnames(tableData)
921 for(i in seq(along = cn))
922 {
923 check <- modelInfo$parameter %in% cn[i]
924 if(any(check))
925 {
926 cn[i] <- modelInfo$label[which(check)]
927 }
928 }
929 colnames(tableData) <- cn
930
931 colNums <- c(numParamInTable,
932 length(modelFit$perfNames),
933 length(modelFit$perfNames),
934 length(modelFit$perfNames))
935 colLabels <- c("", "Mean", "Standard Deviation", "Apparent")
936
937 }else if (all(modelInfo$parameter != "parameter") && (resampName == "boot" | resampName == "repeatedcv" | resampName == "cv") ){
938 isConst <- apply(tableData[, modelInfo$parameter, drop = FALSE],
939 2,
940 function(x) length(unique(x)) == 1)
941
942 numParamInTable <- sum(!isConst)
943
944 if(any(isConst))
945 {
946 constParam <- modelInfo$parameter[isConst]
947 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
948 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
949 constString <- paste("The tuning",
950 ifelse(sum(isConst) > 1,
951 "parmeters",
952 "parameter"),
953 listString(paste("``", names(constValues), "''", sep = "")),
954 ifelse(sum(isConst) > 1,
955 "were",
956 "was"),
957 "held constant at",
958 ifelse(sum(isConst) > 1,
959 "a value of",
960 "values of"),
961 listString(constValues[1,]))
962
963 } else constString <- ""
964
965 cn <- colnames(tableData)
966 for(i in seq(along = cn))
967 {
968 check <- modelInfo$parameter %in% cn[i]
969 if(any(check))
970 {
971 cn[i] <- modelInfo$label[which(check)]
972 }
973 }
974 colnames(tableData) <- cn
975
976 colNums <- c(numParamInTable,
977 length(modelFit$perfNames),
978 length(modelFit$perfNames))
979 colLabels <- c("", "Mean", "Standard Deviation")
980
981 }
982 else if (all(modelInfo$parameter != "parameter") && resampName == "LOOCV"){
983 isConst <- apply(tableData[, modelInfo$parameter, drop = FALSE],
984 2,
985 function(x) length(unique(x)) == 1)
986
987 numParamInTable <- sum(!isConst)
988
989 if(any(isConst))
990 {
991 constParam <- modelInfo$parameter[isConst]
992 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
993 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
994 constString <- paste("The tuning",
995 ifelse(sum(isConst) > 1,
996 "parmeters",
997 "parameter"),
998 listString(paste("``", names(constValues), "''", sep = "")),
999 ifelse(sum(isConst) > 1,
1000 "were",
1001 "was"),
1002 "held constant at",
1003 ifelse(sum(isConst) > 1,
1004 "a value of",
1005 "values of"),
1006 listString(constValues[1,]))
1007
1008 } else constString <- ""
1009
1010 cn <- colnames(tableData)
1011 for(i in seq(along = cn))
1012 {
1013 check <- modelInfo$parameter %in% cn[i]
1014 if(any(check))
1015 {
1016 cn[i] <- modelInfo$label[which(check)]
1017 }
1018 }
1019 colnames(tableData) <- cn
1020
1021 colNums <- c(numParamInTable,
1022 length(modelFit$perfNames))
1023 colLabels <- c("", "Measures")
1024
1025 }
1026
1027 }
1028
1029 colnames(tableData) <- gsub("SD$", "", colnames(tableData))
1030 colnames(tableData) <- gsub("Apparent$", "", colnames(tableData))
1031 colnames(tableData) <- latexTranslate(colnames(tableData))
1032 rownames(tableData) <- latexTranslate(rownames(tableData))
1033
1034 latex(tableData,
1035 rowname = NULL,
1036 file = "",
1037 cgroup = colLabels,
1038 n.cgroup = colNums,
1039 where = "h!",
1040 digits = 4,
1041 longtable = nrow(tableData) > 30,
1042 caption = paste(resampleName, "results from the model fit.", constString),
1043 label = "T:resamps")
1044 @
1045
1046 \setkeys{Gin}{ width = 0.9\textwidth}
1047 \begin{figure}[b]
1048 \begin{center}
1049
1050 <<profilePlot, echo = FALSE, fig = TRUE, width = 8, height = 6>>=
1051 trellis.par.set(caretTheme(), warn = TRUE)
1052 if(all(modelInfo$parameter == "parameter") | all(isConst) | modName == "nb")
1053 {
1054 resultsPlot <- resampleHist(modelFit)
1055 plotCaption <- paste("Distributions of model performance from the ",
1056 "training set estimated using ",
1057 resampleName)
1058 } else {
1059 if(modName %in% c("svmPoly", "svmRadial", "svmLinear"))
1060 {
1061 resultsPlot <- plot(modelFit,
1062 metric = optMetric,
1063 xTrans = function(x) log10(x))
1064 resultsPlot <- update(resultsPlot,
1065 type = c("g", "p", "l"),
1066 ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
1067
1068 } else {
1069 resultsPlot <- plot(modelFit,
1070 metric = optMetric)
1071 resultsPlot <- update(resultsPlot,
1072 type = c("g", "p", "l"),
1073 ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
1074 }
1075 plotCaption <- paste("A plot of the estimates of the",
1076 optMetric,
1077 "values calculated using",
1078 resampleName)
1079 }
1080 print(resultsPlot)
1081 @
1082 \caption[Performance Plot]{\Sexpr{plotCaption}.}
1083 \label{F:profile}
1084 \end{center}
1085 \end{figure}
1086
1087
1088 <<stopWorkers, echo = FALSE, results = hide>>=
1089 if(numWorkers > 1) stopCluster(cl)
1090 @
1091
1092 <<testPred, results = tex, echo = FALSE>>=
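## When a test set was held out (pctTrain < 1), predict it, summarize accuracy,
## Kappa and (for two-class probability models) ROC statistics, and build the
## test-set section of the report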
1093 if(pctTrain < 1)
1094 {
1095 cat("\\clearpage\n\\section*{Test Set Results}\n\n")
1096 classPred <- predict(modelFit, testX)
1097 cm <- confusionMatrix(classPred, testY)
1098 values <- cm$overall[c("Accuracy", "Kappa", "AccuracyPValue", "McnemarPValue")]
1099
1100 values <- values[!is.na(values) & !is.nan(values)]
1101 values <- c(format(values[1:2], digits = 3),
1102 format.pval(values[-(1:2)], digits = 5))
1103 nms <- c("the overall accuracy", "the Kappa statistic",
1104 "the $p$--value that accuracy is greater than the no--information rate",
1105 "the $p$--value of concordance from McNemar's test")
1106 nms <- nms[seq(along = values)]
1107 names(values) <- nms
1108
1109 if(any(modelInfo$probModel))
1110 {
1111 classProbs <- extractProb(list(fit = modelFit),
1112 testX = testX,
1113 testY = testY)
1114 classProbs <- subset(classProbs, dataType == "Test")
1115 if(numClasses == 2)
1116 {
1117 tmp <- twoClassSummary(classProbs, lev = levels(classProbs$obs))
1118 tmp <- c(format(tmp, digits = 3))
1119 names(tmp) <- c("the sensitivity", "the specificity",
1120 "the area under the ROC curve")
1121 values <- c(values, tmp)
1122
1123 }
1124 probPlot <- plotClassProbs(classProbs)
1125 }
1126 testString <- paste("Based on the test set of",
1127 nrow(testX),
1128 "samples,",
1129 listString(paste(names(values), "was", values), period = TRUE),
1130 "The confusion matrix for the test set is shown in Table",
1131 "\\\\ref{T:cm}.")
1132 testString <- paste(testString,
1133 " Using ", resampleName,
1134 ", the training set estimates were ",
1135 resampleStats(modelFit),
1136 ".",
1137 sep = "")
1138
1139 if(any(modelInfo$probModel)) testString <- paste(testString,
1140 "Histograms of the class probabilities",
1141 "for the test set samples are shown in",
1142 "Figure \\\\ref{F:probs}",
1143 ifelse(numClasses == 2,
1144 " and the test set ROC curve is in Figure \\\\ref{F:roc}.",
1145 "."))
1146
1147
1148
1149 latex(cm$table,
1150 title = "",
1151 file = "",
1152 where = "h",
1153 cgroup = "Observed Values",
1154 n.cgroup = numClasses,
1155 caption = "The confusion matrix for the test set",
1156 label = "T:cm")
1157
1158 } else testString <- ""
1159 @
1160 \Sexpr{testString}
1161
1162
1163 <<classProbsTex, results = tex, echo = FALSE>>=
1164 if(any(modelInfo$probModel))
1165 {
1166 cat(
1167 paste("\\begin{figure}[p]\n",
1168 "\\begin{center}\n",
1169 "\\includegraphics{classProbs}",
1170 "\\caption[PCA Plot]{Class probabilities",
1171 "for the test set. Each panel contains ",
1172 "separate classes}\n",
1173 "\\label{F:probs}\n",
1174 "\\end{center}\n",
1175 "\\end{figure}"))
1176 }
1177 if(any(modelInfo$probModel) & numClasses == 2)
1178 {
1179 cat(
1180 paste("\\begin{figure}[p]\n",
1181 "\\begin{center}\n",
1182 "\\includegraphics[clip, width = .8\\textwidth]{roc}",
1183 "\\caption[ROC Plot]{ROC Curve",
1184 "for the test set.}\n",
1185 "\\label{F:roc}\n",
1186 "\\end{center}\n",
1187 "\\end{figure}"))
1188 }
1189 @
1190 <<classProbsPdf, results = hide, echo = FALSE>>=
1191 if(any(modelInfo$probModel))
1192 {
1193 pdf("classProbs.pdf", height = 7, width = 7)
1194 trellis.par.set(caretTheme(), warn = FALSE)
1195 print(probPlot)
1196 dev.off()
1197 }
1198
1199 if(any(modelInfo$probModel) & numClasses == 2)
1200 { resPonse<-testY
1201 preDictor<-classProbs[, levels(trainY)[1]]
1202 pdf("roc.pdf", height = 8, width = 8)
1203 # from pROC example at http://web.expasy.org/pROC/screenshots.htm
1204 plot.roc(resPonse, preDictor, # data
1205 percent=TRUE, # show all values in percent
1206 partial.auc=c(100, 90), partial.auc.correct=TRUE, # define a partial AUC (pAUC)
1207 print.auc=TRUE, #display pAUC value on the plot with following options:
1208 print.auc.pattern="Corrected pAUC (100-90%% SP):\n%.1f%%", print.auc.col="#1c61b6",
1209 auc.polygon=TRUE, auc.polygon.col="#1c61b6", # show pAUC as a polygon
1210 max.auc.polygon=TRUE, max.auc.polygon.col="#1c61b622", # also show the 100% polygon
1211 main="Partial AUC (pAUC)")
1212 plot.roc(resPonse, preDictor,
1213 percent=TRUE, add=TRUE, type="n", # add to plot, but don't re-add the ROC itself (useless)
1214 partial.auc=c(100, 90), partial.auc.correct=TRUE,
1215 partial.auc.focus="se", # focus pAUC on the sensitivity
1216 print.auc=TRUE, print.auc.pattern="Corrected pAUC (100-90%% SE):\n%.1f%%", print.auc.col="#008600",
1217 print.auc.y=40, # do not print auc over the previous one
1218 auc.polygon=TRUE, auc.polygon.col="#008600",
1219 max.auc.polygon=TRUE, max.auc.polygon.col="#00860022")
1220 dev.off()
1221 }
1222
1223
1224 @
1225
1226 \section*{Versions}
1227
1228 <<versions, echo = FALSE, results = tex>>=
1229 toLatex(sessionInfo())
1230
1231 @
1232
1233 <<save-data, echo = FALSE, results = tex>>=
1234 ## OPTION: change the file name below to match modName
1235 Fit <- modelFit
1236 save(Fit, file = "pls-Fit.RData")
1237 @
1238 The fitted model is saved as pls-Fit.RData for reuse. This file contains the variable Fit.
1239
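As a sketch of how the saved object might be reused in a later R session
(assuming the .RData file is in the working directory and that newData is a
placeholder for a data frame with the same predictor columns as the training
set), one could do the following; the chunk is not evaluated here:

<<reuseExample, eval = FALSE, echo = TRUE>>=
## Minimal reuse sketch (not run); 'newData' is a hypothetical data frame of
## new samples with the same predictors used for training
library(caret)
load("pls-Fit.RData")                          # restores the object 'Fit'
newClasses <- predict(Fit, newdata = newData)                 # predicted classes
newProbs   <- predict(Fit, newdata = newData, type = "prob")  # class probabilities,
                                                              # for models that support them
@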
1240 \end{document}