caret_future/tool3/test/tmp1/earth/result-doc.Rnw @ changeset 0:68300206e90d (draft, default, tip)

Uploaded by deepakjadmin on Thu, 05 Nov 2015 02:41:30 -0500
1 %% Classification Modeling Script
2 %% Max Kuhn (max.kuhn@pfizer.com, mxkuhn@gmail.com)
3 %% Version: 1.00
4 %% Created on: 2010/10/02
5 %%
6 %% This is an Sweave template for building and describing
7 %% classification models. It mixes R and LaTeX code. The document can
8 %% be processed using R's Sweave function to produce a .tex file.
9 %%
10 %% The inputs are:
11 %% - the initial data set in a data frame called 'rawData'
12 %% - a factor column in the data set called 'outcome'. This should be the
13 %% outcome (class) variable
14 %% - all other columns in rawData should be predictor variables
15 %% - the type of model should be in a variable called 'modName'.
16 %%
17 %% The script attempts to make some intelligent choices based on the
18 %% model being used. For example, if modName is "pls", the script will
19 %% automatically center and scale the predictor data. There are
20 %% situations where these choices can (and should) be changed.
21 %%
22 %% There are other options that may make sense to change. For example,
23 %% the user may want to adjust the type of resampling. To find these
24 %% parts of the script, search on the string 'OPTION'. These parts of
25 %% the code will document the options.
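%%
%% A minimal sketch of how these inputs might be set up before calling
%% Sweave (the file name and objects below are hypothetical examples):
%%   rawData <- read.csv("descriptors.csv")       # predictors plus outcome
%%   rawData$outcome <- factor(rawData$outcome)   # factor outcome column
%%   modName <- "earth"                           # any model known to train()
%%   Sweave("result-doc.Rnw")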
26
27 \documentclass[12pt]{report}
28 \usepackage{amsmath}
29 \usepackage[pdftex]{graphicx}
30 \usepackage{color}
31 \usepackage{ctable}
32 \usepackage{xspace}
33 \usepackage{fancyvrb}
34 \usepackage{fancyhdr}
35 \usepackage{lastpage}
36 \usepackage{longtable}
37 \usepackage{algorithm2e}
38 \usepackage[
39 colorlinks=true,
40 linkcolor=blue,
41 citecolor=blue,
42 urlcolor=blue]
43 {hyperref}
44 \usepackage{lscape}
45 \usepackage{Sweave}
46 \SweaveOpts{keep.source = TRUE}
47
48 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
49
50 % define new colors for use
51 \definecolor{darkgreen}{rgb}{0,0.6,0}
52 \definecolor{darkred}{rgb}{0.6,0.0,0}
53 \definecolor{lightbrown}{rgb}{1,0.9,0.8}
54 \definecolor{brown}{rgb}{0.6,0.3,0.3}
55 \definecolor{darkblue}{rgb}{0,0,0.8}
56 \definecolor{darkmagenta}{rgb}{0.5,0,0.5}
57
58 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
59
60 \newcommand{\bld}[1]{\mbox{\boldmath $#1$}}
61 \newcommand{\shell}[1]{\mbox{$#1$}}
62 \renewcommand{\vec}[1]{\mbox{\bf {#1}}}
63
64 \newcommand{\ReallySmallSpacing}{\renewcommand{\baselinestretch}{.6}\Large\normalsize}
65 \newcommand{\SmallSpacing}{\renewcommand{\baselinestretch}{1.1}\Large\normalsize}
66
67 \newcommand{\halfs}{\frac{1}{2}}
68
69 \setlength{\oddsidemargin}{-.25 truein}
70 \setlength{\evensidemargin}{0truein}
71 \setlength{\topmargin}{-0.2truein}
72 \setlength{\textwidth}{7 truein}
73 \setlength{\textheight}{8.5 truein}
74 \setlength{\parindent}{0.20truein}
75 \setlength{\parskip}{0.10truein}
76
77 \setcounter{LTchunksize}{50}
78
79 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
80 \pagestyle{fancy}
81 \lhead{}
82 %% OPTION Report header name
83 \chead{Classification Model Script}
84 \rhead{}
85 \lfoot{}
86 \cfoot{}
87 \rfoot{\thepage\ of \pageref{LastPage}}
88 \renewcommand{\headrulewidth}{1pt}
89 \renewcommand{\footrulewidth}{1pt}
90 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
91
92 %% OPTION Report title and modeler name
93 \title{Classification Model Script using earth}
94 \author{Lynn Group with M. Kuhn, SCIS, JNU, New Delhi}
95
96 \begin{document}
97
98 \maketitle
99
100 \thispagestyle{empty}
101 <<dummy, eval=TRUE, echo=FALSE, results=hide>>=
102 # Set default values for variables used later so that \Sexpr references do not fail when Sweave parses the document
103 numSamples=''
104 classDistString=''
105 missingText=''
106 numPredictors=''
107 numPCAcomp=''
108 pcaText=''
109 nzvText=''
110 corrText=''
111 ppText=''
112 varText=''
113 splitText="Dummy Text"
114 nirText="Dummy Text"
115 # pctTrain is initialised in the data-splitting chunk and reused later in the testPred chunk
116 pctTrain=0.8
117 Smpling=''
118 nzvText1=''
119 classDistString1=''
120 dwnsmpl=''
121 upsmpl=''
122
123 @
124 <<startup, eval= TRUE, results = hide, echo = FALSE>>=
125 library(Hmisc)
126 library(caret)
127 library(pROC)
128 versionTest <- compareVersion(packageDescription("caret")$Version,
129 "4.65")
130 if(versionTest < 0) stop("caret version 4.65 or later is required")
131
132 library(RColorBrewer)
133
134
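## listString(): collapse a character vector into readable prose
## ("a", "a and b", or "a, b and c") for the generated sentences below.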
135 listString <- function (x, period = FALSE, verbose = FALSE)
136 {
137 if (verbose) cat("\n entering listString\n")
138 flush.console()
139 if (!is.character(x))
140 x <- as.character(x)
141 numElements <- length(x)
142 out <- if (length(x) > 0) {
143 switch(min(numElements, 3), x, paste(x, collapse = " and "),
144 {
145 x <- paste(x, c(rep(",", numElements - 2), " and", ""), sep = "")
146 paste(x, collapse = " ")
147 })
148 }
149 else ""
150 if (period) out <- paste(out, ".", sep = "")
151 if (verbose) cat(" leaving listString\n\n")
152 flush.console()
153 out
154 }
155
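## resampleStats(): pull the resampling performance at the optimal tuning
## values out of a train object and phrase it as a sentence.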
156 resampleStats <- function(x, digits = 3)
157 {
158 bestPerf <- x$bestTune
159 colnames(bestPerf) <- gsub("^\\.", "", colnames(bestPerf))
160 out <- merge(x$results, bestPerf)
161 out <- out[, colnames(out) %in% x$perfNames]
162 names(out) <- gsub("ROC", "area under the ROC curve", names(out), fixed = TRUE)
163 names(out) <- gsub("Sens", "sensitivity", names(out), fixed = TRUE)
164 names(out) <- gsub("Spec", "specificity", names(out), fixed = TRUE)
165 names(out) <- gsub("Accuracy", "overall accuracy", names(out), fixed = TRUE)
166 names(out) <- gsub("Kappa", "Kappa statistic", names(out), fixed = TRUE)
167
168 out <- format(out, digits = digits)
169 listString(paste(names(out), "was", out))
170 }
171
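## twoClassNoProbs(): summary function for two-class models that cannot
## produce class probabilities; reports sensitivity, specificity and Kappa.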
172 twoClassNoProbs <- function (data, lev = NULL, model = NULL)
173 {
174 out <- c(sensitivity(data[, "pred"], data[, "obs"], lev[1]),
175 specificity(data[, "pred"], data[, "obs"], lev[2]),
176 confusionMatrix(data[, "pred"], data[, "obs"])$overall["Kappa"])
177
178 names(out) <- c("Sens", "Spec", "Kappa")
179 out
180 }
181
182
183
184 ##OPTION: model name: see ?train for more values/models
185 modName <- "earth"
186
187
188 load("/home/galaxy/galaxy-dist/database/files/000/dataset_397.dat")
189 rawData <- dataX
190 rawData$outcome <- dataY
191
192 @
193
194
195 \section*{Data Sets}\label{S:data}
196
197 %% OPTION: provide some background on the problem, the experimental
198 %% data, how the compounds were selected etc
199
200 <<getDataInfo, eval = TRUE, echo = FALSE, results = hide>>=
201 if(!any(names(rawData) == "outcome")) stop("a variable called outcome should be in the data set")
202 if(!is.factor(rawData$outcome)) stop("the outcome should be a factor vector")
203
204 ## OPTION: when there are only two classes, the first level of the
205 ## factor is used as the "positive" or "event" for calculating
206 ## sensitivity and specificity. Adjust the outcome factor accordingly.
207 numClasses <- length(levels(rawData$outcome))
208 numSamples <- nrow(rawData)
209 numPredictors <- ncol(rawData) - 1
210 predictorNames <- names(rawData)[names(rawData) != "outcome"]
211
212 isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric)
213 if(any(!isNum)) stop("all predictors in rawData should be numeric")
214
215 classTextCheck <- all.equal(levels(rawData$outcome), make.names(levels(rawData$outcome)))
216 if(!classTextCheck) warning("the class levels are not valid R variable names; this may cause errors")
217
218 ## Get the class distribution
219 classDist <- table(rawData$outcome)
220 classDistString <- paste("``",
221 names(classDist),
222 "'' ($n$=",
223 classDist,
224 ")",
225 sep = "")
226 classDistString <- listString(classDistString)
227 @
228
229 <<missingFilter, eval = TRUE, echo = FALSE, results = tex>>=
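## Filter predictors (columns) and samples (rows) with more than 20% missing
## values, then drop near-zero-variance predictors from the original data;
## the generated text below describes what was removed.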
230 colRate <- apply(rawData[, predictorNames, drop = FALSE],
231 2, function(x) mean(is.na(x)))
232
233 ##OPTION thresholds can be changed
234 colExclude <- colRate > 0.2
235
236 missingText <- ""
237
238 if(any(colExclude))
239 {
240 missingText <- paste(missingText,
241 ifelse(sum(colExclude) > 1,
242 " There were ",
243 " There was "),
244 sum(colExclude),
245 ifelse(sum(colExclude) > 1,
246 " predictors ",
247 " predictor "),
248 "with an excessive number of ",
249 "missing data. ",
250 ifelse(sum(colExclude) > 1,
251 " These were excluded. ",
252 " This was excluded. "))
253 predictorNames <- predictorNames[!colExclude]
254 rawData <- rawData[, names(rawData) %in% c("outcome", predictorNames), drop = FALSE]
255 }
256
257
258 rowRate <- apply(rawData[, predictorNames, drop = FALSE],
259 1, function(x) mean(is.na(x)))
260
261 rowExclude <- rowRate > 0.2
262
263
264 if(any(rowExclude)) {
265 missingText <- paste(missingText,
266 ifelse(sum(rowExclude) > 1,
267 " There were ",
268 " There was "),
269 sum(rowExclude),
270 ifelse(sum(rowExclude) > 1,
271 " samples ",
272 " sample "),
273 "with an excessive number of ",
274 "missing values. ",
275 ifelse(sum(rowExclude) > 1,
276 " These were excluded. ",
277 " This was excluded. "),
278 "After filtering, ",
279 sum(!rowExclude),
280 " samples remained.")
281 rawData <- rawData[!rowExclude, ]
282 hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
283 1, function(x) mean(is.na(x)))
284 } else {
285 hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
286 1, function(x) any(is.na(x)))
287 missingText <- paste(missingText,
288 ifelse(missingText == "",
289 "There ",
290 "Subsequently, there "),
291 ifelse(sum(hasMissing) == 1,
292 "was ",
293 "were "),
294 ifelse(sum(hasMissing) > 0,
295 sum(hasMissing),
296 "no"),
297 ifelse(sum(hasMissing) == 1,
298 "sample ",
299 "samples "),
300 "with missing values.")
301
302 rawData <- rawData[complete.cases(rawData),]
303
304 }
305
306 rawData1 <- rawData[, 1:(length(rawData) - 1)]
307 rawData2 <- rawData[, length(rawData)]
308
309 set.seed(222)
310 nzv1 <- nearZeroVar(rawData1)
311 if(length(nzv1) > 0)
312 {
313 nzvVars1 <- names(rawData1)[nzv1]
314 rawData <- rawData1[, -nzv1]
315 rawData$outcome <- rawData2
316 nzvText1 <- paste("There were ",
317 length(nzv1),
318 " predictors that were removed from original data due to",
319 " severely unbalanced distributions that",
320 " could negatively affect the model fit",
321 ifelse(length(nzv1) > 10,
322 ".",
323 paste(": ",
324 listString(nzvVars1),
325 ".",
326 sep = "")),
327 sep = "")
328
329 } else {
330 rawData <- rawData1
331 rawData$outcome <- rawData2
332 nzvText1 <- ""
333
334 }
335
336 remove("rawData1")
337 remove("rawData2")
338
339 @
340
341 The initial data set consisted of \Sexpr{numSamples} samples and
342 \Sexpr{numPredictors} predictor variables. The breakdown of the
343 outcome data classes was: \Sexpr{classDistString}.
344
345 \Sexpr{missingText}
346
347 \Sexpr{nzvText1}
348
349 <<pca, eval= TRUE, echo = FALSE, results = tex>>=
350
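## Project the predictors onto principal components (centered and scaled) to
## get an initial view of class separation; columns with fewer than two
## unique values are dropped first because prcomp cannot handle them.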
351 predictorNames <- names(rawData)[names(rawData) != "outcome"]
352 numPredictors <- length(predictorNames)
353 predictors <- rawData[, predictorNames, drop = FALSE]
354 ## PCA will fail with predictors having less than 2 unique values
355 isZeroVar <- apply(predictors, 2,
356 function(x) length(unique(x)) < 2)
357 if(any(isZeroVar)) predictors <- predictors[, !isZeroVar, drop = FALSE]
358 ## For whatever reason, only the formula interface to prcomp
359 ## handles missing values
360 pcaForm <- as.formula(
361 paste("~",
362 paste(names(predictors), collapse = "+")))
363 pca <- prcomp(pcaForm,
364 data = predictors,
365 center = TRUE,
366 scale. = TRUE,
367 na.action = na.omit)
368 ## OPTION: the number of components plotted/discussed can be set
369 numPCAcomp <- 3
370 pctVar <- pca$sdev^2/sum(pca$sdev^2)*100
371 pcaText <- paste(round(pctVar[1:numPCAcomp], 1),
372 "\\\\%",
373 sep = "")
374 pcaText <- listString(pcaText)
375 @
376
377 To get an initial assessment of the separability of the classes,
378 principal component analysis (PCA) was used to distill the
379 \Sexpr{numPredictors} predictors down into \Sexpr{numPCAcomp}
380 surrogate variables (i.e. the principal components) in a manner that
381 attempts to maximize the amount of information preserved from the
382 original predictor set. Figure \ref{F:inititalPCA} contains plots of
383 the first \Sexpr{numPCAcomp} components, which accounted for
384 \Sexpr{pcaText} of the variability in the original predictors,
385 respectively.
386
387
388 %% OPTION: remark on how well (or poorly) the data separated
389
390 \setkeys{Gin}{width = 0.8\textwidth}
391 \begin{figure}[p]
392 \begin{center}
393
394 <<pcaPlot, eval = TRUE, echo = FALSE, results = tex, fig = TRUE, width = 8, height = 8>>=
395 trellis.par.set(caretTheme(), warn = TRUE)
396 if(numPCAcomp == 2)
397 {
398 axisRange <- extendrange(pca$x[, 1:2])
399 print(
400 xyplot(PC1 ~ PC2,
401 data = as.data.frame(pca$x),
402 type = c("p", "g"),
403 groups = rawData$outcome,
404 auto.key = list(columns = 2),
405 xlim = axisRange,
406 ylim = axisRange))
407 } else {
408 axisRange <- extendrange(pca$x[, 1:numPCAcomp])
409 print(
410 splom(~as.data.frame(pca$x)[, 1:numPCAcomp],
411 type = c("p", "g"),
412 groups = rawData$outcome,
413 auto.key = list(columns = 2),
414 as.table = TRUE,
415 prepanel.limits = function(x) axisRange
416 ))
417
418 }
419
420 @
421
422 \caption[PCA Plot]{A plot of the first \Sexpr{numPCAcomp}
423 principal components for the original data set.}
424 \label{F:inititalPCA}
425 \end{center}
426 \end{figure}
427
428
429
430 <<initialDataSplit, eval = TRUE, echo = FALSE, results = tex>>=
431
432 ## OPTION: in small samples sizes, you may not want to set aside a
433 ## training set and focus on the resampling results.
434
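## Smpling selects optional class balancing (down- or up-sampling; any other
## value leaves the data unchanged) before the stratified training/test split
## created below with createDataPartition().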
435 set.seed(1234)
436 dataX <- rawData[, 1:(length(rawData) - 1)]
437 dataY <- rawData[, length(rawData)]
438
439 Smpling <- "garBage"
440
441 if(Smpling=="downsampling")
442 {
443 dwnsmpl <- downSample(dataX,dataY)
444 rawData <- dwnsmpl[, 1:(length(dwnsmpl) - 1)]
445 rawData$outcome <- dwnsmpl[, length(dwnsmpl)]
446 remove("dwnsmpl")
447 remove("dataX")
448 remove("dataY")
449 }else if(Smpling=="upsampling"){
450 upsmpl <- upSample(dataX,dataY)
451 rawData <- upsmpl[, 1:(length(upsmpl) - 1)]
452 rawData$outcome <- upsmpl[, length(upsmpl)]
453 remove("upsmpl")
454 remove("dataX")
455 remove("dataY")
456 } else {
457 remove("dataX"); remove("dataY")
458 }
459
460
461
462 numSamples <- nrow(rawData)
463
464 predictorNames <- names(rawData)[names(rawData) != "outcome"]
465 numPredictors <- length(predictorNames)
466
467
468 classDist1 <- table(rawData$outcome)
469 classDistString1 <- paste("``",
470 names(classDist1),
471 "'' ($n$=",
472 classDist1,
473 ")",
474 sep = "")
475 classDistString1 <- listString(classDistString1)
476
477 pctTrain <- 0.8
478
479 if(pctTrain < 1)
480 {
481 ## OPTION: seed number can be changed
482 set.seed(1)
483 inTrain <- createDataPartition(rawData$outcome,
484 p = pctTrain,
485 list = FALSE)
486 trainX <- rawData[ inTrain, predictorNames]
487 testX <- rawData[-inTrain, predictorNames]
488 trainY <- rawData[ inTrain, "outcome"]
489 testY <- rawData[-inTrain, "outcome"]
490 splitText <- paste("The original data were split into ",
491 "a training set ($n$=",
492 nrow(trainX),
493 ") and a test set ($n$=",
494 nrow(testX),
495 ") in a manner that preserved the ",
496 "distribution of the classes.",
497 sep = "")
498 isZeroVar <- apply(trainX, 2,
499 function(x) length(unique(x)) < 2)
500 if(any(isZeroVar))
501 {
502 trainX <- trainX[, !isZeroVar, drop = FALSE]
503 testX <- testX[, !isZeroVar, drop = FALSE]
504 }
505
506 } else {
507 trainX <- rawData[, predictorNames]
508 testX <- NULL
509 trainY <- rawData[, "outcome"]
510 testY <- NULL
511 splitText <- "The entire data set was used as the training set."
512 }
513 trainDist <- table(trainY)
514 nir <- max(trainDist)/length(trainY)*100
515 niClass <- names(trainDist)[which.max(trainDist)]
516 nirText <- paste("The non--information rate is the accuracy that can be ",
517 "achieved by predicting all samples using the most ",
518 "dominant class. For these data, the rate is ",
519 round(nir, 2), "\\\\% using the ``",
520 niClass,
521 "'' class.",
522 sep = "")
523
524 remove("rawData")
525
526 @
527
528 \Sexpr{splitText}
529
530 \Sexpr{nirText}
531
532 The data set for model building consisted of \Sexpr{numSamples} samples and
533 \Sexpr{numPredictors} predictor variables. The breakdown of the
534 outcome data classes was: \Sexpr{classDistString1}.
535
536 <<nzv, eval= TRUE, results = tex, echo = FALSE>>=
537 ## OPTION: other pre-processing steps can be used
538 ppSteps <- caret:::suggestions(modName)
539
540 set.seed(2)
541 if(ppSteps["nzv"])
542 {
543 nzv <- nearZeroVar(trainX)
544 if(length(nzv) > 0)
545 {
546 nzvVars <- names(trainX)[nzv]
547 trainX <- trainX[, -nzv]
548 nzvText <- paste("There were ",
549 length(nzv),
550 " predictors that were removed from train set due to",
551 " severely unbalanced distributions that",
552 " could negatively affect the model",
553 ifelse(length(nzv) > 10,
554 ".",
555 paste(": ",
556 listString(nzvVars),
557 ".",
558 sep = "")),
559 sep = "")
560 testX <- testX[, -nzv]
561 } else nzvText <- ""
562 } else nzvText <- ""
563 @
564
565 \Sexpr{nzvText}
566
567
568 <<corrFilter, eval = TRUE, results = tex, echo = FALSE>>=
569 if(ppSteps["corr"])
570 {
571 ## OPTION:
572 corrThresh <- 0.75
573 highCorr <- findCorrelation(cor(trainX, use = "pairwise.complete.obs"),
574 corrThresh)
575 if(length(highCorr) > 0)
576 {
577 corrVars <- names(trainX)[highCorr]
578 trainX <- trainX[, -highCorr]
579 corrText <- paste("There were ",
580 length(highCorr),
581 " predictors that were removed due to",
582 " large between--predictor correlations that",
583 " could negatively affect the model fit",
584 ifelse(length(highCorr) > 10,
585 ".",
586 paste(": ",
587 listString(corrVars),
588 ".",
589 sep = "")),
590 " Removing these predictors forced",
591 " all pair--wise correlations to be",
592 " less than ",
593 corrThresh,
594 ".",
595 sep = "")
596 testX <- testX[, -highCorr]
597 } else corrText <- paste("No predictors had pair--wise correlations above the threshold of ", corrThresh, ", so none were removed.", sep = "")
598 }else corrText <- ""
599 @
600
601 \Sexpr{corrText}
602
603 <<preProc, eval = TRUE, echo = FALSE, results = tex>>=
604 ppMethods <- NULL
605 if(ppSteps["center"]) ppMethods <- c(ppMethods, "center")
606 if(ppSteps["scale"]) ppMethods <- c(ppMethods, "scale")
607 if(any(hasMissing)) ppMethods <- c(ppMethods, "knnImpute")
608 ##OPTION other methods, such as spatial sign, can be added to this list
609
610 if(length(ppMethods) > 0)
611 {
612 ppInfo <- preProcess(trainX, method = ppMethods)
613 trainX <- predict(ppInfo, trainX)
614 if(pctTrain < 1) testX <- predict(ppInfo, testX)
615 ppText <- paste("The following pre--processing methods were",
616 " applied to the training",
617 ifelse(pctTrain < 1, " and test", ""),
618 " data: ",
619 listString(ppMethods),
620 ".",
621 sep = "")
622 ppText <- gsub("center", "mean centering", ppText)
623 ppText <- gsub("scale", "scaling to unit variance", ppText)
624 ppText <- gsub("knnImpute",
625 paste(ppInfo$k, "--nearest neighbor imputation", sep = ""),
626 ppText)
627 ppText <- gsub("spatialSign", "the spatial sign transformation", ppText)
628 ppText <- gsub("pca", "principal component feature extraction", ppText)
629 ppText <- gsub("ica", "independent component feature extraction", ppText)
630 } else {
631 ppInfo <- NULL
632 ppText <- ""
633 }
634
635 predictorNames <- names(trainX)
636 if(nzvText != "" | corrText != "" | ppText != "")
637 {
638 varText <- paste("After pre--processing,",
639 ncol(trainX),
640 "predictors remained for modeling.")
641 } else varText <- ""
642
643 @
644
645 \Sexpr{ppText}
646 \Sexpr{varText}
647
648 \clearpage
649
650 \section*{Model Building}
651
652 <<setupWorkers, eval = TRUE, echo = FALSE, results = tex>>=
653 numWorkers <- 1
654 ##OPTION: turn up numWorkers to use MPI
655 if(numWorkers > 1)
656 {
657 mpiCalcs <- function(X, FUN, ...)
658 {
659 theDots <- list(...)
660 parLapply(theDots$cl, X, FUN)
661 }
662
663 library(snow)
664 cl <- makeCluster(numWorkers, "MPI")
665 }
666 @
667
668 <<setupResampling, echo = FALSE, results = hide>>=
669 ##OPTION: the resampling options can be changed. See
670 ## ?trainControl for details
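## boot632: the 0.632 bootstrap with 'resampNumber' resamples; the summary
## function chosen below depends on the number of classes and on whether the
## model can produce class probabilities.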
671 resampName <- "boot632"
672 resampNumber <- 10
673 numRepeat <- 1
674 resampP <- 0.75
675
676 modelInfo <- modelLookup(modName)
677
678 if(numClasses == 2)
679 {
680 foo <- if(any(modelInfo$probModel)) twoClassSummary else twoClassNoProbs
681 } else foo <- defaultSummary
682
683 set.seed(3)
684 ctlObj <- trainControl(method = resampName,
685 number = resampNumber,
686 repeats = numRepeat,
687 p = resampP,
688 classProbs = any(modelInfo$probModel),
689 summaryFunction = foo)
690
691
692 ##OPTION select other performance metrics as needed
693 optMetric <- if(numClasses == 2 & any(modelInfo$probModel)) "ROC" else "Kappa"
694
695 if(numWorkers > 1)
696 {
697 ctlObj$workers <- numWorkers
698 ctlObj$computeFunction <- mpiCalcs
699 ctlObj$computeArgs <- list(cl = cl)
700 }
701 @
702
703 <<setupGrid, results = hide, echo = FALSE>>=
704 ##OPTION expand or contract these grids as needed (or
705 ## add more models
706
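## gridSize controls how many candidate values per tuning parameter train()
## explores via tuneLength; gbm, nnet and ada use explicit tuning grids instead.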
707 gridSize <- 3
708
709 if(modName %in% c("svmPoly", "svmRadial", "svmLinear", "lvq", "ctree2", "ctree")) gridSize <- 5
710 if(modName %in% c("earth", "fda")) gridSize <- 7
711 if(modName %in% c("knn", "rocc", "glmboost", "rf", "nodeHarvest")) gridSize <- 10
712
713 if(modName %in% c("nb")) gridSize <- 2
714 if(modName %in% c("pam", "rpart")) gridSize <- 15
715 if(modName %in% c("pls")) gridSize <- min(20, ncol(trainX))
716
717 if(modName == "gbm")
718 {
719 tGrid <- expand.grid(.interaction.depth = -1 + (1:5)*2 ,
720 .n.trees = (1:10)*20,
721 .shrinkage = .1)
722 }
723
724 if(modName == "nnet")
725 {
726 tGrid <- expand.grid(.size = -1 + (1:5)*2 ,
727 .decay = c(0, .001, .01, .1))
728 }
729
730 if(modName == "ada")
731 {
732 tGrid <- expand.grid(.maxdepth = 1, .iter = c(100,200,300,400), .nu = 1 )
733
734 }
735
736
737 @
738
739 <<fitModel, results = tex, echo = FALSE, eval = TRUE>>=
740 ##OPTION alter as needed
741
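## Model-specific arguments are supplied where needed; the final (unnamed)
## branch of the switch handles all remaining models, including earth,
## using tuneLength = gridSize.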
742 set.seed(4)
743 modelFit <- switch(modName,
744 gbm =
745 {
746 mix <- sample(seq(along = trainY))
747 train(
748 trainX[mix,], trainY[mix], modName,
749 verbose = FALSE,
750 bag.fraction = .9,
751 metric = optMetric,
752 trControl = ctlObj,
753 tuneGrid = tGrid)
754 },
755
756 multinom =
757 {
758 train(
759 trainX, trainY, modName,
760 trace = FALSE,
761 metric = optMetric,
762 maxit = 1000,
763 MaxNWts = 5000,
764 trControl = ctlObj,
765 tuneLength = gridSize)
766 },
767
768 nnet =
769 {
770 train(
771 trainX, trainY, modName,
772 metric = optMetric,
773 linout = FALSE,
774 trace = FALSE,
775 maxit = 1000,
776 MaxNWts = 5000,
777 trControl = ctlObj,
778 tuneGrid = tGrid)
779
780 },
781
782 svmRadial =, svmPoly =, svmLinear =
783 {
784 train(
785 trainX, trainY, modName,
786 metric = optMetric,
787 scaled = TRUE,
788 trControl = ctlObj,
789 tuneLength = gridSize)
790 },
791 {
792 train(trainX, trainY, modName,
793 trControl = ctlObj,
794 metric = optMetric,
795 tuneLength = gridSize)
796 })
797
798 @
799
800 <<modelDescr, echo = FALSE, results = tex>>=
801 summaryText <- ""
802
803 resampleName <- switch(tolower(modelFit$control$method),
804 boot = paste("the bootstrap (", length(modelFit$control$index), " reps)", sep = ""),
805 boot632 = paste("the bootstrap 632 rule (", length(modelFit$control$index), " reps)", sep = ""),
806 cv = paste("cross-validation (", modelFit$control$number, " fold)", sep = ""),
807 repeatedcv = paste("cross-validation (", modelFit$control$number, " fold, repeated ",
808 modelFit$control$repeats, " times)", sep = ""),
809 lgocv = paste("repeated train/test splits (", length(modelFit$control$index), " reps, ",
810 round(modelFit$control$p, 2), "$\\%$)", sep = ""))
811
812 tuneVars <- latexTranslate(tolower(modelInfo$label))
813 tuneVars <- gsub("\\#", "the number of ", tuneVars, fixed = TRUE)
814 if(ncol(modelFit$bestTune) == 1 && colnames(modelFit$bestTune) == ".parameter")
815 {
816 summaryText <- paste(summaryText,
817 "\n\n",
818 "There are no tuning parameters associated with this model.",
819 "To characterize the model performance on the training set,",
820 resampleName,
821 "was used.",
822 "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile}",
823 "show summaries of the resampling results. ")
824
825 } else {
826 summaryText <- paste("There",
827 ifelse(nrow(modelInfo) > 1, "are", "is"),
828 nrow(modelInfo),
829 ifelse(nrow(modelInfo) > 1, "tuning parameters", "tuning parameter"),
830 "associated with this model:",
831 listString(tuneVars, period = TRUE))
832
833
834
835 paramNames <- gsub(".", "", names(modelFit$bestTune), fixed = TRUE)
836 for(i in seq(along = paramNames))
837 {
838 check <- modelInfo$parameter %in% paramNames[i]
839 if(any(check))
840 {
841 paramNames[i] <- modelInfo$label[which(check)]
842 }
843 }
844
845 paramNames <- gsub("#", "the number of ", paramNames, fixed = TRUE)
846 ## Check to see if there was only one combination fit
847 summaryText <- paste(summaryText,
848 "To choose",
849 ifelse(nrow(modelInfo) > 1,
850 "appropriate values of the tuning parameters,",
851 "an appropriate value of the tuning parameter,"),
852 resampleName,
853 "was used to generated a profile of performance across the",
854 nrow(modelFit$results),
855 ifelse(nrow(modelInfo) > 1,
856 "combinations of the tuning parameters.",
857 "candidate values."),
858
859 "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile} show",
860 "summaries of the resampling profile. ", "The final model fitted to the entire training set was:",
861 listString(paste(latexTranslate(tolower(paramNames)), "=", modelFit$bestTune[1,]), period = TRUE))
862
863 }
864 @
865
866 \Sexpr{summaryText}
867
868 <<resampTable, echo = FALSE, results = tex>>=
869 tableData <- modelFit$results
870
871 if(all(modelInfo$parameter == "parameter"))
872 {
873 tableData <- tableData[,-1, drop = FALSE]
874 colNums <- c( length(modelFit$perfNames), length(modelFit$perfNames), length(modelFit$perfNames))
875 colLabels <- c("Mean", "Standard Deviation", "Apparent")
876 constString <- ""
877 isConst <- NULL
878
879 } else {
880
881 isConst <- apply(tableData[, modelInfo$parameter, drop = FALSE],
882 2,
883 function(x) length(unique(x)) == 1)
884
885 numParamInTable <- sum(!isConst)
886
887 if(any(isConst))
888 {
889 constParam <- modelInfo$parameter[isConst]
890 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
891 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
892 constString <- paste("The tuning",
893 ifelse(sum(isConst) > 1,
894 "parameters",
895 "parameter"),
896 listString(paste("``", names(constValues), "''", sep = "")),
897 ifelse(sum(isConst) > 1,
898 "were",
899 "was"),
900 "held constant at",
901 ifelse(sum(isConst) > 1,
902 "values of",
903 "a value of"),
904 listString(constValues[1,]))
905
906 } else constString <- ""
907
908 cn <- colnames(tableData)
909 for(i in seq(along = cn))
910 {
911 check <- modelInfo$parameter %in% cn[i]
912 if(any(check))
913 {
914 cn[i] <- modelInfo$label[which(check)]
915 }
916 }
917 colnames(tableData) <- cn
918
919 colNums <- c(numParamInTable,
920 length(modelFit$perfNames),
921 length(modelFit$perfNames),
922 length(modelFit$perfNames))
923 colLabels <- c("", "Mean", "Standard Deviation", "Apparent")
924 }
925
926 colnames(tableData) <- gsub("SD$", "", colnames(tableData))
927 colnames(tableData) <- gsub("Apparent$", "", colnames(tableData))
928 colnames(tableData) <- latexTranslate(colnames(tableData))
929 rownames(tableData) <- latexTranslate(rownames(tableData))
930
931 latex(tableData,
932 rowname = NULL,
933 file = "",
934 cgroup = colLabels,
935 n.cgroup = colNums,
936 where = "h!",
937 digits = 4,
938 longtable = nrow(tableData) > 30,
939 caption = paste(resampleName, "results from the model fit.", constString),
940 label = "T:resamps")
941 @
942
943 \setkeys{Gin}{ width = 0.9\textwidth}
944 \begin{figure}[b]
945 \begin{center}
946
947 <<profilePlot, echo = FALSE, fig = TRUE, width = 8, height = 6>>=
948 trellis.par.set(caretTheme(), warn = TRUE)
949 if(all(modelInfo$parameter == "parameter") | all(isConst) | modName == "nb")
950 {
951 resultsPlot <- resampleHist(modelFit)
952 plotCaption <- paste("Distributions of model performance from the",
953 "training set estimated using",
954 resampleName)
955 } else {
956 if(modName %in% c("svmPoly", "svmRadial", "svmLinear"))
957 {
958 resultsPlot <- plot(modelFit,
959 metric = optMetric,
960 xTrans = function(x) log10(x))
961 resultsPlot <- update(resultsPlot,
962 type = c("g", "p", "l"),
963 ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
964
965 } else {
966 resultsPlot <- plot(modelFit,
967 metric = optMetric)
968 resultsPlot <- update(resultsPlot,
969 type = c("g", "p", "l"),
970 ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
971 }
972 plotCaption <- paste("A plot of the estimates of the",
973 optMetric,
974 "values calculated using",
975 resampleName)
976 }
977 print(resultsPlot)
978 @
979 \caption[Performance Plot]{\Sexpr{plotCaption}.}
980 \label{F:profile}
981 \end{center}
982 \end{figure}
983
984
985 <<stopWorkers, echo = FALSE, results = hide>>=
986 if(numWorkers > 1) stopCluster(cl)
987 @
988
989 <<testPred, results = tex, echo = FALSE>>=
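## Evaluate the held-out test set: confusion matrix, accuracy, Kappa and,
## for models that produce class probabilities, sensitivity, specificity
## and the area under the ROC curve.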
990 if(pctTrain < 1)
991 {
992 cat("\\clearpage\n\\section*{Test Set Results}\n\n")
993 classPred <- predict(modelFit, testX)
994 cm <- confusionMatrix(classPred, testY)
995 values <- cm$overall[c("Accuracy", "Kappa", "AccuracyPValue", "McnemarPValue")]
996
997 values <- values[!is.na(values) & !is.nan(values)]
998 values <- c(format(values[1:2], digits = 3),
999 format.pval(values[-(1:2)], digits = 5))
1000 nms <- c("the overall accuracy", "the Kappa statistic",
1001 "the $p$--value that accuracy is greater than the no--information rate",
1002 "the $p$--value of concordance from McNemar's test")
1003 nms <- nms[seq(along = values)]
1004 names(values) <- nms
1005
1006 if(any(modelInfo$probModel))
1007 {
1008 classProbs <- extractProb(list(fit = modelFit),
1009 testX = testX,
1010 testY = testY)
1011 classProbs <- subset(classProbs, dataType == "Test")
1012 if(numClasses == 2)
1013 {
1014 tmp <- twoClassSummary(classProbs, lev = levels(classProbs$obs))
1015 tmp <- c(format(tmp, digits = 3))
1016 names(tmp) <- c("the sensitivity", "the specificity",
1017 "the area under the ROC curve")
1018 values <- c(values, tmp)
1019
1020 }
1021 probPlot <- plotClassProbs(classProbs)
1022 }
1023 testString <- paste("Based on the test set of",
1024 nrow(testX),
1025 "samples,",
1026 listString(paste(names(values), "was", values), period = TRUE),
1027 "The confusion matrix for the test set is shown in Table",
1028 "\\\\ref{T:cm}.")
1029 testString <- paste(testString,
1030 " Using ", resampleName,
1031 ", the training set estimates were ",
1032 resampleStats(modelFit),
1033 ".",
1034 sep = "")
1035
1036 if(any(modelInfo$probModel)) testString <- paste(testString,
1037 "Histograms of the class probabilities",
1038 "for the test set samples are shown in",
1039 "Figure \\\\ref{F:probs}",
1040 ifelse(numClasses == 2,
1041 " and the test set ROC curve is in Figure \\\\ref{F:roc}.",
1042 "."))
1043
1044
1045
1046 latex(cm$table,
1047 title = "",
1048 file = "",
1049 where = "h",
1050 cgroup = "Observed Values",
1051 n.cgroup = numClasses,
1052 caption = "The confusion matrix for the test set",
1053 label = "T:cm")
1054
1055 } else testString <- ""
1056 @
1057 \Sexpr{testString}
1058
1059
1060 <<classProbsTex, results = tex, echo = FALSE>>=
1061 if(any(modelInfo$probModel))
1062 {
1063 cat(
1064 paste("\\begin{figure}[p]\n",
1065 "\\begin{center}\n",
1066 "\\includegraphics{classProbs}",
1067 "\\caption[PCA Plot]{Class probabilities",
1068 "for the test set. Each panel contains ",
1069 "separate classes}\n",
1070 "\\label{F:probs}\n",
1071 "\\end{center}\n",
1072 "\\end{figure}"))
1073 }
1074 if(any(modelInfo$probModel) & numClasses == 2)
1075 {
1076 cat(
1077 paste("\\begin{figure}[p]\n",
1078 "\\begin{center}\n",
1079 "\\includegraphics[clip, width = .8\\textwidth]{roc}",
1080 "\\caption[ROC Plot]{ROC Curve",
1081 "for the test set.}\n",
1082 "\\label{F:roc}\n",
1083 "\\end{center}\n",
1084 "\\end{figure}"))
1085 }
1086 @
1087 <<classProbsPdf, results = hide, echo = FALSE >>=
1088 if(any(modelInfo$probModel))
1089 {
1090 pdf("classProbs.pdf", height = 7, width = 7)
1091 trellis.par.set(caretTheme(), warn = FALSE)
1092 print(probPlot)
1093 dev.off()
1094 }
1095
1096 if(any(modelInfo$probModel) & numClasses == 2)
1097 { resPonse <- testY
1098 preDictor <- classProbs[, levels(trainY)[1]]
1099 pdf("roc.pdf", height = 8, width = 8)
1100 # from pROC example at http://web.expasy.org/pROC/screenshots.htm
1101 plot.roc(resPonse, preDictor, # data
1102 percent=TRUE, # show all values in percent
1103 partial.auc=c(100, 90), partial.auc.correct=TRUE, # define a partial AUC (pAUC)
1104 print.auc=TRUE, #display pAUC value on the plot with following options:
1105 print.auc.pattern="Corrected pAUC (100-90%% SP):\n%.1f%%", print.auc.col="#1c61b6",
1106 auc.polygon=TRUE, auc.polygon.col="#1c61b6", # show pAUC as a polygon
1107 max.auc.polygon=TRUE, max.auc.polygon.col="#1c61b622", # also show the 100% polygon
1108 main="Partial AUC (pAUC)")
1109 plot.roc(resPonse, preDictor,
1110 percent=TRUE, add=TRUE, type="n", # add to plot, but don't re-add the ROC itself (useless)
1111 partial.auc=c(100, 90), partial.auc.correct=TRUE,
1112 partial.auc.focus="se", # focus pAUC on the sensitivity
1113 print.auc=TRUE, print.auc.pattern="Corrected pAUC (100-90%% SE):\n%.1f%%", print.auc.col="#008600",
1114 print.auc.y=40, # do not print auc over the previous one
1115 auc.polygon=TRUE, auc.polygon.col="#008600",
1116 max.auc.polygon=TRUE, max.auc.polygon.col="#00860022")
1117 dev.off()
1118 }
1119
1120
1121 @
1122
1123 \section*{Versions}
1124
1125 <<versions, echo = FALSE, results = tex>>=
1126 toLatex(sessionInfo())
1127
1128 @
1129
1130 <<save-data, echo = FALSE, results = tex>>=
1131 ## OPTION: the saved object name could be changed to match modName
1132 Fit<-modelFit
1133 save(Fit,file="earth-Fit.RData")
1134 @
1135 The model was built using earth and saved as earth-Fit.RData for reuse; the file contains the fitted model in the variable Fit.
1136
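The sketch below (not evaluated when the report is built) illustrates how the
saved fit might be reused; the data frame \texttt{newX} is a hypothetical
object with the same predictor columns, pre--processed in the same way as the
training data.

<<reuseExample, eval = FALSE, echo = TRUE>>=
## hypothetical example of reloading and applying the saved model
load("earth-Fit.RData")                     # restores the object 'Fit'
newClasses <- predict(Fit, newdata = newX)  # predicted classes for new data
@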
1137 \end{document}