r_caret_test: caret_regression/tool2/templateLibrary.py comparison

comparison caret_regression/tool2/templateLibrary.py @ 0:68300206e90d draft default tip

Uploaded

author	deepakjadmin
date	Thu, 05 Nov 2015 02:41:30 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:68300206e90d
+def __template4Rnw():
+	template4Rnw = r'''%% Regression Modeling Script
+%% Max Kuhn (max.kuhn@pfizer.com, mxkuhn@gmail.com)
+%% Version: 1.00
+%% Created on: 2010/10/02
+%%
+%% Lynn Group
+%% Version: 2.00
+%% Created on: 2014/11/15
+%% This is an Sweave template for building and describing
+%% classification models. It mixes R and LaTeX code. The document can
+%% be processing using R's Sweave function to produce a tex file.
+%%
+%% The inputs are:
+%% - the initial data set in a data frame called 'rawData'
+%% - a numeric column in the data set called 'outcome'. this should be the
+%%    outcome variable
+%% - all other columns in rawData should be predictor variables
+%% - the type of model should be in a variable called 'modName'.
+%%
+%% The script attempts to make some intelligent choices based on the
+%% model being used. For example, if modName is "pls", the script will
+%% automatically center and scale the predictor data. There are
+%% situations where these choices can (and should be) changed.
+%%
+%% There are other options that may make sense to change. For example,
+%% the user may want to adjust the type of resampling. To find these
+%% parts of the script, search on the string 'OPTION'. These parts of
+%% the code will document the options.
+\documentclass[12pt]{report}
+\usepackage{amsmath}
+\usepackage[pdftex]{graphicx}
+\usepackage{color}
+\usepackage{ctable}
+\usepackage{xspace}
+\usepackage{fancyvrb}
+\usepackage{fancyhdr}
+\usepackage{lastpage}
+\usepackage{longtable}
+\usepackage{algorithm2e}
+\usepackage[
+colorlinks=true,
+linkcolor=blue,
+citecolor=blue,
+urlcolor=blue]
+{hyperref}
+\usepackage{lscape}
+\usepackage{Sweave}
+\SweaveOpts{keep.source = TRUE}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% define new colors for use
+\definecolor{darkgreen}{rgb}{0,0.6,0}
+\definecolor{darkred}{rgb}{0.6,0.0,0}
+\definecolor{lightbrown}{rgb}{1,0.9,0.8}
+\definecolor{brown}{rgb}{0.6,0.3,0.3}
+\definecolor{darkblue}{rgb}{0,0,0.8}
+\definecolor{darkmagenta}{rgb}{0.5,0,0.5}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\newcommand{\bld}[1]{\mbox{\boldmath $#1$}}
+\newcommand{\shell}[1]{\mbox{$#1$}}
+\renewcommand{\vec}[1]{\mbox{\bf {#1}}}
+\newcommand{\ReallySmallSpacing}{\renewcommand{\baselinestretch}{.6}\Large\normalsize}
+\newcommand{\SmallSpacing}{\renewcommand{\baselinestretch}{1.1}\Large\normalsize}
+\newcommand{\halfs}{\frac{1}{2}}
+\setlength{\oddsidemargin}{-.25 truein}
+\setlength{\evensidemargin}{0truein}
+\setlength{\topmargin}{-0.2truein}
+\setlength{\textwidth}{7 truein}
+\setlength{\textheight}{8.5 truein}
+\setlength{\parindent}{0.20truein}
+\setlength{\parskip}{0.10truein}
+\setcounter{LTchunksize}{50}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\pagestyle{fancy}
+\lhead{}
+%% OPTION Report header name
+\chead{Regression Model Script}
+\rhead{}
+\lfoot{}
+\cfoot{}
+\rfoot{\thepage\ of \pageref{LastPage}}
+\renewcommand{\headrulewidth}{1pt}
+\renewcommand{\footrulewidth}{1pt}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% OPTION Report title and modeler name
+\title{Regression Model Script using $METHOD }
+\author{"M. Kuhn and Lynn Group, SCIS, JNU, New Delhi"}
+\begin{document}
+\maketitle
+\thispagestyle{empty}
+<<startup, eval= TRUE, results = hide, echo = FALSE>>=
+library(Hmisc)
+library(caret)
+versionTest <- compareVersion(packageDescription("caret")$Version,
+"4.65")
+if(versionTest < 0) stop("caret version 4.65 or later is required")
+library(RColorBrewer)
+listString <- function (x, period = FALSE, verbose = FALSE)
+{
+if (verbose)   cat("\n      entering listString\n")
+flush.console()
+if (!is.character(x))
+x <- as.character(x)
+numElements <- length(x)
+out <- if (length(x) > 0) {
+switch(min(numElements, 3), x, paste(x, collapse = " and "),
+{
+x <- paste(x, c(rep(",", numElements - 2), " and", ""), sep = "")
+paste(x, collapse = " ")
+})
+}
+else ""
+if (period)  out <- paste(out, ".", sep = "")
+if (verbose)  cat("      leaving  listString\n\n")
+flush.console()
+out
+}
+resampleStats <- function(x, digits = 3)
+{
+bestPerf <- x$bestTune
+colnames(bestPerf) <- gsub("^\\.", "", colnames(bestPerf))
+out <- merge(x$results, bestPerf)
+out <- out[, colnames(out) %in% x$perfNames]
+names(out) <- gsub("ROC", "area under the ROC curve", names(out), fixed = TRUE)
+names(out) <- gsub("Sens", "sensitivity", names(out), fixed = TRUE)
+names(out) <- gsub("Spec", "specificity", names(out), fixed = TRUE)
+names(out) <- gsub("Accuracy", "overall accuracy", names(out), fixed = TRUE)
+names(out) <- gsub("Kappa", "Kappa statistics", names(out), fixed = TRUE)
+names(out) <- gsub("RMSE", "root mean squared error", names(out), fixed = TRUE)
+names(out) <- gsub("Rsquared", "$R^2$", names(out), fixed = TRUE)
+out <- format(out, digits = digits)
+listString(paste(names(out), "was", out))
+}
+latticeBubble <- function(x, y, z, offset = .5, splits = 10,
+pal = colorRampPalette(brewer.pal(9,"YlOrRd")[-(1:2)]),
+...)
+{
+cexValues <- rank(z)/length(z) + offset
+splits <- unique(quantile(z, probs = seq(0, 1, length = splits)))
+splitup <- cut(z, breaks = splits, include.lowest = TRUE)
+cols <- pal(length(levels(splitup)))
+colValues <- cols[as.numeric(splitup)]
+if(is.data.frame(x))
+{
+out <- splom(~x, col = colValues, cex = cexValues, ...)
+} else out <- xyplot(y~x, col = colValues, cex = cexValues, ...)
+out
+}
+##OPTION: model name: see ?train for more values/models
+modName <- "$METHOD"
+load("$RDATA")
+rawData <- dataX
+rawData$$outcome <- dataY
+@
+\section*{Data Sets}\label{S:data}
+%% OPTION: provide some background on the problem, the experimental
+%% data, how the compounds were selected etc
+<<getDataInfo, eval = $GETDATAINFOEVAL, echo = $GETDATAINFOECHO, results = $GETDATAINFORESULT>>=
+if(!any(names(rawData) == "outcome")) stop("a variable called outcome should be in the data set")
+if(!is.numeric(rawData$outcome)) stop("the outcome should be a numeric vector")
+numSamples <- nrow(rawData)
+numPredictors <- ncol(rawData) - 1
+predictorNames <- names(rawData)[names(rawData) != "outcome"]
+isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric)
+if(any(!isNum)) stop("all predictors in rawData should be numeric")
+@
+<<missingFilter, eval = $MISSINGFILTEREVAL, echo = $MISSINGFILTERECHO, results = $MISSINGFILTERRESULT>>=
+colRate <- apply(rawData[, predictorNames, drop = FALSE],
+2, function(x) mean(is.na(x)))
+##OPTION thresholds can be changed
+colExclude <- colRate > $MISSINGFILTERTHRESHC
+missingText <- ""
+if(any(colExclude))
+{
+missingText <- paste(missingText,
+ifelse(sum(colExclude) > 1,
+" There were ",
+" There was "),
+sum(colExclude),
+ifelse(sum(colExclude) > 1,
+" predictors ",
+" predictor "),
+"with an excessive number of ",
+"missing data. ",
+ifelse(sum(colExclude) > 1,
+" These were excluded. ",
+" This was excluded. "))
+predictorNames <- predictorNames[!colExclude]
+rawData <- rawData[, names(rawData) %in% c("outcome", predictorNames), drop = FALSE]
+}
+rowRate <- apply(rawData[, predictorNames, drop = FALSE],
+1, function(x) mean(is.na(x)))
+rowExclude <- rowRate > $MISSINGFILTERTHRESHR
+if(any(rowExclude))
+{
+missingText <- paste(missingText,
+ifelse(sum(rowExclude) > 1,
+" There were ",
+" There was "),
+sum(colExclude),
+ifelse(sum(rowExclude) > 1,
+" samples ",
+" sample "),
+"with an excessive number of ",
+"missing data. ",
+ifelse(sum(rowExclude) > 1,
+" These were excluded. ",
+" This was excluded. "),
+"After filtering, ",
+sum(!rowExclude),
+" samples remained.")
+rawData <- rawData[!rowExclude, ]
+hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
+1, function(x) mean(is.na(x)))
+} else {
+hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
+1, function(x) any(is.na(x)))
+missingText <- paste(missingText,
+ifelse(missingText == "",
+"There ",
+"Subsequently, there "),
+ifelse(sum(hasMissing) == 1,
+"was ",
+"were "),
+ifelse(sum(hasMissing) > 0,
+sum(hasMissing),
+"no"),
+ifelse(sum(hasMissing) == 1,
+"sample ",
+"samples "),
+"with missing values.")
+rawData <- rawData[complete.cases(rawData),]
+}
+dataDist <- summary(rawData$outcome)
+dataSD <- sd(rawData$outcome, na.rm = TRUE)
+dataText <- paste("The average outcome value was ",
+dataDist["Mean"],
+" and a standard deviation of ",
+dataSD, ". The minimum and maximum values were ",
+dataDist["Min."], " and ", dataDist["Max."],
+", respectively. Figure \\\\ref{F:dens} shows a ",
+" density plot (i.e. a smooth histogram) of the response.",
+sep = "")
+rawData1 <- rawData[,1:length(rawData)-1]
+rawData2 <- rawData[,length(rawData)]
+set.seed(222)
+nzv1 <- nearZeroVar(rawData1)
+if(length(nzv1) > 0)
+{
+nzvVars1 <- names(rawData1)[nzv1]
+rawData <- rawData1[,-nzv1]
+rawData$outcome <- rawData2
+nzvText1 <- paste("There were ",
+length(nzv1),
+" predictors that were removed from original data due to",
+" severely unbalanced distributions that",
+" could negatively affect the model fit",
+ifelse(length(nzv1) > 10,
+".",
+paste(": ",
+listString(nzvVars1),
+".",
+sep = "")),
+sep = "")
+} else {
+rawData <- rawData1
+rawData$outcome <- rawData2
+nzvText1 <- ""
+}
+remove("rawData1")
+remove("rawData2")
+@
+The initial data set consisted of \Sexpr{numSamples} samples and
+\Sexpr{numPredictors} predictor variables. \Sexpr{dataText} \Sexpr{missingText}
+\Sexpr{nzvText1}
+\setkeys{Gin}{width = 0.8\textwidth}
+\begin{figure}[b]
+\begin{center}
+<<densityplot, echo = FALSE, results = hide, fig = TRUE, width = 8, height = 4.5>>=
+trellis.par.set(caretTheme(), warn = TRUE)
+print(densityplot(~rawData$outcome, pch = "|", adjust = 1.25, xlab = ""))
+@
+\caption[Data Density]{A density plot of the response. The marks
+along the $x$--axis show the locations of the data points.}
+\label{F:dens}
+\end{center}
+\end{figure}
+<<pca, eval= $PCAEVAL, echo = $PCAECHO, results = $PCARESULT>>=
+predictorNames <- names(rawData)[names(rawData) != "outcome"]
+numPredictors <- length(predictorNames)
+predictors <- rawData[, predictorNames, drop = FALSE]
+## PCA will fail with predictors having less than 2 unique values
+isZeroVar <- apply(predictors, 2,
+function(x) length(unique(x)) < 2)
+if(any(isZeroVar)) predictors <- predictors[, !isZeroVar, drop = FALSE]
+## For whatever, only the formula interface to prcomp
+## handles missing values
+pcaForm <- as.formula(
+paste("~",
+paste(names(predictors), collapse = "+")))
+pca <- prcomp(pcaForm,
+data = predictors,
+center = TRUE,
+scale. = TRUE,
+na.action = na.omit)
+## OPTION: the number of components plotted/discussed can be set
+numPCAcomp <- $PCACOMP
+pctVar <- pca$sdev^2/sum(pca$sdev^2)*100
+pcaText <- paste(round(pctVar[1:numPCAcomp], 1),
+"\\\\%",
+sep = "")
+pcaText <- listString(pcaText)
+@
+To get an initial assessment of the separability of the classes,
+principal component analysis (PCA) was used to distill the
+\Sexpr{numPredictors} predictors down into \Sexpr{numPCAcomp}
+surrogate variables (i.e. the principal components) in a manner that
+attempts to maximize the amount of information preserved from the
+original predictor set. Figure \ref{F:inititalPCA} contains plots of
+the first \Sexpr{numPCAcomp} components, which accounted for
+\Sexpr{pcaText} percent of the variability in the original predictors
+(respectively).
+%% OPTION: remark on how well (or poorly) the data separated
+\setkeys{Gin}{width = 0.8\textwidth}
+\begin{figure}[p]
+\begin{center}
+<<pcaPlot, eval = $PCAPLOTEVAL, echo = $PCAPLOTECHO, results = $PCAPLOTRESULT, fig = $PCAPLOTFIG, width = 8, height = 8>>=
+trellis.par.set(caretTheme(), warn = TRUE)
+if(numPCAcomp == 2)
+{
+axisRange <- extendrange(pca$x[, 1:2])
+print(
+latticeBubble(x = as.data.frame(pca$x)$PC1,
+y = as.data.frame(pca$x)$PC2,
+z = rawData$outcome,
+type = c("p", "g"),
+xlab = "PC1", ylab = "PC2",
+xlim = axisRange,
+ylim = axisRange))
+} else {
+axisRange <- extendrange(pca$x[, 1:numPCAcomp])
+print(
+latticeBubble(x = as.data.frame(pca$x)[,1:numPCAcomp],
+z = rawData$outcome,
+type = c("p", "g"),
+xlab = "PC1", ylab = "PC2",
+xlim = axisRange,
+ylim = axisRange))
+}
+@
+\caption[PCA Plot]{A plot of the first \Sexpr{numPCAcomp}
+principal components for the original data set. Smaller, lighter
+points indicate smaller values of the response while darker,
+larger points correspond to larger values of the outcome}
+\label{F:inititalPCA}
+\end{center}
+\end{figure}
+<<initialDataSplit, eval = $INITIALDATASPLITEVAL, echo = $INITIALDATASPLITECHO, results = $INITIALDATASPLITRESULT>>=
+## OPTION: in small samples sizes, you may not want to set aside a
+## training set and focus on the resampling results.
+numSamples <- nrow(rawData)
+predictorNames <- names(rawData)[names(rawData) != "outcome"]
+numPredictors <- length(predictorNames)
+# pctTrain <- .15
+pctTrain <- $PERCENT
+if(pctTrain < 1)
+{
+## OPTION: seed number can be changed
+set.seed(1)
+inTrain <- createDataPartition(rawData$outcome,
+p = pctTrain,
+list = FALSE)
+trainX <- rawData[ inTrain, predictorNames]
+testX  <- rawData[-inTrain, predictorNames]
+trainY <- rawData[ inTrain, "outcome"]
+testY  <- rawData[-inTrain, "outcome"]
+splitText <- paste("The original data were split into ",
+"a training set ($n$=",
+nrow(trainX),
+") and a test set ($n$=",
+nrow(testX),
+") in a manner that preserved the ",
+"distribution of the response.",
+sep = "")
+isZeroVar <- apply(trainX, 2,
+function(x) length(unique(x)) < 2)
+if(any(isZeroVar))
+{
+trainX <- trainX[, !isZeroVar, drop = FALSE]
+testX <- testX[, !isZeroVar, drop = FALSE]
+}
+} else {
+trainX <- rawData[, predictorNames]
+testX  <- NULL
+trainY <- rawData[, "outcome"]
+testY  <- NULL
+splitText <- "The entire data set was used as the training set."
+}
+remove("rawData")
+@
+\Sexpr{splitText}
+The data set for model building consisted of \Sexpr{numSamples} samples and
+\Sexpr{numPredictors} predictor variables.
+<<nzv, eval= $NZVEVAL, results = $NZVRESULT, echo = $NZVECHO>>=
+## OPTION: other pre-processing steps can be used
+ppSteps <- caret:::suggestions(modName)
+set.seed(2)
+if(ppSteps["nzv"])
+{
+nzv <- nearZeroVar(trainX)
+if(length(nzv) > 0)
+{
+nzvVars <- names(trainX)[nzv]
+trainX <- trainX[, -nzv]
+nzvText <- paste("There were ",
+length(nzv),
+" predictors that were removed due to",
+" severely unbalanced distributions that",
+" could negatively affect the model fit",
+ifelse(length(nzv) > 10,
+".",
+paste(": ",
+listString(nzvVars),
+".",
+sep = "")),
+sep = "")
+testX <- testX[, -nzv]
+} else nzvText <- ""
+} else nzvText <- ""
+@
+\Sexpr{nzvText}
+<<corrFilter, eval = $CORRFILTEREVAL, results = $CORRFILTERRESULT, echo = $CORRFILTERECHO>>=
+if(ppSteps["corr"])
+{
+## OPTION:
+##corrThresh <- .75
+corrThresh <- $THRESHHOLDCOR
+highCorr <- findCorrelation(cor(trainX, use = "pairwise.complete.obs"),
+corrThresh)
+if(length(highCorr) > 0)
+{
+corrVars <- names(trainX)[highCorr]
+trainX <- trainX[, -highCorr]
+corrText <- paste("There were ",
+length(highCorr),
+" predictors that were removed due to",
+" large between--predictor correlations that",
+" could negatively affect the model fit",
+ifelse(length(highCorr) > 10,
+".",
+paste(": ",
+listString(highCorr),
+".",
+sep = "")),
+" Removing these predictors forced",
+" all pair--wise correlations to be",
+" less than ",
+corrThresh,
+".",
+sep = "")
+testX <- testX[, -highCorr]
+} else corrText <- ""
+}else corrText <- ""
+@
+\Sexpr{corrText}
+<<preProc, eval = $PREPROCEVAL, echo = $PREPROCECHO, results = $PREPROCRESULT>>=
+ppMethods <- NULL
+if(ppSteps["center"]) ppMethods <- c(ppMethods, "center")
+if(ppSteps["scale"]) ppMethods <- c(ppMethods, "scale")
+if(any(hasMissing) > 0) ppMethods <- c(ppMethods, "knnImpute")
+##OPTION other methods, such as spatial sign, can be added to this list
+if(length(ppMethods) > 0)
+{
+ppInfo <- preProcess(trainX, method = ppMethods)
+trainX <- predict(ppInfo, trainX)
+if(pctTrain < 1) testX <- predict(ppInfo, testX)
+ppText <- paste("The following pre--processing methods were",
+" applied to the training",
+ifelse(pctTrain < 1, " and test", ""),
+" data: ",
+listString(ppMethods),
+".",
+sep = "")
+ppText <- gsub("center", "mean centering", ppText)
+ppText <- gsub("scale", "scaling to unit variance", ppText)
+ppText <- gsub("knnImpute",
+paste(ppInfo$k, "--nearest neighbor imputation", sep = ""),
+ppText)
+ppText <- gsub("spatialSign", "the spatial sign transformation", ppText)
+ppText <- gsub("pca", "principal component feature extraction", ppText)
+ppText <- gsub("ica", "independent component feature extraction", ppText)
+} else {
+ppInfo <- NULL
+ppText <- ""
+}
+predictorNames <- names(trainX)
+if(nzvText != "" | corrText != "" | ppText != "")
+{
+varText <- paste("After pre--processing, ",
+ncol(trainX),
+"predictors remained for modeling.")
+} else varText <- ""
+@
+\Sexpr{ppText} \Sexpr{varText}
+\clearpage
+\section*{Model Building}
+<<setupWorkers, eval = TRUE, echo = $SETUPWORKERSECHO, results = $SETUPWORKERSRESULT>>=
+numWorkers <- $NUMWORKERS
+##OPTION: turn up numWorkers to use MPI
+if(numWorkers > 1)
+{
+mpiCalcs <- function(X, FUN, ...)
+{
+theDots <- list(...)
+parLapply(theDots$cl, X, FUN)
+}
+library(snow)
+cl <- makeCluster(numWorkers, "MPI")
+}
+@
+<<setupResampling, echo = $SETUPRESAMPLINGECHO, results = $SETUPRESAMPLINGRESULT>>=
+##<<setupResampling, echo = FALSE, results = hide>>=
+##OPTION: the resampling options can be changed. See
+##        ?trainControl for details
+resampName <- "repeatedcv"
+resampNumber <- $RESAMPLENUMBER
+numRepeat <- 3
+resampP <- $RESAMPLENUMBERPERCENT
+modelInfo <- modelLookup(modName)
+set.seed(3)
+ctlObj <- trainControl(method = resampName,
+number = resampNumber,
+repeats = numRepeat,
+p = resampP)
+##OPTION select other performance metrics as needed
+optMetric <- "RMSE"
+if(numWorkers > 1)
+{
+ctlObj$workers <- numWorkers
+ctlObj$computeFunction <- mpiCalcs
+ctlObj$computeArgs <- list(cl = cl)
+}
+@
+<<setupGrid, results = $SETUPGRIDRESULT, echo = $SETUPGRIDECHO>>=
+##OPTION expand or contract these grids as needed (or
+##       add more models
+gridSize <- $SETUPGRIDSIZE
+if(modName %in% c("svmPoly", "svmRadial", "svmLinear", "ctree2", "ctree")) gridSize <- 5
+if(modName %in% c("earth")) gridSize <- 7
+if(modName %in% c("knn", "glmboost", "rf", "nodeHarvest")) gridSize <- 10
+if(modName %in% c("rpart")) gridSize <- 15
+if(modName %in% c("pls", "lars2", "lars")) gridSize <- min(20, ncol(trainX))
+if(modName == "gbm")
+{
+tGrid <- expand.grid(.interaction.depth = -1 + (1:5)*2 ,
+.n.trees = (1:10)*20,
+.shrinkage = .1)
+}
+if(modName == "nnet")
+{
+tGrid <- expand.grid(.size = -1 + (1:5)*2 ,
+.decay = c(0, .001, .01, .1))
+}
+@
+<<fitModel, results = $FITMODELRESULT, echo = $FITMODELECHO, eval = $FITMODELEVAL>>=
+##OPTION alter as needed
+set.seed(4)
+modelFit <- switch(modName,
+gbm =
+{
+mix <- sample(seq(along = trainY))
+train(
+trainX[mix,], trainY[mix], modName,
+verbose = FALSE,
+bag.fraction = .9,
+metric = optMetric,
+trControl = ctlObj,
+tuneGrid = tGrid)
+},
+nnet =
+{
+train(
+trainX, trainY, modName,
+metric = optMetric,
+linout = TRUE,
+trace = FALSE,
+maxiter = 1000,
+MaxNWts = 5000,
+trControl = ctlObj,
+tuneGrid = tGrid)
+},
+svmRadial =, svmPoly =, svmLinear =
+{
+train(
+trainX, trainY, modName,
+metric = optMetric,
+scaled = TRUE,
+trControl = ctlObj,
+tuneLength = gridSize)
+},
+{
+train(trainX, trainY, modName,
+trControl = ctlObj,
+metric = optMetric,
+tuneLength = gridSize)
+})
+@
+<<modelDescr, echo = $MODELDESCRECHO, results = $MODELDESCRRESULT>>=
+summaryText <- ""
+resampleName <- switch(tolower(modelFit$control$method),
+boot = paste("the bootstrap (", length(modelFit$control$index), " reps)", sep = ""),
+boot632 = paste("the bootstrap 632 rule (", length(modelFit$control$index), " reps)", sep = ""),
+cv = paste("cross-validation (", modelFit$control$number, " fold)", sep = ""),
+repeatedcv = paste("cross-validation (", modelFit$control$number, " fold, repeated ",
+modelFit$control$repeats, " times)", sep = ""),
+lgocv = paste("repeated train/test splits (", length(modelFit$control$index), " reps, ",
+round(modelFit$control$p, 2), "$\\%$)", sep = ""))
+tuneVars <- latexTranslate(tolower(modelInfo$label))
+tuneVars <- gsub("\\#", "the number of ", tuneVars, fixed = TRUE)
+if(ncol(modelFit$bestTune) == 1 && colnames(modelFit$bestTune) == ".parameter")
+{
+summaryText <- paste(summaryText,
+"\n\n",
+"There are no tuning parameters associated with this model.",
+"To characterize the model performance on the training set,",
+resampleName,
+"was used.",
+"Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile}",
+"show summaries of the resampling results. ")
+} else {
+summaryText <- paste("There",
+ifelse(nrow(modelInfo) > 1, "are", "is"),
+nrow(modelInfo),
+ifelse(nrow(modelInfo) > 1, "tuning parameters", "tuning parameter"),
+"associated with this model:",
+listString(tuneVars, period = TRUE))
+paramNames <- gsub(".", "", names(modelFit$bestTune), fixed = TRUE)
+for(i in seq(along = paramNames))
+{
+check <- modelInfo$parameter %in% paramNames[i]
+if(any(check))
+{
+paramNames[i] <- modelInfo$label[which(check)]
+}
+}
+paramNames <- gsub("#", "the number of ", paramNames, fixed = TRUE)
+## Check to see if there was only one combination fit
+summaryText <- paste(summaryText,
+"To choose",
+ifelse(nrow(modelInfo) > 1,
+"appropriate values of the tuning parameters,",
+"an appropriate value of the tuning parameter,"),
+resampleName,
+"was used to generated a profile of performance across the",
+nrow(modelFit$results),
+ifelse(nrow(modelInfo) > 1,
+"combinations of the tuning parameters.",
+"candidate values."),
+"Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile} show",
+"summaries of the resampling profile. ",                                                                                         "The final model fitted to the entire training set was:",
+listString(paste(latexTranslate(tolower(paramNames)), "=", modelFit$bestTune[1,]), period = TRUE))
+}
+@
+\Sexpr{summaryText}
+<<resampTable, echo = $RESAMPTABLEECHO, results = $RESAMPTABLERESULT>>=
+tableData <- modelFit$results
+if(all(modelInfo$parameter == "parameter"))
+{
+tableData <- tableData[,-1, drop = FALSE]
+colNums <- c(length(modelFit$perfNames), length(modelFit$perfNames))
+colLabels <- c("Mean", "Standard Deviation")
+constString <- ""
+isConst <- NULL
+} else {
+isConst <- apply(tableData[, modelInfo$parameter, drop = FALSE],
+2,
+function(x) length(unique(x)) == 1)
+numParamInTable <- sum(!isConst)
+if(any(isConst))
+{
+constParam <- modelInfo$parameter[isConst]
+constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
+tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
+constString <- paste("The tuning",
+ifelse(sum(isConst) > 1,
+"parmeters",
+"parameter"),
+listString(paste("``", names(constValues), "''", sep = "")),
+ifelse(sum(isConst) > 1,
+"were",
+"was"),
+"held constant at",
+ifelse(sum(isConst) > 1,
+"a value of",
+"values of"),
+listString(constValues[1,]))
+} else constString <- ""
+cn <- colnames(tableData)
+for(i in seq(along = cn))
+{
+check <- modelInfo$parameter %in% cn[i]
+if(any(check))
+{
+cn[i] <- modelInfo$label[which(check)]
+}
+}
+colnames(tableData) <- cn
+colNums <- c(numParamInTable,
+length(modelFit$perfNames),
+length(modelFit$perfNames))
+colLabels <- c("", "Mean", "Standard Deviation")
+}
+colnames(tableData) <- gsub("SD$", "", colnames(tableData))
+colnames(tableData) <- latexTranslate(colnames(tableData))
+rownames(tableData) <- latexTranslate(rownames(tableData))
+latex(tableData,
+rowname = NULL,
+file = "",
+cgroup = colLabels,
+n.cgroup = colNums,
+where = "h!",
+digits = 4,
+longtable = nrow(tableData) > 30,
+caption = paste(resampleName, "results from the model fit.", constString),
+label = "T:resamps")
+@
+\setkeys{Gin}{ width = 0.9\textwidth}
+\begin{figure}[b]
+\begin{center}
+<<profilePlot, echo = $PROFILEPLOTECHO, fig = $PROFILEPLOTFIG, width = 8, height = 6>>=
+trellis.par.set(caretTheme(), warn = TRUE)
+if(all(modelInfo$parameter == "parameter") | all(isConst) | modName == "nb")
+{
+resultsPlot <- resampleHist(modelFit)
+plotCaption <- paste("Distributions of model performance from the ",
+"training set estimated using ",
+resampleName)
+} else {
+if(modName %in% c("svmPoly", "svmRadial", "svmLinear"))
+{
+resultsPlot <- plot(modelFit,
+metric = optMetric,
+xTrans = function(x) log10(x))
+resultsPlot <- update(resultsPlot,
+type = c("g", "p", "l"),
+ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
+} else {
+resultsPlot <- plot(modelFit,
+metric = optMetric)
+resultsPlot <- update(resultsPlot,
+type = c("g", "p", "l"),
+ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
+}
+plotCaption <- paste("A plot of the estimates of the",
+optMetric,
+"values calculated using",
+resampleName)
+}
+print(resultsPlot)
+@
+\caption[Performance Plot]{\Sexpr{plotCaption}.}
+\label{F:profile}
+\end{center}
+\end{figure}
+<<stopWorkers, echo = $STOPWORKERSECHO, results = $STOPWORKERSRESULT>>=
+##<<stopWorkers, echo = FALSE, results = hide>>=
+if(numWorkers > 1) stopCluster(cl)
+@
+<<testPred, results = $TESTPREDRESULT, echo = $TESTPREDECHO>>=
+if(pctTrain < 1)
+{
+cat("\\clearpage\n\\section*{Test Set Results}\n\n")
+testPreds <- extractPrediction(list(fit = modelFit),
+testX = testX, testY = testY)
+testPreds <- subset(testPreds, dataType == "Test")
+values <- modelFit$control$summaryFunction(testPreds)
+names(values) <- gsub("RMSE", "root mean squared error", names(values), fixed = TRUE)
+names(values) <- gsub("Rsquared", "$R^2$", names(values), fixed = TRUE)
+values <- format(values, digits = 3)
+testString <- paste("Based on the test set of",
+nrow(testX),
+"samples,",
+listString(paste(names(values), "was", values), period = TRUE),
+" A plot of the observed and predicted outcomes for the test set ",
+"is given in Figure \\\\ref{F:obsPred}.")
+testString <- paste(testString,
+" Using ", resampleName,
+", the training set estimates were ",
+resampleStats(modelFit),
+".",
+sep = "")
+axisRange <- extendrange(testPreds[, c("obs", "pred")])
+obsPred <- xyplot(obs ~ pred,
+data = testPreds,
+xlim = axisRange,
+ylim = axisRange,
+panel = function(x, y)
+{
+panel.abline(0, 1, col = "darkgrey", lty = 2)
+panel.xyplot(x, y, type = c("p", "g"))
+panel.loess(x, y, col = "darkred", lwd = 2)
+},
+ylab = "Observed Response",
+xlab = "Predicted Response")
+pdf("obsPred.pdf", height = 8, width = 8)
+trellis.par.set(caretTheme())
+print(obsPred)
+dev.off()
+} else testString <- ""
+@
+\Sexpr{testString}
+<<classProbsTex, results = $CLASSPROBSTEXRESULT, echo = $CLASSPROBSTEXECHO>>=
+if(pctTrain < 1)
+{
+cat(
+paste("\\begin{figure}[p]\n",
+"\\begin{center}\n",
+"\\includegraphics{obsPred}",
+"\\caption[Observed V Fitted Values]{",
+"The observed and predicted responses. ",
+"The grey line is the line of identity while the",
+"solid red line is a smoothed trend line.}\n",
+"\\label{F:obsPred}\n",
+"\\end{center}\n",
+"\\end{figure}"))
+}
+@
+\section*{Versions}
+<<versions, echo = FALSE, results = tex>>=
+toLatex(sessionInfo())
+@
+<<save-data, echo = $SAVEDATAECHO, results = $SAVEDATARESULT>>=
+## change this to the name of modName....
+Fit<-modelFit
+save(Fit,file="$METHOD-Fit.RData")
+@
+The model was built using $METHOD and is saved as $METHOD-Fit.RData for reuse. This contains the variable Fit.
+\end{document}'''
+	return template4Rnw

Mercurial > repos > deepakjadmin > r_caret_test

comparison caret_regression/tool2/templateLibrary.py @ 0:68300206e90d draft default tip