# Ch. 6 – Evaluation Measures

In this example, we will evaluate the different measures that can be used for profit-driven model evaluation, following Chapter 6 in the book.

The first step is to load the packages that we will use throughout the example. Two profit-driven measures have their own package in R’s package manager, CRAN: the H-Measure (package hmeasure), and our EMP measure (package EMP).

# Libraries
library(biglm)
library(foreach)
library(ggplot2)
library(verification)
library(hmeasure)
library(EMP)
library(stargazer)

The MicroData dataset, available from the download page of the book, contains five predictive variables that we can use for our models. We will compare five alternative models, built by leaving one of the predictive variables out.

The following code trains the model (we use the package foreach for simplicity), and then plots the ROC curve of all models using ggplot2.

# Read Data
# NOTE(review): the statement that actually loads the MicroData file into
# `micro.data` (e.g. a read.csv call) appears to have been lost during
# extraction — restore it before running this chunk.
micro.data$X <- NULL

# Formula and weights (class weights: 3087 defaults vs 6913 non-defaults)
formula.orig <- formula('Target~Amount+Term+Age+Region+Econ_Sector')
weights <- micro.data$Target
weights[weights == 0] <- 3087/6913

# Generate models removing one variable, with ROC curve data per model
rocs.logreg <- foreach(vars = 1:5, .packages = 'biglm') %do% {
  # Leave predictor `vars` out; columns 6-9 are excluded non-predictors
  formula <- formula(paste0('Target~',
                            paste(colnames(micro.data)[c(-vars, -9:-6)],
                                  collapse = '+')))
  # Train on the non-test rows
  logreg.profit <- bigglm(formula, micro.data[!micro.data$If_Test, ],
                          family = binomial(link = 'logit'), maxit = 1000)
  # Score the held-out test rows
  test.logreg <- predict(logreg.profit, micro.data[micro.data$If_Test, ],
                         type = "response")
  # Extract the ROC curve points (plot = NULL: compute, do not draw)
  roc.logreg <- roc.plot(micro.data$Target[micro.data$If_Test], test.logreg,
                         thresholds = 100, plot = NULL)
  roc.logreg <- as.data.frame(roc.logreg$plot.data)
  roc.logreg$var <- rep(paste0("Model ", vars), nrow(roc.logreg))
  roc.logreg
}

# Graph all ROC curves: stack the per-model ROC data frames into one.
# Fixes the original `rocs.logreg[]$V3` (missing [[1]] index) and avoids
# growing the data frame with rbind inside a loop.
# V3 = false positive rate, V2 = true positive rate (roc.plot plot.data).
temp <- do.call(rbind, lapply(rocs.logreg, function(r) {
  data.frame(fpr = r$V3, var = r$var, value = r$V2)
}))

# Plot using ggplot2: one grey-scale ROC curve per model, distinguished by
# both colour and line type, with the legend anchored in the bottom-right.
p <- ggplot(temp, aes(fpr, value, group = var, color = var, linetype = var)) +
  geom_line(size = 1) +
  theme_bw() +
  scale_color_grey() +
  scale_x_continuous("False Positive Rate (1-Specificity)") +
  scale_y_continuous("True Positive Rate (Sensitivity)") +
  coord_fixed() +
  theme(legend.justification = c(1, 0),
        legend.position     = c(1, 0),
        legend.title        = element_blank(),
        legend.text         = element_text(size = 14),
        legend.key.width    = unit(3, "cm"),
        legend.key          = element_blank(),
        legend.background   = element_blank(),
        panel.background    = element_blank(),
        panel.grid.major    = element_blank(),
        panel.grid.minor    = element_blank())

p
# Uncomment to save figure.
# ggsave('ROCComparison.png')

We can see that all curves intersect. This means that the AUC by itself is not a reliable measure. If we want to keep an AUC-style measure, without adding any profit- or cost-based measures, then the area under the convex hull of the ROC curve gives better results. This measure is available in the hmeasure package.

On the other hand, a profit-based measure, such as the EMP, can give a more accurate quantification of the predictive ability of the model given each particular set of business circumstances. In order to use these measures, the following code calculates the key profit quantities.

# Calculate the average cost per borrower on the training rows:
# c1 = average loss of a defaulter (LGD * exposure at default),
# c0 = average opportunity cost of rejecting a good borrower (ROI 26.44%).
c1 <- mean(micro.data$LGD[micro.data$Target == 1 & !micro.data$If_Test] *
             micro.data$EAD[micro.data$Target == 1 & !micro.data$If_Test])
c0 <- mean(micro.data$Amount[micro.data$Target == 0 & !micro.data$If_Test] * 0.2644)

# Calculate the distribution of the LGD variable (defaulters, training set)
hist.lgd <- ggplot(data = micro.data[!micro.data$If_Test &
                                       (micro.data$Target == 1), ],
                   aes(LGD)) +
  geom_histogram(binwidth = 0.1)
hist.lgd <- hist.lgd + theme_bw() + scale_color_grey() +
  scale_y_continuous("Nr. of Cases") +
  theme(panel.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
hist.lgd
# Uncomment to save figure.
# ggsave('HistLGD.tiff')

# Tabulate the same histogram to read off the point masses at 0 and 1
hist.lgd.table <- hist(micro.data[!micro.data$If_Test &
                                    (micro.data$Target == 1), ]$LGD,
                       plot = FALSE)
stargazer(data.frame(breaks = hist.lgd.table$breaks[-1],
                     counts = hist.lgd.table$counts),
          summary = FALSE,
          type = 'html')

breaks counts
0.100 1,562
0.200 36
0.300 30
0.400 49
0.500 42
0.600 51
0.700 61
0.800 55
0.900 53
1 222

The LGD has two point masses, as expected in a loan portfolio. The EMP measure requires the proportion of cases in each peak. The first peak, at zero, has proportion $$p_0 = 0.72$$, and the second peak, at one, has proportion $$p_1 = 0.1$$. The severity ratio, necessary for the h-measure, depends on the proportion of costs for each class. In this case $$c = 0.2811$$.

With these quantities we can now compare the different models.

# Model comparison

We run the models, and then compare the accuracy, the AUC, the EMP, the cutoff suggested by the EMP (EMPfrac), the h-measure, and the area under the convex hull of the ROC curve.

# Compare all models against each other: accuracy, AUC, EMP, the EMP cutoff
# fraction, the H-measure, and the area under the ROC convex hull.
results.logreg <- foreach(vars = 1:5, .packages = 'biglm',
                          .combine = rbind) %do% {
  formula <- formula(paste0('Target~',
                            paste(colnames(micro.data)[c(-vars, -9:-6)],
                                  collapse = '+')))
  logreg.profit <- bigglm(formula, micro.data[!micro.data$If_Test, ],
                          family = binomial(link = 'logit'), maxit = 1000)
  test.logreg <- predict(logreg.profit, micro.data[micro.data$If_Test, ],
                         type = "response")
  # Accuracy at the 0.5 cutoff (round() thresholds the scores)
  acc.logreg <- (round(test.logreg) == micro.data$Target[micro.data$If_Test])
  acc.logreg <- sum(as.integer(acc.logreg)) / sum(micro.data$If_Test)
  # AUC via verification::roc.area
  auc.logreg <- roc.area(micro.data$Target[micro.data$If_Test], test.logreg)
  auc.logreg <- auc.logreg$A
  # EMP with the LGD point masses p0 (at zero) and p1 (at one)
  emp.logreg <- empCreditScoring(test.logreg,
                                 micro.data$Target[micro.data$If_Test],
                                 p0 = 0.72, p1 = 0.1)
  # H-measure with the cost-based severity ratio c = 0.2811
  H.logreg <- HMeasure(micro.data$Target[micro.data$If_Test], test.logreg,
                       severity.ratio = 0.2811)
  cbind(paste0("Model ", vars), acc.logreg, auc.logreg,
        emp.logreg$EMPC, emp.logreg$EMPCfrac,
        H.logreg$metrics['H'], H.logreg$metrics['AUCH'])
}

names(results.logreg) <- c("Model", "accuracy", "AUC", "EMP", "EMPfrac",
                           "H-Measure", "AUC (convex hull)")
stargazer(results.logreg, summary = FALSE, rownames = FALSE, type = "html")
Model accuracy AUC EMP EMPfrac H-Measure AUC (convex hull)
Model 1 0.703 0.614 0.017 0.141 0.064 0.622
Model 2 0.701 0.591 0.017 0.158 0.043 0.599
Model 3 0.706 0.618 0.017 0.146 0.071 0.627
Model 4 0.706 0.608 0.017 0.155 0.060 0.615
Model 5 0.708 0.620 0.017 0.141 0.068 0.625

As we can see, Model 5 was apparently the best model using only AUC, but the convex hull and all other profit-based measures actually suggest that model 3 is the best. Let’s calculate the true profits for both models.

# Formula and weights (Model 3: Age left out)
formula.final <- formula('Target~Amount+Term+Region+Econ_Sector')
weights <- micro.data$Target
weights[weights == 0] <- 3087/6913

# Apply model: train on the non-test rows, score the test rows
logreg.microdata.final <- bigglm(formula.final,
                                 micro.data[!micro.data$If_Test, ],
                                 family = binomial(link = 'logit'),
                                 maxit = 1000)
test.microdata.final <- predict(logreg.microdata.final,
                                micro.data[micro.data$If_Test, ],
                                type = "response")

# Order data by predicted score, descending.
# NOTE(review): column 10 is assumed to be the `pred` column just appended —
# confirm the index, and that descending order matches the intended
# acceptance policy for this scoring.
data.out <- micro.data[micro.data$If_Test, ]
data.out$pred <- as.vector(test.microdata.final)
data.out <- data.out[with(data.out, order(-data.out[, 10])), ]

# Extract the upper (100%-14.6%=85.4%) of cases, per the EMP cutoff fraction
accepted <- round(0.854 * nrow(data.out))
data.out$Accepted <- 0
data.out$Accepted[1:accepted] <- 1

# Calculate profits and losses for each of the four accept/default outcomes
data.out$Profits <-
  data.out$Target * data.out$Accepted * (-1 * data.out$LGD * data.out$EAD) +
  data.out$Target * (1 - data.out$Accepted) * (data.out$LGD * data.out$EAD) +
  (1 - data.out$Target) * data.out$Accepted * (0.2644 * data.out$Amount) +
  (1 - data.out$Target) * (1 - data.out$Accepted) * (-1 * 0.2644 * data.out$Amount)

# Output: aggregate benefits, costs and net profits for Model 3
total.profits <- data.frame(Model = "Model 3",
                            Benefits = sum(data.out$Profits[data.out$Profits > 0]),
                            Costs = sum(data.out$Profits[data.out$Profits < 0]),
                            Profits = sum(data.out$Profits[data.out$Profits > 0]) +
                              sum(data.out$Profits[data.out$Profits < 0]))

# Formula and weights (Model 5: Econ_Sector left out)
formula.final <- formula('Target~Amount+Age+Term+Region')

# Apply model: train on the non-test rows, score the test rows
logreg.microdata.final <- bigglm(formula.final,
                                 micro.data[!micro.data$If_Test, ],
                                 family = binomial(link = 'logit'),
                                 maxit = 1000)
test.microdata.final <- predict(logreg.microdata.final,
                                micro.data[micro.data$If_Test, ],
                                type = "response")

# Order data by predicted score, descending
# NOTE(review): column 10 is assumed to be the appended `pred` column —
# confirm the index.
data.out <- micro.data[micro.data$If_Test, ]
data.out$pred <- as.vector(test.microdata.final)
data.out <- data.out[with(data.out, order(-data.out[, 10])), ]

# Extract the upper (100%-14.6%=85.4% of cases)
accepted <- round(0.854 * nrow(data.out))
data.out$Accepted <- 0
data.out$Accepted[1:accepted] <- 1

# Calculate profits and losses for each of the four accept/default outcomes
data.out$Profits <-
  data.out$Target * data.out$Accepted * (-1 * data.out$LGD * data.out$EAD) +
  data.out$Target * (1 - data.out$Accepted) * (data.out$LGD * data.out$EAD) +
  (1 - data.out$Target) * data.out$Accepted * (0.2644 * data.out$Amount) +
  (1 - data.out$Target) * (1 - data.out$Accepted) * (-1 * 0.2644 * data.out$Amount)

# Output: append Model 5's totals (original line had an escaped `\$` artifact)
total.profits <- rbind(total.profits,
                       data.frame(Model = "Model 5",
                                  Benefits = sum(data.out$Profits[data.out$Profits > 0]),
                                  Costs = sum(data.out$Profits[data.out$Profits < 0]),
                                  Profits = sum(data.out$Profits[data.out$Profits > 0]) +
                                    sum(data.out$Profits[data.out$Profits < 0])))

stargazer(total.profits, summary = FALSE, type = 'html')
Model Benefits Costs Profits
Model 3 571,061.500 -111,560.200 459,501.300
Model 5 555,190.600 -127,431.100 427,759.600

As expected, model 3 is indeed the one with the higher profits, and also the one with the lower costs. Both the h-measure (cost-based) and the EMP (profit-based) choose the correct model.