The following packages are required for the random forest

if(!require(tidyverse)){install.packages("tidyverse");library(tidyverse)}
if(!require(janitor)){install.packages("janitor");library(janitor)} # for rename

if(!require(randomForest)){install.packages("randomForest");library(randomForest)}
if(!require(caret)){install.packages("caret");library(caret)} # for `confustionMatrix`

A Random forest is made of Random Trees

Data <- read_csv(file = here::here("content/post/2022-06-26-random-forest", "german_credit.csv"))

Exploring the dataset

Data <- clean_names(Data)
Data$creditability <- as.factor(Data$creditability)
glimpse(Data)
## Rows: 1,000
## Columns: 21
## $ creditability                     <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ account_balance                   <dbl> 1, 1, 2, 1, 1, 1, 1, 1, 4, 2, 1, 1, …
## $ duration_of_credit_month          <dbl> 18, 9, 12, 12, 12, 10, 8, 6, 18, 24,…
## $ payment_status_of_previous_credit <dbl> 4, 4, 2, 4, 4, 4, 4, 4, 4, 2, 4, 4, …
## $ purpose                           <dbl> 2, 0, 9, 0, 0, 0, 0, 0, 3, 3, 0, 1, …
## $ credit_amount                     <dbl> 1049, 2799, 841, 2122, 2171, 2241, 3…
## $ value_savings_stocks              <dbl> 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 2, …
## $ length_of_current_employment      <dbl> 2, 3, 4, 3, 3, 2, 4, 2, 1, 1, 3, 4, …
## $ instalment_per_cent               <dbl> 4, 2, 2, 3, 4, 1, 1, 2, 4, 1, 2, 1, …
## $ sex_marital_status                <dbl> 2, 3, 2, 3, 3, 3, 3, 3, 2, 2, 3, 4, …
## $ guarantors                        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ duration_in_current_address       <dbl> 4, 2, 4, 2, 4, 3, 4, 4, 4, 4, 2, 4, …
## $ most_valuable_available_asset     <dbl> 2, 1, 1, 1, 2, 1, 1, 1, 3, 4, 1, 3, …
## $ age_years                         <dbl> 21, 36, 23, 39, 38, 48, 39, 40, 65, …
## $ concurrent_credits                <dbl> 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, …
## $ type_of_apartment                 <dbl> 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, …
## $ no_of_credits_at_this_bank        <dbl> 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, …
## $ occupation                        <dbl> 3, 3, 2, 2, 2, 2, 2, 2, 1, 1, 3, 3, …
## $ no_of_dependents                  <dbl> 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, …
## $ telephone                         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ foreign_worker                    <dbl> 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, …

Assess the creditabiliy with the help of other variables

# code -------------------------------------------------------------------------

ggplot(data = Data, aes(x = age_years, color = creditability, fill = creditability)) +
    geom_histogram(binwidth = 5, position = "identity", alpha = 0.4) +
    scale_x_continuous(breaks = scales::pretty_breaks(n = 6)) +
    scale_y_continuous(breaks = scales::pretty_breaks(n = 6)) + theme_minimal()



# Create a training and testing data
set.seed(7791)
partitioning <- sample(2, nrow(Data), replace = TRUE, prob = c(0.8, 0.2))
table(partitioning)
## partitioning
##   1   2 
## 797 203

train <- Data[partitioning == 1, ]
table(train$creditability)
## 
##   0   1 
## 244 553
test <- Data[partitioning == 2, ]
table(test$creditability)
## 
##   0   1 
##  56 147

Train the Random forest model


# Generate random forest with train data
rf_model <- randomForest(formula = creditability ~ ., data = train)
predict_train <- predict(rf_model, train)
confusionMatrix(predict_train, train$creditability)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 244   0
##          1   0 553
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9954, 1)
##     No Information Rate : 0.6939     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.3061     
##          Detection Rate : 0.3061     
##    Detection Prevalence : 0.3061     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
##

Testing the model `rf_model` on test data

predict_test <- predict(rf_model, test)
confusionMatrix(predict_test, test$creditability)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  28  12
##          1  28 135
##                                           
##                Accuracy : 0.803           
##                  95% CI : (0.7415, 0.8553)
##     No Information Rate : 0.7241          
##     P-Value [Acc > NIR] : 0.006157        
##                                           
##                   Kappa : 0.459           
##                                           
##  Mcnemar's Test P-Value : 0.017706        
##                                           
##             Sensitivity : 0.5000          
##             Specificity : 0.9184          
##          Pos Pred Value : 0.7000          
##          Neg Pred Value : 0.8282          
##              Prevalence : 0.2759          
##          Detection Rate : 0.1379          
##    Detection Prevalence : 0.1970          
##       Balanced Accuracy : 0.7092          
##                                           
##        'Positive' Class : 0               
## 


#            Reference
# Prediction   0   1
#         0   29  14
#         1   27 133

varImpPlot(rf_model)

Optimize the performance of randomforest.

plot(rf_model) # black line is out of bag error.


oob_error <- double(20)

for (mtry in 1:20) {
    rf <- randomForest(formula = creditability ~ ., data = train, mtry = mtry, ntree = 166)
    oob_error[mtry] <- rf$err.rate[166]
}

plot(1:20, oob_error, type  = "b", xlab = "Number of variable considered", ylab = "Out of bag erro [-]", xaxt = "n")
axis(1, at = 1:20, labels = 1:20, cex = 0.8)


opti_num_var = which.min(oob_error)

Re running the random forest with optimum number of variables

rf_optim <- randomForest(formula = creditability ~ ., data = train, mtry = opti_num_var, ntree = 166)

# Testing the model `rf_model` on test data
confusionMatrix(predict(rf_optim, test), test$creditability)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  33  20
##          1  23 127
##                                           
##                Accuracy : 0.7882          
##                  95% CI : (0.7255, 0.8423)
##     No Information Rate : 0.7241          
##     P-Value [Acc > NIR] : 0.02266         
##                                           
##                   Kappa : 0.4609          
##                                           
##  Mcnemar's Test P-Value : 0.76037         
##                                           
##             Sensitivity : 0.5893          
##             Specificity : 0.8639          
##          Pos Pred Value : 0.6226          
##          Neg Pred Value : 0.8467          
##              Prevalence : 0.2759          
##          Detection Rate : 0.1626          
##    Detection Prevalence : 0.2611          
##       Balanced Accuracy : 0.7266          
##                                           
##        'Positive' Class : 0               
##

Exploring useful variables

train <- as.data.frame(train)
varImpPlot(rf_model)

importance(rf_model)
##                                   MeanDecreaseGini
## account_balance                          36.543019
## duration_of_credit_month                 34.072986
## payment_status_of_previous_credit        18.303982
## purpose                                  20.837156
## credit_amount                            46.208519
## value_savings_stocks                     18.564843
## length_of_current_employment             18.296281
## instalment_per_cent                      13.122638
## sex_marital_status                       12.781397
## guarantors                                7.055415
## duration_in_current_address              13.666627
## most_valuable_available_asset            14.340330
## age_years                                33.451374
## concurrent_credits                        8.064289
## type_of_apartment                         8.821601
## no_of_credits_at_this_bank                7.306201
## occupation                               10.567355
## no_of_dependents                          4.325819
## telephone                                 6.359517
## foreign_worker                            1.666973

# How variable affect the chance of getting loan.
partialPlot(rf_model, train, account_balance, "1")

partialPlot(rf_model, train, age_years, "1")

This analysis for discrete variable of creditability.

A Random forest is made of Random Trees

Exploring the dataset

Assess the creditabiliy with the help of other variables

Train the Random forest model

Testing the model rf_model on test data

Optimize the performance of randomforest.

Re running the random forest with optimum number of variables

Exploring useful variables

Testing the model `rf_model` on test data