General_Health Checkup Exercise Heart_Disease Skin_Cancer
1 Poor Within the past 2 years No No No
2 Very Good Within the past year No Yes No
3 Very Good Within the past year Yes No No
4 Poor Within the past year Yes Yes No
5 Good Within the past year No No No
6 Good Within the past year No No No
Other_Cancer Depression Diabetes Arthritis Sex Age_Category Height_.cm.
1 No No No Yes Female 70-74 150
2 No No Yes No Female 70-74 165
3 No No Yes No Female 60-64 163
4 No No Yes No Male 75-79 180
5 No No No No Male 80+ 191
6 No Yes No Yes Male 60-64 183
Weight_.kg. BMI Smoking_History Alcohol_Consumption Fruit_Consumption
1 32.66 14.54 Yes 0 30
2 77.11 28.29 No 0 30
3 88.45 33.47 No 4 12
4 93.44 28.73 No 0 30
5 88.45 24.37 Yes 0 8
6 154.22 46.11 No 0 12
Green_Vegetables_Consumption FriedPotato_Consumption
1 16 12
2 0 4
3 3 16
4 30 8
5 4 0
6 12 12
Selecting a subset of participants from the CVD dataset: 20,000 participants
## This is a very large dataset, so we select 10000 samples from each group.
Number_ofparticipants <- 10000
set.seed(150)
Yes_case <- sample(which(Heart_data$Heart_Disease == "Yes"), Number_ofparticipants)
set.seed(150)
No_case <- sample(which(Heart_data$Heart_Disease == "No"), Number_ofparticipants)
Data_yes <- Heart_data[Yes_case, ]
Data_no <- Heart_data[No_case, ]
Data_total <- rbind(Data_yes, Data_no)
head(Data_total)
General_Health Checkup Exercise Heart_Disease Skin_Cancer
43289 Poor Within the past year Yes Yes Yes
281436 Fair Within the past year Yes Yes No
196610 Fair Within the past year Yes Yes No
306903 Fair Within the past year No Yes No
304887 Poor Within the past year No Yes No
278988 Fair Within the past year Yes Yes No
Other_Cancer Depression Diabetes Arthritis Sex Age_Category
43289 No No Yes Yes Male 65-69
281436 No No Yes Yes Male 70-74
196610 No No Yes Yes Female 40-44
306903 Yes No Yes Yes Female 55-59
304887 No Yes No Yes Male 80+
278988 No No No No Male 50-54
Height_.cm. Weight_.kg. BMI Smoking_History Alcohol_Consumption
43289 175 102.97 33.52 Yes 0
281436 170 104.33 36.02 Yes 0
196610 180 136.08 41.84 Yes 5
306903 160 65.77 25.69 No 0
304887 155 62.60 26.07 Yes 0
278988 180 106.59 32.78 No 25
Fruit_Consumption Green_Vegetables_Consumption FriedPotato_Consumption
43289 12 16 3
281436 60 30 4
196610 60 2 12
306903 12 12 3
304887 28 28 4
278988 3 4 8
Converting Age from a range to a numerical value
## We will convert the Age variable, which is stored as a range in the dataset, to a numerical form.
My_Age_Heart_Data <- data.frame(Data_total$Age_Category, Data_total$Heart_Disease)
names(My_Age_Heart_Data) <- c("Age_Category", "Heart_Disease") # This gets the data in the right format
My_Age_Heart_Data$Age_min <- as.numeric(substr(My_Age_Heart_Data$Age_Category, start = 1, stop = 2))
My_Age_Heart_Data$Age_max <- as.numeric(substr(My_Age_Heart_Data$Age_Category, start = 4, stop = 5))
for (i in 1:nrow(My_Age_Heart_Data)) { # loop over every row, not just the first group
  # if (is.na(My_Age_Heart_Data$Age_min[i])) {
  #   My_Age_Heart_Data$Age_min[i] <- My_Age_Heart_Data$Age_max[i]
  # }
  if (is.na(My_Age_Heart_Data$Age_max[i])) {
    My_Age_Heart_Data$Age_max[i] <- My_Age_Heart_Data$Age_min[i]
  }
}
## We check whether any other missing age values remain in the data.
more_missing_agevalue <- Data_total$Age_Category[is.na(My_Age_Heart_Data$Age_max)]
## Since all the missing values were participants 80 and older, we replace those missing values with 80.
My_Age_Heart_Data$Age_max[is.na(My_Age_Heart_Data$Age_max)] <- 80
# This gives the final age value to be used in the analysis.
My_Age_Heart_Data$FinalAge <- (My_Age_Heart_Data$Age_min + My_Age_Heart_Data$Age_max) / 2
head(My_Age_Heart_Data)
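The range-to-midpoint conversion above can be checked in isolation; this is a self-contained base-R sketch of the same logic (the function name `age_midpoint` is introduced here for illustration):

```r
## Midpoint of an age band such as "70-74". "80+" has no upper bound, so its
## minimum (80) is reused as the maximum, matching the report's handling.
age_midpoint <- function(cat) {
  age_min <- as.numeric(substr(cat, 1, 2))
  age_max <- suppressWarnings(as.numeric(substr(cat, 4, 5)))  # "" -> NA for "80+"
  age_max[is.na(age_max)] <- age_min[is.na(age_max)]
  (age_min + age_max) / 2
}

age_midpoint(c("70-74", "60-64", "80+"))  # 72 62 80
```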
## Here we remove the rows with Diabetes responses that do not make sense for our analysis.
Extra_DiabetesandnullAge_Column <- which(
  (Data_total$Diabetes == "Yes, but female told only during pregnancy") |
    (Data_total$Diabetes == "No, pre-diabetes or borderline diabetes")
)
## This is our final data with clean variables
Data_total <- Data_total[-Extra_DiabetesandnullAge_Column, ]
Data_total$Diabetes <- factor(Data_total$Diabetes) # drop the now-unused factor levels
Data_total$Age_Category <- NULL # removing age category
str(Data_total)
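The model reported next is fitted on `train_data` with labels `HeartLabel_Train`, but the train/test split itself is not shown in this excerpt. Below is a minimal sketch of such a split; the 70/30 proportion is inferred from the confusion-matrix row totals later in the report, not stated by the authors, and a small synthetic stand-in for `Data_total` is used so the snippet runs on its own:

```r
## Synthetic stand-in for Data_total (the real data has ~20000 rows, 16 columns).
set.seed(150)
Data_total <- data.frame(
  Heart_Disease       = factor(rep(c("Yes", "No"), each = 100)),
  BMI                 = rnorm(200, 28, 5),
  FinalAge            = sample(seq(22, 80, by = 5), 200, replace = TRUE),
  Alcohol_Consumption = rpois(200, 2)
)

## 70/30 train/test split (proportion inferred, not stated in the report).
train_idx  <- sample(nrow(Data_total), size = round(0.7 * nrow(Data_total)))
train_data <- Data_total[train_idx,  names(Data_total) != "Heart_Disease"]
test_data  <- Data_total[-train_idx, names(Data_total) != "Heart_Disease"]
HeartLabel_Train <- Data_total$Heart_Disease[train_idx]
HeartLabel_Test  <- Data_total$Heart_Disease[-train_idx]

## The fit itself, matching the Call shown next (requires the randomForest package):
# our_model_500 <- randomForest(HeartLabel_Train ~ ., data = train_data,
#                               ntree = 500, proximity = TRUE, mtry = 3)
```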
Call:
randomForest(formula = HeartLabel_Train ~ ., data = train_data, ntree = 500, proximity = T, mtry = 3)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 3
OOB estimate of error rate: 24.44%
Confusion matrix:
No Yes class.error
No 4775 1973 0.2923829
Yes 1333 5445 0.1966657
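As a quick sanity check, the per-class and overall OOB error rates above can be recomputed from the raw confusion-matrix counts:

```r
## Counts copied from the OOB confusion matrix of the 500-tree, mtry = 3 model.
cm <- matrix(c(4775, 1973, 1333, 5445), nrow = 2, byrow = TRUE,
             dimnames = list(Actual = c("No", "Yes"), Predicted = c("No", "Yes")))

class_error <- 1 - diag(cm) / rowSums(cm)  # per-class misclassification rate
round(class_error, 7)                      # 0.2923829 (No), 0.1966657 (Yes)

oob_error <- 1 - sum(diag(cm)) / sum(cm)   # overall OOB error estimate
round(100 * oob_error, 2)                  # 24.44
```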
Parameter Tuning
## Parameter Tuning A: here we find the optimal number of decision trees
plot(our_model_500, log = "y",
     main = "All-features: Error rate as a function of ntrees (= 500)",
     lwd = 3, bty = "n", ylim = c(0.2, 0.8))
legend("top", colnames(our_model_500$err.rate), col = 1:3, cex = 0.8, fill = 1:3)
## This is our random forest model with all features
our_model_1000 <- randomForest(HeartLabel_Train ~ ., data = train_data,
                               ntree = 1000, proximity = TRUE, mtry = 3)
our_model_1000
Call:
randomForest(formula = HeartLabel_Train ~ ., data = train_data, ntree = 1000, proximity = T, mtry = 3)
Type of random forest: classification
Number of trees: 1000
No. of variables tried at each split: 3
OOB estimate of error rate: 24.35%
Confusion matrix:
No Yes class.error
No 4786 1962 0.2907528
Yes 1332 5446 0.1965181
plot(our_model_1000, log = "y",
     main = "All-features: Error rate as a function of ntrees (= 1000)",
     lwd = 3, bty = "n", ylim = c(0.2, 0.8))
legend("top", colnames(our_model_1000$err.rate), col = 1:3, cex = 0.8, fill = 1:3)
## Parameter Tuning B: now we find the optimal mtry, the number of variables tried at each split
oob_values <- vector(length = 10)
for (i in 1:10) {
  Num_model <- randomForest(HeartLabel_Train ~ ., data = train_data, mtry = i, ntree = 500)
  oob_values[i] <- Num_model$err.rate[nrow(Num_model$err.rate), 1]
}
plot(oob_values, main = "OOB error rate as a function of mtry", lwd = 3, bty = "n",
     ylim = c(0.2, 0.8), col = 2, xlab = "mtry", ylab = "OOB error rate")
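Once the loop has filled `oob_values`, selecting the optimal mtry is a one-liner. The error values below are illustrative stand-ins (the report's actual numbers appear only in its plot):

```r
## Illustrative OOB error rates for mtry = 1..10 (made-up numbers, shaped like
## a typical tuning curve that bottoms out at small mtry).
oob_values <- c(0.2500, 0.2443, 0.2444, 0.2460, 0.2490,
                0.2520, 0.2550, 0.2570, 0.2590, 0.2610)
best_mtry <- which.min(oob_values)
best_mtry  # 2 for these illustrative values, consistent with the mtry = 2 model reported next
```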
We record the out-of-bag error estimate of the optimal model (mtry = 2):
Call:
randomForest(formula = HeartLabel_Train ~ ., data = train_data, ntree = 500, proximity = T, mtry = 2)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 24.43%
Confusion matrix:
No Yes class.error
No 4726 2022 0.2996443
Yes 1282 5496 0.1891413
Model Features and Performance
Model Performance on the testing and training set
## Testing the model accuracy on the training data
train_rf <- predict(our_model_500, train_data)
Train_Summary <- confusionMatrix(train_rf, HeartLabel_Train)
Train_Summary
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 6091 384
Yes 657 6394
Accuracy : 0.923
95% CI : (0.9184, 0.9275)
No Information Rate : 0.5011
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.8461
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9026
Specificity : 0.9433
Pos Pred Value : 0.9407
Neg Pred Value : 0.9068
Prevalence : 0.4989
Detection Rate : 0.4503
Detection Prevalence : 0.4787
Balanced Accuracy : 0.9230
'Positive' Class : No
## Testing the model accuracy on the testing data
test_rf <- predict(our_model_500, test_data)
Test_Summary <- confusionMatrix(test_rf, HeartLabel_Test)
Test_Summary ## All features
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 2022 541
Yes 904 2330
Accuracy : 0.7507
95% CI : (0.7394, 0.7618)
No Information Rate : 0.5047
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.502
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.6910
Specificity : 0.8116
Pos Pred Value : 0.7889
Neg Pred Value : 0.7205
Prevalence : 0.5047
Detection Rate : 0.3488
Detection Prevalence : 0.4421
Balanced Accuracy : 0.7513
'Positive' Class : No
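Putting the two results side by side makes the generalization gap explicit; recomputing accuracy from the confusion matrices reported above:

```r
## Counts copied from the training and test confusion matrices above.
train_cm <- matrix(c(6091, 384, 657, 6394), nrow = 2, byrow = TRUE)
test_cm  <- matrix(c(2022, 541, 904, 2330), nrow = 2, byrow = TRUE)

accuracy <- function(cm) sum(diag(cm)) / sum(cm)
round(accuracy(train_cm), 4)  # 0.923: the forest nearly memorizes the training set
round(accuracy(test_cm), 4)   # 0.7507: close to 1 - OOB error (1 - 0.2444), as expected
```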
Which variables are most important according to our model?
# install.packages("ggalt")
suppressPackageStartupMessages({
  library(ggalt)
  library(randomForest)
  library(data.table)
  library(dplyr)   # for arrange()
  library(ggplot2)
})
## Here we get a nice-looking plot of the feature importance index
imp <- data.table(importance(our_model_500), keep.rownames = TRUE)
imp <- arrange(imp, MeanDecreaseGini)
imp[, rn := factor(rn, unique(rn))]
ggplot(melt(imp, id.vars = "rn")[grep("Mean", variable)],
       aes(x = rn, y = value, label = round(value, 1))) +
  geom_lollipop(point.size = 4, point.colour = "orange3", pch = 19, bg = 2) +
  geom_text(nudge_y = 2, hjust = -.3) + # nudge_y is a geom_text argument, not an aesthetic
  coord_flip() +
  # facet_wrap(~variable) +
  theme_minimal() +
  labs(y = "MeanDecreaseGini", x = NULL,
       title = "Feature Importance Plot from Cardiovascular Dataset") +
  expand_limits(y = 1300) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))