# MKT 445 Project
# Data = Bank customer churn
# Source = https://www.kaggle.com/code/a3amat02/customer-churn-eda-smote-classifier/input

install.packages("openxlsx")
library(openxlsx)
BankCustomer <- read.xlsx("BankCustomer Churn.xlsx")

#Data Cleaning
#Dropping columns - Because teh first 3 columns doesnt add any value to the analysis, we dropped the columns
Bank <- BankCustomer[-c(1:3)]
View(Bank)
str(Bank)

# Checking no.of rows and columns
nrow(Bank)
ncol(Bank)

# Checking for missing values
sum(is.na(Bank))
# There are no missing values in the data

#Variable transformation
# Our target variable is "Exited". Because it is in numerical form,we transformed it from numerical to factor
Bank$Exited <- as.factor((Bank$Exited))
Bank$HasCrCard <- as.factor(Bank$HasCrCard)
Bank$IsActiveMember <- as.factor(Bank$IsActiveMember)
str(Bank)

# Checking for outliers in numeric variables
# Outliers @ credit score
range(Bank$CreditScore)
boxplot(Bank$CreditScore, main="Boxplot", ylab="Values")
New_Bank <- boxplot(Bank$CreditScore, plot=FALSE)$out
New_Bank
# 15 outliers are found by box plot code 
# All the values <= 376 are considered as Outliers.
# Because these values are not unrealistic, we left untreated.

# Outliers @ age
range(Bank$Age)
age_outliers<-boxplot(Bank$Age, main="Boxplot", ylab="Values")
age<- boxplot(Bank$Age,plot=FALSE)$out
age
# 359 outliers are identified using boxplot code
# Ages >= 63 are ourliers
# Because these values are not unrealistic, we left untreated.

# No outliers in other numeric variables


# Data Visualization

library(ggplot2) 
library(readxl) 
BankCustomer <- read.xlsx("BankCustomer Churn.xlsx")

#Histogram of Customer Age Group 

ggplot(data, aes(x = Age, fill = factor(Exited))) + 
  
  geom_histogram(position = "identity", alpha = 0.5, bins = 20) + 
  
  geom_density(alpha = 0.2) + 
  
  labs(title = "Histogram of Customer Age Group",  
       
       x = "Age",  
       
       y = "Frequency",  
       
       fill = "Churn Status") 

average_age <- mean(data$Age, na.rm = TRUE)

# Summary of Age Distribution
#age_distribution_summary <- "Age Distribution:
#Majority of customers are aged 25-45, peaking in their early 30s.

#Engagement Strategies:
#Focus on tailored financial products, digital enhancements, targeted marketing, and personalized advisory services for customers in their early 30s.

#Churn Patterns:
#Younger age groups, particularly those in their late 20s to mid-30s, exhibit higher churn rates.
#Older customers, aged 50 and above, show relatively lower churn rates, indicating higher satisfaction or lower propensity to switch banks."

#The average age is around 38 years old.

#Boxplot of Credit Scores by Churn Status 

ggplot(data, aes(x = factor(Exited), y = CreditScore, fill = factor(Exited))) + 
  
  geom_boxplot() + 
  
  labs(title = "Boxplot of Credit Scores by Churn Status",  
       
       x = "Churn Status (0 = No, 1 = Yes)",  
       
       y = "Credit Score",  
       
       fill = "Churn Status") 

# Summary of Median Credit Score and Interquartile Range (IQR)
#credit_score_summary <- "Median Credit Score:
#Customers who stayed (red box) have a slightly lower median credit score compared to those who exited (teal box).
#This suggests that loyalty, ease of service access, and limited options for 
#similar products elsewhere may influence customers with lower credit scores to stay with the bank.

#Interquartile Range (IQR):
#Both retained and exited customers exhibit similar variability in credit scores, 
#as indicated by the similar heights of their respective boxes."


#Heatmap of Product Usage and Churn 

library(reshape2) 

product_usage <- table(data$NumOfProducts, data$Exited) 

product_usage_melted <- melt(product_usage) 


library(ggplot2) 

ggplot(product_usage_melted, aes(x=Var1, y=Var2, fill=value)) + 
  
  geom_tile() + 
  
  labs(title = "Heatmap of Product Usage and Churn", 
       
       x = "Number of Products", 
       
       y = "Churn Status (0 = No, 1 = Yes)", 
       
       fill = "Count") 

# Summary of Product Usage Patterns
product_usage_summary <- "Objective: To identify patterns in product usage across customer segments.

#Key Observations:
#Single Product Usage:
# - Darker blue suggests high retention with one product.
# - Lighter teal indicates some churn but still notable.

#Increasing Product Usage:
# - Moderate retention with two products.
# - Significant churn risk with only two products.

#High Product Usage:
# - Very high retention with three to four products.
# - Minimal churn among customers utilizing multiple offerings."

# Chi-square test
# Code by Juliana

#chi-square
# We want to test if relationship exits between credit score and customer getting exited.

#H0: Credit score and customers churn are not related
#Ha: Credit score and customers churn are related

credit_churn_rate <- table(Bank$CreditScore, Bank$Exited)
chisq.test(credit_churn_rate)

####





# Aggregations 
# Ran aggregation code to analyse individual variables which appeared insignificant in chi-square and prediction models
# The need for this analysis is to identify the patterns existed between these variables and Y Variable
# Each variable averages is taken into consideration

balance_aggregate <- aggregate(Bank$Balance, by= list(Bank$Exited), FUN=mean)
#

salary_aggregate <- aggregate(Bank$EstimatedSalary, by= list(Bank$Exited), FUN=mean)
# The average salary of exited customers is greater than the average salary of the non-existing customers
#   Group.1         x
# 1       0  99738.39
# 2       1 101465.68

creditscore_aggregate <- aggregate(Bank$CreditScore, by=list(Bank$Exited), FUN=mean)
#

age_aggregate <- aggregate(Bank$Age, by= list(Bank$Exited), FUN=mean)
# The average age of the customer Exited is greater than the non-exited customers
# Group.1        x
# 1       0 37.40839
# 2       1 44.83800

# Table function
# To understand the frequencies of the variables
Bank_Gender_frequencies <- table(Bank$Gender, Bank$Exited)
1139/2037
# From the frequency table, it is clear that among the 2037 Exited customers approximately 60% are female.

Bank_geography_frequencies <- table(Bank$Geography, Bank$Exited)
#           0    1
# France  4204  810
# Germany 1695  814 = 2509
# Spain   2064  413
#From the table Customers from Germany and France are exited are double than the customers from Spain.

Geo_gender_frequencies <- table(Bank$Geography,Bank$Exited, Bank$Gender)
# The table below gives the gender breakup of the "Exited" customers from each Nation
#           Female Male
# France     460  350
# Germany    448  366
# Spain      231  182

product_frequencies <- table(Bank$NumOfProducts,Bank$Exited)
# 

active_frequencies <- table(Bank$IsActiveMember,Bank$Exited)
# More number of not-active customers exited than the active customers

credit_card_frequencies <- table(Bank$HasCrCard, Bank$Exited)
#

Balance_frequencies <-table(Bank$Balance, Bank$Exited)
#

tenure_frequencies <-table(Bank$Tenure==0, Bank$Exited==0)
#

# Predictive models
# Step 1: # Splitting data
# We did 70-40 data split. 60% Train data and 40% test data
set.seed(50)
training_proposition <- 0.70
training <- sample(nrow(Bank),nrow(Bank)*training_proposition)
train <- Bank[training,]
test <- Bank[-training, ]

# Step 2: First Model - Run classification model
# Logistic regression

# Because our response variable is logical, we will use Logistic regression for prediction
# Setting the Y-variable as factor adn defing the sucess is "Exited" coded as 1
Bank$Exited <- factor(Bank$Exited, levels=c("0", "1"))
table(Bank$Exited)

str(Bank)

# In the first attempt we used all the 10 variables in the model
Bank_model1 <- glm(Exited~.,data= train, family=binomial)
summary(Bank_model1)

# 4 out of 10 seem to be non-significant. They are "Tenure","HasCrCard", "EstimatedSalary" "NumOfProducts".
# AIC of Bank_model1 = 5948.5

# The second glm is created eliminating the non-significant variables.
Bank_model2 <- glm(Exited~.-Tenure-HasCrCard-EstimatedSalary-NumOfProducts,data= train, family=binomial)
summary(Bank_model2)

# In the Bank_model2 all the variables are significant
# AIC of Bank_model2 = 5946.8

# Step 3: Model coeffecinets analysis
Bank_model2$coefficients

exp(8.616947e-01)
1-2.36
exp(4.622e-02)
exp(-0.3888) 
exp(0.3888)
exp(7.198820e-02)
1.07-1
exp(-5.109919e-01)
1- 0.5999002
exp(2.410253e-06)
exp(-6.954e-04)
1-0.9993048
exp(-0.3888)
exp(0.3888)
exp(2.723e-06)
1-1.000003

exp(-1.123e+00)
1-0.3253024

# Coeffecient Inferences
# The log odds of a customer hailing from Germany geting "Exited" are 2.37 times higher than the customer hailing from France
# For every one year increase in age the log odds of a customer getting "Exited" increases  by 1.07 times
# The log odds of a customer getting "Exited" are 59.9% lower if the customer is male than customer being Female.

#Step 4: Model Accuracy
# Train Data model accuracy
train_prediction <- predict(Bank_model2,train, type = "response")
pred<- mean(train$Exited == train_prediction)

# convert the probability into binary
binary_predictions <- ifelse(train_prediction > 0.3, 1, 0)
Bank_model2_accuracy <- mean(binary_predictions==train$Exited)

#Bank_model2 accuracy on train data = 78.84
# test Data model accuracy
test_prediction <- predict(Bank_model2,test, type = "response")
test_binary_predictions <- ifelse(test_prediction > 0.3, 1, 0)
test$train_probability <- exp(test_prediction)/(1+exp(test_prediction))
Bank_model2_test_accuracy <- mean(test$Exited==test_binary_predictions )

#Bank_model2 accuracy on test data = 77.03%
conf_matrix <- table(test_binary_predictions, test$Exited)
conf_matrix

# Logistic model sensitivity
# log_sensitivity = tp/(tp+fn)
log_sensitivity = 298/666
298+368

# 2 prediction model - Random forest model

# Step-1
set.seed(50)
library(randomForest)

# Step 2
# Train data model
rf_bank_train <- randomForest(Exited~., data=train, ntree = 8000)
rf_bank_train
# OOB estimate of  error rate of train data: 14.07%
# Accuracy of the train data random forest model = 85.93

# Step 3
# Predict Test data
predictions <- predict(rf_bank_train, newdata = test)
rfmodel_accuracy <- mean(test$Exited == predictions)
# Accuracy of the test data random forest model = 86.0%

# Step 4
# Evaluate model performance
conf_matrix <- table(predictions, test$Exited)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- conf_matrix[2, 2] / sum(conf_matrix[, 2])
recall <- conf_matrix[2, 2] / sum(conf_matrix[2, ])
f1_score <- 2 * precision * recall / (precision + recall)

#sensitivity <- tp/tp+fn
sensitivity = 277/(277+78)
conf_matrix

# Print model evaluation metrics
print(paste("Accuracy:", accuracy))
print(paste("Precision:", precision))
print(paste("Recall:", recall))
print(paste("F1-score:", f1_score))




