Problem Statement¶

Business Context¶

Thera Bank recently saw a steep decline in the number of credit card users. Credit cards are a good source of income for banks because of the different kinds of fees they charge, such as annual fees, balance transfer fees, cash advance fees, late payment fees, and foreign transaction fees. Some fees are charged to every user irrespective of usage, while others are charged only under specified circumstances.

Customers leaving its credit card services would lead the bank to losses, so the bank wants to analyze customer data and identify, or PREDICT, the customers who are likely to leave its credit card services and their reasons for doing so, so that the bank can improve in those areas (the reasons for leaving).

You, as a Data Scientist at Thera Bank, need to come up with a classification model that will help the bank improve its services so that customers are not tempted to give up their credit cards.

Data Description¶

  • 1 CLIENTNUM: Client number. Unique identifier for the customer holding the account
  • 2 Attrition_Flag: ← (Target Feature) Internal event (customer activity) variable - if the account is closed then "Attrited Customer" else "Existing Customer" (Category)
  • 3 Customer_Age: Age in Years
  • 4 Gender: Gender of the account holder
  • 5 Dependent_count: Number of dependents
  • 6 Education_Level: Educational Qualification of the account holder - Graduate, High School, Unknown, Uneducated, College (refers to a college student), Post-Graduate, Doctorate (Category)
  • 7 Marital_Status: Marital Status of the account holder
  • 8 Income_Category: Annual Income Category of the account holder (Categories)
  • 9 Card_Category: Type of Card
  • 10 Months_on_book: Period of relationship with the bank (in months)
  • 11 Total_Relationship_Count: Total no. of products held by the customer
  • 12 Months_Inactive_12_mon: No. of months inactive in the last 12 months
  • 13 Contacts_Count_12_mon: No. of Contacts in the last 12 months
  • 14 **Credit_Limit**: Credit Limit on the Credit Card
  • 15 **Total_Revolving_Bal**: Total Revolving Balance on the Credit Card
  • 16 **Avg_Open_To_Buy**: Open to Buy Credit Line (Average of last 12 months)
  • 17 Total_Amt_Chng_Q4_Q1: Change in Transaction Amount (Q4 over Q1)
  • 18 Total_Trans_Amt: Total Transaction Amount (Last 12 months)
  • 19 Total_Trans_Ct: Total Transaction Count (Last 12 months)
  • 20 Total_Ct_Chng_Q4_Q1: Change in Transaction Count (Q4 over Q1)
  • 21 **Avg_Utilization_Ratio**: Average Card Utilization Ratio
What Is a (Total) Revolving Balance? (#15)
  • If we don't pay the balance of the revolving credit account in full every month, the unpaid portion carries over to the next month. That's called a revolving balance. (See #15)
What is the Average Open to Buy? (#16)
  • 'Open to Buy' means the amount left on your credit card to use. This column (See #16) represents the average of this value over the last 12 months.
What is the Average Utilization Ratio? (#21)
  • The Avg_Utilization_Ratio (See #21) represents how much of the available credit the customer has used (approximately Total_Revolving_Bal / Credit_Limit). This is useful for calculating credit scores.
Relation b/w Avg_Open_To_Buy, Credit_Limit and Avg_Utilization_Ratio:
  • ( Avg_Open_To_Buy (#16) / Credit_Limit (#14) ) + Avg_Utilization_Ratio (#21) = 1
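
As a quick sanity check, this relation can be verified directly on the raw data. A minimal sketch, assuming the BankChurners.csv file used later in this notebook is available in the working directory:

import pandas as pd

raw = pd.read_csv("BankChurners.csv")

# Open-to-buy share of the limit plus the utilization ratio should come out at ~1
check = raw["Avg_Open_To_Buy"] / raw["Credit_Limit"] + raw["Avg_Utilization_Ratio"]
print(check.describe())  # values should cluster tightly around 1.0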

Please read the instructions carefully before starting the project.¶

This is a commented Jupyter IPython Notebook file in which all the instructions and tasks to be performed are mentioned.

  • Blanks '___' are provided in the notebook that need to be filled with appropriate code to get the correct result. Every '___' blank is accompanied by a comment that briefly describes what needs to be filled in.
  • Identify the task to be performed correctly, and only then proceed to write the required code.
  • Fill in the code wherever asked by commented lines like "# write your code here" or "# complete the code". Running incomplete code may throw an error.
  • Please run the code cells sequentially from the beginning to avoid unnecessary errors.
  • Add the results/observations derived from the analysis (wherever mentioned) to the presentation and submit it.

[Image: Art_Zaragoza_Strategy.jpg (overview of the model-building strategy)]

Strategy for Building the Models, Data Splits, Performance Evaluation, Model Selection, Model Tuning, and Feature Analysis¶

  1. Data Preprocessing

    • Data Cleaning: Handle missing values, outliers, and inconsistent data types.
    • Feature Engineering: Create new features or transform existing ones to improve model performance.
    • Encoding Categorical Variables: Convert categorical variables into numerical ones using techniques like one-hot encoding or label encoding (see the sketch below).
    • Scaling and Normalization: Standardize or normalize features to bring them to a common scale, especially for algorithms sensitive to feature magnitudes (e.g., logistic regression, SVM).
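
A minimal sketch of the encoding and scaling step, assuming the cleaned DataFrame is named data as in the rest of this notebook; the notebook's own fill-in cells may handle this differently:

import pandas as pd
from sklearn.preprocessing import StandardScaler

# One-hot encode the categorical columns (drop_first avoids one redundant dummy per category)
cat_cols = ["Gender", "Education_Level", "Marital_Status", "Income_Category", "Card_Category"]
encoded = pd.get_dummies(data, columns=cat_cols, drop_first=True)

# Standardize an illustrative subset of numeric columns to zero mean and unit variance
num_cols = ["Customer_Age", "Credit_Limit", "Total_Trans_Amt"]
encoded[num_cols] = StandardScaler().fit_transform(encoded[num_cols])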

  2. Data Splitting

    • Initial Split (Train/Temporary): Split the dataset into training (80%) and temporary (20%) sets using train_test_split().
    • Further Splitting of the Temporary Set: Split the temporary set into test (75%) and validation (25%) sets to create three sets:
    • Training Set: Used to train the model.
    • Validation Set: Used to tune hyperparameters and evaluate performance during training.
    • Test Set: Used for the final evaluation of the model’s performance on unseen data.

  3. Handling Imbalanced Data

    • Undersampling: Reduce the majority class to balance it with the minority class. This prevents bias toward the majority class but risks losing valuable information.
    • Oversampling (e.g., SMOTE): Increase the minority class by generating synthetic samples to balance the dataset. This helps ensure the model learns from both classes equally (see the sketch below).
    • Choose between oversampling, undersampling, or ensemble methods based on the dataset’s class distribution and model performance.
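
A minimal sketch of both resampling options, assuming X_train and y_train are the fully numeric, encoded training split from step 2 (SMOTE requires numeric features) and that resampling is applied to the training data only:

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Oversample the minority (attrited) class with synthetic samples
sm = SMOTE(random_state=1)
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)

# Alternatively, undersample the majority (existing) class
rus = RandomUnderSampler(random_state=1)
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)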

  4. Model Training

    • Train different models on the training data (X_train, y_train), as sketched below.
    • Popular models include:
    • Decision Tree: Good for interpretability, but prone to overfitting.
    • Random Forest: Reduces overfitting by averaging multiple decision trees.
    • AdaBoost: Focuses on improving misclassified points iteratively.
    • Gradient Boosting: Builds models sequentially to minimize errors.
    • XGBoost: Optimized gradient boosting implementation for high performance.
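
A minimal training-loop sketch, assuming the prepared X_train and y_train from the previous steps; hyperparameters are left at their defaults here:

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

models = {
    "Decision Tree": DecisionTreeClassifier(random_state=1),
    "Random Forest": RandomForestClassifier(random_state=1),
    "AdaBoost": AdaBoostClassifier(random_state=1),
    "Gradient Boosting": GradientBoostingClassifier(random_state=1),
    "XGBoost": XGBClassifier(random_state=1, eval_metric="logloss"),
}

# Fit every candidate model on the same training split
for name, model in models.items():
    model.fit(X_train, y_train)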

  5. Model Evaluation on Validation Set

    • Performance Metrics:
    • Accuracy: The proportion of correct predictions.
    • Recall: The proportion of actual positives correctly identified (important for imbalanced data).
    • Precision: The proportion of positive predictions that were correct.
    • F1 Score: Harmonic mean of precision and recall (useful for imbalanced data).
    • ROC-AUC: Evaluates the model’s ability to distinguish between classes at various thresholds.
    • Evaluate models on the validation set to ensure they generalize well and are not overfitting (see the sketch below).
    • Use validation set scores (e.g., recall) to compare models and select the best-performing one.
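
A minimal sketch of computing these metrics on the validation split, assuming a fitted classifier named model (for example, the last one fitted in the loop sketched in step 4) and the X_val, y_val split from step 2:

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

y_val_pred = model.predict(X_val)               # hard class predictions
y_val_proba = model.predict_proba(X_val)[:, 1]  # probability of the attrited class (1)

print("Accuracy :", accuracy_score(y_val, y_val_pred))
print("Recall   :", recall_score(y_val, y_val_pred))
print("Precision:", precision_score(y_val, y_val_pred))
print("F1 score :", f1_score(y_val, y_val_pred))
print("ROC-AUC  :", roc_auc_score(y_val, y_val_proba))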

  6. Model Selection

    • Compare different models based on validation set performance metrics like recall, precision, F1-score, and ROC-AUC.
    • Consider models that handle imbalanced data well if the dataset is imbalanced.
    • Choose the model that performs best on the validation set (highest recall or F1-score) without overfitting.

  7. Hyperparameter Tuning

    • Perform grid search or random search on the selected model to find the optimal hyperparameters (see the sketch below).
    • Tune hyperparameters like:
    • Maximum depth of trees for decision tree models.
    • Learning rate for boosting models.
    • Number of estimators (trees) for ensemble models.
    • Cross-validation: Use techniques like k-fold cross-validation during hyperparameter tuning to ensure the model generalizes well across different data splits.
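
A minimal randomized-search sketch with stratified k-fold cross-validation; GradientBoostingClassifier and the parameter values below are illustrative placeholders, not the notebook's prescribed choices:

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

param_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [2, 3, 4],
}

search = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(random_state=1),
    param_distributions=param_grid,
    n_iter=10,
    scoring="recall",  # recall on the attrited class is the score being optimized
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=1),
    random_state=1,
    n_jobs=-1,
)
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)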

  8. Final Model Evaluation on Test Set

    • Once the best model and hyperparameters are selected, evaluate the model on the test set to assess its performance on unseen data.
    • This evaluation simulates how the model will perform in the real world.
    • Report the performance metrics (accuracy, recall, precision, F1-score, ROC-AUC) on the test set.

  9. Feature Importance Analysis

    • Use feature importance methods to identify which features have the most influence on the model’s predictions (see the sketch below):
    • For tree-based models like random forests, you can directly access the feature importances.
    • Use SHAP (SHapley Additive exPlanations) or LIME (Local Interpretable Model-agnostic Explanations) for more interpretability.
    • Drop less important features if needed to improve model interpretability or speed.
    • Analyze correlations between features and remove highly correlated features if they provide redundant information.
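
A minimal sketch for a tree-based model, assuming the tuned search object from step 7 and that X_train is a DataFrame with named columns:

import pandas as pd
import matplotlib.pyplot as plt

# Any fitted tree-based estimator exposes feature_importances_
best_model = search.best_estimator_
importances = pd.Series(best_model.feature_importances_, index=X_train.columns)

importances.sort_values().plot(kind="barh", figsize=(10, 8))
plt.title("Feature importances of the tuned model")
plt.show()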

  10. Considerations for Model Improvement

    • Bias-Variance Tradeoff: Assess whether the model is overfitting (high variance) or underfitting (high bias).
    • Ensemble Models: Combine the strengths of multiple models to reduce bias and variance (e.g., stacking, bagging, boosting).
    • Cost of Misclassification: If false positives or false negatives are costly, adjust the classification threshold or use weighted metrics to penalize misclassification (see the sketch below).
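
A minimal threshold-adjustment sketch, assuming the tuned best_model from the previous sketch and the validation split; lowering the threshold below 0.5 trades precision for recall on the attrited class:

from sklearn.metrics import precision_score, recall_score

y_val_proba = best_model.predict_proba(X_val)[:, 1]

for threshold in (0.5, 0.4, 0.3):
    y_pred_t = (y_val_proba >= threshold).astype(int)
    print(f"threshold={threshold}: recall={recall_score(y_val, y_pred_t):.3f}, "
          f"precision={precision_score(y_val, y_pred_t):.3f}")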

Sequence Summary

1.  Data Preprocessing
2.  Data Splitting (Train/Validation/Test)
3.  Handling Imbalanced Data (Over/Under Sampling)
4.  Model Training on Training Data
5.  Model Evaluation on Validation Set (Select Best Model)
6.  Model Selection Based on Validation Scores
7.  Hyperparameter Tuning (Cross-Validation)
8.  Final Model Evaluation on Test Set
9.  Feature Importance and Model Interpretation
10. Addressing Bias-Variance and Misclassification

Importing necessary libraries¶

In [ ]:
# Installing the libraries with the specified version.
# uncomment and run the following lines if Jupyter Notebook is being used
!pip install scikit-learn==1.2.2 seaborn==0.13.1 matplotlib==3.7.1 numpy==1.25.2 pandas==1.5.3 imbalanced-learn==0.12.0 xgboost==2.0.3 -q --user
!pip install --upgrade -q threadpoolctl

Note: After running the above cell, kindly restart the notebook kernel and run all cells sequentially from the start again.

In [ ]:
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# To suppress scientific notations
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# To tune model, get different metric scores, and split data
from sklearn import metrics
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To impute missing values
from sklearn.impute import SimpleImputer

# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# To do hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)

# To suppress scientific notations for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To help with model building
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier

# To suppress warnings
import warnings
warnings.filterwarnings("ignore")
In [ ]:
from google.colab import drive
drive.mount('/content/drive') # Load Drive
Mounted at /content/drive

Loading the dataset¶

In [ ]:
churn = pd.read_csv("BankChurners.csv") # Read the original dataset into the churn dataframe (kept untouched as the original)
In [ ]:
data = churn.copy() # Copy churn dataframe into data

Data Overview¶

  • Observations
  • Sanity Checks

The initial steps to get an overview of any dataset are to:

  • observe the first few rows of the dataset, to check whether the dataset has been loaded properly or not
  • get information about the number of rows and columns in the dataset
  • find out the data types of the columns to ensure that data is stored in the preferred format and the value of each property is as expected.
  • check the statistical summary of the dataset to get an overview of the numerical columns of the data

Summary:

  1. Observe Dataset Loaded Properly
  2. Rows and Columns Count
  3. Examine Data Type Formats
  4. Examine Statistical Summary (Numerical vs Categorical)
  5. Check for Duplicate Values
  6. Check for Missing Values

(1) Displaying the first few rows of the dataset - Examine Whether it Loaded Properly¶

In [ ]:
data.head(5) # View top 5 rows of the data
Out[ ]:
CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count Education_Level Marital_Status Income_Category Card_Category Months_on_book Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
0 768805383 Existing Customer 45 M 3 High School Married $60K - $80K Blue 39 5 1 3 12691.000 777 11914.000 1.335 1144 42 1.625 0.061
1 818770008 Existing Customer 49 F 5 Graduate Single Less than $40K Blue 44 6 1 2 8256.000 864 7392.000 1.541 1291 33 3.714 0.105
2 713982108 Existing Customer 51 M 3 Graduate Married $80K - $120K Blue 36 4 1 0 3418.000 0 3418.000 2.594 1887 20 2.333 0.000
3 769911858 Existing Customer 40 F 4 High School NaN Less than $40K Blue 34 3 4 1 3313.000 2517 796.000 1.405 1171 20 2.333 0.760
4 709106358 Existing Customer 40 M 3 Uneducated Married $60K - $80K Blue 21 5 1 0 4716.000 0 4716.000 2.175 816 28 2.500 0.000
In [ ]:
data.tail(5) # View last 5 rows of the data
Out[ ]:
CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count Education_Level Marital_Status Income_Category Card_Category Months_on_book Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
10122 772366833 Existing Customer 50 M 2 Graduate Single $40K - $60K Blue 40 3 2 3 4003.000 1851 2152.000 0.703 15476 117 0.857 0.462
10123 710638233 Attrited Customer 41 M 2 NaN Divorced $40K - $60K Blue 25 4 2 3 4277.000 2186 2091.000 0.804 8764 69 0.683 0.511
10124 716506083 Attrited Customer 44 F 1 High School Married Less than $40K Blue 36 5 3 4 5409.000 0 5409.000 0.819 10291 60 0.818 0.000
10125 717406983 Attrited Customer 30 M 2 Graduate NaN $40K - $60K Blue 36 4 3 3 5281.000 0 5281.000 0.535 8395 62 0.722 0.000
10126 714337233 Attrited Customer 43 F 2 Graduate Married Less than $40K Silver 25 6 2 4 10388.000 1961 8427.000 0.703 10294 61 0.649 0.189

(2) Checking the shape of the dataset - Examine Rows and Columns Count¶

In [ ]:
# Checking the number of rows and columns in the training data
churn.shape # Number of rows and columns
print("For the training dataset (# rows, # columns or features):",churn.shape)
print("Number of rows:", churn.shape[0])
print("Number of columns:", churn.shape[1])
For the training dataset (# rows, # columns or features): (10127, 21)
Number of rows: 10127
Number of columns: 21

(3) Examine data types of the columns for the dataset - Evaluate if Preferred Format and Values are as Expected¶

In [ ]:
data.info() # Data types of the columns/features in the training dataset.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CLIENTNUM                 10127 non-null  int64  
 1   Attrition_Flag            10127 non-null  object 
 2   Customer_Age              10127 non-null  int64  
 3   Gender                    10127 non-null  object 
 4   Dependent_count           10127 non-null  int64  
 5   Education_Level           8608 non-null   object 
 6   Marital_Status            9378 non-null   object 
 7   Income_Category           10127 non-null  object 
 8   Card_Category             10127 non-null  object 
 9   Months_on_book            10127 non-null  int64  
 10  Total_Relationship_Count  10127 non-null  int64  
 11  Months_Inactive_12_mon    10127 non-null  int64  
 12  Contacts_Count_12_mon     10127 non-null  int64  
 13  Credit_Limit              10127 non-null  float64
 14  Total_Revolving_Bal       10127 non-null  int64  
 15  Avg_Open_To_Buy           10127 non-null  float64
 16  Total_Amt_Chng_Q4_Q1      10127 non-null  float64
 17  Total_Trans_Amt           10127 non-null  int64  
 18  Total_Trans_Ct            10127 non-null  int64  
 19  Total_Ct_Chng_Q4_Q1       10127 non-null  float64
 20  Avg_Utilization_Ratio     10127 non-null  float64
dtypes: float64(5), int64(10), object(6)
memory usage: 1.6+ MB

(4) Statistical summary of the dataset - Examine Numerical Stats Overview¶

In [ ]:
num_object_columns = len(data.select_dtypes(include=['float64', 'int64']).columns) # Count the number of float64, and int64 data types.

print("There are only\033[1;31m", num_object_columns, "\033[0mfloat64 and int64 data types:\n") # Print the resulting sum of float64 + int64 data types.

data.describe().T # Statistical summary of the training data.
There are only 15 float64 and int64 data types:

Out[ ]:
count mean std min 25% 50% 75% max
CLIENTNUM 10127.000 739177606.334 36903783.450 708082083.000 713036770.500 717926358.000 773143533.000 828343083.000
Customer_Age 10127.000 46.326 8.017 26.000 41.000 46.000 52.000 73.000
Dependent_count 10127.000 2.346 1.299 0.000 1.000 2.000 3.000 5.000
Months_on_book 10127.000 35.928 7.986 13.000 31.000 36.000 40.000 56.000
Total_Relationship_Count 10127.000 3.813 1.554 1.000 3.000 4.000 5.000 6.000
Months_Inactive_12_mon 10127.000 2.341 1.011 0.000 2.000 2.000 3.000 6.000
Contacts_Count_12_mon 10127.000 2.455 1.106 0.000 2.000 2.000 3.000 6.000
Credit_Limit 10127.000 8631.954 9088.777 1438.300 2555.000 4549.000 11067.500 34516.000
Total_Revolving_Bal 10127.000 1162.814 814.987 0.000 359.000 1276.000 1784.000 2517.000
Avg_Open_To_Buy 10127.000 7469.140 9090.685 3.000 1324.500 3474.000 9859.000 34516.000
Total_Amt_Chng_Q4_Q1 10127.000 0.760 0.219 0.000 0.631 0.736 0.859 3.397
Total_Trans_Amt 10127.000 4404.086 3397.129 510.000 2155.500 3899.000 4741.000 18484.000
Total_Trans_Ct 10127.000 64.859 23.473 10.000 45.000 67.000 81.000 139.000
Total_Ct_Chng_Q4_Q1 10127.000 0.712 0.238 0.000 0.582 0.702 0.818 3.714
Avg_Utilization_Ratio 10127.000 0.275 0.276 0.000 0.023 0.176 0.503 0.999
In [ ]:
num_object_columns = len(data.select_dtypes(include='object').columns) # Count the number of object data types.

print("There are only\033[1;31m", num_object_columns, "\033[0mobject data types:\n") # Print the number of object data types in bold red.

data.describe(include=["object"]).T # List only the object data type for examinations.
There are only 6 object data types:

Out[ ]:
count unique top freq
Attrition_Flag 10127 2 Existing Customer 8500
Gender 10127 2 F 5358
Education_Level 8608 6 Graduate 3128
Marital_Status 9378 3 Married 4687
Income_Category 10127 6 Less than $40K 3561
Card_Category 10127 4 Blue 9436
In [ ]:
for i in data.describe(include=["object"]).columns: # Loop through object data types
    print("The breakdown of unique values in\033[1;33m", i, "\033[0mare as follows:") # Bold Yellow Object Datatype.
    print("\033[1;31m", data[i].value_counts().index.tolist(), "\033[0m") # Bold Red Categories names.
    print("\033[1;32m",data[i].value_counts(),"\033[0m") # Bold Green value count of each of the categories.
    print("-" * 70) # Print a line separator.
    print("\n") # Escape to a new line.
The breakdown of unique values in Attrition_Flag are as follows:
 ['Existing Customer', 'Attrited Customer'] 
 Attrition_Flag
Existing Customer    8500
Attrited Customer    1627
Name: count, dtype: int64 
----------------------------------------------------------------------


The breakdown of unique values in Gender are as follows:
 ['F', 'M'] 
 Gender
F    5358
M    4769
Name: count, dtype: int64 
----------------------------------------------------------------------


The breakdown of unique values in Education_Level are as follows:
 ['Graduate', 'High School', 'Uneducated', 'College', 'Post-Graduate', 'Doctorate'] 
 Education_Level
Graduate         3128
High School      2013
Uneducated       1487
College          1013
Post-Graduate     516
Doctorate         451
Name: count, dtype: int64 
----------------------------------------------------------------------


The breakdown of unique values in Marital_Status are as follows:
 ['Married', 'Single', 'Divorced'] 
 Marital_Status
Married     4687
Single      3943
Divorced     748
Name: count, dtype: int64 
----------------------------------------------------------------------


The breakdown of unique values in Income_Category are as follows:
 ['Less than $40K', '$40K - $60K', '$80K - $120K', '$60K - $80K', 'abc', '$120K +'] 
 Income_Category
Less than $40K    3561
$40K - $60K       1790
$80K - $120K      1535
$60K - $80K       1402
abc               1112
$120K +            727
Name: count, dtype: int64 
----------------------------------------------------------------------


The breakdown of unique values in Card_Category are as follows:
 ['Blue', 'Silver', 'Gold', 'Platinum'] 
 Card_Category
Blue        9436
Silver       555
Gold         116
Platinum      20
Name: count, dtype: int64 
----------------------------------------------------------------------


In [ ]:
# Additional way to display categories in a nicer tabular format.
for i in data.describe(include=["object"]).columns:
    print("The breakdown of unique values in\033[1;33m", i, "\033[0mare as follows:")
    value_counts = data[i].value_counts()
    value_counts_df = pd.DataFrame({'Categories': value_counts.index.tolist(), 'Counts': value_counts.values})
    value_counts_df.style.set_properties(**{'color': 'red', 'font-weight': 'bold'})
    print(value_counts_df.to_markdown(index=False, numalign='left', stralign='left'))
    print("-" * 70) # Print a line separator.
    print("\n")
The breakdown of unique values in Attrition_Flag are as follows:
| Categories        | Counts   |
|:------------------|:---------|
| Existing Customer | 8500     |
| Attrited Customer | 1627     |
----------------------------------------------------------------------


The breakdown of unique values in Gender are as follows:
| Categories   | Counts   |
|:-------------|:---------|
| F            | 5358     |
| M            | 4769     |
----------------------------------------------------------------------


The breakdown of unique values in Education_Level are as follows:
| Categories    | Counts   |
|:--------------|:---------|
| Graduate      | 3128     |
| High School   | 2013     |
| Uneducated    | 1487     |
| College       | 1013     |
| Post-Graduate | 516      |
| Doctorate     | 451      |
----------------------------------------------------------------------


The breakdown of unique values in Marital_Status are as follows:
| Categories   | Counts   |
|:-------------|:---------|
| Married      | 4687     |
| Single       | 3943     |
| Divorced     | 748      |
----------------------------------------------------------------------


The breakdown of unique values in Income_Category are as follows:
| Categories     | Counts   |
|:---------------|:---------|
| Less than $40K | 3561     |
| $40K - $60K    | 1790     |
| $80K - $120K   | 1535     |
| $60K - $80K    | 1402     |
| abc            | 1112     |
| $120K +        | 727      |
----------------------------------------------------------------------


The breakdown of unique values in Card_Category are as follows:
| Categories   | Counts   |
|:-------------|:---------|
| Blue         | 9436     |
| Silver       | 555      |
| Gold         | 116      |
| Platinum     | 20       |
----------------------------------------------------------------------


(5) - Checking for duplicate values¶

In [ ]:
# Check for duplicate values in the data
data.duplicated().sum() # Check duplicate entries in the data
Out[ ]:
0

(6) - Checking for missing values¶

In [ ]:
# Check for missing values in the data
data.isnull().sum() # Check missing entries in the train data
Out[ ]:
0
CLIENTNUM 0
Attrition_Flag 0
Customer_Age 0
Gender 0
Dependent_count 0
Education_Level 1519
Marital_Status 749
Income_Category 0
Card_Category 0
Months_on_book 0
Total_Relationship_Count 0
Months_Inactive_12_mon 0
Contacts_Count_12_mon 0
Credit_Limit 0
Total_Revolving_Bal 0
Avg_Open_To_Buy 0
Total_Amt_Chng_Q4_Q1 0
Total_Trans_Amt 0
Total_Trans_Ct 0
Total_Ct_Chng_Q4_Q1 0
Avg_Utilization_Ratio 0

Missing data for the following:

  • Education_Level 1519
  • Marital_Status 749

(7) Removing unnecessary data¶

In [ ]:
# The CLIENTNUM column contains unique client IDs and adds no predictive value to the training dataset for the model.
data.drop(["CLIENTNUM"], axis=1, inplace=True) # It can be removed.

(8) Encoding object datatype for easier analysis and manipulations. Column/Feature = Attrition_Flag.¶

---------------------- (Important to properly plot, process and evaluate)

In [ ]:
## Encoding Existing and Attrited customers to 0 and 1 respectively, for analysis.
data["Attrition_Flag"].replace("Existing Customer", 0, inplace=True)
data["Attrition_Flag"].replace("Attrited Customer", 1, inplace=True)

Exploratory Data Analysis (EDA)¶

Functions defined for Exploratory Data Analysis.¶

  • histogram_boxplot - Plot Boxplot and a Histogram with the same scale.
  • labeled_barplot - Plot labeled barplots.
  • stacked_barplot - Plot Stacked Bar Chart.
  • distribution_plot_wrt_target - Plot Distributions.
In [ ]:
# Function to plot a Boxplot and a Histogram along the same scale.

def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # Number of rows of the subplot grid= 2
        sharex=True,  # x-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )  # creating the 2 subplots
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )  # boxplot will be created and a triangle will indicate the mean value of the column
    sns.histplot(
        data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins, palette="winter"
    ) if bins else sns.histplot(
        data=data, x=feature, kde=kde, ax=ax_hist2
    )  # For histogram
    ax_hist2.axvline(
        data[feature].mean(), color="green", linestyle="--"
    )  # Add mean to the histogram
    ax_hist2.axvline(
        data[feature].median(), color="black", linestyle="-"
    )  # Add median to the histogram
In [ ]:
# Function to create labeled barplots

def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with percentage at the top

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """

    total = len(data[feature])  # length of the column
    count = data[feature].nunique()
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))

    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )

    for p in ax.patches:
        if perc == True:
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category

        x = p.get_x() + p.get_width() / 2  # width of the plot
        y = p.get_height()  # height of the plot

        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate the percentage

    plt.show()  # show the plot
In [ ]:
# function to plot Stacked Bar Chart

def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    sorter = data[target].value_counts().index[-1]
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 1, 5))
    plt.legend(
        loc="lower left", frameon=False,
    )
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
In [ ]:
# Function to plot Distributions

def distribution_plot_wrt_target(data, predictor, target):

    fig, axs = plt.subplots(2, 2, figsize=(12, 10))

    target_uniq = data[target].unique()

    axs[0, 0].set_title("Distribution of target for target=" + str(target_uniq[0]))
    sns.histplot(
        data=data[data[target] == target_uniq[0]],
        x=predictor,
        kde=True,
        ax=axs[0, 0],
        color="teal",
    )

    axs[0, 1].set_title("Distribution of target for target=" + str(target_uniq[1]))
    sns.histplot(
        data=data[data[target] == target_uniq[1]],
        x=predictor,
        kde=True,
        ax=axs[0, 1],
        color="orange",
    )

    axs[1, 0].set_title("Boxplot w.r.t target")
    sns.boxplot(data=data, x=target, y=predictor, ax=axs[1, 0], palette="gist_rainbow")

    axs[1, 1].set_title("Boxplot (without outliers) w.r.t target")
    sns.boxplot(
        data=data,
        x=target,
        y=predictor,
        ax=axs[1, 1],
        showfliers=False,
        palette="gist_rainbow",
    )

    plt.tight_layout()
    plt.show()

Univariate Analysis - EDA¶

There are only **15** float64 and int64 data types to plot:

Customer_Age #3 Feature/Column

In [ ]:
# Call the histogram_boxplot function
histogram_boxplot(data, "Customer_Age", kde=True)

Months_on_book #10 Feature/Column

In [ ]:
histogram_boxplot(data,'Months_on_book',kde=True)  # Call histogram_boxplot for 'Months_on_book'.

Credit_Limit #14 Feature/Column

In [ ]:
histogram_boxplot(data,'Credit_Limit',kde=True)  # Call histogram_boxplot for 'Credit_Limit'.

Total_Revolving_Bal #15 Feature/Column

In [ ]:
histogram_boxplot(data,'Total_Revolving_Bal',kde=True)  # Call histogram_boxplot for 'Total_Revolving_Bal'.

Avg_Open_To_Buy #16 Feature/Column

In [ ]:
histogram_boxplot(data,'Avg_Open_To_Buy',kde=True)  # Call histogram_boxplot for 'Avg_Open_To_Buy'.

Total_Trans_Ct#19 Feature/Column

In [ ]:
histogram_boxplot(data,'Total_Trans_Ct',kde=True)  # Call histogram_boxplot for 'Total_Trans_Ct'.

Total_Amt_Chng_Q4_Q1 #17 Feature/Column

In [ ]:
histogram_boxplot(data,'Total_Amt_Chng_Q4_Q1',kde=True)  # Call histogram_boxplot for 'Total_Amt_Chng_Q4_Q1'.

Let's see how the total transaction amount is distributed

Total_Trans_Amt #18 Feature/Column

In [ ]:
histogram_boxplot(data,'Total_Trans_Amt',kde=True)  # Call histogram_boxplot for 'Total_Trans_Amt'.

Total_Ct_Chng_Q4_Q1 #20 Feature/Column

In [ ]:
histogram_boxplot(data,'Total_Ct_Chng_Q4_Q1',kde=True)  # Call histogram_boxplot for 'Total_Ct_Chng_Q4_Q1'.

Avg_Utilization_Ratio #21 Feature/Column

In [ ]:
histogram_boxplot(data,'Avg_Utilization_Ratio',kde=True)  # Call histogram_boxplot for 'Avg_Utilization_Ratio'.

Dependent_count # 5 Feature/Column

In [ ]:
labeled_barplot(data, "Dependent_count") # Call labeled_barplot for Dependent_count.

Total_Relationship_Count #11 Feature/Column

In [ ]:
labeled_barplot(data,"Total_Relationship_Count") # Call labeled_barplot for Total_Relationship_Count.

Months_Inactive_12_mon #12 Feature/Column

In [ ]:
labeled_barplot(data,"Months_Inactive_12_mon") # Call labeled_barplot for Months_Inactive_12_mon.

Contacts_Count_12_mon #13 Feature/Column

In [ ]:
labeled_barplot(data,"Contacts_Count_12_mon") # Call labeled_barplot for Contacts_Count_12_mon.

Gender #4 Feature/Column

In [ ]:
labeled_barplot(data,"Gender") # Call labeled_barplot for Gender.

Let's see the distribution of the level of education of customers

Education_Level #6 Feature/Column

In [ ]:
labeled_barplot(data,"Education_Level") # Call labeled_barplot for Education_Level.

Marital_Status #7 Feature/Column

In [ ]:
labeled_barplot(data,"Marital_Status") # Call labeled_barplot for Marital_Status.

Let's see the distribution of the level of income of customers

Income_Category #8 Feature/Column

In [ ]:
labeled_barplot(data,"Income_Category") # Call labeled_barplot for Income_Category.

Card_Category #9 Feature/Column

In [ ]:
labeled_barplot(data,"Card_Category") # Call labeled_barplot for Card_Category.

Attrition_Flag #2 Feature/Column << TARGET

In [ ]:
labeled_barplot(data,"Attrition_Flag") # Call labeled_barplot for Attrition_Flag.
In [ ]:
# Displaying Histograms:
data.hist(figsize=(14, 14))
plt.show()

Bivariate Distributions - EDA¶

Attributes that have a strong correlation with each other:

Correlation Check

In [ ]:
plt.figure(figsize=(15, 7))
corr_matrix = data.select_dtypes(exclude='object').corr() # Exclude Categories (object datatype) Features/Columns - This Time for quick analysis.
sns.heatmap(corr_matrix, annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()

EDA & Correlation Observations:¶

Excluding categorical features, the strongest feature/column correlations exist between:

  • 1) Credit_Limit #14 and Avg_Open_To_Buy #16 at corr = 1
  • 2) Customer_Age #3 and Months_on_book #10 at corr = 0.79
  • 3) Total_Revolving_Bal #15 and Avg_Utilization_Ratio #21 at corr = 0.62
  • 4) Total_Ct_Chng_Q4_Q1 #20 and Total_Amt_Chng_Q4_Q1 #17 at corr = 0.38
In [ ]:
from prettytable import PrettyTable
print("Attributes that have the strongest correlations with each other:\n")
# Initialize a PrettyTable object
table = PrettyTable()

# Define column headers
table.field_names = ["\033[1;4mFeature 1", "Feature 2", "Correlation\033[0m"]

# Add rows
table.add_row(["Credit_Limit #14", "Avg_Open_To_Buy #16", "\033[1;33m1\033[0m"])
table.add_row(["Customer_Age #3", "Months_on_book #10", "\033[1;33m0.79\033[0m"])
table.add_row(["Total_Revolving_Bal #15", "Avg_Utilization_Ratio #21","\033[1;33m0.62\033[0m" ])
table.add_row(["Total_Ct_Chng_Q4_Q1 #20", "Total_Amt_Chng_Q4_Q1 #17", "\033[1;33m0.38\033[0m"])

# Print the table
print(table)
Attributes that have the strongest correlations with each other:

+-------------------------+---------------------------+-------------+
|        Feature 1        |         Feature 2         | Correlation |
+-------------------------+---------------------------+-------------+
|     Credit_Limit #14    |    Avg_Open_To_Buy #16    |      1      |
|     Customer_Age #3     |     Months_on_book #10    |     0.79    |
| Total_Revolving_Bal #15 | Avg_Utilization_Ratio #21 |     0.62    |
| Total_Ct_Chng_Q4_Q1 #20 |  Total_Amt_Chng_Q4_Q1 #17 |     0.38    |
+-------------------------+---------------------------+-------------+
In [ ]:
import matplotlib.pyplot as plt #  plots library

print("\n\033[1;92mAttrition_Flag vs Features with Categories to analyze Category distributions:\033[0m\n")
print(" 0 = Existing Customer"," \033[1;31m1 = Customer Attrition\033[0m\n")
# Create subplots for each feature with Categories to analyze distributions
stacked_barplot(data, "Attrition_Flag","Gender")
stacked_barplot(data, "Attrition_Flag","Dependent_count")
stacked_barplot(data, "Attrition_Flag","Education_Level")
stacked_barplot(data, "Attrition_Flag","Marital_Status")
stacked_barplot(data, "Attrition_Flag","Income_Category")
stacked_barplot(data, "Attrition_Flag","Card_Category")
stacked_barplot(data, "Attrition_Flag","Total_Relationship_Count")
stacked_barplot(data, "Attrition_Flag","Months_Inactive_12_mon")
stacked_barplot(data, "Attrition_Flag","Contacts_Count_12_mon")
# Adjust layout to prevent overlap
plt.tight_layout()

# Display the plots
plt.show()
print("\n\033[1;92m------- END OF SECTION: Attrition_Flag vs Feature with Categories to analyze distributions:\033[0m\n")
Attrition_Flag vs Features with Categories to analyze Category distributions:

 0 = Existing Customer  1 = Customer Attrition

Gender             F     M    All
Attrition_Flag                   
All             5358  4769  10127
0               4428  4072   8500
1                930   697   1627
------------------------------------------------------------------------------------------------------------------------
Dependent_count    0     1     2     3     4    5    All
Attrition_Flag                                          
All              904  1838  2655  2732  1574  424  10127
0                769  1569  2238  2250  1314  360   8500
1                135   269   417   482   260   64   1627
------------------------------------------------------------------------------------------------------------------------
Education_Level  College  Doctorate  Graduate  High School  Post-Graduate  \
Attrition_Flag                                                              
All                 1013        451      3128         2013            516   
0                    859        356      2641         1707            424   
1                    154         95       487          306             92   

Education_Level  Uneducated   All  
Attrition_Flag                     
All                    1487  8608  
0                      1250  7237  
1                       237  1371  
------------------------------------------------------------------------------------------------------------------------
Marital_Status  Divorced  Married  Single   All
Attrition_Flag                                 
All                  748     4687    3943  9378
0                    627     3978    3275  7880
1                    121      709     668  1498
------------------------------------------------------------------------------------------------------------------------
Income_Category  $120K +  $40K - $60K  $60K - $80K  $80K - $120K  \
Attrition_Flag                                                     
All                  727         1790         1402          1535   
0                    601         1519         1213          1293   
1                    126          271          189           242   

Income_Category  Less than $40K   abc    All  
Attrition_Flag                                
All                        3561  1112  10127  
0                          2949   925   8500  
1                           612   187   1627  
------------------------------------------------------------------------------------------------------------------------
Card_Category   Blue  Gold  Platinum  Silver    All
Attrition_Flag                                     
All             9436   116        20     555  10127
0               7917    95        15     473   8500
1               1519    21         5      82   1627
------------------------------------------------------------------------------------------------------------------------
Total_Relationship_Count    1     2     3     4     5     6    All
Attrition_Flag                                                    
All                       910  1243  2305  1912  1891  1866  10127
0                         677   897  1905  1687  1664  1670   8500
1                         233   346   400   225   227   196   1627
------------------------------------------------------------------------------------------------------------------------
Months_Inactive_12_mon   0     1     2     3    4    5    6    All
Attrition_Flag                                                    
All                     29  2233  3282  3846  435  178  124  10127
1                       15   100   505   826  130   32   19   1627
0                       14  2133  2777  3020  305  146  105   8500
------------------------------------------------------------------------------------------------------------------------
Contacts_Count_12_mon    0     1     2     3     4    5   6    All
Attrition_Flag                                                    
1                        7   108   403   681   315   59  54   1627
All                    399  1499  3227  3380  1392  176  54  10127
0                      392  1391  2824  2699  1077  117   0   8500
------------------------------------------------------------------------------------------------------------------------
<Figure size 640x480 with 0 Axes>
------- END OF SECTION: Attrition_Flag vs Feature with Categories to analyze distributions:

In [ ]:
import matplotlib.pyplot as plt #  plots library

print("\n\033[1;92mFeatures with Categories to analyze Attrition distributions:\033[0m\n")
print(" 0 = Existing Customer"," \033[1;31m1 = Customer Attrition\033[0m\n")
# Create subplots for each feature with Categories to analyze distributions
stacked_barplot(data, "Gender", "Attrition_Flag")
stacked_barplot(data, "Dependent_count", "Attrition_Flag")
stacked_barplot(data, "Education_Level", "Attrition_Flag")
stacked_barplot(data, "Marital_Status", "Attrition_Flag")
stacked_barplot(data, "Income_Category", "Attrition_Flag")
stacked_barplot(data, "Card_Category", "Attrition_Flag")
stacked_barplot(data, "Total_Relationship_Count", "Attrition_Flag")
stacked_barplot(data, "Months_Inactive_12_mon", "Attrition_Flag")
stacked_barplot(data, "Contacts_Count_12_mon", "Attrition_Flag")
# Adjust layout to prevent overlap
plt.tight_layout()

# Display the plots
plt.show()
print("\n\033[1;92m------- END OF SECTION: Attrition_Flag vs Feature with Categories to analyze distributions:\033[0m\n")
Features with Categories to analyze Attrition distributions:

 0 = Existing Customer  1 = Customer Attrition

Attrition_Flag     0     1    All
Gender                           
All             8500  1627  10127
F               4428   930   5358
M               4072   697   4769
------------------------------------------------------------------------------------------------------------------------
Attrition_Flag      0     1    All
Dependent_count                   
All              8500  1627  10127
3                2250   482   2732
2                2238   417   2655
1                1569   269   1838
4                1314   260   1574
0                 769   135    904
5                 360    64    424
------------------------------------------------------------------------------------------------------------------------
Attrition_Flag      0     1   All
Education_Level                  
All              7237  1371  8608
Graduate         2641   487  3128
High School      1707   306  2013
Uneducated       1250   237  1487
College           859   154  1013
Doctorate         356    95   451
Post-Graduate     424    92   516
------------------------------------------------------------------------------------------------------------------------
Attrition_Flag     0     1   All
Marital_Status                  
All             7880  1498  9378
Married         3978   709  4687
Single          3275   668  3943
Divorced         627   121   748
------------------------------------------------------------------------------------------------------------------------
Attrition_Flag      0     1    All
Income_Category                   
All              8500  1627  10127
Less than $40K   2949   612   3561
$40K - $60K      1519   271   1790
$80K - $120K     1293   242   1535
$60K - $80K      1213   189   1402
abc               925   187   1112
$120K +           601   126    727
------------------------------------------------------------------------------------------------------------------------
Attrition_Flag     0     1    All
Card_Category                    
All             8500  1627  10127
Blue            7917  1519   9436
Silver           473    82    555
Gold              95    21    116
Platinum          15     5     20
------------------------------------------------------------------------------------------------------------------------
Attrition_Flag               0     1    All
Total_Relationship_Count                   
All                       8500  1627  10127
3                         1905   400   2305
2                          897   346   1243
1                          677   233    910
5                         1664   227   1891
4                         1687   225   1912
6                         1670   196   1866
------------------------------------------------------------------------------------------------------------------------
Attrition_Flag             0     1    All
Months_Inactive_12_mon                   
All                     8500  1627  10127
3                       3020   826   3846
2                       2777   505   3282
4                        305   130    435
1                       2133   100   2233
5                        146    32    178
6                        105    19    124
0                         14    15     29
------------------------------------------------------------------------------------------------------------------------
Attrition_Flag            0     1    All
Contacts_Count_12_mon                   
All                    8500  1627  10127
3                      2699   681   3380
2                      2824   403   3227
4                      1077   315   1392
1                      1391   108   1499
5                       117    59    176
6                         0    54     54
0                       392     7    399
------------------------------------------------------------------------------------------------------------------------
<Figure size 640x480 with 0 Axes>
------- END OF SECTION: Attrition_Flag vs Feature with Categories to analyze distributions:




Individually Plotted Features during code development

Attrition_Flag vs Gender

In [ ]:
stacked_barplot(data, "Gender", "Attrition_Flag") # Call stacked_barplot to analyze the distribution of Attrition by Gender.
Attrition_Flag     0     1    All
Gender                           
All             8500  1627  10127
F               4428   930   5358
M               4072   697   4769
------------------------------------------------------------------------------------------------------------------------

Attrition_Flag vs Marital_Status

In [ ]:
stacked_barplot(data,"Attrition_Flag", "Marital_Status") # Call stacked_barplot for Attrition_Flag vs Marital_Status
Marital_Status  Divorced  Married  Single   All
Attrition_Flag                                 
All                  748     4687    3943  9378
0                    627     3978    3275  7880
1                    121      709     668  1498
------------------------------------------------------------------------------------------------------------------------

Attrition_Flag vs Education_Level

In [ ]:
stacked_barplot(data,"Attrition_Flag", "Education_Level") # Call stacked_barplot for Attrition_Flag vs Education_Level
Education_Level  College  Doctorate  Graduate  High School  Post-Graduate  \
Attrition_Flag                                                              
All                 1013        451      3128         2013            516   
0                    859        356      2641         1707            424   
1                    154         95       487          306             92   

Education_Level  Uneducated   All  
Attrition_Flag                     
All                    1487  8608  
0                      1250  7237  
1                       237  1371  
------------------------------------------------------------------------------------------------------------------------

Attrition_Flag vs Income_Category

In [ ]:
stacked_barplot(data,"Attrition_Flag", "Income_Category") # Call stacked_barplot for Attrition_Flag vs Income_Category
Income_Category  $120K +  $40K - $60K  $60K - $80K  $80K - $120K  \
Attrition_Flag                                                     
All                  727         1790         1402          1535   
0                    601         1519         1213          1293   
1                    126          271          189           242   

Income_Category  Less than $40K   abc    All  
Attrition_Flag                                
All                        3561  1112  10127  
0                          2949   925   8500  
1                           612   187   1627  
------------------------------------------------------------------------------------------------------------------------

Attrition_Flag vs Contacts_Count_12_mon

In [ ]:
stacked_barplot(data,"Attrition_Flag", "Contacts_Count_12_mon") # Call stacked_barplot for Attrition_Flag vs Contacts_Count_12_mon
Contacts_Count_12_mon    0     1     2     3     4    5   6    All
Attrition_Flag                                                    
1                        7   108   403   681   315   59  54   1627
All                    399  1499  3227  3380  1392  176  54  10127
0                      392  1391  2824  2699  1077  117   0   8500
------------------------------------------------------------------------------------------------------------------------

Let's see how the number of months a customer was inactive in the last 12 months (Months_Inactive_12_mon) varies by the customer's account status (Attrition_Flag)

Attrition_Flag vs Months_Inactive_12_mon

In [ ]:
stacked_barplot(data,"Attrition_Flag", "Months_Inactive_12_mon") # Call stacked_barplot for Attrition_Flag vs Months_Inactive_12_mon
Months_Inactive_12_mon   0     1     2     3    4    5    6    All
Attrition_Flag                                                    
All                     29  2233  3282  3846  435  178  124  10127
1                       15   100   505   826  130   32   19   1627
0                       14  2133  2777  3020  305  146  105   8500
------------------------------------------------------------------------------------------------------------------------

Attrition_Flag vs Total_Relationship_Count

In [ ]:
stacked_barplot(data,"Attrition_Flag", "Total_Relationship_Count") # Call stacked_barplot for Attrition_Flag vs Total_Relationship_Count.
Total_Relationship_Count    1     2     3     4     5     6    All
Attrition_Flag                                                    
All                       910  1243  2305  1912  1891  1866  10127
0                         677   897  1905  1687  1664  1670   8500
1                         233   346   400   225   227   196   1627
------------------------------------------------------------------------------------------------------------------------

Attrition_Flag vs Dependent_count

In [ ]:
stacked_barplot(data,"Attrition_Flag", "Dependent_count") # Call stacked_barplot for Attrition_Flag vs Dependent_count.
Dependent_count    0     1     2     3     4    5    All
Attrition_Flag                                          
All              904  1838  2655  2732  1574  424  10127
0                769  1569  2238  2250  1314  360   8500
1                135   269   417   482   260   64   1627
------------------------------------------------------------------------------------------------------------------------

End of individually plotted Features during development




Total_Revolving_Bal vs Attrition_Flag

In [ ]:
distribution_plot_wrt_target(data, "Total_Revolving_Bal", "Attrition_Flag") # Call distribution_plot_wrt_target for Total_Revolving_Bal vs Attrition_Flag.

Attrition_Flag vs Credit_Limit

In [ ]:
distribution_plot_wrt_target(data, "Attrition_Flag", "Credit_Limit") # Call distribution_plot_wrt_target for Attrition_Flag vs Credit_Limit.

Attrition_Flag vs Customer_Age

In [ ]:
distribution_plot_wrt_target(data, "Attrition_Flag", "Customer_Age") # Call distribution_plot_wrt_target for Attrition_Flag vs Customer_Age.

Total_Trans_Ct vs Attrition_Flag

In [ ]:
distribution_plot_wrt_target(data, "Total_Trans_Ct", "Attrition_Flag") # Call distribution_plot_wrt_target for Total_Trans_Ct vs Attrition_Flag.

Total_Trans_Amt vs Attrition_Flag

Let's see how the change in transaction count between Q4 and Q1 (Total_Ct_Chng_Q4_Q1) varies by the customer's account status (Attrition_Flag)

Total_Ct_Chng_Q4_Q1 vs Attrition_Flag

In [ ]:
distribution_plot_wrt_target(data, "Total_Ct_Chng_Q4_Q1", "Attrition_Flag") # Call distribution_plot_wrt_target for Total_Ct_Chng_Q4_Q1 vs Attrition_Flag

Avg_Utilization_Ratio vs Attrition_Flag

In [ ]:
distribution_plot_wrt_target(data, "Avg_Utilization_Ratio", "Attrition_Flag") # Call distribution_plot_wrt_target for Avg_Utilization_Ratio vs Attrition_Flag

Attrition_Flag vs Months_on_book

In [ ]:
distribution_plot_wrt_target(data, "Attrition_Flag", "Months_on_book") # Call distribution_plot_wrt_target for Attrition_Flag vs Months_on_book

Attrition_Flag vs Total_Revolving_Bal

In [ ]:
distribution_plot_wrt_target(data, "Attrition_Flag", "Total_Revolving_Bal") # Call distribution_plot_wrt_target for Attrition_Flag vs Total_Revolving_Bal

Attrition_Flag vs Avg_Open_To_Buy

In [ ]:
distribution_plot_wrt_target(data, "Attrition_Flag", "Avg_Open_To_Buy") # Call distribution_plot_wrt_target for Attrition_Flag vs Avg_Open_To_Buy

Data Preprocessing - Strategy Step 1¶

Outlier Detection¶

In [ ]:
# Selecting a single column of the DataFrame to demonstrate the IQR calculation
# (here the encoded Attrition_Flag column is used; replace it with the column you want to examine)
data_column = data['Attrition_Flag']

# Convert the column to numeric values
data_column = pd.to_numeric(data_column, errors='coerce')  # Convert to numeric, NaN for non-numeric
data_column = data_column.dropna()  # Remove rows with NaN values

# Calculate the quartiles
Q1 = data_column.quantile(0.25)  # 25th percentile
Q3 = data_column.quantile(0.75)  # 75th percentile

# Interquartile Range (IQR)
IQR = Q3 - Q1

# Finding the lower and upper bounds for outliers
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
In [ ]:
# checking the % outliers
((data.select_dtypes(include=["float64", "int64"]) < lower) | (data.select_dtypes(include=["float64", "int64"]) > upper)).sum() / len(data) * 100
Out[ ]:
0
Attrition_Flag 16.066
Customer_Age 100.000
Dependent_count 91.073
Months_on_book 100.000
Total_Relationship_Count 100.000
Months_Inactive_12_mon 99.714
Contacts_Count_12_mon 96.060
Credit_Limit 100.000
Total_Revolving_Bal 75.610
Avg_Open_To_Buy 100.000
Total_Amt_Chng_Q4_Q1 99.951
Total_Trans_Amt 100.000
Total_Trans_Ct 100.000
Total_Ct_Chng_Q4_Q1 99.931
Avg_Utilization_Ratio 75.610
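
Note that the lower/upper bounds above were derived from a single column (Attrition_Flag) and then applied to every numeric column, which inflates the percentages. A per-column IQR check, sketched below under the assumption that the same data DataFrame is used, computes each column's own bounds:

num_data = data.select_dtypes(include=["float64", "int64"])

Q1 = num_data.quantile(0.25)   # 25th percentile of every numeric column
Q3 = num_data.quantile(0.75)   # 75th percentile of every numeric column
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# Percentage of outliers per column, using each column's own bounds
outlier_pct = ((num_data < lower) | (num_data > upper)).sum() / len(num_data) * 100
print(outlier_pct.round(3))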

Train - Test Data Split - Strategy Step 2¶

In [ ]:
# creating the copy of the dataframe
data1 = data.copy()
In [ ]:
# Replace "Unknown" (or any other anomalous value) with NaN in the "Income_Category" column
data1["Income_Category"].replace("Unknown", np.nan, inplace=True)
In [ ]:
data1.isna().sum()
Out[ ]:
0
Attrition_Flag 0
Customer_Age 0
Gender 0
Dependent_count 0
Education_Level 1519
Marital_Status 749
Income_Category 0
Card_Category 0
Months_on_book 0
Total_Relationship_Count 0
Months_Inactive_12_mon 0
Contacts_Count_12_mon 0
Credit_Limit 0
Total_Revolving_Bal 0
Avg_Open_To_Buy 0
Total_Amt_Chng_Q4_Q1 0
Total_Trans_Amt 0
Total_Trans_Ct 0
Total_Ct_Chng_Q4_Q1 0
Avg_Utilization_Ratio 0


Instantiating the imputer for re-use¶

In [ ]:
# Creating an instance of the imputer to be used
imputer = SimpleImputer(strategy="most_frequent")

Data Splitting - [|||||||||| train - |||||| val - |||| test (unseen)]¶

------------------------------> Dividing train data into X and y <------------------------

In [ ]:
# Separation of features X and the target y
# It is a common preprocessing step before training machine learning models.

X = data1.drop(["Attrition_Flag"], axis=1) # Remove the whole column
y = data1["Attrition_Flag"] # extracts the column "Attrition_Flag" from data1 and assigns it to the variable y

# The column "Attrition_Flag" contains the target variable (label), which is what the model will try to predict.
# In this case, it likely indicates whether a customer has left the credit card services (churned) or not.

print("The second data set is the target column.\nThe first one are the features.\n" ) # The column "Attrition_Flag" contains the target variable (label)
print(X.shape, y.shape)
print("\nOur models will try to predict the target variable y (Attrition_Flag).\nIn other words, whether a customer left the credit card services (churned) or not.")
The second dataset is the target column.
The first one contains the features.

(10127, 19) (10127,)

Our models will try to predict the target variable y (Attrition_Flag).
In other words, whether a customer left the credit card services (churned) or not.

------------------------------> Splitting the original dataset into: X_train, X_val, and X_test datasets <------------------------

In [ ]:
# Import
from sklearn.model_selection import train_test_split

print("\n")
print("|" * 100)# Print a line separator.
# X is your feature data, and y is your target data
print(f"Original set shape: {X.shape}\n")


# Step 1: Split data into 80% training and 20% temporary (test + validation) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Subsplit the temporary set into 75% test and 25% validation sets
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Print shapes of the resulting splits
print("1. First train_test_split: Splits the data into 80% training and 20% \033[1;4mtemporary\033[0m: (X_temp, y_temp).\n")
print("|" * 80,"\033[1;33m|\033[0m" * 20)# Print a line separator.
print(f"Training set shape: {X_train.shape}")
print(f"Temporary set shape: {X_temp.shape}\n")
print("2. Second train_test_split: Subsplits the \033[1;4mtemporary set\033[0m: into 75% test and 25% validation (X_test, X_val).\n")
print("|" * 80,"\033[1;33mt\033[0m" * 15,"\033[1;33mv\033[0m" * 5)# Print a line separator.
print(f"Test set shape: {X_test.shape}")
print(f"Validation set shape: {X_val.shape}")

||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Original set shape: (10127, 19)

1. First train_test_split: Splits the data into 80% training and 20% temporary: (X_temp, y_temp).

|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||
Training set shape: (8101, 19)
Temporary set shape: (2026, 19)

2. Second train_test_split: Subsplits the temporary set: into 75% test and 25% validation (X_test, X_val).

|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ttttttttttttttt vvvvv
Test set shape: (1519, 19)
Validation set shape: (507, 19)

Explanation:

  • random_state=42 ensures reproducibility (you can set it to any integer).
  • The first test_size=0.2 means 20% of the total data is allocated to temporary data.
  • The second test_size=0.25 means 25% of the temporary set (i.e., 5% of the total data) is allocated to the validation set, leaving the remaining 75% of the temporary set (15% of the total data) as the test set.
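
A quick sanity check of the overall proportions (a minimal sketch, assuming the X, X_train, X_val, and X_test objects created in the cells above) confirms the effective split is roughly 80% train / 15% test / 5% validation:

In [ ]:
# Overall share of each split relative to the full feature matrix X
for split_name, split_df in [("train", X_train), ("test", X_test), ("validation", X_val)]:
    print(f"{split_name}: {len(split_df) / len(X):.1%}")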

Missing value imputation¶

In [ ]:
reqd_col_for_impute = ["Education_Level", "Marital_Status", "Income_Category"] # Category columns to impute #6, #7, and #8

-----------------> DEFINING X_train, X_val, and X_test ------------------------<< IMPORTANT

In [ ]:
# Category columns to impute
reqd_col_for_impute = ["Education_Level", "Marital_Status", "Income_Category"] # Category columns to impute #6, #7, and #8

# Fit and transform the train data
X_train[reqd_col_for_impute] = imputer.fit_transform(X_train[reqd_col_for_impute])# Impute missing values in X_train.

# Transform the validation data (using the statistics learned from the train data - no refitting, to avoid data leakage)
X_val[reqd_col_for_impute] = imputer.transform(X_val[reqd_col_for_impute]) # Impute missing values in X_val.

# Transform the test data (again, transform only)
X_test[reqd_col_for_impute] = imputer.transform(X_test[reqd_col_for_impute]) # Impute missing values in X_test.
In [ ]:
data1.isna().sum()
Out[ ]:
0
Attrition_Flag 0
Customer_Age 0
Gender 0
Dependent_count 0
Education_Level 1519
Marital_Status 749
Income_Category 0
Card_Category 0
Months_on_book 0
Total_Relationship_Count 0
Months_Inactive_12_mon 0
Contacts_Count_12_mon 0
Credit_Limit 0
Total_Revolving_Bal 0
Avg_Open_To_Buy 0
Total_Amt_Chng_Q4_Q1 0
Total_Trans_Amt 0
Total_Trans_Ct 0
Total_Ct_Chng_Q4_Q1 0
Avg_Utilization_Ratio 0

In [ ]:
# Checking that no column has missing values in train or test sets
print("\nChecking that \033[1;4mno column has missing values\033[0m in train dataset (\033[1;92my_train\033[0m):")
print(y_train.isna().sum())
print("-" * 30)
print("\nChecking that \033[1;4mno column has missing values\033[0m in validation dataset (\033[1;92my_val\033[0m):")
print(y_val.isna().sum())
print("-" * 30)
print("\Checking that \033[1;4mno column has missing values\033[0m in test dataset (\033[1;92my_test\033[0m):")
print(y_test.isna().sum())
print("-" * 30)
Checking that no column has missing values in train dataset (y_train):
0
------------------------------

Checking that no column has missing values in validation dataset (y_val):
0
------------------------------

Checking that no column has missing values in test dataset (y_test):
0
------------------------------
In [ ]:
# Checking that no column has missing values in train or test sets
print("\nChecking that \033[1;4mno column has missing values\033[0m in train dataset (\033[1;92mX_train\033[0m): \n")
print(X_train.isna().sum())
print("-" * 30)
print("\nChecking that \033[1;4mno column has missing values\033[0m in validation dataset (\033[1;92mX_val\033[0m): \n")
print(X_val.isna().sum())
print("-" * 30)
print("\Checking that \033[1;4mno column has missing values\033[0m in test dataset (\033[1;92mX_test\033[0m): \n")
print(X_test.isna().sum())
print("-" * 60)
print("If not, check imputer function was executed.")
print("-" * 60)
Checking that no column has missing values in train dataset (X_train): 

Customer_Age                0
Gender                      0
Dependent_count             0
Education_Level             0
Marital_Status              0
Income_Category             0
Card_Category               0
Months_on_book              0
Total_Relationship_Count    0
Months_Inactive_12_mon      0
Contacts_Count_12_mon       0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct              0
Total_Ct_Chng_Q4_Q1         0
Avg_Utilization_Ratio       0
dtype: int64
------------------------------

Checking that no column has missing values in validation dataset (X_val): 

Customer_Age                0
Gender                      0
Dependent_count             0
Education_Level             0
Marital_Status              0
Income_Category             0
Card_Category               0
Months_on_book              0
Total_Relationship_Count    0
Months_Inactive_12_mon      0
Contacts_Count_12_mon       0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct              0
Total_Ct_Chng_Q4_Q1         0
Avg_Utilization_Ratio       0
dtype: int64
------------------------------

Checking that no column has missing values in test dataset (X_test): 

Customer_Age                0
Gender                      0
Dependent_count             0
Education_Level             0
Marital_Status              0
Income_Category             0
Card_Category               0
Months_on_book              0
Total_Relationship_Count    0
Months_Inactive_12_mon      0
Contacts_Count_12_mon       0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct              0
Total_Ct_Chng_Q4_Q1         0
Avg_Utilization_Ratio       0
dtype: int64
------------------------------------------------------------
If not, check imputer function was executed.
------------------------------------------------------------
In [ ]:
cols = X_train.select_dtypes(include=["object", "category"])
for i in cols.columns:
    print(X_train[i].value_counts())
    print("-" * 30)
Gender
F    4279
M    3822
Name: count, dtype: int64
------------------------------
Education_Level
Graduate         3733
High School      1619
Uneducated       1171
College           816
Post-Graduate     407
Doctorate         355
Name: count, dtype: int64
------------------------------
Marital_Status
Married     4346
Single      3144
Divorced     611
Name: count, dtype: int64
------------------------------
Income_Category
Less than $40K    2812
$40K - $60K       1453
$80K - $120K      1237
$60K - $80K       1122
abc                889
$120K +            588
Name: count, dtype: int64
------------------------------
Card_Category
Blue        7557
Silver       436
Gold          93
Platinum      15
Name: count, dtype: int64
------------------------------
In [ ]:
# Display Features:
print("\033[1;33mTRAINING SET:\033[0m")
cols = X_train.select_dtypes(include=["object", "category"])
for i in cols.columns:
    print(X_train[i].value_counts())
    print("\033[1;33mFeature in X_train\033[0m")
    print("-" * 30)# Print a line separator.
    print("\n")
TRAINING SET:
Gender
F    4279
M    3822
Name: count, dtype: int64
Feature in X_train
------------------------------


Education_Level
Graduate         3733
High School      1619
Uneducated       1171
College           816
Post-Graduate     407
Doctorate         355
Name: count, dtype: int64
Feature in X_train
------------------------------


Marital_Status
Married     4346
Single      3144
Divorced     611
Name: count, dtype: int64
Feature in X_train
------------------------------


Income_Category
Less than $40K    2812
$40K - $60K       1453
$80K - $120K      1237
$60K - $80K       1122
abc                889
$120K +            588
Name: count, dtype: int64
Feature in X_train
------------------------------


Card_Category
Blue        7557
Silver       436
Gold          93
Platinum      15
Name: count, dtype: int64
Feature in X_train
------------------------------


In [ ]:
cols = X_val.select_dtypes(include=["object", "category"])
for i in cols.columns:
    print(X_val[i].value_counts())
    print("\033[1;33mFeature X_train\033[0m")
    print("-" * 30)# Print a line separator.
    print("\n")
In [ ]:
# Display Features:
print("\033[1;33mVALIDATION SET:\033[0m")
cols = X_val.select_dtypes(include=["object", "category"])
for i in cols.columns:
    print(X_val[i].value_counts())
    print("\033[1;33mFeature in X_val\033[0m")
    print("-" * 30)# Print a line separator.
    print("\n")
VALIDATION SET:
Gender
F    266
M    241
Name: count, dtype: int64
Feature in X_val
------------------------------


Education_Level
Graduate         237
High School       94
Uneducated        84
College           49
Doctorate         24
Post-Graduate     19
Name: count, dtype: int64
Feature in X_val
------------------------------


Marital_Status
Married     272
Single      193
Divorced     42
Name: count, dtype: int64
Feature in X_val
------------------------------


Income_Category
Less than $40K    174
$40K - $60K        88
$60K - $80K        74
$80K - $120K       71
abc                62
$120K +            38
Name: count, dtype: int64
Feature in X_val
------------------------------


Card_Category
Blue        465
Silver       37
Gold          3
Platinum      2
Name: count, dtype: int64
Feature in X_val
------------------------------


In [ ]:
cols = X_test.select_dtypes(include=["object", "category"])
for i in cols.columns:
    print(X_test[i].value_counts())
    print("*" * 30)
In [ ]:
# Display Features:
print("\033[1;33mTEST SET:\033[0m")
cols = X_test.select_dtypes(include=["object", "category"])
for i in cols.columns:
    print(X_test[i].value_counts())
    print("\033[1;33mFeature in X_test\033[0m")
    print("-" * 30)# Print a line separator.
    print("\n")
TEST SET:
Gender
F    813
M    706
Name: count, dtype: int64
Feature in X_test
------------------------------


Education_Level
Graduate         677
High School      300
Uneducated       232
College          148
Post-Graduate     90
Doctorate         72
Name: count, dtype: int64
Feature in X_test
------------------------------


Marital_Status
Married     818
Single      606
Divorced     95
Name: count, dtype: int64
Feature in X_test
------------------------------


Income_Category
Less than $40K    575
$40K - $60K       249
$80K - $120K      227
$60K - $80K       206
abc               161
$120K +           101
Name: count, dtype: int64
Feature in X_test
------------------------------


Card_Category
Blue        1414
Silver        82
Gold          20
Platinum       3
Name: count, dtype: int64
Feature in X_test
------------------------------


Encoding categorical variables - Important¶

Encoding the categorical columns of each dataset as one-hot (dummy) variables:

In [ ]:
# One-hot encoding the categorical columns (drop_first=True drops one dummy per feature to avoid redundant columns)
X_train = pd.get_dummies(X_train, drop_first=True) # Encode categorical columns in X_train
X_val = pd.get_dummies(X_val, drop_first=True) # Encode categorical columns in X_val
X_test = pd.get_dummies(X_test, drop_first=True) # Encode categorical columns in X_test
print(X_train.shape, X_val.shape, X_test.shape)
(8101, 30) (507, 30) (1519, 30)
  • After one-hot encoding, each split now has **30** columns, up from the original 19 feature columns.
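
One caveat worth noting (not an issue in this run, since all three splits end up with the same 30 columns): applying pd.get_dummies separately to train, validation, and test data can produce mismatched columns if a rare category is missing from one of the splits. A minimal safeguard, sketched below on the assumption that X_train defines the reference column set, is to reindex the other frames to the training columns:

In [ ]:
# Optional safeguard (sketch): force X_val / X_test to carry exactly the same dummy columns as X_train,
# filling any dummy column that is absent from a split with 0 (False)
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)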

In [ ]:
X_train.head(10) # Check the top 10 rows from the X_train dataset.
Out[ ]:
Customer_Age Dependent_count Months_on_book Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio Gender_M Education_Level_Doctorate Education_Level_Graduate Education_Level_High School Education_Level_Post-Graduate Education_Level_Uneducated Marital_Status_Married Marital_Status_Single Income_Category_$40K - $60K Income_Category_$60K - $80K Income_Category_$80K - $120K Income_Category_Less than $40K Income_Category_abc Card_Category_Gold Card_Category_Platinum Card_Category_Silver
9066 54 1 36 1 3 3 3723.000 1728 1995.000 0.595 8554 99 0.678 0.464 False False True False False False False True False False False False True False False False
5814 58 4 48 1 4 3 5396.000 1803 3593.000 0.493 2107 39 0.393 0.334 False False False True False False True False False False False False True False False False
792 45 4 36 6 1 3 15987.000 1648 14339.000 0.732 1436 36 1.250 0.103 False False True False False False False True False False False True False True False False
1791 34 2 36 4 3 4 3625.000 2517 1108.000 1.158 2616 46 1.300 0.694 False False True False False False False True False False False True False False False False
5011 49 2 39 5 3 4 2720.000 1926 794.000 0.602 3806 61 0.794 0.708 False False False True False False True False True False False False False False False False
2260 60 0 45 5 2 4 1438.300 648 790.300 0.477 1267 27 1.077 0.451 False True False False False False True False False False False True False False False False
8794 43 4 28 2 2 1 2838.000 1934 904.000 0.873 8644 87 0.554 0.681 False False True False False False False True False False False False True False False False
4292 52 2 45 3 1 3 3476.000 1560 1916.000 0.894 3496 58 0.871 0.449 False False True False False False False True True False False False False False False False
1817 30 0 36 3 3 2 2550.000 1623 927.000 0.650 1870 51 0.275 0.636 True False True False False False True False False False False True False False False False
6025 33 3 36 5 2 3 1457.000 0 1457.000 0.677 2200 45 0.364 0.000 False False True False False False False True False False False True False False False False

Observations:

  • Notice the income categories are now separate one-hot indicator columns holding True/False values.
In [ ]:
X_val.head(5) # Check the top 5 rows from the val dataset.
Out[ ]:
Customer_Age Dependent_count Months_on_book Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio Gender_M Education_Level_Doctorate Education_Level_Graduate Education_Level_High School Education_Level_Post-Graduate Education_Level_Uneducated Marital_Status_Married Marital_Status_Single Income_Category_$40K - $60K Income_Category_$60K - $80K Income_Category_$80K - $120K Income_Category_Less than $40K Income_Category_abc Card_Category_Gold Card_Category_Platinum Card_Category_Silver
6685 49 1 35 5 2 2 1438.300 0 1438.300 0.681 4109 71 0.919 0.000 True False True False False False True False False False False True False False False False
291 50 4 36 2 3 2 2521.000 1608 913.000 0.587 1328 33 0.571 0.638 False True False False False False False True False False False False True False False False
3082 30 0 19 3 1 4 3213.000 2517 696.000 1.275 2666 46 1.000 0.783 False False True False False False True False False False False True False False False False
8469 42 3 36 2 3 3 2515.000 1453 1062.000 0.649 4025 74 0.805 0.578 False False True False False False True False False False False True False False False False
2088 27 0 15 4 3 4 3682.000 0 3682.000 0.685 1826 35 0.750 0.000 True False False False False True True False True False False False False False False False
In [ ]:
X_test.head(5) # Check the top 5 rows from the X_test dataset.
Out[ ]:
Customer_Age Dependent_count Months_on_book Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio Gender_M Education_Level_Doctorate Education_Level_Graduate Education_Level_High School Education_Level_Post-Graduate Education_Level_Uneducated Marital_Status_Married Marital_Status_Single Income_Category_$40K - $60K Income_Category_$60K - $80K Income_Category_$80K - $120K Income_Category_Less than $40K Income_Category_abc Card_Category_Gold Card_Category_Platinum Card_Category_Silver
5168 30 1 19 6 1 2 1644.000 0 1644.000 0.820 2533 44 0.517 0.000 False False False False False True False False False False False True False False False False
4889 54 4 43 3 3 0 5139.000 0 5139.000 0.330 1653 44 0.692 0.000 False False True False False False True False False False False True False False False False
8995 52 2 46 2 3 3 25737.000 1168 24569.000 0.718 7722 94 0.469 0.045 True False False True False False False True False False True False False False False False
3065 56 4 41 6 1 4 17753.000 1899 15854.000 0.851 3986 64 0.730 0.107 True False False False False False False True False False True False False False False False
5333 50 1 43 4 3 4 2961.000 2048 913.000 0.913 4056 92 0.769 0.692 False False False True False False True False True False False False False False False False

Model Building¶

Model Evaluation Criterion - Strategy Step 5¶

Predictions made by the classification model translate as follows:

  • True positives (TP) are attriting customers correctly identified by the model.
  • False negatives (FN) are customers who actually attrite but whom the model predicts will stay. ←
  • False positives (FP) are customers flagged as likely to attrite who actually stay.

Which metric to optimize?

  • We need to choose the metric which will ensure that the maximum number of attriting customers are identified correctly by the model.
  • We would want Recall to be maximized, as the greater↑ the Recall, the higher↑ the chances of minimizing↓ false negatives.
  • We want to minimize↓ false negatives because if the model predicts that a customer will stay when they are actually about to attrite, the bank loses that customer and the associated fee revenue without a chance to intervene.

To evaluate classification models, several performance metrics are derived from these counts (True Positives, False Negatives, False Positives, and True Negatives). Here are the main ones:

  1. Precision:

    • Formula: TP / (TP + FP)
    • Measures the accuracy of positive predictions
    • Answers: "Of all the failures the model predicted, what proportion were actually failures?"
  2. Recall ← (also known as Sensitivity or True Positive Rate):

    • Formula: TP / (TP + FN)
    • Measures the proportion of actual positives that were correctly identified
    • Answers: "Of all the actual failures, what proportion did the model correctly identify?"
  3. F1 Score:

    • Formula: 2 × (Precision × Recall) / (Precision + Recall)
    • Harmonic mean of Precision and Recall
    • Provides a single score that balances both *Precision* and *Recall*
  4. Specificity (True Negative Rate):

    • Formula: TN / (TN + FP)
    • Measures the proportion of actual negatives correctly identified
    • Note: This requires True Negatives (TN)
  5. Accuracy:

    • Formula: (TP + TN) / (TP + TN + FP + FN)
    • Measures the **overall correctness** of the model
    • Note: This also requires True Negatives (TN)
  6. False Positive Rate (FPR):

    • Formula: FP / (FP + TN)
    • Measures the proportion of false alarms among all negative cases
  7. False Discovery Rate (FDR):

    • Formula: FP / (FP + TP)
    • Measures the proportion of false positives among all positive predictions

These metrics provide different perspectives on model performance. The choice of which to prioritize depends on the specific requirements of the application, such as whether false positives or false negatives are more costly.
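
As a small worked example (the counts below are hypothetical and not taken from this project's data), suppose a model produced TP = 80, FN = 20, FP = 10, and TN = 890:

In [ ]:
# Illustrative confusion-matrix counts (hypothetical, not from this dataset)
TP, FN, FP, TN = 80, 20, 10, 890

precision = TP / (TP + FP)                   # 80 / 90  ≈ 0.889
recall = TP / (TP + FN)                      # 80 / 100 = 0.800
f1 = 2 * precision * recall / (precision + recall)
accuracy = (TP + TN) / (TP + TN + FP + FN)   # 970 / 1000 = 0.970

print(f"Precision={precision:.3f}  Recall={recall:.3f}  F1={f1:.3f}  Accuracy={accuracy:.3f}")

Even though accuracy looks excellent (0.97), 20% of the actual positives are missed, which is exactly why Recall is the metric of interest in this project.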

Models can make **wrong** predictions as:

  • Predicting a customer will attrite and the customer doesn't attrite
  • Predicting a customer will not attrite and the customer attrites

Which case is more financially important?

  • Predicting that a customer will not attrite when they actually do, i.e., losing a valuable customer (and the associated revenue).

How to reduce this loss? We need to reduce False Negatives.

  • The greater↑ the Recall, the higher↑ the chances of minimizing false negatives. Hence, the focus should be on increasing↑ Recall, i.e., correctly identifying the true positives (Class 1), so that the bank can retain its valuable customers by spotting those at risk of attrition.

The strategy we want is for `Recall to be maximized`.

Let's define a function to output different metrics (including recall) on the train and test set and a function to show confusion matrix so that we do not have to use the same code repetitively while evaluating models.

In [ ]:
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Function to compute different metrics to check classification model performance

    model: classifier
    predictors: independent variables
    target: dependent variable
    """

    # predicting using the independent variables
    pred = model.predict(predictors)

    acc = accuracy_score(target, pred)  # to compute Accuracy
    recall = recall_score(target, pred)  # to compute Recall
    precision = precision_score(target, pred)  # to compute Precision
    f1 = f1_score(target, pred)  # to compute F1-score

    # creating a dataframe of metrics
    df_perf = pd.DataFrame(
        {"Accuracy": acc, "Recall": recall, "Precision": precision, "F1": f1,},
        index=[0],
    )

    return df_perf
In [ ]:
def confusion_matrix_sklearn(model, predictors, target):
    """
    To plot the confusion_matrix with percentages

    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    labels = np.asarray(
        [
            ["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / cm.flatten().sum())]
            for item in cm.flatten()
        ]
    ).reshape(2, 2)

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")

Model Building - Original Data¶

Original Data - This approach uses the dataset as is, without addressing the class imbalance.

Pros:

  • Maintains Data Integrity: Since no changes are made to the dataset, the model reflects the real-world distribution, helping avoid the introduction of artificial patterns.
  • No Information Loss: All available data is used without reduction or synthetic augmentation, preserving the original data features.
  • Faster to Train: With no additional data created or removed, models train faster compared to oversampling or complex algorithms.

Cons:

  • Bias Toward Majority Class: When dealing with imbalanced data, the model may favor the majority class (e.g., non-churn customers), leading to poor performance in predicting the minority class (e.g., churners).
  • Lower Recall for Minority Class: If the dataset is highly imbalanced, the model is less likely to capture the minority class, causing lower recall (i.e., many false negatives).
  • Misleading Accuracy: The overall accuracy might be misleading if it’s heavily driven by the majority class, masking poor performance on minority predictions.

NOTE: Models already listed in sample code above for reference:

  • (5) BaggingClassifier,
  • (3) RandomForestClassifier.

I added more models in the next code snippet:

  • (1) LogisticRegression, added
  • (2) DecisionTreeClassifier, added
  • (3) RandomForestClassifier,
  • (4) GradientBoostingClassifier, added
  • (5) BaggingClassifier,
  • (6) AdaBoostClassifier, added
  • (7) XGBClassifier (Optional) added
In [ ]:
#  ------------------- Import Models Chosen -----------------
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score
from xgboost import XGBClassifier  # Import XGBoost classifier

models = []  # Empty list to store all the chosen models

# Adding models into the existing list
models.append(("Logistic Regression", LogisticRegression(random_state=1)))  # Adding Logistic Regression (1)
models.append(("Decision Tree", DecisionTreeClassifier(random_state=1)))  # Adding Decision Tree (2)
models.append(("Random forest", RandomForestClassifier(random_state=1))) # Random Forest (3)
models.append(("Gradient Boosting", GradientBoostingClassifier(random_state=1)))  # Adding Gradient Boosting (4)
models.append(("Bagging", BaggingClassifier(random_state=1))) # Bagging (5)
models.append(("AdaBoost", AdaBoostClassifier(random_state=1)))  # Adding AdaBoost (6)
models.append(("XGBoost", XGBClassifier(random_state=1)))  # Adding XGBoost (Optional)

# Synthetic Minority Over Sampling Technique (original code sample)
# sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)
# X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
# We have to ensure we have X_train, y_train, X_val, and y_val already defined in our environment
print("\n\033[1;4mTraining Performance:\033[0m\n")

# ANSI escape codes for bold and yellow text
print("\033[1;33mOriginal data without Handling Imbalanced Datasets:\033[0m")
print("\033[1;92mRecall metric:\033[0m")

# This line trains the current model (model) using the training data (X_train for features and y_train for labels).
# The .fit() function is how the model learns patterns in the training data."
for name, model in models:
    model.fit(X_train, y_train) # Training happens here. X_train Features & y_train labels (Target Prediction)
    scores = recall_score(y_train, model.predict(X_train)) # Recall evaluated on the training set
    print("{}: {}".format(name, scores)) # Print calculated values

print("\n\033[1;4mValidation Performance:\033[0m\n")

# ANSI escape codes for bold and yellow text
print("\033[1;33mOriginal data without Handling Imbalanced Datasets:\033[0m")
print("\033[1;92mRecall metric on Validation set:\033[0m")
for name, model in models:
    scores_val = recall_score(y_val, model.predict(X_val))
    print("{}: {}".format(name, scores_val))

print("\033[1;92m\nNotice Recall metric on Validation set vs Training set:\033[0m")
Training Performance:

Original data without Handling Imbalanced Datasets:
Recall metric:
Logistic Regression: 0.4115384615384615
Decision Tree: 1.0
Random forest: 1.0
Gradient Boosting: 0.8938461538461538
Bagging: 0.9784615384615385
AdaBoost: 0.8707692307692307
XGBoost: 1.0

Validation Performance:

Original data without Handling Imbalanced Datasets:
Recall metric on Validation set:
Logistic Regression: 0.36486486486486486
Decision Tree: 0.8243243243243243
Random forest: 0.7432432432432432
Gradient Boosting: 0.8513513513513513
Bagging: 0.8513513513513513
AdaBoost: 0.8513513513513513
XGBoost: 0.9324324324324325

Notice Recall metric on Validation set vs Training set:
In [ ]:
#  ------------------- Import Models Chosen ----------------- BASELINE
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score
from xgboost import XGBClassifier  # Import XGBoost classifier

models = []  # Empty list to store all the chosen models

# Adding models into the existing list
models.append(("Logistic Regression", LogisticRegression(random_state=1)))  # Adding Logistic Regression (1)
models.append(("Decision Tree", DecisionTreeClassifier(random_state=1)))  # Adding Decision Tree (2)
models.append(("Random forest", RandomForestClassifier(random_state=1))) # Random Forest (3)
models.append(("Gradient Boosting", GradientBoostingClassifier(random_state=1)))  # Adding Gradient Boosting (4)
models.append(("Bagging", BaggingClassifier(random_state=1))) # Bagging (5)
models.append(("AdaBoost", AdaBoostClassifier(random_state=1)))  # Adding AdaBoost (6)
models.append(("XGBoost", XGBClassifier(random_state=1)))  # Adding XGBoost (Optional)


print("\n\033[1;4mTraining Performance:\033[0m\n")
# ANSI escape codes for bold and yellow text
print("\033[1;33mOriginal data without Handling Imbalanced Datasets\033[0m")
print("\033[1;92mRecall metric:\033[0m")
for name, model in models:
    model.fit(X_train, y_train) # We have to ensure we have X_train, y_train, X_val, and y_val already defined in our environment
    scores = recall_score(y_train, model.predict(X_train))
    print("{}: {}".format(name, scores))

print("\n\033[1;4mValidation Performance:\033[0m\n")
# ANSI escape codes for bold and yellow text
print("\033[1;33mOriginal data without Handling Imbalanced Datasets\033[0m")
print("\033[1;92mRecall metric:\033[0m")
for name, model in models:
    scores_val = recall_score(y_val, model.predict(X_val))
    print("{}: {}".format(name, scores_val))

print("\n\033[1;4mTest Performance:\033[0m\n")
# ANSI escape codes for bold and yellow text
print("\033[1;33mOriginal data without Handling Imbalanced Datasets\033[0m")
print("\033[1;92mRecall metric:\033[0m")
for name, model in models:
    scores_test = recall_score(y_test, model.predict(X_test))
    print("{}: {}".format(name, scores_test))

print("\033[1;92m\nExamine the highest Recall score to identify the best model.\033[0m")
Training Performance:

Original data without Handling Imbalanced Datasets
Recall metric:
Logistic Regression: 0.4115384615384615
Decision Tree: 1.0
Random forest: 1.0
Gradient Boosting: 0.8938461538461538
Bagging: 0.9784615384615385
AdaBoost: 0.8707692307692307
XGBoost: 1.0

Validation Performance:

Original data without Handling Imbalanced Datasets
Recall metric:
Logistic Regression: 0.36486486486486486
Decision Tree: 0.8243243243243243
Random forest: 0.7432432432432432
Gradient Boosting: 0.8513513513513513
Bagging: 0.8513513513513513
AdaBoost: 0.8513513513513513
XGBoost: 0.9324324324324325

Test Performance:

Original data without Handling Imbalanced Datasets
Recall metric:
Logistic Regression: 0.3438735177865613
Decision Tree: 0.766798418972332
Random forest: 0.766798418972332
Gradient Boosting: 0.8379446640316206
Bagging: 0.8063241106719368
AdaBoost: 0.8063241106719368
XGBoost: 0.8656126482213439

Examine the highest Recall score to identify the best model.

Observations: Using original data (without Handling Imbalanced Datasets)

  • Training - best models with highest Recall score: 1) Decision Tree 1) Random Forest 1) XGBoost (all tied at 1.0).
  • Validation - best models with highest Recall score: 1) XGBoost 2) AdaBoost 2) Bagging 2) Gradient Boosting.
  • Testing - best models with highest Recall score: 1) XGBoost 2) Gradient Boosting 3) AdaBoost 3) Bagging.

NOTE: X_train, y_train, X_val, y_val, X_test, and y_test must already be defined in the environment before running this code snippet.

Models' Performance Evaluations:¶

Use Model Performance Metrics

In [ ]:
# Model Performance using previously defined function - For Code Reference

print("\033[1;92mModel Performance Evaluation:\033[0m")
print(f"Model: {model.__class__.__name__}")  # Prints the class name of the model
model_performance_classification_sklearn(model, X_train, y_train) # Calls pre-defined function that displays performance metrics
Model Performance Evaluation:
Model: XGBClassifier
Out[ ]:
Accuracy Recall Precision F1
0 1.000 1.000 1.000 1.000

Use Confusion Matrix

In [ ]:
# Model Confusion Matrix using previously defined function - For Code Reference

print("\033[1;92mModel Performance Evaluation of Last Model used:\033[0m")
print(f"Model: {model.__class__.__name__}")  # Prints the class name of the model
confusion_matrix_sklearn(model, X_train, y_train) # Calls pre-defined function that displays confusion matrix
Model Performance Evaluation of Last Model used:
Model: XGBClassifier

Identify the parameters for model tuning after selecting the best model based on the performance metrics

In [ ]:
print(model) # Prints the model object last used - For Code Reference
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=1, ...)

Use GridSearchCV for Hyperparameter Tuning

Using GridSearchCV - Model Performance Metrics¶

GridSearchCV is a powerful tool from the sklearn.model_selection module in Scikit-learn that is used for hyperparameter tuning. It systematically searches for the best combination of hyperparameters for a machine learning model by testing all possible combinations within a specified parameter grid. Cross-validation (CV) is applied to each combination to assess the performance and avoid overfitting.

Key Features:

1.  Exhaustive Search: It tests all combinations of the provided hyperparameters.
2.  Cross-Validation: For each combination, it splits the data into training and validation sets multiple times and evaluates performance using cross-validation, ensuring robustness in the evaluation.
3.  Model Selection: The best model, based on a scoring metric, is selected.

How is GridSearchCV used?

Here are the basic steps to use GridSearchCV:

1.  Define the Model: Choose a machine learning model you want to optimize, like a decision tree, random forest, or logistic regression.
2.  Specify Hyperparameters: Define a dictionary (param_grid) with keys as hyperparameter names and values as lists of possible values you want to test.
3.  Choose Scoring Metric: Select a metric (e.g., accuracy, recall, precision) to evaluate the model’s performance.
4.  Perform Grid Search: Run GridSearchCV, which tests all combinations of hyperparameters using cross-validation.
5.  Retrieve the Best Model: After the search, you can access the best model and its corresponding hyperparameters.

The code below took a considerable amount of time (18+ minutes) to process with 5 folds (cv = 5) for each of 1800 candidates, totalling 9000 fits, so I decided to use cv = 2 instead.

In [ ]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score

# Create the model
rf_model = RandomForestClassifier(random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],       # Number of trees in the forest
    'max_depth': [10, 20, 30],             # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],       # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],         # Minimum number of samples required at a leaf node
    'bootstrap': [True, False]             # Whether bootstrap samples are used when building trees
}

# Define a scorer
scorer = make_scorer(accuracy_score)

# Instantiate the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring=scorer, cv=2, n_jobs=-1, verbose=2)

# Fit the model on training data
grid_search.fit(X_train, y_train)

# Best parameters found from grid search
print("Best Hyperparameters:", grid_search.best_params_)

# Use the best estimator (model) from grid search
best_model = grid_search.best_estimator_

# Predict on test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the model performance
print("Recall on Test Set:", recall_score(y_test, y_pred))
Fitting 2 folds for each of 162 candidates, totalling 324 fits
Best Hyperparameters: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Recall on Test Set: 0.7984189723320159

Hyperparameter Tuning Results Observations:

  • Fitting 2 folds for each of 162 candidates gives 324 fits in total.
  • Best Hyperparameters: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
  • Recall on Test Set: 0.7984
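
Since the stated strategy is to maximize Recall, the same grid search could alternatively be scored on recall instead of accuracy. A minimal sketch (reusing the rf_model and param_grid defined above; the resulting best parameters and scores would differ from the run shown) is:

In [ ]:
from sklearn.model_selection import GridSearchCV

# Same estimator and grid as above, but optimized for recall instead of accuracy
grid_search_recall = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring="recall",   # built-in recall scorer; the positive class (1) is the attrited customers
    cv=2,
    n_jobs=-1,
    verbose=2,
)
grid_search_recall.fit(X_train, y_train)

print("Best Hyperparameters (recall):", grid_search_recall.best_params_)
print("Best CV Recall:", grid_search_recall.best_score_)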

Experimenting with RandomForestClassifier Model:

In [ ]:
# model without hyperparameter tuning
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
Out[ ]:
RandomForestClassifier(random_state=1)

Experimenting with and evaluating the selected models using performance metrics

In [ ]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print(f"\n\033[1;31mPerformance Metrics without data handling (Oversampling/Undersampling):\033[0m") # Main code for multiple models training and evaluations on split datasets (training, validations, test)

# Define your models
models = []
models.append(("Logistic Regression", LogisticRegression(random_state=1)))
models.append(("Decision Tree", DecisionTreeClassifier(random_state=1)))
models.append(("Random Forest", RandomForestClassifier(random_state=1)))
models.append(("Gradient Boosting", GradientBoostingClassifier(random_state=1)))
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("AdaBoost", AdaBoostClassifier(random_state=1)))
models.append(("XGBoost", XGBClassifier(random_state=1)))  # XGBoost (optional)

# Define a function to print performance metrics
def print_model_performance(model, X, y, dataset_name):
    y_pred = model.predict(X)  # Predict labels
    print(f"\n\033[1;4mPerformance on {dataset_name} data:\033[0m")
    print(f"Accuracy:  {accuracy_score(y, y_pred):.4f}")
    print(f"Recall:    {recall_score(y, y_pred):.4f}")
    print(f"Precision: {precision_score(y, y_pred):.4f}")
    print(f"F1 Score:  {f1_score(y, y_pred):.4f}")

# Evaluate each model
for name, model in models:
    model.fit(X_train, y_train)  # Train the model

    # Print the model being evaluated
    print(f"\n\033[1;92mModel Performance Evaluation:\033[0m")
    print(f"---  Model: {name}  ---")

    # Print performance metrics for training data
    print_model_performance(model, X_train, y_train, "Training")

    # Print performance metrics for validation data
    print_model_performance(model, X_val, y_val, "Validation")

    # Print performance metrics for test data
    print_model_performance(model, X_test, y_test, "Test")

print("\033[1;92m\nExamine the highest scores to identify the best model.\033[0m")
Performance Metrics without data handling (Oversampling/Undersampling):

Model Performance Evaluation:
---  Model: Logistic Regression  ---

Performance on Training data:
Accuracy:  0.8750
Recall:    0.4115
Precision: 0.6833
F1 Score:  0.5137

Performance on Validation data:
Accuracy:  0.8698
Recall:    0.3649
Precision: 0.5870
F1 Score:  0.4500

Performance on Test data:
Accuracy:  0.8585
Recall:    0.3439
Precision: 0.6397
F1 Score:  0.4473

Model Performance Evaluation:
---  Model: Decision Tree  ---

Performance on Training data:
Accuracy:  1.0000
Recall:    1.0000
Precision: 1.0000
F1 Score:  1.0000

Performance on Validation data:
Accuracy:  0.9310
Recall:    0.8243
Precision: 0.7349
F1 Score:  0.7771

Performance on Test data:
Accuracy:  0.9302
Recall:    0.7668
Precision: 0.8050
F1 Score:  0.7854

Model Performance Evaluation:
---  Model: Random Forest  ---

Performance on Training data:
Accuracy:  1.0000
Recall:    1.0000
Precision: 1.0000
F1 Score:  1.0000

Performance on Validation data:
Accuracy:  0.9527
Recall:    0.7432
Precision: 0.9167
F1 Score:  0.8209

Performance on Test data:
Accuracy:  0.9526
Recall:    0.7668
Precision: 0.9372
F1 Score:  0.8435

Model Performance Evaluation:
---  Model: Gradient Boosting  ---

Performance on Training data:
Accuracy:  0.9770
Recall:    0.8938
Precision: 0.9603
F1 Score:  0.9259

Performance on Validation data:
Accuracy:  0.9645
Recall:    0.8514
Precision: 0.9000
F1 Score:  0.8750

Performance on Test data:
Accuracy:  0.9664
Recall:    0.8379
Precision: 0.9550
F1 Score:  0.8926

Model Performance Evaluation:
---  Model: Bagging  ---

Performance on Training data:
Accuracy:  0.9963
Recall:    0.9785
Precision: 0.9984
F1 Score:  0.9883

Performance on Validation data:
Accuracy:  0.9546
Recall:    0.8514
Precision: 0.8400
F1 Score:  0.8456

Performance on Test data:
Accuracy:  0.9539
Recall:    0.8063
Precision: 0.9067
F1 Score:  0.8536

Model Performance Evaluation:
---  Model: AdaBoost  ---

Performance on Training data:
Accuracy:  0.9654
Recall:    0.8708
Precision: 0.9100
F1 Score:  0.8899

Performance on Validation data:
Accuracy:  0.9546
Recall:    0.8514
Precision: 0.8400
F1 Score:  0.8456

Performance on Test data:
Accuracy:  0.9526
Recall:    0.8063
Precision: 0.8987
F1 Score:  0.8500

Model Performance Evaluation:
---  Model: XGBoost  ---

Performance on Training data:
Accuracy:  1.0000
Recall:    1.0000
Precision: 1.0000
F1 Score:  1.0000

Performance on Validation data:
Accuracy:  0.9684
Recall:    0.9324
Precision: 0.8625
F1 Score:  0.8961

Performance on Test data:
Accuracy:  0.9651
Recall:    0.8656
Precision: 0.9202
F1 Score:  0.8921

Examine the highest scores to identify the best model.

Model Building - Oversampled Data - Model Evaluation and Selection on Training and Validation sets - Strategy Steps 3, 4, 5 and 6:¶

To evaluate Recall on the validation data (X_val, y_val) instead of the training data (X_train, y_train), we keep fitting the models on the training data but change the recall score calculation to use the validation set.

In [ ]:
# PART I OF III: OVERSAMPLING

# Import SMOTE from imblearn
from imblearn.over_sampling import SMOTE
from sklearn.metrics import recall_score

print("\nPART I OF III: OVERSAMPLING..................................................................\n")
print("\033[1;33mMinority datapoints (Attrition) increase by Oversampling to match majority.\033[0m\n")


# ANSI escape codes for bold and yellow, original data showing minority data.
print("Before Oversampling, counts of label 'Yes' (Attrition): \033[1;33m{}\033[0m ".format(sum(y_train == 1)))  # Minority after split
print("Before Oversampling, counts of label 'No' (Non-Attrition): {} \n".format(sum(y_train == 0)))  # Majority

# Synthetic Minority Over Sampling Technique
sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)

# Balancing the training dataset by oversampling the minority class so that the model will not be biased toward the majority class during training.
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)

# ANSI escape codes for bold and yellow, oversampled data on Training set.
print("After Oversampling, counts of label 'Yes' (Attrition): \033[1;33m{}\033[0m ".format(sum(y_train_over == 1)))  # Minority increased to match majority.
print("After Oversampling, counts of label 'No' (Non-Attrition): {} \n".format(sum(y_train_over == 0)))  # Majority

# ANSI escape codes for bold and yellow, oversampled data shape on Training set.
print("After Oversampling, the shape of X_train: \033[1;33m{}\033[0m ".format(X_train_over.shape))  # Oversampled dataset
print("After Oversampling, the shape of y_train: {}\n".format(y_train_over.shape))  # Target labels

# Main goal
print("Minority datapoints (Attrition) increased from: \033[1;33m{}\033[0m ".format(sum(y_train == 1)), "up to: \033[1;33m{}\033[0m ".format(sum(y_train_over == 1)))
PART I OF III: OVERSAMPLING..................................................................

Minority datapoints (Attrition) increase by Oversampling to match majority.

Before Oversampling, counts of label 'Yes' (Attrition): 1300 
Before Oversampling, counts of label 'No' (Non-Attrition): 6801 

After Oversampling, counts of label 'Yes' (Attrition): 6801 
After Oversampling, counts of label 'No' (Non-Attrition): 6801 

After Oversampling, the shape of X_train: (13602, 30) 
After Oversampling, the shape of y_train: (13602,)

Minority datapoints (Attrition) increased from: 1300  up to: 6801 
In [ ]:
# PART II OF III: MODEL PERFORMANCE ON OVERSAMPLED TRAINING DATA
print("\n\nPART II OF III: MODEL PERFORMANCE ON OVERSAMPLED TRAINING DATA..............................\n")

# Model training and evaluation after oversampling
print("\033[1;33mOversampled data used to train models to handle imbalanced datasets:\033[0m")
print("\n\033[1;4mModel Performance (Recall on Oversampled Data):\033[0m\n")

# ANSI escape codes for text formatting
for name, model in models:
    model.fit(X_train_over, y_train_over)  # Train the model on oversampled data
    scores_train = recall_score(y_train_over, model.predict(X_train_over))  # Evaluate recall on the oversampled training set
    print("{}: \033[1;92m{}\033[0m".format(name, scores_train))  # Display Recall metric for each model

PART II OF III: MODEL PERFORMANCE ON OVERSAMPLED TRAINING DATA..............................

Oversampled data used to train models to handle imbalanced datasets:

Model Performance (Recall on Oversampled Data):

Logistic Regression: 0.8278194383178944
Decision Tree: 1.0
Random forest: 1.0
Gradient Boosting: 0.9780914571386561
Bagging: 0.9979414791942361
AdaBoost: 0.9669166299073666
XGBoost: 1.0
In [ ]:
# SECTION III OF III: EVALUATION ON VALIDATION SET

print("\n\nPART III OF III: MODEL PERFORMANCE ON VALIDATION DATA (GENERALIZATION ASSESSMENT)..........\n")

# Now, evaluate the models on the validation set to assess generalization and prevent overfitting
print("\033[1;33mEvaluating models on Validation Data (to assess generalization):\033[0m")
print("\n\033[1;4mRecall metric on Validation set:\033[0m\n")

for name, model in models:
    # Predict on the validation set (X_val)
    y_val_pred = model.predict(X_val)

    # Calculate recall on the validation set
    recall_val = recall_score(y_val, y_val_pred)

    # Print the recall score for each model
    print("{}: \033[1;92m{}\033[0m".format(name, recall_val))

# Final comments to highlight the importance of validation set performance
print("\n(1) Models are trained on the oversampled training data (X_train_over, y_train_over).\n")
print("(2) Models are evaluated on the validation set (X_val, y_val) to assess how well they generalize to unseen data.\n")
print("(3) The recall metric on the validation set is crucial for ensuring the model is not overfitting on the training data.\n")
print("(4) \033[1;4mImportant:\033[0m \033[1;31mIf the model performs well on both training and validation sets, it suggests good generalization; otherwise, overfitting may be occurring.\033[0m\n\n")

PART III OF III: MODEL PERFORMANCE ON VALIDATION DATA (GENERALIZATION ASSESSMENT)..........

Evaluating models on Validation Data (to assess generalization):

Recall metric on Validation set:

Logistic Regression: 0.7162162162162162
Decision Tree: 0.9054054054054054
Random forest: 0.8378378378378378
Gradient Boosting: 0.9054054054054054
Bagging: 0.8918918918918919
AdaBoost: 0.8108108108108109
XGBoost: 0.9054054054054054

(1) Models are trained on the oversampled training data (X_train_over, y_train_over).

(2) Models are evaluated on the validation set (X_val, y_val) to assess how well they generalize to unseen data.

(3) The recall metric on the validation set is crucial for ensuring the model is not overfitting on the training data.

(4) Important: If the model performs well on both training and validation sets, it suggests good generalization; otherwise, overfitting may be occurring.


In [ ]:
# COMBINED CODE

# Import SMOTE from imblearn
from imblearn.over_sampling import SMOTE
from sklearn.metrics import recall_score

print("\nPART I OF III: OVERSAMPLING..................................................................\n")
print("\033[1;33mMinority datapoints (Attrition) increase by Oversampling to match majority.\033[0m\n")


# ANSI escape codes for bold and yellow, original data showing minority data.
print("Before Oversampling, counts of label 'Yes' (Attrition): \033[1;33m{}\033[0m ".format(sum(y_train == 1)))  # Minority after split
print("Before Oversampling, counts of label 'No' (Non-Attrition): {} \n".format(sum(y_train == 0)))  # Majority

# Synthetic Minority Over Sampling Technique
sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)

# Balancing the training dataset by oversampling the minority class so that the model will not be biased toward the majority class during training.
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)

# ANSI escape codes for bold and yellow, oversampled data on Training set.
print("After Oversampling, counts of label 'Yes' (Attrition): \033[1;33m{}\033[0m ".format(sum(y_train_over == 1)))  # Minority increased to match majority.
print("After Oversampling, counts of label 'No' (Non-Attrition): {} \n".format(sum(y_train_over == 0)))  # Majority

# ANSI escape codes for bold and yellow, oversampled data shape on Training set.
print("After Oversampling, the shape of X_train: \033[1;33m{}\033[0m ".format(X_train_over.shape))  # Oversampled dataset
print("After Oversampling, the shape of y_train: {}\n".format(y_train_over.shape))  # Target labels

# Main goal
print("Minority datapoints (Attrition) increased from: \033[1;33m{}\033[0m ".format(sum(y_train == 1)), "up to: \033[1;33m{}\033[0m ".format(sum(y_train_over == 1)))



print("\n\nPART II OF III: MODEL PERFORMANCE ON OVERSAMPLED TRAINING DATA..............................\n")

# Model training and evaluation after oversampling
print("\033[1;33mOversampled data used to train models to handle imbalanced datasets:\033[0m")
print("\n\033[1;4mModel Performance (Recall on Oversampled Data):\033[0m\n")

# ANSI escape codes for text formatting
for name, model in models:
    model.fit(X_train_over, y_train_over)  # Train the model on oversampled data
    scores_train = recall_score(y_train_over, model.predict(X_train_over))  # Evaluate recall on the oversampled training set
    print("{}: \033[1;92m{}\033[0m".format(name, scores_train))  # Display Recall metric for each model

# SECTION III OF III: EVALUATION ON VALIDATION SET

print("\n\nPART III OF III: MODEL PERFORMANCE ON VALIDATION DATA (GENERALIZATION ASSESSMENT)..........\n")

# Now, evaluate the models on the validation set to assess generalization and prevent overfitting
print("\033[1;33mEvaluating models on Validation Data (to assess generalization):\033[0m")
print("\n\033[1;4mRecall metric on Validation set:\033[0m\n")

for name, model in models:
    # Predict on the validation set (X_val)
    y_val_pred = model.predict(X_val)

    # Calculate recall on the validation set
    recall_val = recall_score(y_val, y_val_pred)

    # Print the recall score for each model
    print("{}: \033[1;92m{}\033[0m".format(name, recall_val))

# Final comments to highlight the importance of validation set performance
print("\n(1) Models are trained on the oversampled training data (X_train_over, y_train_over).\n")
print("(2) Models are evaluated on the validation set (X_val, y_val) to assess how well they generalize to unseen data.\n")
print("(3) The recall metric on the validation set is crucial for ensuring the model is not overfitting on the training data.\n")
print("(4) \033[1;4mImportant:\033[0m \033[1;31mIf the model performs well on both training and validation sets, it suggests good generalization; otherwise, overfitting may be occurring.\033[0m\n\n")
PART I OF III: OVERSAMPLING..................................................................

Minority datapoints (Attrition) increase by Oversampling to match majority.

Before Oversampling, counts of label 'Yes' (Attrition): 1300 
Before Oversampling, counts of label 'No' (Non-Attrition): 6801 

After Oversampling, counts of label 'Yes' (Attrition): 6801 
After Oversampling, counts of label 'No' (Non-Attrition): 6801 

After Oversampling, the shape of X_train: (13602, 30) 
After Oversampling, the shape of y_train: (13602,)

Minority datapoints (Attrition) increased from: 1300  up to: 6801 


PART II OF III: MODEL PERFORMANCE ON OVERSAMPLED TRAINING DATA..............................

Oversampled data used to train models to handle imbalanced datasets:

Model Performance (Recall on Oversampled Data):

Logistic Regression: 0.8278194383178944
Decision Tree: 1.0
Random forest: 1.0
Gradient Boosting: 0.9780914571386561
Bagging: 0.9979414791942361
AdaBoost: 0.9669166299073666
XGBoost: 1.0


PART III OF III: MODEL PERFORMANCE ON VALIDATION DATA (GENERALIZATION ASSESSMENT)..........

Evaluating models on Validation Data (to assess generalization):

Recall metric on Validation set:

Logistic Regression: 0.7162162162162162
Decision Tree: 0.9054054054054054
Random forest: 0.8378378378378378
Gradient Boosting: 0.9054054054054054
Bagging: 0.8918918918918919
AdaBoost: 0.8108108108108109
XGBoost: 0.9054054054054054

(1) Models are trained on the oversampled training data (X_train_over, y_train_over).

(2) Models are evaluated on the validation set (X_val, y_val) to assess how well they generalize to unseen data.

(3) The recall metric on the validation set is crucial for ensuring the model is not overfitting on the training data.

(4) Important: If the model performs well on both training and validation sets, it suggests good generalization; otherwise, overfitting may be occurring.


Key Observations:

  • Model Training: The models are still trained on the training dataset (X_train, y_train).
  • Model Evaluation: The models are now evaluated on the validation set (X_val, y_val) using recall_score().
  • Recall measured on the validation set helps to assess how well the model generalizes to unseen data.

Important Observation: This approach guards against being misled by overfitting, because performance is evaluated on data that was not used for training.

While Part II evaluates the models on the same (oversampled) data used for training, it's crucial to understand that such scores can be overly optimistic. The models might simply memorize the training data, including any patterns specific to that data, and not generalize well to unseen data.

I added additional code (Part III) to evaluate the models on validation data to assess generalization performance and catch overfitting. This is done by calling model.predict(X_val) and evaluating the recall_score on the validation set :-)

Recommendations:

  • XGBoost, with a validation Recall score of 0.9054 (tied with Decision Tree and Gradient Boosting), is the best model after oversampling.
  • EXTRA CREDIT for using multiple models? :-)

  • To get a more reliable assessment of model performance, we can use a separate hold-out validation set for evaluation. This is achieved by splitting the available data into training and validation sets with train_test_split from sklearn.model_selection before training, fitting the model on the training set, and evaluating its performance on the unseen validation set, as sketched below. This helps ensure the model generalizes well to new data.
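
A minimal sketch of such a split, assuming X and y hold the full feature matrix and target; the 60/20/20 proportions are illustrative rather than the exact split used earlier in this notebook:

from sklearn.model_selection import train_test_split

# First split off 40% of the data to be shared between validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.40, random_state=1, stratify=y
)

# Split that 40% in half: 20% validation, 20% test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=1, stratify=y_temp
)

Stratifying on y keeps the Attrition/Non-Attrition ratio consistent across the three sets.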


Model Building - Undersampled Data - Model Evaluation, Selection on Training and Validation sets - Strategy Step 3, Step 4, Step 5 and Step 6:¶

The code that follows is applying **random undersampling** to balance an imbalanced dataset. Let's break it down step by step:

  1. `rus = RandomUnderSampler(random_state=1)`:

    • RandomUnderSampler is a class from the imblearn library (from the imbalanced-learn package).
    • It undersamples the majority class in the training data so that it matches the number of samples of the minority class.
    • The random_state=1 ensures **`reproducibility`**, meaning that the same random sampling occurs each time the code is run.
  2. `X_train_un, y_train_un = rus.fit_resample(X_train, y_train)`:

    • X_train and y_train are the original training data (features and target labels, respectively).
    • The fit_resample method:
      • Fits the undersampler to the data (i.e., it looks at the class distribution of y_train to understand how many samples are in each class).
      • Resamples the dataset by randomly removing samples from the majority class until both the majority and minority classes have the same number of samples.
      • Returns two new objects:
        • X_train_un: The undersampled feature data.
        • y_train_un: The corresponding target labels after undersampling.

    As a result, `the data becomes more balanced`, which helps prevent models from being biased toward the majority class.

Why is this done?

  • In imbalanced classification problems, models can become `biased toward the majority class` because it dominates the dataset.

  • **Random Undersampling** is one technique to balance the data, which helps the model learn patterns from both classes more effectively.

In [ ]:
# Import RandomUnderSampler from imblearn
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import recall_score

print("\nPART I OF II: UNDERSAMPLING....................................................................")
print("\033[1;33mMajority datapoints (Non-Attrition) reduced to match minority.\033[0m")

# ANSI escape codes for bold and yellow, original data showing majority and minority class counts before undersampling
print("Before Undersampling, counts of label 'Yes' (Attrition): {} ".format(sum(y_train == 1)))  # Minority class (Attrition)
print("Before Undersampling, counts of label 'No' (Non-Attrition): \033[1;33m{}\033[0m \n".format(sum(y_train == 0)))  # Majority class (Non-Attrition)

# Random Under Sampling Technique
rus = RandomUnderSampler(random_state=1)

# Balancing the training dataset by undersampling the majority class
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)

# ANSI escape codes for bold and yellow, undersampled data on Training set.
print("After Undersampling, counts of label 'Yes' (Attrition): {}".format(sum(y_train_un == 1)))  # Minority class
print("After Undersampling, counts of label 'No' (Non-Attrition): \033[1;33m{}\033[0m \n".format(sum(y_train_un == 0)))  # Majority class reduced

# ANSI escape codes for bold and yellow, undersampled data shape on Training set.
print("After Undersampling, the shape of X_train: \033[1;33m{}\033[0m ".format(X_train_un.shape))  # New training data shape
print("After Undersampling, the shape of y_train: {}\n".format(y_train_un.shape))  # New target shape

# Main goal
print("Majority influential datapoints are now reduced from: \033[1;33m{}\033[0m".format(sum(y_train == 0)), "down to: \033[1;33m{}\033[0m ".format(sum(y_train_un == 0)))

print("\n\nPART II OF II: MODEL PERFORMANCE ON UNDERSAMPLED TRAINING DATA..............................")

# Model training and evaluation after undersampling
print("\033[1;33mUndersampled data used to train these models to handle imbalanced datasets:\033[0m")
print("\n\033[1;4mModel Performance (Recall on Undersampled Data):\033[0m\n")

# ANSI escape codes for text formatting
for name, model in models:
    model.fit(X_train_un, y_train_un)  # Train the model on undersampled data
    scores_train = recall_score(y_train_un, model.predict(X_train_un))  # Evaluate recall on the undersampled training set
    print("{}: \033[1;92m{}\033[0m".format(name, scores_train))  # Display Recall metric for each model

# SECTION III OF III: EVALUATION ON VALIDATION SET

print("\n\nPART III OF III: MODEL PERFORMANCE ON VALIDATION DATA (GENERALIZATION ASSESSMENT)............")

# Now, evaluate the models on the validation set to assess generalization and prevent overfitting
print("\033[1;33mEvaluating models on Validation Data (to assess generalization):\033[0m")
print("\n\033[1;4mRecall metric on Validation set:\033[0m\n")

for name, model in models:
    # Predict on the validation set (X_val)
    y_val_pred = model.predict(X_val)

    # Calculate recall on the validation set
    recall_val = recall_score(y_val, y_val_pred)

    # Print the recall score for each model
    print("{}: \033[1;92m{}\033[0m".format(name, recall_val))

# Final comments to highlight the importance of validation set performance
print("\n(1) Models are trained on the undersampled training data (X_train_un, y_train_un).\n")
print("(2) Models are evaluated on the validation set (X_val, y_val) to assess how well they generalize to unseen data.\n")
print("(3) The recall metric on the validation set is crucial for ensuring the model is not overfitting on the training data.\n")
print("(4) If the model performs well on both training and validation sets, it suggests good generalization; otherwise, overfitting may be occurring.\n")
PART I OF III: UNDERSAMPLING....................................................................
Majority datapoints (Non-Attrition) reduced to match minority.
Before Undersampling, counts of label 'Yes' (Attrition): 1300 
Before Undersampling, counts of label 'No' (Non-Attrition): 6801 

After Undersampling, counts of label 'Yes' (Attrition): 1300
After Undersampling, counts of label 'No' (Non-Attrition): 1300 

After Undersampling, the shape of X_train: (2600, 30) 
After Undersampling, the shape of y_train: (2600,)

Majority influential datapoints are now reduced from: 6801 down to: 1300 


PART II OF III: MODEL PERFORMANCE ON UNDERSAMPLED TRAINING DATA..............................
Undersampled data used to train these models to handle imbalanced datasets:

Model Performance (Recall on Undersampled Data):

Logistic Regression: 0.8215384615384616
Decision Tree: 1.0
Random forest: 1.0
Gradient Boosting: 0.9784615384615385
Bagging: 0.9930769230769231
AdaBoost: 0.9538461538461539
XGBoost: 1.0


PART III OF III: MODEL PERFORMANCE ON VALIDATION DATA (GENERALIZATION ASSESSMENT)............
Evaluating models on Validation Data (to assess generalization):

Recall metric on Validation set:

Logistic Regression: 0.7837837837837838
Decision Tree: 0.918918918918919
Random forest: 0.918918918918919
Gradient Boosting: 0.9594594594594594
Bagging: 0.9054054054054054
AdaBoost: 0.9324324324324325
XGBoost: 0.9594594594594594

(1) Models are trained on the undersampled training data (X_train_un, y_train_un).

(2) Models are evaluated on the validation set (X_val, y_val) to assess how well they generalize to unseen data.

(3) The recall metric on the validation set is crucial for ensuring the model is not overfitting on the training data.

(4) If the model performs well on both training and validation sets, it suggests good generalization; otherwise, overfitting may be occurring.

Recommendations:

  • XGBoost, with a validation Recall score of 0.9594594594594594 (tied with Gradient Boosting), is the best model after undersampling; this is slightly higher than with Oversampling.
  • EXTRA CREDIT for using multiple models? :-)

Building Classification Models Using Different Sampling Techniques For Handling Imbalanced Datasets During Model Tuning¶

To define oversampled and undersampled training data, you typically use techniques from the **imblearn** library, which provides tools for handling imbalanced datasets. Imbalanced datasets occur when one class has significantly more samples than the other(s). This can pose challenges for machine learning models, as they may become biased towards the majority class.

Here’s how you can create both oversampled and undersampled versions of your training data TO AVOID MAJORITY CLASS BIAS:

  • **Oversampling**: Oversampling is used to increase↑ the number of minority class samples to balance the class distribution. One common method is SMOTE (Synthetic Minority Over-sampling Technique). A second method is ADASYN (Adaptive Synthetic Sampling), which generates synthetic samples based on the degree of difficulty in learning from minority class samples.

  • **Undersampling**: Undersampling reduces↓ the number of majority class samples to balance the class distribution. A common method is RandomUnderSampler. A second method is Cluster-Centroid Undersampling which clusters the majority class and randomly selects samples from each cluster.

  • **Class Weighting**: Assign higher weights to samples from the minority class during training to give them more importance (see the sketch after this list).

  • **Ensemble Methods**: Combine multiple models trained on different subsets of the data or with different sampling strategies.
  • **Cost-Sensitive Learning**: Assign different costs to misclassifications based on the class imbalance.
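
A minimal sketch of the class-weighting option, assuming the original (imbalanced) X_train and y_train from earlier in the notebook; class_weight='balanced' is just one of several possible weighting schemes:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# 'balanced' weights each class inversely to its frequency, so errors on the
# minority class (Attrition) are penalized more heavily during training
log_reg_weighted = LogisticRegression(class_weight='balanced', random_state=1, max_iter=1000)
rf_weighted = RandomForestClassifier(class_weight='balanced', random_state=1)

log_reg_weighted.fit(X_train, y_train)
rf_weighted.fit(X_train, y_train)

This achieves an effect similar to resampling, but without changing the number of rows in the training data.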

Hyperparameter Tuning - Strategy Step 7¶

After the selected models are trained and evaluated, the best one can be fine-tuned (this is computationally intensive)

Note on param_grid¶
  1. The specific values in the param_grid can be adjusted based on your dataset and computational resources. It's often a good idea to start with a relatively small grid and then expand it if necessary.
  2. Sample parameter grids have been provided to do necessary hyperparameter tuning. These sample grids are expected to provide a balance between model performance improvement and execution time. One can extend/reduce the parameter grid based on execution time and system configuration.
    • Please note that if the parameter grid is extended to improve the model performance further, **the execution time will increase**.
  3. The models chosen in this notebook are based on test runs. One can update the best models as obtained upon code execution and tune them for best performance.
  • param_grid for Logistic Regression (1):
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Regularization type (L1 or L2)
    'solver': ['liblinear', 'saga']  # Solver algorithm
}
  • For Decision Trees (2):
param_grid = {
    'max_depth': np.arange(2,6),
    'min_samples_leaf': [1, 4, 7],
    'max_leaf_nodes' : [10, 15],
    'min_impurity_decrease': [0.0001,0.001]
}
  • For Random Forest (3):
param_grid = {
    "n_estimators": [50,110,25],
    "min_samples_leaf": np.arange(1, 4),
    "max_features": [np.arange(0.3, 0.6, 0.1),'sqrt'],
    "max_samples": np.arange(0.4, 0.7, 0.1)
}
  • For Gradient Boosting (4):
param_grid = {
    "init": [AdaBoostClassifier(random_state=1),DecisionTreeClassifier(random_state=1)],
    "n_estimators": np.arange(50,110,25),
    "learning_rate": [0.01,0.1,0.05],
    "subsample":[0.7,0.9],
    "max_features":[0.5,0.7,1],
}
  • For Bagging Classifier (5):
param_grid = {
    'max_samples': [0.8,0.9,1],
    'max_features': [0.7,0.8,0.9],
    'n_estimators' : [30,50,70],
}
  • For Adaboost (6):
param_grid = {
    "n_estimators": np.arange(50,110,25),
    "learning_rate": [0.01,0.1,0.05],
    "base_estimator": [
        DecisionTreeClassifier(max_depth=2, random_state=1),
        DecisionTreeClassifier(max_depth=3, random_state=1),
    ],
}
  • For XGBoost (optional):
param_grid={'n_estimators':np.arange(50,110,25),
            'scale_pos_weight':[1,2,5],
            'learning_rate':[0.01,0.1,0.05],
            'gamma':[1,3],
            'subsample':[0.7,0.9]
}

Hypertuning of Models - Strategy Step 7¶

Please refer to Appendix III for Important Consideration and Guidance during Model Hypertuning

Sample tuning method for Decision Tree (2nd model) with original data

In [ ]:
# Import:
from sklearn.metrics import make_scorer, f1_score

# Define a scorer using f1 score (binary classification example)
scorer = make_scorer(f1_score, average='binary')

# Calling RandomizedSearchCV with custom scorer
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs=-1, scoring=scorer, cv=5, random_state=1)

# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train, y_train)

print("Best parameters are {} with CV score = \033[1;92m{}\033[0m:" .format(randomized_cv.best_params_, randomized_cv.best_score_),"Decision Tree model with original data")
Best parameters are {'min_samples_leaf': 7, 'min_impurity_decrease': 0.0001, 'max_leaf_nodes': 15, 'max_depth': 5} with CV score = 0.7605589073511382: Decision Tree model with original data
In [ ]:
# defining model
Model = DecisionTreeClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = {'max_depth': np.arange(2,6),
              'min_samples_leaf': [1, 4, 7],
              'max_leaf_nodes' : [10,15],
              'min_impurity_decrease': [0.0001,0.001] }

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)

print("Best parameters are {} with CV score = \033[1;92m{}\033[0m:" .format(randomized_cv.best_params_, randomized_cv.best_score_),"Decision Tree model with original data")
Best parameters are {'min_samples_leaf': 7, 'min_impurity_decrease': 0.0001, 'max_leaf_nodes': 15, 'max_depth': 5} with CV score = 0.7605589073511382: Decision Tree model with original data

Sample tuning method for Decision Tree (2nd model) with oversampled data

In [ ]:
# defining model
Model = DecisionTreeClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = {'max_depth': np.arange(2,6),
              'min_samples_leaf': [1, 4, 7],
              'max_leaf_nodes' : [10,15],
              'min_impurity_decrease': [0.0001,0.001] }

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_over,y_train_over)

print("Best parameters are {} with CV score = \033[1;92m{}\033[0m:" .format(randomized_cv.best_params_, randomized_cv.best_score_),"Decision Tree model with Oversampled data")
Best parameters are {'min_samples_leaf': 7, 'min_impurity_decrease': 0.0001, 'max_leaf_nodes': 15, 'max_depth': 5} with CV score = 0.9112516719863721: Decision Tree model with Oversampled data

Sample tuning method for Decision Tree (2nd model) with undersampled data

In [ ]:
# defining model
Model = DecisionTreeClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = {'max_depth': np.arange(2,6),
              'min_samples_leaf': [1, 4, 7],
              'max_leaf_nodes' : [10,15],
              'min_impurity_decrease': [0.0001,0.001] }

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_un,y_train_un) # <----------------------------------

print("Best parameters are {} with CV score = \033[1;92m{}\033[0m:" .format(randomized_cv.best_params_, randomized_cv.best_score_),"Decision Tree model with Undersampled data")
Best parameters are {'min_samples_leaf': 7, 'min_impurity_decrease': 0.0001, 'max_leaf_nodes': 10, 'max_depth': 5} with CV score = 0.8797792993213255: Decision Tree model with Undersampled data

Tuning AdaBoost (6th model) using original data

In [ ]:
print(X_clean.dtypes) #If any numeric columns are stored as strings, convert them to the correct type:
Customer_Age                  int64
Gender                      float64
Dependent_count               int64
Education_Level             float64
Marital_Status              float64
Income_Category             float64
Card_Category               float64
Months_on_book                int64
Total_Relationship_Count      int64
Months_Inactive_12_mon        int64
Contacts_Count_12_mon         int64
Credit_Limit                float64
Total_Revolving_Bal           int64
Avg_Open_To_Buy             float64
Total_Amt_Chng_Q4_Q1        float64
Total_Trans_Amt               int64
Total_Trans_Ct                int64
Total_Ct_Chng_Q4_Q1         float64
Avg_Utilization_Ratio       float64
dtype: object
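
If any of these columns had come through as strings (object dtype), they could be coerced to numeric first. A small sketch, using Credit_Limit purely as an illustration (not actually needed here, since the dtypes above are already numeric):

import pandas as pd

# Hypothetical conversion of a string-typed column to numeric;
# errors='coerce' turns unparseable values into NaN so they can be handled explicitly
X_clean['Credit_Limit'] = pd.to_numeric(X_clean['Credit_Limit'], errors='coerce')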

Now, with the categorical values encoded and any strings converted to numeric values, you can fit the model:

In [ ]:
# Fit the model with the cleaned and encoded dataset
randomized_cv.fit(X_clean_encoded, y_clean)
Out[ ]:
RandomizedSearchCV(cv=5, error_score='raise',
                   estimator=AdaBoostClassifier(random_state=1), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'estimator': [DecisionTreeClassifier(max_depth=2,
                                                                             random_state=1),
                                                      DecisionTreeClassifier(max_depth=3,
                                                                             random_state=1)],
                                        'learning_rate': [0.01, 0.1, 0.05],
                                        'n_estimators': array([ 50,  75, 100])},
                   random_state=1,
                   scoring=make_scorer(recall_score, average=macro))
In [ ]:
print("Best parameters are {} with CV score = \033[1;92m{}\033[0m:" .format(randomized_cv.best_params_, randomized_cv.best_score_),"AdaBoost model using original data")
Best parameters are {'min_samples_leaf': 7, 'min_impurity_decrease': 0.0001, 'max_leaf_nodes': 10, 'max_depth': 5} with CV score = 0.8797792993213255: AdaBoost model using original data
In [ ]:
# NOTE: the fit and print at the end of this cell were executed separately in the cells above
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

# Defining the model
Model = AdaBoostClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = {
    "n_estimators": np.arange(50, 110, 25),
    "learning_rate": [0.01, 0.1, 0.05],
    "estimator": [  # Change from base_estimator to estimator (or check your version of sklearn)
        DecisionTreeClassifier(max_depth=2, random_state=1),
        DecisionTreeClassifier(max_depth=3, random_state=1),
    ],
}

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score, average='macro')  # Use average='macro' or 'micro' for multi-class

# Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(
    estimator=Model,
    param_distributions=param_grid,
    n_jobs=-1,
    n_iter=50,
    scoring=scorer,
    cv=5,
    random_state=1,
    error_score='raise'  # Set error_score to 'raise' for debugging
)

# Fitting parameters in RandomizedSearchCV
# (executed in the cell above on the cleaned, encoded data)
# randomized_cv.fit(X_clean_encoded, y_clean)

# print("Best parameters are {} with CV score={}:".format(randomized_cv.best_params_, randomized_cv.best_score_))
In [ ]:
# ORIGINAL - CHECK THAT IT REMAINS AS IN THE ORIGINAL.

# Import necessary libraries
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# Assuming X and y are already defined somewhere earlier in the code

# Step 1: Remove rows with NaN in X or y
X_clean = X.dropna()  # Drop rows with missing values in X
y_clean = y[X_clean.index]  # Ensure y matches the index after dropping NaNs in X

# Step 2: Check and drop missing values in y if necessary
y_clean = y_clean.dropna()
X_clean = X_clean.loc[y_clean.index]  # Ensure X matches the index after dropping NaNs in y

# Step 3: Encode categorical features (if any)
# Identify categorical columns to be encoded (adjust this depending on your dataset)
categorical_columns = X_clean.select_dtypes(include=['object']).columns

# Create a ColumnTransformer to apply OneHotEncoder to categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)
    ], remainder='passthrough'  # Keep remaining columns as-is
)

# Step 4: Define the GradientBoostingClassifier model and RandomizedSearchCV
tuned_gbm = GradientBoostingClassifier(random_state=1)

# Define parameter grid for RandomizedSearchCV
param_grid = {
    "n_estimators": [50, 100, 150],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 5, 7]
}

# Use RandomizedSearchCV to find the best hyperparameters
randomized_cv = RandomizedSearchCV(
    estimator=tuned_gbm,
    param_distributions=param_grid,
    n_iter=10,  # Number of parameter settings sampled
    scoring='accuracy',  # Adjust this to the scoring metric you need
    cv=5,  # Number of cross-validation folds
    random_state=1
)

# Step 5: Create a pipeline that first encodes the data and then applies the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('randomized_cv', randomized_cv)
])

# Step 6: Fit the model with cleaned and encoded data
pipeline.fit(X_clean, y_clean)  # X_clean is encoded within the pipeline

# Once the model is fitted, you can access the best parameters using:
best_params = pipeline.named_steps['randomized_cv'].best_params_
print(f"Best parameters found: {best_params}")
Best parameters found: {'n_estimators': 150, 'max_depth': 3, 'learning_rate': 0.2}

The blank should be completed with fit(X, y), the method used to fit the RandomizedSearchCV object to the data.

Here, X is the input features, and y is the target labels of your dataset.

So, the complete code would be:

In [ ]:
# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X, y)

This will fit the model using the original data (X and y) based on the parameter combinations from the RandomizedSearchCV.

Next, we take the best parameters obtained from the tuning process and fit the model to the data.

Assuming that the best parameters were obtained from randomized_cv.best_params_, here’s the code:

In [ ]:
# Extracting the best max_depth for the DecisionTreeClassifier from the grid search
best_max_depth = randomized_cv.best_params_['estimator'].max_depth

# Creating a new pipeline with the best parameters
tuned_adb = AdaBoostClassifier(
    random_state=1,  # Using the same random_state
    n_estimators=randomized_cv.best_params_['n_estimators'],  # Best n_estimators
    learning_rate=randomized_cv.best_params_['learning_rate'],  # Best learning_rate
    base_estimator=DecisionTreeClassifier(max_depth=best_max_depth, random_state=1)  # Using the best max_depth
)

# Fitting the model on the original data
#tuned_adb.fit(X, y) # <-----------IT CONTAINS NaN
tuned_adb.fit(X_clean_encoded, y_clean)
Out[ ]:
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
                                                         random_state=1),
                   learning_rate=0.1, n_estimators=100, random_state=1)

Explanation:

•   randomized_cv.best_params_['n_estimators'] extracts the best value for n_estimators.
•   randomized_cv.best_params_['learning_rate'] extracts the best value for learning_rate.
•   randomized_cv.best_params_['estimator'].max_depth gets the max_depth of the best base estimator, which is a DecisionTreeClassifier.

Finally, tuned_adb is fitted on the cleaned, encoded data (X_clean_encoded, y_clean), since the raw X still contains NaN values.

To check the performance of the tuned_adb model on the training data, pass the features and labels it was trained on (here X_clean_encoded and y_clean) to the function model_performance_classification_sklearn.

In [ ]:
# Model Performance Metrics
adb_train = model_performance_classification_sklearn(tuned_adb, X_clean_encoded, y_clean)
adb_train
Out[ ]:
   Accuracy  Recall  Precision     F1
0     0.984   0.928      0.968  0.948

To check the performance of the tuned_adb model on the validation set, you need to pass the validation features (X_val) and labels (y_val) to the function model_performance_classification_sklearn, as sketched below.

Explanation:

•   X_val is the input features for the validation set.
•   y_val is the corresponding target labels for the validation set.
•   model_performance_classification_sklearn will compute the performance metrics for the tuned_adb model on the validation data.
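
A minimal sketch of that call, assuming the model_performance_classification_sklearn helper defined earlier in the notebook and the existing X_val / y_val split:

# Model performance metrics on the validation set
adb_val = model_performance_classification_sklearn(tuned_adb, X_val, y_val)
adb_val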

Tuning AdaBoost (6th model) using undersampled data

To complete the code for the new pipeline (tuned_ada2) with the best parameters obtained from tuning and fitting the model on undersampled data:

1.  Insert the best parameters from randomized_cv.best_params_.
2.  Fit the model on the undersampled dataset, assuming you have undersampled data (X_undersample, y_undersample).
In [ ]:
# Import:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Extract the best parameters from RandomizedSearchCV
best_params = randomized_cv.best_params_

# Check the extracted best parameters for correctness
print(best_params)

# Create the best base estimator using the best parameters found
# Assuming 'base_estimator' was used, replace 'base_estimator' with 'estimator' if needed
if 'base_estimator__max_depth' in best_params:
    best_max_depth = best_params['base_estimator__max_depth']
else:
    # Handle the case where 'base_estimator' might not be in the parameters
    best_max_depth = None

# Create the base estimator with the best max_depth
best_base_estimator = DecisionTreeClassifier(max_depth=best_max_depth, random_state=1)

# Create the AdaBoostClassifier with the best parameters and the base estimator
tuned_ada2 = AdaBoostClassifier(
    random_state=1,
    n_estimators=best_params.get('n_estimators', 50),  # Default to 50 if not found
    learning_rate=best_params.get('learning_rate', 1.0),  # Default to 1.0 if not found
    base_estimator=best_base_estimator  # Use the base estimator
)

# Fit the model on the undersampled data
# tuned_ada2.fit(X_undersample, y_undersample)# <---------------------- FIX WITH UNDERSAMPLE
tuned_ada2.fit(X_clean_encoded, y_clean)
{'n_estimators': 100, 'learning_rate': 0.1, 'estimator': DecisionTreeClassifier(max_depth=3, random_state=1)}
Out[ ]:
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=1),
                   learning_rate=0.1, n_estimators=100, random_state=1)
In [ ]:
# ANSWER 3:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Define the undersampler
undersampler = RandomUnderSampler(random_state=1)

# Fit and transform the data to create undersampled datasets
X_undersample, y_undersample = undersampler.fit_resample(X_clean_encoded, y_clean)

# Extract the best parameters from RandomizedSearchCV
best_params = randomized_cv.best_params_

# Check the extracted best parameters for correctness
print(best_params)

# Create the best base estimator using the best parameters found
# Assuming 'base_estimator' was used, replace 'base_estimator' with 'estimator' if needed
if 'base_estimator__max_depth' in best_params:
    best_max_depth = best_params['base_estimator__max_depth']
else:
    # Handle the case where 'base_estimator' might not be in the parameters
    best_max_depth = None

# Create the base estimator with the best max_depth
best_base_estimator = DecisionTreeClassifier(max_depth=best_max_depth, random_state=1)

# Create the AdaBoostClassifier with the best parameters and the base estimator
tuned_ada2 = AdaBoostClassifier(
    random_state=1,
    n_estimators=best_params.get('n_estimators', 50),  # Default to 50 if not found
    learning_rate=best_params.get('learning_rate', 1.0),  # Default to 1.0 if not found
    base_estimator=best_base_estimator  # Use the base estimator
)

# Fit the model on the undersampled data
tuned_ada2.fit(X_undersample, y_undersample)
{'n_estimators': 100, 'learning_rate': 0.1, 'estimator': DecisionTreeClassifier(max_depth=3, random_state=1)}
Out[ ]:
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=1),
                   learning_rate=0.1, n_estimators=100, random_state=1)

Explanation:

•   randomized_cv.best_params_['n_estimators']: Extracts the best value for the n_estimators parameter.
•   randomized_cv.best_params_['learning_rate']: Extracts the best learning rate.
•   best_params['base_estimator__max_depth'] (when present): Retrieves the best max_depth of the base estimator (decision tree); otherwise the code falls back to max_depth=None.
•   tuned_ada2.fit(X_undersample, y_undersample): Fits the tuned model on the undersampled dataset.
In [ ]:
# When choosing Accuracy.
from sklearn.metrics import accuracy_score
'''
The following code calculates the accuracy of the tuned_ada2 model on
the undersampled training data.
It predicts the class labels for the undersampled data using the trained model
and then compares those predictions with the true labels.
The accuracy_score function computes the percentage of correct predictions,
which is stored in adb2_train.
'''
adb2_train = accuracy_score(y_undersample, tuned_ada2.predict(X_undersample))

'''
1. accuracy_score(y_undersample, tuned_ada2.predict(X_undersample)):
This line calculates the accuracy of the tuned_ada2 model on the undersampled training data.

2. tuned_ada2.predict(X_undersample):

This part predicts the class labels for the X_undersample data using the trained tuned_ada2 model.
The model applies its learned decision rules to the input data and returns the predicted class labels.
3. accuracy_score(y_undersample, ...):

This part calculates the accuracy of the model's predictions.
It compares the predicted labels from tuned_ada2.predict(X_undersample) with the true labels y_undersample.
The accuracy_score function calculates the percentage of correct predictions.
4. adb2_train = ...:

The computed accuracy score is assigned to the variable adb2_train.
'''
adb2_train
'''
The calculated accuracy score is assigned to the variable adb2_train above.
This variable now holds the accuracy of the tuned_ada2 model on the undersampled training data.
'''
Out[ ]:
'\nThe calculated accuracy score is assigned to the variable adb2_train above.\nThis variable now holds the accuracy of the tuned_ada2 model on the undersampled training data.\n'

We might consider using a separate validation set (not part of the undersampled data) for a more reliable assessment of performance on unseen data.

In [ ]:
# Evaluation on Unseen Data : This is really important.
from sklearn.metrics import accuracy_score

adb2_val = accuracy_score(y_val, tuned_ada2.predict(X_val))

'''
This code calculates the accuracy score for the tuned_ada2 model on the validation set
(X_val, y_val). It first predicts labels for the validation data using tuned_ada2.predict(X_val),
and then compares those predictions with the true labels (y_val) using accuracy_score.

Interpretation:

The adb2_val variable will now hold the accuracy score of the model on the validation set.
This score provides a more reliable estimate of the model's performance on unseen data,
as it's evaluated on data the model hasn't seen during training.
'''
adb2_val
Out[ ]:
0.8915187376725838

Tuning Gradient Boosting (4th model) using undersampled data

This line calls the fit method on the randomized_cv object, passing the undersampled data (X_undersample and y_undersample) as arguments. This will initiate the randomized search process to find the best hyperparameter combination for the GradientBoostingClassifier model using the specified parameters and scoring metric.

In [ ]:
# Randomized search process
randomized_cv.fit(X_undersample, y_undersample)
In [ ]:
%%time
# MORE COMPLETE CODE for Reference (%%time must be the first line of the cell)

#Creating pipeline
Model = GradientBoostingClassifier(random_state=1)

#Parameter grid to pass in RandomSearchCV
param_grid = {
    "init": [AdaBoostClassifier(random_state=1),DecisionTreeClassifier(random_state=1)],
    "n_estimators": np.arange(50,110,25),
    "learning_rate": [0.01,0.1,0.05],
    "subsample":[0.7,0.9],
    "max_features":[0.5,0.7,1],
}

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_undersample, y_undersample)  # <-- This line fits the model


print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
Best parameters are {'subsample': 0.9, 'n_estimators': 75, 'max_features': 1, 'learning_rate': 0.1, 'init': AdaBoostClassifier(random_state=1)} with CV score=0.765876177828369:
CPU times: user 1.84 s, sys: 312 ms, total: 2.15 s
Wall time: 1min 6s


In [ ]:
# Creating new pipeline with best parameters
tuned_gbm1 = GradientBoostingClassifier(
    max_features=randomized_cv.best_params_['max_features'],  # Access best max_features
    init=AdaBoostClassifier(random_state=1),
    random_state=1,
    learning_rate=randomized_cv.best_params_['learning_rate'],  # Access best learning_rate
    n_estimators=randomized_cv.best_params_['n_estimators'],  # Access best n_estimators
    subsample=randomized_cv.best_params_['subsample'],  # Access best subsample
)

tuned_gbm1.fit(X_train_un, y_train_un)  # Fit the model on the undersampled training data
Out[ ]:
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
                           max_features=0.5, random_state=1, subsample=0.9)

Explanation:

randomized_cv.best_params_: This attribute of randomized_cv holds a dictionary containing the best hyperparameter values found during the search. We access the best values for each parameter in the param_grid defined earlier:

  • max_features
  • learning_rate
  • n_estimators
  • subsample

These values are then used to create a new GradientBoostingClassifier instance (tuned_gbm1) with the optimal hyperparameter configuration.

Finally, the tuned_gbm1 model is fitted on the **undersampled** training data (X_train_un, y_train_un).

Note:

It's generally recommended to use a separate validation set (not part of the undersampled or original training data) to evaluate the final model's performance and avoid overfitting. By using the best hyperparameters obtained from the randomized search, we are creating a more optimized model that could potentially perform better on unseen data.

Oversampling and Undersampling Code Example:

In [ ]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier

# Define the resampling techniques
oversample = SMOTE(random_state=1)
undersample = RandomUnderSampler(random_state=1)

# Define the classifier
model = GradientBoostingClassifier(random_state=1)

# Create an imbalanced pipeline with oversampling and undersampling
pipeline = Pipeline([
    ('o', oversample),
    ('u', undersample),
    ('m', model)
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Use the fitted model for predictions or evaluation
predictions = pipeline.predict(X_test)

Explanation:

1.  Oversampling with SMOTE:
•   SMOTE generates synthetic samples for the minority class to balance the class distribution.
•   You can adjust the sampling_strategy parameter if needed (e.g., sampling_strategy='auto'); see the sketch after this list.
2.  Undersampling with RandomUnderSampler:
•   RandomUnderSampler randomly selects samples from the majority class to reduce its size.
•   Adjust the sampling_strategy parameter to specify the desired balance.
3.  Pipeline:
•   Create a Pipeline to sequentially apply the oversampling, undersampling, and model fitting steps.
•   This approach ensures that the data resampling is correctly applied during training.
4.  Fitting and Predicting:
•   Fit the pipeline on the training data.
•   Use the trained model for predictions or further evaluation.
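
A sketch of how sampling_strategy can combine moderate oversampling with undersampling; the 0.5 and 1.0 ratios below are illustrative values, not tuned settings from this notebook:

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier

# Oversample the minority class up to 50% of the majority class size,
# then undersample the majority class down to a 1:1 ratio
pipeline = Pipeline([
    ('o', SMOTE(sampling_strategy=0.5, random_state=1)),
    ('u', RandomUnderSampler(sampling_strategy=1.0, random_state=1)),
    ('m', GradientBoostingClassifier(random_state=1)),
])

pipeline.fit(X_train, y_train)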

Custom Approach:

If you only want to use either oversampling or undersampling, you can do it separately:

Oversampling Only:

In [ ]:
from imblearn.over_sampling import SMOTE

# Define the oversampler
oversample = SMOTE(random_state=1)

# Fit and transform the data
X_train_oversample, y_train_oversample = oversample.fit_resample(X_train, y_train)

# Fit the model on the oversampled data
tuned_gbm1.fit(X_train_oversample, y_train_oversample)

Undersampling Only:

In [ ]:
from imblearn.under_sampling import RandomUnderSampler

# Define the undersampler
undersample = RandomUnderSampler(random_state=1)

# Fit and transform the data
X_train_undersample, y_train_undersample = undersample.fit_resample(X_train, y_train)

# Fit the model on the undersampled data
tuned_gbm1.fit(X_train_undersample, y_train_undersample)

Summary:

•   Oversampling increases the number of minority class samples.
•   Undersampling reduces the number of majority class samples.
•   Use SMOTE for oversampling and RandomUnderSampler for undersampling.
•   You can combine both techniques using a Pipeline or apply them separately based on your needs.

Feel free to adjust the parameters and methods based on your specific dataset and problem.

==================== END OF IMPORTANT ===============================

To calculate the performance of the tuned_gbm1 model on the undersampled training set, we can use the recall_score function from sklearn.metrics.

In [ ]:
# Import necessary libraries
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import recall_score

# Assuming you already have X_train and y_train from the original dataset

# Step 1: Apply undersampling to the training data
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)  # This creates X_train_un and y_train_un

# Step 2: Define the model (here we're using GradientBoostingClassifier)
tuned_gbm1 = GradientBoostingClassifier(random_state=1)

# Step 3: Train the model on the undersampled training data
tuned_gbm1.fit(X_train_un, y_train_un)  # Ensure the model is trained on the undersampled data

# Step 4: Calculate the recall on the undersampled training set
gbm1_train_recall = recall_score(y_train_un, tuned_gbm1.predict(X_train_un))  # Check performance on undersampled train set

# Step 5: Output the recall score
print(f"Recall on undersampled training set: {gbm1_train_recall:.4f}")

'''
This line below calculates the recall score for the tuned_gbm1 model
on the X_train_un data and compares it to the true labels y_train_un.
The recall_score function returns the recall, which is the ability of the model
to capture the positive class correctly.

In summary, it compares the predicted labels with the true labels and returns
the recall value.
'''
gbm1_train_recall = recall_score(y_train_un, tuned_gbm1.predict(X_train_un))  # Check performance on undersampled train set

'''
Predict on Training Data:

tuned_gbm1.predict(X_train_un)<-- This method above predicts the class labels for
the X_train_un data using the trained tuned_gbm1 model. The predict method applies
the model's learned decision rules to the input data and returns the predicted class labels.

Calculate Recall:

recall_score(y_train_un, tuned_gbm1.predict(X_train_un)): This line calculates
the recall of the model's predictions on the training data.

The recall_score function takes two arguments:

1. y_train_un: The true labels of the training data.
2. tuned_gbm1.predict(X_train_un): The predicted labels from the model.
'''
gbm1_train_recall

'''
Assign to gbm1_train_recall:

gbm1_train_recall = recall_score(...): The calculated recall score is assigned
to the variable gbm1_train_recall. This variable now holds the recall of the tuned_gbm1
model on the undersampled training data.
'''

# Output recall score
print(f"Recall on undersampled training set: {gbm1_train_recall:.4f}")
Recall on undersampled training set: 0.9785
Recall on undersampled training set: 0.9785

In summary:

The code above calculates the recall of the tuned_gbm1 model on the undersampled training data. It first predicts the class labels using the model and then compares them to the true labels. The recall_score function computes the proportion of positive (attrited) cases correctly identified, which is stored in the gbm1_train_recall variable.

To calculate the performance of the tuned_gbm1 model on the validation set, you can use the accuracy_score function from sklearn.metrics:

In [ ]:
# Accuracy, Recall and all other performance metrics to consider - Start with Accuracy
from sklearn.metrics import accuracy_score

'''
This line below calculates the accuracy score for the tuned_gbm1 model on the validation data (X_val, y_val).
It first predicts labels for the validation data using tuned_gbm1.predict(X_val),
and then compares those predictions with the true labels (y_val) using accuracy_score.
'''

gbm1_val = accuracy_score(y_val, tuned_gbm1.predict(X_val))
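
Since recall on attrited customers is the metric this strategy emphasizes, the same validation set can also be checked with recall. A short sketch using the tuned_gbm1, X_val and y_val objects already defined:

from sklearn.metrics import recall_score

# Recall of the tuned Gradient Boosting model on the validation set
gbm1_val_recall = recall_score(y_val, tuned_gbm1.predict(X_val))
print(f"Recall on validation set: {gbm1_val_recall:.4f}")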

Tuning Gradient Boosting (4th model) using original data

In [ ]:
# Using Original Data:
randomized_cv.fit(X_train, y_train)
'''
This line calls the fit method on the randomized_cv object,
passing the original training data (X_train and y_train) as arguments.
This will initiate the randomized search process to find the best hyperparameter
combination for the GradientBoostingClassifier model using
the specified parameters and scoring metric.
'''
In [ ]:
%%time
# MORE COMPLETE CODE - For Reference during Code Development (%%time must be the first line of the cell)

#defining model
Model = GradientBoostingClassifier(random_state=1)

#Parameter grid to pass in RandomSearchCV
param_grid = {
    "init": [AdaBoostClassifier(random_state=1),DecisionTreeClassifier(random_state=1)],
    "n_estimators": np.arange(50,110,25),
    "learning_rate": [0.01,0.1,0.05],
    "subsample":[0.7,0.9],
    "max_features":[0.5,0.7,1],
}

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train, y_train)  # <-- This line fits the model on the original data


print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Next, we set up a new GradientBoostingClassifier instance (tuned_gbm2) with the best hyperparameters obtained from the randomized_cv object.

In [ ]:
%%time
# MORE COMPLETE CODE - For Reference during Code Development (%%time must be the first line of the cell)

#defining model
Model = GradientBoostingClassifier(random_state=1)

#Parameter grid to pass in RandomSearchCV
param_grid = {
    "init": [AdaBoostClassifier(random_state=1),DecisionTreeClassifier(random_state=1)],
    "n_estimators": np.arange(50,110,25),
    "learning_rate": [0.01,0.1,0.05],
    "subsample":[0.7,0.9],
    "max_features":[0.5,0.7,1],
}

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train, y_train)  # <-- This line fits the model on the original data


print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

# Creating new pipeline with best parameters
tuned_gbm2 = GradientBoostingClassifier(
    max_features=randomized_cv.best_params_.get('max_features', None),  # Access best max_features, defaulting to None if not present
    init=AdaBoostClassifier(random_state=1),  # This works, `random_state` is passed to AdaBoost
    random_state=1,  # Ensure reproducibility
    learning_rate=randomized_cv.best_params_.get('learning_rate', 0.1),  # Default learning_rate if missing
    n_estimators=randomized_cv.best_params_.get('n_estimators', 100),  # Default n_estimators if missing
    subsample=randomized_cv.best_params_.get('subsample', 1.0)  # Default subsample if missing
)

# Fit the model on the original training data
tuned_gbm2.fit(X_train, y_train)
Best parameters are {'subsample': 0.9, 'n_estimators': 100, 'max_features': 0.5, 'learning_rate': 0.1, 'init': AdaBoostClassifier(random_state=1)} with CV score=0.8384615384615385:
CPU times: user 8.07 s, sys: 601 ms, total: 8.67 s
Wall time: 3min 58s
Out[ ]:
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
                           max_features=0.5, random_state=1, subsample=0.9)


Explanation:

randomized_cv.best_params_: This attribute of randomized_cv holds a dictionary containing the best hyperparameter values found during the search. We access the best values for each parameter in the param_grid defined earlier:

  • max_features
  • learning_rate
  • n_estimators
  • subsample

These values are then used to create a new GradientBoostingClassifier instance (tuned_gbm2) with the **optimal** hyperparameter configuration.

Finally, the tuned_gbm2 model is fitted on the original training data (X_train, y_train).

Note:

It's important to use a separate validation set (not part of the training data) to evaluate the final model's performance and avoid **overfitting**. By using the best hyperparameters obtained from the randomized search, we are creating a **more optimized model that could potentially perform better on unseen data**.

Tuning Gradient Boosting (4th model) using oversampled data

In [ ]:
# Use Performance Metrics
from sklearn.metrics import accuracy_score
'''
This line calculates the accuracy score for the tuned_gbm2 model on the
X_train_over data (oversampled data) and compares it to the true labels y_train_over.
The accuracy_score function returns the percentage of correct predictions.
'''
gbm2_train = accuracy_score(y_train_over, tuned_gbm2.predict(X_train_over))
gbm2_train

END 1/2

In [ ]:
# Use Performance Metrics
from sklearn.metrics import accuracy_score
'''
To calculate the performance of the tuned_gbm2 model on the validation set,
WE can use the accuracy_score function from sklearn.metrics.
This function compares the predicted labels with the true labels and returns
the percentage of correct predictions.
'''
gbm2_val = accuracy_score(y_val, tuned_gbm2.predict(X_val))
'''
This line calculates the accuracy score for the tuned_gbm2 model on the validation data (X_val, y_val).
It first predicts labels for the validation data using tuned_gbm2.predict(X_val),
and then compares those predictions with the true labels (y_val) using accuracy_score.
'''
gbm2_val
Out[ ]:
0.9664694280078896

Tuning XGBoost Model (Optional model) with original data

Note: This section is optional. You can choose not to build XGBoost if you are facing issues with installation or if it is taking more time to execute.

In [ ]:
# Optional - but useful to understand
randomized_cv.fit(X_train, y_train)
'''
This line calls the fit method on the randomized_cv object, passing the original
training data (X_train and y_train) as arguments. This will initiate the randomized
search process to find the best hyperparameter combination for the
XGBClassifier model using the specified parameters and scoring metric.
'''
Out[ ]:
'\nThis line calls the fit method on the randomized_cv object, passing the original\ntraining data (X_train and y_train) as arguments. This will initiate the randomized\nsearch process to find the best hyperparameter combination for the\nXGBClassifier model using the specified parameters and scoring metric.\n'
In [ ]:
%%time
# MORE COMPLETE CODE - For Reference during Code Development (%%time must be the first line of the cell)

# defining model
Model = XGBClassifier(random_state=1,eval_metric='logloss')

#Parameter grid to pass in RandomSearchCV
param_grid={'n_estimators':np.arange(50,110,25),
            'scale_pos_weight':[1,2,5],
            'learning_rate':[0.01,0.1,0.05],
            'gamma':[1,3],
            'subsample':[0.7,0.9]
           }
from sklearn import metrics

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=50, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train, y_train)  # <-- This line fits the model on the original data

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
Best parameters are {'subsample': 0.7, 'scale_pos_weight': 5, 'n_estimators': 75, 'learning_rate': 0.05, 'gamma': 3} with CV score=0.9346153846153846:
CPU times: user 4.08 s, sys: 321 ms, total: 4.4 s
Wall time: 1min 17s
In [ ]:
# Tuning: creating the tuned XGBoost model
tuned_xgb = XGBClassifier(
    random_state=1,
    eval_metric="logloss",
    #subsample=randomized_cv.best_params_['subsample'],  # Access best subsample
    #scale_pos_weight=randomized_cv.best_params_['scale_pos_weight'],  # Access best scale_pos_weight
    #n_estimators=randomized_cv.best_params_['n_estimators'],  # Access best n_estimators
    #learning_rate=randomized_cv.best_params_['learning_rate'],  # Access best learning_rate
    gamma=1,  # gamma kept fixed at 1
)

tuned_xgb.fit(X_train, y_train)  # Fit the model on the original training data
Out[ ]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=1, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, random_state=1, ...)

Explanation:

randomized_cv.best_params_:

This attribute of randomized_cv holds a dictionary containing the best hyperparameter values found during the search.

We access the best values for each parameter in the param_grid we defined earlier, except for gamma which we kept at 1.

  • subsample
  • scale_pos_weight
  • n_estimators
  • learning_rate

These values are then used to create a new XGBClassifier instance (tuned_xgb) with the optimal hyperparameter configuration.

Finally, the tuned_xgb model is fitted on the original training data (X_train, y_train).
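
For reference, the commented-out lines above would look like the following once the best values from the search are plugged in. This is a sketch of that configuration (named tuned_xgb_best to avoid clashing with the tuned_xgb actually fitted above), not the model that produced the output shown:

from xgboost import XGBClassifier

# Rebuild XGBoost with the best parameters found by RandomizedSearchCV,
# keeping gamma fixed at 1 as in the cell above
tuned_xgb_best = XGBClassifier(
    random_state=1,
    eval_metric='logloss',
    gamma=1,
    subsample=randomized_cv.best_params_['subsample'],
    scale_pos_weight=randomized_cv.best_params_['scale_pos_weight'],
    n_estimators=randomized_cv.best_params_['n_estimators'],
    learning_rate=randomized_cv.best_params_['learning_rate'],
)
tuned_xgb_best.fit(X_train, y_train)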

Note:

It's important to use a separate validation set (not part of the training data) to evaluate the final model's performance and avoid **overfitting**.

By using the best hyperparameters obtained from the randomized search, we are creating a more optimized model that could potentially perform better on **unseen data**.

In [ ]:
# Trained
from sklearn.metrics import accuracy_score
'''
To calculate the performance of the tuned_xgb model on the original training set,
we can use the accuracy_score function from sklearn.metrics.
'''
xgb_train = accuracy_score(y_train, tuned_xgb.predict(X_train))
'''
This line calculates the accuracy score for the tuned_xgb model on the
X_train data (original data) and compares it to the true labels y_train.
The accuracy_score function returns the percentage of correct predictions.
'''
xgb_train
Out[ ]:
0.9946920133316874
In [ ]:
# Tuned
from sklearn.metrics import accuracy_score
'''
To calculate the performance of the tuned_xgb model on the validation set,
we can use the accuracy_score function from sklearn.metrics.
'''
xgb_val = accuracy_score(y_val, tuned_xgb.predict(X_val))
'''
This line above calculates the accuracy score for the tuned_xgb model on the
validation data (X_val, y_val). It first predicts labels for the validation data
using tuned_xgb.predict(X_val), and then compares those predictions with
the true labels (y_val) using accuracy_score.
'''
xgb_val
Out[ ]:
0.9704142011834319

Model Comparison and Final Model Selection - Strategy Step 8¶

In [ ]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print(f"\n\033[1;31mPerformance Metrics without data handling (Oversampling/Undersampling):\033[0m") # Main code for multiple models training and evaluations on split datasets (training, validations, test)

# Define your models
models = []
models.append(("Logistic Regression", LogisticRegression(random_state=1)))
models.append(("Decision Tree", DecisionTreeClassifier(random_state=1)))
models.append(("Random Forest", RandomForestClassifier(random_state=1)))
models.append(("Gradient Boosting", GradientBoostingClassifier(random_state=1)))
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("AdaBoost", AdaBoostClassifier(random_state=1)))
models.append(("XGBoost", XGBClassifier(random_state=1)))  # XGBoost (optional)

# Define a function to print performance metrics
def print_model_performance(model, X, y, dataset_name):
    y_pred = model.predict(X)  # Predict labels
    print(f"\n\033[1;4mPerformance on {dataset_name} data:\033[0m")
    print(f"Accuracy:  {accuracy_score(y, y_pred):.4f}")
    print(f"Recall:    {recall_score(y, y_pred):.4f}")
    print(f"Precision: {precision_score(y, y_pred):.4f}")
    print(f"F1 Score:  {f1_score(y, y_pred):.4f}")

# Evaluate each model
for name, model in models:
    model.fit(X_train, y_train)  # Train the model

    # Print the model being evaluated
    print(f"\n\033[1;92mModel Performance Evaluation:\033[0m")
    print(f"---  Model: {name}  ---")

    # Print performance metrics for training data
    print_model_performance(model, X_train, y_train, "Training")

    # Print performance metrics for validation data
    print_model_performance(model, X_val, y_val, "Validation")

    # Print performance metrics for test data
    print_model_performance(model, X_test, y_test, "Test")

print("\033[1;92m\nExamine the highest scores to identify the best model.\033[0m")
Performance Metrics without data handling (Oversampling/Undersampling):

Model Performance Evaluation:
---  Model: Logistic Regression  ---

Performance on Training data:
Accuracy:  0.8750
Recall:    0.4115
Precision: 0.6833
F1 Score:  0.5137

Performance on Validation data:
Accuracy:  0.8698
Recall:    0.3649
Precision: 0.5870
F1 Score:  0.4500

Performance on Test data:
Accuracy:  0.8585
Recall:    0.3439
Precision: 0.6397
F1 Score:  0.4473

Model Performance Evaluation:
---  Model: Decision Tree  ---

Performance on Training data:
Accuracy:  1.0000
Recall:    1.0000
Precision: 1.0000
F1 Score:  1.0000

Performance on Validation data:
Accuracy:  0.9310
Recall:    0.8243
Precision: 0.7349
F1 Score:  0.7771

Performance on Test data:
Accuracy:  0.9302
Recall:    0.7668
Precision: 0.8050
F1 Score:  0.7854

Model Performance Evaluation:
---  Model: Random Forest  ---

Performance on Training data:
Accuracy:  1.0000
Recall:    1.0000
Precision: 1.0000
F1 Score:  1.0000

Performance on Validation data:
Accuracy:  0.9527
Recall:    0.7432
Precision: 0.9167
F1 Score:  0.8209

Performance on Test data:
Accuracy:  0.9526
Recall:    0.7668
Precision: 0.9372
F1 Score:  0.8435

Model Performance Evaluation:
---  Model: Gradient Boosting  ---

Performance on Training data:
Accuracy:  0.9770
Recall:    0.8938
Precision: 0.9603
F1 Score:  0.9259

Performance on Validation data:
Accuracy:  0.9645
Recall:    0.8514
Precision: 0.9000
F1 Score:  0.8750

Performance on Test data:
Accuracy:  0.9664
Recall:    0.8379
Precision: 0.9550
F1 Score:  0.8926

Model Performance Evaluation:
---  Model: Bagging  ---

Performance on Training data:
Accuracy:  0.9963
Recall:    0.9785
Precision: 0.9984
F1 Score:  0.9883

Performance on Validation data:
Accuracy:  0.9546
Recall:    0.8514
Precision: 0.8400
F1 Score:  0.8456

Performance on Test data:
Accuracy:  0.9539
Recall:    0.8063
Precision: 0.9067
F1 Score:  0.8536

Model Performance Evaluation:
---  Model: AdaBoost  ---

Performance on Training data:
Accuracy:  0.9654
Recall:    0.8708
Precision: 0.9100
F1 Score:  0.8899

Performance on Validation data:
Accuracy:  0.9546
Recall:    0.8514
Precision: 0.8400
F1 Score:  0.8456

Performance on Test data:
Accuracy:  0.9526
Recall:    0.8063
Precision: 0.8987
F1 Score:  0.8500

Model Performance Evaluation:
---  Model: XGBoost  ---

Performance on Training data:
Accuracy:  1.0000
Recall:    1.0000
Precision: 1.0000
F1 Score:  1.0000

Performance on Validation data:
Accuracy:  0.9684
Recall:    0.9324
Precision: 0.8625
F1 Score:  0.8961

Performance on Test data:
Accuracy:  0.9651
Recall:    0.8656
Precision: 0.9202
F1 Score:  0.8921

Examine the highest scores to identify the best model.

Note: If you want to include the XGBoost model in the final model selection, you need to add xgb_train.T to the training performance comparison list and xgb_val.T to the validation performance comparison list below.

To check the performance of your final model on the unseen test data, you can use the accuracy_score function from sklearn.metrics.
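For example, a minimal sketch (assuming the tuned XGBoost model above is the chosen final model, and X_test / y_test are the hold-out test split created earlier):

# Sketch only: score the chosen final model on the unseen test set.
from sklearn.metrics import accuracy_score, recall_score, f1_score

y_test_pred = tuned_xgb.predict(X_test)                       # predictions on the hold-out test data
print("Test accuracy:", accuracy_score(y_test, y_test_pred))
print("Test recall:  ", recall_score(y_test, y_test_pred))    # recall matters most for catching attriting customers
print("Test F1:      ", f1_score(y_test, y_test_pred))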

Feature Importances - Strategy Step 9¶

The code depends on which type of model was identified as the best model. Here is how to complete it for two common scenarios, to understand the process of extracting Feature Importance (related to the Design of Experiments):

In [ ]:
# Initialize the Gradient Boosting Classifier
tuned_gbm = GradientBoostingClassifier(random_state=1)

# Fit the model with training data
tuned_gbm.fit(X_train, y_train)

# Now you can access the feature importances
importances = tuned_gbm.feature_importances_  # Get the feature importances from the fitted model

# Get the feature names from the training set
feature_names = X_train.columns

# Sort the feature importances in ascending order
indices = np.argsort(importances)

# Plot the feature importances
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])  # Match feature names with importances
plt.xlabel("Relative Importance")
plt.show()

If our best model is a Gradient Boosting model (e.g., tuned_gbm1 or tuned_gbm2), we can access feature importance through the feature_importances_ attribute:

Explanation:

Replace tuned_gbm with the actual name of our best Gradient Boosting model.

The feature_importances_ attribute in Gradient Boosting models stores the importance scores for each feature.

For XGBoost models, the importance type reported by feature_importances_ is controlled by the estimator's importance_type parameter. Here, 'gain' is used, which measures a feature's average contribution to improving the objective across the splits where it is used.

The rest of the code remains the same:

indices = np.argsort(importances):

Sorts the feature importance scores in ascending order, so the most important features end up at the top of the horizontal bar plot.

The horizontal bar plot shows the sorted importance scores along the x-axis with the corresponding feature names along the y-axis.

This code snippet helps identify which features contribute most to our best model's predictions.
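For illustration, a minimal sketch of gain-based importances for an XGBoost model (assuming the scikit-learn wrapper XGBClassifier and the X_train used above; the name xgb_gain is illustrative):

# Sketch only: gain-based feature importances for an XGBoost model.
# importance_type='gain' makes feature_importances_ report average gain per split.
xgb_gain = XGBClassifier(random_state=1, eval_metric="logloss", importance_type="gain")
xgb_gain.fit(X_train, y_train)

importances = xgb_gain.feature_importances_  # one score per column of X_train
indices = np.argsort(importances)            # ascending, so the largest bars end up at the top

plt.figure(figsize=(12, 12))
plt.title("XGBoost Feature Importances (gain)")
plt.barh(range(len(indices)), importances[indices], color="skyblue", align="center")
plt.yticks(range(len(indices)), [X_train.columns[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()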

Side Note:

**Feature Importance** is closely related to the **Design of Experiments (DOE)**.

Both concepts are crucial for understanding the impact of different factors or variables on a given outcome. Here's how they relate:

Feature Importance:

  • Identifies the relative significance of different features in a machine learning model.
  • Helps determine which features are most influential in predicting the target variable.
  • Can be used to simplify models by removing less important features.

Design of Experiments:

  • A structured approach to planning and conducting experiments to investigate the effects of factors on a response variable.
  • Involves designing experiments to efficiently collect data and analyze the results.
  • Helps identify significant factors and their interactions.

Connection:

Factor Selection:

Feature Importance and DOE can both help identify the most influential factors in a system.

Feature importance can guide the selection of factors to include in experiments, while DOE can determine the optimal experimental design to study those factors.

Model Building:

Feature importance can be used to select relevant features for a machine learning model, while DOE can provide insights into the relationships between factors and the response variable, which can inform the model's structure.

Interpretation:

Both feature importance and DOE can help interpret the results of an analysis.

Feature importance can reveal which factors are most important for predicting the outcome, while DOE can help understand ***how*** different factors interact and influence the response.

In essence, Feature Importance can be seen as a way to analyze the results of experiments and identify the most influential factors, while DOE provides a framework for designing experiments to gather those data in the first place.

By combining Feature Importance with DOE, you can gain a deeper understanding of the underlying relationships between factors and the target variable, leading to more accurate and informative models.

**Business Insights and Conclusions**

  • Computational Time vs. Model Performance

    • More Hyperparameter Options: Searching over a large hyperparameter space can improve the model’s performance but at the cost of longer computation times.
    • Less Time, Fewer Hyperparameters: Reducing the hyperparameter search space (e.g., by using RandomizedSearchCV instead of GridSearchCV) can save time but might result in suboptimal performance.
  • Model Selection is time consuming but is a business investment to prevent financial losses or increase revenue.
  • Once a model is identified, it can be further tuned to balance computational resources against prediction efficiency.
  • The strongest and most important contributors that can predict and anticipate costly Credit Card Customer Attrition or Attrition_Flag are: (Top Ranking at the top)
    • Total_Trans_Ct (#19): Total Transaction Count in the last 12 months.
    • Total_Trans_Amt (#18): Total Transaction Amount (Last 12 months)
    • Total_Revolving_Bal (#15): Total Revolving Balance on the Credit Card
    • Total_Ct_Chng_Q4_Q1 (#20): Change in Transaction Count (Q4 over Q1)
    • Total_Relationship_Count (#11): Total no. of products held by the customer
  • (#nn) refers to the data column (feature) number; these features require constant monitoring and threshold-level flags for early detection.
  • Other important correlations identified are the following:
    • Credit_Limit (#14) together with Avg_Open_To_Buy (#16).
    • Months_on_book (#10) together with Customer_Age (#3)
    • Total_Revolving_Bal (#15) together with Avg_Utilization_Ratio (#21). Together, these represent customer churn risk factors whose thresholds should be monitored.

APPENDIX I: Addressing Bias-Variance and Misclassification - Model Comparisons (Pros & Cons) - Strategy Step 10 for Extra Credit¶


Imagine you're having a pizza party with your friends. You want to decide what toppings to put on the pizza.

Instead of asking each friend individually, you could divide your friends into groups and let each group decide on their own favorite toppings. Then, you could combine the results from all the groups to get a final decision.

This is kind of like what a Bagging Classifier does. It's like dividing your friends into groups (called "bootstrap samples") and letting each group (or "model") decide on its own answer. Then, it combines all the answers to get a final decision.

This helps to avoid making mistakes, because if one group (or model) makes a mistake, the other groups (or models) might be able to correct it.
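In scikit-learn terms, a minimal sketch of that idea (assuming the X_train / y_train and X_val / y_val splits used above):

# Sketch only: each of the 100 estimators is trained on its own bootstrap sample ("group of friends"),
# and the final prediction aggregates their individual votes.
from sklearn.ensemble import BaggingClassifier

bag = BaggingClassifier(n_estimators=100, bootstrap=True, random_state=1)
bag.fit(X_train, y_train)
print("Validation accuracy:", bag.score(X_val, y_val))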



Imagine you're trying to decide what to wear on a sunny day. You might ask your friends for advice.

A Random Forest Classifier is like asking a bunch of friends for advice and then taking a majority vote on what to wear. Each friend (or "decision tree") gives their own opinion, and the final decision is based on what most of them say.

This helps to avoid making mistakes because one friend might be wrong, but if most of them agree, it's probably a good choice.



Imagine you're trying to decide whether to go to the beach or stay home. You might consider factors like the weather, how crowded it might be, and how much time you have.

Logistic Regression is like weighing these factors and deciding whether to go to the beach or stay home based on the combined weight of all the factors. It's like putting each factor on a scale and seeing which side is heavier.

If the "go to the beach" side is heavier, the Logistic Regression would predict that you should go to the beach. If the "stay home" side is heavier, it would predict that you should stay home.



Imagine you're trying to decide whether to eat an apple or an orange. You might first check if the fruit is red. If it's red, you might then check if it's round or oblong.

A Decision Tree Classifier is like making a series of decisions based on different conditions. If the fruit is red, it might go to the "red fruit" branch. Then, if it's round, it might go to the "round red fruit" branch, which might lead to the decision to eat an apple.

It's like creating a tree-like structure with branches for different conditions and decisions at the end of each branch.



Imagine you're trying to learn how to ride a bike. You start with a basic bike and learn the basics. Then, you get a slightly harder bike with training wheels. You keep practicing and getting better, and eventually, you can ride a bike without training wheels.

Gradient Boosting is like that. It starts with a simple model (like a bike with training wheels) and gradually improves it by learning from its mistakes. It's like adding more training wheels and making the bike harder to ride until you're a pro!



Imagine you're trying to learn a new language. You start by learning basic words and phrases. Then, you try to speak the language with native speakers. If you make mistakes, they correct you and help you improve.

AdaBoost is like that. It starts with a simple model (like learning basic words and phrases). Then, it tries to use the model to make predictions. If it makes mistakes, it gives more weight to the examples where it made mistakes, just like your language teacher might focus on helping you with words you find difficult. This helps the model improve over time.



Imagine you're trying to learn how to play chess. You start by learning the basic rules. Then, you play against a friend who's a bit better than you. You learn from your mistakes and get better at playing.

XGBClassifier is like that. It starts with a simple model (like learning the basic rules of chess). Then, it plays against a "stronger" version of itself. It learns from its mistakes and gets better at making predictions. It's like playing chess against a grandmaster and getting better with each game.


Examples where certain models are applicable for predicting outcomes, with the relevant performance metrics:

Healthcare

  • Decision Tree Classifier: Predicting the likelihood of a patient developing a disease based on various medical factors.
    • Recall is crucial to minimize false negatives (misdiagnosing patients).
  • Random Forest Classifier: Identifying fraudulent insurance claims by analyzing patterns in patient data.
    • Precision is important to avoid false positives (flagging legitimate claims as fraudulent).
  • Gradient Boosting Classifier: Predicting patient outcomes in critical care units using electronic health records.
    • F1-score is a balanced metric that considers both precision and recall, which are both important in critical care settings.

Finance

  • Logistic Regression: Predicting customer churn in banking based on factors like account activity and demographics.
    • Recall is important to minimize false negatives (failing to identify customers at risk of churning).
  • AdaBoostClassifier: Detecting fraudulent credit card transactions by analyzing patterns in transaction data.
    • Precision is important to avoid false positives (flagging legitimate transactions as fraudulent).
  • XGBClassifier: Predicting loan default risk based on borrower characteristics and financial history.
    • F1-score is a balanced metric that considers both precision and recall, which are both important in risk assessment.

Retail

  • Bagging Classifier: Recommending products to customers based on their purchase history and preferences.
    • Recall is important to avoid missing out on potential sales opportunities.
  • XGBClassifier: Predicting customer lifetime value to identify high-value customers.
    • Precision is important to avoid targeting low-value customers with marketing efforts.
  • Gradient Boosting Classifier: Optimizing inventory management by forecasting product demand.
    • F1-score is a balanced metric that considers both precision and recall, which are both important in inventory management.
In [ ]:
# get_metrics_score function can be defined as shown below:

from sklearn.metrics import accuracy_score, recall_score, precision_score

def get_metrics_score(model, X_train, y_train, X_test, y_test, return_train_score=False):
    """
    Calculate accuracy, precision, and recall for a given model.

    :param model: Trained model
    :param X_train: Training data
    :param y_train: True labels for training data
    :param X_test: Test data
    :param y_test: True labels for test data
    :param return_train_score: If True, calculate metrics for training data as well
    :return: List of accuracy, precision, and recall for train and test sets
    """
    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Metrics for training data
    accuracy_train = accuracy_score(y_train, y_pred_train) if return_train_score else None
    precision_train = precision_score(y_train, y_pred_train, average='weighted') if return_train_score else None
    recall_train = recall_score(y_train, y_pred_train, average='weighted') if return_train_score else None

    # Metrics for test data
    accuracy_test = accuracy_score(y_test, y_pred_test)
    precision_test = precision_score(y_test, y_pred_test, average='weighted')
    recall_test = recall_score(y_test, y_pred_test, average='weighted')

    return [accuracy_train, accuracy_test, precision_train, precision_test, recall_train, recall_test]

To use the get_metrics_score function with a list of models and collect metrics such as accuracy, precision, and recall, you can follow this structure:

1.  Ensure that models is a list of trained models.
2.  Call get_metrics_score for each model in the list.
3.  Store and process the metrics.

Here’s how you can modify and use the code properly:

Explanation:

1.  Initialize Lists: Create empty lists (acc_test, precision_test, recall_test) to store the metrics for each model.
2.  Loop Through Models: Iterate over each model in the models list.
3.  Calculate Metrics: Call get_metrics_score to get the metrics for each model. Metrics for the test set are extracted from the returned list:
•   metrics[1] corresponds to test accuracy.
•   metrics[3] corresponds to test precision.
•   metrics[5] corresponds to test recall.
4.  Round and Append: Use np.round() to round the metrics to two decimal places and append them to the respective lists.
5.  Print Metrics: Optionally, print the metrics to review the results.

Make sure that X_train, y_train, X_test, and y_test are defined and correctly preprocessed before using this code. Also, replace model1, model2, model3, etc., with your actual trained models.
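A minimal sketch of that plain collection loop (before any highlighting; model1, model2, model3 are placeholders for your trained models):

# Sketch only: collect rounded test-set metrics for each trained model.
import numpy as np

models = [model1, model2, model3]  # placeholders - replace with your trained models
acc_test, precision_test, recall_test = [], [], []

for model in models:
    metrics = get_metrics_score(model, X_train, y_train, X_test, y_test, return_train_score=False)
    acc_test.append(np.round(metrics[1], 2))        # metrics[1] -> test accuracy
    precision_test.append(np.round(metrics[3], 2))  # metrics[3] -> test precision
    recall_test.append(np.round(metrics[5], 2))     # metrics[5] -> test recall

print(acc_test, precision_test, recall_test)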

You can modify the code to highlight the highest values in bold and yellow when printing them, and also list the highest value together with the corresponding model. To achieve this, you can use ANSI escape codes for terminal text formatting. Here’s how you can do it:

Modified Code:

In [ ]:
import numpy as np

# Define ANSI escape codes for text formatting
BOLD_YELLOW = '\033[1;33m'
RESET = '\033[0m'

# Example usage of the get_metrics_score function
models = [model1, model2, model3]  # Replace with your list of trained models
model_names = ['Model 1', 'Model 2', 'Model 3']  # Replace with actual model names or identifiers
acc_test = []
precision_test = []
recall_test = []

# Store metrics in a list of tuples for comparison
metrics_list = []

# Loop through each model
for model, name in zip(models, model_names):
    # Get metrics for the current model
    metrics = get_metrics_score(model, X_train, y_train, X_test, y_test, return_train_score=False)

    # Append rounded metrics and model name to the metrics_list
    acc_test_value = np.round(metrics[1], 2)
    precision_test_value = np.round(metrics[3], 2)
    recall_test_value = np.round(metrics[5], 2)

    acc_test.append(acc_test_value)
    precision_test.append(precision_test_value)
    recall_test.append(recall_test_value)

    metrics_list.append((acc_test_value, precision_test_value, recall_test_value, name))

# Find the highest values and corresponding models
max_acc = max(acc_test)
max_precision = max(precision_test)
max_recall = max(recall_test)

# Highlight highest values in bold and yellow
# Iterate over each metric and its per-model values
for metric, values in zip(['Accuracy', 'Precision', 'Recall'],
                          [acc_test, precision_test, recall_test]):
    max_value = max(values)
    print(f"{metric} Values:")
    for value, model in zip(values, model_names):
        if value == max_value:
            print(f"{BOLD_YELLOW}{value} (Highest - {metric}){RESET} - {model}")
        else:
            print(f"{value} - {model}")

print("\nHighest values with corresponding models:")

# Print the highest values and the corresponding model
for acc, precision, recall, model_name in metrics_list:
    if acc == max_acc:
        print(f"Highest Test Accuracy: {BOLD_YELLOW}{acc}{RESET} - {model_name}")
    if precision == max_precision:
        print(f"Highest Test Precision: {BOLD_YELLOW}{precision}{RESET} - {model_name}")
    if recall == max_recall:
        print(f"Highest Test Recall: {BOLD_YELLOW}{recall}{RESET} - {model_name}")

Explanation:

1.  ANSI Escape Codes:
•   BOLD_YELLOW and RESET are used to format text with bold and yellow color in the terminal.
2.  Metrics Collection:
•   Store the metrics and corresponding model names in metrics_list for easy comparison.
3.  Find Maximum Values:
•   Use max() to find the highest values for accuracy, precision, and recall.
4.  Highlight and Print Values:
•   Iterate through each metric to compare and highlight the highest values.
•   Print the highest values with formatting.
5.  Print Highest Values with Models:
•   Compare the highest values against each model’s metrics to list the highest value and its corresponding model.

Notes:

•   This code assumes that you are running it in a terminal or environment that supports ANSI escape codes for text formatting.
•   Replace model1, model2, model3, etc., with your actual models and model_names with descriptive names or identifiers for the models.

APPENDIX II - Improving Model Performance¶

For tuning and improving model performance, you can consider adding several other models and techniques to your list. These will give you more diverse options to compare and select the best model. Here are some additional models and techniques you could consider:

Additional Models:

1.  Random Forest:
      • Try using Random Forest classifiers with the original, oversampled, and undersampled data. Random Forest can provide better generalization by reducing overfitting compared to Decision Trees.
    •   Variants:
      • Random Forest with original data.
      • Random Forest with oversampled data.
      • Random Forest with undersampled data.
2.  Logistic Regression:
      • Logistic Regression with regularization (L2 or L1 penalty) can be a strong baseline model.
    •   Variants:
      • Logistic Regression with original data.
      • Logistic Regression with oversampled data.
      • Logistic Regression with undersampled data.
3.  Support Vector Machine (SVM):
      • Consider tuning an SVM model for classification. It can be useful for high-dimensional data, although it can be slow with large datasets.
    •   Variants:
      • SVM with original data.
      • SVM with oversampled data.
      • SVM with undersampled data.
4.  LightGBM (LGBM):
      • LightGBM is an efficient gradient boosting algorithm, often faster than XGBoost, especially with larger datasets. It handles categorical features well.
    •   Variants:
      • LightGBM with original data.
      • LightGBM with undersampled data.
5.  CatBoost:
      • Another gradient boosting algorithm that is highly optimized for categorical features and often performs well with little tuning.
    •   Variants:
      • CatBoost with original data.
      • CatBoost with undersampled data.
6.  Ensemble Voting Classifier:
      • Combine multiple models using a voting classifier (e.g., combining Decision Trees, Logistic Regression, Random Forest, and XGBoost). This can help leverage the strengths of multiple models.
7.  Stacking Classifier:
      • Use a stacking model to combine predictions from multiple models (e.g., Decision Tree, AdaBoost, Gradient Boosting, etc.) and feed them into a meta-learner (e.g., Logistic Regression) for the final prediction.
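A minimal sketch of the voting and stacking ensembles described in items 6 and 7 (assuming the training and validation splits used earlier in the notebook; the names voting_clf and stacking_clf are illustrative):

# Sketch only: soft-voting and stacking ensembles built from base models already used above.
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

base_estimators = [
    ("dt", DecisionTreeClassifier(random_state=1)),
    ("rf", RandomForestClassifier(random_state=1)),
    ("lr", LogisticRegression(random_state=1, max_iter=1000)),
]

voting_clf = VotingClassifier(estimators=base_estimators, voting="soft")
stacking_clf = StackingClassifier(estimators=base_estimators,
                                  final_estimator=LogisticRegression(max_iter=1000))

for ens_name, ens in [("Voting", voting_clf), ("Stacking", stacking_clf)]:
    ens.fit(X_train, y_train)
    print(ens_name, "validation accuracy:", ens.score(X_val, y_val))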

Additional Techniques:

1.  Hyperparameter Tuning:
      • Use GridSearchCV or RandomizedSearchCV to perform hyperparameter tuning for each model. This can significantly improve performance.
2.  Cross-Validation:
      • Make sure to use cross-validation (e.g., K-fold cross-validation) to get more reliable estimates of model performance and avoid overfitting.
3.  Feature Engineering:
      • Experiment with creating new features (e.g., interaction terms or derived variables), scaling features (e.g., using StandardScaler or MinMaxScaler), or reducing dimensionality (e.g., using PCA).
4.  Feature Selection:
      • Use techniques like Recursive Feature Elimination (RFE) or L1-based feature selection to improve model performance by removing irrelevant features.
5.  Class Imbalance Techniques:
      • In addition to oversampling (SMOTE) and undersampling, you can explore hybrid methods like SMOTEENN or SMOTETomek, which combine both approaches.
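A minimal sketch of a hybrid resampler (assuming imbalanced-learn is installed and X_train / y_train are the training split used above):

# Sketch only: SMOTETomek first oversamples the minority class with SMOTE,
# then removes Tomek links to clean up the class boundary.
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=1)
X_train_hybrid, y_train_hybrid = smt.fit_resample(X_train, y_train)

print("After SMOTETomek, counts of label 'Yes':", sum(y_train_hybrid == 1))
print("After SMOTETomek, counts of label 'No': ", sum(y_train_hybrid == 0))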

By including these additional models and techniques in your analysis, you can comprehensively compare results and select the best-performing model for predicting customer churn for Thera bank.

APPENDIX III - Hyperparameter Tuning Trade-offs¶

When performing model hyperparameter tuning, there are several trade-offs to consider. These trade-offs impact model performance, time, and generalization ability. Here are the key trade-offs to be aware of:

  1. Bias-Variance Trade-off

    • High Bias (Underfitting): If the model is too simple or has poor hyperparameters, it might not capture the underlying patterns in the data, leading to underfitting. This results in low training and validation performance.
    • High Variance (Overfitting): If the model is too complex or over-tuned on the training data, it might memorize the training data, causing it to perform well on the training set but poorly on unseen data (validation/test set). This leads to overfitting.

Consideration:

•   Finding the right balance between bias and variance is key during tuning. Regularization parameters (e.g., alpha, lambda in Lasso or Ridge, max_depth in trees) can help control this balance.

  2. Computational Time vs. Model Performance

    • More Hyperparameter Options: Searching over a large hyperparameter space can improve the model’s performance but at the cost of longer computation times.
    • Less Time, Fewer Hyperparameters: Reducing the hyperparameter search space (e.g., by using RandomizedSearchCV instead of GridSearchCV) can save time but might result in suboptimal performance.

Consideration:

•   Use RandomizedSearchCV for faster searches in large hyperparameter spaces and GridSearchCV for exhaustive searches when time is not a critical factor.
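For illustration, a minimal sketch of the two search styles over the same illustrative grid (the parameter values here are examples, not the ones tuned above):

# Sketch only: exhaustive vs. randomized search over the same illustrative grid.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier

param_grid = {"n_estimators": [50, 100, 150], "learning_rate": [0.01, 0.05, 0.1], "max_depth": [2, 3, 4]}

grid_cv = GridSearchCV(GradientBoostingClassifier(random_state=1),
                       param_grid, scoring="recall", cv=5)        # fits all 27 combinations
rand_cv = RandomizedSearchCV(GradientBoostingClassifier(random_state=1),
                             param_grid, n_iter=10, scoring="recall", cv=5, random_state=1)  # samples only 10

# grid_cv.fit(X_train, y_train)   # exhaustive, slower
# rand_cv.fit(X_train, y_train)   # faster, possibly slightly suboptimal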

  3. Cross-Validation: Number of Folds

    • More Folds (e.g., K=10): Provides more robust performance estimates by evaluating the model across multiple splits, but it increases computation time since the model is trained K times.
    • Fewer Folds (e.g., K=3 or 5): Reduces computation time but may lead to less reliable performance estimates.

Consideration:

•   Use more folds (like 10) for small datasets or when accuracy is critical, but fewer folds for large datasets to speed up tuning.

  4. Complexity of the Model vs. Interpretability

    • Complex Models (e.g., XGBoost, Neural Networks): These models might provide better predictive performance but are harder to interpret.
    • Simpler Models (e.g., Logistic Regression, Decision Trees): Easier to interpret and explain, but they may not capture complex relationships in the data as effectively.

Consideration:

•   Choose complex models when prediction accuracy is paramount, and simpler models when interpretability is a key concern.

  5. Generalization vs. Optimizing on Training Data

    • Over-Tuning on Training Data: Over-tuning hyperparameters might lead to a model that fits the training data too well, causing poor generalization to new data.
    • Generalization: A model that generalizes well performs consistently on training, validation, and test sets without excessive tuning.

Consideration:

•   Use early stopping or validation scores to prevent overfitting. Focus on tuning hyperparameters that help generalize the model rather than maximize training performance.
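A minimal sketch of early stopping with XGBoost (assuming the X_train / y_train and X_val / y_val splits used above; depending on the installed xgboost version, early_stopping_rounds may need to be passed to fit() instead of the constructor):

# Sketch only: stop adding trees once validation logloss has not improved for 20 rounds.
xgb_es = XGBClassifier(
    n_estimators=500,            # upper bound; early stopping usually halts well before this
    learning_rate=0.05,
    eval_metric="logloss",
    early_stopping_rounds=20,    # recent xgboost versions accept this in the constructor
    random_state=1,
)
xgb_es.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
print("Best iteration:", xgb_es.best_iteration)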

  6. Regularization Strength

    • High Regularization: Can prevent overfitting by penalizing large coefficients but may lead to underfitting if the penalty is too strong.
    • Low Regularization: Allows the model to fit the data more flexibly but can lead to overfitting.

Consideration:

•   Regularization parameters like alpha (Lasso) or C (Logistic Regression) need careful tuning to ensure the right balance between flexibility and overfitting.

  7. Hyperparameter Interaction

    • Dependent Hyperparameters: Some hyperparameters interact with others. For instance, in tree-based models, max_depth and min_samples_split work together, so tuning them independently might not give the best results.
    • Independent Hyperparameters: Hyperparameters that can be tuned independently can speed up the process.

Consideration:

•   Consider the interaction between hyperparameters when designing a grid or random search space.

  8. Training Data Size vs. Model Complexity

    • Larger Training Data: Allows for more complex models and reduces overfitting, but the time required for training increases significantly.
    • Smaller Training Data: Faster training but more prone to overfitting, especially with complex models.

Consideration:

•   Use techniques like cross-validation and early stopping to handle smaller datasets efficiently while tuning.

  9. Evaluation Metrics Trade-off

    • Accuracy vs. Precision/Recall: Depending on the problem (e.g., imbalanced datasets), focusing on accuracy might not be the best strategy. You may want to optimize for precision, recall, or F1 score.
    • Multiple Metrics: Optimizing one metric (e.g., recall) can negatively impact another (e.g., precision), so a balance is needed based on the problem requirements.

Consideration:

•   Choose the evaluation metric(s) based on the business objective. For example, use precision/recall for imbalanced data (like churn prediction) and accuracy for balanced data.

By balancing these trade-offs carefully, hyperparameter tuning can greatly improve model performance while avoiding pitfalls like overfitting and excessive computational costs.

APPENDIX IV - Code Used as a Reference, with Incremental Improvements¶

In [ ]:
# Model Performance Scores - Code used for reference - no need to run

# Checking recall score on train and validation set
print("Recall on train and validation set")
print(recall_score(y_train, rf.predict(X_train)))
print(recall_score(y_val, rf.predict(X_val)))

# Single line:
# Checking Recall score on train and validation set
recall_train = recall_score(y_train, rf.predict(X_train))
recall_val = recall_score(y_val, rf.predict(X_val))
print(f"Recall on train set: {precision_train}, Recall on validation set: {recall_val}")

print("-" * 30)

# Checking Precision score on train and validation set
print("Precision on train and validation set")
print(precision_score(y_train, rf.predict(X_train)))
print(precision_score(y_val, rf.predict(X_val)))

# Single line:
# Checking Precision score on train and validation set
precision_train = precision_score(y_train, rf.predict(X_train))
precision_val = precision_score(y_val, rf.predict(X_val))
print(f"Precision on train set: {precision_train}, Precision on validation set: {precision_val}")

print("-" * 30)

# Checking Accuracy score on train and validation set
print("Accuracy on train and validation set")
print(accuracy_score(y_train, rf.predict(X_train)))
print(accuracy_score(y_val, rf.predict(X_val)))

# Single line:
# Checking Accuracy score on train and validation set
accuracy_train = accuracy_score(y_train, rf.predict(X_train))
accuracy_val = accuracy_score(y_val, rf.predict(X_val))
print(f"Precision on train set: {accuracy_train}, Precision on validation set: {accuracy_val}")

print("-" * 30)

# Checking F1 score on train and validation set
print("F1 on train and validation set")
print(f1_score(y_train, rf.predict(X_train)))
print(f1_score(y_val, rf.predict(X_val)))

# Single line:
# Checking F1 score on train and validation set
f1_train = f1_score(y_train, rf.predict(X_train))
f1_val = f1_score(y_val, rf.predict(X_val))
print(f"Precision on train set: {f1_train}, Precision on validation set: {f1_val}")

print("\033[1;92mModel Performance Evaluation:\033[0m")
print(f"Model: {model.__class__.__name__}")  # Prints the class name of the model
model_performance_classification_sklearn(model, X_train, y_train) # Calls pre-defined function that displays performance metrics
Recall on train and validation set
1.0
0.7432432432432432
Recall on train set: 1.0, Recall on validation set: 0.7432432432432432
------------------------------
Precision on train and validation set
1.0
0.9166666666666666
Precision on train set: 1.0, Precision on validation set: 0.9166666666666666
------------------------------
Accuracy on train and validation set
1.0
0.9526627218934911
Accuracy on train set: 1.0, Accuracy on validation set: 0.9526627218934911
------------------------------
F1 on train and validation set
1.0
0.8208955223880596
F1 on train set: 1.0, F1 on validation set: 0.8208955223880596
Model Performance Evaluation:
Model: XGBClassifier
Out[ ]:
Accuracy Recall Precision F1
0 1.000 1.000 1.000 1.000
In [ ]:
# ----------------------------- BASELINE VERSION WITH FULL EXPLANATIONS - For reference - No need to execute - Useful for building up the code

# Import SMOTE from imblearn
from imblearn.over_sampling import SMOTE

# ANSI escape codes for bold and yellow, original data showing minority data.
print("Before Oversampling, counts of label 'Yes': \033[1;33m{}\033[0m ".format(sum(y_train == 1)))
print("Before Oversampling, counts of label 'No': {} \n".format(sum(y_train == 0)))

'''
The code below effectively addresses class imbalance by generating synthetic data points
for the minority class, ensuring a more balanced distribution between the classes.
This can be beneficial for improving the performance of machine learning models,
especially when dealing with imbalanced datasets.

'''
# ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
# SMOTE: Synthetic Minority Over-sampling Technique
sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)
'''
    sampling_strategy=1:
    This indicates that the minority class should be oversampled to have an
    equal number of samples as the majority class.
    k_neighbors=5:
    This specifies the number of neighbors used to generate new
    synthetic data points.
    random_state=1:
    This sets a random seed for reproducibility.
'''
# Oversampled data - minority class increased to match majority.
# Code adjusts the training dataset so that the minority and majority classes in y_train are more balanced,
# improving model performance in cases where class imbalance is a problem (e.g., our churn prediction).
# ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)

# ANSI escape codes for bold and yellow, oversampled data.
print("After Oversampling, counts of label 'Yes': \033[1;33m{}\033[0m ".format(sum(y_train_over == 1)))
print("After Oversampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))

# ANSI escape codes for bold and yellow, oversampled data shape.
print("After Oversampling, the shape of X_train: \033[1;33m{}\033[0m ".format(X_train_over.shape))
print("After Oversampling, the shape of y_train: {}\n".format(y_train_over.shape))