Missing values are common in many real-life datasets. They can bias the results of machine learning models and/or reduce their accuracy. Missing value imputation (MVI) is the most common way of dealing with incomplete datasets. In general, MVI is a process in which statistical or machine learning techniques are used to replace missing data with substituted values. However, the main limitation of statistical imputation with measures of central tendency (e.g., the mean) is that it leads to biased estimates of variance and covariance. Machine learning techniques therefore emerged as an alternative that overcomes these weaknesses of statistical methods. One such machine-learning-based imputation technique is MICE.
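As a quick illustration of that bias, here is a minimal sketch on made-up data (the variable `x` and all numbers are purely illustrative, not from the dataset used below) showing how mean imputation shrinks a column's variance; the same mechanism also distorts covariances with other columns:

```python
import numpy as np
import pandas as pd

# made-up numeric column with 20% of its values knocked out at random
rng = np.random.default_rng(0)
x = pd.Series(rng.normal(loc=50, scale=10, size=1000))
x[rng.choice(1000, size=200, replace=False)] = np.nan

# mean imputation: every hole gets the same value, so the spread
# of the column is artificially reduced
x_mean = x.fillna(x.mean())

print(x.var())       # variance of the observed values (NaN skipped)
print(x_mean.var())  # noticeably smaller after mean imputation
```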
MICE (Multiple Imputation by Chained Equations) is a multiple imputation method recognized as a leading strategy for imputing missing data, thanks to its ease of implementation and its ability to maintain unbiased effect estimates and valid inference (Ref). In short, MICE performs multiple regressions to impute missing values.
MICE is a method of multiple imputation in which missing values are filled in several times to create complete datasets. It works under the assumption that the data are Missing at Random (MAR), meaning that the probability of a value being missing is related to the observed data but not to the missing values themselves. The imputation process can therefore draw on information from the other observed columns. The MICE algorithm works by running a sequence of regression models in which each missing value is modeled conditionally on the observed (non-missing) values.
The main characteristic of MICE is that it performs multiple imputations using a chained-equations approach. Multiple imputation is able to account for the statistical uncertainty in the imputations, while the chained-equations approach is very flexible and can handle variables of various types (e.g., continuous or binary).
The following are the steps of the MICE technique:
Step 1: All missing values are initially filled in using simple statistical methods (e.g., the mean for numeric columns, the mode for categorical ones). These imputations act as "placeholders" (temporary values).
Step 2: One column at a time, the placeholders are set back to missing (NA), starting with the variable ("var") that has the fewest missing values.
Step 3: "var" becomes the dependent variable in a regression/classification model, and all other variables are the independent variables.
Step 4: The missing values of "var" are then replaced with predictions (imputations) from that model. When "var" is subsequently used as an independent variable in the model for another variable, both its observed and imputed values are used.
Step 5: Move on to the variable with the next fewest missing values, repeating steps 2–4 for each variable that has missing data. Cycling through every such variable once constitutes one iteration or "cycle." At the end of a cycle, all missing values have been replaced with predictions that reflect the relationships observed in the data. The idea is that, over repeated cycles, the parameters governing the imputations (e.g., the regression coefficients) should converge, in the sense of becoming stable. A minimal code sketch of one such cycle is shown below.
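To make the cycle concrete, here is a simplified, illustrative sketch (the helper `one_mice_cycle` is hypothetical, not part of the original analysis) of one chained-equations pass over a fully numeric DataFrame, using plain linear regression as the conditional model. Real MICE implementations add more machinery, such as posterior draws and several completed datasets:

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

def one_mice_cycle(df):
    """One simplified chained-equations cycle over a numeric DataFrame."""
    df = df.copy()
    na_mask = df.isna()            # remember where the holes are (Step 2)
    df = df.fillna(df.mean())      # Step 1: placeholder (mean) imputation

    # visit variables in order of ascending number of missing values
    for col in na_mask.sum().sort_values().index:
        if na_mask[col].sum() == 0:
            continue
        others = df.columns.drop(col)
        model = LinearRegression()
        # Step 3: fit on rows where `col` was actually observed
        model.fit(df.loc[~na_mask[col], others], df.loc[~na_mask[col], col])
        # Step 4: overwrite the placeholders with model predictions
        df.loc[na_mask[col], col] = model.predict(df.loc[na_mask[col], others])
    return df
```

Repeating this cycle until the imputations stabilize is exactly what sklearn's IterativeImputer, used later in this post, automates.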
Given this design, MICE has the potential to produce imputed values that are close to the original values. However, examining the strength of the technique on data where the truth is known is important for understanding its characteristics.
# pandas and numpy
import pandas as pd
import numpy as np
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
# random sampling (for masking values later)
import random
# scikit-learn: enable_iterative_imputer must be imported before IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
The data is taken from Kaggle: https://www.kaggle.com/code/nezarabdilahprakasa/resign-prediction-accuracy-92/data?select=Train.csv
# read data
household = pd.read_csv("data/Train.csv", index_col=0)
household.head()
| Gender | Age | Education_Level | Relationship_Status | Hometown | Unit | Decision_skill_possess | Time_of_service | Time_since_promotion | growth_rate | ... | Compensation_and_Benefits | Work_Life_balance | VAR1 | VAR2 | VAR3 | VAR4 | VAR5 | VAR6 | VAR7 | Attrition_rate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Employee_ID | |||||||||||||||||||||
| EID_23371 | F | 42.0 | 4 | Married | Franklin | IT | Conceptual | 4.0 | 4 | 33 | ... | type2 | 3.0 | 4 | 0.7516 | 1.8688 | 2.0 | 4 | 5 | 3 | 0.1841 |
| EID_18000 | M | 24.0 | 3 | Single | Springfield | Logistics | Analytical | 5.0 | 4 | 36 | ... | type2 | 4.0 | 3 | -0.9612 | -0.4537 | 2.0 | 3 | 5 | 3 | 0.0670 |
| EID_3891 | F | 58.0 | 3 | Married | Clinton | Quality | Conceptual | 27.0 | 3 | 51 | ... | type2 | 1.0 | 4 | -0.9612 | -0.4537 | 3.0 | 3 | 8 | 3 | 0.0851 |
| EID_17492 | F | 26.0 | 3 | Single | Lebanon | Human Resource Management | Behavioral | 4.0 | 3 | 56 | ... | type2 | 1.0 | 3 | -1.8176 | -0.4537 | NaN | 3 | 7 | 3 | 0.0668 |
| EID_22534 | F | 31.0 | 1 | Married | Springfield | Logistics | Conceptual | 5.0 | 4 | 62 | ... | type3 | 3.0 | 1 | 0.7516 | -0.4537 | 2.0 | 2 | 8 | 2 | 0.1827 |
5 rows × 23 columns
# check the data types
household.dtypes
Gender                        object
Age                          float64
Education_Level                int64
Relationship_Status           object
Hometown                      object
Unit                          object
Decision_skill_possess        object
Time_of_service              float64
Time_since_promotion           int64
growth_rate                    int64
Travel_Rate                    int64
Post_Level                     int64
Pay_Scale                    float64
Compensation_and_Benefits     object
Work_Life_balance            float64
VAR1                           int64
VAR2                         float64
VAR3                         float64
VAR4                         float64
VAR5                           int64
VAR6                           int64
VAR7                           int64
Attrition_rate               float64
dtype: object
# identify the object-type columns to convert to category
cat = household.select_dtypes('object').columns
cat
Index(['Gender', 'Relationship_Status', 'Hometown', 'Unit',
'Decision_skill_possess', 'Compensation_and_Benefits'],
dtype='object')
# Adjust data type
household[cat] = household[cat].astype('category')
household.dtypes
Gender                       category
Age                           float64
Education_Level                 int64
Relationship_Status          category
Hometown                     category
Unit                         category
Decision_skill_possess       category
Time_of_service               float64
Time_since_promotion            int64
growth_rate                     int64
Travel_Rate                     int64
Post_Level                      int64
Pay_Scale                     float64
Compensation_and_Benefits    category
Work_Life_balance             float64
VAR1                            int64
VAR2                          float64
VAR3                          float64
VAR4                          float64
VAR5                            int64
VAR6                            int64
VAR7                            int64
Attrition_rate                float64
dtype: object
household.head()
| Gender | Age | Education_Level | Relationship_Status | Hometown | Unit | Decision_skill_possess | Time_of_service | Time_since_promotion | growth_rate | ... | Compensation_and_Benefits | Work_Life_balance | VAR1 | VAR2 | VAR3 | VAR4 | VAR5 | VAR6 | VAR7 | Attrition_rate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Employee_ID | |||||||||||||||||||||
| EID_23371 | F | 42.0 | 4 | Married | Franklin | IT | Conceptual | 4.0 | 4 | 33 | ... | type2 | 3.0 | 4 | 0.7516 | 1.8688 | 2.0 | 4 | 5 | 3 | 0.1841 |
| EID_18000 | M | 24.0 | 3 | Single | Springfield | Logistics | Analytical | 5.0 | 4 | 36 | ... | type2 | 4.0 | 3 | -0.9612 | -0.4537 | 2.0 | 3 | 5 | 3 | 0.0670 |
| EID_3891 | F | 58.0 | 3 | Married | Clinton | Quality | Conceptual | 27.0 | 3 | 51 | ... | type2 | 1.0 | 4 | -0.9612 | -0.4537 | 3.0 | 3 | 8 | 3 | 0.0851 |
| EID_17492 | F | 26.0 | 3 | Single | Lebanon | Human Resource Management | Behavioral | 4.0 | 3 | 56 | ... | type2 | 1.0 | 3 | -1.8176 | -0.4537 | NaN | 3 | 7 | 3 | 0.0668 |
| EID_22534 | F | 31.0 | 1 | Married | Springfield | Logistics | Conceptual | 5.0 | 4 | 62 | ... | type3 | 3.0 | 1 | 0.7516 | -0.4537 | 2.0 | 2 | 8 | 2 | 0.1827 |
5 rows × 23 columns
# correlation matrix (numeric columns only, which silences the numeric_only FutureWarning)
household.corr(numeric_only=True)
# correlation heatmap
plt.figure(figsize=(15,10))
sns.heatmap(household.corr(numeric_only=True), annot=True)
[Figure: correlation heatmap of the numeric columns, annotated with the correlation values]
The heatmap shows that all correlations are low, except between Age and Time_of_service.
household.isnull().sum()
Gender                         0
Age                          412
Education_Level                0
Relationship_Status            0
Hometown                       0
Unit                           0
Decision_skill_possess         0
Time_of_service              144
Time_since_promotion           0
growth_rate                    0
Travel_Rate                    0
Post_Level                     0
Pay_Scale                      9
Compensation_and_Benefits      0
Work_Life_balance             11
VAR1                           0
VAR2                         577
VAR3                           0
VAR4                         656
VAR5                           0
VAR6                           0
VAR7                           0
Attrition_rate                 0
dtype: int64
# drop rows with missing values, keeping an explicit copy we can modify safely
household_full = household.dropna().copy()
household_full.isna().sum()
Gender                       0
Age                          0
Education_Level              0
Relationship_Status          0
Hometown                     0
Unit                         0
Decision_skill_possess       0
Time_of_service              0
Time_since_promotion         0
growth_rate                  0
Travel_Rate                  0
Post_Level                   0
Pay_Scale                    0
Compensation_and_Benefits    0
Work_Life_balance            0
VAR1                         0
VAR2                         0
VAR3                         0
VAR4                         0
VAR5                         0
VAR6                         0
VAR7                         0
Attrition_rate               0
dtype: int64
The categorical column Decision_skill_possess will be tracked so we can check how its values change after imputation.
# check unique values of Gender
#household_full.Gender.unique()
# encode with cat codes
#household_full['Gender'] = household_full['Gender'].cat.codes
def cat_codes(df):
    # label-encode every category column in place using its category codes
    for col in df.select_dtypes(include='category').columns:
        df[col] = df[col].cat.codes
# call the function of cat_codes
cat_codes(household_full)
household_mv = household_full.copy()
random.seed(123)
# every (row, column) position in the DataFrame
ix = [(row, col) for row in range(household_mv.shape[0]) for col in range(household_mv.shape[1])]
# sample 10% of all positions and knock out only those falling in columns
# 6, 7, and 14 (Decision_skill_possess, Time_of_service, Work_Life_balance)
for row, col in random.sample(ix, int(round(.1*len(ix)))):
    if (col == 6) | (col == 7) | (col == 14):
        household_mv.iat[row, col] = np.nan
household_mv.isnull().sum()
Gender                         0
Age                            0
Education_Level                0
Relationship_Status            0
Hometown                       0
Unit                           0
Decision_skill_possess       536
Time_of_service              529
Time_since_promotion           0
growth_rate                    0
Travel_Rate                    0
Post_Level                     0
Pay_Scale                      0
Compensation_and_Benefits      0
Work_Life_balance            558
VAR1                           0
VAR2                           0
VAR3                           0
VAR4                           0
VAR5                           0
VAR6                           0
VAR7                           0
Attrition_rate                 0
dtype: int64
# copy the masked data for the MICE experiment
household_mice = household_mv.copy()
# initiate the imputer
mice_imputer = IterativeImputer()
# imputation (fit_transform returns a numpy array, so rebuild the DataFrame)
household_mice = pd.DataFrame(mice_imputer.fit_transform(household_mice), columns=household_mice.columns)
# round the three imputed columns back to whole numbers
household_mice[['Decision_skill_possess', 'Time_of_service', 'Work_Life_balance']] = household_mice[['Decision_skill_possess', 'Time_of_service', 'Work_Life_balance']].round()
household_mice
| Gender | Age | Education_Level | Relationship_Status | Hometown | Unit | Decision_skill_possess | Time_of_service | Time_since_promotion | growth_rate | ... | Compensation_and_Benefits | Work_Life_balance | VAR1 | VAR2 | VAR3 | VAR4 | VAR5 | VAR6 | VAR7 | Attrition_rate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 42.0 | 4.0 | 0.0 | 1.0 | 2.0 | 2.0 | 4.0 | 4.0 | 33.0 | ... | 2.0 | 3.0 | 4.0 | 0.7516 | 1.8688 | 2.0 | 4.0 | 5.0 | 3.0 | 0.1841 |
| 1 | 1.0 | 24.0 | 3.0 | 1.0 | 3.0 | 3.0 | 0.0 | 5.0 | 4.0 | 36.0 | ... | 2.0 | 4.0 | 3.0 | -0.9612 | -0.4537 | 2.0 | 3.0 | 5.0 | 3.0 | 0.0670 |
| 2 | 0.0 | 58.0 | 3.0 | 0.0 | 0.0 | 8.0 | 2.0 | 27.0 | 3.0 | 51.0 | ... | 2.0 | 1.0 | 4.0 | -0.9612 | -0.4537 | 3.0 | 3.0 | 8.0 | 3.0 | 0.0851 |
| 3 | 0.0 | 31.0 | 1.0 | 0.0 | 3.0 | 3.0 | 1.0 | 5.0 | 4.0 | 62.0 | ... | 3.0 | 3.0 | 1.0 | 0.7516 | -0.4537 | 2.0 | 2.0 | 8.0 | 2.0 | 0.1827 |
| 4 | 1.0 | 54.0 | 3.0 | 0.0 | 2.0 | 7.0 | 2.0 | 19.0 | 1.0 | 21.0 | ... | 2.0 | 1.0 | 3.0 | -1.8176 | 1.8688 | 2.0 | 2.0 | 8.0 | 3.0 | 0.7613 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5348 | 0.0 | 23.0 | 5.0 | 0.0 | 1.0 | 5.0 | 1.0 | 4.0 | 1.0 | 20.0 | ... | 3.0 | 2.0 | 1.0 | 0.7516 | -0.4537 | 2.0 | 3.0 | 7.0 | 3.0 | 0.7839 |
| 5349 | 0.0 | 44.0 | 1.0 | 0.0 | 2.0 | 9.0 | 0.0 | 16.0 | 2.0 | 59.0 | ... | 3.0 | 3.0 | 2.0 | -0.1048 | -2.7762 | 2.0 | 4.0 | 8.0 | 4.0 | 0.1055 |
| 5350 | 0.0 | 49.0 | 3.0 | 1.0 | 3.0 | 2.0 | 3.0 | 10.0 | 1.0 | 59.0 | ... | 2.0 | 2.0 | 2.0 | -0.1048 | -0.4537 | 2.0 | 4.0 | 8.0 | 3.0 | 0.7847 |
| 5351 | 0.0 | 47.0 | 3.0 | 0.0 | 4.0 | 10.0 | 1.0 | 24.0 | 1.0 | 21.0 | ... | 0.0 | 2.0 | 4.0 | 0.7516 | 1.8688 | 2.0 | 3.0 | 6.0 | 3.0 | 0.4162 |
| 5352 | 0.0 | 58.0 | 3.0 | 0.0 | 1.0 | 2.0 | 2.0 | 26.0 | 2.0 | 74.0 | ... | 2.0 | 3.0 | 3.0 | -0.1048 | -1.6150 | 3.0 | 4.0 | 5.0 | 3.0 | 0.0910 |
5353 rows × 23 columns
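A note on the imputer used here: sklearn's IterativeImputer models each feature with missing values as a function of the other features (a BayesianRidge estimator by default) and, with default settings, returns a single completed dataset rather than several. As a hedged illustration only (the name `mice_imputer_explicit` and the parameter values are mine, not the settings used above), a configuration closer to "proper" multiple imputation would sample from the posterior and be run several times with different seeds:

```python
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

# illustrative configuration, not the one used in this post:
# - max_iter caps the number of chained-equation cycles
# - sample_posterior=True draws imputations from the posterior predictive
#   distribution; rerunning with different random_state values then yields
#   multiple completed datasets in the spirit of MICE
mice_imputer_explicit = IterativeImputer(
    max_iter=10,
    sample_posterior=True,
    random_state=123,
)
```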
# evaluate: MSE between the original and imputed Time_of_service values
mean_squared_error(household_full.Time_of_service, household_mice.Time_of_service)
1.6674761815804222
#accuracy_score(household_full.Work_Life_balance, household_mice.Work_Life_balance)
# evaluate: accuracy of the imputed Decision_skill_possess codes
accuracy_score(household_full.Decision_skill_possess, household_mice.Decision_skill_possess)
0.9243414907528489
This shows that the model reproduces Time_of_service with an MSE of about 1.67 and Decision_skill_possess with an accuracy of about 92.4%. (Both metrics are computed over all rows, including the roughly 90% that were never masked, so they read somewhat optimistically.)
Based on this performance, the imputation results are quite good, so the next step is to apply MICE to substitute/fill the missing values in the original dataset.
household_impute = household.copy()
# label encoding for categorical dataset
cat_codes(household_impute)
# check unique value from Decision Skill Possess after encoding
household_impute['Decision_skill_possess'].unique()
array([2, 0, 1, 3], dtype=int8)
# MICE Imputation using mice_imputer
household_impute = pd.DataFrame(mice_imputer.fit_transform(household_impute), columns=household_impute.columns)
C:\Users\USER\anaconda3\envs\dss_mice\lib\site-packages\sklearn\impute\_iterative.py:785: ConvergenceWarning: [IterativeImputer] Early stopping criterion not reached.
# round the encoded categorical column back to whole-number codes
household_impute['Decision_skill_possess'] = household_impute['Decision_skill_possess'].round()
household_impute.isna().sum()
Gender                       0
Age                          0
Education_Level              0
Relationship_Status          0
Hometown                     0
Unit                         0
Decision_skill_possess       0
Time_of_service              0
Time_since_promotion         0
growth_rate                  0
Travel_Rate                  0
Post_Level                   0
Pay_Scale                    0
Compensation_and_Benefits    0
Work_Life_balance            0
VAR1                         0
VAR2                         0
VAR3                         0
VAR4                         0
VAR5                         0
VAR6                         0
VAR7                         0
Attrition_rate               0
dtype: int64
Now revert the encoded categorical column back to its original labels.
# create a mapper from integer codes back to the Decision_skill_possess categories
cat_deci = dict(enumerate(household["Decision_skill_possess"].cat.categories))
# revert the codes back to the actual values
household_impute['Decision_skill_possess'] = household_impute['Decision_skill_possess'].replace(cat_deci)
# reset the index (because we used index_col when reading the data)
household = household.reset_index()
# replace missing value columns
household['Age'] = household_impute['Age']
household['Decision_skill_possess'] = household_impute['Decision_skill_possess']
household['Time_of_service'] = household_impute['Time_of_service']
household['Work_Life_balance'] = household_impute['Work_Life_balance']
household['VAR2'] = household_impute['VAR2']
household['VAR4'] = household_impute['VAR4']
# check the missing values again
household.isnull().sum()
Employee_ID                  0
Gender                       0
Age                          0
Education_Level              0
Relationship_Status          0
Hometown                     0
Unit                         0
Decision_skill_possess       0
Time_of_service              0
Time_since_promotion         0
growth_rate                  0
Travel_Rate                  0
Post_Level                   0
Pay_Scale                    9
Compensation_and_Benefits    0
Work_Life_balance            0
VAR1                         0
VAR2                         0
VAR3                         0
VAR4                         0
VAR5                         0
VAR6                         0
VAR7                         0
Attrition_rate               0
dtype: int64
The new dataframe imputed with the MICE method can now be used. Note, however, that Pay_Scale was not among the columns copied over above, so it still contains 9 missing values; one way to close that gap is shown below.
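Since IterativeImputer imputes every column with missing entries, household_impute already holds imputed Pay_Scale values, so the remaining gap can be closed the same way as the other columns:

```python
# copy the already-imputed Pay_Scale column over as well
household['Pay_Scale'] = household_impute['Pay_Scale']
# confirm that no missing values remain anywhere
household.isnull().sum().sum()
```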