# Import packages for data manipulation
import pandas as pd
import numpy as np

# Import packages for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# This lets us see all of the columns, preventing Juptyer from redacting them.
pd.set_option('display.max_columns', None)

# Import packages for data modeling
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# This is the function that helps plot feature importance
from xgboost import plot_importance

# This module lets us save our models once we fit them.
import pickle


# Import dataset
df0 = pd.read_csv('waze_dataset.csv')


# Inspect the first five rows
### YOUR CODE HERE ###

df0.head()


# Copy the df0 dataframe
### YOUR CODE HERE ###

df = df0.copy()


### YOUR CODE HERE ###

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       14999 non-null  int64  
 1   label                    14299 non-null  object 
 2   sessions                 14999 non-null  int64  
 3   drives                   14999 non-null  int64  
 4   total_sessions           14999 non-null  float64
 5   n_days_after_onboarding  14999 non-null  int64  
 6   total_navigations_fav1   14999 non-null  int64  
 7   total_navigations_fav2   14999 non-null  int64  
 8   driven_km_drives         14999 non-null  float64
 9   duration_minutes_drives  14999 non-null  float64
 10  activity_days            14999 non-null  int64  
 11  driving_days             14999 non-null  int64  
 12  device                   14999 non-null  object 
dtypes: float64(3), int64(8), object(2)
memory usage: 1.5+ MB


# 1. Create `km_per_driving_day` feature
### YOUR CODE HERE ###

df['km_per_driving_day'] = df['driven_km_drives'] / df['driving_days']
# 2. Get descriptive stats
df['km_per_driving_day'].describe()

count    1.499900e+04
mean              inf
std               NaN
min      3.022063e+00
25%      1.672804e+02
50%      3.231459e+02
75%      7.579257e+02
max               inf
Name: km_per_driving_day, dtype: float64


# 1. Convert infinite values to zero
### YOUR CODE HERE ###
df.loc[df['km_per_driving_day']==np.inf, 'km_per_driving_day']=0
# 2. Confirm that it worked
df['km_per_driving_day'].describe()

count    14999.000000
mean       578.963113
std       1030.094384
min          0.000000
25%        136.238895
50%        272.889272
75%        558.686918
max      15420.234110
Name: km_per_driving_day, dtype: float64


# 1. Create `percent_sessions_in_last_month` feature
df['percent_sessions_in_last_month'] = df['sessions'] / df['total_sessions']

# 1. Get descriptive stats
df['percent_sessions_in_last_month'].describe()

count    14999.000000
mean         0.449255
std          0.286919
min          0.000000
25%          0.196221
50%          0.423097
75%          0.687216
max          1.530637
Name: percent_sessions_in_last_month, dtype: float64


# Create `professional_driver` feature
df['professional_driver '] = np.where((df['drives']> 60) & (df['driving_days'] > 15) ,1 ,0)


# Create `total_sessions_per_day` feature
df['total_sessions_per_day'] = df['total_sessions'] / df['n_days_after_onboarding']


# Get descriptive stats
df.describe()


# Create `km_per_hour` feature
 
df['km_per_hour'] = df['driven_km_drives'] /( df['duration_minutes_drives'] / 60 )


# Create `km_per_drive` feature
df['km_per_driving'] = df['driven_km_drives'] / df['driving_days']
df['km_per_driving'].describe()

count    1.499900e+04
mean              inf
std               NaN
min      3.022063e+00
25%      1.672804e+02
50%      3.231459e+02
75%      7.579257e+02
max               inf
Name: km_per_driving, dtype: float64


# 1. Convert infinite values to zero
df.loc[df['km_per_driving'] == np.inf, 'km_per_driving'] = 0

# 2. Confirm that it worked
df['km_per_driving'].describe()

count    14999.000000
mean       578.963113
std       1030.094384
min          0.000000
25%        136.238895
50%        272.889272
75%        558.686918
max      15420.234110
Name: km_per_driving, dtype: float64


# Create `percent_of_sessions_to_favorite` feature
df['percent_of_sessions_to_favorite'] = ( df['total_navigations_fav1'] + df['total_navigations_fav2'] ) / df['total_sessions']

# Get descriptive stats
df['percent_of_sessions_to_favorite'].describe()

count    14999.000000
mean         1.665439
std          8.865666
min          0.000000
25%          0.203471
50%          0.649818
75%          1.638526
max        777.563629
Name: percent_of_sessions_to_favorite, dtype: float64


# Drop rows with missing values
df = df.dropna(subset = ['label'])


# Create new `device2` variable
df['device2'] = np.where(df['device'] == 'iPhone', 1, 0)
df['device2'].head()

0    0
1    1
2    0
3    1
4    0
Name: device2, dtype: int64


# Create binary `label2` column
df['label2'] = np.where(df['label'] == 'churned', 1, 0)
df['label2']

0        0
1        0
2        0
3        0
4        0
        ..
14994    0
14995    0
14996    0
14997    1
14998    0
Name: label2, Length: 14299, dtype: int64


# Drop `ID` column
df = df.drop(['ID'], axis = 1)


# Get class balance of 'label' col
df['label'].value_counts(normalize=True)

label
retained    0.822645
churned     0.177355
Name: proportion, dtype: float64


# 1. Isolate X variables
X = df.drop(columns= ['label','label2','device'])

# 2. Isolate y variable
y = df['label2']

# 3. Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.20, stratify = y , random_state =42)

# 4. Split into train and validate sets
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size = 0.25, stratify = y_train, random_state = 42)


for x in [X_tr, X_val, X_test]:
    print(len(x))

8579
2860
2860


# 1. Instantiate the random forest classifier
rf = RandomForestClassifier(random_state = 42)

# 2. Create a dictionary of hyperparameters to tune
cv_params = {
    'max_depth':[None],
    'max_features':[1.0],
    'max_samples':[1.0],
    'min_samples_leaf':[2], 
    'min_samples_split':[2],
    'n_estimators':[200,100,300]
}

# 3. Define a dictionary of scoring metrics to capture
scoring = {'accuracy','precision','recall','f1'}

# 4. Instantiate the GridSearchCV object
rf_cv = GridSearchCV(rf, cv_params, scoring = scoring, cv =4, refit = 'recall')


### YOUR CODE HERE ###


rf_cv.fit(X_tr, y_tr)

GridSearchCV(cv=4, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [None], 'max_features': [1.0],
                         'max_samples': [1.0], 'min_samples_leaf': [2],
                         'min_samples_split': [2],
                         'n_estimators': [200, 100, 300]},
             refit='recall', scoring={'recall', 'accuracy', 'precision', 'f1'})

GridSearchCV(cv=4, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [None], 'max_features': [1.0],
                         'max_samples': [1.0], 'min_samples_leaf': [2],
                         'min_samples_split': [2],
                         'n_estimators': [200, 100, 300]},
             refit='recall', scoring={'recall', 'accuracy', 'precision', 'f1'})

RandomForestClassifier(random_state=42)

RandomForestClassifier(random_state=42)


# Examine best score
rf_cv.best_score_

0.12941359303771238


# Examine best hyperparameter combo
rf_cv.best_params_

{'max_depth': None,
 'max_features': 1.0,
 'max_samples': 1.0,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 200}


def make_results(model_name:str, model_object, metric:str):
    '''
    Arguments:
        model_name (string): what you want the model to be called in the output table
        model_object: a fit GridSearchCV object
        metric (string): precision, recall, f1, or accuracy

    Returns a pandas df with the F1, recall, precision, and accuracy scores
    for the model with the best mean 'metric' score across all validation folds.
    '''
    
    
    # Create dictionary that maps input metric to actual metric name in GridSearchCV
    metric_dict = {'precision':'mean_test_precision',
                 'recall':'mean_test_recall',
                 'f1':'mean_test_recall',
                 'accuracy':'mean_test_accuracy'}
    

  
    
  # Get all the results from the CV and put them in a df
    cv_results = pd.DataFrame(model_object.cv_results_)

  # Isolate the row of the df with the max(metric) score
    best_estimator_results = cv_results.iloc[cv_results[metric_dict[metric]].idxmax(),:]

  # Extract Accuracy, precision, recall, and f1 score from that row
    f1 = best_estimator_results.mean_test_f1
    recall = best_estimator_results.mean_test_recall
    accuracy= best_estimator_results.mean_test_accuracy
    precision = best_estimator_results.mean_test_precision

  # Create table of results
    table = pd.DataFrame({'model':[model_name],
                        'precision':[precision],
                        'recall':[recall],
                        'f1':[f1],
                        'accuracy':[accuracy]})

    return table


### YOUR CODE HERE ###
results = make_results('RF cv', rf_cv, 'recall')
results


# 1. Instantiate the XGBoost classifier
xgb = XGBClassifier(objective='binary:logistic', random_state=42)

# 2. Create a dictionary of hyperparameters to tune
cv_params = {'max_depth':[2],
             'min_child_weight':[1,2],
             'learning_rate':[0.7],
             'n_estimators':[200]}

# 3. Define a dictionary of scoring metrics to capture
scoring = {'accuracy','precision','recall','f1'}

# 4. Instantiate the GridSearchCV object
xgb_cv = GridSearchCV(xgb, cv_params, scoring= scoring, cv =5, refit = 'recall')


### YOUR CODE HERE ###
xgb_cv.fit(X_tr, y_tr)

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=42, ...),
             param_grid={'learning_rate': [0.7], 'max_depth': [2],
                         'min_child_weight': [1, 2], 'n_estimators': [200]},
             refit='recall', scoring={'recall', 'accuracy', 'precision', 'f1'})

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=42, ...),
             param_grid={'learning_rate': [0.7], 'max_depth': [2],
                         'min_child_weight': [1, 2], 'n_estimators': [200]},
             refit='recall', scoring={'recall', 'accuracy', 'precision', 'f1'})

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)


# Examine best score
xgb_cv.best_score_

0.19382010353753235


# Examine best parameters
xgb_cv.best_params_

{'learning_rate': 0.7,
 'max_depth': 2,
 'min_child_weight': 2,
 'n_estimators': 200}


# Call 'make_results()' on the GridSearch object

xgb_cv_results = make_results("XGB cv", xgb_cv, 'recall')

results = pd.concat([results, xgb_cv_results], axis =0)
results


# Use random forest model to predict on validation data
rf_val_preds = rf_cv.best_estimator_.predict(X_val)


def get_test_scores(model_name:str, preds, y_test_data):
    accuracy = metrics.accuracy_score(y_test_data, preds)
    precision = metrics.precision_score(y_test_data, preds)
    recall = metrics.recall_score(y_test_data, preds)
    f1 = metrics.f1_score(y_test_data, preds)

    table = pd.DataFrame({'model': [model_name],
                          'precision': [precision],
                          'recall': [recall],
                          'F1': [f1],
                          'accuracy': [accuracy],
                          })

    return table
    
    '''
    Generate a table of test scores.

    In:
        model_name (string): Your choice: how the model will be named in the output table
        preds: numpy array of test predictions
        y_test_data: numpy array of y_test data

    Out:
        table: a pandas df of precision, recall, f1, and accuracy scores for your model
    '''


# Get validation scores for RF model
rf_val_scores = get_test_scores('RF val', rf_val_preds, y_val)

# Append to the results table
results = pd.concat([results, rf_val_scores], axis=0)
results


# Use XGBoost model to predict on validation data
xgb_val_preds = xgb_cv.best_estimator_.predict(X_val)

# Get validation scores for XGBoost model
xgb_val_scores = get_test_scores('XGB val', xgb_val_preds, y_val)

# Append to the results table
results = pd.concat([results, xgb_val_scores], axis=0)
results


# Use XGBoost model to predict on test data
xgb_test_pred = xgb_cv.best_estimator_.predict(X_test)

# Get test scores for XGBoost model
xgb_test_scores = get_test_scores('XGB test', xgb_test_pred, y_test)

# Append to the results table
results = pd.concat([results, xgb_test_scores], axis=0)
results


# Generate array of values for confusion matrix
cm = metrics.confusion_matrix( y_test, xgb_test_pred, labels = xgb_cv.classes_)

# Plot confusion matrix
disp = metrics.ConfusionMatrixDisplay( confusion_matrix = cm,
                                       display_labels = ['retained','churned'])

disp.plot();


### YOUR CODE HERE ###

plot_importance(xgb_cv.best_estimator_);


# Plot precision-recall curve
display = metrics.PrecisionRecallDisplay.from_estimator(xgb_cv.best_estimator_, X_test, y_test, name='XGboost')
plt.title("Precision-recall Curve, XGboost model")

Text(0.5, 1.0, 'Precision-recall Curve, XGboost model')


# Get predicted probabilities on the test data
predicted_probabilities = xgb_cv.best_estimator_.predict_proba(X_test)
predicted_probabilities

array([[0.83909667, 0.16090332],
       [0.59840745, 0.40159255],
       [0.90512574, 0.09487425],
       ...,
       [0.6728308 , 0.32716915],
       [0.97635394, 0.02364605],
       [0.8399014 , 0.1600986 ]], dtype=float32)


# Create a list of just the second column values (probability of target)
probs = [x[1] for x in predicted_probabilities]

# Create an array of new predictions that assigns a 1 to any value >= 0.4
new_preds = np.array([1 if x >= 0.4 else 0 for x in probs])
new_preds

array([0, 1, 0, ..., 0, 0, 0])


# Get evaluation metrics for when the threshold is 0.4
### YOUR CODE HERE ###


### YOUR CODE HERE ###

get_test_scores('XGB, threshold = 0.4', new_preds, y_test)


results


def threshold_finder(y_test_data, probabilities, desired_recall):
    '''
    Find the threshold that most closely yields a desired recall score.

    Inputs:
        y_test_data: Array of true y values
        probabilities: The results of the `predict_proba()` model method
        desired_recall: The recall that you want the model to have

    Outputs:
        threshold: The threshold that most closely yields the desired recall
        recall: The exact recall score associated with `threshold`
    '''
    probs = [x[1] for x in probabilities]  # Isolate second column of `probabilities`
    thresholds = np.arange(0, 1, 0.001)    # Set a grid of 1,000 thresholds to test

    scores = []
    for threshold in thresholds:
        # Create a new array of {0, 1} predictions based on new threshold
        preds = np.array([1 if x >= threshold else 0 for x in probs])
        # Calculate recall score for that threshold
        recall = metrics.recall_score(y_test_data, preds)
        # Append the threshold and its corresponding recall score as a tuple to `scores`
        scores.append((threshold, recall))

    distances = []
    for idx, score in enumerate(scores):
        # Calculate how close each actual score is to the desired score
        distance = abs(score[1] - desired_recall)
        # Append the (index#, distance) tuple to `distances`
        distances.append((idx, distance))

    # Sort `distances` by the second value in each of its tuples (least to greatest)
    sorted_distances = sorted(distances, key=lambda x: x[1], reverse=False)
    # Identify the tuple with the actual recall closest to desired recall
    best = sorted_distances[0]
    # Isolate the index of the threshold with the closest recall score
    best_idx = best[0]
    # Retrieve the threshold and actual recall score closest to desired recall
    threshold, recall = scores[best_idx]

    return threshold, recall


# Get the predicted probabilities from the champion model
probabilities = xgb_cv.best_estimator_.predict_proba(X_test)

# Call the function
threshold_finder(y_test, probabilities, 0.5)

(0.211, 0.5009861932938856)


# Create an array of new predictions that assigns a 1 to any value >= 0.124
probs = [x[1] for x in probabilities]
new_preds = np.array([1 if x >= 0.124 else 0 for x in probs])

# Get evaluation metrics for when the threshold is 0.124
get_test_scores('XGB, threshold = 0.124', new_preds, y_test)

	ID	sessions	drives	total_sessions	n_days_after_onboarding	total_navigations_fav1	total_navigations_fav2	driven_km_drives	duration_minutes_drives	activity_days	driving_days	km_per_driving_day	percent_sessions_in_last_month	professional_driver	total_sessions_per_day
count	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000
mean	7499.000000	80.633776	67.281152	189.964447	1749.837789	121.605974	29.672512	4039.340921	1860.976012	15.537102	12.179879	578.963113	0.449255	0.156477	0.338698
std	4329.982679	80.699065	65.913872	136.405128	1008.513876	148.121544	45.394651	2502.149334	1446.702288	9.004655	7.824036	1030.094384	0.286919	0.363319	1.314333
min	0.000000	0.000000	0.000000	0.220211	4.000000	0.000000	0.000000	60.441250	18.282082	0.000000	0.000000	0.000000	0.000000	0.000000	0.000298
25%	3749.500000	23.000000	20.000000	90.661156	878.000000	9.000000	0.000000	2212.600607	835.996260	8.000000	5.000000	136.238895	0.196221	0.000000	0.051037
50%	7499.000000	56.000000	48.000000	159.568115	1741.000000	71.000000	9.000000	3493.858085	1478.249859	16.000000	12.000000	272.889272	0.423097	0.000000	0.100775
75%	11248.500000	112.000000	93.000000	254.192341	2623.500000	178.000000	43.000000	5289.861262	2464.362632	23.000000	19.000000	558.686918	0.687216	0.000000	0.216269
max	14998.000000	743.000000	596.000000	1216.154633	3500.000000	1236.000000	415.000000	21183.401890	15851.727160	31.000000	30.000000	15420.234110	1.530637	1.000000	39.763874

model	precision	recall	f1	accuracy	F1
RF cv	0.451089	0.129414	0.200945	0.817577	NaN
XGB cv	0.413969	0.193820	0.263520	0.808136	NaN
RF val	0.466667	0.138067	NaN	0.819231	0.213090
RF val	0.466667	0.138067	NaN	0.819231	0.213090
XGB val	0.386364	0.167653	NaN	0.805245	0.233838

model	precision	recall	f1	accuracy	F1
RF cv	0.451089	0.129414	0.200945	0.817577	NaN
XGB cv	0.413969	0.193820	0.263520	0.808136	NaN
RF val	0.466667	0.138067	NaN	0.819231	0.213090
RF val	0.466667	0.138067	NaN	0.819231	0.213090
XGB val	0.386364	0.167653	NaN	0.805245	0.233838
XGB test	0.359307	0.163708	NaN	0.800000	0.224932

model	precision	recall	f1	accuracy	F1
RF cv	0.451089	0.129414	0.200945	0.817577	NaN
XGB cv	0.413969	0.193820	0.263520	0.808136	NaN
RF val	0.466667	0.138067	NaN	0.819231	0.213090
RF val	0.466667	0.138067	NaN	0.819231	0.213090
XGB val	0.386364	0.167653	NaN	0.805245	0.233838
XGB test	0.359307	0.163708	NaN	0.800000	0.224932

Waze Project¶

Course 6 End-of-Course Project: Build a machine learning model¶

Build a machine learning model¶

PACE stages¶

PACE: Plan¶

Task 1. Imports and data loading¶

PACE: Analyze¶

Task 2. Feature engineering¶

`km_per_driving_day`¶

`percent_sessions_in_last_month`¶

`professional_driver`¶

`total_sessions_per_day`¶

`km_per_hour`¶

`km_per_drive`¶

`percent_of_sessions_to_favorite`¶

Task 3. Drop missing values¶

Task 4. Outliers¶

Task 5. Variable encoding¶

Dummying features¶

Target encoding¶

Task 6. Feature selection¶

Task 7. Evaluation metric¶

PACE: Construct¶

Task 8. Modeling workflow and model selection process¶

Task 9. Split the data¶

Task 10. Modeling¶

Random forest¶

HINT

XGBoost¶

Task 11. Model selection¶

Random forest¶

XGBoost¶

PACE: Execute¶

Task 12. Use champion model to predict on test data¶

Task 13. Confusion matrix¶

Task 14. Feature importance¶

Task 15. Conclusion¶

BONUS¶

Identify an optimal decision threshold¶

	ID	label	sessions	drives	total_sessions	n_days_after_onboarding	total_navigations_fav1	total_navigations_fav2	driven_km_drives	duration_minutes_drives	activity_days	driving_days	device
0	0	retained	283	226	296.748273	2276	208	0	2628.845068	1985.775061	28	19	Android
1	1	retained	133	107	326.896596	1225	19	64	13715.920550	3160.472914	13	11	iPhone
2	2	retained	114	95	135.522926	2651	0	0	3059.148818	1610.735904	14	8	Android
3	3	retained	49	40	67.589221	15	322	7	913.591123	587.196542	7	3	iPhone
4	4	retained	84	68	168.247020	1562	166	5	3950.202008	1219.555924	27	18	Android

Waze Project¶

Course 6 End-of-Course Project: Build a machine learning model¶

Build a machine learning model¶

PACE stages¶

PACE: Plan¶

Task 1. Imports and data loading¶

PACE: Analyze¶

Task 2. Feature engineering¶

km_per_driving_day¶

percent_sessions_in_last_month¶

professional_driver¶

total_sessions_per_day¶

km_per_hour¶

km_per_drive¶

percent_of_sessions_to_favorite¶

Task 3. Drop missing values¶

Task 4. Outliers¶

Task 5. Variable encoding¶

Dummying features¶

Target encoding¶

Task 6. Feature selection¶

Task 7. Evaluation metric¶

PACE: Construct¶

Task 8. Modeling workflow and model selection process¶

Task 9. Split the data¶

Task 10. Modeling¶

Random forest¶

HINT

XGBoost¶

Task 11. Model selection¶

Random forest¶

XGBoost¶

PACE: Execute¶

Task 12. Use champion model to predict on test data¶

Task 13. Confusion matrix¶

Task 14. Feature importance¶

Task 15. Conclusion¶

BONUS¶

Identify an optimal decision threshold¶

`km_per_driving_day`¶

`percent_sessions_in_last_month`¶

`professional_driver`¶

`total_sessions_per_day`¶

`km_per_hour`¶

`km_per_drive`¶

`percent_of_sessions_to_favorite`¶