Tune Hyperparameters

This notebook is an example of how to tune hyperparameters for a scikit-learn machine learning model

In [1]:
"""Script to fine tune the hyperparameters"""
from pprint import pprint
from time import time

import numpy as np
import pandas as pd
# joblib was deprecated inside sklearn.externals (0.21) and removed in 0.23;
# import the standalone package instead.
import joblib
from scipy.stats import randint as sp_randint
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     train_test_split)
from sklearn.pipeline import Pipeline
In [2]:
# This is for the RandomForestClassifier

# Import the previously created model (see classify.ipynb), read in the data,
# label each article by whether its upvotes exceed the subreddit average,
# then split into training and test sets.

pipeline = joblib.load('datascience.xz')
data = pd.read_csv('processed_datascience.csv.bz2')
# Label column: True when the article's upvotes exceed the mean for the subreddit
data['gtavg'] = data.ups > data.ups.mean()
# NOTE(review): the original call was truncated in extraction; features are the
# article titles and the target is the gtavg label. A random_state (if any) is
# not visible — TODO confirm against the original notebook.
train_X, test_X, train_y, test_y = train_test_split(data.title, data.gtavg)
In [3]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print the top ``n_top`` ranked parameter settings from a search's cv_results_.

    Parameters
    ----------
    results : dict of arrays, e.g. ``RandomizedSearchCV.cv_results_``
        Must contain 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params'.
    n_top : int, default 3
        Number of ranks to report; ties at a rank are all printed.
    """
    # NOTE(review): the second print() was truncated in extraction; restored from
    # the standard scikit-learn example utility this function matches.
    for i in range(1, n_top + 1):
        # All candidates sharing rank i (ties yield several indices)
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
In [4]:
# Hyperparameters to optimize
# NOTE(review): this dict was truncated in extraction; the keys are reconstructed
# from the pprint output below and the reported best parameters. The exact
# distribution ranges (other than max_depth) are not visible — TODO confirm.
parameters = {
    'clf__max_depth': sp_randint(5, 31),
    'clf__max_features': ('log2', 'sqrt', None),
    'clf__min_samples_leaf': sp_randint(2, 11),      # TODO confirm range
    'clf__min_samples_split': sp_randint(2, 101),    # TODO confirm range
    'clf__n_estimators': sp_randint(100, 1001),      # TODO confirm range
}

# Setup RandomizedSearchCV: sample 20 parameter settings, use all CPU cores
random_search = RandomizedSearchCV(pipeline, param_distributions=parameters,
                                   n_iter=20, n_jobs=-1)

print("Performing Randomized search...")
print("pipeline:", [name for name, _ in pipeline.steps])
# The cell output shows the parameter grid printed; restore the pprint call.
pprint(parameters)

# Start training under different hyperparameter draws, timing the whole search
t0 = time()
random_search.fit(train_X, train_y)
print("done in %0.3fs" % (time() - t0))
Performing Randomized search...
pipeline: ['union', 'clf']
{'clf__max_depth': ,
 'clf__max_features': ('log2', 'sqrt', None),
 'clf__min_samples_leaf': ,
 'clf__min_samples_split': ,
 'clf__n_estimators': }
done in 702.822s

In [5]:
# Print Result Report
y_pred = random_search.predict(test_X)

# Measure Accuracy & F1Score
accuracy = accuracy_score(y_pred=y_pred, y_true=test_y)
print('Accuracy: {:03.1f}%'.format(accuracy * 100))
print('F1 Score:', f1_score(y_pred=y_pred, y_true=test_y))
Model with rank: 1
Mean validation score: 0.755 (std: 0.001)
Parameters: {'clf__max_depth': 19, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 3, 'clf__min_samples_split': 84, 'clf__n_estimators': 267}

Model with rank: 2
Mean validation score: 0.755 (std: 0.001)
Parameters: {'clf__max_depth': 24, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 8, 'clf__min_samples_split': 94, 'clf__n_estimators': 372}

Model with rank: 2
Mean validation score: 0.755 (std: 0.001)
Parameters: {'clf__max_depth': 6, 'clf__max_features': 'log2', 'clf__min_samples_leaf': 6, 'clf__min_samples_split': 75, 'clf__n_estimators': 881}

Accuracy: 73.3%
F1 Score: 0.048192771084337345

Related content