
Chapter 12: Boosting Your Strategy, Section 2: Tuning the sklearn GBM

Imports and settings

from time import time
import numpy as np
import pandas as pd
import warnings
# there is now a faster (experimental) HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from itertools import product
import joblib
from pathlib import Path

warnings.filterwarnings('ignore')
np.random.seed(42)

Create the one-hot encoding

def get_one_hot_data(df, cols=('year', 'month', 'age', 'msize')):
    """One-hot encode the given columns plus 'sector', then clean up column names."""
    cols = list(cols)
    df = pd.get_dummies(df,
                        columns=cols + ['sector'],
                        prefix=cols + [''],
                        prefix_sep=['_'] * len(cols) + [''])
    return df.rename(columns={c: c.replace('.0', '').replace(' ', '_').lower()
                              for c in df.columns})
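
As a quick sanity check, here is what the encoder produces on a small made-up frame (the toy data below is hypothetical, not from assets.h5):

toy = pd.DataFrame({'year': [2017, 2018],
                    'month': [1, 2],
                    'age': [1, 2],
                    'msize': [1, 2],
                    'sector': ['Energy', 'Tech'],
                    'ret': [.05, -.02]})
print(get_one_hot_data(toy).columns.tolist())
# ['ret', 'year_2017', 'year_2018', 'month_1', 'month_2',
#  'age_1', 'age_2', 'msize_1', 'msize_2', 'energy', 'tech']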

Create the holdout test set

def get_holdout_set(target, features, period=6):
    """Split the panel by date: the earlier dates feed cross-validation,
    the most recent dates form the holdout set."""
    idx = pd.IndexSlice
    label = target.name
    dates = np.sort(target.index.get_level_values('date').unique())
    cv_start, cv_end = dates[0], dates[-period - 2]
    holdout_start, holdout_end = dates[-period - 1], dates[-1]

    df = features.join(target.to_frame())
    train = df.loc[idx[:, cv_start: cv_end], :]
    y_train, X_train = train[label], train.drop(label, axis=1)

    test = df.loc[idx[:, holdout_start: holdout_end], :]
    y_test, X_test = test[label], test.drop(label, axis=1)
    return y_train, X_train, y_test, X_test
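
A minimal sketch on synthetic data (hypothetical tickers and dates) shows how the dates get carved up; note that with the inclusive .loc slicing the holdout actually covers period + 1 month-ends:

dates = pd.date_range('2017-01-31', periods=12, freq='M')
panel_idx = pd.MultiIndex.from_product([['AAPL', 'MSFT'], dates],
                                       names=['ticker', 'date'])
demo_y = pd.Series(np.random.randint(0, 2, 24), index=panel_idx, name='label')
demo_X = pd.DataFrame({'f1': np.random.randn(24)}, index=panel_idx)
y_tr, X_tr, y_te, X_te = get_holdout_set(demo_y, demo_X)
# CV window: 2017-01 to 2017-05 (5 months); holdout: 2017-06 to 2017-12 (7 months)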

Custom time-series split

class OneStepTimeSeriesSplit:
    """Generates tuples of train_idx, test_idx pairs
    Assumes the index contains a level labeled 'date'"""

    def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
        self.n_splits = n_splits
        self.test_period_length = test_period_length
        self.shuffle = shuffle
        self.test_end = n_splits * test_period_length

    @staticmethod
    def chunks(l, n):
        # yield successive n-sized chunks of l
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def split(self, X, y=None, groups=None):
        # the most recent n_splits * test_period_length dates serve as test periods
        unique_dates = (X.index
                        .get_level_values('date')
                        .unique()
                        .sort_values(ascending=False)[:self.test_end])

        dates = X.reset_index()[['date']]
        for test_date in self.chunks(unique_dates, self.test_period_length):
            train_idx = dates[dates.date < min(test_date)].index
            test_idx = dates[dates.date.isin(test_date)].index
            if self.shuffle:
                # permute a copy; shuffling a throwaway list() would have no effect
                train_idx = np.random.permutation(train_idx)
            yield train_idx, test_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits
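
A quick hypothetical smoke test, reusing demo_X from the holdout sketch above, illustrates the behavior: the splitter walks backward from the most recent month, each fold training on everything strictly before its test month:

demo_cv = OneStepTimeSeriesSplit(n_splits=3, test_period_length=1)
for train_idx, test_idx in demo_cv.split(demo_X):
    test_month = demo_X.iloc[test_idx].index.get_level_values('date')[0]
    print(f'train rows: {len(train_idx):2d} | test month: {test_month:%Y-%m}')
# train rows: 22 | test month: 2017-12
# train rows: 20 | test month: 2017-11
# train rows: 18 | test month: 2017-10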

Initialize the GradientBoosting classifier

gb_clf = GradientBoostingClassifier(loss='deviance',
                                    learning_rate=0.1,
                                    n_estimators=100,
                                    subsample=1.0,
                                    criterion='friedman_mse',
                                    min_samples_split=2,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0,
                                    max_depth=3,
                                    min_impurity_decrease=0.0,
                                    min_impurity_split=None,
                                    init=None,
                                    random_state=None,
                                    max_features=None,
                                    verbose=0,
                                    max_leaf_nodes=None,
                                    warm_start=False,
                                    presort='auto',
                                    validation_fraction=0.1,
                                    n_iter_no_change=None,
                                    tol=0.0001)
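
The call above just spells out the sklearn defaults. The last three arguments control the built-in early stopping, which is off by default; switching it on (a sketch, not part of the tuning run below) would look like:

gb_early = GradientBoostingClassifier(n_estimators=1000,
                                      validation_fraction=0.1,  # share of data held out internally
                                      n_iter_no_change=10,      # stop after 10 iterations without improvement
                                      tol=1e-4,                 # minimum score gain that counts as improvement
                                      random_state=42)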

Load the data

DATA_STORE = Path('../data/assets.h5')

def get_data(start='2010', end='2018', holding_period=1, dropna=False):
    """Load the engineered features and turn the forward return into a binary label."""
    idx = pd.IndexSlice
    target = f'target_{holding_period}m'
    with pd.HDFStore(DATA_STORE) as store:
        df = store['engineered_features']

    if start is not None and end is not None:
        df = df.loc[idx[:, start: end], :]
    if dropna:
        df = df.dropna()

    y = (df[target] > 0).astype(int)
    X = df.drop([c for c in df.columns if c.startswith('target')], axis=1)
    return y, X

n_splits = 12
cv = OneStepTimeSeriesSplit(n_splits=n_splits)
y, features = get_data()
X = get_one_hot_data(features).dropna()

y, X, y_test, X_test = get_holdout_set(target=y,
                                       features=X)

with pd.HDFStore('data/tuning_sklearn_gbm.h5') as store:
    store.put('holdout/features', X_test)
    store.put('holdout/target', y_test)
    store.put('cv/target', y)
    store.put('cv/features', X)
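
In a later session, the CV inputs can be reloaded from the same store, for example:

with pd.HDFStore('data/tuning_sklearn_gbm.h5') as store:
    X = store['cv/features']
    y = store['cv/target']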

Define the parameter grid

param_grid = dict(learning_rate=[.01, .1, .2],
                  max_depth=list(range(3, 13, 3)),
                  max_features=['sqrt', .8, 1],
                  min_impurity_decrease=[0, .01],
                  min_samples_split=[10, 50],
                  n_estimators=[100, 300],
                  subsample=[.8, 1])
all_params = list(product(*param_grid.values()))
print('# Models:', len(all_params))
'''
# Models: 576
'''
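
That is 3 × 4 × 3 × 2 × 2 × 2 × 2 = 576 parameter combinations, and with n_splits=12 cross-validation folds, GridSearchCV trains 576 × 12 = 6,912 models in total, which is why the full search is so expensive.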

Initialize GridSearchCV

gs = GridSearchCV(gb_clf,
                  param_grid,
                  cv=cv,
                  scoring='roc_auc',
                  verbose=3,
                  n_jobs=-1,
                  return_train_score=True)

Fitting

With the parameter grid above, the computation will realistically never finish; a faster alternative is sketched below, after the fit.

start = time()
gs.fit(X=X, y=y)
done = time()
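
The HistGradientBoostingClassifier imported at the top is one way to make the search tractable. The reduced grid below is an illustrative assumption, not the original setup; max_iter and max_leaf_nodes are that estimator's counterparts to the number of boosting rounds and tree size:

hist_param_grid = dict(learning_rate=[.01, .1],
                       max_iter=[100, 300],
                       max_leaf_nodes=[31, 127])
hist_gs = GridSearchCV(HistGradientBoostingClassifier(random_state=42),
                       hist_param_grid,
                       cv=cv,
                       scoring='roc_auc',
                       n_jobs=-1,
                       return_train_score=True)
hist_gs.fit(X=X, y=y)  # 8 combinations x 12 splits = 96 far cheaper fits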

Save the results

print(f'Done in {done - start:.2f}s')
joblib.dump(gs, 'results/sklearn_gbm_gridsearch.joblib')
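
The persisted search object can be reloaded and inspected later, for example:

gs = joblib.load('results/sklearn_gbm_gridsearch.joblib')
print('Best CV AUC:', gs.best_score_)
print('Best params:', gs.best_params_)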

