
Chapter 7 Linear Models, Section 8 (the last of Chapter 7): Introduction to Logistic Regression

Introduction

We've finally reached the end of Chapter 7! (laughs)
Linear models have been a long journey, but having come this far, you should have a pretty good idea of what we're doing at each step.

Agenda

・Imports and settings
・Defining the target and features
・Building a logistic regression prediction model with cross-validation

Imports and Settings

import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import sys, os
from time import time

import pandas as pd
import numpy as np

from scipy.stats import spearmanr

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt

# make the shared utils module one directory up importable
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from utils import MultipleTimeSeriesCV

sns.set_style('darkgrid')
idx = pd.IndexSlice
YEAR = 252  # trading days per year

Loading the Data

with pd.HDFStore('data.h5') as store:
    data = (store['model_data']
            .dropna()
            .drop(['open', 'close', 'low', 'high'], axis=1))
# drop the yearly dummies and lagged-return columns, which we won't use here
data = data.drop([c for c in data.columns if 'year' in c or 'lag' in c], axis=1)

Filter to the Top Dollar-Volume Stocks

# keep only the most liquid names (dollar-volume rank below 100)
data = data[data.dollar_vol_rank < 100]

Defining the Target and Features

y = data.filter(like='target')  # forward-return targets
X = data.drop(y.columns, axis=1)
X = X.drop(['dollar_vol', 'dollar_vol_rank', 'volume', 'consumer_durables'], axis=1)

Building the Model

Before running the cross-validation, recall the MultipleTimeSeriesCV class introduced in a previous article; we redefine it here for convenience. A toy usage example follows the class definition.

class MultipleTimeSeriesCV:
    """Generates tuples of train_idx, test_idx pairs.
    Assumes the MultiIndex contains levels 'symbol' and 'date'.
    Purges overlapping outcomes."""

    def __init__(self,
                 n_splits=3,
                 train_period_length=126,
                 test_period_length=21,
                 lookahead=None,
                 date_idx='date',
                 shuffle=False):
        self.n_splits = n_splits
        self.lookahead = lookahead
        self.test_length = test_period_length
        self.train_length = train_period_length
        self.shuffle = shuffle
        self.date_idx = date_idx

    def split(self, X, y=None, groups=None):
        unique_dates = X.index.get_level_values(self.date_idx).unique()
        days = sorted(unique_dates, reverse=True)  # most recent date first
        split_idx = []
        for i in range(self.n_splits):
            test_end_idx = i * self.test_length
            test_start_idx = test_end_idx + self.test_length
            train_end_idx = test_start_idx + self.lookahead - 1
            train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
            split_idx.append([train_start_idx, train_end_idx,
                              test_start_idx, test_end_idx])

        dates = X.reset_index()[[self.date_idx]]
        for train_start, train_end, test_start, test_end in split_idx:
            train_idx = dates[(dates[self.date_idx] > days[train_start])
                              & (dates[self.date_idx] <= days[train_end])].index
            test_idx = dates[(dates[self.date_idx] > days[test_start])
                             & (dates[self.date_idx] <= days[test_end])].index
            train_idx = train_idx.to_numpy()
            if self.shuffle:
                np.random.shuffle(train_idx)  # shuffle the training indices in place
            yield train_idx, test_idx.to_numpy()

    def get_n_splits(self, X, y, groups=None):
        return self.n_splits
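
To make the walk-forward logic concrete, here is a minimal sketch using made-up toy data (the symbols 'AAA' and 'BBB' and all window lengths are hypothetical). It prints each fold's train and test date ranges; with lookahead=1 the train window ends exactly where the test window begins, while a larger lookahead would leave a purge gap between the two.

import numpy as np
import pandas as pd

# hypothetical toy panel: two symbols over 50 business days
toy_dates = pd.date_range('2020-01-02', periods=50, freq='B')
toy_idx = pd.MultiIndex.from_product([['AAA', 'BBB'], toy_dates],
                                     names=['symbol', 'date'])
toy = pd.DataFrame(np.random.randn(len(toy_idx), 2), index=toy_idx)

toy_cv = MultipleTimeSeriesCV(n_splits=2,
                              train_period_length=20,
                              test_period_length=5,
                              lookahead=1)
for train_idx, test_idx in toy_cv.split(toy):
    train_dates = toy.iloc[train_idx].index.get_level_values('date')
    test_dates = toy.iloc[test_idx].index.get_level_values('date')
    print(f'train {train_dates.min():%m-%d}..{train_dates.max():%m-%d} | '
          f'test {test_dates.min():%m-%d}..{test_dates.max():%m-%d}')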

Parameter Settings

train_period_length = 63  # about one quarter of trading days
test_period_length = 10
lookahead = 1             # predict the 1-day-ahead return
n_splits = int(3 * YEAR / test_period_length)  # roughly 3 years of folds

cv = MultipleTimeSeriesCV(n_splits=n_splits,
                         test_period_length=test_period_length,
                         lookahead=lookahead,
                         train_period_length=train_period_length)
target = f'target_{lookahead}d'

Next we binarize the target: 1 if the forward return is positive, 0 otherwise.

y.loc[:, 'label'] = (y[target] > 0).astype(int)
Cs = np.logspace(-5, 5, 11)  # candidate values for the inverse regularization strength
cols = ['C', 'date', 'auc', 'ic', 'pval']

Now we tune the model's regularization. In scikit-learn, C is the inverse of the regularization strength, so smaller values shrink the coefficients more aggressively.
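
As a quick illustration before the full loop, here is a minimal sketch on synthetic data (generated with make_classification, entirely unrelated to our stock data) showing how C controls the amount of shrinkage:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# synthetic classification data, purely for illustration
X_demo, y_demo = make_classification(n_samples=500, n_features=10,
                                     random_state=42)
for C_demo in (1e-3, 1.0, 1e3):
    clf = LogisticRegression(C=C_demo, max_iter=1000,
                             random_state=42).fit(X_demo, y_demo)
    # smaller C = stronger L2 penalty = coefficients pulled toward zero
    print(f'C={C_demo:>8}: mean |coef| = {np.abs(clf.coef_).mean():.4f}')

With that intuition in hand, the loop below runs the walk-forward CV for each candidate C and records the AUC together with the information coefficient (IC), i.e. the Spearman rank correlation between the predicted probabilities and the realized returns: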

%%time
log_coeffs, log_scores, log_predictions = {}, [], []
for C in Cs:
    print(C)
    model = LogisticRegression(C=C,
                               fit_intercept=True,
                               random_state=42,
                               n_jobs=-1)

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)])
    ics = aucs = 0
    start = time()
    coeffs = []
    for i, (train_idx, test_idx) in enumerate(cv.split(X), 1):
        X_train, y_train = X.iloc[train_idx], y.label.iloc[train_idx]
        pipe.fit(X=X_train, y=y_train)
        X_test, y_test = X.iloc[test_idx], y.label.iloc[test_idx]
        # skip folds that are too small or contain only one class
        if len(y_test) < 10 or len(np.unique(y_test)) < 2:
            continue
        y_score = pipe.predict_proba(X_test)[:, 1]

        auc = roc_auc_score(y_score=y_score, y_true=y_test)
        actuals = y[target].iloc[test_idx]  # realized returns for the IC
        ic, pval = spearmanr(y_score, actuals)

        log_predictions.append(y_test.to_frame('labels').assign(
            predicted=y_score, C=C, actuals=actuals))
        date = y_test.index.get_level_values('date').min()
        log_scores.append([C, date, auc, ic * 100, pval])
        coeffs.append(pipe.named_steps['model'].coef_)
        ics += ic
        aucs += auc
        if i % 10 == 0:
            print(f'\t{time()-start:5.1f} | {i:03} | {ics/i:>7.2%} | {aucs/i:>7.2%}')

    log_coeffs[C] = np.mean(coeffs, axis=0).squeeze()
'''
1e-05
	  2.4 | 010 |  -0.27% |  50.44%
	  3.8 | 020 |   1.96% |  51.88%
	  5.1 | 030 |   2.83% |  52.01%
	  6.4 | 040 |   3.26% |  51.98%
	  7.7 | 050 |   3.95% |  52.44%
	  9.1 | 060 |   3.95% |  52.27%
	 10.4 | 070 |   4.75% |  52.61%
0.0001
	  1.3 | 010 |  -0.01% |  50.64%
	  2.6 | 020 |   2.32% |  52.07%
	  4.0 | 030 |   3.24% |  52.28%
	  5.3 | 040 |   3.36% |  52.10%
	  6.6 | 050 |   4.04% |  52.54%
	  8.0 | 060 |   4.03% |  52.34%
	  9.3 | 070 |   4.88% |  52.70%
0.001
	  1.4 | 010 |   0.47% |  50.98%
	  2.7 | 020 |   2.59% |  52.18%
	  4.1 | 030 |   3.65% |  52.51%
	  5.5 | 040 |   3.21% |  52.10%
	  6.9 | 050 |   3.87% |  52.52%
	  8.3 | 060 |   4.08% |  52.37%
	  9.7 | 070 |   4.96% |  52.75%
0.01
	  1.5 | 010 |   0.76% |  51.17%
	  2.9 | 020 |   2.46% |  52.02%
	  4.4 | 030 |   3.71% |  52.45%
	  5.9 | 040 |   3.17% |  51.98%
	  7.4 | 050 |   3.96% |  52.48%
	  8.8 | 060 |   4.21% |  52.34%
	 10.3 | 070 |   4.98% |  52.69%
0.1
	  1.5 | 010 |   0.76% |  51.16%
	  3.1 | 020 |   2.26% |  51.86%
	  4.6 | 030 |   3.57% |  52.33%
	  6.2 | 040 |   3.00% |  51.84%
	  7.8 | 050 |   3.79% |  52.34%
	  9.4 | 060 |   3.99% |  52.19%
	 11.0 | 070 |   4.67% |  52.51%
1.0
	  1.6 | 010 |   0.72% |  51.14%
	  3.2 | 020 |   2.21% |  51.83%
	  4.7 | 030 |   3.54% |  52.30%
	  6.3 | 040 |   2.96% |  51.81%
	  7.9 | 050 |   3.73% |  52.31%
	  9.5 | 060 |   3.92% |  52.15%
	 11.0 | 070 |   4.57% |  52.46%
10.0
	  1.6 | 010 |   0.72% |  51.14%
	  3.2 | 020 |   2.20% |  51.82%
	  4.8 | 030 |   3.53% |  52.30%
	  6.4 | 040 |   2.95% |  51.81%
	  7.9 | 050 |   3.72% |  52.30%
	  9.5 | 060 |   3.92% |  52.15%
	 11.1 | 070 |   4.56% |  52.45%
100.0
	  1.5 | 010 |   0.72% |  51.14%
	  3.1 | 020 |   2.20% |  51.82%
	  4.7 | 030 |   3.53% |  52.30%
	  6.3 | 040 |   2.95% |  51.81%
	  7.9 | 050 |   3.72% |  52.30%
	  9.4 | 060 |   3.91% |  52.15%
	 11.0 | 070 |   4.56% |  52.45%
1000.0
	  1.6 | 010 |   0.72% |  51.14%
	  3.1 | 020 |   2.20% |  51.82%
	  4.7 | 030 |   3.53% |  52.30%
	  6.3 | 040 |   2.95% |  51.81%
	  7.9 | 050 |   3.72% |  52.30%
	  9.5 | 060 |   3.91% |  52.15%
	 11.0 | 070 |   4.56% |  52.45%
10000.0
	  1.6 | 010 |   0.72% |  51.14%
	  3.1 | 020 |   2.20% |  51.82%
	  4.7 | 030 |   3.53% |  52.30%
	  6.2 | 040 |   2.95% |  51.81%
	  7.8 | 050 |   3.72% |  52.30%
	  9.4 | 060 |   3.91% |  52.15%
	 10.9 | 070 |   4.56% |  52.45%
100000.0
	  1.6 | 010 |   0.72% |  51.14%
	  3.1 | 020 |   2.20% |  51.82%
	  4.7 | 030 |   3.53% |  52.30%
	  6.3 | 040 |   2.95% |  51.81%
	  7.8 | 050 |   3.72% |  52.30%
	  9.3 | 060 |   3.91% |  52.15%
	 10.9 | 070 |   4.56% |  52.45%
CPU times: user 5min 54s, sys: 35.2 s, total: 6min 29s
Wall time: 2min 4s
'''

Saving the Results

In the log above, the columns are elapsed seconds, fold count, running mean IC, and running mean AUC. The AUC stays only slightly above 50% and the mean IC around 4-5% for every C, so the regularization strength barely moves the needle here. We persist the scores, coefficients, and predictions for later comparison:

log_scores = pd.DataFrame(log_scores, columns=cols)
log_scores.to_hdf('data.h5', 'logistic/scores')

log_coeffs = pd.DataFrame(log_coeffs, index=X.columns).T
log_coeffs.to_hdf('data.h5', 'logistic/coeffs')

log_predictions = pd.concat(log_predictions)
log_predictions.to_hdf('data.h5', 'logistic/predictions')

Evaluating the Results

log_scores = pd.read_hdf('data.h5', 'logistic/scores')
log_scores.info()
'''
<class 'pandas.core.frame.DataFrame'>
Int64Index: 825 entries, 0 to 824
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   C       825 non-null    float64       
 1   date    825 non-null    datetime64[ns]
 2   auc     825 non-null    float64       
 3   ic      825 non-null    float64       
 4   pval    825 non-null    float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 38.7 KB
'''
log_scores.groupby('C').auc.describe()

(Image 1: AUC summary statistics by C, from the describe() call above)

Plotting the Results

def plot_ic_distribution(df, ax=None):
    if ax is None:
        ax = plt.gca()
    # sns.distplot is deprecated; histplot with a KDE overlay is its modern replacement
    sns.histplot(df.ic, kde=True, stat='density', ax=ax)
    mean, median = df.ic.mean(), df.ic.median()
    ax.axvline(0, lw=1, ls='--', c='k')
    ax.text(x=.05, y=.9, s=f'Mean: {mean:8.2f}\nMedian: {median:5.2f}',
            horizontalalignment='left',
            verticalalignment='center',
            transform=ax.transAxes)
    ax.set_xlabel('Information Coefficient')
    sns.despine()
    plt.tight_layout()

fig, axes = plt.subplots(ncols=2, figsize=(15, 5))

sns.lineplot(x='C', y='auc', data=log_scores, estimator=np.mean, label='Mean', ax=axes[0])
by_alpha = log_scores.groupby('C').auc.agg(['mean', 'median'])
best_auc = by_alpha['mean'].idxmax()
by_alpha['median'].plot(logx=True, ax=axes[0], label='Median', xlim=(1e-5, 1e6))
axes[0].axvline(best_auc, ls='--', c='k', lw=1, label='Max. Mean')
axes[0].axvline(by_alpha['median'].idxmax(), ls='-.', c='k', lw=1, label='Max. Median')
axes[0].legend()
axes[0].set_ylabel('AUC')
axes[0].set_xscale('log')
axes[0].set_title('Area Under the Curve')

plot_ic_distribution(log_scores[log_scores.C==best_auc], ax=axes[1])
axes[1].set_title('Information Coefficient')

fig.suptitle('Logistic Regression', fontsize=14)
sns.despine()
fig.tight_layout()
fig.subplots_adjust(top=.9);

(Image 2: left, AUC as a function of C; right, distribution of the information coefficient at the best C)

