
Chapter 11: Random Forests - A Long-Short Strategy, Section 5: Generating Buy and Sell Signals with a Random Forest

What We'll Do Here

Hello! In the previous posts we worked through the data preparation; this time I'll walk through building the models.

We first build a prediction model with linear regression, then one with LightGBM, and compare the two based on the information coefficient.

Finally, I'll show how to combine the best-performing models into an ensemble.

Imports and Settings

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from time import time
from io import StringIO
import sys, os

from itertools import product
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb

from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from utils import MultipleTimeSeriesCV, format_time
sns.set_style('whitegrid')
np.random.seed(42)
YEAR = 252
idx = pd.IndexSlice
DATA_DIR = Path('..', 'data')
results_path = Path('results', 'return_predictions')
if not results_path.exists():
   results_path.mkdir(parents=True)

Loading the Data

data = pd.read_hdf('data.h5', 'stooq/japan/equities')
data.info(null_counts=True)
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2302060 entries, ('1332.JP', Timestamp('2010-01-04 00:00:00')) to ('9990.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 23 columns):
#   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
0   ret_1            2301120 non-null  float64
1   ret_rel_perc_1   2301120 non-null  float64
2   ret_5            2297360 non-null  float64
3   ret_rel_perc_5   2297360 non-null  float64
4   ret_10           2292660 non-null  float64
5   ret_rel_perc_10  2292660 non-null  float64
6   ret_21           2282320 non-null  float64
7   ret_rel_perc_21  2282320 non-null  float64
8   ret_63           2242840 non-null  float64
9   ret_rel_perc_63  2242840 non-null  float64
10  PPO              2278560 non-null  float64
11  NATR             2288900 non-null  float64
12  RSI              2288900 non-null  float64
13  bbl              2298300 non-null  float64
14  bbu              2298300 non-null  float64
15  weekday          2302060 non-null  int64  
16  month            2302060 non-null  int64  
17  year             2302060 non-null  int64  
18  fwd_ret_01       2301120 non-null  float64
19  fwd_ret_05       2297360 non-null  float64
20  fwd_ret_10       2292660 non-null  float64
21  fwd_ret_21       2282320 non-null  float64
22  fwd_ret_63       2242840 non-null  float64
dtypes: float64(20), int64(3)
memory usage: 412.9+ MB
'''
len(data.index.unique('ticker'))
'''
940
'''

Select the 50 most actively traded stocks by dollar volume.

The original notebook uses 250 tickers, but that computation would simply never finish here, so we narrow the universe to 50.

prices = (pd.read_hdf(DATA_DIR / 'assets.h5', 'stooq/jp/tse/stocks/prices')
         .loc[idx[:, '2010': '2017'], :])

dollar_vol = prices.close.mul(prices.volume)  # prices were already restricted to 2017 and earlier above
dollar_vol_rank = dollar_vol.groupby(level='date').rank(ascending=False)
universe = dollar_vol_rank.groupby(level='ticker').mean().nsmallest(50).index
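
As a quick sanity check, the universe should now contain exactly 50 tickers:

len(universe)
'''
50
'''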

Multiple Time-Series Cross-Validation

cv = MultipleTimeSeriesCV(n_splits=36,
                         test_period_length=21,
                         lookahead=5,
                         train_period_length=2 * 252)
for i, (train_idx, test_idx) in enumerate(cv.split(X=data)):
   train = data.iloc[train_idx]
   train_dates = train.index.get_level_values('date')
   test = data.iloc[test_idx]
   test_dates = test.index.get_level_values('date')
   df = pd.concat([train.reset_index(), test.reset_index()])
   n = len(df)
   assert n == len(df.drop_duplicates())
   msg = f'Training: {train_dates.min().date()}-{train_dates.max().date()} '
   msg += f' ({train.groupby(level="ticker").size().value_counts().index[0]:,.0f} days) | '
   msg += f'Test: {test_dates.min().date()}-{test_dates.max().date()} '
   msg += f'({test.groupby(level="ticker").size().value_counts().index[0]:,.0f} days)'
   print(msg)
   if i == 3:
       break
'''
Training: 2017-10-24-2019-11-25  (508 days) | Test: 2019-12-02-2019-12-30 (21 days)
Training: 2017-09-22-2019-10-24  (508 days) | Test: 2019-10-31-2019-11-29 (21 days)
Training: 2017-08-23-2019-09-20  (508 days) | Test: 2019-09-30-2019-10-30 (21 days)
Training: 2017-07-24-2019-08-21  (508 days) | Test: 2019-08-28-2019-09-27 (21 days)
'''
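
Note that the splits run backwards in time: the most recent test window comes first. For reference, here is a minimal sketch of what the imported MultipleTimeSeriesCV roughly does (an assumption modeled on the ML4T-style helper, not the exact implementation): it walks backwards from the most recent date, carves out n_splits test windows, and leaves a lookahead-day gap between each training and test window so forward-return labels cannot leak across the boundary.

import numpy as np

class SimpleMultipleTimeSeriesCV:
    # hypothetical re-implementation, for illustration only
    def __init__(self, n_splits, train_period_length,
                 test_period_length, lookahead):
        self.n_splits = n_splits
        self.train_length = train_period_length
        self.test_length = test_period_length
        self.lookahead = lookahead

    def split(self, X):
        # unique trading dates, most recent first
        days = sorted(X.index.get_level_values('date').unique(), reverse=True)
        dates = X.index.get_level_values('date')
        for i in range(self.n_splits):
            test_end = i * self.test_length
            test_start = test_end + self.test_length
            train_end = test_start + self.lookahead  # purge overlapping labels
            train_start = train_end + self.train_length
            test_days = days[test_end:test_start]
            train_days = days[train_end:train_start]
            # positional indices, compatible with .iloc as used above
            yield (np.flatnonzero(dates.isin(train_days)),
                   np.flatnonzero(dates.isin(test_days)))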

Model Selection: Periods and Horizons

We restrict the cross-validation data to 2017 and earlier.

cv_data = data.loc[idx[universe, :'2017'], :]
tickers = cv_data.index.unique('ticker')

Save the data for later reuse.

cv_data.to_hdf('data.h5', 'stooq/japan/equities/cv_data')
with pd.HDFStore('data.h5') as store:
   print(store.info())

Prediction Horizons

lookaheads = [1, 5, 10, 21]

Baseline: Linear Regression

Linear regression is covered in detail in Chapter 7.
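
Throughout this section, models are scored on the information coefficient (IC): the Spearman rank correlation between predicted and realized forward returns. A toy example with made-up numbers:

from scipy.stats import spearmanr

y_true = [0.02, -0.01, 0.03, 0.00, -0.02]  # realized forward returns
y_pred = [0.01, -0.02, 0.02, 0.01, -0.01]  # model predictions
ic, p_value = spearmanr(y_true, y_pred)
print(f'{ic:.2f}')  # rank correlation in [-1, 1]; above 0 indicates predictive value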

lr = LinearRegression()
labels = sorted(cv_data.filter(like='fwd').columns)
features = cv_data.columns.difference(labels).tolist()

CV Parameters

train_lengths = [5 * YEAR, 3 * YEAR, YEAR, 126, 63]
test_lengths = [5, 21]
test_params = list(product(lookaheads, train_lengths, test_lengths))
lr_metrics = []
for lookahead, train_length, test_length in test_params:
   label = f'fwd_ret_{lookahead:02}'
   df = cv_data.loc[:, features + [label]].dropna()
   X, y = df.drop(label, axis=1), df[label]

   n_splits = int(2 * YEAR / test_length)
   cv = MultipleTimeSeriesCV(n_splits=n_splits,
                             test_period_length=test_length,
                             lookahead=lookahead,
                             train_period_length=train_length)

   ic, preds = [], []
   for i, (train_idx, test_idx) in enumerate(cv.split(X=X)):
       X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
       X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
       lr.fit(X_train, y_train)
       y_pred = lr.predict(X_test)
       preds.append(y_test.to_frame('y_true').assign(y_pred=y_pred))
       ic.append(spearmanr(y_test, y_pred)[0])
   preds = pd.concat(preds)
   lr_metrics.append([
       lookahead, train_length, test_length,
       np.mean(ic),
       spearmanr(preds.y_true, preds.y_pred)[0]
   ])

columns = ['lookahead', 'train_length', 'test_length', 'ic_by_day', 'ic']
lr_metrics = pd.DataFrame(lr_metrics, columns=columns)
lr_metrics.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 5 columns):
#   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
0   lookahead     40 non-null     int64  
1   train_length  40 non-null     int64  
2   test_length   40 non-null     int64  
3   ic_by_day     40 non-null     float64
4   ic            40 non-null     float64
dtypes: float64(2), int64(3)
memory usage: 1.7 KB
'''

IC Distribution by Lookahead

lr_metrics_long = pd.concat([(lr_metrics.drop('ic', axis=1)
                             .rename(columns={'ic_by_day': 'ic'})
                             .assign(Measured='By Day')),
                            lr_metrics.drop('ic_by_day', axis=1)
                            .assign(Measured='Overall')])
lr_metrics_long.columns=['Lookahead', 'Train Length', 'Test Length', 'IC', 'Measure']
lr_metrics_long.info()
'''
<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 0 to 39
Data columns (total 5 columns):
#   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
0   Lookahead     80 non-null     int64  
1   Train Length  80 non-null     int64  
2   Test Length   80 non-null     int64  
3   IC            80 non-null     float64
4   Measure       80 non-null     object 
dtypes: float64(1), int64(3), object(1)
memory usage: 3.8+ KB
'''
sns.catplot(x='Train Length',
           y='IC',
           hue='Test Length',
           col='Lookahead',
           row='Measure',
           data=lr_metrics_long,
           kind='bar')

[Image 1: IC by train length and test length, per lookahead, measured by day and overall]

fig, axes = plt.subplots(ncols=2, figsize=(14, 5), sharey=True)
sns.boxplot(x='lookahead', y='ic_by_day', data=lr_metrics, ax=axes[0])
axes[0].set_title('IC by Day')
sns.boxplot(x='lookahead', y='ic', data=lr_metrics, ax=axes[1])
axes[1].set_title('IC Overall')
axes[0].set_ylabel('Information Coefficient')
axes[1].set_ylabel('')
sns.despine()
fig.tight_layout()

[Image 2: IC distribution by lookahead, daily (left) vs. overall (right)]

Best Train/Test Periods

(lr_metrics.groupby('lookahead', group_keys=False)
.apply(lambda x: x.nlargest(3, 'ic')))

[Image 3: top three train/test configurations per lookahead, ranked by IC]

lr_metrics.to_csv(results_path / 'lin_reg_performance.csv', index=False)

Tuning the LightGBM Random Forest Model

def get_fi(model):
   fi = model.feature_importance(importance_type='gain')
   return (pd.Series(fi / fi.sum(),
                     index=model.feature_name()))
base_params = dict(boosting_type='rf',
                  objective='regression',
                  bagging_freq=1,
                  verbose=-1)

Hyperparameter Options

bagging_fraction_opts = [.5, .75, .95]
feature_fraction_opts = [.75, .95]
min_data_in_leaf_opts = [250, 500, 1000]

This gives 3 × 2 × 3 = 18 combinations.

cv_params = list(product(bagging_fraction_opts,
                        feature_fraction_opts,
                        min_data_in_leaf_opts))
n_cv_params = len(cv_params)
n_cv_params
'''
18
'''

Random Sampling

sample_proportion = .5
sample_size = int(sample_proportion * n_cv_params)

cv_param_sample = np.random.choice(list(range(n_cv_params)), 
                                    size=int(sample_size), 
                                    replace=False)
cv_params_ = [cv_params[i] for i in cv_param_sample]
print('# CV parameters:', len(cv_params_))
'''
# CV parameters: 9
'''
num_iterations = [25] + list(range(50, 501, 25))
num_boost_round = num_iterations[-1]
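
The point of tracking this whole list: LightGBM can score a model using only its first n trees via predict(..., num_iteration=n), so a single fit with 500 trees yields predictions for every checkpoint without retraining. A self-contained toy example (synthetic data, for illustration only):

import numpy as np
import lightgbm as lgb

X_toy = np.random.rand(1000, 5)
y_toy = np.random.rand(1000)
booster = lgb.train(dict(boosting_type='rf', objective='regression',
                         bagging_freq=1, bagging_fraction=.5,
                         feature_fraction=.75, verbose=-1),
                    lgb.Dataset(X_toy, y_toy),
                    num_boost_round=500)
# score with only the first 100 trees; no retraining needed
y_pred_100 = booster.predict(X_toy, num_iteration=100)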

Train/Test Periods

train_lengths = [5 * YEAR, 3 * YEAR, YEAR, 126, 63]
test_lengths = [5, 21]
test_params = list(product(train_lengths, test_lengths))
n_test_params = len(test_params)
sample_proportion = 1.0
sample_size = int(sample_proportion * n_test_params)

test_param_sample = np.random.choice(list(range(n_test_params)), 
                                    size=int(sample_size), 
                                    replace=False)
test_params_ = [test_params[i] for i in test_param_sample]
print('Train configs:', len(test_params_))
print('CV Iterations:', len(cv_params_) * len(test_params_))
'''
Train configs: 10
CV Iterations: 90
'''

Categorical Variables

categoricals = ['year', 'weekday', 'month']
for feature in categoricals:
   data[feature] = pd.factorize(data[feature], sort=True)[0]

Running the Cross-Validation

labels = sorted(cv_data.filter(like='fwd').columns)
features = cv_data.columns.difference(labels).tolist()
label_dict = dict(zip(lookaheads, labels))
cv_store = Path(results_path / 'parameter_tuning.h5')
ic_cols = ['bagging_fraction',
          'feature_fraction',
          'min_data_in_leaf',
          't'] + [str(n) for n in num_iterations]

We build the models in the following steps:

・Iterate over each prediction horizon and train/test period combination
・Configure a MultipleTimeSeriesCV for each combination
・Build the binary-format LightGBM Dataset holding the features and target label
・Search the sampled hyperparameters for the best model
for lookahead in lookaheads:
   for train_length, test_length in test_params_:
       n_splits = int(2 * YEAR / test_length)
       print(f'Lookahead: {lookahead:2.0f} | Train: {train_length:3.0f} | '
             f'Test: {test_length:2.0f} | Params: {len(cv_params_):3.0f}')

       cv = MultipleTimeSeriesCV(n_splits=n_splits,
                                 test_period_length=test_length,
                                 train_period_length=train_length,
                                 lookahead=lookahead)

       label = label_dict[lookahead]
       outcome_data = data.loc[:, features + [label]].dropna()

       lgb_data = lgb.Dataset(data=outcome_data.drop(label, axis=1),
                              label=outcome_data[label],
                              categorical_feature=categoricals,
                              free_raw_data=False)
       predictions, daily_ic, ic, feature_importance = [], [], [], []
       key = f'{lookahead}/{train_length}/{test_length}'
       T = 0
       for p, (bagging_fraction, feature_fraction, min_data_in_leaf) in enumerate(cv_params_):
           params = base_params.copy()
           params.update(dict(bagging_fraction=bagging_fraction,
                              feature_fraction=feature_fraction,
                              min_data_in_leaf=min_data_in_leaf))

           start = time()
           cv_preds, nrounds = [], []
           for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):
               lgb_train = lgb_data.subset(train_idx.tolist()).construct()
               lgb_test = lgb_data.subset(test_idx.tolist()).construct()

               model = lgb.train(params=params,
                                 train_set=lgb_train,
                                 num_boost_round=num_boost_round,
                                 verbose_eval=False)
               if i == 0:
                   fi = get_fi(model).to_frame()
               else:
                   fi[i] = get_fi(model)

               test_set = outcome_data.iloc[test_idx, :]
               X_test = test_set.loc[:, model.feature_name()]
               y_test = test_set.loc[:, label]
               y_pred = {str(n): model.predict(X_test, num_iteration=n)
                         for n in num_iterations}
               cv_preds.append(y_test.to_frame(
                   'y_test').assign(**y_pred).assign(i=i))
               nrounds.append(model.best_iteration)
           feature_importance.append(fi.T.describe().T.assign(bagging_fraction=bagging_fraction,
                                                              feature_fraction=feature_fraction,
                                                              min_data_in_leaf=min_data_in_leaf))
           cv_preds = pd.concat(cv_preds).assign(bagging_fraction=bagging_fraction,
                                                 feature_fraction=feature_fraction,
                                                 min_data_in_leaf=min_data_in_leaf)

           predictions.append(cv_preds)
           by_day = cv_preds.groupby(level='date')
           ic_by_day = pd.concat([by_day.apply(lambda x: spearmanr(x.y_test,
                                                                   x[str(n)])[0]).to_frame(n)
                                  for n in num_iterations], axis=1)

           daily_ic.append(ic_by_day.assign(bagging_fraction=bagging_fraction,
                                            feature_fraction=feature_fraction,
                                            min_data_in_leaf=min_data_in_leaf))

           cv_ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0]
                 for n in num_iterations]

           T += time() - start
           ic.append([bagging_fraction, feature_fraction,
                      min_data_in_leaf, lookahead] + cv_ic)

           msg = f'{p:3.0f} | {format_time(T)} | '
           msg += f'{bagging_fraction:3.0%} | {feature_fraction:3.0%} | {min_data_in_leaf:5,.0f} | '
           msg += f'{max(cv_ic):6.2%} | {ic_by_day.mean().max(): 6.2%} | {ic_by_day.median().max(): 6.2%}'
           print(msg)

       m = pd.DataFrame(ic, columns=ic_cols)
       m.to_hdf(cv_store, 'ic/' + key)
       pd.concat(daily_ic).to_hdf(cv_store, 'daily_ic/' + key)
       pd.concat(feature_importance).to_hdf(cv_store, 'fi/' + key)
       pd.concat(predictions).to_hdf(cv_store, 'predictions/' + key)

Analyzing the Cross-Validation Results

Computing the Analysis Data

We reshape the results using the following identifier variables:

id_vars = ['train_length',
          'test_length',
          'bagging_fraction',
          'feature_fraction',
          'min_data_in_leaf',
          't', 'date']

Collect the ICs from the stored results.

daily_ic, ic = [], []
for t in lookaheads:
   print(t)
   with pd.HDFStore(cv_store) as store:
        # note the trailing '/': f'/fi/1' alone would also match '/fi/10' keys
        keys = [k[1:] for k in store.keys() if k.startswith(f'/fi/{t}/')]
       for key in keys:
           train_length, test_length = key.split('/')[2:]
           print(train_length, test_length)
           k = f'{t}/{train_length}/{test_length}'
           cols = {'t': t,
                   'train_length': int(train_length),
                   'test_length': int(test_length)}

           ic.append(pd.melt(store['ic/' + k]
                             .assign(**cols),
                             id_vars=id_vars[:-1],
                             value_name='ic',
                             var_name='rounds')
                     .apply(pd.to_numeric))

           df = store['daily_ic/' + k].assign(**cols).reset_index()
           daily_ic.append(pd.melt(df,
                                   id_vars=id_vars,
                                   value_name='daily_ic',
                                   var_name='rounds')
                           .set_index('date')
                           .apply(pd.to_numeric)
                           .reset_index())            
ic = pd.concat(ic, ignore_index=True)
daily_ic = pd.concat(daily_ic, ignore_index=True)

Predictive Performance: Daily CV Information Coefficient

group_cols = ['t','train_length', 'test_length', 
             'bagging_fraction', 'feature_fraction', 'min_data_in_leaf']
daily_ic_avg = daily_ic.groupby(group_cols + ['rounds']).daily_ic.mean().to_frame('ic').reset_index()
daily_ic_avg.groupby('t', group_keys=False).apply(lambda x: x.nlargest(3, 'ic'))
daily_ic_avg.info(null_counts=True)
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4320 entries, 0 to 4319
Data columns (total 8 columns):
#   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
0   t                 4320 non-null   int64  
1   train_length      4320 non-null   int64  
2   test_length       4320 non-null   int64  
3   bagging_fraction  4320 non-null   float64
4   feature_fraction  4320 non-null   float64
5   min_data_in_leaf  4320 non-null   int64  
6   rounds            4320 non-null   int64  
7   ic                4320 non-null   float64
dtypes: float64(3), int64(5)
memory usage: 270.1 KB
'''

About 75% of the configurations produced a positive average daily IC; the same tendency appears at the 21-day horizon.
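
This share can be read off daily_ic_avg directly (a quick check):

daily_ic_avg.groupby('t').ic.apply(lambda x: (x > 0).mean())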

ax = sns.boxenplot(x='t', y='ic', data=daily_ic_avg)
ax.axhline(0, ls='--', lw=1, c='k');

[Image 4: average daily IC by prediction horizon]

By Training Period

g = sns.catplot(x='t',
               y='ic',
               col='train_length',
               row='test_length',
               data=daily_ic_avg[(daily_ic_avg.test_length == 21)],
               kind='boxen')
g.savefig(results_path / 'daily_ic_test_21', dpi=300);

[Image 5: daily IC by train length and lookahead (test length = 21)]

lin_reg = {}
for t in [1, 5]:
   df_ = daily_ic_avg[(daily_ic_avg.t==t)&(daily_ic_avg.rounds<=250)].dropna()
   y, X = df_.ic, df_.drop(['ic', 't'], axis=1)
   X = sm.add_constant(pd.get_dummies(X, columns=X.columns, drop_first=True))
   model = sm.OLS(endog=y, exog=X)
   lin_reg[t] = model.fit()
   s = lin_reg[t].summary()
   coefs = pd.read_csv(StringIO(s.tables[1].as_csv())).rename(
       columns=lambda x: x.strip())
   coefs.columns = ['variable', 'coef', 'std_err',
                    't', 'p_value', 'ci_low', 'ci_high']
   coefs.to_csv(results_path / f'lr_result_{t:02}.csv', index=False)
def visualize_lr_result(model, ax):
   ci = model.conf_int()
   errors = ci[1].sub(ci[0]).div(2)

   coefs = (model.params.to_frame('coef').assign(error=errors)
            .reset_index().rename(columns={'index': 'variable'}))
   coefs = coefs[~coefs['variable'].str.startswith(
       'date') & (coefs.variable != 'const')]
   coefs.variable = coefs.variable.str.split('_').str[-1]

   coefs.plot(x='variable', y='coef', kind='bar', ax=ax, 
              color='none', capsize=3, yerr='error', legend=False, rot=0)    
   ax.set_ylabel('IC')
   ax.set_xlabel('')
   ax.scatter(x=np.arange(len(coefs)), marker='_', s=120, y=coefs['coef'], color='black')
   ax.axhline(y=0, linestyle='--', color='black', linewidth=1)
   ax.xaxis.set_ticks_position('none')

   ax.annotate('Train\nLength', xy=(.09, -0.1), xytext=(.09, -0.2),
               xycoords='axes fraction',
               textcoords='axes fraction',
               fontsize=11, ha='center', va='bottom',
               bbox=dict(boxstyle='square', fc='white', ec='black'),
               arrowprops=dict(arrowstyle='-[, widthB=5, lengthB=0.8', lw=1.0, color='black'))

   ax.annotate('Test\nLength', xy=(.23, -0.1), xytext=(.23, -0.2),
               xycoords='axes fraction',
               textcoords='axes fraction',
               fontsize=11, ha='center', va='bottom',
               bbox=dict(boxstyle='square', fc='white', ec='black'),
               arrowprops=dict(arrowstyle='-[, widthB=2, lengthB=0.8', lw=1.0, color='black'))

   ax.annotate('Bagging\nFraction', xy=(.32, -0.1), xytext=(.32, -0.2),
               xycoords='axes fraction',
               textcoords='axes fraction',
               fontsize=11, ha='center', va='bottom',
               bbox=dict(boxstyle='square', fc='white', ec='black'),
               arrowprops=dict(arrowstyle='-[, widthB=2.7, lengthB=0.8', lw=1.0, color='black'))


   ax.annotate('Feature\nFraction', xy=(.44, -0.1), xytext=(.44, -0.2),
               xycoords='axes fraction',
               textcoords='axes fraction',
               fontsize=11, ha='center', va='bottom',
               bbox=dict(boxstyle='square', fc='white', ec='black'),
               arrowprops=dict(arrowstyle='-[, widthB=3.4, lengthB=1.0', lw=1.0, color='black'))
   

   ax.annotate('Min.\nSamples', xy=(.55, -0.1), xytext=(.55, -0.2),
               xycoords='axes fraction',
               textcoords='axes fraction',
               fontsize=11, ha='center', va='bottom',
               bbox=dict(boxstyle='square', fc='white', ec='black'),
               arrowprops=dict(arrowstyle='-[, widthB=2.5, lengthB=1.0', lw=1.0, color='black'))    
   
   ax.annotate('Number of\nRounds', xy=(.8, -0.1), xytext=(.8, -0.2),
               xycoords='axes fraction',
               textcoords='axes fraction',
               fontsize=11, ha='center', va='bottom',
               bbox=dict(boxstyle='square', fc='white', ec='black'),
                arrowprops=dict(arrowstyle='-[, widthB=11.2, lengthB=1.0', lw=1.0, color='black'))

The plots below show the regression coefficient estimates and their confidence intervals. The intercept (not shown) is small, positive, and statistically significant; it captures the effect of the dropped categories (the lowest value of each parameter).

For the 1-day forecasts, some (though not all) of the results are informative: a 21-day test period works better, as do min_data_in_leaf values of 500 or 1,000. Around 100-200 trees appear to work best, and both short and long training periods outperform the intermediate values.

with sns.axes_style('white'):
   fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
   axes = axes.flatten()
   for i, t in enumerate([1, 5]):
       visualize_lr_result(lin_reg[t], axes[i])
       axes[i].set_title(f'Lookahead: {t} Day(s)')
   fig.suptitle('OLS Coefficients & Confidence Intervals', fontsize=20)
   fig.tight_layout()
   fig.subplots_adjust(top=.92)

[Image 6: OLS coefficients and confidence intervals]

Information Coefficient: Summary

We also check the overall IC, which is often reported but does not necessarily align with the objective of a daily trading strategy that acts on the model's return predictions, where the daily IC is what matters.

g = sns.catplot(x='t',
               y='ic',
               col='train_length',
               row='test_length',
               data=ic[(ic.test_length == 21) & (ic.t < 21)],
               kind='box')

[Image 7: overall IC by train length and lookahead (test length = 21)]

t = 1
train_length = 756
test_length = 21
g = sns.catplot(x='rounds',
   y='ic',
   col='feature_fraction',
   hue='bagging_fraction',
   row='min_data_in_leaf',
   data=ic[(ic.t == t) &
           (ic.train_length == train_length) &
           (ic.test_length == test_length)],
   kind='swarm');

[Image 8: IC by rounds, feature_fraction, bagging_fraction, and min_data_in_leaf]

Random Forest vs. Linear Regression

Load the linear regression results.

lr_metrics = pd.read_csv(results_path / 'lin_reg_performance.csv')  # saved earlier in this post
with sns.axes_style("white"):
   ax = (ic.groupby('t').ic.max().to_frame('Random Forest')
    .join(lr_metrics.groupby('lookahead').ic.max().to_frame('Linear Regression')).plot.barh())
   ax.set_ylabel('Lookahead')
   ax.set_xlabel('Information Coefficient')
   sns.despine()
   plt.tight_layout();

[Image 9: best IC, random forest vs. linear regression, by lookahead]

Generate Predictions

We use the ten best models to build an ensemble: the per-model predictions are generated below and averaged afterwards.

param_cols = ['train_length', 'test_length', 'bagging_fraction',
             'feature_fraction', 'min_data_in_leaf', 'rounds']
def get_params(data, t=5, best=0):
   df = data[data.t == t].sort_values('ic', ascending=False).iloc[best]
   df = df.loc[param_cols]
   rounds = int(df.rounds)
   params = pd.to_numeric(df.drop('rounds'))
   return params, rounds
base_params = dict(boosting_type='rf',
                  objective='regression',
                  bagging_freq=1,
                  verbose=-1)
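
For example, to retrieve the best configuration for the 1-day horizon (illustrative call, using daily_ic_avg computed above):

params, rounds = get_params(daily_ic_avg, t=1, best=0)
print(params)  # train_length, test_length, bagging/feature fractions, min_data_in_leaf
print(rounds)  # number of trees at which the IC peaked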

store = Path(results_path / 'predictions.h5')
for lookahead in [1, 5, 10, 21]:
   if lookahead > 1: continue  # only compute the 1-day horizon here to keep the runtime manageable
   print(f'\nLookahead: {lookahead:02}')
   data = (pd.read_hdf('data.h5', 'stooq/japan/equities'))
   labels = sorted(data.filter(like='fwd').columns)
   features = data.columns.difference(labels).tolist()
   label = f'fwd_ret_{lookahead:02}'
   data = data.loc[:, features + [label]].dropna()

   categoricals = ['year', 'weekday', 'month']
   for feature in categoricals:
       data[feature] = pd.factorize(data[feature], sort=True)[0]

   lgb_data = lgb.Dataset(data=data[features],
                          label=data[label],
                          categorical_feature=categoricals,
                          free_raw_data=False)
   
   for position in range(10):
       params, num_boost_round = get_params(daily_ic_avg,
                                            t=lookahead,
                                            best=position)
       params = params.to_dict()
       params['min_data_in_leaf'] = int(params['min_data_in_leaf'])
       train_length = int(params.pop('train_length'))
       test_length = int(params.pop('test_length'))
       params.update(base_params)

       print(f'\tPosition: {position:02}')

       n_splits = int(2 * YEAR / test_length)
       cv = MultipleTimeSeriesCV(n_splits=n_splits,
                                 test_period_length=test_length,
                                 lookahead=lookahead,
                                 train_period_length=train_length)

       predictions = []
       start = time()
       for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
           lgb_train = lgb_data.subset(train_idx.tolist()).construct()

           model = lgb.train(params=params,
                             train_set=lgb_train,
                             num_boost_round=num_boost_round,
                             verbose_eval=False)

           test_set = data.iloc[test_idx, :]
           y_test = test_set.loc[:, label].to_frame('y_test')
           y_pred = model.predict(test_set.loc[:, model.feature_name()])
           predictions.append(y_test.assign(prediction=y_pred))

       if position == 0:
           test_predictions = (pd.concat(predictions)
                               .rename(columns={'prediction': position}))
       else:
           test_predictions[position] = pd.concat(predictions).prediction
       

   by_day = test_predictions.groupby(level='date')
   for position in range(10):
       if position == 0:
           ic_by_day = by_day.apply(lambda x: spearmanr(x.y_test, x[position])[0]).to_frame()
       else:
           ic_by_day[position] = by_day.apply(lambda x: spearmanr(x.y_test, x[position])[0])

   test_predictions.to_hdf(store, f'test/{lookahead:02}')
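
With the per-model predictions stored, the ensemble itself is just the average of the ten prediction columns. A minimal sketch (equal-weighted mean, scored by daily IC):

ensemble = test_predictions.assign(
    ensemble=test_predictions[list(range(10))].mean(axis=1))
ensemble_ic = (ensemble.groupby(level='date')
               .apply(lambda x: spearmanr(x.y_test, x.ensemble)[0]))
print(f'{ensemble_ic.mean():.2%}')  # average daily IC of the ensemble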
