第11章:ランダムフォレスト - ロングショート戦略 第4節: 日本株特徴量

2020年8月22日 23:52

ここでは、予測モデルを作成する前段階として、モデルに入力する特徴量を作成する処理を行います。

インポートと設定

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd
import talib

import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's plain "white" theme for the figures drawn further down.
sns.set_style('white')

# Shorthand for writing label slices over the MultiIndex
# (used below as idx[tickers, dates]).
idx = pd.IndexSlice

データ取得

# Directory that holds the HDF5 store with the raw price data.
DATA_DIR = Path('..', 'data')

# Notebook echo of the store location (has no effect when run as a script).
DATA_DIR / 'assets.h5'

# Load the price table stored under 'stooq/jp/tse/stocks/prices' and keep,
# for every ticker (first index level), the rows dated 2010 through 2019.
prices = (pd.read_hdf(DATA_DIR / 'assets.h5', 'stooq/jp/tse/stocks/prices')
          .loc[idx[:, '2010': '2019'], :])

# `null_counts` was deprecated in pandas 1.2 and removed in 2.0;
# `show_counts` is its replacement and prints the same non-null column.
prices.info(show_counts=True)
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10286682 entries, ('1301.JP', Timestamp('2005-03-22 00:00:00')) to ('9997.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 5 columns):
#   Column  Non-Null Count     Dtype  
---  ------  --------------     -----  
0   open    10286682 non-null  float64
1   high    10286682 non-null  float64
2   low     10286682 non-null  float64
3   close   10286682 non-null  float64
4   volume  10286682 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 432.0+ MB
'''

# Number of distinct tickers before cleaning. Index.unique(level) already
# returns de-duplicated values, so a second .unique() call is unnecessary.
before = len(prices.index.unique('ticker'))

欠損値を取り除く

# Pivot to one column per (field, ticker) so gaps are handled column-wise:
# forward-fill at most five consecutive missing rows, drop every column that
# still contains a gap, then return to the long (ticker, date) layout.
prices = (prices.unstack('ticker')
          .sort_index()
          .ffill(limit=5)
          .dropna(axis=1)
          .stack('ticker')
          .swaplevel())

# `null_counts` was deprecated in pandas 1.2 and removed in 2.0;
# `show_counts` is its replacement.
prices.info(show_counts=True)
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 873446 entries, ('1332.JP', Timestamp('2000-01-04 00:00:00')) to ('9735.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 5 columns):
#   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
0   open    873446 non-null  float64
1   high    873446 non-null  float64
2   low     873446 non-null  float64
3   close   873446 non-null  float64
4   volume  873446 non-null  float64
dtypes: float64(5)
memory usage: 36.7+ MB
'''

# Number of distinct tickers that survived the missing-value filter.
# Index.unique(level) is already de-duplicated; no second .unique() needed.
after = len(prices.index.unique('ticker'))
print(f'Before: {before:,.0f} after: {after:,.0f}')
'''
Before: 3,667 after: 178
'''

最も取引されているシンボルを残す

# Traded value per row: closing price multiplied by volume.
dv = prices['close'] * prices['volume']

# Rank tickers by their median traded value and keep (at most) the top 1,000.
median_dv = dv.groupby(level='ticker').median()
keep = median_dv.nlargest(1000).index.tolist()

# Restrict the price table to the selected tickers, all dates.
prices = prices.loc[idx[keep, :], :]
prices.info()
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 873446 entries, ('1332.JP', Timestamp('2000-01-04 00:00:00')) to ('9735.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 5 columns):
#   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
0   open    873446 non-null  float64
1   high    873446 non-null  float64
2   low     873446 non-null  float64
3   close   873446 non-null  float64
4   volume  873446 non-null  float64
dtypes: float64(5)
memory usage: 36.7+ MB
'''

特徴量エンジニアリング

# Look-back horizons, counted in rows of each ticker's price series.
intervals = [1, 5, 10, 21, 63]

# One percentage-change column per horizon, computed within each ticker so
# that returns never span two different tickers.
by_ticker = prices.groupby(level='ticker').close
returns = pd.concat(
    [by_ticker.pct_change(t).rename(f'ret_{t}') for t in intervals],
    axis=1,
)

returns.info()
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 873446 entries, ('1332.JP', Timestamp('2000-01-04 00:00:00')) to ('9735.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 5 columns):
#   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
0   ret_1   873268 non-null  float64
1   ret_5   872556 non-null  float64
2   ret_10  871666 non-null  float64
3   ret_21  869708 non-null  float64
4   ret_63  862232 non-null  float64
dtypes: float64(5)
memory usage: 36.7+ MB
'''

異常値を取り除く

# Largest return each ticker ever posted, per horizon.
max_ret_by_sym = returns.groupby(level='ticker').max()

# Summarise only the upper tail of those maxima: build symmetric percentile
# levels, sort them, and pass the upper six to describe().
percentiles = [0.001, .005, .01, .025, .05, .1]
upper_tail = [1 - p for p in percentiles]
percentiles = percentiles + upper_tail
max_ret_by_sym.describe(percentiles=sorted(percentiles)[6:])

# Per-horizon cut-off: the 95th percentile of the per-ticker maxima.
quantiles = max_ret_by_sym.quantile(.95)

# Collect every ticker that exceeds the cut-off, once per offending horizon.
flagged = []
for ret, q in quantiles.items():
    above_cutoff = max_ret_by_sym[ret] > q
    flagged += max_ret_by_sym[above_cutoff].index.tolist()

# A ticker is dropped only when it exceeds the cut-off on more than one horizon.
hits_per_ticker = pd.Series(flagged).value_counts()
to_drop = hits_per_ticker[hits_per_ticker > 1].index.tolist()
len(to_drop)
'''
11
'''

# Remove every row that belongs to one of the flagged tickers.
prices = prices.drop(index=to_drop, level='ticker')
prices.info()
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 819469 entries, ('1332.JP', Timestamp('2000-01-04 00:00:00')) to ('9735.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 5 columns):
#   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
0   open    819469 non-null  float64
1   high    819469 non-null  float64
2   low     819469 non-null  float64
3   close   819469 non-null  float64
4   volume  819469 non-null  float64
dtypes: float64(5)
memory usage: 34.5+ MB
'''

相対リターンのパーセンタイルを計算

# Rebuild the return features on the cleaned price table and add, for every
# horizon, the cross-sectional bucket of each return on its date.
returns = []
by_sym = prices.groupby(level='ticker').close
for t in intervals:
    ret = by_sym.pct_change(t)
    # qcut assigns each ticker's return to one of (up to) 20 equal-frequency
    # buckets among all tickers on the same date; duplicate bin edges are
    # dropped, so a date can end up with fewer than 20 buckets.
    # group_keys=False keeps the original (ticker, date) index: since
    # pandas 2.0 the default would prepend the group key as an extra level,
    # which would break the column-wise concat below.
    rel_perc = (ret.groupby(level='date', group_keys=False)
                .apply(lambda x: pd.qcut(x, q=20, labels=False, duplicates='drop')))
    returns.extend([ret.to_frame(f'ret_{t}'), rel_perc.to_frame(f'ret_rel_perc_{t}')])
returns = pd.concat(returns, axis=1)

テクニカルインジケーター

Percentage Price Oscillator

# Percentage Price Oscillator of the close, computed separately per ticker
# with TA-Lib's default parameters.
# group_keys=False (as already used for NATR) keeps the original index; since
# pandas 2.0 the default would prepend a second 'ticker' level to the result.
ppo = (prices.groupby(level='ticker', group_keys=False)
       .close
       .apply(talib.PPO)
       .to_frame('PPO'))

Normalized Average True Range

def _natr_for_ticker(ohlc):
    """Return TA-Lib's NATR for one ticker's high/low/close columns."""
    return talib.NATR(ohlc.high, ohlc.low, ohlc.close)


# Normalized Average True Range per ticker; group_keys=False keeps the
# result on the original index.
natr = (prices.groupby(level='ticker', group_keys=False)
        .apply(_natr_for_ticker)
        .to_frame('NATR'))

RSI

# Relative Strength Index of the close, computed separately per ticker with
# TA-Lib's default parameters.
# group_keys=False (as already used for NATR) keeps the original index; since
# pandas 2.0 the default would prepend a second 'ticker' level to the result.
rsi = (prices.groupby(level='ticker', group_keys=False)
       .close
       .apply(talib.RSI)
       .to_frame('RSI'))

Bollinger Bands

def get_bollinger(x):
    """Return the Bollinger Bands of ``x`` as a three-column DataFrame.

    ``x`` is passed straight to ``talib.BBANDS`` with its default settings.
    The result has columns ``'u'`` (upper band), ``'m'`` (middle band) and
    ``'l'`` (lower band), in that order.
    """
    upper, middle, lower = talib.BBANDS(x)
    bands = {'u': upper, 'm': middle, 'l': lower}
    return pd.DataFrame(bands)

# Bollinger Bands of the close per ticker (columns 'u', 'm', 'l').
# group_keys=False (as already used for NATR) keeps the original index; since
# pandas 2.0 the default would prepend a second 'ticker' level to the result.
bbands = (prices.groupby(level='ticker', group_keys=False)
          .close
          .apply(get_bollinger))

特徴量を結合する

# Put prices, returns and all indicators side by side on the shared index.
feature_frames = [prices, returns, ppo, natr, rsi, bbands]
data = pd.concat(feature_frames, axis=1)

# Express the bands relative to the close: close / lower band and
# upper band / close. The raw band levels are then no longer needed.
data['bbl'] = data['close'] / data['l']
data['bbu'] = data['u'] / data['close']
data = data.drop(columns=['u', 'm', 'l'])

# Rank correlation between the two band ratios.
data['bbu'].corr(data['bbl'], method='spearman')
'''
-0.17464878509378576
'''

ランダムに選んだティッカーのインジケーターをプロット

# Columns to chart, one subplot each.
indicators = ['close', 'bbl', 'bbu', 'PPO', 'NATR', 'RSI']

# Draw one ticker at random from the row labels.
ticker = np.random.choice(data.index.get_level_values('ticker'))

# Select that ticker's rows, index them by date only, and plot a 3x2 grid.
sample = data.loc[idx[ticker, :], indicators].reset_index('ticker', drop=True)
sample.plot(lw=1,
            subplots=True,
            figsize=(16, 10),
            title=indicators,
            layout=(3, 2),
            legend=False)
plt.suptitle(ticker, fontsize=14)
sns.despine()
plt.tight_layout()
# Leave headroom for the suptitle after tight_layout().
plt.subplots_adjust(top=.95)

# The raw price columns were only needed for the chart; keep features only.
data = data.drop(columns=prices.columns)

時間系インジケーター作成

# Calendar features read from the 'date' level of the index.
dates = data.index.get_level_values('date')
for calendar_part in ('weekday', 'month', 'year'):
    data[calendar_part] = getattr(dates, calendar_part)

フォワードリターンの計算

# Forward returns: for each horizon t, the t-period return observed t rows
# later for the same ticker, i.e. the return realised over the next t rows.
outcomes = []
by_ticker = data.groupby('ticker')
for t in intervals:
    k = f'fwd_ret_{t:02}'
    outcomes.append(k)
    # Shifting within the ticker group keeps one ticker's future returns
    # from leaking into the last rows of another ticker.
    data[k] = by_ticker[f'ret_{t}'].shift(-t)

# `null_counts` was deprecated in pandas 1.2 and removed in 2.0;
# `show_counts` is its replacement.
data.info(show_counts=True)
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 819469 entries, ('1332.JP', Timestamp('2000-01-04 00:00:00')) to ('9735.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 23 columns):
#   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
0   ret_1            819302 non-null  float64
1   ret_rel_perc_1   818968 non-null  float64
2   ret_5            818634 non-null  float64
3   ret_rel_perc_5   818634 non-null  float64
4   ret_10           817799 non-null  float64
5   ret_rel_perc_10  817799 non-null  float64
6   ret_21           815962 non-null  float64
7   ret_rel_perc_21  815962 non-null  float64
8   ret_63           808948 non-null  float64
9   ret_rel_perc_63  808948 non-null  float64
10  PPO              815294 non-null  float64
11  NATR             817131 non-null  float64
12  RSI              817131 non-null  float64
13  bbl              818801 non-null  float64
14  bbu              818801 non-null  float64
15  weekday          819469 non-null  int64  
16  month            819469 non-null  int64  
17  year             819469 non-null  int64  
18  fwd_ret_01       819302 non-null  float64
19  fwd_ret_05       818634 non-null  float64
20  fwd_ret_10       817799 non-null  float64
21  fwd_ret_21       815962 non-null  float64
22  fwd_ret_63       808948 non-null  float64
dtypes: float64(20), int64(3)
memory usage: 147.0+ MB
'''

# Persist the feature table to 'data.h5' in the working directory.
# `key` is passed by keyword: recent pandas releases deprecate passing it
# positionally, and the keyword form works on older versions too.
data.to_hdf('data.h5', key='stooq/japan/equities')

この記事が気に入ったらサポートをしてみませんか？

第11章:ランダムフォレスト - ロングショート戦略 第4節: 日本株特徴量

インポートと設定

データ取得

特徴量エンジニアリング

第11章:ランダムフォレスト - ロングショート戦略 第4節: 日本株特徴量