見出し画像

ボット機械学習アイデア

コードはCC0

NonstationaryFeatureRemover

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.utils.validation import check_array, check_is_fitted

class NonstationaryFeatureRemover(BaseEstimator, TransformerMixin):
    """Drop the features that are most useful for predicting the row order.

    ``fit`` trains a clone of ``estimator`` to predict each row's position
    (``0 .. n_samples - 1``) from ``X`` and ranks the columns by the model's
    ``feature_importances_``. The columns with the highest importance are
    removed; ``transform`` returns the remaining columns in their original
    order.

    Parameters
    ----------
    estimator : estimator or None
        Any regressor exposing ``feature_importances_`` after ``fit``.
        ``None`` selects ``lgb.LGBMRegressor(n_jobs=-1, random_state=1)``.
    remove_count : int or None
        Number of columns to remove.
    remove_ratio : float or None
        Fraction of the columns to remove (truncated towards zero).
        Only used when ``remove_count`` is ``None`` or ``0``.

    Attributes
    ----------
    selected_features_ : ndarray of int
        Ascending column indices that are kept by ``transform``.
    n_features_in_ : int
        Number of columns seen by ``fit``.

    Raises
    ------
    ValueError
        If both ``remove_count`` and ``remove_ratio`` are given.
    """

    def __init__(self, estimator=None, remove_count=None, remove_ratio=None):
        if remove_count and remove_ratio:
            raise ValueError('remove_count and remove_ratio cannot be set simultaneously')
        self.estimator = lgb.LGBMRegressor(n_jobs=-1, random_state=1) if estimator is None else estimator
        self.remove_count = remove_count
        self.remove_ratio = remove_ratio

    def _resolve_remove_count(self, n_features):
        """Return how many columns to drop for an input with ``n_features`` columns."""
        if self.remove_count:
            remove_count = self.remove_count
        elif self.remove_ratio is not None:
            remove_count = int(self.remove_ratio * n_features)
        else:
            # Neither option was given: keep every column instead of failing
            # on ``None * n_features``.
            remove_count = 0

        if remove_count > n_features:
            raise ValueError(
                f'cannot remove {remove_count} features from an input with '
                f'{n_features} features'
            )
        # A negative request removes nothing.
        return max(remove_count, 0)

    def fit(self, X, y=None):
        """Rank the columns of ``X`` and remember which ones to keep.

        ``y`` is ignored; the regression target is the row index.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the requested number of columns to remove exceeds the number
            of columns in ``X``.
        """
        # check_array is the public validation helper; it does not depend on
        # private BaseEstimator methods.
        X = check_array(X)
        n_samples, n_features = X.shape

        model = clone(self.estimator)
        model.fit(X, np.arange(n_samples))
        importances = np.asarray(model.feature_importances_)

        remove_count = self._resolve_remove_count(n_features)

        # Ascending importance; the last ``remove_count`` entries are dropped.
        order = np.argsort(importances, kind='stable')
        keep = np.ones(n_features, dtype=bool)
        keep[order[n_features - remove_count:]] = False

        self.n_features_in_ = n_features
        # flatnonzero always yields an integer index array, even when empty.
        self.selected_features_ = np.flatnonzero(keep)

        return self

    def transform(self, X, y=None):
        """Return the columns of ``X`` that were kept by ``fit``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        ValueError
            If ``X`` does not have the number of columns seen in ``fit``.
        """
        check_is_fitted(self, 'selected_features_')
        X = check_array(X)

        expected = getattr(self, 'n_features_in_', None)
        if expected is not None and X.shape[1] != expected:
            raise ValueError(
                f'X has {X.shape[1]} features, but {type(self).__name__} '
                f'was fitted with {expected} features'
            )

        return X[:, self.selected_features_].copy()

    def inverse_transform(self, X, y=None):
        """Not supported: the removed columns cannot be reconstructed."""
        raise NotImplementedError('inverse_transform not implemented')

signとsample_weightで学習

# Train on the sign of y and weight every sample by the magnitude of y.
model.fit(
    df[features],
    np.sign(df['y']),
    sample_weight=df['y'].abs(),
)

非ゼロのみで学習

yを約定シミュレーションで作るときに、約定しなかったケースをゼロにすることがある。約定しなかったケースの精度は成績に影響しないので、ゼロを除いて学習すると成績が上がることがあった。

# Keep only the rows whose target is non-zero, then train on them.
df = df.loc[df['y'].ne(0)]
model.fit(
    df[features],
    df['y'],
)

rolling rankでy

# Percentile rank of each y inside a window made of the row itself and the
# rows that follow it: the series is reversed, ranked with a rolling window,
# and reversed back.
window = 100
df['y_rr'] = (
    df['y']
    .iloc[::-1]
    .rolling(window, min_periods=1)
    .rank(pct=True)
    .iloc[::-1]
)
model.fit(df[features], df['y_rr'])

rolling rankで特徴量

# Percentile rank of feature_1 inside a rolling window that ends at the
# current row (at most `window` rows, at least one).
window = 100
df['feature_1_rank'] = (
    df['feature_1']
    .rolling(window, min_periods=1)
    .rank(pct=True)
)