Kaggle 販売予測モデルのGridSearch
Kaggleのこのコードがすごくわかりやすい。感謝。
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
# Wrap the training data in a LightGBM Dataset (lgb.Dataset is LightGBM's own
# container; "DMatrix" is the XGBoost equivalent). The scikit-learn style
# search below is fitted on X_train / y_train directly and never reads this
# object; it is kept in case other code relies on it.
train_data = lgb.Dataset(X_train, label=y_train.values.ravel())

# Hyper-parameter grid for GridSearchCV. Single-element lists pin a setting;
# multi-element lists are searched exhaustively.
param_search = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [20, 40],
    'boosting_type': ['gbdt'],
    'objective': ['regression'],
    'random_state': [501],
    # NOTE(review): 0.1 samples only 10% of the features per tree; confirm
    # this is intended and not a typo for 1.0.
    'colsample_bytree': [0.8, 0.1],
    'subsample': [0.8, 1],
    # LightGBM only applies row subsampling when subsample_freq (bagging_freq)
    # is non-zero. At its default of 0 both 'subsample' values above train
    # identical models, so bagging is enabled explicitly here.
    'subsample_freq': [1],
    'min_split_gain': [0.01],
    # 'metric' only selects what LightGBM reports on evaluation sets. No
    # eval_set is passed to fit() and candidates are ranked by `my_scorer`,
    # so searching over several metrics merely repeats identical fits.
    'metric': ['l1'],
    'device': ['gpu'],  # Enable GPU
}

# Time-series cross-validation: every fold validates on rows that come after
# the rows it was trained on.
# NOTE(review): assumes X_train is sorted chronologically -- confirm.
tscv = TimeSeriesSplit(n_splits=5)

# Exhaustive search over the grid. `my_scorer` is defined outside this
# snippet (presumably built with sklearn.metrics.make_scorer -- verify).
gsearch = GridSearchCV(
    lgb.LGBMRegressor(),
    param_grid=param_search,
    scoring=my_scorer,
    cv=tscv,
)
gsearch.fit(X_train, y_train.values.ravel())

# Report the best parameter combination found by the search.
best_params = gsearch.best_params_
print("Best parameters: ", best_params)
こういうサマリ表示の関数も便利。
def summary(df):
    """Build a per-column overview of ``df``.

    Prints the shape of ``df`` and returns a DataFrame indexed by the columns
    of ``df`` holding, for each column: its dtype, the number and percentage
    of missing values, the number of unique values, the ``min`` / ``max``
    reported by ``DataFrame.describe(include='all')``, and the values found
    in the first three rows.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        A DataFrame with one row per column of ``df`` and the columns
        ``data type``, ``#missing``, ``%missing``, ``#unique``, ``min``,
        ``max``, ``first value``, ``second value`` and ``third value``.
        ``min`` / ``max`` are NaN when ``describe`` does not report them, and
        a row-value column is NaN when ``df`` has too few rows.
    """
    # Print the shape of the DataFrame
    print(f'data shape: {df.shape}')

    # Start the summary from the dtypes so it is indexed by column name.
    summ = df.dtypes.to_frame('data type')

    # Missing-value count and percentage (the count is computed only once).
    missing = df.isnull().sum().values
    summ['#missing'] = missing
    summ['%missing'] = missing / len(df) * 100

    # Number of distinct values per column.
    summ['#unique'] = df.nunique().values

    # describe() only emits 'min' / 'max' when at least one column supports
    # them; a frame made purely of object columns has neither, so fall back
    # to NaN instead of raising KeyError.
    desc = df.describe(include='all').transpose()
    for stat in ('min', 'max'):
        summ[stat] = desc[stat].values if stat in desc.columns else float('nan')

    # Take the first three rows by position (iloc), so the result does not
    # depend on the index labels; frames shorter than three rows get NaN.
    for pos, label in enumerate(('first value', 'second value', 'third value')):
        summ[label] = df.iloc[pos].values if pos < len(df) else float('nan')

    # Return the summary DataFrame
    return summ
この記事が気に入ったらサポートをしてみませんか?