ML 常用trick & code
Published:
pd 常用操作
# Common pandas operations
df.shape                              # (rows, cols) — an attribute, not a method
df.head(3)                            # peek at the first 3 rows
df.drop(['col_a', 'col_b'], axis=1)   # drop columns by name (fill in real names)

# Split the target column out of the feature frame
y_df = df[y]
x_df = df.drop(y, axis=1)

# Hold out 30% for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    x_df, y_df, test_size=0.3, random_state=42
)
def print_evaluate(true, predicted):
    """Print standard regression metrics (MAE, MSE, RMSE, R^2).

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions aligned with ``true``.
    """
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)  # reuse mse instead of recomputing it
    r2_square = metrics.r2_score(true, predicted)
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square:', r2_square)
Optuna 调参
def objective(trial):
    """Optuna objective: mean 5-fold CV MSE of an XGBRegressor (minimize)."""
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
    }
    regressor = xgb.XGBRegressor(**search_space, random_state=42)
    # cross_val_score returns *negative* MSE per fold; negate the mean
    # to hand Optuna a positive error to minimize.
    fold_scores = cross_val_score(
        regressor, X_train, y_train,
        cv=5, scoring='neg_mean_squared_error', n_jobs=-1,
    )
    return -np.mean(fold_scores)
# n_jobs=-1 runs trials (and the CV inside each trial) on all CPU cores
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50, n_jobs=-1)

# Best hyper-parameters found by the search
best_params = study.best_params
print("Best parameters:", best_params)

# Refit on the full training set; keep random_state=42 so the final
# model matches the configuration evaluated inside objective()
model = xgb.XGBRegressor(**best_params, random_state=42)
model.fit(X_train, y_train)

# Predict on both splits so train/test metrics can be compared
# (a large gap indicates over-fitting)
test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)
# NOTE: removed trailing `evaluate(y_test, test_pred)` — `evaluate` is
# never defined (NameError) and it duplicated the print_evaluate call above.