ML 常用trick & code

Published:

pd 常用操作

# Inspect dimensions — `shape` is an attribute, not a method (calling it raises TypeError)
df.shape

# Peek at the first 3 rows
df.head(3)

# Drop columns by name (axis=1 selects columns); placeholder names shown
df.drop(['col1', 'col2'], axis=1)

# Split target and features (one statement per line)
y_df = df[y]
x_df = df.drop(y, axis=1)

# 70/30 train/test split; fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x_df, y_df, test_size=0.3, random_state=42)

def print_evaluate(true, predicted):
    """Print regression metrics (MAE, MSE, RMSE, R^2) for true vs predicted values.

    Parameters
    ----------
    true : array-like of ground-truth target values
    predicted : array-like of model predictions, same length as `true`
    """
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)  # reuse mse instead of recomputing mean_squared_error
    r2_square = metrics.r2_score(true, predicted)
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square:', r2_square)  # colon added for consistency with the other labels

optuna 调参

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated MSE of an XGBRegressor.

    Lower is better; pair with a study created via direction='minimize'.
    Reads the module-level X_train / y_train split.
    """
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
    }

    regressor = xgb.XGBRegressor(**search_space, random_state=42)
    cv_scores = cross_val_score(
        regressor, X_train, y_train,
        cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    # scikit-learn reports the *negated* MSE; flip the sign so the study minimizes MSE
    return -np.mean(cv_scores)

# 使用n_jobs = -1 来多核运算

# Create a study that minimizes the objective (cross-validated MSE).
study = optuna.create_study(direction='minimize')
# Run 50 trials; n_jobs=-1 parallelizes trials across cores.
# NOTE(review): cross_val_score inside objective() also uses n_jobs=-1,
# so the two levels of parallelism may oversubscribe CPU cores — confirm.
study.optimize(objective, n_trials=50, n_jobs=-1)

# Best hyperparameters found across all trials.
best_params = study.best_params
print("Best parameters:", best_params)





# Refit a final model with the tuned hyperparameters.
# random_state=42 matches the setting used inside objective() during tuning,
# so the refit model is consistent with the configuration that was scored.
model = xgb.XGBRegressor(**best_params, random_state=42)
# Train on the training split
model.fit(X_train, y_train)

# Predict on both splits so train vs. test error can be compared (overfitting check)
test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)

print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)
# Removed trailing `evaluate(y_test, test_pred)` call: `evaluate` is never
# defined (NameError) and it duplicated the print_evaluate call above.