
Machine Learning Notes

Optimization of hyperparameters

On hyperparameter tuning: in general, tuning is specific to a particular dataset, and is best done against a held-out validation set or with K-fold cross-validation. Tuning is also very expensive, so it is usually best to start with the most important hyperparameters and only move on to the others when necessary (good enough is good enough).

Techniques for hyperparameter tuning include grid search, random search, and Bayesian optimization.

Some people argue that grid search handles wide parameter ranges poorly and therefore recommend Bayesian methods. However, the objective does not necessarily vary smoothly with each parameter, so Bayesian optimization, which explores the space under a smoothness assumption, is not guaranteed to be a good fit either (in practice it does work well, though).
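In scikit-learn, grid search and random search correspond to GridSearchCV and RandomizedSearchCV. Below is a minimal sketch comparing the two; the dataset and parameter ranges are made up purely for illustration.

python
from scipy.stats import uniform
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier

# Toy dataset purely for illustration.
X, y = make_classification(n_samples=500, n_informative=10, random_state=42)
model = XGBClassifier(n_estimators=100, random_state=42)

# Grid search: exhaustive over a small, discrete grid.
grid = GridSearchCV(
    model,
    param_grid={'max_depth': [3, 6, 9], 'learning_rate': [0.01, 0.1, 0.3]},
    cv=5,
    scoring='accuracy',
)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)

# Random search: samples a fixed budget of points, so it scales better
# when there are many parameters or wide ranges.
rand = RandomizedSearchCV(
    model,
    param_distributions={
        'max_depth': list(range(3, 10)),
        'learning_rate': uniform(0.01, 0.29),  # samples from [0.01, 0.30]
    },
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
rand.fit(X, y)
print(rand.best_params_, rand.best_score_)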

XGBoost

The official documentation is always the primary source: XGBoost Hyperparameter Optimization

Core hyperparameters

| Parameter | Description | Effect | Recommended range |
| --- | --- | --- | --- |
| max_depth | Maximum depth of a single tree | Greater depth increases model complexity (prone to overfitting) | 3-9 |
| min_child_weight | Minimum sum of instance weights required in a child node | Larger values constrain tree growth (guards against overfitting) | 1-7 |
| subsample | Fraction of training rows sampled for each tree | Lower values add randomness (guards against overfitting) | 0.6-1.0 |
| colsample_bytree | Fraction of feature columns sampled for each tree | Restricts the feature space (improves generalization) | 0.6-1.0 |
| learning_rate | Step-size shrinkage applied to weight updates | Smaller steps improve stability (require more trees) | 0.01-0.3 |

Ref: Suggested Ranges for Tuning XGBoost Hyperparameters

& Most Important XGBoost Hyperparameters to Tune
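Read as a search space, the table above maps almost directly onto distributions for RandomizedSearchCV. The sketch below does exactly that under that assumption; parameter names follow the xgboost scikit-learn API.

python
from scipy.stats import randint, uniform

# Sketch: the recommended ranges from the table, expressed as distributions
# for RandomizedSearchCV (uniform(loc, scale) samples from [loc, loc + scale]).
core_param_distributions = {
    'max_depth': randint(3, 10),            # 3-9 (randint upper bound is exclusive)
    'min_child_weight': randint(1, 8),      # 1-7
    'subsample': uniform(0.6, 0.4),         # 0.6-1.0
    'colsample_bytree': uniform(0.6, 0.4),  # 0.6-1.0
    'learning_rate': uniform(0.01, 0.29),   # 0.01-0.3
}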

Extended XGBoost hyperparameters

| Parameter | Description | Effect | Recommended range |
| --- | --- | --- | --- |
| gamma | Minimum loss reduction required to make a split | Larger values make the model more conservative (useful for noisy data) | 0-5 |
| reg_alpha (L1) | L1 regularization coefficient | Sparsifies feature weights (yields sparse solutions) | 0-10 |
| reg_lambda (L2) | L2 regularization coefficient | Smooths feature weights (discourages extreme values) | 0-10 |
| scale_pos_weight | Balancing weight between positive and negative samples | Handles class imbalance (values > 1 up-weight the positive class) | 1-10 |
| n_estimators | Total number of trees in the ensemble | More trees increase model capacity (trade off against compute cost) | 100-2000 |

The ranges here are basically guesses (the official documentation does not give recommended ranges for these).
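One of these does have a documented heuristic: for imbalanced binary classification, the XGBoost docs suggest setting scale_pos_weight to the ratio of negative to positive instances. A tiny sketch with a made-up label array:

python
import numpy as np

# Heuristic from the XGBoost docs: sum(negative instances) / sum(positive instances).
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])  # toy imbalanced labels
scale_pos_weight = (y == 0).sum() / (y == 1).sum()
print(scale_pos_weight)  # 4.0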


Hyperparameter tuning with Optuna

Ref: Bayesian Optimization of XGBoost Hyperparameters with optuna

The example from the official site:
python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
import optuna

# Generate a synthetic classification dataset
X, y = make_classification(n_samples=1000, n_classes=2, n_informative=10, random_state=42)

def objective(trial):
    # Suggest hyperparameters
    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    # Create an XGBoost classifier with the suggested hyperparameters
    model = XGBClassifier(**params, n_estimators=100, objective='binary:logistic', random_state=42)

    # Perform 5-fold cross-validation and return the mean accuracy
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    return scores.mean()

# Create an Optuna study with TPE sampler
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())

# Optimize the study for 100 trials
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and best score
print(f"Best hyperparameters: {study.best_params}")
print(f"Best score: {study.best_value:.4f}")
Using manual K-Fold cross-validation
python
import joblib
import numpy as np
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

def optimize_hyperparameters(target_name, X_train_, y_train_):
    """使用交叉验证进行参数优化"""
    def objective(trial):
        params = {
            'max_depth': trial.suggest_int('max_depth', 3, 9),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 2.0),
            'tree_method': 'hist',  # or 'gpu_hist' when a GPU is available
        }
        
        # K-fold cross-validation (use StratifiedKFold instead for classification tasks)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        scores = []
        
        for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X_train_, y_train_)):
            X_tr, y_tr = X_train_.iloc[train_idx], y_train_.iloc[train_idx]
            X_v, y_v = X_train_.iloc[val_idx], y_train_.iloc[val_idx]

            model = XGBRegressor(
                **params,
                n_estimators=2000,
                early_stopping_rounds=20,
                eval_metric='rmse',
                random_state=42,  # could use 42 + fold_idx to vary the seed per fold
                n_jobs=-1
            )
            
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_v, y_v)],
                verbose=False
            )
            
            # Predict with the best iteration and compute the fold RMSE
            y_pred = model.predict(X_v, iteration_range=(0, model.best_iteration + 1))
            fold_score = np.sqrt(mean_squared_error(y_v, y_pred))
            scores.append(fold_score)
            
            # Report intermediate scores so Optuna can prune unpromising trials
            trial.report(fold_score, step=fold_idx)
            if trial.should_prune():
                raise optuna.TrialPruned()
        
        return np.mean(scores)

    # Create an Optuna study with persistent SQLite storage
    storage_name = f"sqlite:///{target_name}_tuning_optuna.db"
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=42),
        study_name=target_name,
        storage=storage_name,
        load_if_exists=True
    )
    
    # Optimize with a progress bar and a 24-hour timeout
    study.optimize(objective, n_trials=500, show_progress_bar=True, timeout=86400)
    
    # Save the complete study object
    joblib.dump(study, f"optuna_study_{target_name}.pkl")
    return study.best_params
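A hypothetical call, assuming X_train_ is a pandas DataFrame and y_train_ a Series (the function indexes folds with .iloc, so plain NumPy arrays would need small changes); with 500 trials this runs for a long time:

python
import pandas as pd
from sklearn.datasets import make_regression

# Hypothetical usage on a synthetic regression dataset.
X_arr, y_arr = make_regression(n_samples=500, n_features=20, noise=0.1, random_state=42)
X_train = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(X_arr.shape[1])])
y_train = pd.Series(y_arr, name="target")

best_params = optimize_hyperparameters("target", X_train, y_train)
print(best_params)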

P.S.: I really want to complain about sqlite:/// — no idea why it is structured like that.
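For what it's worth, the string is a SQLAlchemy-style database URL (dialect://user:password@host/path); SQLite has no host, so the empty host plus a relative file path yields the three consecutive slashes (an absolute path gives four). The same URL also lets you reopen a persisted study later, e.g. to resume or inspect it. A sketch assuming the naming convention used in optimize_hyperparameters above, with target_name purely illustrative:

python
import optuna

# Reopen a study persisted by the function above (target_name is illustrative).
target_name = "my_target"
study = optuna.load_study(
    study_name=target_name,
    storage=f"sqlite:///{target_name}_tuning_optuna.db",
)
print(study.best_params)
print(len(study.trials))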