MLPlatform/function/model_trainer.py

298 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import numpy as pd
import numpy as np
from typing import Dict, List, Optional
import logging
from pathlib import Path
import datetime
import yaml
import mlflow
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
mean_absolute_error, mean_squared_error, r2_score, explained_variance_score,
adjusted_rand_score, homogeneity_score, completeness_score, silhouette_score
)
class ModelTrainer:
    """Train scikit-learn-style models and track every run in MLflow.

    Configuration is read from three YAML files under ``model/`` at
    construction time; logs go to a timestamped file under ``.log/``.
    """

    def __init__(self, config: Dict = None):
        """Initialize the trainer.

        Args:
            config: Optional settings dict; only ``'mlflow_uri'`` is read here.
        """
        self.config = config or {}
        self.logger = logging.getLogger(__name__)
        self._setup_logging()
        self._load_metrics()
        self._load_parameters()
        self._load_model_info()
        # Point MLflow at the tracking server (defaults to the in-house host).
        mlflow.set_tracking_uri(self.config.get('mlflow_uri', 'http://10.0.0.202:5000'))

    def _setup_logging(self):
        """Attach a timestamped file handler under '.log/'."""
        log_dir = Path('.log')
        log_dir.mkdir(exist_ok=True)
        # NOTE(review): a new handler is appended on every instantiation, so
        # creating several trainers duplicates log lines — confirm intended.
        file_handler = logging.FileHandler(
            log_dir / f'model_training_{datetime.datetime.now():%Y%m%d_%H%M%S}.log'
        )
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        )
        self.logger.addHandler(file_handler)
        self.logger.setLevel(logging.INFO)

    def _read_yaml(self, path: str, description: str) -> Dict:
        """Read one YAML config file, logging and re-raising on failure.

        Args:
            path: File path to read.
            description: Short name used in the error log message.
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return yaml.safe_load(f)
        except Exception as e:
            self.logger.error(f"Error loading {description}: {str(e)}")
            raise

    def _load_metrics(self):
        """Load the evaluation-metric configuration."""
        self.metrics_config = self._read_yaml('model/metrics.yaml', 'metrics config')

    def _load_parameters(self):
        """Load the model-parameter configuration."""
        self.parameter_config = self._read_yaml('model/parameter.yaml', 'parameter config')

    def _load_model_info(self):
        """Load the model-information configuration (algorithm descriptions)."""
        self.model_info = self._read_yaml('model/model.yaml', 'model info')

    def _get_model_class(self, algorithm_name: str):
        """Resolve an algorithm name to its estimator class.

        Imports are kept local so heavy optional dependencies (xgboost,
        lightgbm, catboost) are only loaded when a model is actually built.

        Raises:
            ValueError: If the algorithm name is unknown.
        """
        # Classification algorithms
        from sklearn.linear_model import LogisticRegression
        from sklearn.svm import SVC, OneClassSVM
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.ensemble import (
            RandomForestClassifier, GradientBoostingClassifier,
            AdaBoostClassifier, IsolationForest
        )
        from sklearn.naive_bayes import GaussianNB
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.neural_network import MLPClassifier
        import xgboost as xgb
        import lightgbm as lgb
        from catboost import CatBoostClassifier
        # Regression algorithms
        from sklearn.linear_model import (
            LinearRegression, Ridge, Lasso,
            ElasticNet
        )
        from sklearn.svm import SVR
        from sklearn.tree import DecisionTreeRegressor
        from sklearn.ensemble import (
            RandomForestRegressor, GradientBoostingRegressor,
            AdaBoostRegressor
        )
        from catboost import CatBoostRegressor
        from sklearn.neural_network import MLPRegressor
        # Clustering algorithms
        from sklearn.cluster import (
            KMeans, AgglomerativeClustering,
            DBSCAN, SpectralClustering
        )
        from sklearn.mixture import GaussianMixture
        algorithm_map = {
            # Classification
            'LogisticRegression': LogisticRegression,
            'SVC': SVC,
            'SVDD': OneClassSVM,  # SVDD is implemented via OneClassSVM
            'DecisionTreeClassifier': DecisionTreeClassifier,
            'RandomForestClassifier': RandomForestClassifier,
            'XGBClassifier': xgb.XGBClassifier,
            'AdaBoostClassifier': AdaBoostClassifier,
            'CatBoostClassifier': CatBoostClassifier,
            'LGBMClassifier': lgb.LGBMClassifier,
            'GaussianNB': GaussianNB,
            'KNeighborsClassifier': KNeighborsClassifier,
            'MLPClassifier': MLPClassifier,
            'GradientBoostingClassifier': GradientBoostingClassifier,
            # Regression
            'LinearRegression': LinearRegression,
            'Ridge': Ridge,
            'Lasso': Lasso,
            'ElasticNet': ElasticNet,
            'SVR': SVR,
            'DecisionTreeRegressor': DecisionTreeRegressor,
            'RandomForestRegressor': RandomForestRegressor,
            'XGBRegressor': xgb.XGBRegressor,
            'AdaBoostRegressor': AdaBoostRegressor,
            'CatBoostRegressor': CatBoostRegressor,
            'LGBMRegressor': lgb.LGBMRegressor,
            'MLPRegressor': MLPRegressor,
            # Clustering
            'KMeans': KMeans,
            'KMeansPlusPlus': KMeans,  # KMeans++ is KMeans with init='k-means++'
            'AgglomerativeClustering': AgglomerativeClustering,
            'DBSCAN': DBSCAN,
            'GaussianMixture': GaussianMixture,
            'SpectralClustering': SpectralClustering
        }
        if algorithm_name not in algorithm_map:
            raise ValueError(f"Unknown algorithm: {algorithm_name}")
        return algorithm_map[algorithm_name]

    def _get_algorithm_info(self, algorithm_name: str) -> Dict:
        """Look up an algorithm's description in the model-info config.

        Raises:
            ValueError: If the algorithm appears in no category.
        """
        for category in ['classification_algorithms', 'regression_algorithms', 'clustering_algorithms']:
            if algorithm_name in self.model_info.get(category, {}):
                return self.model_info[category][algorithm_name]
        raise ValueError(f"Algorithm {algorithm_name} not found in model info")

    def train_model(
        self,
        train_data: Dict,
        val_data: Dict,
        model_config: Dict,
        experiment_name: str
    ) -> Dict:
        """Train one model and log parameters, metrics and the model to MLflow.

        Args:
            train_data: Dict with 'features' and 'labels' for training.
            val_data: Dict with 'features' and 'labels' for validation.
            model_config: Dict with 'algorithm', 'task_type', 'dataset' and a
                'params' dict of estimator constructor arguments.
            experiment_name: MLflow experiment name.

        Returns:
            On success: {'status': 'success', 'run_id', 'metrics',
            'algorithm_info'}; on any failure: {'status': 'error', 'message'}.
        """
        try:
            # An MLflow experiment name cannot be reused while a soft-deleted
            # experiment holds it — fall back to a timestamped name instead.
            experiment = mlflow.get_experiment_by_name(experiment_name)
            if experiment and experiment.lifecycle_stage == 'deleted':
                timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
                new_experiment_name = f"{experiment_name}_{timestamp}"
                self.logger.info(f"Original experiment was deleted, creating new experiment: {new_experiment_name}")
                experiment_name = new_experiment_name
            mlflow.set_experiment(experiment_name)
            with mlflow.start_run() as run:
                # Basic run metadata.
                mlflow.log_param('algorithm', model_config['algorithm'])
                mlflow.log_param('task_type', model_config['task_type'])
                mlflow.log_param('dataset', model_config['dataset'])  # dataset path as provided
                # Hyper-parameters exactly as supplied by the caller.
                for param_name, param_value in model_config['params'].items():
                    mlflow.log_param(param_name, param_value)
                # Human-readable algorithm description from model.yaml.
                algorithm_info = self._get_algorithm_info(model_config['algorithm'])
                mlflow.log_param('principle', algorithm_info['principle'])
                mlflow.log_param('advantages', str(algorithm_info['advantages']))
                mlflow.log_param('disadvantages', str(algorithm_info['disadvantages']))
                # Work on a copy so the caller's dict is never mutated
                # (the previous version wrote 'init' back into
                # model_config['params'] for KMeansPlusPlus).
                params = dict(model_config['params'])
                if model_config['algorithm'] == 'KMeansPlusPlus':
                    # KMeans++ is plain KMeans with the 'k-means++' init scheme.
                    params['init'] = 'k-means++'
                model_class = self._get_model_class(model_config['algorithm'])
                model = model_class(**params)
                self.logger.info(f"Starting training {model_config['algorithm']}")
                model.fit(train_data['features'], train_data['labels'])
                # Evaluate on the validation split.
                val_predictions = model.predict(val_data['features'])
                metrics = self._calculate_metrics(
                    val_data['labels'],
                    val_predictions,
                    model_config['task_type']
                )
                for metric_name, metric_value in metrics.items():
                    mlflow.log_metric(metric_name, metric_value)
                # Persist the fitted model as an MLflow sklearn artifact.
                mlflow.sklearn.log_model(model, "model")
                self.logger.info(f"Training completed. Run ID: {run.info.run_id}")
                return {
                    'status': 'success',
                    'run_id': run.info.run_id,
                    'metrics': metrics,
                    'algorithm_info': algorithm_info
                }
        except Exception as e:
            # Top-level boundary: report failure as a structured result rather
            # than letting the exception escape to the caller.
            error_msg = f"Error training model: {str(e)}"
            self.logger.error(error_msg)
            return {
                'status': 'error',
                'message': error_msg
            }

    def _calculate_metrics(
        self,
        true_labels: np.ndarray,
        predictions: np.ndarray,
        task_type: str
    ) -> Dict:
        """Compute evaluation metrics appropriate for the task type.

        Returns an empty dict for unrecognized task types.
        """
        metrics = {}
        if task_type == 'classification':
            metrics['accuracy'] = accuracy_score(true_labels, predictions)
            metrics['precision'] = precision_score(true_labels, predictions, average='weighted')
            metrics['recall'] = recall_score(true_labels, predictions, average='weighted')
            metrics['f1'] = f1_score(true_labels, predictions, average='weighted')
            if len(np.unique(true_labels)) == 2:  # binary problem only
                # NOTE(review): AUC is computed from hard class predictions,
                # not probability scores — this understates AUC; consider
                # predict_proba where the estimator supports it.
                metrics['roc_auc'] = roc_auc_score(true_labels, predictions)
        elif task_type == 'regression':
            metrics['mae'] = mean_absolute_error(true_labels, predictions)
            metrics['mse'] = mean_squared_error(true_labels, predictions)
            metrics['rmse'] = np.sqrt(metrics['mse'])
            metrics['r2'] = r2_score(true_labels, predictions)
            metrics['explained_variance'] = explained_variance_score(true_labels, predictions)
        elif task_type == 'clustering':
            metrics['adjusted_rand'] = adjusted_rand_score(true_labels, predictions)
            metrics['homogeneity'] = homogeneity_score(true_labels, predictions)
            metrics['completeness'] = completeness_score(true_labels, predictions)
            if len(np.unique(predictions)) > 1:  # silhouette needs >= 2 clusters
                # NOTE(review): silhouette_score expects the feature matrix as
                # its first argument; here the 1-D ground-truth labels are
                # passed reshaped to (n, 1) instead — TODO confirm and pass
                # the validation features if that was the intent.
                metrics['silhouette'] = silhouette_score(
                    true_labels.reshape(-1, 1),
                    predictions
                )
        return metrics