# CostPrediction/src/data_preparation.py

from sklearn.preprocessing import StandardScaler
from datetime import datetime
import os
import joblib
import pandas as pd
import numpy as np
from src.feature_analysis import FeatureAnalysis
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score, LeaveOneOut
import json
import logging
from src.database.db_connection import get_db_connection
from sklearn.metrics import mean_absolute_error, mean_squared_error
class DataPreparation:
    """Builds scaled training and validation datasets for equipment cost models."""

    def __init__(self):
        """Create the shared feature analyzer and one scaler each for features and targets."""
        # Separate StandardScaler instances so feature scaling and target
        # scaling can be fitted and inverted independently.
        self.target_scaler = StandardScaler()
        self.feature_scaler = StandardScaler()
        self.feature_analyzer = FeatureAnalysis()
def prepare_training_data(self, equipment_data, equipment_type):
"""
准备训练数据
"""
try:
logging.info(f"Preparing training data for {equipment_type}")
logging.info(f"Raw data size: {len(equipment_data)}")
# 如果输入已经是 numpy 数组,直接返回
if isinstance(equipment_data, np.ndarray):
X = equipment_data
logging.info(f"Input is already numpy array with shape: {X.shape}")
# 处理无效值
X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
return {
'X': X,
'feature_names': self.feature_analyzer.get_equipment_specific_features(equipment_type),
'feature_scaler': self.feature_scaler,
'target_scaler': self.target_scaler
}
# 从原始数据中提取特征和目标值
feature_names = self.feature_analyzer.get_equipment_specific_features(equipment_type)
features = []
targets = []
for item in equipment_data:
# 提取特征值
feature_values = []
for name in feature_names:
value = item.get(name)
try:
feature_values.append(float(value) if value is not None else 0.0)
except (ValueError, TypeError):
feature_values.append(0.0)
features.append(feature_values)
# 提取目标值(成本)
try:
cost = float(item['actual_cost'])
if cost > 0: # 只使用正数成本值
targets.append(cost)
else:
logging.warning(f"Skipping non-positive cost value: {cost}")
except (ValueError, TypeError, KeyError):
logging.error(f"Invalid cost value: {item.get('actual_cost')}")
continue
# 转换为numpy数组
X = np.array(features, dtype=float)
y = np.array(targets, dtype=float)
# 记录原始数据范围
logging.info(f"Original X range: min={X.min()}, max={X.max()}")
logging.info(f"Original y range: min={y.min()}, max={y.max()}")
# 处理无效值
X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
# 标准化特征和目标值
X_scaled = self.feature_scaler.fit_transform(X)
y_scaled = self.target_scaler.fit_transform(y.reshape(-1, 1)).ravel()
# 记录标准化后的数据范围
logging.info(f"Scaled X range: min={X_scaled.min()}, max={X_scaled.max()}")
logging.info(f"Scaled y range: min={y_scaled.min()}, max={y_scaled.max()}")
return {
'X': X_scaled,
'y': y_scaled,
'feature_names': feature_names,
'feature_scaler': self.feature_scaler,
'target_scaler': self.target_scaler
}
except Exception as e:
logging.error(f"Error in data preparation: {str(e)}")
raise Exception(f"Training error: {str(e)}")
def prepare_validation_data(self, validation_data, equipment_type, feature_names=None, scalers=None):
"""
准备验证数据
"""
try:
logging.info(f"Preparing validation data for {equipment_type}")
logging.info(f"Raw validation data size: {len(validation_data)}")
# 如果输入已经是 numpy 数组,直接使用
if isinstance(validation_data, np.ndarray):
X = validation_data
logging.info(f"Input is already numpy array with shape: {X.shape}")
# 处理无效值
X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
# 使用训练数据的标准化器
if scalers and 'feature_scaler' in scalers:
X_scaled = scalers['feature_scaler'].transform(X)
else:
# 如果没有提供标准化器,直接返回处理后的数组
X_scaled = X
logging.info(f"Preprocessed data shape: {X_scaled.shape}")
logging.info(f"Validation features shape: {X_scaled.shape}")
logging.info(f"Validation features type: {X_scaled.dtype}")
return {
'X': X_scaled,
'y': None # 验证数据可能没有标签
}
# 否则,从原始数据中提取特征
if not feature_names:
feature_names = self.feature_analyzer.get_equipment_specific_features(equipment_type)
# 提取特征和目标值
features = []
targets = []
for item in validation_data:
# 提取特征值
feature_values = []
for name in feature_names:
value = item.get(name)
try:
feature_values.append(float(value) if value is not None else 0.0)
except (ValueError, TypeError):
feature_values.append(0.0) # 使用0替代NaN
features.append(feature_values)
# 提取目标值(成本)
try:
targets.append(float(item['actual_cost']))
except (ValueError, TypeError):
logging.error(f"Invalid cost value: {item.get('actual_cost')}")
continue
# 转换为numpy数组
X = np.array(features, dtype=float)
y = np.array(targets, dtype=float)
# 处理无效值
X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
# 使用训练数据的标准化器
if scalers and 'feature_scaler' in scalers:
X_scaled = scalers['feature_scaler'].transform(X)
else:
# 如果没有提供标准化器,直接返回处理后的数组
X_scaled = X
logging.info(f"Preprocessed data shape: {X_scaled.shape}")
logging.info(f"Validation features shape: {X_scaled.shape}")
logging.info(f"Validation features type: {X_scaled.dtype}")
return {
'X': X_scaled,
'y': y # 返回原始成本值
}
except Exception as e:
logging.error(f"Error in validation data preparation: {str(e)}")
logging.error(f"Feature names: {feature_names}")
logging.error(f"Equipment type: {equipment_type}")
raise Exception(f"Validation error: {str(e)}")
def calculate_derived_features(self, data, equipment_type):
"""
计算衍生特征
"""
try:
return self.feature_analyzer.calculate_derived_features(data, equipment_type)
except Exception as e:
logging.error(f"Error calculating derived features: {str(e)}")
raise Exception(f"Feature calculation error: {str(e)}")