# Standard library
import json
import logging
import os
from datetime import datetime

# Third-party
import joblib
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Local
from src.database.db_connection import get_db_connection
from src.feature_analysis import FeatureAnalysis


class DataPreparation:
    """Cleans, aligns and standardizes equipment cost data for model training/validation."""

    def __init__(self):
        self.feature_analyzer = FeatureAnalysis()
        self.feature_scaler = StandardScaler()
        # Separate scaler for the target (cost) values so predictions can be
        # inverse-transformed back to real costs.
        self.target_scaler = StandardScaler()

    def _extract_feature_row(self, item, feature_names):
        """Return one record's feature values as floats; missing/unparseable -> 0.0."""
        row = []
        for name in feature_names:
            value = item.get(name)
            try:
                row.append(float(value) if value is not None else 0.0)
            except (ValueError, TypeError):
                row.append(0.0)  # substitute 0 for values that cannot be coerced
        return row

    def prepare_training_data(self, equipment_data, equipment_type):
        """
        Prepare (clean + standardize) training data.

        Args:
            equipment_data: either a pre-built 2D numpy feature matrix, or an
                iterable of dict-like records with feature columns plus an
                'actual_cost' target field.
            equipment_type: key used to select the equipment-specific feature set.

        Returns:
            dict with 'X' (scaled features), 'feature_names', 'feature_scaler',
            'target_scaler', and — when raw records were supplied — 'y'
            (scaled positive costs).

        Raises:
            Exception: any underlying failure, wrapped as "Training error: ...".
        """
        try:
            logging.info(f"Preparing training data for {equipment_type}")
            logging.info(f"Raw data size: {len(equipment_data)}")

            # Fast path: caller already assembled a numpy feature matrix.
            if isinstance(equipment_data, np.ndarray):
                X = equipment_data
                logging.info(f"Input is already numpy array with shape: {X.shape}")
                # Replace NaN/inf so downstream estimators do not choke.
                X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
                return {
                    'X': X,
                    'feature_names': self.feature_analyzer.get_equipment_specific_features(equipment_type),
                    'feature_scaler': self.feature_scaler,
                    'target_scaler': self.target_scaler
                }

            # Extract features and targets from raw records.
            feature_names = self.feature_analyzer.get_equipment_specific_features(equipment_type)
            features = []
            targets = []

            for item in equipment_data:
                # Validate the target FIRST so that a rejected record contributes
                # neither a feature row nor a target. (Appending features before
                # validating the cost would desynchronize X and y lengths.)
                try:
                    cost = float(item['actual_cost'])
                except (ValueError, TypeError, KeyError):
                    logging.error(f"Invalid cost value: {item.get('actual_cost')}")
                    continue
                if cost <= 0:  # only positive cost values are usable for training
                    logging.warning(f"Skipping non-positive cost value: {cost}")
                    continue

                features.append(self._extract_feature_row(item, feature_names))
                targets.append(cost)

            # Fail loudly with a clear message instead of a cryptic numpy error
            # on empty arrays below.
            if not targets:
                raise ValueError("No valid training records with a positive 'actual_cost'")

            X = np.array(features, dtype=float)
            y = np.array(targets, dtype=float)

            # Log raw data ranges for debugging.
            logging.info(f"Original X range: min={X.min()}, max={X.max()}")
            logging.info(f"Original y range: min={y.min()}, max={y.max()}")

            # Replace invalid values before fitting the scalers.
            X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

            # Fit the scalers on training data; validation must reuse these
            # fitted scalers (see prepare_validation_data).
            X_scaled = self.feature_scaler.fit_transform(X)
            y_scaled = self.target_scaler.fit_transform(y.reshape(-1, 1)).ravel()

            logging.info(f"Scaled X range: min={X_scaled.min()}, max={X_scaled.max()}")
            logging.info(f"Scaled y range: min={y_scaled.min()}, max={y_scaled.max()}")

            return {
                'X': X_scaled,
                'y': y_scaled,
                'feature_names': feature_names,
                'feature_scaler': self.feature_scaler,
                'target_scaler': self.target_scaler
            }

        except Exception as e:
            logging.error(f"Error in data preparation: {str(e)}")
            raise Exception(f"Training error: {str(e)}")

    def prepare_validation_data(self, validation_data, equipment_type, feature_names=None, scalers=None):
        """
        Prepare validation data using the scalers fitted on the training set.

        Args:
            validation_data: either a pre-built 2D numpy feature matrix, or an
                iterable of dict-like records with feature columns plus an
                'actual_cost' field.
            equipment_type: key used to select the feature set when
                feature_names is not supplied.
            feature_names: optional explicit feature list (should match training).
            scalers: optional dict containing the fitted 'feature_scaler';
                when absent, features are returned unscaled.

        Returns:
            dict with 'X' (features, scaled if a scaler was supplied) and 'y'
            (raw cost values, or None for pure numpy-array input).

        Raises:
            Exception: any underlying failure, wrapped as "Validation error: ...".
        """
        try:
            logging.info(f"Preparing validation data for {equipment_type}")
            logging.info(f"Raw validation data size: {len(validation_data)}")

            # Fast path: caller already assembled a numpy feature matrix.
            if isinstance(validation_data, np.ndarray):
                X = validation_data
                logging.info(f"Input is already numpy array with shape: {X.shape}")
                X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

                # Transform (never re-fit) with the training scaler when provided.
                if scalers and 'feature_scaler' in scalers:
                    X_scaled = scalers['feature_scaler'].transform(X)
                else:
                    X_scaled = X  # no scaler supplied: return cleaned raw features

                logging.info(f"Preprocessed data shape: {X_scaled.shape}")
                logging.info(f"Validation features shape: {X_scaled.shape}")
                logging.info(f"Validation features type: {X_scaled.dtype}")
                return {
                    'X': X_scaled,
                    'y': None  # a bare feature matrix carries no labels
                }

            # Otherwise extract features from raw records.
            if not feature_names:
                feature_names = self.feature_analyzer.get_equipment_specific_features(equipment_type)

            features = []
            targets = []
            for item in validation_data:
                # Validate the target first so a rejected record contributes
                # neither a feature row nor a target (keeps X and y aligned).
                # KeyError is handled here for parity with the training path,
                # so one malformed record no longer aborts the whole batch.
                try:
                    cost = float(item['actual_cost'])
                except (ValueError, TypeError, KeyError):
                    logging.error(f"Invalid cost value: {item.get('actual_cost')}")
                    continue

                features.append(self._extract_feature_row(item, feature_names))
                targets.append(cost)

            X = np.array(features, dtype=float)
            y = np.array(targets, dtype=float)

            X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

            # Transform (never re-fit) with the training scaler when provided.
            if scalers and 'feature_scaler' in scalers:
                X_scaled = scalers['feature_scaler'].transform(X)
            else:
                X_scaled = X  # no scaler supplied: return cleaned raw features

            logging.info(f"Preprocessed data shape: {X_scaled.shape}")
            logging.info(f"Validation features shape: {X_scaled.shape}")
            logging.info(f"Validation features type: {X_scaled.dtype}")

            return {
                'X': X_scaled,
                'y': y  # raw (unscaled) cost values for metric computation
            }

        except Exception as e:
            logging.error(f"Error in validation data preparation: {str(e)}")
            logging.error(f"Feature names: {feature_names}")
            logging.error(f"Equipment type: {equipment_type}")
            raise Exception(f"Validation error: {str(e)}")

    def calculate_derived_features(self, data, equipment_type):
        """
        Compute derived features by delegating to the feature analyzer.

        Raises:
            Exception: any underlying failure, wrapped as
                "Feature calculation error: ...".
        """
        try:
            return self.feature_analyzer.calculate_derived_features(data, equipment_type)
        except Exception as e:
            logging.error(f"Error calculating derived features: {str(e)}")
            raise Exception(f"Feature calculation error: {str(e)}")