from typing import Dict, List, Optional

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

from .data_processor import DataProcessor


class FeatureEngineer(DataProcessor):
    """Feature-engineering helpers: scaling, selection and datetime expansion.

    NOTE(review): every method logs through ``self.logger``, which is not
    defined in this class -- presumably it is provided by ``DataProcessor``;
    confirm against the base class.
    """

    def __init__(self, config: Optional[Dict] = None):
        """Initialise the base processor and the table of available scalers.

        Args:
            config: Optional configuration mapping, forwarded unchanged to
                ``DataProcessor.__init__``.
        """
        super().__init__(config)
        # One scaler instance per supported method name. The instances are
        # reused across calls; ``scale_features`` refits them every time.
        self.scalers = {
            'standard': StandardScaler(),
            'minmax': MinMaxScaler(),
            'robust': RobustScaler(),
        }

    def scale_features(
        self,
        df: pd.DataFrame,
        method: str = 'standard',
        columns: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """Scale the chosen columns of ``df`` with the named scaler.

        The scaled values are written back into ``df`` itself (the input frame
        is modified in place) and the same frame is returned.

        Args:
            df: Frame holding the columns to scale.
            method: Key into ``self.scalers``: ``'standard'``, ``'minmax'`` or
                ``'robust'``.
            columns: Columns to scale. When ``None``, every numeric column of
                ``df`` is used.

        Returns:
            ``df`` with the selected columns replaced by their scaled values.
            If there are no columns to scale, ``df`` is returned unchanged.

        Raises:
            ValueError: If ``method`` is not a key of ``self.scalers``.
            Exception: Any error raised while scaling is logged and re-raised.
        """
        try:
            if columns is None:
                columns = df.select_dtypes(include=[np.number]).columns

            if method not in self.scalers:
                raise ValueError(f"Unsupported scaling method: {method}")

            # Fitting a scaler on a zero-column selection is rejected by
            # scikit-learn, so a frame with nothing to scale is passed through.
            if len(columns) == 0:
                self.logger.info("No columns to scale; returning data unchanged")
                return df

            scaler = self.scalers[method]
            df[columns] = scaler.fit_transform(df[columns])

            self.logger.info(f"Successfully scaled features using {method} method")
            return df
        except Exception as e:
            self.logger.error(f"Error scaling features: {str(e)}")
            raise

    def select_features(
        self,
        df: pd.DataFrame,
        target: str,
        method: str = 'chi2',
        k: int = 10,
    ) -> pd.DataFrame:
        """Keep the ``k`` best-scoring feature columns plus the target column.

        Args:
            df: Frame containing the feature columns and the target column.
            target: Name of the target column; it is excluded from scoring and
                appended as the last column of the result.
            method: Scoring function, ``'chi2'`` or ``'f_classif'``.
            k: Number of features to keep. Values larger than the number of
                available feature columns are clamped to that number.

        Returns:
            A frame made of the selected feature columns followed by
            ``target``.

        Raises:
            ValueError: If ``method`` is not supported.
            Exception: Any error raised during selection is logged and
                re-raised.
        """
        try:
            X = df.drop(columns=[target])
            y = df[target]

            score_functions = {
                # chi2 requires non-negative feature values.
                'chi2': chi2,
                'f_classif': f_classif,
            }
            if method not in score_functions:
                raise ValueError(f"Unsupported feature selection method: {method}")

            # SelectKBest does not handle k > n_features uniformly across
            # scikit-learn versions (error vs. warning), so clamp it here.
            effective_k = k
            if isinstance(k, int) and k > X.shape[1]:
                effective_k = X.shape[1]
                self.logger.info(
                    f"Requested k={k} exceeds {X.shape[1]} available features; "
                    f"using k={effective_k}"
                )

            selector = SelectKBest(score_functions[method], k=effective_k)
            # Only the support mask is needed, so fit without transforming.
            selector.fit(X, y)
            selected_features = X.columns[selector.get_support()].tolist()

            self.logger.info(f"Selected {len(selected_features)} features")
            return df[selected_features + [target]]
        except Exception as e:
            self.logger.error(f"Error selecting features: {str(e)}")
            raise

    def create_datetime_features(
        self,
        df: pd.DataFrame,
        datetime_column: str,
    ) -> pd.DataFrame:
        """Expand a datetime column into calendar-component columns.

        ``datetime_column`` is converted with ``pd.to_datetime`` and the
        columns ``<name>_year``, ``<name>_month``, ``<name>_day``,
        ``<name>_weekday`` and ``<name>_is_weekend`` are added. All changes
        are made on ``df`` itself (the input frame is modified in place).

        Args:
            df: Frame containing ``datetime_column``.
            datetime_column: Name of the column to parse and expand.

        Returns:
            ``df`` with the converted column and the five new columns.

        Raises:
            Exception: Any error raised during conversion is logged and
                re-raised.
        """
        try:
            df[datetime_column] = pd.to_datetime(df[datetime_column])
            parts = df[datetime_column].dt

            df[f'{datetime_column}_year'] = parts.year
            df[f'{datetime_column}_month'] = parts.month
            df[f'{datetime_column}_day'] = parts.day
            df[f'{datetime_column}_weekday'] = parts.weekday
            # pandas numbers weekdays from Monday=0, so 5 and 6 are
            # Saturday and Sunday.
            df[f'{datetime_column}_is_weekend'] = parts.weekday.isin([5, 6])

            self.logger.info(f"Created datetime features from {datetime_column}")
            return df
        except Exception as e:
            self.logger.error(f"Error creating datetime features: {str(e)}")
            raise