77 lines
3.1 KiB
Python
77 lines
3.1 KiB
Python
from .data_processor import DataProcessor
|
|
import pandas as pd
|
|
import numpy as np
|
|
from typing import Dict, List
|
|
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
|
|
from sklearn.feature_selection import SelectKBest, chi2, f_classif
|
|
|
|
class FeatureEngineer(DataProcessor):
    """Feature engineering helpers: scaling, selection and datetime expansion.

    Each public method logs through ``self.logger`` (presumably supplied by
    ``DataProcessor`` -- TODO confirm) and re-raises any exception after
    logging it, so callers still see the original error.
    """

    def __init__(self, config: Dict = None):
        """Initialise the base processor and create one scaler per method name.

        Args:
            config: Optional configuration mapping, forwarded unchanged to
                ``DataProcessor.__init__``.
        """
        super().__init__(config)
        # Keys are the values accepted by ``scale_features(method=...)``.
        self.scalers = {
            'standard': StandardScaler(),
            'minmax': MinMaxScaler(),
            'robust': RobustScaler(),
        }

    def scale_features(self, df: pd.DataFrame, method: str = 'standard', columns: List[str] = None) -> pd.DataFrame:
        """Scale the chosen columns of ``df`` with one of the configured scalers.

        The scaled values are written back into ``df`` itself, and that same
        object is returned. The scaler stored under ``method`` is re-fitted on
        every call.

        Args:
            df: Frame whose columns are scaled (modified in place).
            method: Key into ``self.scalers``: 'standard', 'minmax' or 'robust'.
            columns: Columns to scale. When ``None``, every numeric column
                (``np.number`` dtypes) is used.

        Returns:
            ``df`` with the selected columns replaced by their scaled values.

        Raises:
            ValueError: If ``method`` is not a key of ``self.scalers``.
        """
        try:
            if columns is None:
                columns = df.select_dtypes(include=[np.number]).columns

            if method not in self.scalers:
                raise ValueError(f"Unsupported scaling method: {method}")

            # Fitting a scaler on a zero-column selection is rejected by
            # scikit-learn; with nothing to scale the frame is already the
            # correct result, so hand it back untouched.
            if len(columns) == 0:
                self.logger.info("No columns to scale; returning DataFrame unchanged")
                return df

            scaler = self.scalers[method]
            df[columns] = scaler.fit_transform(df[columns])

            self.logger.info(f"Successfully scaled features using {method} method")
            return df

        except Exception as e:
            self.logger.error(f"Error scaling features: {e}")
            raise

    def select_features(self, df: pd.DataFrame, target: str, method: str = 'chi2', k: int = 10) -> pd.DataFrame:
        """Keep the ``k`` best-scoring feature columns plus the target column.

        Args:
            df: Frame containing the feature columns and the ``target`` column.
            target: Name of the label column; it is excluded from scoring and
                always kept in the result.
            method: Scoring function to use: 'chi2' or 'f_classif'.
            k: Number of features to keep. A value larger than the number of
                candidate features is reduced to that number.

        Returns:
            A selection of ``df`` holding the chosen feature columns followed
            by ``target``.

        Raises:
            ValueError: If ``method`` is neither 'chi2' nor 'f_classif'.
        """
        try:
            X = df.drop(columns=[target])
            y = df[target]

            if method == 'chi2':
                # chi2 requires non-negative feature values.
                score_func = chi2
            elif method == 'f_classif':
                score_func = f_classif
            else:
                raise ValueError(f"Unsupported feature selection method: {method}")

            # The default k=10 would otherwise fail on narrow frames with
            # scikit-learn releases that reject k > n_features; asking for
            # more features than exist simply means "keep them all".
            n_features = X.shape[1]
            if isinstance(k, int) and k > n_features:
                self.logger.info(
                    f"Requested k={k} exceeds {n_features} available features; keeping all of them"
                )
                k = n_features

            selector = SelectKBest(score_func, k=k)
            # Only the support mask is needed, so fit without transforming.
            selector.fit(X, y)
            selected_features = X.columns[selector.get_support()].tolist()

            self.logger.info(f"Selected {len(selected_features)} features")
            return df[selected_features + [target]]

        except Exception as e:
            self.logger.error(f"Error selecting features: {e}")
            raise

    def create_datetime_features(self, df: pd.DataFrame, datetime_column: str) -> pd.DataFrame:
        """Expand a datetime column into calendar feature columns.

        ``datetime_column`` is converted with ``pd.to_datetime`` and five new
        columns named ``<datetime_column>_<suffix>`` are added: ``year``,
        ``month``, ``day``, ``weekday`` and ``is_weekend`` (True when weekday
        is 5 or 6). All changes are written into ``df`` itself.

        Args:
            df: Frame holding ``datetime_column`` (modified in place).
            datetime_column: Name of the column to parse and expand.

        Returns:
            ``df`` with the converted column and the added feature columns.
        """
        try:
            df[datetime_column] = pd.to_datetime(df[datetime_column])
            df[f'{datetime_column}_year'] = df[datetime_column].dt.year
            df[f'{datetime_column}_month'] = df[datetime_column].dt.month
            df[f'{datetime_column}_day'] = df[datetime_column].dt.day
            df[f'{datetime_column}_weekday'] = df[datetime_column].dt.weekday
            df[f'{datetime_column}_is_weekend'] = df[datetime_column].dt.weekday.isin([5, 6])

            self.logger.info(f"Created datetime features from {datetime_column}")
            return df

        except Exception as e:
            self.logger.error(f"Error creating datetime features: {e}")
            raise