MLPlatform/function_old/data_process/feature_engineer.py

77 lines
3.1 KiB
Python

from typing import Dict, List, Optional

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from .data_processor import DataProcessor
class FeatureEngineer(DataProcessor):
    """Feature-engineering helpers: scaling, selection and datetime expansion.

    The three public methods log through ``self.logger`` (not defined in this
    class; presumably set up by ``DataProcessor`` -- confirm) and re-raise any
    exception after logging it.
    """

    def __init__(self, config: Optional[Dict] = None):
        """Initialise the base processor and the registry of scalers.

        Args:
            config: Optional configuration mapping, forwarded unchanged to
                ``DataProcessor.__init__``.
        """
        super().__init__(config)
        # One scaler instance per supported method name. The instances are
        # reused across calls, so each one holds the parameters of its most
        # recent fit.
        self.scalers = {
            'standard': StandardScaler(),
            'minmax': MinMaxScaler(),
            'robust': RobustScaler(),
        }

    def scale_features(
        self,
        df: pd.DataFrame,
        method: str = 'standard',
        columns: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """Scale the chosen columns of ``df`` with one of the registered scalers.

        Args:
            df: Frame to scale. It is modified in place and also returned.
            method: Key into ``self.scalers`` (``'standard'``, ``'minmax'`` or
                ``'robust'``).
            columns: Columns to scale. When ``None``, every numeric column is
                scaled -- including a numeric target column if one is present.

        Returns:
            The same ``df`` object, with the chosen columns replaced by their
            scaled values. If there is nothing to scale, ``df`` is returned
            untouched.

        Raises:
            ValueError: If ``method`` is not a key of ``self.scalers``.
        """
        try:
            if columns is None:
                columns = df.select_dtypes(include=[np.number]).columns
            if method not in self.scalers:
                raise ValueError(f"Unsupported scaling method: {method}")
            # An empty selection would make the scaler fail on a zero-feature
            # array; treat it as a no-op instead. ``len`` is used because a
            # pandas Index has no unambiguous truth value.
            if len(columns) == 0:
                self.logger.warning("No columns to scale; returning data unchanged")
                return df
            scaler = self.scalers[method]
            df[columns] = scaler.fit_transform(df[columns])
            self.logger.info(f"Successfully scaled features using {method} method")
            return df
        except Exception as e:
            self.logger.error(f"Error scaling features: {str(e)}")
            raise

    def select_features(
        self,
        df: pd.DataFrame,
        target: str,
        method: str = 'chi2',
        k: int = 10,
    ) -> pd.DataFrame:
        """Keep the ``k`` best-scoring feature columns plus the target column.

        Args:
            df: Frame holding the feature columns and the ``target`` column.
            target: Name of the label column; it is excluded from scoring.
            method: Scoring function, ``'chi2'`` or ``'f_classif'``.
            k: Number of features to keep. Values larger than the number of
                feature columns are reduced to that number.

        Returns:
            A new frame with the selected feature columns (in their original
            order) followed by ``target``.

        Raises:
            ValueError: If ``method`` is not supported.
        """
        score_functions = {
            # chi2 requires every feature value to be non-negative.
            'chi2': chi2,
            'f_classif': f_classif,
        }
        try:
            X = df.drop(columns=[target])
            y = df[target]
            if method not in score_functions:
                raise ValueError(f"Unsupported feature selection method: {method}")
            # Clamp k so the default of 10 also works on frames with fewer
            # feature columns; SelectKBest rejects (or, depending on the
            # scikit-learn version, warns about) k > n_features. The
            # isinstance check leaves a non-integer k such as 'all' alone.
            n_features = X.shape[1]
            if isinstance(k, int) and k > n_features:
                self.logger.warning(
                    f"k={k} exceeds the {n_features} available features; "
                    f"using k={n_features}"
                )
                k = n_features
            selector = SelectKBest(score_functions[method], k=k)
            # Only the support mask is needed, so fit without transforming.
            selector.fit(X, y)
            selected_features = X.columns[selector.get_support()].tolist()
            self.logger.info(f"Selected {len(selected_features)} features")
            return df[selected_features + [target]]
        except Exception as e:
            self.logger.error(f"Error selecting features: {str(e)}")
            raise

    def create_datetime_features(self, df: pd.DataFrame, datetime_column: str) -> pd.DataFrame:
        """Expand a datetime column into year/month/day/weekday features.

        Args:
            df: Frame to extend. It is modified in place and also returned.
            datetime_column: Column to parse with ``pd.to_datetime``; the
                parsed values overwrite the original column.

        Returns:
            The same ``df`` object with five added columns named
            ``<datetime_column>_year``, ``_month``, ``_day``, ``_weekday`` and
            ``_is_weekend``.
        """
        try:
            df[datetime_column] = pd.to_datetime(df[datetime_column])
            parts = df[datetime_column].dt
            df[f'{datetime_column}_year'] = parts.year
            df[f'{datetime_column}_month'] = parts.month
            df[f'{datetime_column}_day'] = parts.day
            df[f'{datetime_column}_weekday'] = parts.weekday
            # pandas numbers weekdays from Monday=0, so 5 and 6 are the weekend.
            df[f'{datetime_column}_is_weekend'] = parts.weekday.isin([5, 6])
            self.logger.info(f"Created datetime features from {datetime_column}")
            return df
        except Exception as e:
            self.logger.error(f"Error creating datetime features: {str(e)}")
            raise