from .data_processor import DataProcessor
import pandas as pd
import numpy as np
from typing import Dict, List, Optional
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from scipy import stats


class DataCleaner(DataProcessor):
    """Data-cleaning helpers: imputation, de-duplication and outlier removal.

    Every public method logs its outcome through ``self.logger`` and
    re-raises any exception after logging it.
    NOTE(review): ``self.logger`` is not defined in this class; it is
    presumably provided by ``DataProcessor`` -- confirm.
    """

    def __init__(self, config: Optional[Dict] = None):
        """Initialise the base processor and register the imputers.

        Args:
            config: Optional configuration mapping, forwarded unchanged to
                ``DataProcessor.__init__``.
        """
        super().__init__(config)
        # Maps the public ``method`` names accepted by
        # ``handle_missing_values`` to scikit-learn imputers. The instances
        # are shared between calls and re-fitted on every call.
        self.missing_value_methods = {
            'mean': SimpleImputer(strategy='mean'),
            'median': SimpleImputer(strategy='median'),
            'mode': SimpleImputer(strategy='most_frequent'),
            'constant': SimpleImputer(strategy='constant'),
        }

    def handle_missing_values(self, df: pd.DataFrame, method: str = 'mean',
                              columns: Optional[List[str]] = None) -> pd.DataFrame:
        """Fill missing values in the selected columns.

        The imputed values are written back into ``df`` itself, so the
        caller's frame is modified in place; the same object is returned.

        Args:
            df: Frame to impute.
            method: One of the keys of ``self.missing_value_methods``
                (``'mean'``, ``'median'``, ``'mode'``, ``'constant'``).
            columns: Columns to impute. Defaults to every numeric column.

        Returns:
            ``df`` with the selected columns imputed.

        Raises:
            ValueError: If ``method`` is not a registered imputation method.
        """
        try:
            if columns is None:
                columns = df.select_dtypes(include=[np.number]).columns

            if method not in self.missing_value_methods:
                raise ValueError(f"Unsupported method: {method}")

            # With nothing selected there is nothing to impute; skip the
            # imputer rather than hand it an empty selection.
            if len(columns) == 0:
                self.logger.info("No columns selected for imputation; data left unchanged")
                return df

            imputer = self.missing_value_methods[method]
            df[columns] = imputer.fit_transform(df[columns])

            self.logger.info(f"Successfully handled missing values using {method} method")
            return df
        except Exception as e:
            self.logger.error(f"Error handling missing values: {str(e)}")
            raise

    def remove_duplicates(self, df: pd.DataFrame,
                          subset: Optional[List[str]] = None) -> pd.DataFrame:
        """Drop duplicate rows.

        Args:
            df: Frame to de-duplicate. It is not modified.
            subset: Columns that define a duplicate; ``None`` compares all
                columns.

        Returns:
            A new frame without the duplicate rows.
        """
        try:
            rows_before = df.shape[0]
            df = df.drop_duplicates(subset=subset)
            self.logger.info(f"Removed {rows_before - df.shape[0]} duplicate rows")
            return df
        except Exception as e:
            self.logger.error(f"Error removing duplicates: {str(e)}")
            raise

    def detect_outliers(self, df: pd.DataFrame, method: str = 'zscore',
                        threshold: float = 3,
                        contamination: float = 0.1) -> pd.DataFrame:
        """Detect outlier rows and return the frame without them.

        Only numeric columns take part in the detection; the returned frame
        keeps every column of ``df``.

        Args:
            df: Frame to filter. It is not modified.
            method: ``'zscore'`` flags a row when the absolute z-score of
                any numeric column exceeds ``threshold``;
                ``'isolation_forest'`` flags the rows that an
                ``IsolationForest`` labels ``-1``.
            threshold: Absolute z-score cut-off (``'zscore'`` only).
            contamination: Value passed as ``contamination`` to
                ``IsolationForest`` (``'isolation_forest'`` only).

        Returns:
            The rows of ``df`` that were not flagged as outliers.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        try:
            numeric = df.select_dtypes(include=[np.number])

            if method == 'zscore':
                z_scores = np.abs(stats.zscore(numeric))
                # A NaN z-score never compares greater than the threshold,
                # so such cells do not flag their row.
                outliers = (z_scores > threshold).any(axis=1)
            elif method == 'isolation_forest':
                # Fixed seed keeps the detection reproducible between runs.
                iso_forest = IsolationForest(contamination=contamination, random_state=42)
                outliers = iso_forest.fit_predict(numeric) == -1
            else:
                # Fail with a clear message instead of an unbound-variable
                # error further down.
                raise ValueError(f"Unsupported method: {method}")

            self.logger.info(f"Detected {np.count_nonzero(outliers)} outliers using {method} method")
            return df[~outliers]
        except Exception as e:
            self.logger.error(f"Error detecting outliers: {str(e)}")
            raise