# Code-viewer listing metadata for the original file: 66 lines, 2.7 KiB, Python
from typing import Dict, List, Optional

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer

from .data_processor import DataProcessor


class DataCleaner(DataProcessor):
    """Data cleaning helpers: imputation, de-duplication and outlier removal.

    Every method logs through ``self.logger``, which is not defined in this
    class (presumably provided by ``DataProcessor`` -- confirm).
    """

    def __init__(self, config: Optional[Dict] = None):
        """Initialise the base processor and register the available imputers.

        Args:
            config: Optional configuration mapping, forwarded unchanged to
                ``DataProcessor.__init__``.
        """
        super().__init__(config)
        # Public method name -> scikit-learn imputer implementing it.
        self.missing_value_methods = {
            'mean': SimpleImputer(strategy='mean'),
            'median': SimpleImputer(strategy='median'),
            'mode': SimpleImputer(strategy='most_frequent'),
            'constant': SimpleImputer(strategy='constant'),
        }

    def handle_missing_values(self, df: pd.DataFrame, method: str = 'mean',
                              columns: Optional[List[str]] = None) -> pd.DataFrame:
        """Fill missing values in the selected columns.

        The frame is modified in place and also returned.

        Args:
            df: Frame whose missing values should be imputed.
            method: Key of ``self.missing_value_methods`` ('mean', 'median',
                'mode' or 'constant').
            columns: Columns to impute. Defaults to every numeric column.

        Returns:
            ``df`` itself, with the selected columns imputed.

        Raises:
            ValueError: If ``method`` is not a registered imputation method.
        """
        try:
            if method not in self.missing_value_methods:
                raise ValueError(f"Unsupported method: {method}")
            imputer = self.missing_value_methods[method]

            if columns is None:
                columns = df.select_dtypes(include=[np.number]).columns
            targets = list(columns)

            if imputer.strategy != 'constant':
                # For statistic-based strategies SimpleImputer discards columns
                # that contain no observed value, so the transformed array
                # would have fewer columns than the assignment target. Such
                # columns cannot be imputed anyway, so leave them untouched.
                targets = [
                    name for name in targets
                    if df[name].notna().to_numpy().any()
                ]

            if not targets or df.empty:
                # Nothing to fit on; the imputer would reject an empty input.
                self.logger.info("No imputable columns found; data left unchanged")
                return df

            df[targets] = imputer.fit_transform(df[targets])

            self.logger.info(f"Successfully handled missing values using {method} method")
            return df

        except Exception as e:
            self.logger.error(f"Error handling missing values: {str(e)}")
            raise

    def remove_duplicates(self, df: pd.DataFrame,
                          subset: Optional[List[str]] = None) -> pd.DataFrame:
        """Drop duplicate rows and log how many were removed.

        Args:
            df: Frame to de-duplicate; it is not modified.
            subset: Columns used to identify duplicates. ``None`` compares
                all columns.

        Returns:
            A new frame without the duplicate rows.
        """
        try:
            rows_before = len(df)
            deduplicated = df.drop_duplicates(subset=subset)
            self.logger.info(f"Removed {rows_before - len(deduplicated)} duplicate rows")
            return deduplicated
        except Exception as e:
            self.logger.error(f"Error removing duplicates: {str(e)}")
            raise

    def detect_outliers(self, df: pd.DataFrame, method: str = 'zscore',
                        threshold: float = 3,
                        contamination: float = 0.1) -> pd.DataFrame:
        """Detect outlier rows on the numeric columns and return the rest.

        Despite the name, the detected rows are removed from the result.

        Args:
            df: Frame to screen; it is not modified.
            method: 'zscore' flags a row when any numeric column has an
                absolute z-score above ``threshold``; 'isolation_forest'
                flags the rows an ``IsolationForest`` predicts as -1.
            threshold: Absolute z-score limit. Only used by 'zscore'.
            contamination: Expected outlier share passed to
                ``IsolationForest``. Only used by 'isolation_forest'.

        Returns:
            A new frame containing only the rows not flagged as outliers.

        Raises:
            ValueError: If ``method`` is not 'zscore' or 'isolation_forest'.
        """
        try:
            if method not in ('zscore', 'isolation_forest'):
                # Previously this fell through and failed on an unbound local.
                raise ValueError(f"Unsupported method: {method}")

            numeric = df.select_dtypes(include=[np.number])
            if numeric.empty:
                # No numeric columns or no rows: nothing can be flagged, and
                # IsolationForest would reject the empty input.
                self.logger.info(f"Detected 0 outliers using {method} method")
                return df.copy()

            if method == 'zscore':
                values = numeric.to_numpy(dtype=float, na_value=np.nan)
                # 'omit' keeps one NaN from turning the whole column's scores
                # into NaN, which would silently disable detection there.
                z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
                # NaN scores compare False, so missing cells never flag a row.
                outlier_mask = (z_scores > threshold).any(axis=1)
            else:
                forest = IsolationForest(contamination=contamination, random_state=42)
                outlier_mask = forest.fit_predict(numeric) == -1

            self.logger.info(f"Detected {int(outlier_mask.sum())} outliers using {method} method")
            return df[~outlier_mask]

        except Exception as e:
            self.logger.error(f"Error detecting outliers: {str(e)}")
            raise