# MLPlatform/function_old/data_process/data_cleaner.py
#
# 66 lines
# 2.7 KiB
# Python

from .data_processor import DataProcessor
import pandas as pd
import numpy as np
from typing import Dict, List
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from scipy import stats
class DataCleaner(DataProcessor):
    """Data-cleaning helpers: imputation, de-duplication and outlier removal.

    Every public method takes a DataFrame and returns a new DataFrame without
    modifying its input.  Failures are reported through ``self.logger``
    (expected to be provided by ``DataProcessor``) and then re-raised.
    """

    def __init__(self, config: Dict = None):
        """Initialise the base processor and register the imputers.

        Args:
            config: Optional configuration mapping, forwarded unchanged to
                ``DataProcessor.__init__``.
        """
        super().__init__(config)
        # Public method name -> pre-built imputer.  'mode' maps to sklearn's
        # 'most_frequent'; 'constant' relies on SimpleImputer's default
        # fill_value because none is passed here.
        self.missing_value_methods = {
            'mean': SimpleImputer(strategy='mean'),
            'median': SimpleImputer(strategy='median'),
            'mode': SimpleImputer(strategy='most_frequent'),
            'constant': SimpleImputer(strategy='constant'),
        }

    def handle_missing_values(
        self,
        df: pd.DataFrame,
        method: str = 'mean',
        columns: List[str] = None,
    ) -> pd.DataFrame:
        """Impute missing values in the selected columns.

        Args:
            df: Input frame.  It is not modified.
            method: Key of ``self.missing_value_methods`` ('mean', 'median',
                'mode' or 'constant').
            columns: Columns to impute.  Defaults to every numeric column.

        Returns:
            A copy of ``df`` with the selected columns imputed.  For every
            method except 'constant', columns that contain no observed value
            are left untouched because there is nothing to derive a fill
            value from.

        Raises:
            ValueError: If ``method`` is not a registered strategy.
            Exception: Any pandas / scikit-learn error is logged and
                re-raised unchanged.
        """
        try:
            if method not in self.missing_value_methods:
                raise ValueError(f"Unsupported method: {method}")

            if columns is None:
                columns = df.select_dtypes(include=[np.number]).columns
            columns = list(columns)

            # Work on a copy so the caller's frame is never mutated, matching
            # the other cleaning steps which also return new frames.
            result = df.copy()

            targets = columns
            if columns and method != 'constant':
                # SimpleImputer drops columns that are entirely missing when
                # using a statistical strategy, which would make the
                # assignment below fail with a shape mismatch.
                has_values = result[columns].notna().any(axis=0)
                targets = [
                    col for col, ok in zip(columns, has_values) if ok
                ]
                if len(targets) < len(columns):
                    skipped = [
                        col for col, ok in zip(columns, has_values) if not ok
                    ]
                    self.logger.warning(
                        f"Skipped columns with no observed values: {skipped}"
                    )

            # With nothing to impute (e.g. no numeric columns) the imputer is
            # not called at all; it rejects zero-feature input.
            if targets:
                imputer = self.missing_value_methods[method]
                result[targets] = imputer.fit_transform(result[targets])

            self.logger.info(
                f"Successfully handled missing values using {method} method"
            )
            return result
        except Exception as e:
            self.logger.error(f"Error handling missing values: {str(e)}")
            raise

    def remove_duplicates(
        self,
        df: pd.DataFrame,
        subset: List[str] = None,
    ) -> pd.DataFrame:
        """Drop duplicate rows and log how many were removed.

        Args:
            df: Input frame.  It is not modified.
            subset: Columns used to identify duplicates; ``None`` compares
                all columns.

        Returns:
            A new DataFrame without the duplicate rows.

        Raises:
            Exception: Any pandas error is logged and re-raised unchanged.
        """
        try:
            deduplicated = df.drop_duplicates(subset=subset)
            removed = len(df) - len(deduplicated)
            self.logger.info(f"Removed {removed} duplicate rows")
            return deduplicated
        except Exception as e:
            self.logger.error(f"Error removing duplicates: {str(e)}")
            raise

    def detect_outliers(
        self,
        df: pd.DataFrame,
        method: str = 'zscore',
        threshold: float = 3,
        *,
        contamination: float = 0.1,
    ) -> pd.DataFrame:
        """Detect outlier rows on the numeric columns and return ``df`` without them.

        Despite the name, the return value is the filtered frame, not a mask.

        Args:
            df: Input frame.  It is not modified.
            method: 'zscore' flags a row when any numeric column has an
                absolute z-score above ``threshold``; 'isolation_forest'
                flags the rows an ``IsolationForest`` predicts as -1.
            threshold: Absolute z-score cut-off.  Only used by 'zscore'.
            contamination: Expected outlier fraction passed to
                ``IsolationForest``.  Only used by 'isolation_forest'.

        Returns:
            The rows of ``df`` that were not flagged as outliers.

        Raises:
            ValueError: If ``method`` is not supported.
            Exception: Any pandas / scipy / scikit-learn error is logged and
                re-raised unchanged.
        """
        try:
            if method not in ('zscore', 'isolation_forest'):
                # Without this check the mask below would never be bound and
                # the caller would get an opaque UnboundLocalError.
                raise ValueError(f"Unsupported method: {method}")

            numeric = df.select_dtypes(include=[np.number])
            if numeric.empty:
                # No rows or no numeric columns: nothing can be scored.
                self.logger.info(f"Detected 0 outliers using {method} method")
                return df.copy()

            if method == 'zscore':
                values = numeric.to_numpy(dtype=float, na_value=np.nan)
                # 'omit' keeps one NaN from turning a whole column's z-scores
                # into NaN, which would silently disable detection for it.
                # Missing cells themselves yield NaN and are never flagged.
                z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
                outliers = (z_scores > threshold).any(axis=1)
            else:
                iso_forest = IsolationForest(
                    contamination=contamination, random_state=42
                )
                outliers = iso_forest.fit_predict(numeric) == -1

            self.logger.info(
                f"Detected {int(outliers.sum())} outliers using {method} method"
            )
            return df[~outliers]
        except Exception as e:
            self.logger.error(f"Error detecting outliers: {str(e)}")
            raise