import pandas as pd
import numpy as np
from typing import Dict, List, Union, Optional
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import logging
import hashlib
from datetime import datetime
import json
import os


class DataProcessor:
    """Base class for loading and saving tabular datasets.

    Dispatches on the file extension and supports CSV, Parquet, and
    HDF5 (``.hdf5`` / ``.h5``) files. Per-format reader options can be
    supplied through the ``config`` dict (currently ``csv_params``,
    forwarded to ``pd.read_csv``).
    """

    def __init__(self, config: Optional[Dict] = None):
        """Initialize the processor.

        Args:
            config: Optional settings dict. ``config['csv_params']`` is
                forwarded to ``pd.read_csv`` as keyword arguments.
        """
        self.config = config or {}
        self.logger = logging.getLogger(__name__)
        self._setup_logging()

    def _setup_logging(self):
        """Configure global logging with a timestamped format.

        NOTE(review): ``logging.basicConfig`` mutates global logging
        state. Repeated instantiation is harmless (subsequent calls are
        no-ops once a handler exists), but library code conventionally
        leaves this configuration to the application entry point.
        """
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )

    def load_data(self, file_path: str) -> pd.DataFrame:
        """Load a DataFrame from ``file_path`` based on its extension.

        Args:
            file_path: Path ending in ``.csv``, ``.parquet``, ``.hdf5``,
                or ``.h5``.

        Returns:
            The loaded DataFrame.

        Raises:
            ValueError: If the extension is not supported.
            Exception: Any reader error is logged and re-raised.
        """
        try:
            file_type = file_path.split('.')[-1].lower()
            if file_type == 'csv':
                df = pd.read_csv(file_path, **self.config.get('csv_params', {}))
            elif file_type == 'parquet':
                df = pd.read_parquet(file_path)
            elif file_type in ('hdf5', 'h5'):  # accept the common .h5 suffix too
                df = pd.read_hdf(file_path)
            else:
                raise ValueError(f"Unsupported file type: {file_type}")
            self.logger.info(f"Successfully loaded data from {file_path}")
            return df
        except Exception as e:
            self.logger.error(f"Error loading data: {str(e)}")
            raise

    def save_data(self, df: pd.DataFrame, file_path: str):
        """Save ``df`` to ``file_path`` based on its extension.

        Args:
            df: The DataFrame to persist.
            file_path: Destination path ending in ``.csv``, ``.parquet``,
                ``.hdf5``, or ``.h5``.

        Raises:
            ValueError: If the extension is not supported.
            Exception: Any writer error is logged and re-raised.
        """
        try:
            file_type = file_path.split('.')[-1].lower()
            if file_type == 'csv':
                df.to_csv(file_path, index=False)
            elif file_type == 'parquet':
                df.to_parquet(file_path)
            elif file_type in ('hdf5', 'h5'):
                df.to_hdf(file_path, key='data')
            else:
                # BUG FIX: previously an unsupported extension fell through
                # silently and logged "Successfully saved" without writing
                # anything; now raise, mirroring load_data's behavior.
                raise ValueError(f"Unsupported file type: {file_type}")
            self.logger.info(f"Successfully saved data to {file_path}")
        except Exception as e:
            self.logger.error(f"Error saving data: {str(e)}")
            raise