63 lines
2.1 KiB
Python
63 lines
2.1 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
from typing import Dict, List, Union, Optional
|
|
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
|
|
from sklearn.impute import SimpleImputer
|
|
from sklearn.model_selection import train_test_split
|
|
import logging
|
|
import hashlib
|
|
from datetime import datetime
|
|
import json
|
|
import os
|
|
|
|
class DataProcessor:
    """Base class for loading and saving tabular data as pandas DataFrames.

    The on-disk format is selected from the file extension. The recognised
    extensions are ``csv``, ``parquet`` and ``hdf5`` (case-insensitive).
    """

    def __init__(self, config: Optional[Dict] = None):
        """Store the configuration and initialise logging.

        Args:
            config: Optional settings dictionary. The only key read by this
                class is ``csv_params``, a dict of keyword arguments that is
                forwarded to ``pandas.read_csv``. A falsy value (``None`` or
                an empty dict) results in an empty configuration.
        """
        self.config = config or {}
        self.logger = logging.getLogger(__name__)
        self._setup_logging()

    def _setup_logging(self):
        """Configure root logging at INFO level with a timestamped format."""
        # basicConfig does nothing when the root logger already has handlers,
        # so calling it once per instance is harmless.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )

    @staticmethod
    def _file_type(file_path: str) -> str:
        """Return the lower-cased extension of ``file_path`` without the dot.

        An empty string is returned when the final path component has no
        extension.
        """
        # splitext only inspects the final path component, so a dot inside a
        # directory name (e.g. "runs.v1/data") is not mistaken for an
        # extension separator.
        return os.path.splitext(file_path)[1][1:].lower()

    def load_data(self, file_path: str) -> pd.DataFrame:
        """Load a DataFrame from ``file_path``.

        Args:
            file_path: Path to a ``.csv``, ``.parquet`` or ``.hdf5`` file.
                CSV files are read with the ``csv_params`` entry of the
                configuration passed as keyword arguments.

        Returns:
            The loaded DataFrame.

        Raises:
            ValueError: If the file extension is not supported.
            Exception: Any error raised by the pandas reader is logged and
                re-raised unchanged.
        """
        try:
            file_type = self._file_type(file_path)
            if file_type == 'csv':
                df = pd.read_csv(file_path, **self.config.get('csv_params', {}))
            elif file_type == 'parquet':
                df = pd.read_parquet(file_path)
            elif file_type == 'hdf5':
                df = pd.read_hdf(file_path)
            else:
                raise ValueError(f"Unsupported file type: {file_type}")

            self.logger.info("Successfully loaded data from %s", file_path)
            return df

        except Exception as e:
            # Log for diagnostics, then let the caller handle the failure.
            self.logger.error("Error loading data: %s", e)
            raise

    def save_data(self, df: pd.DataFrame, file_path: str):
        """Save ``df`` to ``file_path``.

        CSV output omits the index. HDF5 output is stored under the key
        ``'data'``.

        Args:
            df: The DataFrame to write.
            file_path: Destination path ending in ``.csv``, ``.parquet`` or
                ``.hdf5``.

        Raises:
            ValueError: If the file extension is not supported. Nothing is
                written in that case.
            Exception: Any error raised by the pandas writer is logged and
                re-raised unchanged.
        """
        try:
            file_type = self._file_type(file_path)
            if file_type == 'csv':
                df.to_csv(file_path, index=False)
            elif file_type == 'parquet':
                df.to_parquet(file_path)
            elif file_type == 'hdf5':
                df.to_hdf(file_path, key='data')
            else:
                # Mirror load_data: an unknown extension is an error rather
                # than a silent no-op that is then reported as a success.
                raise ValueError(f"Unsupported file type: {file_type}")

            self.logger.info("Successfully saved data to %s", file_path)

        except Exception as e:
            # Log for diagnostics, then let the caller handle the failure.
            self.logger.error("Error saving data: %s", e)
            raise