MLPlatform/function_old/data_process/data_processor.py

63 lines
2.1 KiB
Python

import pandas as pd
import numpy as np
from typing import Dict, List, Union, Optional
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import logging
import hashlib
from datetime import datetime
import json
import os
class DataProcessor:
    """Base class for loading and saving tabular data as pandas DataFrames.

    The on-disk format is chosen from the file extension: CSV, Parquet or
    HDF5 (``.hdf5``, ``.h5``, ``.hdf``).
    """

    # Extensions (lower-case, without the dot) routed to the HDF reader/writer.
    _HDF_TYPES = ('hdf5', 'h5', 'hdf')

    def __init__(self, config: Optional[Dict] = None):
        """Create a processor.

        Args:
            config: Optional settings dict. ``load_data`` reads the
                ``'csv_params'`` entry (a dict of keyword arguments that is
                forwarded to ``pandas.read_csv``). ``None`` or an empty dict
                means "no settings".
        """
        self.config = config or {}
        self.logger = logging.getLogger(__name__)
        self._setup_logging()

    def _setup_logging(self):
        """Configure root logging at INFO level with a timestamped format.

        ``logging.basicConfig`` does nothing when the root logger already has
        handlers, so an application that configured logging earlier keeps its
        own setup.
        """
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )

    @staticmethod
    def _file_type(file_path) -> str:
        """Return the lower-cased extension of *file_path* without the dot.

        Only the final path component is inspected, so a dot inside a
        directory name is never mistaken for an extension. Returns ``''``
        when the file name contains no dot.
        """
        file_name = os.path.basename(os.fspath(file_path))
        _, dot, suffix = file_name.rpartition('.')
        return suffix.lower() if dot else ''

    def load_data(self, file_path: str) -> pd.DataFrame:
        """Load a DataFrame from *file_path*.

        Args:
            file_path: Path to a ``.csv``, ``.parquet`` or HDF5 file.

        Returns:
            The loaded DataFrame.

        Raises:
            ValueError: If the file extension is not supported.
            Exception: Any error raised by the pandas reader is logged and
                re-raised unchanged.
        """
        try:
            file_type = self._file_type(file_path)
            if file_type == 'csv':
                df = pd.read_csv(file_path, **self.config.get('csv_params', {}))
            elif file_type == 'parquet':
                df = pd.read_parquet(file_path)
            elif file_type in self._HDF_TYPES:
                df = pd.read_hdf(file_path)
            else:
                raise ValueError(f"Unsupported file type: {file_type}")
            self.logger.info("Successfully loaded data from %s", file_path)
            return df
        except Exception as e:
            self.logger.error("Error loading data: %s", e)
            raise

    def save_data(self, df: pd.DataFrame, file_path: str):
        """Write *df* to *file_path* in the format implied by its extension.

        CSV output omits the index; HDF5 output is stored under key ``'data'``.

        Args:
            df: DataFrame to persist.
            file_path: Destination path ending in ``.csv``, ``.parquet`` or
                an HDF5 extension.

        Raises:
            ValueError: If the file extension is not supported. Nothing is
                written in that case.
            Exception: Any error raised by the pandas writer is logged and
                re-raised unchanged.
        """
        try:
            file_type = self._file_type(file_path)
            if file_type == 'csv':
                df.to_csv(file_path, index=False)
            elif file_type == 'parquet':
                df.to_parquet(file_path)
            elif file_type in self._HDF_TYPES:
                df.to_hdf(file_path, key='data')
            else:
                # Fail loudly instead of logging success for a no-op,
                # mirroring load_data's handling of unknown extensions.
                raise ValueError(f"Unsupported file type: {file_type}")
            self.logger.info("Successfully saved data to %s", file_path)
        except Exception as e:
            self.logger.error("Error saving data: %s", e)
            raise