"""Data-processing example: clean, transform, split and save a CSV dataset.

The script loads the raw breast-cancer CSV, runs the cleaning and
feature-engineering helpers from ``data_process`` over it, splits the result
into train/validation/test parts and writes each part to its own CSV file.
"""

from data_process.data_processor import DataProcessor
from data_process.data_cleaner import DataCleaner
from data_process.feature_engineer import FeatureEngineer
from data_process.data_splitter import DataSplitter

# Input and output locations used by this example.
RAW_CSV = './dataset/dataset_raw/breast_cancer.csv'
TRAIN_CSV = './dataset/dataset_processed/breast_cancer_train.csv'
VAL_CSV = './dataset/dataset_processed/breast_cancer_val.csv'
TEST_CSV = './dataset/dataset_processed/breast_cancer_test.csv'

# Configuration shared by every helper below.
csv_params = {
    'encoding': 'utf-8',
    'na_values': ['', 'NULL', 'null'],
}
config = {'csv_params': csv_params}

# Create one instance of each processing helper.
cleaner = DataCleaner(config)
engineer = FeatureEngineer(config)
splitter = DataSplitter(config)

# Load the raw data.
df = cleaner.load_data(RAW_CSV)

# Cleaning: fill missing values, drop duplicates, then handle outliers.
df = cleaner.handle_missing_values(df, method='mean')
df = cleaner.remove_duplicates(df)
df = cleaner.detect_outliers(df, method='zscore')

# Feature engineering.
# NOTE(review): scaling and feature selection run on the full frame before it
# is split; if these helpers fit statistics on the data they receive, test-set
# information leaks into training -- confirm against FeatureEngineer.
df = engineer.scale_features(df, method='standard')
# TODO: datetime feature creation is disabled because it currently does not
# produce the expected output:
# df = engineer.create_datetime_features(df, 'date_column')
df = engineer.select_features(
    df,
    target='target',
    method='f_classif',
    k=10,
)

# Split into train / validation / test parts.
train, val, test = splitter.train_val_test_split(df, target='target')

# Save each processed part to its own file (train, then val, then test).
for split_frame, output_path in (
    (train, TRAIN_CSV),
    (val, VAL_CSV),
    (test, TEST_CSV),
):
    cleaner.save_data(split_frame, output_path)