# File info (from the code viewer): 41 lines, 1.3 KiB, Python
from data_process.data_processor import DataProcessor
from data_process.data_cleaner import DataCleaner
from data_process.feature_engineer import FeatureEngineer
from data_process.data_splitter import DataSplitter

# Data processing example: load the raw breast-cancer CSV, clean it,
# scale and select features, split it into train/val/test, and save
# each split to its own CSV file.

# Configuration shared by every processor instance created below.
# NOTE(review): 'csv_params' is presumably forwarded to the CSV reader
# used by DataCleaner.load_data -- confirm against that class.
config = {
    'csv_params': {
        'encoding': 'utf-8',
        'na_values': ['', 'NULL', 'null'],
    },
}

# Create the processor instances.
cleaner = DataCleaner(config)
engineer = FeatureEngineer(config)
splitter = DataSplitter(config)

# Load the raw data.
df = cleaner.load_data('./dataset/dataset_raw/breast_cancer.csv')

# Data cleaning: each step takes the frame and its result is rebound to
# `df`, so the steps are applied in sequence.
df = cleaner.handle_missing_values(df, method='mean')
df = cleaner.remove_duplicates(df)
df = cleaner.detect_outliers(df, method='zscore')

# Feature engineering.
df = engineer.scale_features(df, method='standard')
# NOTE: datetime feature creation is currently broken (it does not
# generate the expected output), so the step stays disabled:
# df = engineer.create_datetime_features(df, 'date_column')
df = engineer.select_features(df, target='target', method='f_classif', k=10)

# Split the dataset into train / validation / test parts.
train, val, test = splitter.train_val_test_split(df, target='target')

# Save the processed splits.
cleaner.save_data(train, './dataset/dataset_processed/breast_cancer_train.csv')
cleaner.save_data(val, './dataset/dataset_processed/breast_cancer_val.csv')
cleaner.save_data(test, './dataset/dataset_processed/breast_cancer_test.csv')