# File viewer metadata (extraction residue): 117 lines, 2.8 KiB, Python
import numpy as np

from function.data_processor_date import DataProcessor

# Create the processor instance; its process_dataset() method is called
# further down with the configuration defined in this script.
processor = DataProcessor()

# Data preprocessing steps, handed to processor.process_dataset() as
# `process_methods`. Each entry is a dict with:
#   'method_name' - name of the preprocessing method to apply
#   'params'      - keyword parameters for that method
# NOTE(review): the names look like scikit-learn estimators; how DataProcessor
# resolves them is not visible in this file - confirm against its source.
process_methods = [
    # Missing-value handling
    {
        'method_name': 'SimpleImputer',
        'params': {
            'strategy': 'mean',
            'missing_values': np.nan,
        },
    },
    # Outlier detection
    {
        'method_name': 'IsolationForest',
        'params': {
            'contamination': 0.1,
            'random_state': 42,
        },
    },
    # Standardization
    {
        'method_name': 'StandardScaler',
        'params': {
            'with_mean': True,
            'with_std': True,
        },
    },
]

# Feature-engineering steps. Each entry is a dict with:
#   'method_name' - name of the feature-engineering method to apply
#   'params'      - keyword parameters for that method
#   'columns'     - names of the columns the method is applied to
# NOTE(review): the names look like scikit-learn transformers; how
# DataProcessor resolves them (including the string 'f_classif') is not
# visible in this file - confirm against its source.
feature_methods = [
    # Categorical feature encoding
    {
        'method_name': 'OneHotEncoder',
        'params': {
            # NOTE(review): if this is forwarded verbatim to scikit-learn's
            # OneHotEncoder, recent releases expect 'sparse_output' instead
            # of 'sparse' - verify against the installed version.
            'sparse': False,
            'handle_unknown': 'ignore',
        },
        # Names of the columns to process
        'columns': ['categorical_feature1', 'categorical_feature2'],
    },
    # Numeric feature discretization
    {
        'method_name': 'KBinsDiscretizer',
        'params': {
            'n_bins': 5,
            'encode': 'onehot',
            'strategy': 'uniform',
        },
        # Names of the columns to process
        'columns': ['numeric_feature1', 'numeric_feature2'],
    },
    # Feature selection
    {
        'method_name': 'SelectKBest',
        'params': {
            'k': 10,
            'score_func': 'f_classif',
        },
        # Names of the columns to process
        'columns': ['feature1', 'feature2', 'feature3', 'feature4'],
    },
    # Dimensionality reduction
    {
        'method_name': 'PCA',
        'params': {
            'n_components': 2,
            'random_state': 42,
        },
        # Names of the columns to process. Columns cannot currently be
        # selected by positional index, so these must match the column
        # names in the input file exactly.
        'columns': ['feature5', 'feature6', 'feature7'],
    },
]

# Dataset split parameters, handed to processor.process_dataset() as
# `split_params`.
# NOTE(review): presumably fractions of the dataset reserved for the test
# and validation sets - confirm how DataProcessor interprets them.
split_params = {
    'test_size': 0.2,
    'val_size': 0.1,
}

# Process the dataset. The returned dict is read below via its 'status' key
# and, depending on the outcome, 'process_record' or 'message'.
result = processor.process_dataset(
    input_path='dataset/dataset_raw/breast_cancer.csv',
    output_dir='dataset/dataset_processed',
    process_methods=process_methods,
    # Feature engineering is switched off for this run: an empty list is
    # passed instead of the `feature_methods` configuration defined above.
    feature_methods=[],
    split_params=split_params,
)

# Print the processing result.
print("\n数据处理结果:")
print(f"状态: {result['status']}")
if result['status'] == 'success':
    # Success: show the processing record returned by the processor.
    print("\n处理记录:")
    record = result['process_record']
    print(f"输入文件: {record['input_file']}")
    print(f"处理时间: {record['timestamp']}")

    print("\n输出文件:")
    for key, path in record['output_files'].items():
        print(f"{key}: {path}")

    print("\n处理步骤:")
    for step in record['steps']:
        # Only steps that recorded a data shape are listed; others are skipped.
        if 'shape' in step:
            print(f"{step['step']}: 数据形状 {step['shape']}")
else:
    # Any other status: show the message supplied in the result.
    print(f"错误信息: {result['message']}")