diff --git a/doc/典型示例新版.md b/doc/典型示例新版.md index b512d97..6725480 100644 --- a/doc/典型示例新版.md +++ b/doc/典型示例新版.md @@ -316,7 +316,8 @@ TEP数据集共包含53个特征列: 1. **完整性检查**:确认数据集无缺失值 2. **异常值处理**:使用IsolationForest方法识别并移除异常值 3. **数据标准化**:采用StandardScaler方法将特征标准化到[-1, 1]区间 -4. **数据集划分**:按8:2比例划分训练集和测试集 +4. **特征提取**: 使用PCA将特征维度从53降低到20 +5. **数据集划分**:按8:2比例划分训练集和测试集 **预处理配置**: ```json @@ -327,7 +328,7 @@ TEP数据集共包含53个特征列: { "method_name": "IsolationForest", "params": { - "n_estimators": 100, + "n_estimators": 200, "max_samples": "auto" } }, @@ -339,7 +340,14 @@ TEP数据集共包含53个特征列: } } ], - "feature_methods": [], + "feature_methods": [ + { + "method_name":"PCA", + "params":{ + "n_components": 20 + } + } + ], "split_params": { "train_size": 0.8, "val_size": 0.2 diff --git a/doc/接口文档code.md b/doc/接口文档code.md index 2b9a739..f2e319e 100644 --- a/doc/接口文档code.md +++ b/doc/接口文档code.md @@ -151,7 +151,14 @@ Request: } } ], - "feature_methods": [], + "feature_methods": [ + { + "method_name":"PCA", + "params":{ + "n_components": 20 + } + } + ], "split_params": { "train_size": 0.8, "val_size": 0.2 diff --git a/function/__pycache__/data_manager.cpython-39.pyc b/function/__pycache__/data_manager.cpython-39.pyc index c18e45b..c1e1baf 100644 Binary files a/function/__pycache__/data_manager.cpython-39.pyc and b/function/__pycache__/data_manager.cpython-39.pyc differ diff --git a/function/data_manager.py b/function/data_manager.py index 6be6719..daf5611 100644 --- a/function/data_manager.py +++ b/function/data_manager.py @@ -336,7 +336,10 @@ class DataManager: try: method_name = method['method_name'] params = method.get('params', {}) - columns = method.get('columns', df.drop('target', axis=1).columns) # 排除target列 + + df_columns = df.columns[:-1] + + columns = method.get('columns', df_columns) # 排除target列 if method_name not in self.feature_engineering_methods: raise ValueError(f"Unknown feature engineering method: {method_name}") @@ -445,7 +448,7 @@ class DataManager: train_data, val_data = train_test_split( train_val_data, test_size=val_size_adjusted, - random_state=42 + random_state=random_state ) else: train_data = train_val_data