diff --git a/doc/接口文档code.md b/doc/接口文档code.md index 0da36b1..bdc4920 100644 --- a/doc/接口文档code.md +++ b/doc/接口文档code.md @@ -188,28 +188,85 @@ Response: { "status": "success", "datasets": [ - { - "id": "dataset_001", - "name": "breast_cancer", - "created_at": "2023-08-20T10:00:00", - "files": { - "raw": "dataset/dataset_raw/breast_cancer.csv", - "processed": { - "train": "dataset/dataset_processed/breast_cancer_train.csv", - "val": "dataset/dataset_processed/breast_cancer_val.csv", - "test": "dataset/dataset_processed/breast_cancer_test.csv" - } - }, - "process_history": [ - { - "step": "missing_value_handler", - "params": {"strategy": "mean"} - }, - { - "step": "standard_scaler", - "params": {"with_mean": true} - } + "input_file": "dataset/dataset_raw/breast_cancer.csv", + "timestamp": "2025-02-18T09:48:48.983863", + "process_methods": [ + { + "method_name": "SimpleImputer", + "params": { + "strategy": "mean", + "missing_values": NaN + } + }, + { + "method_name": "IsolationForest", + "params": { + "contamination": 0.1, + "random_state": 42 + } + }, + { + "method_name": "StandardScaler", + "params": { + "with_mean": true, + "with_std": true + } + } + ], + "feature_methods": [], + "split_params": { + "test_size": 0.2, + "val_size": 0.1 + }, + "steps": [ + { + "step": "load_data", + "shape": [ + 569, + 31 ] + }, + { + "step": "cleaning", + "method": "SimpleImputer", + "params": { + "strategy": "mean", + "missing_values": NaN + }, + "shape": [ + 569, + 31 + ] + }, + { + "step": "cleaning", + "method": "IsolationForest", + "params": { + "contamination": 0.1, + "random_state": 42 + }, + "shape": [ + 512, + 31 + ] + }, + { + "step": "cleaning", + "method": "StandardScaler", + "params": { + "with_mean": true, + "with_std": true + }, + "shape": [ + 512, + 31 + ] + } + ], + "output_files": { + "train": "dataset/dataset_processed/breast_cancer_20250218_094848/train_breast_cancer_20250218_094848.csv", + "validation": 
import json
import os
from pathlib import Path

# Query the available datasets. Processed datasets live under
# dataset/dataset_processed/ by default; the JSON file(s) inside each
# dataset folder record how that dataset was processed.


class DatasetHistory:
    """Collect the processing-history records of every processed dataset.

    Each immediate sub-folder of ``dataset_processed_path`` is treated as one
    processed dataset, and every ``*.json`` file directly inside such a folder
    is treated as one history record.
    """

    DEFAULT_PROCESSED_PATH = 'dataset/dataset_processed'

    def __init__(self, dataset_processed_path=DEFAULT_PROCESSED_PATH) -> None:
        """Remember where the processed datasets are stored.

        Args:
            dataset_processed_path: Directory (``str`` or path-like) that
                holds one sub-folder per processed dataset. A relative path
                is resolved against the current working directory when
                ``get_dataset`` is called. Defaults to
                ``'dataset/dataset_processed'``.
        """
        self.dataset_processed_path = dataset_processed_path

    def get_dataset(self) -> list:
        """Return the parsed content of every history JSON file.

        Returns:
            A list with one entry per ``*.json`` file, holding whatever
            ``json.load`` produced for that file. Entries are ordered by
            dataset folder and then by file name. The list is empty when the
            root directory does not exist or contains no history files.

        Raises:
            json.JSONDecodeError: If a history file is not valid JSON.
            OSError: If a history file cannot be read.
        """
        root = Path(self.dataset_processed_path)
        # Nothing has been processed yet: report "no datasets" rather than
        # failing with FileNotFoundError.
        if not root.is_dir():
            return []

        records = []
        # Only sub-folders can hold history files, so stray files in the root
        # are skipped. Sorting makes the result independent of the order in
        # which the file system happens to list entries.
        dataset_dirs = sorted(entry for entry in root.iterdir() if entry.is_dir())
        for dataset_dir in dataset_dirs:
            for json_file in sorted(dataset_dir.glob('*.json')):
                with json_file.open('r', encoding='utf-8') as f:
                    records.append(json.load(f))
        return records