完成--完成获取已处理好的数据集方法

2025-02-18 10:07:31 +08:00 · 2025-02-18 10:07:31 +08:00 · 837ccc386a
commit 837ccc386a
parent d0ad80fadd
4 changed files with 113 additions and 22 deletions
--- a/doc/接口文档code.md
+++ b/doc/接口文档code.md
@@ -188,28 +188,85 @@ Response:
 {
    "status": "success",
    "datasets": [
-        {
-            "id": "dataset_001",
-            "name": "breast_cancer",
-            "created_at": "2023-08-20T10:00:00",
-            "files": {
-                "raw": "dataset/dataset_raw/breast_cancer.csv",
-                "processed": {
-                    "train": "dataset/dataset_processed/breast_cancer_train.csv",
-                    "val": "dataset/dataset_processed/breast_cancer_val.csv",
-                    "test": "dataset/dataset_processed/breast_cancer_test.csv"
-                }
-            },
-            "process_history": [
-                {
-                    "step": "missing_value_handler",
-                    "params": {"strategy": "mean"}
-                },
-                {
-                    "step": "standard_scaler",
-                    "params": {"with_mean": true}
-                }
+        "input_file": "dataset/dataset_raw/breast_cancer.csv",
+        "timestamp": "2025-02-18T09:48:48.983863",
+        "process_methods": [
+          {
+            "method_name": "SimpleImputer",
+            "params": {
+              "strategy": "mean",
+              "missing_values": NaN
+            }
+          },
+          {
+            "method_name": "IsolationForest",
+            "params": {
+              "contamination": 0.1,
+              "random_state": 42
+            }
+          },
+          {
+            "method_name": "StandardScaler",
+            "params": {
+              "with_mean": true,
+              "with_std": true
+            }
+          }
+        ],
+        "feature_methods": [],
+        "split_params": {
+          "test_size": 0.2,
+          "val_size": 0.1
+        },
+        "steps": [
+          {
+            "step": "load_data",
+            "shape": [
+              569,
+              31
            ]
+          },
+          {
+            "step": "cleaning",
+            "method": "SimpleImputer",
+            "params": {
+              "strategy": "mean",
+              "missing_values": NaN
+            },
+            "shape": [
+              569,
+              31
+            ]
+          },
+          {
+            "step": "cleaning",
+            "method": "IsolationForest",
+            "params": {
+              "contamination": 0.1,
+              "random_state": 42
+            },
+            "shape": [
+              512,
+              31
+            ]
+          },
+          {
+            "step": "cleaning",
+            "method": "StandardScaler",
+            "params": {
+              "with_mean": true,
+              "with_std": true
+            },
+            "shape": [
+              512,
+              31
+            ]
+          }
+        ],
+        "output_files": {
+          "train": "dataset/dataset_processed/breast_cancer_20250218_094848/train_breast_cancer_20250218_094848.csv",
+          "validation": "dataset/dataset_processed/breast_cancer_20250218_094848/val_breast_cancer_20250218_094848.csv",
+          "test": "dataset/dataset_processed/breast_cancer_20250218_094848/test_breast_cancer_20250218_094848.csv"
        }
    ]
 }
--- a/example_get_all_dataset.py
+++ b/example_get_all_dataset.py
@@ -0,0 +1,5 @@
+from function.get_all_dataset import DatasetHistory
+
+
+# Usage example: build a DatasetHistory and print the list returned by get_dataset().
+t = DatasetHistory()
+print(t.get_dataset())
--- a/function/__pycache__/get_all_dataset.cpython-39.pyc
+++ b/function/__pycache__/get_all_dataset.cpython-39.pyc
--- a/function/get_all_dataset.py
+++ b/function/get_all_dataset.py
@@ -1,4 +1,33 @@
+import os
 import json 
+from pathlib import Path

 # 查询可用数据集, 处理过后的数据集默认在 dataset/dataset_processed/ 下
-# 各个文件夹下的json文件记录了处理数据
+# 各个文件夹下的json文件记录了处理数据
+
+# Reads back the JSON record files found under the processed-dataset directory.
+class DatasetHistory:
+    # Stores the root directory that is scanned by get_dataset().
+    def __init__(self) -> None:
+        # Relative path, so it is resolved against the current working directory.
+        self.dataset_processed_path = 'dataset/dataset_processed'
+
+    # Load every *.json file located directly inside each entry of the
+    # processed-dataset directory and return the parsed contents.
+    #
+    # Returns:
+    #     A list with one element per JSON file (whatever json.load produced
+    #     for that file). The list is empty when no JSON file is found.
+    #
+    # Notes:
+    #     - There is no error handling here: exceptions raised by os.listdir
+    #       (e.g. the root directory is missing), open, or json.load
+    #       (malformed file) propagate to the caller.
+    #     - os.listdir and Path.glob do not guarantee any ordering, so the
+    #       order of the returned list is not guaranteed either.
+    def get_dataset(self):
+        back = list()
+
+        dataset_files_path = os.listdir(self.dataset_processed_path)
+
+        for dataset_file in dataset_files_path:
+            path = os.path.join(self.dataset_processed_path, dataset_file)
+            # Folder path to inspect
+            folder_path = Path(path)
+
+            # Collect all files directly under the folder whose names end with .json
+            # (non-recursive pattern; sub-folders are not searched)
+            json_files = list(folder_path.glob('*.json'))
+
+            for json_file in json_files:
+                with open(json_file.as_posix(), 'r', encoding='utf-8') as f:
+                    json_data = json.load(f)
+
+                back.append(json_data)
+        return back
+
+
+