完成--完成获取已处理好的数据集方法

This commit is contained in:
haotian 2025-02-18 10:07:31 +08:00
parent d0ad80fadd
commit 837ccc386a
4 changed files with 113 additions and 22 deletions

View File

@ -188,28 +188,85 @@ Response:
{
"status": "success",
"datasets": [
{
"id": "dataset_001",
"name": "breast_cancer",
"created_at": "2023-08-20T10:00:00",
"files": {
"raw": "dataset/dataset_raw/breast_cancer.csv",
"processed": {
"train": "dataset/dataset_processed/breast_cancer_train.csv",
"val": "dataset/dataset_processed/breast_cancer_val.csv",
"test": "dataset/dataset_processed/breast_cancer_test.csv"
}
},
"process_history": [
{
"step": "missing_value_handler",
"params": {"strategy": "mean"}
},
{
"step": "standard_scaler",
"params": {"with_mean": true}
}
"input_file": "dataset/dataset_raw/breast_cancer.csv",
"timestamp": "2025-02-18T09:48:48.983863",
"process_methods": [
{
"method_name": "SimpleImputer",
"params": {
"strategy": "mean",
"missing_values": NaN
}
},
{
"method_name": "IsolationForest",
"params": {
"contamination": 0.1,
"random_state": 42
}
},
{
"method_name": "StandardScaler",
"params": {
"with_mean": true,
"with_std": true
}
}
],
"feature_methods": [],
"split_params": {
"test_size": 0.2,
"val_size": 0.1
},
"steps": [
{
"step": "load_data",
"shape": [
569,
31
]
},
{
"step": "cleaning",
"method": "SimpleImputer",
"params": {
"strategy": "mean",
"missing_values": NaN
},
"shape": [
569,
31
]
},
{
"step": "cleaning",
"method": "IsolationForest",
"params": {
"contamination": 0.1,
"random_state": 42
},
"shape": [
512,
31
]
},
{
"step": "cleaning",
"method": "StandardScaler",
"params": {
"with_mean": true,
"with_std": true
},
"shape": [
512,
31
]
}
],
"output_files": {
"train": "dataset/dataset_processed/breast_cancer_20250218_094848/train_breast_cancer_20250218_094848.csv",
"validation": "dataset/dataset_processed/breast_cancer_20250218_094848/val_breast_cancer_20250218_094848.csv",
"test": "dataset/dataset_processed/breast_cancer_20250218_094848/test_breast_cancer_20250218_094848.csv"
}
]
}

View File

@ -0,0 +1,5 @@
from function.get_all_dataset import DatasetHistory
# Manual smoke check: load every processed-dataset record and print the result.
t = DatasetHistory()
records = t.get_dataset()
print(records)

Binary file not shown.

View File

@ -1,4 +1,33 @@
import os
import json
from pathlib import Path
# List the available datasets. Processed datasets are stored under dataset/dataset_processed/ by default.
# The JSON file inside each dataset folder records how that dataset was processed.
class DatasetHistory:
    """Read the processing records saved next to the processed datasets.

    Every sub-folder of ``dataset_processed_path`` is scanned for ``*.json``
    files; each of those files is parsed and returned by :meth:`get_dataset`.
    """

    def __init__(self) -> None:
        # Relative path, so it is resolved against the current working directory.
        self.dataset_processed_path = 'dataset/dataset_processed'

    def get_dataset(self):
        """Return the parsed content of every record file of every dataset.

        Returns:
            list: One entry per ``*.json`` file located directly inside a
            sub-folder of ``self.dataset_processed_path``, ordered by folder
            name and then by file name. An empty list is returned when the
            root folder does not exist (nothing has been processed yet).

        Raises:
            json.JSONDecodeError: If a record file does not contain valid JSON.
        """
        root = Path(self.dataset_processed_path)
        # A missing root simply means there are no processed datasets so far.
        if not root.is_dir():
            return []

        records = []
        # Sort so the result does not depend on the file system's listing order.
        for folder in sorted(root.iterdir()):
            # Only dataset folders hold records; stray files in the root are ignored.
            if not folder.is_dir():
                continue
            for json_file in sorted(folder.glob('*.json')):
                with open(json_file, 'r', encoding='utf-8') as f:
                    records.append(json.load(f))
        return records