完成--完成读取csv文件功能

This commit is contained in:
haotian 2025-02-25 10:39:33 +08:00
parent 4d660f9b8c
commit 1245847dd0
117 changed files with 553 additions and 1 deletions

View File

@ -14,6 +14,13 @@ class ProcessRequest(BaseModel):
feature_methods: List[Dict]
split_params: Dict[str, float]
class CSVRequest(BaseModel):
    """Request body for the CSV-preview endpoint (``POST /data/csv``)."""

    # Path to the CSV file to read (e.g. "dataset/dataset_raw/data.csv").
    data_path: str
    # Number of leading rows to return; values <= 0 skip the head preview.
    head: int = 5
    # Number of trailing rows to return; values <= 0 skip the tail preview.
    tail: int = 5
    # Include basic dataset info (rows, columns, dtypes, memory, missing values).
    info: bool = True
    # Include per-column summary statistics.
    describe: bool = True
@router.get("/preprocessing/methods")
async def get_preprocessing_methods():
"""获取数据预处理方法列表"""
@ -77,3 +84,17 @@ async def get_datasets():
status_code=500,
detail=f"获取数据集列表失败: {str(e)}"
)
@router.post("/csv")
async def read_csv(request: CSVRequest):
    """Read a CSV file and return a preview of its contents.

    Delegates the actual work to ``data_manager.read_csv`` and converts
    an error result into an HTTP 500 response.
    """
    preview = data_manager.read_csv(
        data_path=request.data_path,
        head=request.head,
        tail=request.tail,
        info=request.info,
        describe=request.describe,
    )
    if preview['status'] != 'error':
        return preview
    raise HTTPException(status_code=500, detail=preview['message'])

View File

@ -271,6 +271,83 @@ Response:
]
}
```
### 1.7 读取csv文件并展示
```http
POST /data/csv
Content-Type: application/json
Request:
{
"data_path": "dataset/dataset_raw/data.csv",
"head": 5, # 可选,默认显示前5行
"tail": 5, # 可选,默认显示后5行
"info": true, # 可选,是否显示数据集信息
"describe": true # 可选,是否显示数据统计信息
}
Response:
{
"status": "success",
"data": {
"head": [ # 数据集前几行
{
"column1": "value1",
"column2": "value2",
...
},
...
],
"tail": [ # 数据集后几行
{
"column1": "value1",
"column2": "value2",
...
},
...
],
"info": { # 数据集基本信息
"rows": 1000,
"columns": 10,
"column_types": {
"column1": "int64",
"column2": "float64",
"column3": "object",
...
},
"memory_usage": "80.5 KB",
"missing_values": {
"column1": 0,
"column2": 5,
...
}
},
"describe": { # 数据统计信息
"column1": {
"count": 1000,
"mean": 45.3,
"std": 12.5,
"min": 0,
"25%": 35.0,
"50%": 45.0,
"75%": 55.0,
"max": 100.0
},
...
}
}
}
Error Response:
{
"status": "error",
"message": "读取CSV文件失败",
"details": {
"error_type": "FileNotFoundError",
"error_message": "File not found: dataset/dataset_raw/data.csv"
}
}
```
## 2. 模型接口
### 2.1 获取可用模型列表

View File

@ -646,4 +646,105 @@ class DataManager:
self.logger.info("获取处理好的数据集")
# print("可用数据集", back)
return back
return back
def read_csv(
    self,
    data_path: str,
    head: int = 5,
    tail: int = 5,
    info: bool = True,
    describe: bool = True
) -> Dict:
    """Read a CSV file and build a JSON-serializable preview of it.

    Args:
        data_path: Path to the CSV file.
        head: Number of leading rows to include (skipped when <= 0).
        tail: Number of trailing rows to include (skipped when <= 0).
        info: Whether to include basic dataset info (shape, column dtypes,
            memory usage and per-column missing-value counts).
        describe: Whether to include per-column summary statistics
            (numeric stats plus count/unique/top/freq for object columns).

    Returns:
        On success: {"status": "success", "data": {...}} with only the
        requested sections present.
        On failure: {"status": "error", "message": ..., "details":
        {"error_type": ..., "error_message": ...}}.
    """
    try:
        self.logger.info("Reading CSV file: %s", data_path)
        df = pd.read_csv(data_path)

        result: Dict = {"status": "success", "data": {}}

        if head > 0:
            result["data"]["head"] = df.head(head).to_dict('records')
        if tail > 0:
            result["data"]["tail"] = df.tail(tail).to_dict('records')

        if info:
            # Cast to plain int: pandas can return numpy integers here,
            # which are not JSON-serializable by the web layer.
            missing_values = {c: int(v) for c, v in df.isnull().sum().items()}
            column_types = df.dtypes.astype(str).to_dict()
            memory_usage = int(df.memory_usage(deep=True).sum())
            if memory_usage < 1024:
                memory_str = f"{memory_usage} B"
            elif memory_usage < 1024 * 1024:
                memory_str = f"{memory_usage/1024:.1f} KB"
            else:
                memory_str = f"{memory_usage/(1024*1024):.1f} MB"
            result["data"]["info"] = {
                "rows": len(df),
                "columns": len(df.columns),
                "column_types": column_types,
                "memory_usage": memory_str,
                "missing_values": missing_values
            }

        if describe:
            # Describe numeric columns only: a bare df.describe() falls
            # back to describing object columns when no numeric column
            # exists, which would duplicate the categorical summary below.
            numeric_df = df.select_dtypes(include="number")
            numeric_describe = (
                numeric_df.describe().to_dict() if len(numeric_df.columns) else {}
            )
            categorical_describe = {}
            for col in df.select_dtypes(include=['object']).columns:
                modes = df[col].mode()
                counts = df[col].value_counts()
                # int(...) casts: nunique/count/iloc return numpy scalars.
                categorical_describe[col] = {
                    "count": int(df[col].count()),
                    "unique": int(df[col].nunique()),
                    "top": modes[0] if not modes.empty else None,
                    "freq": int(counts.iloc[0]) if not counts.empty else 0
                }
            result["data"]["describe"] = {
                **numeric_describe,
                **categorical_describe
            }

        self.logger.info("Successfully read CSV file: %s", data_path)
        return result
    except Exception as e:
        self.logger.error("Error reading CSV file: %s", e)
        return {
            "status": "error",
            "message": "读取CSV文件失败",
            "details": {
                "error_type": type(e).__name__,
                "error_message": str(e)
            }
        }

View File

@ -0,0 +1,20 @@
artifact_path: model
flavors:
python_function:
env:
conda: conda.yaml
virtualenv: python_env.yaml
loader_module: mlflow.sklearn
model_path: model.pkl
predict_fn: predict
python_version: 3.9.19
sklearn:
code: null
pickled_model: model.pkl
serialization_format: cloudpickle
sklearn_version: 1.5.2
mlflow_version: 2.20.1
model_size_bytes: 106353
model_uuid: 978934ba28b44b89aa72d5ad0a472e5e
run_id: 7a199919f0dc4e929257dd628d0ea068
utc_time_created: '2025-02-25 01:35:56.104998'

View File

@ -0,0 +1,15 @@
channels:
- conda-forge
dependencies:
- python=3.9.19
- pip<=24.0
- pip:
- mlflow==2.20.1
- cloudpickle==3.1.0
- numpy==1.26.4
- pandas==2.2.2
- psutil==6.0.0
- scikit-learn==1.5.2
- scipy==1.13.1
- xgboost==2.1.4
name: mlflow-env

View File

@ -0,0 +1,7 @@
python: 3.9.19
build_dependencies:
- pip==24.0
- setuptools==60.2.0
- wheel==0.43.0
dependencies:
- -r requirements.txt

View File

@ -0,0 +1,8 @@
mlflow==2.20.1
cloudpickle==3.1.0
numpy==1.26.4
pandas==2.2.2
psutil==6.0.0
scikit-learn==1.5.2
scipy==1.13.1
xgboost==2.1.4

View File

@ -0,0 +1,20 @@
artifact_path: model
flavors:
python_function:
env:
conda: conda.yaml
virtualenv: python_env.yaml
loader_module: mlflow.sklearn
model_path: model.pkl
predict_fn: predict
python_version: 3.9.19
sklearn:
code: null
pickled_model: model.pkl
serialization_format: cloudpickle
sklearn_version: 1.5.2
mlflow_version: 2.20.1
model_size_bytes: 106353
model_uuid: 190fcacb3df44232bc122cfd5c0768ef
run_id: 9f9c80e8e9634eb9b09978a685695619
utc_time_created: '2025-02-25 01:49:39.319999'

View File

@ -0,0 +1,15 @@
channels:
- conda-forge
dependencies:
- python=3.9.19
- pip<=24.0
- pip:
- mlflow==2.20.1
- cloudpickle==3.1.0
- numpy==1.26.4
- pandas==2.2.2
- psutil==6.0.0
- scikit-learn==1.5.2
- scipy==1.13.1
- xgboost==2.1.4
name: mlflow-env

View File

@ -0,0 +1,7 @@
python: 3.9.19
build_dependencies:
- pip==24.0
- setuptools==60.2.0
- wheel==0.43.0
dependencies:
- -r requirements.txt

View File

@ -0,0 +1,8 @@
mlflow==2.20.1
cloudpickle==3.1.0
numpy==1.26.4
pandas==2.2.2
psutil==6.0.0
scikit-learn==1.5.2
scipy==1.13.1
xgboost==2.1.4

View File

@ -0,0 +1,20 @@
artifact_path: model
flavors:
python_function:
env:
conda: conda.yaml
virtualenv: python_env.yaml
loader_module: mlflow.sklearn
model_path: model.pkl
predict_fn: predict
python_version: 3.9.19
sklearn:
code: null
pickled_model: model.pkl
serialization_format: cloudpickle
sklearn_version: 1.5.2
mlflow_version: 2.20.1
model_size_bytes: 106353
model_uuid: db9eff91af5c46ddb07b49424304108f
run_id: b0f8602b2bda4f349cef30e446d08a88
utc_time_created: '2025-02-25 01:46:24.477384'

View File

@ -0,0 +1,15 @@
channels:
- conda-forge
dependencies:
- python=3.9.19
- pip<=24.0
- pip:
- mlflow==2.20.1
- cloudpickle==3.1.0
- numpy==1.26.4
- pandas==2.2.2
- psutil==6.0.0
- scikit-learn==1.5.2
- scipy==1.13.1
- xgboost==2.1.4
name: mlflow-env

View File

@ -0,0 +1,7 @@
python: 3.9.19
build_dependencies:
- pip==24.0
- setuptools==60.2.0
- wheel==0.43.0
dependencies:
- -r requirements.txt

View File

@ -0,0 +1,8 @@
mlflow==2.20.1
cloudpickle==3.1.0
numpy==1.26.4
pandas==2.2.2
psutil==6.0.0
scikit-learn==1.5.2
scipy==1.13.1
xgboost==2.1.4

View File

@ -0,0 +1,20 @@
artifact_path: model
flavors:
python_function:
env:
conda: conda.yaml
virtualenv: python_env.yaml
loader_module: mlflow.sklearn
model_path: model.pkl
predict_fn: predict
python_version: 3.9.19
sklearn:
code: null
pickled_model: model.pkl
serialization_format: cloudpickle
sklearn_version: 1.5.2
mlflow_version: 2.20.1
model_size_bytes: 106353
model_uuid: 25f86aaa070f44e3b7cee5656e7c055d
run_id: 41d222c59e3e46c8ba8c101247d5fd02
utc_time_created: '2025-02-25 02:06:34.332990'

View File

@ -0,0 +1,15 @@
channels:
- conda-forge
dependencies:
- python=3.9.19
- pip<=24.0
- pip:
- mlflow==2.20.1
- cloudpickle==3.1.0
- numpy==1.26.4
- pandas==2.2.2
- psutil==6.0.0
- scikit-learn==1.5.2
- scipy==1.13.1
- xgboost==2.1.4
name: mlflow-env

View File

@ -0,0 +1,7 @@
python: 3.9.19
build_dependencies:
- pip==24.0
- setuptools==60.2.0
- wheel==0.43.0
dependencies:
- -r requirements.txt

View File

@ -0,0 +1,8 @@
mlflow==2.20.1
cloudpickle==3.1.0
numpy==1.26.4
pandas==2.2.2
psutil==6.0.0
scikit-learn==1.5.2
scipy==1.13.1
xgboost==2.1.4

View File

@ -0,0 +1,15 @@
artifact_uri: mlflow-artifacts:/433321862082712659/7a199919f0dc4e929257dd628d0ea068/artifacts
end_time: 1740447358528
entry_point_name: ''
experiment_id: '433321862082712659'
lifecycle_stage: active
run_id: 7a199919f0dc4e929257dd628d0ea068
run_name: grandiose-seal-133
run_uuid: 7a199919f0dc4e929257dd628d0ea068
source_name: ''
source_type: 4
source_version: ''
start_time: 1740447355512
status: 3
tags: []
user_id: admin-root

View File

@ -0,0 +1 @@
1740447356081 0.9902912621359223 0

View File

@ -0,0 +1 @@
1740447356094 0.990328791886068 0

View File

@ -0,0 +1 @@
1740447356086 0.9905768132495717 0

View File

@ -0,0 +1 @@
1740447356090 0.9902912621359223 0

View File

@ -0,0 +1 @@
1740447356098 0.9928571428571429 0

View File

@ -0,0 +1 @@
['计算效率高,支持并行计算。', '具有内置的缺失值处理能力。']

View File

@ -0,0 +1 @@
XGBClassifier

View File

@ -0,0 +1 @@
/home/admin-root/haotian/MLPlatform/dataset/dataset_processed/breast_cancer_20250219_144629

View File

@ -0,0 +1 @@
['参数较多,调优较复杂。']

View File

@ -0,0 +1 @@
XGBoostExtreme Gradient Boosting是一种基于梯度提升树GBDT的改进算法具有更强的正则化和并行处理能力。

View File

@ -0,0 +1 @@
classification

View File

@ -0,0 +1 @@
[{"run_id": "7a199919f0dc4e929257dd628d0ea068", "artifact_path": "model", "utc_time_created": "2025-02-25 01:35:56.104998", "model_uuid": "978934ba28b44b89aa72d5ad0a472e5e", "flavors": {"python_function": {"model_path": "model.pkl", "predict_fn": "predict", "loader_module": "mlflow.sklearn", "python_version": "3.9.19", "env": {"conda": "conda.yaml", "virtualenv": "python_env.yaml"}}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "1.5.2", "serialization_format": "cloudpickle", "code": null}}}]

View File

@ -0,0 +1 @@
grandiose-seal-133

View File

@ -0,0 +1 @@
aede371f38cbfd1418bd074b929dd9bd6ea64a22

View File

@ -0,0 +1 @@
/home/admin-root/haotian/MLPlatform/example_model_manager.py

View File

@ -0,0 +1 @@
admin-root

View File

@ -0,0 +1,15 @@
artifact_uri: mlflow-artifacts:/433321862082712659/9f9c80e8e9634eb9b09978a685695619/artifacts
end_time: 1740448182660
entry_point_name: ''
experiment_id: '433321862082712659'
lifecycle_stage: active
run_id: 9f9c80e8e9634eb9b09978a685695619
run_name: capable-colt-135
run_uuid: 9f9c80e8e9634eb9b09978a685695619
source_name: ''
source_type: 4
source_version: ''
start_time: 1740448178742
status: 3
tags: []
user_id: admin-root

View File

@ -0,0 +1 @@
1740448179296 0.9902912621359223 0

View File

@ -0,0 +1 @@
1740448179309 0.990328791886068 0

View File

@ -0,0 +1 @@
1740448179301 0.9905768132495717 0

View File

@ -0,0 +1 @@
1740448179305 0.9902912621359223 0

View File

@ -0,0 +1 @@
1740448179314 0.9928571428571429 0

View File

@ -0,0 +1 @@
['计算效率高,支持并行计算。', '具有内置的缺失值处理能力。']

View File

@ -0,0 +1 @@
XGBClassifier

View File

@ -0,0 +1 @@
/home/admin-root/haotian/MLPlatform/dataset/dataset_processed/breast_cancer_20250224_170615/train_breast_cancer_20250224_170615.csv

View File

@ -0,0 +1 @@
/home/admin-root/haotian/MLPlatform/dataset/dataset_processed/breast_cancer_20250224_170615/val_breast_cancer_20250224_170615.csv

View File

@ -0,0 +1 @@
['参数较多,调优较复杂。']

View File

@ -0,0 +1 @@
XGBoostExtreme Gradient Boosting是一种基于梯度提升树GBDT的改进算法具有更强的正则化和并行处理能力。

View File

@ -0,0 +1 @@
classification

View File

@ -0,0 +1 @@
[{"run_id": "9f9c80e8e9634eb9b09978a685695619", "artifact_path": "model", "utc_time_created": "2025-02-25 01:49:39.319999", "model_uuid": "190fcacb3df44232bc122cfd5c0768ef", "flavors": {"python_function": {"model_path": "model.pkl", "predict_fn": "predict", "loader_module": "mlflow.sklearn", "python_version": "3.9.19", "env": {"conda": "conda.yaml", "virtualenv": "python_env.yaml"}}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "1.5.2", "serialization_format": "cloudpickle", "code": null}}}]

View File

@ -0,0 +1 @@
capable-colt-135

View File

@ -0,0 +1 @@
382271e424c6a74f41aa5c92743ec7c8eb3882af

View File

@ -0,0 +1 @@
/home/admin-root/haotian/MLPlatform/example_model_manager.py

View File

@ -0,0 +1 @@
admin-root

View File

@ -0,0 +1,15 @@
artifact_uri: mlflow-artifacts:/433321862082712659/b0f8602b2bda4f349cef30e446d08a88/artifacts
end_time: 1740447986878
entry_point_name: ''
experiment_id: '433321862082712659'
lifecycle_stage: active
run_id: b0f8602b2bda4f349cef30e446d08a88
run_name: sneaky-ray-592
run_uuid: b0f8602b2bda4f349cef30e446d08a88
source_name: ''
source_type: 4
source_version: ''
start_time: 1740447983879
status: 3
tags: []
user_id: admin-root

View File

@ -0,0 +1 @@
1740447984454 0.9902912621359223 0

View File

@ -0,0 +1 @@
1740447984467 0.990328791886068 0

View File

@ -0,0 +1 @@
1740447984458 0.9905768132495717 0

View File

@ -0,0 +1 @@
1740447984463 0.9902912621359223 0

View File

@ -0,0 +1 @@
1740447984471 0.9928571428571429 0

View File

@ -0,0 +1 @@
['计算效率高,支持并行计算。', '具有内置的缺失值处理能力。']

View File

@ -0,0 +1 @@
XGBClassifier

View File

@ -0,0 +1 @@
/home/admin-root/haotian/MLPlatform/dataset/dataset_processed/breast_cancer_20250224_170615/train_breast_cancer_20250224_170615.csv

View File

@ -0,0 +1 @@
/home/admin-root/haotian/MLPlatform/dataset/dataset_processed/breast_cancer_20250224_170615/val_breast_cancer_20250224_170615.csv

View File

@ -0,0 +1 @@
['参数较多,调优较复杂。']

View File

@ -0,0 +1 @@
XGBoostExtreme Gradient Boosting是一种基于梯度提升树GBDT的改进算法具有更强的正则化和并行处理能力。

View File

@ -0,0 +1 @@
classification

View File

@ -0,0 +1 @@
[{"run_id": "b0f8602b2bda4f349cef30e446d08a88", "artifact_path": "model", "utc_time_created": "2025-02-25 01:46:24.477384", "model_uuid": "db9eff91af5c46ddb07b49424304108f", "flavors": {"python_function": {"model_path": "model.pkl", "predict_fn": "predict", "loader_module": "mlflow.sklearn", "python_version": "3.9.19", "env": {"conda": "conda.yaml", "virtualenv": "python_env.yaml"}}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "1.5.2", "serialization_format": "cloudpickle", "code": null}}}]

View File

@ -0,0 +1 @@
sneaky-ray-592

View File

@ -0,0 +1 @@
d21060c67020bc49df3e06a1f0addd03b64f4975

View File

@ -0,0 +1 @@
/home/admin-root/haotian/MLPlatform/example_model_manager.py

View File

@ -0,0 +1 @@
admin-root

View File

@ -0,0 +1,15 @@
artifact_uri: mlflow-artifacts:/452770488800904984/41d222c59e3e46c8ba8c101247d5fd02/artifacts
end_time: 1740449197273
entry_point_name: ''
experiment_id: '452770488800904984'
lifecycle_stage: active
run_id: 41d222c59e3e46c8ba8c101247d5fd02
run_name: bright-lamb-489
run_uuid: 41d222c59e3e46c8ba8c101247d5fd02
source_name: ''
source_type: 4
source_version: ''
start_time: 1740449194009
status: 3
tags: []
user_id: admin-root

View File

@ -0,0 +1 @@
1740449194317 0.9902912621359223 0

View File

@ -0,0 +1 @@
1740449194326 0.990328791886068 0

View File

@ -0,0 +1 @@
1740449194320 0.9905768132495717 0

View File

@ -0,0 +1 @@
1740449194323 0.9902912621359223 0

View File

@ -0,0 +1 @@
1740449194328 0.9928571428571429 0

View File

@ -0,0 +1 @@
['计算效率高,支持并行计算。', '具有内置的缺失值处理能力。']

Some files were not shown because too many files have changed in this diff Show More