"""Demo model service.

Trains a small set of regression algorithms on a bundled equipment-cost
dataset and packages metrics, predictions, and feature importances for the
demo page.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR


@dataclass(frozen=True)
class AlgorithmDefinition:
    """Static metadata for one selectable algorithm plus its unfitted estimator."""

    key: str
    name: str  # Chinese display name; english_name carries the English label
    english_name: str
    family: str
    description: str
    estimator: Any


class DemoModelService:
    """Train demo regression models on the bundled equipment-cost dataset."""

    target_column = "actual_cost"
    ignored_columns = {"name", "type", target_column}

    def __init__(self, dataset_path: Path | str | None = None):
        root = Path(__file__).resolve().parent.parent
        self.dataset_path = Path(dataset_path) if dataset_path else root / "data" / "demo_equipment_costs.csv"

    def get_algorithms(self) -> list[dict[str, str]]:
        """Return display metadata for every algorithm available in this environment."""
        algorithms, _ = self._available_algorithms()
        return [
            {
                "key": item.key,
                "name": item.name,
                "english_name": item.english_name,
                "family": item.family,
                "description": item.description,
            }
            for item in algorithms.values()
        ]

    def get_dataset_summary(self) -> dict[str, Any]:
        """Summarise the demo dataset: shape, columns, features, target, and a preview."""
        frame = self._load_dataset()
        feature_columns = self._feature_columns(frame)
        return {
            "source": "local-file",
            "path": str(self.dataset_path),
            "row_count": int(len(frame)),
            "columns": list(frame.columns),
            "features": feature_columns,
            "target": self.target_column,
            "target_label": "实际成本",  # "actual cost"
            "equipment_types": sorted(frame["type"].dropna().unique().tolist()),
            "preview": frame.head(8).to_dict(orient="records"),
        }

    def run_demo(self, selected_algorithms: list[str] | None = None) -> dict[str, Any]:
        """Train the requested algorithms on a hold-out split and return metrics and predictions."""
        frame = self._load_dataset()
        feature_columns = self._feature_columns(frame)
        algorithms, availability_warnings = self._available_algorithms()

        # Keep only the requested algorithms that exist in this environment.
        requested = selected_algorithms or list(algorithms.keys())
        warnings = list(availability_warnings)
        selected = []
        for key in requested:
            if key in algorithms:
                selected.append(key)
            else:
                warnings.append(f"算法 '{key}' 不可用,已自动跳过。")  # algorithm unavailable, skipped

        if not selected:
            selected = ["linear"]
            warnings.append("所选算法均不可用,已自动使用线性回归。")  # none available, fell back to linear regression

        X = frame[feature_columns]
        y = frame[self.target_column]
        train_x, test_x, train_y, test_y = train_test_split(
            X,
            y,
            test_size=0.3,
            random_state=42,
        )

        metrics: dict[str, dict[str, float | str]] = {}
        predictions: dict[str, list[float]] = {}
        feature_importance: dict[str, list[dict[str, float | str]]] = {}

        # Fit each selected estimator on the training split and score it on the hold-out set.
        for key in selected:
            definition = algorithms[key]
            model = definition.estimator
            model.fit(train_x, train_y)
            predicted = model.predict(test_x)
            predictions[key] = [float(value) for value in predicted]
            metrics[key] = {
                "name": definition.name,
                "r2": float(r2_score(test_y, predicted)),
                "mae": float(mean_absolute_error(test_y, predicted)),
                "rmse": float(np.sqrt(mean_squared_error(test_y, predicted))),
            }
            feature_importance[key] = self._feature_importance(model, feature_columns)

        best_model = min(metrics, key=lambda key: float(metrics[key]["rmse"]))
        ordered_test = test_x.copy()
        ordered_test["actual"] = test_y
        ordered_test["name"] = frame.loc[test_x.index, "name"]

        # One chart point per hold-out row, sorted by actual cost, carrying every model's prediction.
        prediction_points = []
        test_index = list(test_x.index)
        for index, row in ordered_test.sort_values("actual").iterrows():
            point = {
                "name": row["name"],
                "actual": float(row["actual"]),
            }
            original_position = test_index.index(index)
            for key in selected:
                point[key] = predictions[key][original_position]
            prediction_points.append(point)

        # Predict the median-cost row with each model refit on the full dataset.
        sample = frame.sort_values(self.target_column).iloc[len(frame) // 2]
        sample_x = pd.DataFrame([sample[feature_columns].to_dict()])
        sample_predictions = {
            key: float(algorithms[key].estimator.fit(X, y).predict(sample_x)[0])
            for key in selected
        }

        return {
            "source": "local-file",
            "dataset": self.get_dataset_summary(),
            "algorithms": self.get_algorithms(),
            "selected_algorithms": selected,
            "best_model": best_model,
            "metrics": metrics,
            "feature_importance": feature_importance,
            "prediction_points": prediction_points,
            "sample_prediction": {
                "input": sample.drop(labels=[self.target_column]).to_dict(),
                "actual": float(sample[self.target_column]),
                "predictions": sample_predictions,
            },
            "warnings": warnings,
        }

    def _load_dataset(self) -> pd.DataFrame:
        if not self.dataset_path.exists():
            raise FileNotFoundError(f"Demo dataset not found: {self.dataset_path}")

        frame = pd.read_csv(self.dataset_path)
        if self.target_column not in frame.columns:
            raise ValueError(f"Demo dataset must include '{self.target_column}'.")
        return frame

    def _feature_columns(self, frame: pd.DataFrame) -> list[str]:
        columns = [
            column
            for column in frame.columns
            if column not in self.ignored_columns and pd.api.types.is_numeric_dtype(frame[column])
        ]
        if not columns:
            raise ValueError("Demo dataset has no numeric feature columns.")
        return columns

    def _available_algorithms(self) -> tuple[dict[str, AlgorithmDefinition], list[str]]:
        """Build the algorithm registry, collecting warnings for optional libraries that are missing."""
        algorithms = {
"linear": AlgorithmDefinition(
|
||
"linear",
|
||
"线性回归",
|
||
"Linear Regression",
|
||
"线性模型",
|
||
"快速建立基准模型,用于展示参数与成本之间的线性关系。",
|
||
Pipeline([("scaler", StandardScaler()), ("model", LinearRegression())]),
|
||
),
|
||
"ridge": AlgorithmDefinition(
|
||
"ridge",
|
||
"岭回归",
|
||
"Ridge Regression",
|
||
"线性模型",
|
||
"带正则化的线性模型,适合特征存在相关性的场景。",
|
||
Pipeline([("scaler", StandardScaler()), ("model", Ridge(alpha=1.0))]),
|
||
),
|
||
"random_forest": AlgorithmDefinition(
|
||
"random_forest",
|
||
"随机森林",
|
||
"Random Forest",
|
||
"树模型集成",
|
||
"通过多棵决策树集成预测,能够捕捉非线性特征影响。",
|
||
RandomForestRegressor(n_estimators=160, max_depth=6, random_state=42),
|
||
),
|
||
"gradient_boosting": AlgorithmDefinition(
|
||
"gradient_boosting",
|
||
"梯度提升树",
|
||
"Gradient Boosting",
|
||
"树模型集成",
|
||
"逐步修正误差的提升模型,常用于表格数据回归任务。",
|
||
GradientBoostingRegressor(n_estimators=120, learning_rate=0.06, max_depth=3, random_state=42),
|
||
),
|
||
"svr": AlgorithmDefinition(
|
||
"svr",
|
||
"支持向量回归",
|
||
"Support Vector Regression",
|
||
"核方法",
|
||
"使用核函数拟合平滑回归关系,适合展示不同算法偏好。",
|
||
Pipeline([("scaler", StandardScaler()), ("model", SVR(C=500000, epsilon=50000))]),
|
||
),
|
||
"knn": AlgorithmDefinition(
|
||
"knn",
|
||
"近邻回归",
|
||
"KNN Regression",
|
||
"实例学习",
|
||
"基于相似样本进行预测,便于解释局部相似性。",
|
||
Pipeline([("scaler", StandardScaler()), ("model", KNeighborsRegressor(n_neighbors=4))]),
|
||
),
|
||
}
|
||
warnings = []
|
||
|
||
        # Optional boosters: registered only when the library imports successfully.
        try:
            from xgboost import XGBRegressor

            algorithms["xgboost"] = AlgorithmDefinition(
                "xgboost",
                "XGBoost",
                "XGBoost",
                "提升模型",
                "面向表格数据的高性能梯度提升实现。",
                XGBRegressor(
                    n_estimators=120,
                    max_depth=3,
                    learning_rate=0.05,
                    subsample=0.9,
                    colsample_bytree=0.9,
                    random_state=42,
                    objective="reg:squarederror",
                ),
            )
        except Exception:
            warnings.append("当前环境未安装 XGBoost,页面已自动隐藏该算法。")  # XGBoost not installed; algorithm hidden in the UI

        try:
            from lightgbm import LGBMRegressor

            algorithms["lightgbm"] = AlgorithmDefinition(
                "lightgbm",
                "LightGBM",
                "LightGBM",
                "提升模型",
                "基于直方图优化的快速梯度提升模型。",
                LGBMRegressor(
                    n_estimators=120,
                    learning_rate=0.05,
                    max_depth=4,
                    random_state=42,
                    verbose=-1,
                ),
            )
        except Exception:
            warnings.append("当前环境未安装 LightGBM,页面已自动隐藏该算法。")  # LightGBM not installed; algorithm hidden in the UI

        return algorithms, warnings

    def _feature_importance(self, model: Any, feature_columns: list[str]) -> list[dict[str, float | str]]:
        """Return up to eight normalised feature-importance entries for a fitted model."""
        estimator = model
        if isinstance(model, Pipeline):
            estimator = model.named_steps["model"]

        # Tree ensembles expose feature_importances_; linear models expose coef_; otherwise report zeros.
        if hasattr(estimator, "feature_importances_"):
            values = estimator.feature_importances_
        elif hasattr(estimator, "coef_"):
            values = np.abs(np.ravel(estimator.coef_))
        else:
            values = np.zeros(len(feature_columns))

        total = float(np.sum(values))
        if total > 0:
            values = values / total

        ranked = sorted(
            [
                {"feature": feature, "importance": float(value)}
                for feature, value in zip(feature_columns, values)
            ],
            key=lambda item: item["importance"],
            reverse=True,
        )
        return ranked[:8]
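

# Illustrative usage sketch (an addition, not part of the original service): it assumes the
# bundled CSV at data/demo_equipment_costs.csv is present and simply prints the demo output
# for two of the always-available algorithms.
if __name__ == "__main__":
    import json

    service = DemoModelService()
    result = service.run_demo(["linear", "random_forest"])
    print("best model:", result["best_model"])
    print(json.dumps(result["metrics"], ensure_ascii=False, indent=2))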