完成--完成获取数据预处理方法列表,获取方法详情

This commit is contained in:
haotian 2025-02-17 15:18:25 +08:00
parent 789285e312
commit f428efa7db
8 changed files with 289 additions and 372 deletions

Binary file not shown.

View File

@ -11,6 +11,7 @@ class MethodReader:
"""初始化方法读取器"""
self.logger = logging.getLogger(__name__)
self.method_config = self._load_method_config()
self.parameter_config = self._load_parameter_config()
def _load_method_config(self) -> Dict:
"""加载方法配置文件"""
@ -29,6 +30,22 @@ class MethodReader:
self.logger.error(f"Error loading method config: {str(e)}")
raise
def _load_parameter_config(self) -> Dict:
    """Load the preprocessing parameter configuration from YAML.

    Reads ``date_preprocessing/parameter.yaml`` (resolved relative to the
    current working directory) and parses it with ``yaml.safe_load``.

    Returns:
        The parsed configuration. An empty YAML document, which
        ``yaml.safe_load`` parses to ``None``, is returned as ``{}`` so
        callers can always call ``.get`` on the result.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
        Exception: Any other error raised while opening or parsing the
            file is logged and re-raised unchanged.
    """
    try:
        # NOTE(review): the directory is spelled "date_preprocessing";
        # confirm this matches the on-disk name (vs. "data_preprocessing").
        config_path = Path('date_preprocessing/parameter.yaml')
        if not config_path.exists():
            raise FileNotFoundError(f"Parameter config file not found at {config_path}")

        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)

        # An empty file parses to None; normalise it so that later lookups
        # such as ``self.parameter_config.get(category, {})`` do not fail
        # with AttributeError.
        if config is None:
            self.logger.warning("Parameter config file %s is empty", config_path)
            config = {}

        self.logger.info("Successfully loaded parameter config")
        return config
    except Exception as e:
        # Log for diagnostics, then let the caller decide how to react.
        self.logger.error(f"Error loading parameter config: {str(e)}")
        raise
def get_preprocessing_methods(self) -> Dict:
"""获取预处理方法列表"""
try:
@ -76,23 +93,40 @@ class MethodReader:
def get_method_details(self, method_name: str) -> Dict:
"""获取指定方法的详细信息"""
try:
# 在各个方法类别中查找
# 在各个方法类别中查找方法原理和优缺点
method_info = None
for category in ['data_scaler_methods', 'missing_value_handling_methods', 'outlier_detection_methods']:
if method_name in self.method_config.get(category, {}):
method_info = self.method_config[category][method_name]
break
if method_info is None:
raise ValueError(f"Method {method_name} not found in method config")
# 查找方法参数信息
parameter_info = None
for category in ['data_scaler_methods', 'missing_value_handling_methods', 'outlier_detection_methods']:
if method_name in self.parameter_config.get(category, {}):
parameter_info = self.parameter_config[category][method_name]
break
if parameter_info is None:
raise ValueError(f"Method {method_name} not found in parameter config")
# 组合返回信息
return {
"status": "success",
"method": {
"name": method_name,
"description": parameter_info.get('description', ''),
"principle": method_info.get('principle', ''),
"advantages": method_info.get('advantages', []),
"disadvantages": method_info.get('disadvantages', []),
"applicable_scenarios": method_info.get('applicable_scenarios', [])
"applicable_scenarios": method_info.get('applicable_scenarios', []),
"parameters": parameter_info.get('parameters', [])
}
}
raise ValueError(f"Method {method_name} not found")
except Exception as e:
self.logger.error(f"Error getting method details: {str(e)}")
return {

View File

@ -1,186 +1,255 @@
feature_engineering_methods_parameters:
LabelEncoder:
parameters: {}
description: "将分类标签编码为整数。"
parameters: []
KBinsDiscretizer:
description: "将连续数据分箱为离散数据。"
parameters:
n_bins:
description: "指定每个特征要分成的箱数。可以是单个整数,表示所有特征使用相同的箱数;也可以是形状为 (n_features,) 的数组,为每个特征指定不同的箱数。"
- name: "n_bins"
type: "int or array-like"
default: 5
encode:
description: "指定每个特征要分成的箱数。可以是单个整数,表示所有特征使用相同的箱数;也可以是形状为 (n_features,) 的数组,为每个特征指定不同的箱数。"
- name: "encode"
type: "str"
default: "onehot"
description: "指定离散化后输出的编码方式。可选值包括 'onehot'(独热编码)、'onehot-dense'(密集独热编码)、'ordinal'(序数编码)。"
default: 'onehot'
strategy:
- name: "strategy"
type: "str"
default: "quantile"
description: "定义分箱策略。可选值包括 'uniform'(均匀分箱)、'quantile'(分位数分箱)、'kmeans'(K-Means 聚类分箱)。"
default: 'quantile'
FunctionTransformer:
description: "对数据应用自定义函数进行转换。"
parameters:
func:
- name: "func"
type: "callable"
default: null
description: "要应用于输入数据的函数。"
- name: "inverse_func"
type: "callable"
default: null
inverse_func:
description: "func 的逆函数,如果存在。"
default: null
validate:
description: "布尔值,指示是否在转换前验证输入数据。"
- name: "validate"
type: "bool"
default: false
accept_sparse:
description: "布尔值,指示是否接受稀疏矩阵作为输入。"
description: "指示是否在转换前验证输入数据。"
- name: "accept_sparse"
type: "bool"
default: false
check_inverse:
description: "布尔值,指示在适合期间是否检查 func 和 inverse_func 是否互为逆函数。"
description: "指示是否接受稀疏矩阵作为输入。"
- name: "check_inverse"
type: "bool"
default: true
kw_args:
description: "指示在拟合(fit)期间是否检查 func 和 inverse_func 是否互为逆函数。"
- name: "kw_args"
type: "dict"
default: null
description: "传递给 func 的其他关键字参数。"
- name: "inv_kw_args"
type: "dict"
default: null
inv_kw_args:
description: "传递给 inverse_func 的其他关键字参数。"
default: null
PowerTransformer:
description: "对数据进行幂变换以使其更符合正态分布。"
parameters:
method:
- name: "method"
type: "str"
default: "yeo-johnson"
description: "指定变换方法。可选值包括 'yeo-johnson' 和 'box-cox'。"
default: 'yeo-johnson'
standardize:
description: "布尔值,指示是否在变换后将数据标准化为零均值和单位方差。"
- name: "standardize"
type: "bool"
default: true
copy:
description: "布尔值,指示是否复制输入数据,或在原地进行变换。"
description: "指示是否在变换后将数据标准化为零均值和单位方差。"
- name: "copy"
type: "bool"
default: true
description: "指示是否复制输入数据,或在原地进行变换。"
QuantileTransformer:
description: "将数据转换为均匀分布或正态分布。"
parameters:
n_quantiles:
description: "用于分位数变换的分位数数量。"
- name: "n_quantiles"
type: "int"
default: 1000
output_distribution:
description: "用于分位数变换的分位数数量。"
- name: "output_distribution"
type: "str"
default: "uniform"
description: "指定输出分布。可选值包括 'uniform' 和 'normal'。"
default: 'uniform'
ignore_implicit_zeros:
description: "布尔值,指示是否忽略隐式零。"
- name: "ignore_implicit_zeros"
type: "bool"
default: false
subsample:
description: "指示是否忽略隐式零。"
- name: "subsample"
type: "int"
default: 100000
description: "用于计算分位数的子样本大小。"
default: 1e5
random_state:
description: "用于随机数生成的种子。"
- name: "random_state"
type: "int or None"
default: null
copy:
description: "布尔值,指示是否复制输入数据,或在原地进行变换。"
description: "用于随机数生成的种子。"
- name: "copy"
type: "bool"
default: true
description: "指示是否复制输入数据,或在原地进行变换。"
FeatureHasher:
description: "使用哈希技巧将特征映射到向量。"
parameters:
n_features:
description: "哈希空间的维度。"
- name: "n_features"
type: "int"
default: 1048576
input_type:
description: "哈希空间的维度。"
- name: "input_type"
type: "str"
default: "dict"
description: "输入数据的类型。可选值包括 'dict' 和 'pair'。"
default: 'dict'
dtype:
- name: "dtype"
type: "type"
default: "float64"
description: "输出数据的类型。"
default: 'float64'
alternate_sign:
description: "布尔值,指示是否在哈希时使用交替符号。"
- name: "alternate_sign"
type: "bool"
default: true
description: "指示是否在哈希时使用交替符号。"
DictVectorizer:
description: "将符号表示的特征(如字典)转换为稀疏矩阵。"
parameters:
dtype:
- name: "dtype"
type: "type"
default: "float64"
description: "输出数据的类型。"
default: 'float64'
separator:
- name: "separator"
type: "str"
default: "="
description: "用于分隔特征名称的分隔符。"
default: '='
sparse:
description: "布尔值,指示是否返回稀疏矩阵。"
- name: "sparse"
type: "bool"
default: true
sort:
description: "布尔值,指示是否对特征名称排序。"
description: "指示是否返回稀疏矩阵。"
- name: "sort"
type: "bool"
default: true
description: "指示是否对特征名称排序。"
PCA:
description: "主成分分析,用于降维。"
parameters:
n_components:
- name: "n_components"
type: "int, float, None or str"
default: null
description: "要保留的主成分数量。可以是整数、浮点数或 'mle'。"
default: null
copy:
description: "布尔值,指示是否复制输入数据,或在原地进行变换。"
- name: "copy"
type: "bool"
default: true
whiten:
description: "布尔值,指示是否对主成分进行白化。"
description: "指示是否复制输入数据,或在原地进行变换。"
- name: "whiten"
type: "bool"
default: false
svd_solver:
description: "指示是否对主成分进行白化。"
- name: "svd_solver"
type: "str"
default: "auto"
description: "用于计算 SVD 的方法。可选值包括 'auto'、'full'、'arpack' 和 'randomized'。"
default: 'auto'
tol:
description: "奇异值分解的容差。"
- name: "tol"
type: "float"
default: 0.0
iterated_power:
description: "用于随机化 SVD 的迭代次数。"
description: "奇异值分解的容差。"
- name: "iterated_power"
type: "int or 'auto'"
default: 'auto'
random_state:
description: "用于随机数生成的种子。"
description: "用于随机化 SVD 的迭代次数。"
- name: "random_state"
type: "int or None"
default: null
description: "用于随机数生成的种子。"
SelectKBest:
description: "选择最重要的 K 个特征。"
parameters:
score_func:
- name: "score_func"
type: "callable"
default: "f_classif"
description: "用于计算特征得分的函数。"
default: 'f_classif'
k:
description: "要选择的特征数量。"
- name: "k"
type: "int"
default: 10
description: "要选择的特征数量。"
RFE:
description: "递归特征消除,用于选择最重要的特征。"
parameters:
estimator:
- name: "estimator"
type: "object"
default: null
description: "用于特征选择的基模型。"
- name: "n_features_to_select"
type: "int"
default: null
n_features_to_select:
description: "要选择的特征数量。"
default: null
step:
description: "每次迭代要移除的特征数量。"
- name: "step"
type: "int"
default: 1
verbose:
description: "控制冗长模式的整数。"
description: "每次迭代要移除的特征数量。"
- name: "verbose"
type: "int"
default: 0
description: "控制冗长模式的整数。"
PolynomialFeatures:
description: "生成多项式特征,增加模型的非线性能力。"
parameters:
degree:
description: "生成多项式特征的最高次数。"
- name: "degree"
type: "int"
default: 2
interaction_only:
description: "布尔值,指示是否仅包含交互项。"
description: "生成多项式特征的最高次数。"
- name: "interaction_only"
type: "bool"
default: false
include_bias:
description: "布尔值,指示是否包含偏置列。"
description: "指示是否仅包含交互项。"
- name: "include_bias"
type: "bool"
default: true
order:
description: "指示是否包含偏置列。"
- name: "order"
type: "str"
default: "C"
description: "输出特征的顺序。可选值包括 'C' 和 'F'。"
default: 'C'
OneHotEncoder:
description: "将分类特征转换为独热编码。"
parameters:
categories:
- name: "categories"
type: "str or list or 'auto'"
default: "auto"
description: "指定每个特征的类别。"
default: 'auto'
drop:
description: "指定要从每个特征中删除的类别。"
- name: "drop"
type: "str or array-like"
default: null
sparse:
description: "布尔值,指示是否返回稀疏矩阵。"
description: "指定要从每个特征中删除的类别。"
- name: "sparse"
type: "bool"
default: true
dtype:
description: "指示是否返回稀疏矩阵。"
- name: "dtype"
type: "type"
default: "float64"
description: "输出数据的类型。"
default: 'float64'
handle_unknown:
description: "指定如何处理未知类别。可选值"
- name: "handle_unknown"
type: "str"
default: "error"
description: "指定如何处理未知类别。可选值包括 'error'(抛出异常)、'ignore'(忽略)。"
- name: "max_categories"
type: "int or None"
default: null
description: "在类别过多时,将类别限制为最大类别数量。"

View File

@ -1,255 +0,0 @@
feature_engineering_methods_parameters:
LabelEncoder:
description: "将分类标签编码为整数。"
parameters: []
KBinsDiscretizer:
description: "将连续数据分箱为离散数据。"
parameters:
- name: "n_bins"
type: "int or array-like"
default: 5
description: "指定每个特征要分成的箱数。可以是单个整数,表示所有特征使用相同的箱数;也可以是形状为 (n_features,) 的数组,为每个特征指定不同的箱数。"
- name: "encode"
type: "str"
default: "onehot"
description: "指定离散化后输出的编码方式。可选值包括 'onehot'(独热编码)、'onehot-dense'(密集独热编码)、'ordinal'(序数编码)。"
- name: "strategy"
type: "str"
default: "quantile"
description: "定义分箱策略。可选值包括 'uniform'(均匀分箱)、'quantile'(分位数分箱)、'kmeans'(K-Means 聚类分箱)。"
FunctionTransformer:
description: "对数据应用自定义函数进行转换。"
parameters:
- name: "func"
type: "callable"
default: null
description: "要应用于输入数据的函数。"
- name: "inverse_func"
type: "callable"
default: null
description: "func 的逆函数,如果存在。"
- name: "validate"
type: "bool"
default: false
description: "指示是否在转换前验证输入数据。"
- name: "accept_sparse"
type: "bool"
default: false
description: "指示是否接受稀疏矩阵作为输入。"
- name: "check_inverse"
type: "bool"
default: true
description: "指示在拟合(fit)期间是否检查 func 和 inverse_func 是否互为逆函数。"
- name: "kw_args"
type: "dict"
default: null
description: "传递给 func 的其他关键字参数。"
- name: "inv_kw_args"
type: "dict"
default: null
description: "传递给 inverse_func 的其他关键字参数。"
PowerTransformer:
description: "对数据进行幂变换以使其更符合正态分布。"
parameters:
- name: "method"
type: "str"
default: "yeo-johnson"
description: "指定变换方法。可选值包括 'yeo-johnson' 和 'box-cox'。"
- name: "standardize"
type: "bool"
default: true
description: "指示是否在变换后将数据标准化为零均值和单位方差。"
- name: "copy"
type: "bool"
default: true
description: "指示是否复制输入数据,或在原地进行变换。"
QuantileTransformer:
description: "将数据转换为均匀分布或正态分布。"
parameters:
- name: "n_quantiles"
type: "int"
default: 1000
description: "用于分位数变换的分位数数量。"
- name: "output_distribution"
type: "str"
default: "uniform"
description: "指定输出分布。可选值包括 'uniform' 和 'normal'。"
- name: "ignore_implicit_zeros"
type: "bool"
default: false
description: "指示是否忽略隐式零。"
- name: "subsample"
type: "int"
default: 100000
description: "用于计算分位数的子样本大小。"
- name: "random_state"
type: "int or None"
default: null
description: "用于随机数生成的种子。"
- name: "copy"
type: "bool"
default: true
description: "指示是否复制输入数据,或在原地进行变换。"
FeatureHasher:
description: "使用哈希技巧将特征映射到向量。"
parameters:
- name: "n_features"
type: "int"
default: 1048576
description: "哈希空间的维度。"
- name: "input_type"
type: "str"
default: "dict"
description: "输入数据的类型。可选值包括 'dict' 和 'pair'。"
- name: "dtype"
type: "type"
default: "float64"
description: "输出数据的类型。"
- name: "alternate_sign"
type: "bool"
default: true
description: "指示是否在哈希时使用交替符号。"
DictVectorizer:
description: "将符号表示的特征(如字典)转换为稀疏矩阵。"
parameters:
- name: "dtype"
type: "type"
default: "float64"
description: "输出数据的类型。"
- name: "separator"
type: "str"
default: "="
description: "用于分隔特征名称的分隔符。"
- name: "sparse"
type: "bool"
default: true
description: "指示是否返回稀疏矩阵。"
- name: "sort"
type: "bool"
default: true
description: "指示是否对特征名称排序。"
PCA:
description: "主成分分析,用于降维。"
parameters:
- name: "n_components"
type: "int, float, None or str"
default: null
description: "要保留的主成分数量。可以是整数、浮点数或 'mle'。"
- name: "copy"
type: "bool"
default: true
description: "指示是否复制输入数据,或在原地进行变换。"
- name: "whiten"
type: "bool"
default: false
description: "指示是否对主成分进行白化。"
- name: "svd_solver"
type: "str"
default: "auto"
description: "用于计算 SVD 的方法。可选值包括 'auto'、'full'、'arpack' 和 'randomized'。"
- name: "tol"
type: "float"
default: 0.0
description: "奇异值分解的容差。"
- name: "iterated_power"
type: "int or 'auto'"
default: 'auto'
description: "用于随机化 SVD 的迭代次数。"
- name: "random_state"
type: "int or None"
default: null
description: "用于随机数生成的种子。"
SelectKBest:
description: "选择最重要的 K 个特征。"
parameters:
- name: "score_func"
type: "callable"
default: "f_classif"
description: "用于计算特征得分的函数。"
- name: "k"
type: "int"
default: 10
description: "要选择的特征数量。"
RFE:
description: "递归特征消除,用于选择最重要的特征。"
parameters:
- name: "estimator"
type: "object"
default: null
description: "用于特征选择的基模型。"
- name: "n_features_to_select"
type: "int"
default: null
description: "要选择的特征数量。"
- name: "step"
type: "int"
default: 1
description: "每次迭代要移除的特征数量。"
- name: "verbose"
type: "int"
default: 0
description: "控制冗长模式的整数。"
PolynomialFeatures:
description: "生成多项式特征,增加模型的非线性能力。"
parameters:
- name: "degree"
type: "int"
default: 2
description: "生成多项式特征的最高次数。"
- name: "interaction_only"
type: "bool"
default: false
description: "指示是否仅包含交互项。"
- name: "include_bias"
type: "bool"
default: true
description: "指示是否包含偏置列。"
- name: "order"
type: "str"
default: "C"
description: "输出特征的顺序。可选值包括 'C' 和 'F'。"
OneHotEncoder:
description: "将分类特征转换为独热编码。"
parameters:
- name: "categories"
type: "str or list or 'auto'"
default: "auto"
description: "指定每个特征的类别。"
- name: "drop"
type: "str or array-like"
default: null
description: "指定要从每个特征中删除的类别。"
- name: "sparse"
type: "bool"
default: true
description: "指示是否返回稀疏矩阵。"
- name: "dtype"
type: "type"
default: "float64"
description: "输出数据的类型。"
- name: "handle_unknown"
type: "str"
default: "error"
description: "指定如何处理未知类别。可选值包括 'error'(抛出异常)、'ignore'(忽略)。"
- name: "max_categories"
type: "int or None"
default: null
description: "在类别过多时,将类别限制为最大类别数量。"

14
example_method_reader.py Normal file
View File

@ -0,0 +1,14 @@
from data_process.method_reader_date_process import MethodReader
def main():
    """Demonstrate the two public lookups offered by ``MethodReader``.

    Prints the result of ``get_preprocessing_methods()`` followed by the
    result of ``get_method_details('StandardScaler')``.
    """
    # Create the method reader instance.
    reader = MethodReader()

    # Fetch and show all preprocessing methods.
    methods = reader.get_preprocessing_methods()
    print("预处理方法列表:")
    print(methods)

    # Fetch and show the details of one specific method.
    method_details = reader.get_method_details('StandardScaler')
    print("\nStandardScaler方法详情:")
    print(method_details)


# Run the demo only when executed as a script, so importing this module
# has no side effects.
if __name__ == "__main__":
    main()

6
mlruns/0/meta.yaml Normal file
View File

@ -0,0 +1,6 @@
artifact_location: mlflow-artifacts:/0
creation_time: 1739520200398
experiment_id: '0'
last_update_time: 1739520200398
lifecycle_stage: active
name: Default

View File

@ -0,0 +1,49 @@
import unittest
from data_process.method_reader_date_process import MethodReader
class TestMethodReader(unittest.TestCase):
    """Tests for the ``MethodReader`` method-list and method-detail lookups."""

    def setUp(self):
        # Build a fresh reader for every test so tests stay independent.
        self.reader = MethodReader()

    def test_get_preprocessing_methods(self):
        """The method list call succeeds and names the three expected categories."""
        result = self.reader.get_preprocessing_methods()
        self.assertEqual(result['status'], 'success')
        self.assertIsInstance(result['methods'], list)

        # Check each expected category separately so one missing entry
        # does not mask the others.
        names = [m['name'] for m in result['methods']]
        for expected in ('data_scaler', 'missing_value_handler', 'outlier_detector'):
            with self.subTest(category=expected):
                self.assertIn(expected, names)

    def test_get_method_details(self):
        """Details for StandardScaler contain every documented field."""
        result = self.reader.get_method_details('StandardScaler')
        self.assertEqual(result['status'], 'success')

        method = result['method']
        self.assertEqual(method['name'], 'StandardScaler')

        # Fields expected in the detail payload.
        expected_fields = (
            'description',
            'principle',
            'advantages',
            'disadvantages',
            'applicable_scenarios',
            'parameters',
        )
        for field in expected_fields:
            with self.subTest(field=field):
                self.assertIn(field, method)

        # Validate every parameter entry, not only the first one.
        parameters = method['parameters']
        self.assertIsInstance(parameters, list)
        for index, param in enumerate(parameters):
            for key in ('name', 'type', 'default', 'description'):
                with self.subTest(parameter=index, key=key):
                    self.assertIn(key, param)

    def test_get_method_details_unknown_method(self):
        """An unknown method name yields an error status."""
        # Kept separate from the success case so that a failure there
        # cannot hide a regression in the error path.
        result = self.reader.get_method_details('NonExistentMethod')
        self.assertEqual(result['status'], 'error')


if __name__ == '__main__':
    unittest.main()