完成--完成获取数据预处理方法列表,获取方法详情
This commit is contained in:
parent
789285e312
commit
f428efa7db
BIN
data_process/__pycache__/method_reader.cpython-39.pyc
Normal file
BIN
data_process/__pycache__/method_reader.cpython-39.pyc
Normal file
Binary file not shown.
Binary file not shown.
@ -11,6 +11,7 @@ class MethodReader:
|
|||||||
"""初始化方法读取器"""
|
"""初始化方法读取器"""
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
self.method_config = self._load_method_config()
|
self.method_config = self._load_method_config()
|
||||||
|
self.parameter_config = self._load_parameter_config()
|
||||||
|
|
||||||
def _load_method_config(self) -> Dict:
|
def _load_method_config(self) -> Dict:
|
||||||
"""加载方法配置文件"""
|
"""加载方法配置文件"""
|
||||||
@ -29,6 +30,22 @@ class MethodReader:
|
|||||||
self.logger.error(f"Error loading method config: {str(e)}")
|
self.logger.error(f"Error loading method config: {str(e)}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
def _load_parameter_config(self) -> Dict:
    """Load the parameter configuration from its YAML file.

    Reads ``date_preprocessing/parameter.yaml`` (a relative path, so it is
    resolved against the current working directory) and parses it with
    ``yaml.safe_load``.

    Returns:
        The parsed configuration. When the parser yields a falsy result
        (``safe_load`` gives ``None`` for an empty document), an empty
        dict is returned instead so the result can always be queried
        with ``.get()``.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
        Exception: Any other error raised while opening or parsing the
            file is logged and then re-raised unchanged.
    """
    try:
        config_path = Path('date_preprocessing/parameter.yaml')
        if not config_path.exists():
            raise FileNotFoundError(f"Parameter config file not found at {config_path}")

        with open(config_path, 'r', encoding='utf-8') as f:
            # An empty YAML document parses to None; fall back to an empty
            # mapping so the declared Dict return type holds.
            config = yaml.safe_load(f) or {}

        self.logger.info("Successfully loaded parameter config")
        return config
    except Exception as e:
        # Log for diagnosis, then let the caller decide how to react.
        self.logger.error(f"Error loading parameter config: {str(e)}")
        raise
|
||||||
|
|
||||||
def get_preprocessing_methods(self) -> Dict:
|
def get_preprocessing_methods(self) -> Dict:
|
||||||
"""获取预处理方法列表"""
|
"""获取预处理方法列表"""
|
||||||
try:
|
try:
|
||||||
@ -76,23 +93,40 @@ class MethodReader:
|
|||||||
def get_method_details(self, method_name: str) -> Dict:
|
def get_method_details(self, method_name: str) -> Dict:
|
||||||
"""获取指定方法的详细信息"""
|
"""获取指定方法的详细信息"""
|
||||||
try:
|
try:
|
||||||
# 在各个方法类别中查找
|
# 在各个方法类别中查找方法原理和优缺点
|
||||||
|
method_info = None
|
||||||
for category in ['data_scaler_methods', 'missing_value_handling_methods', 'outlier_detection_methods']:
|
for category in ['data_scaler_methods', 'missing_value_handling_methods', 'outlier_detection_methods']:
|
||||||
if method_name in self.method_config.get(category, {}):
|
if method_name in self.method_config.get(category, {}):
|
||||||
method_info = self.method_config[category][method_name]
|
method_info = self.method_config[category][method_name]
|
||||||
|
break
|
||||||
|
|
||||||
|
if method_info is None:
|
||||||
|
raise ValueError(f"Method {method_name} not found in method config")
|
||||||
|
|
||||||
|
# 查找方法参数信息
|
||||||
|
parameter_info = None
|
||||||
|
for category in ['data_scaler_methods', 'missing_value_handling_methods', 'outlier_detection_methods']:
|
||||||
|
if method_name in self.parameter_config.get(category, {}):
|
||||||
|
parameter_info = self.parameter_config[category][method_name]
|
||||||
|
break
|
||||||
|
|
||||||
|
if parameter_info is None:
|
||||||
|
raise ValueError(f"Method {method_name} not found in parameter config")
|
||||||
|
|
||||||
|
# 组合返回信息
|
||||||
return {
|
return {
|
||||||
"status": "success",
|
"status": "success",
|
||||||
"method": {
|
"method": {
|
||||||
"name": method_name,
|
"name": method_name,
|
||||||
|
"description": parameter_info.get('description', ''),
|
||||||
"principle": method_info.get('principle', ''),
|
"principle": method_info.get('principle', ''),
|
||||||
"advantages": method_info.get('advantages', []),
|
"advantages": method_info.get('advantages', []),
|
||||||
"disadvantages": method_info.get('disadvantages', []),
|
"disadvantages": method_info.get('disadvantages', []),
|
||||||
"applicable_scenarios": method_info.get('applicable_scenarios', [])
|
"applicable_scenarios": method_info.get('applicable_scenarios', []),
|
||||||
|
"parameters": parameter_info.get('parameters', [])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
raise ValueError(f"Method {method_name} not found")
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(f"Error getting method details: {str(e)}")
|
self.logger.error(f"Error getting method details: {str(e)}")
|
||||||
return {
|
return {
|
||||||
@ -1,186 +1,255 @@
|
|||||||
feature_engineering_methods_parameters:
|
feature_engineering_methods_parameters:
|
||||||
|
|
||||||
LabelEncoder:
|
LabelEncoder:
|
||||||
parameters: {}
|
description: "将分类标签编码为整数。"
|
||||||
|
parameters: []
|
||||||
|
|
||||||
KBinsDiscretizer:
|
KBinsDiscretizer:
|
||||||
|
description: "将连续数据分箱为离散数据。"
|
||||||
parameters:
|
parameters:
|
||||||
n_bins:
|
- name: "n_bins"
|
||||||
description: "指定每个特征要分成的箱数。可以是单个整数,表示所有特征使用相同的箱数;也可以是形状为 (n_features,) 的数组,为每个特征指定不同的箱数。"
|
type: "int or array-like"
|
||||||
default: 5
|
default: 5
|
||||||
encode:
|
description: "指定每个特征要分成的箱数。可以是单个整数,表示所有特征使用相同的箱数;也可以是形状为 (n_features,) 的数组,为每个特征指定不同的箱数。"
|
||||||
|
- name: "encode"
|
||||||
|
type: "str"
|
||||||
|
default: "onehot"
|
||||||
description: "指定离散化后输出的编码方式。可选值包括 'onehot'(独热编码)、'onehot-dense'(密集独热编码)、'ordinal'(序数编码)。"
|
description: "指定离散化后输出的编码方式。可选值包括 'onehot'(独热编码)、'onehot-dense'(密集独热编码)、'ordinal'(序数编码)。"
|
||||||
default: 'onehot'
|
- name: "strategy"
|
||||||
strategy:
|
type: "str"
|
||||||
|
default: "quantile"
|
||||||
description: "定义分箱策略。可选值包括 'uniform'(均匀分箱)、'quantile'(分位数分箱)、'kmeans'(K-Means 聚类分箱)。"
|
description: "定义分箱策略。可选值包括 'uniform'(均匀分箱)、'quantile'(分位数分箱)、'kmeans'(K-Means 聚类分箱)。"
|
||||||
default: 'quantile'
|
|
||||||
|
|
||||||
FunctionTransformer:
|
FunctionTransformer:
|
||||||
|
description: "对数据应用自定义函数进行转换。"
|
||||||
parameters:
|
parameters:
|
||||||
func:
|
- name: "func"
|
||||||
|
type: "callable"
|
||||||
|
default: null
|
||||||
description: "要应用于输入数据的函数。"
|
description: "要应用于输入数据的函数。"
|
||||||
|
- name: "inverse_func"
|
||||||
|
type: "callable"
|
||||||
default: null
|
default: null
|
||||||
inverse_func:
|
|
||||||
description: "func 的逆函数,如果存在。"
|
description: "func 的逆函数,如果存在。"
|
||||||
default: null
|
- name: "validate"
|
||||||
validate:
|
type: "bool"
|
||||||
description: "布尔值,指示是否在转换前验证输入数据。"
|
|
||||||
default: false
|
default: false
|
||||||
accept_sparse:
|
description: "指示是否在转换前验证输入数据。"
|
||||||
description: "布尔值,指示是否接受稀疏矩阵作为输入。"
|
- name: "accept_sparse"
|
||||||
|
type: "bool"
|
||||||
default: false
|
default: false
|
||||||
check_inverse:
|
description: "指示是否接受稀疏矩阵作为输入。"
|
||||||
description: "布尔值,指示在适合期间是否检查 func 和 inverse_func 是否互为逆函数。"
|
- name: "check_inverse"
|
||||||
|
type: "bool"
|
||||||
default: true
|
default: true
|
||||||
kw_args:
|
description: "指示在适合期间是否检查 func 和 inverse_func 是否互为逆函数。"
|
||||||
|
- name: "kw_args"
|
||||||
|
type: "dict"
|
||||||
|
default: null
|
||||||
description: "传递给 func 的其他关键字参数。"
|
description: "传递给 func 的其他关键字参数。"
|
||||||
|
- name: "inv_kw_args"
|
||||||
|
type: "dict"
|
||||||
default: null
|
default: null
|
||||||
inv_kw_args:
|
|
||||||
description: "传递给 inverse_func 的其他关键字参数。"
|
description: "传递给 inverse_func 的其他关键字参数。"
|
||||||
default: null
|
|
||||||
|
|
||||||
PowerTransformer:
|
PowerTransformer:
|
||||||
|
description: "对数据进行幂变换以使其更符合正态分布。"
|
||||||
parameters:
|
parameters:
|
||||||
method:
|
- name: "method"
|
||||||
|
type: "str"
|
||||||
|
default: "yeo-johnson"
|
||||||
description: "指定变换方法。可选值包括 'yeo-johnson' 和 'box-cox'。"
|
description: "指定变换方法。可选值包括 'yeo-johnson' 和 'box-cox'。"
|
||||||
default: 'yeo-johnson'
|
- name: "standardize"
|
||||||
standardize:
|
type: "bool"
|
||||||
description: "布尔值,指示是否在变换后将数据标准化为零均值和单位方差。"
|
|
||||||
default: true
|
default: true
|
||||||
copy:
|
description: "指示是否在变换后将数据标准化为零均值和单位方差。"
|
||||||
description: "布尔值,指示是否复制输入数据,或在原地进行变换。"
|
- name: "copy"
|
||||||
|
type: "bool"
|
||||||
default: true
|
default: true
|
||||||
|
description: "指示是否复制输入数据,或在原地进行变换。"
|
||||||
|
|
||||||
QuantileTransformer:
|
QuantileTransformer:
|
||||||
|
description: "将数据转换为均匀分布或正态分布。"
|
||||||
parameters:
|
parameters:
|
||||||
n_quantiles:
|
- name: "n_quantiles"
|
||||||
description: "用于分位数变换的分位数数量。"
|
type: "int"
|
||||||
default: 1000
|
default: 1000
|
||||||
output_distribution:
|
description: "用于分位数变换的分位数数量。"
|
||||||
|
- name: "output_distribution"
|
||||||
|
type: "str"
|
||||||
|
default: "uniform"
|
||||||
description: "指定输出分布。可选值包括 'uniform' 和 'normal'。"
|
description: "指定输出分布。可选值包括 'uniform' 和 'normal'。"
|
||||||
default: 'uniform'
|
- name: "ignore_implicit_zeros"
|
||||||
ignore_implicit_zeros:
|
type: "bool"
|
||||||
description: "布尔值,指示是否忽略隐式零。"
|
|
||||||
default: false
|
default: false
|
||||||
subsample:
|
description: "指示是否忽略隐式零。"
|
||||||
|
- name: "subsample"
|
||||||
|
type: "int"
|
||||||
|
default: 100000
|
||||||
description: "用于计算分位数的子样本大小。"
|
description: "用于计算分位数的子样本大小。"
|
||||||
default: 1e5
|
- name: "random_state"
|
||||||
random_state:
|
type: "int or None"
|
||||||
description: "用于随机数生成的种子。"
|
|
||||||
default: null
|
default: null
|
||||||
copy:
|
description: "用于随机数生成的种子。"
|
||||||
description: "布尔值,指示是否复制输入数据,或在原地进行变换。"
|
- name: "copy"
|
||||||
|
type: "bool"
|
||||||
default: true
|
default: true
|
||||||
|
description: "指示是否复制输入数据,或在原地进行变换。"
|
||||||
|
|
||||||
FeatureHasher:
|
FeatureHasher:
|
||||||
|
description: "使用哈希技巧将特征映射到向量。"
|
||||||
parameters:
|
parameters:
|
||||||
n_features:
|
- name: "n_features"
|
||||||
description: "哈希空间的维度。"
|
type: "int"
|
||||||
default: 1048576
|
default: 1048576
|
||||||
input_type:
|
description: "哈希空间的维度。"
|
||||||
|
- name: "input_type"
|
||||||
|
type: "str"
|
||||||
|
default: "dict"
|
||||||
description: "输入数据的类型。可选值包括 'dict' 和 'pair'。"
|
description: "输入数据的类型。可选值包括 'dict' 和 'pair'。"
|
||||||
default: 'dict'
|
- name: "dtype"
|
||||||
dtype:
|
type: "type"
|
||||||
|
default: "float64"
|
||||||
description: "输出数据的类型。"
|
description: "输出数据的类型。"
|
||||||
default: 'float64'
|
- name: "alternate_sign"
|
||||||
alternate_sign:
|
type: "bool"
|
||||||
description: "布尔值,指示是否在哈希时使用交替符号。"
|
|
||||||
default: true
|
default: true
|
||||||
|
description: "指示是否在哈希时使用交替符号。"
|
||||||
|
|
||||||
DictVectorizer:
|
DictVectorizer:
|
||||||
|
description: "将符号表示的特征(如字典)转换为稀疏矩阵。"
|
||||||
parameters:
|
parameters:
|
||||||
dtype:
|
- name: "dtype"
|
||||||
|
type: "type"
|
||||||
|
default: "float64"
|
||||||
description: "输出数据的类型。"
|
description: "输出数据的类型。"
|
||||||
default: 'float64'
|
- name: "separator"
|
||||||
separator:
|
type: "str"
|
||||||
|
default: "="
|
||||||
description: "用于分隔特征名称的分隔符。"
|
description: "用于分隔特征名称的分隔符。"
|
||||||
default: '='
|
- name: "sparse"
|
||||||
sparse:
|
type: "bool"
|
||||||
description: "布尔值,指示是否返回稀疏矩阵。"
|
|
||||||
default: true
|
default: true
|
||||||
sort:
|
description: "指示是否返回稀疏矩阵。"
|
||||||
description: "布尔值,指示是否对特征名称排序。"
|
- name: "sort"
|
||||||
|
type: "bool"
|
||||||
default: true
|
default: true
|
||||||
|
description: "指示是否对特征名称排序。"
|
||||||
|
|
||||||
|
|
||||||
PCA:
|
PCA:
|
||||||
|
description: "主成分分析,用于降维。"
|
||||||
parameters:
|
parameters:
|
||||||
n_components:
|
- name: "n_components"
|
||||||
|
type: "int, float, None or str"
|
||||||
|
default: null
|
||||||
description: "要保留的主成分数量。可以是整数、浮点数或 'mle'。"
|
description: "要保留的主成分数量。可以是整数、浮点数或 'mle'。"
|
||||||
default: null
|
- name: "copy"
|
||||||
copy:
|
type: "bool"
|
||||||
description: "布尔值,指示是否复制输入数据,或在原地进行变换。"
|
|
||||||
default: true
|
default: true
|
||||||
whiten:
|
description: "指示是否复制输入数据,或在原地进行变换。"
|
||||||
description: "布尔值,指示是否对主成分进行白化。"
|
- name: "whiten"
|
||||||
|
type: "bool"
|
||||||
default: false
|
default: false
|
||||||
svd_solver:
|
description: "指示是否对主成分进行白化。"
|
||||||
|
- name: "svd_solver"
|
||||||
|
type: "str"
|
||||||
|
default: "auto"
|
||||||
description: "用于计算 SVD 的方法。可选值包括 'auto'、'full'、'arpack' 和 'randomized'。"
|
description: "用于计算 SVD 的方法。可选值包括 'auto'、'full'、'arpack' 和 'randomized'。"
|
||||||
default: 'auto'
|
- name: "tol"
|
||||||
tol:
|
type: "float"
|
||||||
description: "奇异值分解的容差。"
|
|
||||||
default: 0.0
|
default: 0.0
|
||||||
iterated_power:
|
description: "奇异值分解的容差。"
|
||||||
description: "用于随机化 SVD 的迭代次数。"
|
- name: "iterated_power"
|
||||||
|
type: "int or 'auto'"
|
||||||
default: 'auto'
|
default: 'auto'
|
||||||
random_state:
|
description: "用于随机化 SVD 的迭代次数。"
|
||||||
description: "用于随机数生成的种子。"
|
- name: "random_state"
|
||||||
|
type: "int or None"
|
||||||
default: null
|
default: null
|
||||||
|
description: "用于随机数生成的种子。"
|
||||||
|
|
||||||
SelectKBest:
|
SelectKBest:
|
||||||
|
description: "选择最重要的 K 个特征。"
|
||||||
parameters:
|
parameters:
|
||||||
score_func:
|
- name: "score_func"
|
||||||
|
type: "callable"
|
||||||
|
default: "f_classif"
|
||||||
description: "用于计算特征得分的函数。"
|
description: "用于计算特征得分的函数。"
|
||||||
default: 'f_classif'
|
- name: "k"
|
||||||
k:
|
type: "int"
|
||||||
description: "要选择的特征数量。"
|
|
||||||
default: 10
|
default: 10
|
||||||
|
description: "要选择的特征数量。"
|
||||||
|
|
||||||
RFE:
|
RFE:
|
||||||
|
description: "递归特征消除,用于选择最重要的特征。"
|
||||||
parameters:
|
parameters:
|
||||||
estimator:
|
- name: "estimator"
|
||||||
|
type: "object"
|
||||||
|
default: null
|
||||||
description: "用于特征选择的基模型。"
|
description: "用于特征选择的基模型。"
|
||||||
|
- name: "n_features_to_select"
|
||||||
|
type: "int"
|
||||||
default: null
|
default: null
|
||||||
n_features_to_select:
|
|
||||||
description: "要选择的特征数量。"
|
description: "要选择的特征数量。"
|
||||||
default: null
|
- name: "step"
|
||||||
step:
|
type: "int"
|
||||||
description: "每次迭代要移除的特征数量。"
|
|
||||||
default: 1
|
default: 1
|
||||||
verbose:
|
description: "每次迭代要移除的特征数量。"
|
||||||
description: "控制冗长模式的整数。"
|
- name: "verbose"
|
||||||
|
type: "int"
|
||||||
default: 0
|
default: 0
|
||||||
|
description: "控制冗长模式的整数。"
|
||||||
|
|
||||||
PolynomialFeatures:
|
PolynomialFeatures:
|
||||||
|
description: "生成多项式特征,增加模型的非线性能力。"
|
||||||
parameters:
|
parameters:
|
||||||
degree:
|
- name: "degree"
|
||||||
description: "生成多项式特征的最高次数。"
|
type: "int"
|
||||||
default: 2
|
default: 2
|
||||||
interaction_only:
|
description: "生成多项式特征的最高次数。"
|
||||||
description: "布尔值,指示是否仅包含交互项。"
|
- name: "interaction_only"
|
||||||
|
type: "bool"
|
||||||
default: false
|
default: false
|
||||||
include_bias:
|
description: "指示是否仅包含交互项。"
|
||||||
description: "布尔值,指示是否包含偏置列。"
|
- name: "include_bias"
|
||||||
|
type: "bool"
|
||||||
default: true
|
default: true
|
||||||
order:
|
description: "指示是否包含偏置列。"
|
||||||
|
- name: "order"
|
||||||
|
type: "str"
|
||||||
|
default: "C"
|
||||||
description: "输出特征的顺序。可选值包括 'C' 和 'F'。"
|
description: "输出特征的顺序。可选值包括 'C' 和 'F'。"
|
||||||
default: 'C'
|
|
||||||
|
|
||||||
OneHotEncoder:
|
OneHotEncoder:
|
||||||
|
description: "将分类特征转换为独热编码。"
|
||||||
parameters:
|
parameters:
|
||||||
categories:
|
- name: "categories"
|
||||||
|
type: "str or list or 'auto'"
|
||||||
|
default: "auto"
|
||||||
description: "指定每个特征的类别。"
|
description: "指定每个特征的类别。"
|
||||||
default: 'auto'
|
- name: "drop"
|
||||||
drop:
|
type: "str or array-like"
|
||||||
description: "指定要从每个特征中删除的类别。"
|
|
||||||
default: null
|
default: null
|
||||||
sparse:
|
description: "指定要从每个特征中删除的类别。"
|
||||||
description: "布尔值,指示是否返回稀疏矩阵。"
|
- name: "sparse"
|
||||||
|
type: "bool"
|
||||||
default: true
|
default: true
|
||||||
dtype:
|
description: "指示是否返回稀疏矩阵。"
|
||||||
|
- name: "dtype"
|
||||||
|
type: "type"
|
||||||
|
default: "float64"
|
||||||
description: "输出数据的类型。"
|
description: "输出数据的类型。"
|
||||||
default: 'float64'
|
- name: "handle_unknown"
|
||||||
handle_unknown:
|
type: "str"
|
||||||
description: "指定如何处理未知类别。可选值"
|
default: "error"
|
||||||
|
description: "指定如何处理未知类别。可选值包括 'error'(抛出异常)、'ignore'(忽略)。"
|
||||||
|
- name: "max_categories"
|
||||||
|
type: "int or None"
|
||||||
|
default: null
|
||||||
|
description: "在类别过多时,将类别限制为最大类别数量。"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,255 +0,0 @@
|
|||||||
feature_engineering_methods_parameters:
|
|
||||||
|
|
||||||
LabelEncoder:
|
|
||||||
description: "将分类标签编码为整数。"
|
|
||||||
parameters: []
|
|
||||||
|
|
||||||
KBinsDiscretizer:
|
|
||||||
description: "将连续数据分箱为离散数据。"
|
|
||||||
parameters:
|
|
||||||
- name: "n_bins"
|
|
||||||
type: "int or array-like"
|
|
||||||
default: 5
|
|
||||||
description: "指定每个特征要分成的箱数。可以是单个整数,表示所有特征使用相同的箱数;也可以是形状为 (n_features,) 的数组,为每个特征指定不同的箱数。"
|
|
||||||
- name: "encode"
|
|
||||||
type: "str"
|
|
||||||
default: "onehot"
|
|
||||||
description: "指定离散化后输出的编码方式。可选值包括 'onehot'(独热编码)、'onehot-dense'(密集独热编码)、'ordinal'(序数编码)。"
|
|
||||||
- name: "strategy"
|
|
||||||
type: "str"
|
|
||||||
default: "quantile"
|
|
||||||
description: "定义分箱策略。可选值包括 'uniform'(均匀分箱)、'quantile'(分位数分箱)、'kmeans'(K-Means 聚类分箱)。"
|
|
||||||
|
|
||||||
FunctionTransformer:
|
|
||||||
description: "对数据应用自定义函数进行转换。"
|
|
||||||
parameters:
|
|
||||||
- name: "func"
|
|
||||||
type: "callable"
|
|
||||||
default: null
|
|
||||||
description: "要应用于输入数据的函数。"
|
|
||||||
- name: "inverse_func"
|
|
||||||
type: "callable"
|
|
||||||
default: null
|
|
||||||
description: "func 的逆函数,如果存在。"
|
|
||||||
- name: "validate"
|
|
||||||
type: "bool"
|
|
||||||
default: false
|
|
||||||
description: "指示是否在转换前验证输入数据。"
|
|
||||||
- name: "accept_sparse"
|
|
||||||
type: "bool"
|
|
||||||
default: false
|
|
||||||
description: "指示是否接受稀疏矩阵作为输入。"
|
|
||||||
- name: "check_inverse"
|
|
||||||
type: "bool"
|
|
||||||
default: true
|
|
||||||
description: "指示在适合期间是否检查 func 和 inverse_func 是否互为逆函数。"
|
|
||||||
- name: "kw_args"
|
|
||||||
type: "dict"
|
|
||||||
default: null
|
|
||||||
description: "传递给 func 的其他关键字参数。"
|
|
||||||
- name: "inv_kw_args"
|
|
||||||
type: "dict"
|
|
||||||
default: null
|
|
||||||
description: "传递给 inverse_func 的其他关键字参数。"
|
|
||||||
|
|
||||||
PowerTransformer:
|
|
||||||
description: "对数据进行幂变换以使其更符合正态分布。"
|
|
||||||
parameters:
|
|
||||||
- name: "method"
|
|
||||||
type: "str"
|
|
||||||
default: "yeo-johnson"
|
|
||||||
description: "指定变换方法。可选值包括 'yeo-johnson' 和 'box-cox'。"
|
|
||||||
- name: "standardize"
|
|
||||||
type: "bool"
|
|
||||||
default: true
|
|
||||||
description: "指示是否在变换后将数据标准化为零均值和单位方差。"
|
|
||||||
- name: "copy"
|
|
||||||
type: "bool"
|
|
||||||
default: true
|
|
||||||
description: "指示是否复制输入数据,或在原地进行变换。"
|
|
||||||
|
|
||||||
QuantileTransformer:
|
|
||||||
description: "将数据转换为均匀分布或正态分布。"
|
|
||||||
parameters:
|
|
||||||
- name: "n_quantiles"
|
|
||||||
type: "int"
|
|
||||||
default: 1000
|
|
||||||
description: "用于分位数变换的分位数数量。"
|
|
||||||
- name: "output_distribution"
|
|
||||||
type: "str"
|
|
||||||
default: "uniform"
|
|
||||||
description: "指定输出分布。可选值包括 'uniform' 和 'normal'。"
|
|
||||||
- name: "ignore_implicit_zeros"
|
|
||||||
type: "bool"
|
|
||||||
default: false
|
|
||||||
description: "指示是否忽略隐式零。"
|
|
||||||
- name: "subsample"
|
|
||||||
type: "int"
|
|
||||||
default: 100000
|
|
||||||
description: "用于计算分位数的子样本大小。"
|
|
||||||
- name: "random_state"
|
|
||||||
type: "int or None"
|
|
||||||
default: null
|
|
||||||
description: "用于随机数生成的种子。"
|
|
||||||
- name: "copy"
|
|
||||||
type: "bool"
|
|
||||||
default: true
|
|
||||||
description: "指示是否复制输入数据,或在原地进行变换。"
|
|
||||||
|
|
||||||
FeatureHasher:
|
|
||||||
description: "使用哈希技巧将特征映射到向量。"
|
|
||||||
parameters:
|
|
||||||
- name: "n_features"
|
|
||||||
type: "int"
|
|
||||||
default: 1048576
|
|
||||||
description: "哈希空间的维度。"
|
|
||||||
- name: "input_type"
|
|
||||||
type: "str"
|
|
||||||
default: "dict"
|
|
||||||
description: "输入数据的类型。可选值包括 'dict' 和 'pair'。"
|
|
||||||
- name: "dtype"
|
|
||||||
type: "type"
|
|
||||||
default: "float64"
|
|
||||||
description: "输出数据的类型。"
|
|
||||||
- name: "alternate_sign"
|
|
||||||
type: "bool"
|
|
||||||
default: true
|
|
||||||
description: "指示是否在哈希时使用交替符号。"
|
|
||||||
|
|
||||||
DictVectorizer:
|
|
||||||
description: "将符号表示的特征(如字典)转换为稀疏矩阵。"
|
|
||||||
parameters:
|
|
||||||
- name: "dtype"
|
|
||||||
type: "type"
|
|
||||||
default: "float64"
|
|
||||||
description: "输出数据的类型。"
|
|
||||||
- name: "separator"
|
|
||||||
type: "str"
|
|
||||||
default: "="
|
|
||||||
description: "用于分隔特征名称的分隔符。"
|
|
||||||
- name: "sparse"
|
|
||||||
type: "bool"
|
|
||||||
default: true
|
|
||||||
description: "指示是否返回稀疏矩阵。"
|
|
||||||
- name: "sort"
|
|
||||||
type: "bool"
|
|
||||||
default: true
|
|
||||||
description: "指示是否对特征名称排序。"
|
|
||||||
|
|
||||||
|
|
||||||
PCA:
|
|
||||||
description: "主成分分析,用于降维。"
|
|
||||||
parameters:
|
|
||||||
- name: "n_components"
|
|
||||||
type: "int, float, None or str"
|
|
||||||
default: null
|
|
||||||
description: "要保留的主成分数量。可以是整数、浮点数或 'mle'。"
|
|
||||||
- name: "copy"
|
|
||||||
type: "bool"
|
|
||||||
default: true
|
|
||||||
description: "指示是否复制输入数据,或在原地进行变换。"
|
|
||||||
- name: "whiten"
|
|
||||||
type: "bool"
|
|
||||||
default: false
|
|
||||||
description: "指示是否对主成分进行白化。"
|
|
||||||
- name: "svd_solver"
|
|
||||||
type: "str"
|
|
||||||
default: "auto"
|
|
||||||
description: "用于计算 SVD 的方法。可选值包括 'auto'、'full'、'arpack' 和 'randomized'。"
|
|
||||||
- name: "tol"
|
|
||||||
type: "float"
|
|
||||||
default: 0.0
|
|
||||||
description: "奇异值分解的容差。"
|
|
||||||
- name: "iterated_power"
|
|
||||||
type: "int or 'auto'"
|
|
||||||
default: 'auto'
|
|
||||||
description: "用于随机化 SVD 的迭代次数。"
|
|
||||||
- name: "random_state"
|
|
||||||
type: "int or None"
|
|
||||||
default: null
|
|
||||||
description: "用于随机数生成的种子。"
|
|
||||||
|
|
||||||
SelectKBest:
|
|
||||||
description: "选择最重要的 K 个特征。"
|
|
||||||
parameters:
|
|
||||||
- name: "score_func"
|
|
||||||
type: "callable"
|
|
||||||
default: "f_classif"
|
|
||||||
description: "用于计算特征得分的函数。"
|
|
||||||
- name: "k"
|
|
||||||
type: "int"
|
|
||||||
default: 10
|
|
||||||
description: "要选择的特征数量。"
|
|
||||||
|
|
||||||
RFE:
|
|
||||||
description: "递归特征消除,用于选择最重要的特征。"
|
|
||||||
parameters:
|
|
||||||
- name: "estimator"
|
|
||||||
type: "object"
|
|
||||||
default: null
|
|
||||||
description: "用于特征选择的基模型。"
|
|
||||||
- name: "n_features_to_select"
|
|
||||||
type: "int"
|
|
||||||
default: null
|
|
||||||
description: "要选择的特征数量。"
|
|
||||||
- name: "step"
|
|
||||||
type: "int"
|
|
||||||
default: 1
|
|
||||||
description: "每次迭代要移除的特征数量。"
|
|
||||||
- name: "verbose"
|
|
||||||
type: "int"
|
|
||||||
default: 0
|
|
||||||
description: "控制冗长模式的整数。"
|
|
||||||
|
|
||||||
PolynomialFeatures:
|
|
||||||
description: "生成多项式特征,增加模型的非线性能力。"
|
|
||||||
parameters:
|
|
||||||
- name: "degree"
|
|
||||||
type: "int"
|
|
||||||
default: 2
|
|
||||||
description: "生成多项式特征的最高次数。"
|
|
||||||
- name: "interaction_only"
|
|
||||||
type: "bool"
|
|
||||||
default: false
|
|
||||||
description: "指示是否仅包含交互项。"
|
|
||||||
- name: "include_bias"
|
|
||||||
type: "bool"
|
|
||||||
default: true
|
|
||||||
description: "指示是否包含偏置列。"
|
|
||||||
- name: "order"
|
|
||||||
type: "str"
|
|
||||||
default: "C"
|
|
||||||
description: "输出特征的顺序。可选值包括 'C' 和 'F'。"
|
|
||||||
|
|
||||||
OneHotEncoder:
|
|
||||||
description: "将分类特征转换为独热编码。"
|
|
||||||
parameters:
|
|
||||||
- name: "categories"
|
|
||||||
type: "str or list or 'auto'"
|
|
||||||
default: "auto"
|
|
||||||
description: "指定每个特征的类别。"
|
|
||||||
- name: "drop"
|
|
||||||
type: "str or array-like"
|
|
||||||
default: null
|
|
||||||
description: "指定要从每个特征中删除的类别。"
|
|
||||||
- name: "sparse"
|
|
||||||
type: "bool"
|
|
||||||
default: true
|
|
||||||
description: "指示是否返回稀疏矩阵。"
|
|
||||||
- name: "dtype"
|
|
||||||
type: "type"
|
|
||||||
default: "float64"
|
|
||||||
description: "输出数据的类型。"
|
|
||||||
- name: "handle_unknown"
|
|
||||||
type: "str"
|
|
||||||
default: "error"
|
|
||||||
description: "指定如何处理未知类别。可选值包括 'error'(抛出异常)、'ignore'(忽略)。"
|
|
||||||
- name: "max_categories"
|
|
||||||
type: "int or None"
|
|
||||||
default: null
|
|
||||||
description: "在类别过多时,将类别限制为最大类别数量。"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
14
example_method_reader.py
Normal file
14
example_method_reader.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
from data_process.method_reader_date_process import MethodReader
|
||||||
|
|
||||||
|
# Build the reader that serves the preprocessing method information.
reader = MethodReader()

# Fetch every preprocessing method and print it under its heading.
methods = reader.get_preprocessing_methods()
print("预处理方法列表:", methods, sep="\n")

# Fetch the details of one specific method and print them under its heading.
method_details = reader.get_method_details('StandardScaler')
print("\nStandardScaler方法详情:", method_details, sep="\n")
|
||||||
6
mlruns/0/meta.yaml
Normal file
6
mlruns/0/meta.yaml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
artifact_location: mlflow-artifacts:/0
|
||||||
|
creation_time: 1739520200398
|
||||||
|
experiment_id: '0'
|
||||||
|
last_update_time: 1739520200398
|
||||||
|
lifecycle_stage: active
|
||||||
|
name: Default
|
||||||
49
tests/test_method_reader.py
Normal file
49
tests/test_method_reader.py
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
import unittest
|
||||||
|
from data_process.method_reader_date_process import MethodReader
|
||||||
|
|
||||||
|
class TestMethodReader(unittest.TestCase):
    """Checks the responses of MethodReader's list and detail lookups."""

    def setUp(self):
        # Every test works on its own freshly constructed reader.
        self.reader = MethodReader()

    def test_get_preprocessing_methods(self):
        response = self.reader.get_preprocessing_methods()
        self.assertEqual(response['status'], 'success')
        self.assertIsInstance(response['methods'], list)

        # Each expected name has to appear on at least one returned entry.
        listed = response['methods']
        for expected in ('data_scaler', 'missing_value_handler', 'outlier_detector'):
            self.assertTrue(any(entry['name'] == expected for entry in listed))

    def test_get_method_details(self):
        # Look up the details of StandardScaler.
        response = self.reader.get_method_details('StandardScaler')
        self.assertEqual(response['status'], 'success')
        self.assertEqual(response['method']['name'], 'StandardScaler')

        # The detail record has to expose all of these fields.
        details = response['method']
        for field in (
            'description',
            'principle',
            'advantages',
            'disadvantages',
            'applicable_scenarios',
            'parameters',
        ):
            self.assertIn(field, details)

        # Parameters come back as a list; when it is non-empty, the first
        # entry is checked for the expected keys.
        params = details['parameters']
        self.assertIsInstance(params, list)
        if params:
            first = params[0]
            for key in ('name', 'type', 'default', 'description'):
                self.assertIn(key, first)

        # Looking up an unknown method reports an error status.
        response = self.reader.get_method_details('NonExistentMethod')
        self.assertEqual(response['status'], 'error')


if __name__ == '__main__':
    unittest.main()
|
||||||
Loading…
Reference in New Issue
Block a user