diff --git a/data_process/__pycache__/method_reader.cpython-39.pyc b/data_process/__pycache__/method_reader.cpython-39.pyc new file mode 100644 index 0000000..20a9060 Binary files /dev/null and b/data_process/__pycache__/method_reader.cpython-39.pyc differ diff --git a/data_process/__pycache__/method_reader_date_process.cpython-39.pyc b/data_process/__pycache__/method_reader_date_process.cpython-39.pyc new file mode 100644 index 0000000..1e22f73 Binary files /dev/null and b/data_process/__pycache__/method_reader_date_process.cpython-39.pyc differ diff --git a/data_process/method_reader.py b/data_process/method_reader_date_process.py similarity index 60% rename from data_process/method_reader.py rename to data_process/method_reader_date_process.py index 3d01904..378a4e3 100644 --- a/data_process/method_reader.py +++ b/data_process/method_reader_date_process.py @@ -11,6 +11,7 @@ class MethodReader: """初始化方法读取器""" self.logger = logging.getLogger(__name__) self.method_config = self._load_method_config() + self.parameter_config = self._load_parameter_config() def _load_method_config(self) -> Dict: """加载方法配置文件""" @@ -29,6 +30,22 @@ class MethodReader: self.logger.error(f"Error loading method config: {str(e)}") raise + def _load_parameter_config(self) -> Dict: + """加载参数配置文件""" + try: + config_path = Path('date_preprocessing/parameter.yaml') + if not config_path.exists(): + raise FileNotFoundError(f"Parameter config file not found at {config_path}") + + with open(config_path, 'r', encoding='utf-8') as f: + config = yaml.safe_load(f) + + self.logger.info("Successfully loaded parameter config") + return config + except Exception as e: + self.logger.error(f"Error loading parameter config: {str(e)}") + raise + def get_preprocessing_methods(self) -> Dict: """获取预处理方法列表""" try: @@ -76,22 +93,39 @@ class MethodReader: def get_method_details(self, method_name: str) -> Dict: """获取指定方法的详细信息""" try: - # 在各个方法类别中查找 + # 在各个方法类别中查找方法原理和优缺点 + method_info = None for category in ['data_scaler_methods', 'missing_value_handling_methods', 'outlier_detection_methods']: if method_name in self.method_config.get(category, {}): method_info = self.method_config[category][method_name] - return { - "status": "success", - "method": { - "name": method_name, - "principle": method_info.get('principle', ''), - "advantages": method_info.get('advantages', []), - "disadvantages": method_info.get('disadvantages', []), - "applicable_scenarios": method_info.get('applicable_scenarios', []) - } - } + break - raise ValueError(f"Method {method_name} not found") + if method_info is None: + raise ValueError(f"Method {method_name} not found in method config") + + # 查找方法参数信息 + parameter_info = None + for category in ['data_scaler_methods', 'missing_value_handling_methods', 'outlier_detection_methods']: + if method_name in self.parameter_config.get(category, {}): + parameter_info = self.parameter_config[category][method_name] + break + + if parameter_info is None: + raise ValueError(f"Method {method_name} not found in parameter config") + + # 组合返回信息 + return { + "status": "success", + "method": { + "name": method_name, + "description": parameter_info.get('description', ''), + "principle": method_info.get('principle', ''), + "advantages": method_info.get('advantages', []), + "disadvantages": method_info.get('disadvantages', []), + "applicable_scenarios": method_info.get('applicable_scenarios', []), + "parameters": parameter_info.get('parameters', []) + } + } except Exception as e: self.logger.error(f"Error getting method details: {str(e)}") diff --git a/date_feature/parameter.yaml b/date_feature/parameter.yaml index 7155848..9bd9da1 100644 --- a/date_feature/parameter.yaml +++ b/date_feature/parameter.yaml @@ -1,186 +1,255 @@ feature_engineering_methods_parameters: LabelEncoder: - parameters: {} + description: "将分类标签编码为整数。" + parameters: [] KBinsDiscretizer: + description: "将连续数据分箱为离散数据。" parameters: - n_bins: - description: "指定每个特征要分成的箱数。可以是单个整数,表示所有特征使用相同的箱数;也可以是形状为 (n_features,) 的数组,为每个特征指定不同的箱数。" + - name: "n_bins" + type: "int or array-like" default: 5 - encode: + description: "指定每个特征要分成的箱数。可以是单个整数,表示所有特征使用相同的箱数;也可以是形状为 (n_features,) 的数组,为每个特征指定不同的箱数。" + - name: "encode" + type: "str" + default: "onehot" description: "指定离散化后输出的编码方式。可选值包括 'onehot'(独热编码)、'onehot-dense'(密集独热编码)、'ordinal'(序数编码)。" - default: 'onehot' - strategy: + - name: "strategy" + type: "str" + default: "quantile" description: "定义分箱策略。可选值包括 'uniform'(均匀分箱)、'quantile'(分位数分箱)、'kmeans'(K-Means 聚类分箱)。" - default: 'quantile' FunctionTransformer: + description: "对数据应用自定义函数进行转换。" parameters: - func: + - name: "func" + type: "callable" + default: null description: "要应用于输入数据的函数。" + - name: "inverse_func" + type: "callable" default: null - inverse_func: description: "func 的逆函数,如果存在。" - default: null - validate: - description: "布尔值,指示是否在转换前验证输入数据。" + - name: "validate" + type: "bool" default: false - accept_sparse: - description: "布尔值,指示是否接受稀疏矩阵作为输入。" + description: "指示是否在转换前验证输入数据。" + - name: "accept_sparse" + type: "bool" default: false - check_inverse: - description: "布尔值,指示在适合期间是否检查 func 和 inverse_func 是否互为逆函数。" + description: "指示是否接受稀疏矩阵作为输入。" + - name: "check_inverse" + type: "bool" default: true - kw_args: + description: "指示在适合期间是否检查 func 和 inverse_func 是否互为逆函数。" + - name: "kw_args" + type: "dict" + default: null description: "传递给 func 的其他关键字参数。" + - name: "inv_kw_args" + type: "dict" default: null - inv_kw_args: description: "传递给 inverse_func 的其他关键字参数。" - default: null PowerTransformer: + description: "对数据进行幂变换以使其更符合正态分布。" parameters: - method: + - name: "method" + type: "str" + default: "yeo-johnson" description: "指定变换方法。可选值包括 'yeo-johnson' 和 'box-cox'。" - default: 'yeo-johnson' - standardize: - description: "布尔值,指示是否在变换后将数据标准化为零均值和单位方差。" + - name: "standardize" + type: "bool" default: true - copy: - description: "布尔值,指示是否复制输入数据,或在原地进行变换。" + description: "指示是否在变换后将数据标准化为零均值和单位方差。" + - name: "copy" + type: "bool" default: true + description: "指示是否复制输入数据,或在原地进行变换。" QuantileTransformer: + description: "将数据转换为均匀分布或正态分布。" parameters: - n_quantiles: - description: "用于分位数变换的分位数数量。" + - name: "n_quantiles" + type: "int" default: 1000 - output_distribution: + description: "用于分位数变换的分位数数量。" + - name: "output_distribution" + type: "str" + default: "uniform" description: "指定输出分布。可选值包括 'uniform' 和 'normal'。" - default: 'uniform' - ignore_implicit_zeros: - description: "布尔值,指示是否忽略隐式零。" + - name: "ignore_implicit_zeros" + type: "bool" default: false - subsample: + description: "指示是否忽略隐式零。" + - name: "subsample" + type: "int" + default: 100000 description: "用于计算分位数的子样本大小。" - default: 1e5 - random_state: - description: "用于随机数生成的种子。" + - name: "random_state" + type: "int or None" default: null - copy: - description: "布尔值,指示是否复制输入数据,或在原地进行变换。" + description: "用于随机数生成的种子。" + - name: "copy" + type: "bool" default: true + description: "指示是否复制输入数据,或在原地进行变换。" FeatureHasher: + description: "使用哈希技巧将特征映射到向量。" parameters: - n_features: - description: "哈希空间的维度。" + - name: "n_features" + type: "int" default: 1048576 - input_type: + description: "哈希空间的维度。" + - name: "input_type" + type: "str" + default: "dict" description: "输入数据的类型。可选值包括 'dict' 和 'pair'。" - default: 'dict' - dtype: + - name: "dtype" + type: "type" + default: "float64" description: "输出数据的类型。" - default: 'float64' - alternate_sign: - description: "布尔值,指示是否在哈希时使用交替符号。" + - name: "alternate_sign" + type: "bool" default: true + description: "指示是否在哈希时使用交替符号。" DictVectorizer: + description: "将符号表示的特征(如字典)转换为稀疏矩阵。" parameters: - dtype: + - name: "dtype" + type: "type" + default: "float64" description: "输出数据的类型。" - default: 'float64' - separator: + - name: "separator" + type: "str" + default: "=" description: "用于分隔特征名称的分隔符。" - default: '=' - sparse: - description: "布尔值,指示是否返回稀疏矩阵。" + - name: "sparse" + type: "bool" default: true - sort: - description: "布尔值,指示是否对特征名称排序。" + description: "指示是否返回稀疏矩阵。" + - name: "sort" + type: "bool" default: true + description: "指示是否对特征名称排序。" + PCA: - parameters: - n_components: - description: "要保留的主成分数量。可以是整数、浮点数或 'mle'。" - default: null - copy: - description: "布尔值,指示是否复制输入数据,或在原地进行变换。" - default: true - whiten: - description: "布尔值,指示是否对主成分进行白化。" - default: false - svd_solver: - description: "用于计算 SVD 的方法。可选值包括 'auto'、'full'、'arpack' 和 'randomized'。" - default: 'auto' - tol: - description: "奇异值分解的容差。" - default: 0.0 - iterated_power: - description: "用于随机化 SVD 的迭代次数。" - default: 'auto' - random_state: - description: "用于随机数生成的种子。" - default: null + description: "主成分分析,用于降维。" + parameters: + - name: "n_components" + type: "int, float, None or str" + default: null + description: "要保留的主成分数量。可以是整数、浮点数或 'mle'。" + - name: "copy" + type: "bool" + default: true + description: "指示是否复制输入数据,或在原地进行变换。" + - name: "whiten" + type: "bool" + default: false + description: "指示是否对主成分进行白化。" + - name: "svd_solver" + type: "str" + default: "auto" + description: "用于计算 SVD 的方法。可选值包括 'auto'、'full'、'arpack' 和 'randomized'。" + - name: "tol" + type: "float" + default: 0.0 + description: "奇异值分解的容差。" + - name: "iterated_power" + type: "int or 'auto'" + default: 'auto' + description: "用于随机化 SVD 的迭代次数。" + - name: "random_state" + type: "int or None" + default: null + description: "用于随机数生成的种子。" SelectKBest: + description: "选择最重要的 K 个特征。" parameters: - score_func: + - name: "score_func" + type: "callable" + default: "f_classif" description: "用于计算特征得分的函数。" - default: 'f_classif' - k: - description: "要选择的特征数量。" + - name: "k" + type: "int" default: 10 + description: "要选择的特征数量。" RFE: + description: "递归特征消除,用于选择最重要的特征。" parameters: - estimator: + - name: "estimator" + type: "object" + default: null description: "用于特征选择的基模型。" + - name: "n_features_to_select" + type: "int" default: null - n_features_to_select: description: "要选择的特征数量。" - default: null - step: - description: "每次迭代要移除的特征数量。" + - name: "step" + type: "int" default: 1 - verbose: - description: "控制冗长模式的整数。" + description: "每次迭代要移除的特征数量。" + - name: "verbose" + type: "int" default: 0 + description: "控制冗长模式的整数。" PolynomialFeatures: + description: "生成多项式特征,增加模型的非线性能力。" parameters: - degree: - description: "生成多项式特征的最高次数。" + - name: "degree" + type: "int" default: 2 - interaction_only: - description: "布尔值,指示是否仅包含交互项。" + description: "生成多项式特征的最高次数。" + - name: "interaction_only" + type: "bool" default: false - include_bias: - description: "布尔值,指示是否包含偏置列。" + description: "指示是否仅包含交互项。" + - name: "include_bias" + type: "bool" default: true - order: + description: "指示是否包含偏置列。" + - name: "order" + type: "str" + default: "C" description: "输出特征的顺序。可选值包括 'C' 和 'F'。" - default: 'C' OneHotEncoder: + description: "将分类特征转换为独热编码。" parameters: - categories: + - name: "categories" + type: "str or list or 'auto'" + default: "auto" description: "指定每个特征的类别。" - default: 'auto' - drop: - description: "指定要从每个特征中删除的类别。" + - name: "drop" + type: "str or array-like" default: null - sparse: - description: "布尔值,指示是否返回稀疏矩阵。" + description: "指定要从每个特征中删除的类别。" + - name: "sparse" + type: "bool" default: true - dtype: + description: "指示是否返回稀疏矩阵。" + - name: "dtype" + type: "type" + default: "float64" description: "输出数据的类型。" - default: 'float64' - handle_unknown: - description: "指定如何处理未知类别。可选值" + - name: "handle_unknown" + type: "str" + default: "error" + description: "指定如何处理未知类别。可选值包括 'error'(抛出异常)、'ignore'(忽略)。" + - name: "max_categories" + type: "int or None" + default: null + description: "在类别过多时,将类别限制为最大类别数量。" - + + + diff --git a/date_feature/parameter_new.yaml b/date_feature/parameter_new.yaml deleted file mode 100644 index 9bd9da1..0000000 --- a/date_feature/parameter_new.yaml +++ /dev/null @@ -1,255 +0,0 @@ -feature_engineering_methods_parameters: - - LabelEncoder: - description: "将分类标签编码为整数。" - parameters: [] - - KBinsDiscretizer: - description: "将连续数据分箱为离散数据。" - parameters: - - name: "n_bins" - type: "int or array-like" - default: 5 - description: "指定每个特征要分成的箱数。可以是单个整数,表示所有特征使用相同的箱数;也可以是形状为 (n_features,) 的数组,为每个特征指定不同的箱数。" - - name: "encode" - type: "str" - default: "onehot" - description: "指定离散化后输出的编码方式。可选值包括 'onehot'(独热编码)、'onehot-dense'(密集独热编码)、'ordinal'(序数编码)。" - - name: "strategy" - type: "str" - default: "quantile" - description: "定义分箱策略。可选值包括 'uniform'(均匀分箱)、'quantile'(分位数分箱)、'kmeans'(K-Means 聚类分箱)。" - - FunctionTransformer: - description: "对数据应用自定义函数进行转换。" - parameters: - - name: "func" - type: "callable" - default: null - description: "要应用于输入数据的函数。" - - name: "inverse_func" - type: "callable" - default: null - description: "func 的逆函数,如果存在。" - - name: "validate" - type: "bool" - default: false - description: "指示是否在转换前验证输入数据。" - - name: "accept_sparse" - type: "bool" - default: false - description: "指示是否接受稀疏矩阵作为输入。" - - name: "check_inverse" - type: "bool" - default: true - description: "指示在适合期间是否检查 func 和 inverse_func 是否互为逆函数。" - - name: "kw_args" - type: "dict" - default: null - description: "传递给 func 的其他关键字参数。" - - name: "inv_kw_args" - type: "dict" - default: null - description: "传递给 inverse_func 的其他关键字参数。" - - PowerTransformer: - description: "对数据进行幂变换以使其更符合正态分布。" - parameters: - - name: "method" - type: "str" - default: "yeo-johnson" - description: "指定变换方法。可选值包括 'yeo-johnson' 和 'box-cox'。" - - name: "standardize" - type: "bool" - default: true - description: "指示是否在变换后将数据标准化为零均值和单位方差。" - - name: "copy" - type: "bool" - default: true - description: "指示是否复制输入数据,或在原地进行变换。" - - QuantileTransformer: - description: "将数据转换为均匀分布或正态分布。" - parameters: - - name: "n_quantiles" - type: "int" - default: 1000 - description: "用于分位数变换的分位数数量。" - - name: "output_distribution" - type: "str" - default: "uniform" - description: "指定输出分布。可选值包括 'uniform' 和 'normal'。" - - name: "ignore_implicit_zeros" - type: "bool" - default: false - description: "指示是否忽略隐式零。" - - name: "subsample" - type: "int" - default: 100000 - description: "用于计算分位数的子样本大小。" - - name: "random_state" - type: "int or None" - default: null - description: "用于随机数生成的种子。" - - name: "copy" - type: "bool" - default: true - description: "指示是否复制输入数据,或在原地进行变换。" - - FeatureHasher: - description: "使用哈希技巧将特征映射到向量。" - parameters: - - name: "n_features" - type: "int" - default: 1048576 - description: "哈希空间的维度。" - - name: "input_type" - type: "str" - default: "dict" - description: "输入数据的类型。可选值包括 'dict' 和 'pair'。" - - name: "dtype" - type: "type" - default: "float64" - description: "输出数据的类型。" - - name: "alternate_sign" - type: "bool" - default: true - description: "指示是否在哈希时使用交替符号。" - - DictVectorizer: - description: "将符号表示的特征(如字典)转换为稀疏矩阵。" - parameters: - - name: "dtype" - type: "type" - default: "float64" - description: "输出数据的类型。" - - name: "separator" - type: "str" - default: "=" - description: "用于分隔特征名称的分隔符。" - - name: "sparse" - type: "bool" - default: true - description: "指示是否返回稀疏矩阵。" - - name: "sort" - type: "bool" - default: true - description: "指示是否对特征名称排序。" - - - PCA: - description: "主成分分析,用于降维。" - parameters: - - name: "n_components" - type: "int, float, None or str" - default: null - description: "要保留的主成分数量。可以是整数、浮点数或 'mle'。" - - name: "copy" - type: "bool" - default: true - description: "指示是否复制输入数据,或在原地进行变换。" - - name: "whiten" - type: "bool" - default: false - description: "指示是否对主成分进行白化。" - - name: "svd_solver" - type: "str" - default: "auto" - description: "用于计算 SVD 的方法。可选值包括 'auto'、'full'、'arpack' 和 'randomized'。" - - name: "tol" - type: "float" - default: 0.0 - description: "奇异值分解的容差。" - - name: "iterated_power" - type: "int or 'auto'" - default: 'auto' - description: "用于随机化 SVD 的迭代次数。" - - name: "random_state" - type: "int or None" - default: null - description: "用于随机数生成的种子。" - - SelectKBest: - description: "选择最重要的 K 个特征。" - parameters: - - name: "score_func" - type: "callable" - default: "f_classif" - description: "用于计算特征得分的函数。" - - name: "k" - type: "int" - default: 10 - description: "要选择的特征数量。" - - RFE: - description: "递归特征消除,用于选择最重要的特征。" - parameters: - - name: "estimator" - type: "object" - default: null - description: "用于特征选择的基模型。" - - name: "n_features_to_select" - type: "int" - default: null - description: "要选择的特征数量。" - - name: "step" - type: "int" - default: 1 - description: "每次迭代要移除的特征数量。" - - name: "verbose" - type: "int" - default: 0 - description: "控制冗长模式的整数。" - - PolynomialFeatures: - description: "生成多项式特征,增加模型的非线性能力。" - parameters: - - name: "degree" - type: "int" - default: 2 - description: "生成多项式特征的最高次数。" - - name: "interaction_only" - type: "bool" - default: false - description: "指示是否仅包含交互项。" - - name: "include_bias" - type: "bool" - default: true - description: "指示是否包含偏置列。" - - name: "order" - type: "str" - default: "C" - description: "输出特征的顺序。可选值包括 'C' 和 'F'。" - - OneHotEncoder: - description: "将分类特征转换为独热编码。" - parameters: - - name: "categories" - type: "str or list or 'auto'" - default: "auto" - description: "指定每个特征的类别。" - - name: "drop" - type: "str or array-like" - default: null - description: "指定要从每个特征中删除的类别。" - - name: "sparse" - type: "bool" - default: true - description: "指示是否返回稀疏矩阵。" - - name: "dtype" - type: "type" - default: "float64" - description: "输出数据的类型。" - - name: "handle_unknown" - type: "str" - default: "error" - description: "指定如何处理未知类别。可选值包括 'error'(抛出异常)、'ignore'(忽略)。" - - name: "max_categories" - type: "int or None" - default: null - description: "在类别过多时,将类别限制为最大类别数量。" - - - - - diff --git a/example_method_reader.py b/example_method_reader.py new file mode 100644 index 0000000..4f11c7c --- /dev/null +++ b/example_method_reader.py @@ -0,0 +1,14 @@ +from data_process.method_reader_date_process import MethodReader + +# 创建方法读取器实例 +reader = MethodReader() + +# 获取所有预处理方法 +methods = reader.get_preprocessing_methods() +print("预处理方法列表:") +print(methods) + +# 获取特定方法的详细信息 +method_details = reader.get_method_details('StandardScaler') +print("\nStandardScaler方法详情:") +print(method_details) \ No newline at end of file diff --git a/mlruns/0/meta.yaml b/mlruns/0/meta.yaml new file mode 100644 index 0000000..ee51c2d --- /dev/null +++ b/mlruns/0/meta.yaml @@ -0,0 +1,6 @@ +artifact_location: mlflow-artifacts:/0 +creation_time: 1739520200398 +experiment_id: '0' +last_update_time: 1739520200398 +lifecycle_stage: active +name: Default diff --git a/tests/test_method_reader.py b/tests/test_method_reader.py new file mode 100644 index 0000000..5cb3f51 --- /dev/null +++ b/tests/test_method_reader.py @@ -0,0 +1,49 @@ +import unittest +from data_process.method_reader_date_process import MethodReader + +class TestMethodReader(unittest.TestCase): + def setUp(self): + self.reader = MethodReader() + + def test_get_preprocessing_methods(self): + result = self.reader.get_preprocessing_methods() + self.assertEqual(result['status'], 'success') + self.assertIsInstance(result['methods'], list) + + # 检查返回的方法列表 + methods = result['methods'] + self.assertTrue(any(m['name'] == 'data_scaler' for m in methods)) + self.assertTrue(any(m['name'] == 'missing_value_handler' for m in methods)) + self.assertTrue(any(m['name'] == 'outlier_detector' for m in methods)) + + def test_get_method_details(self): + # 测试获取StandardScaler的详细信息 + result = self.reader.get_method_details('StandardScaler') + self.assertEqual(result['status'], 'success') + self.assertEqual(result['method']['name'], 'StandardScaler') + + # 检查返回的详细信息字段 + method = result['method'] + self.assertIn('description', method) + self.assertIn('principle', method) + self.assertIn('advantages', method) + self.assertIn('disadvantages', method) + self.assertIn('applicable_scenarios', method) + self.assertIn('parameters', method) + + # 检查参数信息 + parameters = method['parameters'] + self.assertIsInstance(parameters, list) + if parameters: + param = parameters[0] + self.assertIn('name', param) + self.assertIn('type', param) + self.assertIn('default', param) + self.assertIn('description', param) + + # 测试获取不存在的方法 + result = self.reader.get_method_details('NonExistentMethod') + self.assertEqual(result['status'], 'error') + +if __name__ == '__main__': + unittest.main() \ No newline at end of file