From f428efa7db5f5e6e29a449524dd58bde21c327e0 Mon Sep 17 00:00:00 2001 From: haotian <2421912570@qq.com> Date: Mon, 17 Feb 2025 15:18:25 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90--=E5=AE=8C=E6=88=90=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E6=95=B0=E6=8D=AE=E9=A2=84=E5=A4=84=E7=90=86=E6=96=B9?= =?UTF-8?q?=E6=B3=95=E5=88=97=E8=A1=A8,=E8=8E=B7=E5=8F=96=E6=96=B9?= =?UTF-8?q?=E6=B3=95=E8=AF=A6=E6=83=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../__pycache__/method_reader.cpython-39.pyc | Bin 0 -> 2955 bytes .../method_reader_date_process.cpython-39.pyc | Bin 0 -> 3757 bytes ...eader.py => method_reader_date_process.py} | 58 +++- date_feature/parameter.yaml | 279 +++++++++++------- date_feature/parameter_new.yaml | 255 ---------------- example_method_reader.py | 14 + mlruns/0/meta.yaml | 6 + tests/test_method_reader.py | 49 +++ 8 files changed, 289 insertions(+), 372 deletions(-) create mode 100644 data_process/__pycache__/method_reader.cpython-39.pyc create mode 100644 data_process/__pycache__/method_reader_date_process.cpython-39.pyc rename data_process/{method_reader.py => method_reader_date_process.py} (60%) delete mode 100644 date_feature/parameter_new.yaml create mode 100644 example_method_reader.py create mode 100644 mlruns/0/meta.yaml create mode 100644 tests/test_method_reader.py diff --git a/data_process/__pycache__/method_reader.cpython-39.pyc b/data_process/__pycache__/method_reader.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20a9060406d8603cd90302757b3773a267bd7fd6 GIT binary patch literal 2955 zcmaJ@-H#kc5%2E#+}-=K6B{lNXA?ri0=_eOLJ^8UaSRXkDZ&y~Mk2#xdv`dWkBca@MI@IaKdR!Jbbg5c9_RHhG)=X-1JO6 zZ+RBHjo9uuoSC-J4hUu3aC$d13s{`%OBt5p3gLo`5I24}>H` zD%ptU#5Zy>G)Dwlmi|K4`!pzwU%fQ`#p~lYE)>T1&2BVo<-}>b9VKnA(hlWGXoo_% zzMll0(D#*%voLaxAEyELJ9@W%Gfi4iTP@CYF>YBHx2_Pe06zcujrGm66Rrom6D3DQ zn#%RfAeB*&te-mhR2;}wDmv>tkb&P7X*0~S_0pWs!C33|aUW3fjMozhb zSUGRJdEXY3=FrL^x?9E>!flh7WE+r{tbB!J8^;LM_tPMA@|-<%xHF>A2Hn%3Dc$~M zO14Z&1`q4`3U_5SH@CGn)I3Mdkf*`!mieNYREzg9atuZQAC=f2ePiH7+5{oI{H-z zo*O33l!J|p!GouJO+1lSFOK_lJo%8c9!`y^+cj=vl8+X4(xCW#UG?N*1(2jWFTjGBZkAsj58rLNn~jC{2_VqBmt^QZy`4 z10Ance3jn?sw&#@>=KkT)bK0BB4~;V2*@ZTI}Cy_MqRj<*b4mufXQY{^e~-x;A%mz~CLa?u@RPo8mRCTj7AHsV=VpdKWIT5`R|zZ-74w@OP}l=c06T>l6I( zrJ~Q5agB{wT-mZ-v`&|Ajl~tf;(^|0;6wL*^n1uix8BKIUWyU}ffq+s5tXJmu!+xghpbHG);!~)Kw9xuu69>^w0uS;IlBiX1JfBLYv8jW0_BaTnj%ggGCfp3~f4k2A>w(5?*8+OM zI{WtEkyA6JZtZ`%gV@%Jjp`aC@F1HE-Iv!~LjMa44hEbaC*S=ZPz_4pb5IO(hD13_ zgpK{2t&*Isk}W9IpkWMkdLJ>}vf7}jTlRj-!4`Kf>VnN6wIi9Hx1mf_u}tvVB?Ak^ z8IpVpQoPasbD83iVo)sjg0yE-JYOvOd}*Iz%b4zgV*0HqwhRT{1q13fg(f6j{z!Gs+ zBp*a*l?W>La1u;pxB-ALbLm0I{&3N1asSJ_y;Qpw>p%h22a0bjYr%h8+h|mUgPW@Q zekbKUh@-C-eE<1g5SPDAj_>od>HA^{^aP?;S0T~B7BW~oiUR+WW`wR*2ye8WVLouF z$$&x!mcA@-yipPOi>Peb??RF(4CAk$VreF@Dk%r+VH`cH>#trk<3RgEFUtK|vGd3A NXR>s)q6^~!`wv$0M$-TQ literal 0 HcmV?d00001 diff --git a/data_process/__pycache__/method_reader_date_process.cpython-39.pyc b/data_process/__pycache__/method_reader_date_process.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e22f73599daca90a3dc564680be60c379661a1e GIT binary patch literal 3757 zcmds4ZI2t(6~1@gZSQ)$35gmgi_-)e(ZJmbAH zj;khDcD&?N2uxZ*?AV(;cB6sb|qxidEQcHsxqBcHi1 z=bn3g?s=Yb?9R^S34C7u6Ok?^wVfvb4Ta2C~72^)rBGrl_v?u&pBDB+foK&dL&brBR z4d*65;}bjuqaLv-Beue*tulqF>%`VPooQDn)0wfZ*+$kgnFT#F>*ZJ;dX|@ie&LG3 zX4owB^Q;J`nG-W-ykt3IPkAo$cpJPxh8rIbKX~WP%a^uqU*EZLb8vZM@T>Q0bmCap z#*_UBzVT5I3F*^B*`z(QH%uQUU z#iN=R$BWs68rAJ6yT_RLI&w=cbpBjCWnVS^apnDE96RUHLl2wh8?om0PV^)&y zX=`#f)VxTZCC`D~RqcWn=F|5QS%MJ=v0QrXulILu-<}Mu;l`_<-uz2B*DjtpeA;)xz*z1c=6B{uJz8jgnKivs@{( zsO5!1!-Lwn*sXgq?6`%?G(W6IBKPErHLsQUQ7Ci|cN1!y@GYIsf{tBCf0fe$>OwZ^X(L@$ z0GgV)qGUv)cRm54(Sc}|G9sEyte*Zk(Xe9LXLGp5K+-9#>KF8L*;|rM2}ozYeINYL zonL$i+3U`)*9X_ux8HgtWuL)ozueh;U+kA89@pFevO`J>v0a!LY~$@iWDPG}-@dgq zeEV;yk%}rbOh}@@ytu~w6x&8>K`b==I0i0p&bvX|bC%tZ1=t;JaA1#lNV^E}?OT5x zT>InTrCaGf;!xC10v`^?Ac578LB>`}(^FAuU~uc=;G>Vg!0& z3&U*z6*B%+R7L)i`g{Re6+@C8e;C?!ahJ#EMh+QyXm0e|vH^~G*o6~u2sVZ+J8|fd zZyo{d_;C=(k(zptYVaud{KqoCtOpu*EgcXB>x{eIht5o<=P%q2d|OBlDytMe4;$AM z+&=pt_z=}Hb1y;R)cpqom@(xE^8B$?$kGt#P(1Zi1j~8C^VsjHhe(eeBC84|E03<^ z0f@9s_-V-Tu;M)^DzB=T<-rmFb;6Q5Vd-hG1Q5+ELH1s`u<|rO4=rtsE$Nvh15Tp# zDTC5+6G#ABkma*hz5!>GV})^RMz(OpEVfwjikX5^Rt(x(wDa%;l=Z&BN{KlI<@vPF z_Kl%TGbqcqgfbS)``Z`+DU^qAzB;)6Zi?gWch`4r{C4~1E1&*-bNKTc)r!i$28#S4 z6ki9?HPbsRcki3182oTL&vzH5C&yHp(K$bc_OQqmMC4lB4{Lra@I>Ba=iM-I8(u7C znIBJd?*S|Xe$9O$@SM2jg)aA_Smei+iW+)adsZ%`-rlBMrKmXS5TC=4Q5+6u! zF{^6+cb96Vz)jdg ziQ@B&EL*bp!txuph|!4_)fCGp&KFC?Z*rWeGm?HrQ6AQe+ fe?b@Rh9eW|aQ|)DI+Z literal 0 HcmV?d00001 diff --git a/data_process/method_reader.py b/data_process/method_reader_date_process.py similarity index 60% rename from data_process/method_reader.py rename to data_process/method_reader_date_process.py index 3d01904..378a4e3 100644 --- a/data_process/method_reader.py +++ b/data_process/method_reader_date_process.py @@ -11,6 +11,7 @@ class MethodReader: """初始化方法读取器""" self.logger = logging.getLogger(__name__) self.method_config = self._load_method_config() + self.parameter_config = self._load_parameter_config() def _load_method_config(self) -> Dict: """加载方法配置文件""" @@ -29,6 +30,22 @@ class MethodReader: self.logger.error(f"Error loading method config: {str(e)}") raise + def _load_parameter_config(self) -> Dict: + """加载参数配置文件""" + try: + config_path = Path('date_preprocessing/parameter.yaml') + if not config_path.exists(): + raise FileNotFoundError(f"Parameter config file not found at {config_path}") + + with open(config_path, 'r', encoding='utf-8') as f: + config = yaml.safe_load(f) + + self.logger.info("Successfully loaded parameter config") + return config + except Exception as e: + self.logger.error(f"Error loading parameter config: {str(e)}") + raise + def get_preprocessing_methods(self) -> Dict: """获取预处理方法列表""" try: @@ -76,22 +93,39 @@ class MethodReader: def get_method_details(self, method_name: str) -> Dict: """获取指定方法的详细信息""" try: - # 在各个方法类别中查找 + # 在各个方法类别中查找方法原理和优缺点 + method_info = None for category in ['data_scaler_methods', 'missing_value_handling_methods', 'outlier_detection_methods']: if method_name in self.method_config.get(category, {}): method_info = self.method_config[category][method_name] - return { - "status": "success", - "method": { - "name": method_name, - "principle": method_info.get('principle', ''), - "advantages": method_info.get('advantages', []), - "disadvantages": method_info.get('disadvantages', []), - "applicable_scenarios": method_info.get('applicable_scenarios', []) - } - } + break - raise ValueError(f"Method {method_name} not found") + if method_info is None: + raise ValueError(f"Method {method_name} not found in method config") + + # 查找方法参数信息 + parameter_info = None + for category in ['data_scaler_methods', 'missing_value_handling_methods', 'outlier_detection_methods']: + if method_name in self.parameter_config.get(category, {}): + parameter_info = self.parameter_config[category][method_name] + break + + if parameter_info is None: + raise ValueError(f"Method {method_name} not found in parameter config") + + # 组合返回信息 + return { + "status": "success", + "method": { + "name": method_name, + "description": parameter_info.get('description', ''), + "principle": method_info.get('principle', ''), + "advantages": method_info.get('advantages', []), + "disadvantages": method_info.get('disadvantages', []), + "applicable_scenarios": method_info.get('applicable_scenarios', []), + "parameters": parameter_info.get('parameters', []) + } + } except Exception as e: self.logger.error(f"Error getting method details: {str(e)}") diff --git a/date_feature/parameter.yaml b/date_feature/parameter.yaml index 7155848..9bd9da1 100644 --- a/date_feature/parameter.yaml +++ b/date_feature/parameter.yaml @@ -1,186 +1,255 @@ feature_engineering_methods_parameters: LabelEncoder: - parameters: {} + description: "将分类标签编码为整数。" + parameters: [] KBinsDiscretizer: + description: "将连续数据分箱为离散数据。" parameters: - n_bins: - description: "指定每个特征要分成的箱数。可以是单个整数,表示所有特征使用相同的箱数;也可以是形状为 (n_features,) 的数组,为每个特征指定不同的箱数。" + - name: "n_bins" + type: "int or array-like" default: 5 - encode: + description: "指定每个特征要分成的箱数。可以是单个整数,表示所有特征使用相同的箱数;也可以是形状为 (n_features,) 的数组,为每个特征指定不同的箱数。" + - name: "encode" + type: "str" + default: "onehot" description: "指定离散化后输出的编码方式。可选值包括 'onehot'(独热编码)、'onehot-dense'(密集独热编码)、'ordinal'(序数编码)。" - default: 'onehot' - strategy: + - name: "strategy" + type: "str" + default: "quantile" description: "定义分箱策略。可选值包括 'uniform'(均匀分箱)、'quantile'(分位数分箱)、'kmeans'(K-Means 聚类分箱)。" - default: 'quantile' FunctionTransformer: + description: "对数据应用自定义函数进行转换。" parameters: - func: + - name: "func" + type: "callable" + default: null description: "要应用于输入数据的函数。" + - name: "inverse_func" + type: "callable" default: null - inverse_func: description: "func 的逆函数,如果存在。" - default: null - validate: - description: "布尔值,指示是否在转换前验证输入数据。" + - name: "validate" + type: "bool" default: false - accept_sparse: - description: "布尔值,指示是否接受稀疏矩阵作为输入。" + description: "指示是否在转换前验证输入数据。" + - name: "accept_sparse" + type: "bool" default: false - check_inverse: - description: "布尔值,指示在适合期间是否检查 func 和 inverse_func 是否互为逆函数。" + description: "指示是否接受稀疏矩阵作为输入。" + - name: "check_inverse" + type: "bool" default: true - kw_args: + description: "指示在适合期间是否检查 func 和 inverse_func 是否互为逆函数。" + - name: "kw_args" + type: "dict" + default: null description: "传递给 func 的其他关键字参数。" + - name: "inv_kw_args" + type: "dict" default: null - inv_kw_args: description: "传递给 inverse_func 的其他关键字参数。" - default: null PowerTransformer: + description: "对数据进行幂变换以使其更符合正态分布。" parameters: - method: + - name: "method" + type: "str" + default: "yeo-johnson" description: "指定变换方法。可选值包括 'yeo-johnson' 和 'box-cox'。" - default: 'yeo-johnson' - standardize: - description: "布尔值,指示是否在变换后将数据标准化为零均值和单位方差。" + - name: "standardize" + type: "bool" default: true - copy: - description: "布尔值,指示是否复制输入数据,或在原地进行变换。" + description: "指示是否在变换后将数据标准化为零均值和单位方差。" + - name: "copy" + type: "bool" default: true + description: "指示是否复制输入数据,或在原地进行变换。" QuantileTransformer: + description: "将数据转换为均匀分布或正态分布。" parameters: - n_quantiles: - description: "用于分位数变换的分位数数量。" + - name: "n_quantiles" + type: "int" default: 1000 - output_distribution: + description: "用于分位数变换的分位数数量。" + - name: "output_distribution" + type: "str" + default: "uniform" description: "指定输出分布。可选值包括 'uniform' 和 'normal'。" - default: 'uniform' - ignore_implicit_zeros: - description: "布尔值,指示是否忽略隐式零。" + - name: "ignore_implicit_zeros" + type: "bool" default: false - subsample: + description: "指示是否忽略隐式零。" + - name: "subsample" + type: "int" + default: 100000 description: "用于计算分位数的子样本大小。" - default: 1e5 - random_state: - description: "用于随机数生成的种子。" + - name: "random_state" + type: "int or None" default: null - copy: - description: "布尔值,指示是否复制输入数据,或在原地进行变换。" + description: "用于随机数生成的种子。" + - name: "copy" + type: "bool" default: true + description: "指示是否复制输入数据,或在原地进行变换。" FeatureHasher: + description: "使用哈希技巧将特征映射到向量。" parameters: - n_features: - description: "哈希空间的维度。" + - name: "n_features" + type: "int" default: 1048576 - input_type: + description: "哈希空间的维度。" + - name: "input_type" + type: "str" + default: "dict" description: "输入数据的类型。可选值包括 'dict' 和 'pair'。" - default: 'dict' - dtype: + - name: "dtype" + type: "type" + default: "float64" description: "输出数据的类型。" - default: 'float64' - alternate_sign: - description: "布尔值,指示是否在哈希时使用交替符号。" + - name: "alternate_sign" + type: "bool" default: true + description: "指示是否在哈希时使用交替符号。" DictVectorizer: + description: "将符号表示的特征(如字典)转换为稀疏矩阵。" parameters: - dtype: + - name: "dtype" + type: "type" + default: "float64" description: "输出数据的类型。" - default: 'float64' - separator: + - name: "separator" + type: "str" + default: "=" description: "用于分隔特征名称的分隔符。" - default: '=' - sparse: - description: "布尔值,指示是否返回稀疏矩阵。" + - name: "sparse" + type: "bool" default: true - sort: - description: "布尔值,指示是否对特征名称排序。" + description: "指示是否返回稀疏矩阵。" + - name: "sort" + type: "bool" default: true + description: "指示是否对特征名称排序。" + PCA: - parameters: - n_components: - description: "要保留的主成分数量。可以是整数、浮点数或 'mle'。" - default: null - copy: - description: "布尔值,指示是否复制输入数据,或在原地进行变换。" - default: true - whiten: - description: "布尔值,指示是否对主成分进行白化。" - default: false - svd_solver: - description: "用于计算 SVD 的方法。可选值包括 'auto'、'full'、'arpack' 和 'randomized'。" - default: 'auto' - tol: - description: "奇异值分解的容差。" - default: 0.0 - iterated_power: - description: "用于随机化 SVD 的迭代次数。" - default: 'auto' - random_state: - description: "用于随机数生成的种子。" - default: null + description: "主成分分析,用于降维。" + parameters: + - name: "n_components" + type: "int, float, None or str" + default: null + description: "要保留的主成分数量。可以是整数、浮点数或 'mle'。" + - name: "copy" + type: "bool" + default: true + description: "指示是否复制输入数据,或在原地进行变换。" + - name: "whiten" + type: "bool" + default: false + description: "指示是否对主成分进行白化。" + - name: "svd_solver" + type: "str" + default: "auto" + description: "用于计算 SVD 的方法。可选值包括 'auto'、'full'、'arpack' 和 'randomized'。" + - name: "tol" + type: "float" + default: 0.0 + description: "奇异值分解的容差。" + - name: "iterated_power" + type: "int or 'auto'" + default: 'auto' + description: "用于随机化 SVD 的迭代次数。" + - name: "random_state" + type: "int or None" + default: null + description: "用于随机数生成的种子。" SelectKBest: + description: "选择最重要的 K 个特征。" parameters: - score_func: + - name: "score_func" + type: "callable" + default: "f_classif" description: "用于计算特征得分的函数。" - default: 'f_classif' - k: - description: "要选择的特征数量。" + - name: "k" + type: "int" default: 10 + description: "要选择的特征数量。" RFE: + description: "递归特征消除,用于选择最重要的特征。" parameters: - estimator: + - name: "estimator" + type: "object" + default: null description: "用于特征选择的基模型。" + - name: "n_features_to_select" + type: "int" default: null - n_features_to_select: description: "要选择的特征数量。" - default: null - step: - description: "每次迭代要移除的特征数量。" + - name: "step" + type: "int" default: 1 - verbose: - description: "控制冗长模式的整数。" + description: "每次迭代要移除的特征数量。" + - name: "verbose" + type: "int" default: 0 + description: "控制冗长模式的整数。" PolynomialFeatures: + description: "生成多项式特征,增加模型的非线性能力。" parameters: - degree: - description: "生成多项式特征的最高次数。" + - name: "degree" + type: "int" default: 2 - interaction_only: - description: "布尔值,指示是否仅包含交互项。" + description: "生成多项式特征的最高次数。" + - name: "interaction_only" + type: "bool" default: false - include_bias: - description: "布尔值,指示是否包含偏置列。" + description: "指示是否仅包含交互项。" + - name: "include_bias" + type: "bool" default: true - order: + description: "指示是否包含偏置列。" + - name: "order" + type: "str" + default: "C" description: "输出特征的顺序。可选值包括 'C' 和 'F'。" - default: 'C' OneHotEncoder: + description: "将分类特征转换为独热编码。" parameters: - categories: + - name: "categories" + type: "str or list or 'auto'" + default: "auto" description: "指定每个特征的类别。" - default: 'auto' - drop: - description: "指定要从每个特征中删除的类别。" + - name: "drop" + type: "str or array-like" default: null - sparse: - description: "布尔值,指示是否返回稀疏矩阵。" + description: "指定要从每个特征中删除的类别。" + - name: "sparse" + type: "bool" default: true - dtype: + description: "指示是否返回稀疏矩阵。" + - name: "dtype" + type: "type" + default: "float64" description: "输出数据的类型。" - default: 'float64' - handle_unknown: - description: "指定如何处理未知类别。可选值" + - name: "handle_unknown" + type: "str" + default: "error" + description: "指定如何处理未知类别。可选值包括 'error'(抛出异常)、'ignore'(忽略)。" + - name: "max_categories" + type: "int or None" + default: null + description: "在类别过多时,将类别限制为最大类别数量。" - + + + diff --git a/date_feature/parameter_new.yaml b/date_feature/parameter_new.yaml deleted file mode 100644 index 9bd9da1..0000000 --- a/date_feature/parameter_new.yaml +++ /dev/null @@ -1,255 +0,0 @@ -feature_engineering_methods_parameters: - - LabelEncoder: - description: "将分类标签编码为整数。" - parameters: [] - - KBinsDiscretizer: - description: "将连续数据分箱为离散数据。" - parameters: - - name: "n_bins" - type: "int or array-like" - default: 5 - description: "指定每个特征要分成的箱数。可以是单个整数,表示所有特征使用相同的箱数;也可以是形状为 (n_features,) 的数组,为每个特征指定不同的箱数。" - - name: "encode" - type: "str" - default: "onehot" - description: "指定离散化后输出的编码方式。可选值包括 'onehot'(独热编码)、'onehot-dense'(密集独热编码)、'ordinal'(序数编码)。" - - name: "strategy" - type: "str" - default: "quantile" - description: "定义分箱策略。可选值包括 'uniform'(均匀分箱)、'quantile'(分位数分箱)、'kmeans'(K-Means 聚类分箱)。" - - FunctionTransformer: - description: "对数据应用自定义函数进行转换。" - parameters: - - name: "func" - type: "callable" - default: null - description: "要应用于输入数据的函数。" - - name: "inverse_func" - type: "callable" - default: null - description: "func 的逆函数,如果存在。" - - name: "validate" - type: "bool" - default: false - description: "指示是否在转换前验证输入数据。" - - name: "accept_sparse" - type: "bool" - default: false - description: "指示是否接受稀疏矩阵作为输入。" - - name: "check_inverse" - type: "bool" - default: true - description: "指示在适合期间是否检查 func 和 inverse_func 是否互为逆函数。" - - name: "kw_args" - type: "dict" - default: null - description: "传递给 func 的其他关键字参数。" - - name: "inv_kw_args" - type: "dict" - default: null - description: "传递给 inverse_func 的其他关键字参数。" - - PowerTransformer: - description: "对数据进行幂变换以使其更符合正态分布。" - parameters: - - name: "method" - type: "str" - default: "yeo-johnson" - description: "指定变换方法。可选值包括 'yeo-johnson' 和 'box-cox'。" - - name: "standardize" - type: "bool" - default: true - description: "指示是否在变换后将数据标准化为零均值和单位方差。" - - name: "copy" - type: "bool" - default: true - description: "指示是否复制输入数据,或在原地进行变换。" - - QuantileTransformer: - description: "将数据转换为均匀分布或正态分布。" - parameters: - - name: "n_quantiles" - type: "int" - default: 1000 - description: "用于分位数变换的分位数数量。" - - name: "output_distribution" - type: "str" - default: "uniform" - description: "指定输出分布。可选值包括 'uniform' 和 'normal'。" - - name: "ignore_implicit_zeros" - type: "bool" - default: false - description: "指示是否忽略隐式零。" - - name: "subsample" - type: "int" - default: 100000 - description: "用于计算分位数的子样本大小。" - - name: "random_state" - type: "int or None" - default: null - description: "用于随机数生成的种子。" - - name: "copy" - type: "bool" - default: true - description: "指示是否复制输入数据,或在原地进行变换。" - - FeatureHasher: - description: "使用哈希技巧将特征映射到向量。" - parameters: - - name: "n_features" - type: "int" - default: 1048576 - description: "哈希空间的维度。" - - name: "input_type" - type: "str" - default: "dict" - description: "输入数据的类型。可选值包括 'dict' 和 'pair'。" - - name: "dtype" - type: "type" - default: "float64" - description: "输出数据的类型。" - - name: "alternate_sign" - type: "bool" - default: true - description: "指示是否在哈希时使用交替符号。" - - DictVectorizer: - description: "将符号表示的特征(如字典)转换为稀疏矩阵。" - parameters: - - name: "dtype" - type: "type" - default: "float64" - description: "输出数据的类型。" - - name: "separator" - type: "str" - default: "=" - description: "用于分隔特征名称的分隔符。" - - name: "sparse" - type: "bool" - default: true - description: "指示是否返回稀疏矩阵。" - - name: "sort" - type: "bool" - default: true - description: "指示是否对特征名称排序。" - - - PCA: - description: "主成分分析,用于降维。" - parameters: - - name: "n_components" - type: "int, float, None or str" - default: null - description: "要保留的主成分数量。可以是整数、浮点数或 'mle'。" - - name: "copy" - type: "bool" - default: true - description: "指示是否复制输入数据,或在原地进行变换。" - - name: "whiten" - type: "bool" - default: false - description: "指示是否对主成分进行白化。" - - name: "svd_solver" - type: "str" - default: "auto" - description: "用于计算 SVD 的方法。可选值包括 'auto'、'full'、'arpack' 和 'randomized'。" - - name: "tol" - type: "float" - default: 0.0 - description: "奇异值分解的容差。" - - name: "iterated_power" - type: "int or 'auto'" - default: 'auto' - description: "用于随机化 SVD 的迭代次数。" - - name: "random_state" - type: "int or None" - default: null - description: "用于随机数生成的种子。" - - SelectKBest: - description: "选择最重要的 K 个特征。" - parameters: - - name: "score_func" - type: "callable" - default: "f_classif" - description: "用于计算特征得分的函数。" - - name: "k" - type: "int" - default: 10 - description: "要选择的特征数量。" - - RFE: - description: "递归特征消除,用于选择最重要的特征。" - parameters: - - name: "estimator" - type: "object" - default: null - description: "用于特征选择的基模型。" - - name: "n_features_to_select" - type: "int" - default: null - description: "要选择的特征数量。" - - name: "step" - type: "int" - default: 1 - description: "每次迭代要移除的特征数量。" - - name: "verbose" - type: "int" - default: 0 - description: "控制冗长模式的整数。" - - PolynomialFeatures: - description: "生成多项式特征,增加模型的非线性能力。" - parameters: - - name: "degree" - type: "int" - default: 2 - description: "生成多项式特征的最高次数。" - - name: "interaction_only" - type: "bool" - default: false - description: "指示是否仅包含交互项。" - - name: "include_bias" - type: "bool" - default: true - description: "指示是否包含偏置列。" - - name: "order" - type: "str" - default: "C" - description: "输出特征的顺序。可选值包括 'C' 和 'F'。" - - OneHotEncoder: - description: "将分类特征转换为独热编码。" - parameters: - - name: "categories" - type: "str or list or 'auto'" - default: "auto" - description: "指定每个特征的类别。" - - name: "drop" - type: "str or array-like" - default: null - description: "指定要从每个特征中删除的类别。" - - name: "sparse" - type: "bool" - default: true - description: "指示是否返回稀疏矩阵。" - - name: "dtype" - type: "type" - default: "float64" - description: "输出数据的类型。" - - name: "handle_unknown" - type: "str" - default: "error" - description: "指定如何处理未知类别。可选值包括 'error'(抛出异常)、'ignore'(忽略)。" - - name: "max_categories" - type: "int or None" - default: null - description: "在类别过多时,将类别限制为最大类别数量。" - - - - - diff --git a/example_method_reader.py b/example_method_reader.py new file mode 100644 index 0000000..4f11c7c --- /dev/null +++ b/example_method_reader.py @@ -0,0 +1,14 @@ +from data_process.method_reader_date_process import MethodReader + +# 创建方法读取器实例 +reader = MethodReader() + +# 获取所有预处理方法 +methods = reader.get_preprocessing_methods() +print("预处理方法列表:") +print(methods) + +# 获取特定方法的详细信息 +method_details = reader.get_method_details('StandardScaler') +print("\nStandardScaler方法详情:") +print(method_details) \ No newline at end of file diff --git a/mlruns/0/meta.yaml b/mlruns/0/meta.yaml new file mode 100644 index 0000000..ee51c2d --- /dev/null +++ b/mlruns/0/meta.yaml @@ -0,0 +1,6 @@ +artifact_location: mlflow-artifacts:/0 +creation_time: 1739520200398 +experiment_id: '0' +last_update_time: 1739520200398 +lifecycle_stage: active +name: Default diff --git a/tests/test_method_reader.py b/tests/test_method_reader.py new file mode 100644 index 0000000..5cb3f51 --- /dev/null +++ b/tests/test_method_reader.py @@ -0,0 +1,49 @@ +import unittest +from data_process.method_reader_date_process import MethodReader + +class TestMethodReader(unittest.TestCase): + def setUp(self): + self.reader = MethodReader() + + def test_get_preprocessing_methods(self): + result = self.reader.get_preprocessing_methods() + self.assertEqual(result['status'], 'success') + self.assertIsInstance(result['methods'], list) + + # 检查返回的方法列表 + methods = result['methods'] + self.assertTrue(any(m['name'] == 'data_scaler' for m in methods)) + self.assertTrue(any(m['name'] == 'missing_value_handler' for m in methods)) + self.assertTrue(any(m['name'] == 'outlier_detector' for m in methods)) + + def test_get_method_details(self): + # 测试获取StandardScaler的详细信息 + result = self.reader.get_method_details('StandardScaler') + self.assertEqual(result['status'], 'success') + self.assertEqual(result['method']['name'], 'StandardScaler') + + # 检查返回的详细信息字段 + method = result['method'] + self.assertIn('description', method) + self.assertIn('principle', method) + self.assertIn('advantages', method) + self.assertIn('disadvantages', method) + self.assertIn('applicable_scenarios', method) + self.assertIn('parameters', method) + + # 检查参数信息 + parameters = method['parameters'] + self.assertIsInstance(parameters, list) + if parameters: + param = parameters[0] + self.assertIn('name', param) + self.assertIn('type', param) + self.assertIn('default', param) + self.assertIn('description', param) + + # 测试获取不存在的方法 + result = self.reader.get_method_details('NonExistentMethod') + self.assertEqual(result['status'], 'error') + +if __name__ == '__main__': + unittest.main() \ No newline at end of file