# Catalogue of feature-engineering methods.
#
# Layout: each key under `feature_engineering_methods` is a method name that
# maps to
#   description: human-readable summary (Chinese, shown to users as-is)
#   parameters:  list of {name, type, default, description} entries
#                (`[]` when the method takes no parameters)
#
# NOTE(review): method names and defaults appear to mirror scikit-learn
# estimators; verify them against the scikit-learn version actually installed.
# NOTE(review): `default: null` presumably stands for Python `None`; confirm
# how the consumer distinguishes "defaults to None" from "no default".
feature_engineering_methods:
  LabelEncoder:
    description: "将分类标签编码为整数。"
    parameters: []

  KBinsDiscretizer:
    description: "将连续数据分箱为离散数据。"
    parameters:
      - name: "n_bins"
        type: "int or array-like"
        default: 5
        description: "指定每个特征要分成的箱数。可以是单个整数,表示所有特征使用相同的箱数;也可以是形状为 (n_features,) 的数组,为每个特征指定不同的箱数。"
      - name: "encode"
        type: "str"
        default: "onehot"
        description: "指定离散化后输出的编码方式。可选值包括 'onehot'(独热编码)、'onehot-dense'(密集独热编码)、'ordinal'(序数编码)。"
      - name: "strategy"
        type: "str"
        default: "quantile"
        description: "定义分箱策略。可选值包括 'uniform'(均匀分箱)、'quantile'(分位数分箱)、'kmeans'(K-Means 聚类分箱)。"

  FunctionTransformer:
    description: "对数据应用自定义函数进行转换。"
    parameters:
      - name: "func"
        type: "callable"
        default: null
        description: "要应用于输入数据的函数。"
      - name: "inverse_func"
        type: "callable"
        default: null
        description: "func 的逆函数,如果存在。"
      - name: "validate"
        type: "bool"
        default: false
        description: "指示是否在转换前验证输入数据。"
      - name: "accept_sparse"
        type: "bool"
        default: false
        description: "指示是否接受稀疏矩阵作为输入。"
      - name: "check_inverse"
        type: "bool"
        default: true
        description: "指示在拟合期间是否检查 func 和 inverse_func 是否互为逆函数。"
      - name: "kw_args"
        type: "dict"
        default: null
        description: "传递给 func 的其他关键字参数。"
      - name: "inv_kw_args"
        type: "dict"
        default: null
        description: "传递给 inverse_func 的其他关键字参数。"

  PowerTransformer:
    description: "对数据进行幂变换以使其更符合正态分布。"
    parameters:
      - name: "method"
        type: "str"
        default: "yeo-johnson"
        description: "指定变换方法。可选值包括 'yeo-johnson' 和 'box-cox'。"
      - name: "standardize"
        type: "bool"
        default: true
        description: "指示是否在变换后将数据标准化为零均值和单位方差。"
      - name: "copy"
        type: "bool"
        default: true
        description: "指示是否复制输入数据,或在原地进行变换。"

  QuantileTransformer:
    description: "将数据转换为均匀分布或正态分布。"
    parameters:
      - name: "n_quantiles"
        type: "int"
        default: 1000
        description: "用于分位数变换的分位数数量。"
      - name: "output_distribution"
        type: "str"
        default: "uniform"
        description: "指定输出分布。可选值包括 'uniform' 和 'normal'。"
      - name: "ignore_implicit_zeros"
        type: "bool"
        default: false
        description: "指示是否忽略隐式零。"
      - name: "subsample"
        type: "int"
        default: 100000
        description: "用于计算分位数的子样本大小。"
      - name: "random_state"
        type: "int or None"
        default: null
        description: "用于随机数生成的种子。"
      - name: "copy"
        type: "bool"
        default: true
        description: "指示是否复制输入数据,或在原地进行变换。"

  FeatureHasher:
    description: "使用哈希技巧将特征映射到向量。"
    parameters:
      - name: "n_features"
        type: "int"
        default: 1048576
        description: "哈希空间的维度。"
      # NOTE(review): scikit-learn also documents a 'string' input type;
      # confirm whether it is intentionally left out of the description.
      - name: "input_type"
        type: "str"
        default: "dict"
        description: "输入数据的类型。可选值包括 'dict' 和 'pair'。"
      # The default is the dtype *name* as a string, not a type object.
      - name: "dtype"
        type: "type"
        default: "float64"
        description: "输出数据的类型。"
      - name: "alternate_sign"
        type: "bool"
        default: true
        description: "指示是否在哈希时使用交替符号。"

  DictVectorizer:
    description: "将符号表示的特征(如字典)转换为稀疏矩阵。"
    parameters:
      # The default is the dtype *name* as a string, not a type object.
      - name: "dtype"
        type: "type"
        default: "float64"
        description: "输出数据的类型。"
      - name: "separator"
        type: "str"
        default: "="
        description: "用于分隔特征名称的分隔符。"
      - name: "sparse"
        type: "bool"
        default: true
        description: "指示是否返回稀疏矩阵。"
      - name: "sort"
        type: "bool"
        default: true
        description: "指示是否对特征名称排序。"

  PCA:
    description: "主成分分析,用于降维。"
    parameters:
      - name: "n_components"
        type: "int, float, None or str"
        default: null
        description: "要保留的主成分数量。可以是整数、浮点数或 'mle'。"
      - name: "copy"
        type: "bool"
        default: true
        description: "指示是否复制输入数据,或在原地进行变换。"
      - name: "whiten"
        type: "bool"
        default: false
        description: "指示是否对主成分进行白化。"
      - name: "svd_solver"
        type: "str"
        default: "auto"
        description: "用于计算 SVD 的方法。可选值包括 'auto'、'full'、'arpack' 和 'randomized'。"
      - name: "tol"
        type: "float"
        default: 0.0
        description: "奇异值分解的容差。"
      - name: "iterated_power"
        type: "int or 'auto'"
        default: "auto"
        description: "用于随机化 SVD 的迭代次数。"
      - name: "random_state"
        type: "int or None"
        default: null
        description: "用于随机数生成的种子。"

  SelectKBest:
    description: "选择最重要的 K 个特征。"
    parameters:
      # The default is the scoring function's *name*; presumably the consumer
      # resolves it to the callable. Verify against the loader.
      - name: "score_func"
        type: "callable"
        default: "f_classif"
        description: "用于计算特征得分的函数。"
      - name: "k"
        type: "int"
        default: 10
        description: "要选择的特征数量。"

  RFE:
    description: "递归特征消除,用于选择最重要的特征。"
    parameters:
      # NOTE(review): in scikit-learn `estimator` is a required argument with
      # no default; `null` here likely means "must be supplied by the caller".
      - name: "estimator"
        type: "object"
        default: null
        description: "用于特征选择的基模型。"
      - name: "n_features_to_select"
        type: "int"
        default: null
        description: "要选择的特征数量。"
      - name: "step"
        type: "int"
        default: 1
        description: "每次迭代要移除的特征数量。"
      - name: "verbose"
        type: "int"
        default: 0
        description: "控制冗长模式的整数。"

  PolynomialFeatures:
    description: "生成多项式特征,增加模型的非线性能力。"
    parameters:
      - name: "degree"
        type: "int"
        default: 2
        description: "生成多项式特征的最高次数。"
      - name: "interaction_only"
        type: "bool"
        default: false
        description: "指示是否仅包含交互项。"
      - name: "include_bias"
        type: "bool"
        default: true
        description: "指示是否包含偏置列。"
      - name: "order"
        type: "str"
        default: "C"
        description: "输出特征的顺序。可选值包括 'C' 和 'F'。"

  OneHotEncoder:
    description: "将分类特征转换为独热编码。"
    parameters:
      - name: "categories"
        type: "str or list or 'auto'"
        default: "auto"
        description: "指定每个特征的类别。"
      - name: "drop"
        type: "str or array-like"
        default: null
        description: "指定要从每个特征中删除的类别。"
      # NOTE(review): newer scikit-learn releases rename this parameter to
      # `sparse_output`; confirm which name the targeted version accepts.
      - name: "sparse"
        type: "bool"
        default: true
        description: "指示是否返回稀疏矩阵。"
      # The default is the dtype *name* as a string, not a type object.
      - name: "dtype"
        type: "type"
        default: "float64"
        description: "输出数据的类型。"
      # NOTE(review): releases that support `max_categories` also document
      # 'infrequent_if_exist' for this option; confirm whether to list it.
      - name: "handle_unknown"
        type: "str"
        default: "error"
        description: "指定如何处理未知类别。可选值包括 'error'(抛出异常)、'ignore'(忽略)。"
      - name: "max_categories"
        type: "int or None"
        default: null
        description: "在类别过多时,将类别限制为最大类别数量。"