---
# Catalogue of data-preparation components, grouped into three sections:
#   data_scaler_methods            - feature scaling / normalisation
#   missing_value_handling_methods - imputation of missing values
#   outlier_detection_methods      - anomaly / outlier detectors
#
# Each component maps to a human-readable `description` and a `parameters`
# list. Every parameter entry carries four string fields:
#   name        - parameter identifier
#   type        - human-readable type description
#   default     - default value, stored as a quoted string (e.g. "True")
#   description - human-readable explanation
#
# NOTE(review): component and parameter names appear to mirror scikit-learn
# estimators and their constructor arguments; the corrections marked below
# rely on that reading - confirm against the code that consumes this file.

data_scaler_methods:
  StandardScaler:
    description: "标准化特征,使其均值为0,标准差为1。"
    parameters:
      - name: "copy"
        type: "bool"
        default: "True"
        description: "是否复制数据,若为False,则会对原始数据进行修改。"
      - name: "with_mean"
        type: "bool"
        default: "True"
        description: "是否去均值处理。如果为False,则不做均值化处理。"
      - name: "with_std"
        type: "bool"
        default: "True"
        description: "是否按标准差缩放。如果为False,则不做标准差处理。"

  MinMaxScaler:
    description: "将特征缩放到指定范围,通常是[0, 1],保持原始数据比例。"
    parameters:
      - name: "feature_range"
        type: "tuple"
        default: "(0, 1)"
        description: "输出范围,控制转换后数据的最小值和最大值。"
      - name: "copy"
        type: "bool"
        default: "True"
        description: "是否复制数据,若为False,则会对原始数据进行修改。"

  RobustScaler:
    description: "使用中位数和四分位距进行缩放,适用于包含异常值的数据。"
    parameters:
      # Renamed from "center": the scikit-learn argument is `with_centering`.
      - name: "with_centering"
        type: "bool"
        default: "True"
        description: "是否使用中位数进行中心化处理。如果为False,则不进行中心化。"
      # Renamed from "scale": the scikit-learn argument is `with_scaling`.
      - name: "with_scaling"
        type: "bool"
        default: "True"
        description: "是否使用四分位距进行缩放。如果为False,则不进行缩放。"
      - name: "copy"
        type: "bool"
        default: "True"
        description: "是否复制数据,若为False,则会对原始数据进行修改。"

  Normalizer:
    description: "对样本(而非特征)进行归一化处理,使每个样本的范数为1。"
    parameters:
      - name: "norm"
        type: "str"
        default: "'l2'"
        description: "使用的归一化范数。'l1'、'l2'、'max'。'l2'表示L2范数(默认),'l1'表示L1范数,'max'表示最大值归一化。"
      - name: "copy"
        type: "bool"
        default: "True"
        description: "是否复制数据,若为False,则会对原始数据进行修改。"

  Binarizer:
    description: "将数据二值化,根据阈值将特征值大于该阈值的设为1,否则设为0。"
    parameters:
      - name: "threshold"
        type: "float"
        default: "0.0"
        description: "二值化的阈值。大于该值的样本会被设置为1,其他为0。"
      - name: "copy"
        type: "bool"
        default: "True"
        description: "是否复制数据,若为False,则会对原始数据进行修改。"

missing_value_handling_methods:
  SimpleImputer:
    description: "使用统计方法(如均值、中位数、众数)或常数值填充缺失值。"
    parameters:
      - name: "missing_values"
        type: "int, float, str, np.nan 或 None"
        default: "np.nan"
        description: "指定需要填充的缺失值。"
      - name: "strategy"
        type: "str"
        default: "'mean'"
        description: "填充策略,可选值为 'mean'(均值)、'median'(中位数)、'most_frequent'(众数)和 'constant'(常数)。"
      - name: "fill_value"
        type: "str 或 数值"
        default: "None"
        description: "当 strategy='constant' 时,指定用于填充的常数值。"
      # NOTE(review): recent scikit-learn releases no longer accept `verbose`
      # on SimpleImputer - confirm against the library version in use.
      - name: "verbose"
        type: "int"
        default: "0"
        description: "控制冗长度,0 表示不输出信息。"
      - name: "copy"
        type: "bool"
        default: "True"
        description: "是否创建数据的副本进行填充,False 则在原数据上进行填充。"
      - name: "add_indicator"
        type: "bool"
        default: "False"
        description: "是否添加缺失值指示器特征,标记缺失值的位置。"

  IterativeImputer:
    description: "使用多重插补方法,基于其他特征预测缺失值。"
    parameters:
      - name: "estimator"
        type: "对象"
        default: "BayesianRidge()"
        description: "用于预测的估计器对象。"
      - name: "missing_values"
        type: "数值"
        default: "np.nan"
        description: "表示缺失值的占位符。"
      - name: "max_iter"
        type: "int"
        default: "10"
        description: "插补过程的最大迭代次数。"
      - name: "tol"
        type: "float"
        default: "1e-3"
        description: "早停的容忍度。"
      - name: "n_nearest_features"
        type: "int"
        default: "None"
        description: "用于预测的最近特征数量。"
      - name: "initial_strategy"
        type: "str"
        default: "'mean'"
        description: "初始插补的策略。"
      - name: "imputation_order"
        type: "str"
        default: "'ascending'"
        description: "插补的顺序。"
      - name: "skip_complete"
        type: "bool"
        default: "False"
        description: "是否跳过没有缺失值的特征。"
      - name: "min_value"
        type: "float 或 array-like"
        default: "None"
        description: "每个特征的最小可接受值。"
      - name: "max_value"
        type: "float 或 array-like"
        default: "None"
        description: "每个特征的最大可接受值。"
      - name: "verbose"
        type: "int"
        default: "0"
        description: "控制冗长度。"
      - name: "random_state"
        type: "int, RandomState 实例或 None"
        default: "None"
        description: "随机数生成器的种子。"
      - name: "add_indicator"
        type: "bool"
        default: "False"
        description: "是否添加缺失值指示器特征。"

  KNNImputer:
    description: "基于k近邻算法,用相似样本的值填充缺失值。"
    parameters:
      - name: "missing_values"
        type: "数值"
        default: "np.nan"
        description: "表示缺失值的占位符。"
      - name: "n_neighbors"
        type: "int"
        default: "5"
        description: "用于插补的邻居数量。"
      - name: "weights"
        type: "str 或 callable"
        default: "'uniform'"
        description: "权重函数,可选 'uniform'、'distance' 或自定义函数。"
      - name: "metric"
        type: "str 或 callable"
        default: "'nan_euclidean'"
        description: "距离度量方式。"
      - name: "copy"
        type: "bool"
        default: "True"
        description: "是否创建数据的副本进行填充。"
      - name: "add_indicator"
        type: "bool"
        default: "False"
        description: "是否添加缺失值指示器特征。"

  MissingIndicator:
    description: "生成指示器变量,标记数据中缺失值的位置。"
    parameters:
      - name: "missing_values"
        type: "数值"
        default: "np.nan"
        description: "表示缺失值的占位符。"
      - name: "features"
        type: "str"
        default: "'missing-only'"
        description: "指示器特征的范围,可选 'missing-only' 或 'all'。"
      # Corrected: the scikit-learn default is 'auto' (was listed as False),
      # so the type also has to admit the 'auto' string.
      - name: "sparse"
        type: "bool 或 'auto'"
        default: "'auto'"
        description: "是否返回稀疏矩阵。'auto' 表示输出格式与输入保持一致。"
      - name: "error_on_new"
        type: "bool"
        default: "True"
        description: "在 transform 时遇到新特征时是否报错。"

outlier_detection_methods:
  IsolationForest:
    description: "通过构建随机决策树,将数据分割以孤立异常点的算法。"
    parameters:
      - name: "n_estimators"
        type: "int"
        default: "100"
        description: "森林中树的数量。"
      # Corrected: 'auto' means min(256, n_samples), not the full dataset
      # size; the type now also lists the 'auto' string used as the default.
      - name: "max_samples"
        type: "'auto'、int 或 float"
        default: "'auto'"
        description: "用于构建每棵树的样本数量。'auto' 表示使用 min(256, n_samples)。"
      # Corrected: the scikit-learn default is 'auto' (was listed as 0.1),
      # matching the LocalOutlierFactor entry further down in this file.
      - name: "contamination"
        type: "'auto' 或 float"
        default: "'auto'"
        description: "数据集中异常点的比例,用于确定决策函数的阈值。"
      - name: "max_features"
        type: "int 或 float"
        default: "1.0"
        description: "用于构建每棵树的特征数量。"
      - name: "bootstrap"
        type: "bool"
        default: "False"
        description: "是否对样本进行有放回抽样。"
      - name: "n_jobs"
        type: "int"
        default: "None"
        description: "并行运行的作业数量。"
      - name: "random_state"
        type: "int, RandomState 实例或 None"
        default: "None"
        description: "控制随机性。"
      - name: "verbose"
        type: "int"
        default: "0"
        description: "控制冗余信息的输出。"

  OneClassSVM:
    description: "使用支持向量机寻找将正常数据与异常数据分离的超平面。"
    parameters:
      - name: "kernel"
        type: "str"
        default: "'rbf'"
        description: "核函数类型,如 'linear'、'poly'、'rbf' 等。"
      - name: "degree"
        type: "int"
        default: "3"
        description: "多项式核函数的度,仅在 kernel='poly' 时有效。"
      - name: "gamma"
        type: "float 或 'scale' 或 'auto'"
        default: "'scale'"
        description: "核系数。"
      - name: "coef0"
        type: "float"
        default: "0.0"
        description: "核函数中的独立项。"
      - name: "tol"
        type: "float"
        default: "1e-3"
        description: "停止标准的容忍度。"
      - name: "nu"
        type: "float"
        default: "0.5"
        description: "训练误差的上限和支持向量的下限。"
      - name: "shrinking"
        type: "bool"
        default: "True"
        description: "是否使用收缩启发式。"
      - name: "cache_size"
        type: "float"
        default: "200"
        description: "缓存大小(以 MB 为单位)。"
      - name: "verbose"
        type: "bool"
        default: "False"
        description: "启用详细输出。"
      - name: "max_iter"
        type: "int"
        default: "-1"
        description: "最大迭代次数,-1 表示无限制。"

  LocalOutlierFactor:
    description: "通过比较样本与其邻居的局部密度差异来识别异常点。"
    parameters:
      - name: "n_neighbors"
        type: "int"
        default: "20"
        description: "用于计算局部密度的邻居数量。"
      - name: "algorithm"
        type: "str"
        default: "'auto'"
        description: "用于最近邻搜索的算法。"
      - name: "leaf_size"
        type: "int"
        default: "30"
        description: "BallTree 或 KDTree 的叶子大小。"
      - name: "metric"
        type: "str 或 callable"
        default: "'minkowski'"
        description: "距离度量方式。"
      - name: "p"
        type: "int"
        default: "2"
        description: "Minkowski 度量的幂参数。"
      - name: "metric_params"
        type: "dict"
        default: "None"
        description: "度量函数的其他关键字参数。"
      - name: "contamination"
        type: "float 或 'auto'"
        default: "'auto'"
        description: "数据集中异常点的比例。"
      - name: "novelty"
        type: "bool"
        default: "False"
        description: "是否用于新颖性检测。"
      - name: "n_jobs"
        type: "int"
        default: "None"
        description: "并行运行的作业数量。"

  EllipticEnvelope:
    description: "假设数据服从高斯分布,拟合一个椭圆包络以包围数据,超出包络的点被视为异常。"
    parameters:
      - name: "store_precision"
        type: "bool"
        default: "True"
        description: "是否存储精度矩阵。"
      - name: "assume_centered"
        type: "bool"
        default: "False"
        description: "假设数据已中心化。"
      - name: "support_fraction"
        type: "float"
        default: "None"
        description: "用于估计协方差的样本比例。"
      - name: "contamination"
        type: "float"
        default: "0.1"
        description: "数据集中异常点的比例。"
      - name: "random_state"
        type: "int, RandomState 实例或 None"
        default: "None"
        description: "控制随机性。"