MLPlatform/date_preprocessing/parameter.yaml

350 lines
12 KiB
YAML
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Catalog of feature-scaling methods. Each method entry carries a
# human-readable description plus a list of parameters, and every parameter
# is described by four string fields: name, type, default, description.
# Defaults are quoted strings holding Python-style literals
# (e.g. "True", "(0, 1)", "'l2'").
# NOTE(review): method and parameter names look like scikit-learn
# sklearn.preprocessing constructors - confirm against the consuming code.
data_scaler_methods:
  StandardScaler:
    description: "标准化特征,使其均值为0,标准差为1。"
    parameters:
      - name: "copy"
        type: "bool"
        default: "True"
        description: "是否复制数据,若为False则会对原始数据进行修改。"
      - name: "with_mean"
        type: "bool"
        default: "True"
        description: "是否去均值处理。如果为False则不做均值化处理。"
      - name: "with_std"
        type: "bool"
        default: "True"
        description: "是否按标准差缩放。如果为False则不做标准差处理。"
  MinMaxScaler:
    description: "将特征缩放到指定范围,通常是[0, 1],保持原始数据比例。"
    parameters:
      - name: "feature_range"
        type: "tuple"
        default: "(0, 1)"
        description: "输出范围,控制转换后数据的最小值和最大值。"
      - name: "copy"
        type: "bool"
        default: "True"
        description: "是否复制数据,若为False则会对原始数据进行修改。"
  RobustScaler:
    description: "使用中位数和四分位距进行缩放,适用于包含异常值的数据。"
    parameters:
      # Names follow the RobustScaler constructor keywords
      # (with_centering / with_scaling), not the shorthand center / scale.
      - name: "with_centering"
        type: "bool"
        default: "True"
        description: "是否使用中位数进行中心化处理。如果为False则不进行中心化。"
      - name: "with_scaling"
        type: "bool"
        default: "True"
        description: "是否使用四分位距进行缩放。如果为False则不进行缩放。"
      - name: "copy"
        type: "bool"
        default: "True"
        description: "是否复制数据,若为False则会对原始数据进行修改。"
  Normalizer:
    description: "对样本而非特征进行归一化处理,使每个样本的范数为1。"
    parameters:
      - name: "norm"
        type: "str"
        default: "'l2'"
        description: "使用的归一化范数:'l1'、'l2'、'max'。'l2'表示L2范数(默认),'l1'表示L1范数,'max'表示最大值归一化。"
      - name: "copy"
        type: "bool"
        default: "True"
        description: "是否复制数据,若为False则会对原始数据进行修改。"
  Binarizer:
    description: "将数据二值化,根据阈值将特征值大于该阈值的设为1,否则设为0。"
    parameters:
      - name: "threshold"
        type: "float"
        default: "0.0"
        description: "二值化的阈值。大于该值的样本会被设置为1,其他为0。"
      - name: "copy"
        type: "bool"
        default: "True"
        description: "是否复制数据,若为False则会对原始数据进行修改。"
# Catalog of missing-value handling methods. Each method entry carries a
# human-readable description plus a list of parameters, and every parameter
# is described by four string fields: name, type, default, description.
# Defaults are quoted strings holding Python-style literals or expressions
# (e.g. "np.nan", "'mean'", "BayesianRidge()").
# NOTE(review): method and parameter names look like scikit-learn
# sklearn.impute constructors - confirm against the consuming code.
missing_value_handling_methods:
  SimpleImputer:
    description: "使用统计方法(如均值、中位数、众数)或常数值填充缺失值。"
    parameters:
      - name: "missing_values"
        type: "int, float, str, np.nan 或 None"
        default: "np.nan"
        description: "指定需要填充的缺失值。"
      - name: "strategy"
        type: "str"
        default: "'mean'"
        description: "填充策略,可选值为 'mean'(均值)、'median'(中位数)、'most_frequent'(众数)和 'constant'(常数)。"
      - name: "fill_value"
        type: "str 或 数值"
        default: "None"
        description: "当 strategy='constant' 时,指定用于填充的常数值。"
      # NOTE(review): recent scikit-learn releases deprecated and then removed
      # SimpleImputer's `verbose` argument - confirm the targeted version
      # still accepts it before exposing this parameter.
      - name: "verbose"
        type: "int"
        default: "0"
        description: "控制冗长度,0 表示不输出信息。"
      - name: "copy"
        type: "bool"
        default: "True"
        description: "是否创建数据的副本进行填充,False 则在原数据上进行填充。"
      - name: "add_indicator"
        type: "bool"
        default: "False"
        description: "是否添加缺失值指示器特征,标记缺失值的位置。"
  IterativeImputer:
    description: "使用多重插补方法,基于其他特征预测缺失值。"
    parameters:
      - name: "estimator"
        type: "对象"
        default: "BayesianRidge()"
        description: "用于预测的估计器对象。"
      - name: "missing_values"
        type: "数值"
        default: "np.nan"
        description: "表示缺失值的占位符。"
      - name: "max_iter"
        type: "int"
        default: "10"
        description: "插补过程的最大迭代次数。"
      - name: "tol"
        type: "float"
        default: "1e-3"
        description: "早停的容忍度。"
      - name: "n_nearest_features"
        type: "int"
        default: "None"
        description: "用于预测的最近特征数量。"
      - name: "initial_strategy"
        type: "str"
        default: "'mean'"
        description: "初始插补的策略。"
      - name: "imputation_order"
        type: "str"
        default: "'ascending'"
        description: "插补的顺序。"
      - name: "skip_complete"
        type: "bool"
        default: "False"
        description: "是否跳过没有缺失值的特征。"
      - name: "min_value"
        type: "float 或 array-like"
        default: "None"
        description: "每个特征的最小可接受值。"
      - name: "max_value"
        type: "float 或 array-like"
        default: "None"
        description: "每个特征的最大可接受值。"
      - name: "verbose"
        type: "int"
        default: "0"
        description: "控制冗长度。"
      - name: "random_state"
        type: "int, RandomState 实例或 None"
        default: "None"
        description: "随机数生成器的种子。"
      - name: "add_indicator"
        type: "bool"
        default: "False"
        description: "是否添加缺失值指示器特征。"
  KNNImputer:
    description: "基于k近邻算法用相似样本的值填充缺失值。"
    parameters:
      - name: "missing_values"
        type: "数值"
        default: "np.nan"
        description: "表示缺失值的占位符。"
      - name: "n_neighbors"
        type: "int"
        default: "5"
        description: "用于插补的邻居数量。"
      - name: "weights"
        type: "str 或 callable"
        default: "'uniform'"
        description: "权重函数,可选 'uniform'、'distance' 或自定义函数。"
      - name: "metric"
        type: "str 或 callable"
        default: "'nan_euclidean'"
        description: "距离度量方式。"
      - name: "copy"
        type: "bool"
        default: "True"
        description: "是否创建数据的副本进行填充。"
      - name: "add_indicator"
        type: "bool"
        default: "False"
        description: "是否添加缺失值指示器特征。"
  MissingIndicator:
    description: "生成指示器变量,标记数据中缺失值的位置。"
    parameters:
      - name: "missing_values"
        type: "数值"
        default: "np.nan"
        description: "表示缺失值的占位符。"
      - name: "features"
        type: "str"
        default: "'missing-only'"
        description: "指示器特征的范围,可选 'missing-only' 或 'all'。"
      # The constructor default is 'auto' (output format follows the input),
      # so the type admits that string in addition to a plain bool.
      - name: "sparse"
        type: "bool 或 'auto'"
        default: "'auto'"
        description: "是否返回稀疏矩阵。'auto' 表示输出格式与输入保持一致。"
      - name: "error_on_new"
        type: "bool"
        default: "True"
        description: "在 transform 时遇到新特征时是否报错。"
# Catalog of outlier-detection methods. Each method entry carries a
# human-readable description plus a list of parameters, and every parameter
# is described by four string fields: name, type, default, description.
# Defaults are quoted strings holding Python-style literals
# (e.g. "100", "'auto'", "None").
# NOTE(review): method and parameter names look like scikit-learn estimator
# constructors (ensemble / svm / neighbors / covariance) - confirm against
# the consuming code.
outlier_detection_methods:
  IsolationForest:
    description: "通过构建随机决策树,将数据分割以孤立异常点的算法。"
    parameters:
      - name: "n_estimators"
        type: "int"
        default: "100"
        description: "森林中树的数量。"
      # 'auto' is a legal value (and the default), so it is listed in the
      # type; it resolves to min(256, n_samples), not the full dataset size.
      - name: "max_samples"
        type: "int 或 float 或 'auto'"
        default: "'auto'"
        description: "用于构建每棵树的样本数量。'auto' 表示使用 min(256, n_samples)。"
      # Default aligned with the estimator's own default ('auto'), matching
      # the LocalOutlierFactor contamination entry in this same catalog.
      - name: "contamination"
        type: "float 或 'auto'"
        default: "'auto'"
        description: "数据集中异常点的比例,用于确定决策函数的阈值。"
      - name: "max_features"
        type: "int 或 float"
        default: "1.0"
        description: "用于构建每棵树的特征数量。"
      - name: "bootstrap"
        type: "bool"
        default: "False"
        description: "是否对样本进行有放回抽样。"
      - name: "n_jobs"
        type: "int"
        default: "None"
        description: "并行运行的作业数量。"
      - name: "random_state"
        type: "int, RandomState 实例或 None"
        default: "None"
        description: "控制随机性。"
      - name: "verbose"
        type: "int"
        default: "0"
        description: "控制冗余信息的输出。"
  OneClassSVM:
    description: "使用支持向量机寻找将正常数据与异常数据分离的超平面。"
    parameters:
      - name: "kernel"
        type: "str"
        default: "'rbf'"
        description: "核函数类型,如 'linear'、'poly'、'rbf' 等。"
      - name: "degree"
        type: "int"
        default: "3"
        description: "多项式核函数的度,仅在 kernel='poly' 时有效。"
      - name: "gamma"
        type: "float 或 'scale' 或 'auto'"
        default: "'scale'"
        description: "核系数。"
      - name: "coef0"
        type: "float"
        default: "0.0"
        description: "核函数中的独立项。"
      - name: "tol"
        type: "float"
        default: "1e-3"
        description: "停止标准的容忍度。"
      - name: "nu"
        type: "float"
        default: "0.5"
        description: "训练误差的上限和支持向量的下限。"
      - name: "shrinking"
        type: "bool"
        default: "True"
        description: "是否使用收缩启发式。"
      - name: "cache_size"
        type: "float"
        default: "200"
        description: "缓存大小(以 MB 为单位)。"
      - name: "verbose"
        type: "bool"
        default: "False"
        description: "启用详细输出。"
      - name: "max_iter"
        type: "int"
        default: "-1"
        description: "最大迭代次数,-1 表示无限制。"
  LocalOutlierFactor:
    description: "通过比较样本与其邻居的局部密度差异来识别异常点。"
    parameters:
      - name: "n_neighbors"
        type: "int"
        default: "20"
        description: "用于计算局部密度的邻居数量。"
      - name: "algorithm"
        type: "str"
        default: "'auto'"
        description: "用于最近邻搜索的算法。"
      - name: "leaf_size"
        type: "int"
        default: "30"
        description: "BallTree 或 KDTree 的叶子大小。"
      - name: "metric"
        type: "str 或 callable"
        default: "'minkowski'"
        description: "距离度量方式。"
      - name: "p"
        type: "int"
        default: "2"
        description: "Minkowski 度量的幂参数。"
      - name: "metric_params"
        type: "dict"
        default: "None"
        description: "度量函数的其他关键字参数。"
      - name: "contamination"
        type: "float 或 'auto'"
        default: "'auto'"
        description: "数据集中异常点的比例。"
      - name: "novelty"
        type: "bool"
        default: "False"
        description: "是否用于新颖性检测。"
      - name: "n_jobs"
        type: "int"
        default: "None"
        description: "并行运行的作业数量。"
  EllipticEnvelope:
    description: "假设数据服从高斯分布,拟合一个椭圆包络以包围数据,超出包络的点被视为异常。"
    parameters:
      - name: "store_precision"
        type: "bool"
        default: "True"
        description: "是否存储精度矩阵。"
      - name: "assume_centered"
        type: "bool"
        default: "False"
        description: "假设数据已中心化。"
      - name: "support_fraction"
        type: "float"
        default: "None"
        description: "用于估计协方差的样本比例。"
      - name: "contamination"
        type: "float"
        default: "0.1"
        description: "数据集中异常点的比例。"
      - name: "random_state"
        type: "int, RandomState 实例或 None"
        default: "None"
        description: "控制随机性。"