添加--数据预处理方法和方法参数的yaml文件
This commit is contained in:
parent
a3255c53a8
commit
2ca62b9094
152
date_preprocessing/method.yaml
Normal file
152
date_preprocessing/method.yaml
Normal file
@ -0,0 +1,152 @@
|
||||
data_scaler_methods:
|
||||
StandardScaler:
|
||||
principle: "通过去均值并按标准差缩放特征,使每个特征的均值为0、标准差为1(仅为线性变换,不会改变数据分布的形状,也不会使数据变为正态分布)。"
|
||||
advantages:
|
||||
- "适用于大多数机器学习算法,尤其是对特征范围敏感的模型(如SVM、KNN)。"
|
||||
- "有效消除量纲影响,提高数值稳定性。"
|
||||
disadvantages:
|
||||
- "对异常值敏感,异常值可能会影响均值和标准差。"
|
||||
applicable_scenarios:
|
||||
- "数据服从正态分布或接近正态分布的情况。"
|
||||
- "使用需要标准化特征的模型,如线性回归、逻辑回归、SVM等。"
|
||||
|
||||
MinMaxScaler:
|
||||
principle: "将特征缩放到指定范围(默认[0,1]),保持数据的相对比例不变。"
|
||||
advantages:
|
||||
- "适用于保持原始数据分布不变的场景。"
|
||||
- "缩放后的取值范围固定且有界,便于需要有界输入的算法使用。"
|
||||
disadvantages:
|
||||
- "对异常值非常敏感:最小值和最大值由极端值决定,会把其余数据压缩到很窄的区间,影响模型性能。"
|
||||
applicable_scenarios:
|
||||
- "数据分布未知或不符合正态分布。"
|
||||
- "神经网络、KNN等需要输入落在固定有界区间的模型(决策树等树模型不受特征尺度影响,通常无需缩放)。"
|
||||
|
||||
RobustScaler:
|
||||
principle: "使用中位数和四分位距对数据进行缩放,以减少异常值的影响。"
|
||||
advantages:
|
||||
- "对异常值具有较强的鲁棒性。"
|
||||
- "适用于数据具有较多离群值的情况。"
|
||||
disadvantages:
|
||||
- "与标准化相比,可能会损失部分数据的可解释性。"
|
||||
applicable_scenarios:
|
||||
- "数据包含大量异常值,但仍需缩放特征。"
|
||||
- "数据分布较为偏态的情况。"
|
||||
|
||||
Normalizer:
|
||||
principle: "对样本(而非特征)进行归一化处理,使每个样本的范数为1。"
|
||||
advantages:
|
||||
- "适用于强调样本间的相对大小而非绝对值的情况。"
|
||||
- "有助于在稀疏数据(如文本数据)上使用。"
|
||||
disadvantages:
|
||||
- "不能消除特征之间的量纲差异。"
|
||||
applicable_scenarios:
|
||||
- "文本分类、推荐系统等涉及余弦相似度的任务。"
|
||||
- "数据表示为向量时,如TF-IDF特征向量。"
|
||||
|
||||
Binarizer:
|
||||
principle: "根据设定阈值将数据二值化,特征值大于该阈值设为1,否则设为0。"
|
||||
advantages:
|
||||
- "适用于需要转换为布尔变量的情况。"
|
||||
- "可用于离散化连续特征。"
|
||||
disadvantages:
|
||||
- "丢失原始特征的数值信息,可能导致信息损失。"
|
||||
applicable_scenarios:
|
||||
- "处理二元分类任务,如是否点击广告、是否购买商品。"
|
||||
- "对特定阈值敏感的数据特征进行转换。"
|
||||
|
||||
|
||||
missing_value_handling_methods:
|
||||
SimpleImputer:
|
||||
principle: "使用统计方法(如均值、中位数、众数)或常数值填充缺失值。"
|
||||
advantages:
|
||||
- "实现简单,计算速度快。"
|
||||
- "适用于数值型和分类型数据。"
|
||||
disadvantages:
|
||||
- "未考虑特征之间的相关性,可能导致偏差。"
|
||||
applicable_scenarios:
|
||||
- "数据缺失比例较小且随机分布。"
|
||||
- "需要快速处理缺失值的场景。"
|
||||
|
||||
IterativeImputer:
|
||||
principle: "使用多重插补方法,基于其他特征预测缺失值。"
|
||||
advantages:
|
||||
- "考虑了特征之间的相关性,填补更准确。"
|
||||
disadvantages:
|
||||
- "计算复杂度高,处理时间较长。"
|
||||
- "对模型的选择和参数设置较为敏感。"
|
||||
applicable_scenarios:
|
||||
- "数据缺失模式复杂,特征之间存在较强相关性。"
|
||||
- "对数据完整性要求高的场景。"
|
||||
|
||||
KNNImputer:
|
||||
principle: "基于k近邻算法,用相似样本的值填充缺失值。"
|
||||
advantages:
|
||||
- "利用了数据的局部结构信息,填补效果较好。"
|
||||
disadvantages:
|
||||
- "计算量大,特别是对于大型数据集。"
|
||||
- "对异常值敏感,可能受到噪声影响。"
|
||||
applicable_scenarios:
|
||||
- "数据集较小且特征之间存在相关性。"
|
||||
- "缺失值与其邻近样本的值相似的情况。"
|
||||
|
||||
MissingIndicator:
|
||||
principle: "生成指示器变量,标记数据中缺失值的位置。"
|
||||
advantages:
|
||||
- "保留了缺失值的信息,供模型使用。"
|
||||
- "可与其他填补方法结合使用。"
|
||||
disadvantages:
|
||||
- "增加了特征数量,可能导致模型复杂度增加。"
|
||||
applicable_scenarios:
|
||||
- "希望模型感知缺失模式的场景。"
|
||||
- "与填补方法结合使用,以提高模型性能。"
|
||||
|
||||
|
||||
outlier_detection_methods:
|
||||
IsolationForest:
|
||||
principle: "通过构建随机决策树,将数据分割为更小的部分,孤立出异常点。异常点在树中更接近根节点。"
|
||||
advantages:
|
||||
- "对高维数据有效。"
|
||||
- "无需对数据进行标准化处理。"
|
||||
- "处理大规模数据集时速度较快。"
|
||||
disadvantages:
|
||||
- "对数据中的噪声和离群点敏感。"
|
||||
- "需要选择适当的子采样大小和树的数量。"
|
||||
applicable_scenarios:
|
||||
- "大规模、高维数据集的异常值检测。"
|
||||
- "需要高效处理速度的实时应用。"
|
||||
|
||||
OneClassSVM:
|
||||
principle: "使用支持向量机方法,寻找最大化数据与原点间距的超平面,将正常数据与异常数据分离。"
|
||||
advantages:
|
||||
- "适用于线性和非线性数据。"
|
||||
- "在小样本数据集上表现良好。"
|
||||
disadvantages:
|
||||
- "对参数敏感,需谨慎调节。"
|
||||
- "在大规模数据集上计算成本较高。"
|
||||
applicable_scenarios:
|
||||
- "小型数据集的异常值检测。"
|
||||
- "数据分布复杂,需要非线性分割的情况。"
|
||||
|
||||
LocalOutlierFactor:
|
||||
principle: "通过比较样本与其邻居的局部密度差异,识别异常点。异常点通常位于低密度区域。"
|
||||
advantages:
|
||||
- "无需监督标签即可检测异常。"
|
||||
- "能够发现局部异常。"
|
||||
disadvantages:
|
||||
- "计算复杂度高,处理大数据集时效率较低。"
|
||||
- "对参数(如邻居数量)敏感。"
|
||||
applicable_scenarios:
|
||||
- "数据集中存在局部异常的情况。"
|
||||
- "需要无监督异常检测的场景。"
|
||||
|
||||
EllipticEnvelope:
|
||||
principle: "假设数据服从高斯分布,拟合一个椭圆包络,将数据包围其中,超出包络的点被视为异常。"
|
||||
advantages:
|
||||
- "适用于数据接近高斯分布的情况。"
|
||||
- "实现简单,计算速度快。"
|
||||
disadvantages:
|
||||
- "对非高斯分布的数据效果较差。"
|
||||
- "对异常值和噪声敏感。"
|
||||
applicable_scenarios:
|
||||
- "数据近似高斯分布的异常值检测。"
|
||||
- "需要快速检测异常的应用。"
|
||||
349
date_preprocessing/parameter.yaml
Normal file
349
date_preprocessing/parameter.yaml
Normal file
@ -0,0 +1,349 @@
|
||||
data_scaler_methods:
|
||||
StandardScaler:
|
||||
description: "标准化特征,使其均值为0,标准差为1。"
|
||||
parameters:
|
||||
- name: "copy"
|
||||
type: "bool"
|
||||
default: "True"
|
||||
description: "是否复制数据,若为False,则会对原始数据进行修改。"
|
||||
- name: "with_mean"
|
||||
type: "bool"
|
||||
default: "True"
|
||||
description: "是否去均值处理。如果为False,则不做均值化处理。"
|
||||
- name: "with_std"
|
||||
type: "bool"
|
||||
default: "True"
|
||||
description: "是否按标准差缩放。如果为False,则不做标准差处理。"
|
||||
|
||||
MinMaxScaler:
|
||||
description: "将特征缩放到指定范围,通常是[0, 1],保持原始数据比例。"
|
||||
parameters:
|
||||
- name: "feature_range"
|
||||
type: "tuple"
|
||||
default: "(0, 1)"
|
||||
description: "输出范围,控制转换后数据的最小值和最大值。"
|
||||
- name: "copy"
|
||||
type: "bool"
|
||||
default: "True"
|
||||
description: "是否复制数据,若为False,则会对原始数据进行修改。"
|
||||
|
||||
RobustScaler:
|
||||
description: "使用中位数和四分位距进行缩放,适用于包含异常值的数据。"
|
||||
parameters:
|
||||
- name: "with_centering"
|
||||
type: "bool"
|
||||
default: "True"
|
||||
description: "是否使用中位数进行中心化处理。如果为False,则不进行中心化。"
|
||||
- name: "with_scaling"
|
||||
type: "bool"
|
||||
default: "True"
|
||||
description: "是否使用四分位距进行缩放。如果为False,则不进行缩放。"
|
||||
- name: "copy"
|
||||
type: "bool"
|
||||
default: "True"
|
||||
description: "是否复制数据,若为False,则会对原始数据进行修改。"
|
||||
|
||||
Normalizer:
|
||||
description: "对样本(而非特征)进行归一化处理,使每个样本的范数为1。"
|
||||
parameters:
|
||||
- name: "norm"
|
||||
type: "str"
|
||||
default: "'l2'"
|
||||
description: "使用的归一化范数。'l1'、'l2'、'max'。'l2'表示L2范数(默认),'l1'表示L1范数,'max'表示最大值归一化。"
|
||||
- name: "copy"
|
||||
type: "bool"
|
||||
default: "True"
|
||||
description: "是否复制数据,若为False,则会对原始数据进行修改。"
|
||||
|
||||
Binarizer:
|
||||
description: "将数据二值化,根据阈值将特征值大于该阈值的设为1,否则设为0。"
|
||||
parameters:
|
||||
- name: "threshold"
|
||||
type: "float"
|
||||
default: "0.0"
|
||||
description: "二值化的阈值。大于该值的样本会被设置为1,其他为0。"
|
||||
- name: "copy"
|
||||
type: "bool"
|
||||
default: "True"
|
||||
description: "是否复制数据,若为False,则会对原始数据进行修改。"
|
||||
|
||||
|
||||
missing_value_handling_methods:
|
||||
SimpleImputer:
|
||||
description: "使用统计方法(如均值、中位数、众数)或常数值填充缺失值。"
|
||||
parameters:
|
||||
- name: "missing_values"
|
||||
type: "int, float, str, np.nan 或 None"
|
||||
default: "np.nan"
|
||||
description: "指定需要填充的缺失值。"
|
||||
- name: "strategy"
|
||||
type: "str"
|
||||
default: "'mean'"
|
||||
description: "填充策略,可选值为 'mean'(均值)、'median'(中位数)、'most_frequent'(众数)和 'constant'(常数)。"
|
||||
- name: "fill_value"
|
||||
type: "str 或 数值"
|
||||
default: "None"
|
||||
description: "当 strategy='constant' 时,指定用于填充的常数值。"
|
||||
- name: "verbose"
|
||||
type: "int"
|
||||
default: "0"
|
||||
description: "控制冗长度,0 表示不输出信息。注意:该参数已在 scikit-learn 1.1 中弃用,并在 1.3 中移除,新版本中不应再传入。"
|
||||
- name: "copy"
|
||||
type: "bool"
|
||||
default: "True"
|
||||
description: "是否创建数据的副本进行填充,False 则在原数据上进行填充。"
|
||||
- name: "add_indicator"
|
||||
type: "bool"
|
||||
default: "False"
|
||||
description: "是否添加缺失值指示器特征,标记缺失值的位置。"
|
||||
|
||||
IterativeImputer:
|
||||
description: "使用多重插补方法,基于其他特征预测缺失值。"
|
||||
parameters:
|
||||
- name: "estimator"
|
||||
type: "对象"
|
||||
default: "BayesianRidge()"
|
||||
description: "用于预测的估计器对象。"
|
||||
- name: "missing_values"
|
||||
type: "数值"
|
||||
default: "np.nan"
|
||||
description: "表示缺失值的占位符。"
|
||||
- name: "max_iter"
|
||||
type: "int"
|
||||
default: "10"
|
||||
description: "插补过程的最大迭代次数。"
|
||||
- name: "tol"
|
||||
type: "float"
|
||||
default: "1e-3"
|
||||
description: "早停的容忍度。"
|
||||
- name: "n_nearest_features"
|
||||
type: "int"
|
||||
default: "None"
|
||||
description: "用于预测的最近特征数量。"
|
||||
- name: "initial_strategy"
|
||||
type: "str"
|
||||
default: "'mean'"
|
||||
description: "初始插补的策略。"
|
||||
- name: "imputation_order"
|
||||
type: "str"
|
||||
default: "'ascending'"
|
||||
description: "插补的顺序。"
|
||||
- name: "skip_complete"
|
||||
type: "bool"
|
||||
default: "False"
|
||||
description: "是否跳过没有缺失值的特征。"
|
||||
- name: "min_value"
|
||||
type: "float 或 array-like"
|
||||
default: "-np.inf"
|
||||
description: "每个特征的最小可接受值。"
|
||||
- name: "max_value"
|
||||
type: "float 或 array-like"
|
||||
default: "np.inf"
|
||||
description: "每个特征的最大可接受值。"
|
||||
- name: "verbose"
|
||||
type: "int"
|
||||
default: "0"
|
||||
description: "控制冗长度。"
|
||||
- name: "random_state"
|
||||
type: "int, RandomState 实例或 None"
|
||||
default: "None"
|
||||
description: "随机数生成器的种子。"
|
||||
- name: "add_indicator"
|
||||
type: "bool"
|
||||
default: "False"
|
||||
description: "是否添加缺失值指示器特征。"
|
||||
|
||||
KNNImputer:
|
||||
description: "基于k近邻算法,用相似样本的值填充缺失值。"
|
||||
parameters:
|
||||
- name: "missing_values"
|
||||
type: "数值"
|
||||
default: "np.nan"
|
||||
description: "表示缺失值的占位符。"
|
||||
- name: "n_neighbors"
|
||||
type: "int"
|
||||
default: "5"
|
||||
description: "用于插补的邻居数量。"
|
||||
- name: "weights"
|
||||
type: "str 或 callable"
|
||||
default: "'uniform'"
|
||||
description: "权重函数,可选 'uniform'、'distance' 或自定义函数。"
|
||||
- name: "metric"
|
||||
type: "str 或 callable"
|
||||
default: "'nan_euclidean'"
|
||||
description: "距离度量方式。"
|
||||
- name: "copy"
|
||||
type: "bool"
|
||||
default: "True"
|
||||
description: "是否创建数据的副本进行填充。"
|
||||
- name: "add_indicator"
|
||||
type: "bool"
|
||||
default: "False"
|
||||
description: "是否添加缺失值指示器特征。"
|
||||
|
||||
MissingIndicator:
|
||||
description: "生成指示器变量,标记数据中缺失值的位置。"
|
||||
parameters:
|
||||
- name: "missing_values"
|
||||
type: "数值"
|
||||
default: "np.nan"
|
||||
description: "表示缺失值的占位符。"
|
||||
- name: "features"
|
||||
type: "str"
|
||||
default: "'missing-only'"
|
||||
description: "指示器特征的范围,可选 'missing-only' 或 'all'。"
|
||||
- name: "sparse"
|
||||
type: "bool 或 'auto'"
|
||||
default: "'auto'"
|
||||
description: "是否返回稀疏矩阵。'auto' 表示输出格式与输入一致;True 返回稀疏矩阵,False 返回稠密数组。"
|
||||
- name: "error_on_new"
|
||||
type: "bool"
|
||||
default: "True"
|
||||
description: "为 True 时,若 transform 的数据中某个在 fit 时没有缺失值的特征出现了缺失值,则报错。仅在 features='missing-only' 时有效。"
|
||||
|
||||
|
||||
outlier_detection_methods:
|
||||
IsolationForest:
|
||||
description: "通过构建随机决策树,将数据分割以孤立异常点的算法。"
|
||||
parameters:
|
||||
- name: "n_estimators"
|
||||
type: "int"
|
||||
default: "100"
|
||||
description: "森林中树的数量。"
|
||||
- name: "max_samples"
|
||||
type: "'auto'、int 或 float"
|
||||
default: "'auto'"
|
||||
description: "用于构建每棵树的样本数量。'auto' 表示使用 min(256, n_samples);int 表示样本个数;float 表示占总样本数的比例。"
|
||||
- name: "contamination"
|
||||
type: "'auto' 或 float"
|
||||
default: "'auto'"
|
||||
description: "数据集中异常点的比例,用于确定决策函数的阈值。"
|
||||
- name: "max_features"
|
||||
type: "int 或 float"
|
||||
default: "1.0"
|
||||
description: "用于构建每棵树的特征数量。"
|
||||
- name: "bootstrap"
|
||||
type: "bool"
|
||||
default: "False"
|
||||
description: "是否对样本进行有放回抽样。"
|
||||
- name: "n_jobs"
|
||||
type: "int"
|
||||
default: "None"
|
||||
description: "并行运行的作业数量。"
|
||||
- name: "random_state"
|
||||
type: "int, RandomState 实例或 None"
|
||||
default: "None"
|
||||
description: "控制随机性。"
|
||||
- name: "verbose"
|
||||
type: "int"
|
||||
default: "0"
|
||||
description: "控制冗余信息的输出。"
|
||||
|
||||
OneClassSVM:
|
||||
description: "使用支持向量机寻找将正常数据与异常数据分离的超平面。"
|
||||
parameters:
|
||||
- name: "kernel"
|
||||
type: "str"
|
||||
default: "'rbf'"
|
||||
description: "核函数类型,如 'linear'、'poly'、'rbf' 等。"
|
||||
- name: "degree"
|
||||
type: "int"
|
||||
default: "3"
|
||||
description: "多项式核函数的次数,仅在 kernel='poly' 时有效。"
|
||||
- name: "gamma"
|
||||
type: "float 或 'scale' 或 'auto'"
|
||||
default: "'scale'"
|
||||
description: "核系数。"
|
||||
- name: "coef0"
|
||||
type: "float"
|
||||
default: "0.0"
|
||||
description: "核函数中的独立项。"
|
||||
- name: "tol"
|
||||
type: "float"
|
||||
default: "1e-3"
|
||||
description: "停止标准的容忍度。"
|
||||
- name: "nu"
|
||||
type: "float"
|
||||
default: "0.5"
|
||||
description: "训练误差比例的上界和支持向量比例的下界,取值范围为 (0, 1]。"
|
||||
- name: "shrinking"
|
||||
type: "bool"
|
||||
default: "True"
|
||||
description: "是否使用收缩启发式。"
|
||||
- name: "cache_size"
|
||||
type: "float"
|
||||
default: "200"
|
||||
description: "缓存大小(以 MB 为单位)。"
|
||||
- name: "verbose"
|
||||
type: "bool"
|
||||
default: "False"
|
||||
description: "启用详细输出。"
|
||||
- name: "max_iter"
|
||||
type: "int"
|
||||
default: "-1"
|
||||
description: "最大迭代次数,-1 表示无限制。"
|
||||
|
||||
LocalOutlierFactor:
|
||||
description: "通过比较样本与其邻居的局部密度差异来识别异常点。"
|
||||
parameters:
|
||||
- name: "n_neighbors"
|
||||
type: "int"
|
||||
default: "20"
|
||||
description: "用于计算局部密度的邻居数量。"
|
||||
- name: "algorithm"
|
||||
type: "str"
|
||||
default: "'auto'"
|
||||
description: "用于最近邻搜索的算法。"
|
||||
- name: "leaf_size"
|
||||
type: "int"
|
||||
default: "30"
|
||||
description: "BallTree 或 KDTree 的叶子大小。"
|
||||
- name: "metric"
|
||||
type: "str 或 callable"
|
||||
default: "'minkowski'"
|
||||
description: "距离度量方式。"
|
||||
- name: "p"
|
||||
type: "int"
|
||||
default: "2"
|
||||
description: "Minkowski 度量的幂参数。"
|
||||
- name: "metric_params"
|
||||
type: "dict"
|
||||
default: "None"
|
||||
description: "度量函数的其他关键字参数。"
|
||||
- name: "contamination"
|
||||
type: "float 或 'auto'"
|
||||
default: "'auto'"
|
||||
description: "数据集中异常点的比例。"
|
||||
- name: "novelty"
|
||||
type: "bool"
|
||||
default: "False"
|
||||
description: "是否用于新颖性检测。"
|
||||
- name: "n_jobs"
|
||||
type: "int"
|
||||
default: "None"
|
||||
description: "并行运行的作业数量。"
|
||||
|
||||
EllipticEnvelope:
|
||||
description: "假设数据服从高斯分布,拟合一个椭圆包络以包围数据,超出包络的点被视为异常。"
|
||||
parameters:
|
||||
- name: "store_precision"
|
||||
type: "bool"
|
||||
default: "True"
|
||||
description: "是否存储精度矩阵。"
|
||||
- name: "assume_centered"
|
||||
type: "bool"
|
||||
default: "False"
|
||||
description: "假设数据已中心化。"
|
||||
- name: "support_fraction"
|
||||
type: "float"
|
||||
default: "None"
|
||||
description: "用于估计协方差的样本比例。"
|
||||
- name: "contamination"
|
||||
type: "float"
|
||||
default: "0.1"
|
||||
description: "数据集中异常点的比例。"
|
||||
- name: "random_state"
|
||||
type: "int, RandomState 实例或 None"
|
||||
default: "None"
|
||||
description: "控制随机性。"
|
||||
|
||||
Loading…
Reference in New Issue
Block a user