# -*- coding: utf-8 -*-
"""
@author: yq
@time: 2024/11/27
@desc: Demo script: loads the dataset returned by ``scorecardpy.germancredit()``,
       trains a ``Pipeline`` on it, produces the report and prints the elapsed
       wall-clock time in seconds.
"""
import time

from entitys import DataSplitEntity, MlConfigEntity
from pipeline import Pipeline

if __name__ == "__main__":
    # Start the timer before the scorecardpy import so the import cost is
    # included in the printed duration.
    started_at = time.time()
    import scorecardpy as sc

    # Load data.
    dat = sc.germancredit()
    # Replace "." with "_" in every column name.
    dat.columns = [name.replace(".", "_") for name in dat.columns.tolist()]
    # Encode the target: "bad" -> 1, anything else -> 0.
    dat["creditability"] = dat["creditability"].apply(
        lambda label: 1 if label == "bad" else 0
    )
    # Add two columns that are exact multiples of credit_amount.
    for derived_column, factor in (
        ("credit_amount_corr1", 2),
        ("credit_amount_corr2", 3),
    ):
        dat[derived_column] = dat["credit_amount"] * factor

    # The first 709 rows are used for training, the remainder for testing.
    split_at = 709
    data = DataSplitEntity(train_data=dat[:split_at], test_data=dat[split_at:])

    # Train and generate the report.
    # NOTE(review): this instance is rebound further down before train() is
    # called; presumably kept as an example of building the pipeline from a
    # config file -- confirm whether it is still needed.
    template_config = MlConfigEntity.from_config("./config/ml_config_template.json")
    train_pipeline = Pipeline(template_config, data)

    # Feature processing.
    # Manually defined split points; for string variables, enumerated values
    # are merged with '%,%'.
    manual_breaks = {
        # 'duration_in_month': [12, 18, 48],
        "credit_amount": [2000, 3500, 4000, 7000],
        "purpose": [
            "retraining%,%car (used)",
            "radio/television",
            "furniture/equipment%,%business%,%repairs",
            "domestic appliances%,%education%,%car (new)%,%others",
        ],
        # 'age_in_years': [27, 34, 58]
    }
    # Candidate variables.
    candidate_columns = [
        "duration_in_month",
        "credit_amount",
        "age_in_years",
        "purpose",
        "credit_history",
        "credit_amount_corr1",
        "credit_amount_corr2",
    ]
    column_annotations = {
        "age_in_years": "年龄",
        "credit_history": "借贷历史",
    }
    cfg = {
        "project_name": "demo",
        # Print output when running under Jupyter.
        "jupyter_print": True,
        # Whether to enable coarse binning.
        "format_bin": False,
        # Sampling rate used when searching for variable split points.
        "bin_sample_rate": 0.01,
        # Maximum number of candidate variables to keep.
        "max_feature_num": 10,
        # Number of monotonicity changes allowed.
        "monto_shift_threshold": 1,
        "iv_threshold": 0.01,
        "corr_threshold": 0.4,
        "psi_threshold": 0.2,
        "vif_threshold": 10,
        # Stress test.
        "stress_test": True,
        "stress_sample_times": 10,
        # Special values.
        "special_values": {"age_in_years": [36]},
        "breaks_list": manual_breaks,
        # Target column (y).
        "y_column": "creditability",
        "x_columns": candidate_columns,
        "columns_anns": column_annotations,
        "columns_exclude": [],
        # "columns_include": ["age_in_years"],
    }

    train_pipeline = Pipeline(data=data, **cfg)
    train_pipeline.train()
    train_pipeline.report()

    elapsed_seconds = time.time() - started_at
    print(elapsed_seconds)