# -*- coding: utf-8 -*-
"""
@author: yq
@time: 2024/11/27
@desc: Demo script: load the scorecardpy German credit data set, train a
       Pipeline on it, generate the report and print the elapsed seconds.
"""
import time

from entitys import DataSplitEntity, MlConfigEntity
from pipeline import Pipeline

if __name__ == "__main__":
    # perf_counter() is monotonic and high-resolution, so the elapsed time
    # printed at the end cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()

    # Imported here (after the timer starts) so the import cost is part of
    # the measured run, as in the original script.
    import scorecardpy as sc

    # Load data
    dat = sc.germancredit()
    # Replace "." in column names with "_".
    dat.columns = [c.replace(".", "_") for c in dat.columns]

    # Encode the target as 1 for "bad" and 0 for anything else.
    dat["creditability"] = dat["creditability"].apply(
        lambda x: 1 if x == "bad" else 0
    )
    # dat["credit_amount_corr1"] = dat["credit_amount"] * 2
    # dat["credit_amount_corr2"] = dat["credit_amount"] * 3

    # The first `n_train_rows` rows are the training set, the rest the test set.
    n_train_rows = 709
    data = DataSplitEntity(
        train_data=dat[:n_train_rows],
        test_data=dat[n_train_rows:],
    )

    # Train and generate the report
    # train_pipeline = Pipeline(MlConfigEntity.from_config('config/demo/ml_config_template.json'), data)

    # Feature processing
    cfg = {
        # Project name; affects where data is stored
        "project_name": "demo",
        # Print output when running under Jupyter
        "jupyter_print": True,
        # Whether to enable coarse binning
        "format_bin": True,
        "max_feature_num": 5,
        # Stress test
        "stress_test": False,
        # Number of sampling rounds for the stress test
        "stress_sample_times": 10,
        # Target (y) column
        "y_column": "creditability",
        # Candidate variables for modeling
        # "x_columns": [
        #     "duration_in_month",
        #     "credit_amount",
        #     "age_in_years",
        #     "purpose",
        #     "credit_history",
        #     "random",
        #     "credit_amount_corr1",
        #     "credit_amount_corr2",
        # ],
        # Variable annotations
        "columns_anns": {
            "age_in_years": "年龄",
            "credit_history": "借贷历史",
        },
        # Excluded variables
        "columns_exclude": [],
        # Variables forced into the model
        # "columns_include": ["credit_amount"],
        "model_type": "xgb",
        "feature_strategy": "norm",
    }

    train_pipeline = Pipeline(data=data, **cfg)
    train_pipeline.train()
    train_pipeline.report()

    # Elapsed run time in seconds.
    print(time.perf_counter() - start_time)