123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687 |
- # -*- coding: utf-8 -*-
- """
- @author: yq
- @time: 2024/11/27
- @desc:
- """
- import time
- from entitys import DataSplitEntity, MlConfigEntity
- from pipeline import Pipeline
if __name__ == "__main__":
    # Demo entry point: load the German credit sample shipped with scorecardpy,
    # split it into train/test parts, run the pipeline's train() and report()
    # steps, then print the elapsed wall-clock seconds.
    started_at = time.time()

    import scorecardpy as sc

    # Load data
    credit_df = sc.germancredit()
    # Column names use "_" instead of "." from here on.
    credit_df.columns = [name.replace(".", "_") for name in credit_df.columns.tolist()]
    # Encode the target: "bad" -> 1, anything else -> 0.
    credit_df["creditability"] = credit_df["creditability"].apply(
        lambda label: int(label == "bad")
    )
    # Two exact multiples of credit_amount, also listed in x_columns below.
    for derived_column, factor in (
        ("credit_amount_corr1", 2),
        ("credit_amount_corr2", 3),
    ):
        credit_df[derived_column] = credit_df["credit_amount"] * factor

    train_rows = 709
    split = DataSplitEntity(
        train_data=credit_df[:train_rows],
        test_data=credit_df[train_rows:],
    )

    # Train and generate the report
    # NOTE(review): this instance is rebound further down before train() or
    # report() is ever called on it; confirm whether building it from the JSON
    # template is still needed.
    template_config = MlConfigEntity.from_config("./config/ml_config_template.json")
    pipeline = Pipeline(template_config, split)

    # Feature processing
    pipeline_kwargs = {
        "project_name": "demo",
        # Output content when running under Jupyter
        "jupyter_print": False,
        # Whether to enable coarse binning
        "format_bin": True,
        # Sampling rate for the variable split-point search
        "bin_sample_rate": 0.01,
        # Maximum number of candidate variables to keep
        "max_feature_num": 10,
        # Number of monotonicity changes allowed
        "monto_shift_threshold": 1,
        "iv_threshold": 0.01,
        "corr_threshold": 0.4,
        "psi_threshold": 0.001,
        "vif_threshold": 1.06,
        # Stress test
        "stress_test": True,
        "stress_sample_times": 10,
        # Special values
        "special_values": {"age_in_years": [36]},
        # Manually defined split points; for string-typed variables the
        # enumerated values are merged with '%,%'
        "breaks_list": {
            # 'duration_in_month': [12, 18, 48],
            # 'credit_amount': [2000, 3500, 4000, 7000],
            "purpose": [
                "retraining%,%car (used)",
                "radio/television",
                "furniture/equipment%,%business%,%repairs",
                "domestic appliances%,%education%,%car (new)%,%others",
            ],
            # 'age_in_years': [27, 34, 58]
        },
        # y
        "y_column": "creditability",
        # Candidate variables
        "x_columns": [
            "duration_in_month",
            "credit_amount",
            "age_in_years",
            "purpose",
            "credit_history",
            "credit_amount_corr1",
            "credit_amount_corr2",
        ],
        "columns_anns": {
            "age_in_years": "年龄",
            "credit_history": "借贷历史",
        },
        "columns_exclude": [],
        # "columns_include": ["credit_amount"],
        "rules": ["df.loc[df['credit_amount']>=9000,'SCORE'] += -50"],
    }

    pipeline = Pipeline(data=split, **pipeline_kwargs)
    pipeline.train()
    pipeline.report()

    elapsed_seconds = time.time() - started_at
    print(elapsed_seconds)
|