train_test_lr.py 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  # -*- coding: utf-8 -*-
"""
@author: yq
@time: 2024/11/27
@desc: Demo script. Loads scorecardpy's German credit sample, builds a
       Pipeline from an inline keyword configuration, then calls its
       train / report / save steps and prints the elapsed wall-clock time.
"""
import time

from entitys import DataSplitEntity, MlConfigEntity
from pipeline import Pipeline

if __name__ == "__main__":
    started_at = time.time()
    # Imported here (not at module top) so the import cost is part of the timing.
    import scorecardpy as sc

    # Load the data and replace "." with "_" in every column name.
    credit_df = sc.germancredit()
    credit_df.columns = [name.replace(".", "_") for name in credit_df.columns.tolist()]

    # Encode the target: "bad" -> 1, anything else -> 0.
    credit_df["creditability"] = credit_df["creditability"].apply(
        lambda label: 1 if label == "bad" else 0
    )

    # Two exact multiples of credit_amount.
    # NOTE(review): presumably here to exercise the correlation filter - confirm.
    credit_df["credit_amount_corr1"] = credit_df["credit_amount"] * 2
    credit_df["credit_amount_corr2"] = credit_df["credit_amount"] * 3

    # The first 709 rows are the training set, the remainder the test set.
    train_rows = 709
    split = DataSplitEntity(
        train_data=credit_df[:train_rows],
        test_data=credit_df[train_rows:],
    )

    # Train and generate the report.
    # NOTE(review): this instance is rebound further down before it is used;
    # confirm whether the config-file based construction is still needed.
    template_config = MlConfigEntity.from_config("config/demo/ml_config_template.json")
    train_pipeline = Pipeline(template_config, split)

    # Feature processing.

    # Manually defined split points; for string variables the enum values
    # are merged with '%,%'.
    manual_breaks = {
        # "duration_in_month": [12, 18, 48],
        # "credit_amount": [2000, 3500, 4000, 7000],
        "purpose": [
            "retraining%,%car (used)",
            "radio/television",
            "furniture/equipment%,%business%,%repairs",
            "domestic appliances%,%education%,%car (new)%,%others",
        ],
        # "age_in_years": [27, 34, 58]
    }

    # Candidate variables.
    candidate_columns = [
        "duration_in_month",
        "credit_amount",
        "age_in_years",
        # "purpose",
        # "credit_history",
        #
        # "credit_amount_corr1",
        # "credit_amount_corr2",
    ]

    column_annotations = {
        "age_in_years": "年龄",
        "credit_history": "借贷历史",
    }

    options = {
        "project_name": "demo",
        # Output content when running under Jupyter.
        "jupyter_print": False,
        # Whether to enable coarse binning.
        "format_bin": True,
        # Sampling rate for the variable split-point search.
        "bin_sample_rate": 0.01,
        # Maximum number of candidate variables to keep.
        "max_feature_num": 10,
        # Allowed number of monotonicity changes.
        "monto_shift_threshold": 1,
        "iv_threshold": 0.01,
        "corr_threshold": 0.4,
        "psi_threshold": 0.001,
        "vif_threshold": 1.06,
        # Stress test.
        "stress_test": False,
        "stress_sample_times": 10,
        # Special values.
        "special_values": {"age_in_years": [36]},
        "breaks_list": manual_breaks,
        # y
        "y_column": "creditability",
        "x_columns": candidate_columns,
        "columns_anns": column_annotations,
        "columns_exclude": [],
        # "columns_include": ["credit_amount"],
        "rules": ["df.loc[df['credit_amount']>=9000,'SCORE'] += -50"],
    }

    train_pipeline = Pipeline(data=split, **options)
    train_pipeline.train()
    train_pipeline.report()
    train_pipeline.save()

    # Elapsed seconds since the script body started.
    print(time.time() - started_at)