model_lr.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author: yq
  4. @time: 2024/11/1
  5. @desc:
  6. """
  7. import os.path
  8. from os.path import dirname, realpath
  9. from typing import Dict
  10. import pandas as pd
  11. import scorecardpy as sc
  12. from sklearn.linear_model import LogisticRegression
  13. from commom import f_df_to_image, f_display_images_by_side
  14. from entitys import DataPreparedEntity, MetricFucEntity, DataSplitEntity
  15. from feature import f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
  16. from .model_base import ModelBase
  17. class ModelLr(ModelBase):
  18. def __init__(self, *args, **kwargs):
  19. super().__init__(*args, **kwargs)
  20. # 报告模板
  21. self._template_path = os.path.join(dirname(dirname(realpath(__file__))), "./template/模型开发报告模板_lr.docx")
  22. self.lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
  23. def get_template_path(self):
  24. return self._template_path
  25. def train(self, data: DataPreparedEntity, *args, **kwargs) -> Dict[str, MetricFucEntity]:
  26. bins = kwargs["bins"]
  27. data_split_original: DataSplitEntity = kwargs["data_split_original"]
  28. jupyter = kwargs["jupyter"]
  29. # woe编码之前的数据
  30. train_data_original = data_split_original.train_data
  31. test_data_original = data_split_original.test_data
  32. train_data = data.train_data
  33. train_y = train_data.get_Ydata()
  34. y_column = train_data.y_column
  35. test_data = data.test_data
  36. self.lr.fit(train_data.get_Xdata(), train_y)
  37. metric_value_dict = {}
  38. # 评分卡
  39. card: Dict = sc.scorecard(bins, self.lr, train_data.x_columns, points0=600, odds0=train_data.get_odds0(),
  40. pdo=50)
  41. card_df = pd.DataFrame(columns=card['basepoints'].keys())
  42. for k, v in card.items():
  43. card_df = pd.concat((card_df, v))
  44. card_df_path = self._train_config.f_get_save_path(f"card_df.png")
  45. f_df_to_image(card_df, card_df_path)
  46. metric_value_dict["评分卡"] = MetricFucEntity(table=card_df, image_path=card_df_path)
  47. # 模型系数
  48. coef = dict(zip(train_data.x_columns, self.lr.coef_.reshape(-1)))
  49. coef_df = pd.DataFrame()
  50. coef_df['变量'] = coef.keys()
  51. coef_df['变量系数'] = coef.values()
  52. metric_value_dict["变量系数"] = MetricFucEntity(table=coef_df, table_font_size=10)
  53. # 模型ks auc
  54. train_prob = self.lr.predict_proba(train_data.get_Xdata())[:, 1]
  55. image_path_list = []
  56. train_perf = sc.perf_eva(train_y, train_prob, title="train", show_plot=True)
  57. path = self._train_config.f_get_save_path(f"train_perf.png")
  58. train_perf["pic"].savefig(path)
  59. image_path_list.append(path)
  60. train_auc = train_perf["AUC"]
  61. train_ks = train_perf["KS"]
  62. test_auc = "-"
  63. test_ks = "-"
  64. if test_data is not None:
  65. test_prob = self.lr.predict_proba(test_data.get_Xdata())[:, 1]
  66. test_y = test_data.get_Ydata()
  67. test_perf = sc.perf_eva(test_y, test_prob, title="test", show_plot=True)
  68. path = self._train_config.f_get_save_path(f"test_perf.png")
  69. test_perf["pic"].savefig(path)
  70. image_path_list.append(path)
  71. test_auc = test_perf["AUC"]
  72. test_ks = test_perf["KS"]
  73. df_auc = pd.DataFrame()
  74. df_auc["样本集"] = ["训练集", "测试集"]
  75. df_auc["AUC"] = [train_auc, test_auc]
  76. df_auc["KS"] = [train_ks, test_ks]
  77. metric_value_dict["模型结果"] = MetricFucEntity(table=df_auc, image_path=image_path_list, image_size=5,
  78. table_font_size=10)
  79. # 评分卡分箱
  80. train_data_original, score_bins = f_get_model_score_bin(train_data_original, card)
  81. train_data_gain = f_calcu_model_ks(train_data_original, y_column, sort_ascending=True)
  82. train_data_gain_path = self._train_config.f_get_save_path(f"train_data_gain.png")
  83. f_df_to_image(train_data_gain, train_data_gain_path)
  84. metric_value_dict["训练集分数分箱"] = MetricFucEntity(table=train_data_gain, image_path=train_data_gain_path)
  85. if test_data is not None:
  86. test_data_original, bins = f_get_model_score_bin(test_data_original, card, score_bins)
  87. test_data_gain = f_calcu_model_ks(test_data_original, y_column, sort_ascending=True)
  88. test_data_gain_path = self._train_config.f_get_save_path(f"test_data_gain.png")
  89. f_df_to_image(test_data_gain, test_data_gain_path)
  90. metric_value_dict["测试集分数分箱"] = MetricFucEntity(table=test_data_gain, image_path=test_data_gain_path)
  91. # 模型分psi
  92. model_psi = f_calcu_model_psi(train_data_original, test_data_original)
  93. model_psi_path = self._train_config.f_get_save_path(f"model_psi.png")
  94. f_df_to_image(model_psi, model_psi_path)
  95. metric_value_dict["模型稳定性"] = MetricFucEntity(table=model_psi, value=model_psi["psi"].sum().round(4),
  96. image_path=model_psi_path)
  97. if jupyter:
  98. from IPython import display
  99. print("-----模型结果-----")
  100. display.display(metric_value_dict["模型结果"].table)
  101. f_display_images_by_side(metric_value_dict["模型结果"].image_path, display)
  102. # 模型psi
  103. display.display(metric_value_dict["模型稳定性"].table)
  104. print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
  105. display.display(metric_value_dict["变量系数"].table)
  106. print("-----训练集-分数分箱-----")
  107. display.display(metric_value_dict["训练集分数分箱"].table)
  108. print("-----测试集-分数分箱-----")
  109. display.display(metric_value_dict["测试集分数分箱"].table)
  110. # 评分卡
  111. display.display(metric_value_dict["评分卡"].table)
  112. return metric_value_dict
  113. def predict_prob(self, x: pd.DataFrame, *args, **kwargs):
  114. return self.lr.predict_proba(x)[:, 1]
  115. def export_model_file(self):
  116. pass
# Script entry point: running this module directly intentionally does nothing.
if __name__ == "__main__":
    pass