model_lr.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author: yq
  4. @time: 2024/11/1
  5. @desc:
  6. """
  7. import os.path
  8. from os.path import dirname, realpath
  9. from typing import Dict
  10. import pandas as pd
  11. import scorecardpy as sc
  12. from sklearn.linear_model import LogisticRegression
  13. from commom import f_df_to_image
  14. from entitys import DataPreparedEntity, MetricFucEntity, DataSplitEntity
  15. from feature import f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
  16. from .model_base import ModelBase
  17. class ModelLr(ModelBase):
  18. def __init__(self, *args, **kwargs):
  19. super().__init__(*args, **kwargs)
  20. # 报告模板
  21. self._template_path = os.path.join(dirname(dirname(realpath(__file__))), "./template/模型开发报告模板_lr.docx")
  22. self.lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
  23. def get_template_path(self):
  24. return self._template_path
  25. def train(self, data: DataPreparedEntity, *args, **kwargs) -> Dict[str, MetricFucEntity]:
  26. bins = kwargs["bins"]
  27. data_split_original: DataSplitEntity = kwargs["data_split_original"]
  28. # woe编码之前的数据
  29. train_data_original = data_split_original.train_data
  30. test_data_original = data_split_original.test_data
  31. train_data = data.train_data
  32. train_y = train_data.get_Ydata()
  33. y_column = train_data.y_column
  34. test_data = data.test_data
  35. self.lr.fit(train_data.get_Xdata(), train_y)
  36. metric_value_dict = {}
  37. # 评分卡
  38. card: Dict = sc.scorecard(bins, self.lr, train_data.x_columns, points0=600, odds0=train_data.get_odds0(),
  39. pdo=50)
  40. card_df = pd.DataFrame(columns=card['basepoints'].keys())
  41. for k, v in card.items():
  42. card_df = pd.concat((card_df, v))
  43. card_df_path = self._train_config.f_get_save_path(f"card_df.png")
  44. f_df_to_image(card_df, card_df_path)
  45. metric_value_dict["评分卡"] = MetricFucEntity(image_path=card_df_path)
  46. # 模型系数
  47. coef = dict(zip(train_data.x_columns, self.lr.coef_.reshape(-1)))
  48. coef_df = pd.DataFrame()
  49. coef_df['变量'] = coef.keys()
  50. coef_df['变量系数'] = coef.values()
  51. metric_value_dict["变量系数"] = MetricFucEntity(table=coef_df, table_font_size=10)
  52. # 模型ks auc
  53. train_prob = self.lr.predict_proba(train_data.get_Xdata())[:, 1]
  54. image_path_list = []
  55. train_perf = sc.perf_eva(train_y, train_prob, title="train", show_plot=True)
  56. path = self._train_config.f_get_save_path(f"train_perf.png")
  57. train_perf["pic"].savefig(path)
  58. image_path_list.append(path)
  59. train_auc = train_perf["AUC"]
  60. train_ks = train_perf["KS"]
  61. test_auc = "-"
  62. test_ks = "-"
  63. if test_data is not None:
  64. test_prob = self.lr.predict_proba(test_data.get_Xdata())[:, 1]
  65. test_y = test_data.get_Ydata()
  66. test_perf = sc.perf_eva(test_y, test_prob, title="test", show_plot=True)
  67. path = self._train_config.f_get_save_path(f"test_perf.png")
  68. test_perf["pic"].savefig(path)
  69. image_path_list.append(path)
  70. test_auc = test_perf["AUC"]
  71. test_ks = test_perf["KS"]
  72. df_auc = pd.DataFrame()
  73. df_auc["样本集"] = ["训练集", "测试集"]
  74. df_auc["AUC"] = [train_auc, test_auc]
  75. df_auc["KS"] = [train_ks, test_ks]
  76. metric_value_dict["模型结果"] = MetricFucEntity(table=df_auc, image_path=image_path_list, image_size=5,
  77. table_font_size=10)
  78. # 评分卡分箱
  79. train_data_original, score_bins = f_get_model_score_bin(train_data_original, card)
  80. train_data_gain = f_calcu_model_ks(train_data_original, y_column, sort_ascending=True)
  81. train_data_gain_path = self._train_config.f_get_save_path(f"train_data_gain.png")
  82. f_df_to_image(train_data_gain, train_data_gain_path)
  83. metric_value_dict["训练集分数分箱"] = MetricFucEntity(image_path=train_data_gain_path)
  84. if test_data is not None:
  85. test_data_original, bins = f_get_model_score_bin(test_data_original, card, score_bins)
  86. test_data_gain = f_calcu_model_ks(test_data_original, y_column, sort_ascending=True)
  87. test_data_gain_path = self._train_config.f_get_save_path(f"test_data_gain.png")
  88. f_df_to_image(test_data_gain, test_data_gain_path)
  89. metric_value_dict["测试集分数分箱"] = MetricFucEntity(image_path=test_data_gain_path)
  90. # 模型分psi
  91. model_psi = f_calcu_model_psi(train_data_original, test_data_original)
  92. model_psi_path = self._train_config.f_get_save_path(f"model_psi.png")
  93. f_df_to_image(model_psi, model_psi_path)
  94. metric_value_dict["模型稳定性"] = MetricFucEntity(value=model_psi["psi"].sum().round(4), image_path=model_psi_path)
  95. return metric_value_dict
  96. def predict_prob(self, x: pd.DataFrame, *args, **kwargs):
  97. return self.lr.predict_proba(x)[:, 1]
  98. def export_model_file(self):
  99. pass
  100. if __name__ == "__main__":
  101. pass