model_lr.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author: yq
  4. @time: 2024/11/1
  5. @desc:
  6. """
  7. import os.path
  8. import pickle
  9. from os.path import dirname, realpath
  10. from typing import Dict
  11. import numpy as np
  12. import pandas as pd
  13. import scorecardpy as sc
  14. import statsmodels.api as sm
  15. from commom import f_df_to_image, f_display_images_by_side, GeneralException, f_display_title, \
  16. f_image_crop_white_borders
  17. from entitys import MetricFucResultEntity, DataSplitEntity, DataFeatureEntity
  18. from enums import ContextEnum, ResultCodesEnum, ConstantEnum
  19. from init import context
  20. from .model_base import ModelBase
  21. from .model_utils import f_stress_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi, f_add_rules
  22. class ModelLr(ModelBase):
  23. def __init__(self, *args, **kwargs):
  24. super().__init__(*args, **kwargs)
  25. # 报告模板
  26. self._template_path = os.path.join(dirname(dirname(realpath(__file__))), "./template/模型开发报告模板_lr.docx")
  27. self.lr = None
  28. self.card = None
  29. def get_report_template_path(self):
  30. return self._template_path
  31. def train(self, train_data: DataFeatureEntity, test_data: DataFeatureEntity, *args, **kwargs):
  32. woebin = context.get(ContextEnum.WOEBIN)
  33. data_x = train_data.data_x.copy()
  34. # scorecardpy高版本区分了sklearn与statsmodels,为了后面生成评分卡需要,但是又不需要截距,所以常数项置0
  35. if sc.__version__ > '0.1.9.2':
  36. data_x["const"] = [0] * len(data_x)
  37. family = sm.families.Binomial()
  38. logit = sm.GLM(train_data.data_y, data_x, family=family)
  39. self.lr = logit.fit()
  40. # scorecardpy低版本
  41. if sc.__version__ <= '0.1.9.2':
  42. self.lr.coef_ = [list(self.lr.summary2().tables[1].loc[:, 'Coef.'])]
  43. self.lr.intercept_ = [0]
  44. if len(self.lr.coef_[0]) != len(data_x.columns):
  45. raise GeneralException(ResultCodesEnum.SYSTEM_ERROR, message=f"lr模型coef系数长度与x_columns长度不一致。")
  46. self.card = sc.scorecard(woebin, self.lr, data_x.columns, points0=600, pdo=50, odds0=train_data.get_odds0())
  47. def prob(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
  48. # scorecardpy高版本
  49. if sc.__version__ > '0.1.9.2':
  50. x = x.copy()
  51. x["const"] = [0] * len(x)
  52. return np.array(self.lr.predict(x))
  53. def score(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
  54. return np.array(sc.scorecard_ply(x, self.card, print_step=0)["score"])
  55. def score_rule(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
  56. x[ConstantEnum.SCORE.value] = self.score(x)
  57. x = f_add_rules(x, self.ml_config.rules)
  58. return np.array(x[ConstantEnum.SCORE.value])
  59. def model_save(self):
  60. if self.lr is None:
  61. GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
  62. if self.card is None:
  63. GeneralException(ResultCodesEnum.NOT_FOUND, message=f"card不存在")
  64. path = self.ml_config.f_get_save_path(f"model.pkl")
  65. self.lr.save(path)
  66. print(f"model save to【{path}】success. ")
  67. df_card = pd.concat(self.card.values())
  68. path = self.ml_config.f_get_save_path(f"card.csv")
  69. df_card.to_csv(path)
  70. print(f"model save to【{path}】success. ")
  71. def model_load(self, path: str, *args, **kwargs):
  72. if not os.path.isdir(path):
  73. raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")
  74. path_model = os.path.join(path, "model.pkl")
  75. if not os.path.isfile(path_model):
  76. raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_model}】不存在")
  77. path_card = os.path.join(path, "card.csv")
  78. if not os.path.isfile(path_card):
  79. raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_card}】不存在")
  80. with open(path_model, 'rb') as f:
  81. self.lr = pickle.load(f)
  82. df_card = pd.read_csv(path_card)
  83. variables = df_card["variable"].unique().tolist()
  84. self.card = {}
  85. for variable in variables:
  86. self.card[variable] = df_card[df_card["variable"] == variable]
  87. print(f"model load from【{path}】success.")
  88. def train_report(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
  89. def _get_auc_ks(data_y, score, title):
  90. perf = sc.perf_eva(data_y, score, title=title, show_plot=True)
  91. path = self.ml_config.f_get_save_path(f"perf_{title}.png")
  92. perf["pic"].savefig(path)
  93. auc = perf["AUC"]
  94. ks = perf["KS"]
  95. f_image_crop_white_borders(path, path)
  96. return auc, ks, path
  97. def _get_perf(perf_rule=False):
  98. # 模型ks auc
  99. img_path_auc_ks = []
  100. suffix = ""
  101. if perf_rule:
  102. suffix = "-规则"
  103. train_score = self.score_rule(train_data)
  104. test_score = self.score_rule(test_data)
  105. else:
  106. train_score = self.score(train_data)
  107. test_score = self.score(test_data)
  108. train_auc, train_ks, path = _get_auc_ks(train_data[y_column], train_score, f"train{suffix}")
  109. img_path_auc_ks.append(path)
  110. test_auc, test_ks, path = _get_auc_ks(test_data[y_column], test_score, f"test{suffix}")
  111. img_path_auc_ks.append(path)
  112. df_auc_ks = pd.DataFrame()
  113. df_auc_ks["样本集"] = ["训练集", "测试集"]
  114. df_auc_ks["AUC"] = [train_auc, test_auc]
  115. df_auc_ks["KS"] = [train_ks, test_ks]
  116. metric_value_dict[f"模型结果{suffix}"] = MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks,
  117. image_size=5, table_font_size=10)
  118. # 评分卡分箱
  119. train_score_bin, score_bins = f_get_model_score_bin(train_data, train_score)
  120. train_data_gain = f_calcu_model_ks(train_score_bin, y_column, sort_ascending=True)
  121. img_path_train_gain = self.ml_config.f_get_save_path(f"train_gain{suffix}.png")
  122. f_df_to_image(train_data_gain, img_path_train_gain)
  123. metric_value_dict[f"训练集分数分箱{suffix}"] = MetricFucResultEntity(table=train_data_gain,
  124. image_path=img_path_train_gain)
  125. test_score_bin, _ = f_get_model_score_bin(test_data, test_score, score_bins)
  126. test_data_gain = f_calcu_model_ks(test_score_bin, y_column, sort_ascending=True)
  127. img_path_test_gain = self.ml_config.f_get_save_path(f"test_gain{suffix}.png")
  128. f_df_to_image(test_data_gain, img_path_test_gain)
  129. metric_value_dict[f"测试集分数分箱{suffix}"] = MetricFucResultEntity(table=test_data_gain,
  130. image_path=img_path_test_gain)
  131. # 模型分psi
  132. model_psi = f_calcu_model_psi(train_score_bin, test_score_bin)
  133. img_path_psi = self.ml_config.f_get_save_path(f"model_psi{suffix}.png")
  134. f_df_to_image(model_psi, img_path_psi)
  135. metric_value_dict[f"模型稳定性{suffix}"] = MetricFucResultEntity(table=model_psi,
  136. value=model_psi["psi"].sum().round(3),
  137. image_path=img_path_psi)
  138. return train_score_bin, test_score_bin
  139. y_column = self._ml_config.y_column
  140. stress_test = self.ml_config.stress_test
  141. stress_sample_times = self.ml_config.stress_sample_times
  142. stress_bad_rate_list = self.ml_config.stress_bad_rate_list
  143. train_data = data.train_data
  144. test_data = data.test_data
  145. metric_value_dict = {}
  146. # 评分卡
  147. df_card = pd.concat(self.card.values())
  148. img_path_card = self.ml_config.f_get_save_path(f"card.png")
  149. f_df_to_image(df_card, img_path_card)
  150. metric_value_dict["评分卡"] = MetricFucResultEntity(table=df_card, image_path=img_path_card)
  151. # 模型系数
  152. coef_table = self.lr.summary().tables[1]
  153. var_name = coef_table.data[0]
  154. var_name[0] = "var"
  155. df_coef = pd.DataFrame(columns=var_name, data=coef_table.data[1:])
  156. img_path_coef = self.ml_config.f_get_save_path(f"coef.png")
  157. f_df_to_image(df_coef, img_path_coef)
  158. metric_value_dict["变量系数"] = MetricFucResultEntity(table=df_coef, image_path=img_path_coef)
  159. _, test_score_bin = _get_perf()
  160. if len(self.ml_config.rules) != 0:
  161. _, test_score_bin = _get_perf(perf_rule=True)
  162. # 压力测试
  163. if stress_test:
  164. df_stress = f_stress_test(test_score_bin, sample_times=stress_sample_times,
  165. bad_rate_list=stress_bad_rate_list,
  166. target_column=y_column, score_column=ConstantEnum.SCORE.value)
  167. img_path_stress = self.ml_config.f_get_save_path(f"stress_test.png")
  168. f_df_to_image(df_stress, img_path_stress)
  169. metric_value_dict["压力测试"] = MetricFucResultEntity(table=df_stress, image_path=img_path_stress)
  170. if self.ml_config.jupyter_print:
  171. self.jupyter_print(metric_value_dict)
  172. return metric_value_dict
  173. def jupyter_print(self, metric_value_dict=Dict[str, MetricFucResultEntity], *args, **kwargs):
  174. from IPython import display
  175. suffix = "-规则"
  176. f_display_title(display, "模型结果")
  177. display.display(metric_value_dict["模型结果"].table)
  178. f_display_images_by_side(display, metric_value_dict["模型结果"].image_path)
  179. if len(self.ml_config.rules) != 0:
  180. print("加入规则后:")
  181. display.display(metric_value_dict[f"模型结果{suffix}"].table)
  182. f_display_images_by_side(display, metric_value_dict[f"模型结果{suffix}"].image_path)
  183. f_display_title(display, "模型变量系数")
  184. print(self.lr.summary().tables[0])
  185. display.display(metric_value_dict["变量系数"].table)
  186. # 模型psi
  187. f_display_title(display, "模型psi")
  188. display.display(metric_value_dict["模型稳定性"].table)
  189. print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
  190. if len(self.ml_config.rules) != 0:
  191. print("加入规则后:")
  192. display.display(metric_value_dict[f"模型稳定性{suffix}"].table)
  193. print(f"模型psi: {metric_value_dict[f'模型稳定性{suffix}'].value}")
  194. f_display_title(display, "分数分箱")
  195. print("训练集-分数分箱")
  196. display.display(metric_value_dict["训练集分数分箱"].table)
  197. if len(self.ml_config.rules) != 0:
  198. print("加入规则后:")
  199. print(f"训练集-分数分箱")
  200. display.display(metric_value_dict[f"训练集分数分箱{suffix}"].table)
  201. print("测试集-分数分箱")
  202. display.display(metric_value_dict["测试集分数分箱"].table)
  203. if len(self.ml_config.rules) != 0:
  204. print("加入规则后:")
  205. print(f"测试集-分数分箱")
  206. display.display(metric_value_dict[f"测试集分数分箱{suffix}"].table)
  207. # 评分卡
  208. f_display_title(display, "评分卡")
  209. if len(self.ml_config.rules) != 0:
  210. print(f"评分卡不包含规则")
  211. display.display(metric_value_dict["评分卡"].table)
  212. if "压力测试" in metric_value_dict.keys():
  213. f_display_title(display, "压力测试")
  214. display.display(metric_value_dict["压力测试"].table)
# No standalone entry point: this module is only used via the ModelLr class.
if __name__ == "__main__":
    pass