model_lr.py
# -*- coding: utf-8 -*-
"""
@author: yq
@time: 2024/11/1
@desc:
"""
import json
import os.path
import pickle
from os.path import dirname, realpath
from typing import Dict

import numpy as np
import pandas as pd
import scorecardpy as sc
import statsmodels.api as sm

from commom import f_df_to_image, f_display_images_by_side, GeneralException, f_display_title, \
    f_image_crop_white_borders
from entitys import MetricFucResultEntity, DataSplitEntity, DataFeatureEntity
from enums import ContextEnum, ResultCodesEnum, ConstantEnum, FileEnum
from init import context

from .model_base import ModelBase
from .model_utils import f_stress_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi, f_add_rules


class ModelLr(ModelBase):
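    """Logistic-regression scorecard model.

    Fits a binomial GLM (statsmodels) on WOE-encoded features, builds a
    scorecardpy scorecard from the fitted coefficients, and provides
    scoring, persistence and report generation for the training result.
    """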
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Report template
        self._template_path = os.path.join(dirname(dirname(realpath(__file__))), "./template/模型开发报告模板_lr.docx")
        self.lr = None
        self.card = None
        self.card_cfg = None
        self.coef = None

    def get_report_template_path(self):
        return self._template_path

    def train(self, train_data: DataFeatureEntity, test_data: DataFeatureEntity, *args, **kwargs):
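        """Fit the logistic regression and build the scorecard.

        WOE bins are read from the shared context; a binomial GLM is fitted
        without an effective intercept, and the scorecard is built with
        points0=600, pdo=50 and odds0 taken from the training data.
        """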
        woebin = context.get(ContextEnum.WOEBIN)
        data_x = train_data.data_x.copy()
        # Newer scorecardpy versions distinguish sklearn models from statsmodels models.
        # The statsmodels fit is needed later to build the scorecard, but no intercept is
        # wanted, so the constant column is fixed at 0.
        if sc.__version__ > '0.1.9.2':
            data_x["const"] = [0] * len(data_x)
        family = sm.families.Binomial()
        logit = sm.GLM(train_data.data_y, data_x, family=family)
        self.lr = logit.fit()
        # Older scorecardpy versions expect sklearn-style coef_/intercept_ attributes
        if sc.__version__ <= '0.1.9.2':
            self.lr.coef_ = [list(self.lr.summary2().tables[1].loc[:, 'Coef.'])]
            self.lr.intercept_ = [0]
            if len(self.lr.coef_[0]) != len(data_x.columns):
                raise GeneralException(ResultCodesEnum.SYSTEM_ERROR, message=f"lr模型coef系数长度与x_columns长度不一致。")
        self.card = sc.scorecard(woebin, self.lr, data_x.columns, points0=600, pdo=50, odds0=train_data.get_odds0())
        self.card_cfg = {"points0": 600, "pdo": 50, "odds0": train_data.get_odds0()}
        coef_table = self.lr.summary2().tables[1]
        self.coef = dict(zip(coef_table.index, coef_table['Coef.']))

    def prob(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
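        """Return the predicted probability of the target class for x."""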
        # Newer scorecardpy versions: keep the zero constant column used at fit time
        if sc.__version__ > '0.1.9.2':
            x = x.copy()
            x["const"] = [0] * len(x)
        return np.array(self.lr.predict(x))

    def score(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
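        """Convert raw features to scorecard points via sc.scorecard_ply."""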
        return np.array(sc.scorecard_ply(x, self.card, print_step=0)["score"])

    def score_rule(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
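        """Score x and then apply the configured rules; note that the score
        column is written onto x in place before the rules are applied."""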
        x[ConstantEnum.SCORE.value] = self.score(x)
        x = f_add_rules(x, self.ml_config.rules)
        return np.array(x[ConstantEnum.SCORE.value])

    def model_save(self):
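        """Persist the fitted model, coefficients, scorecard and scorecard config."""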
        if self.lr is None:
            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
        if self.card is None:
            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"card不存在")
        path = self.ml_config.f_get_save_path(FileEnum.MODEL.value)
        self.lr.save(path)
        print(f"model save to【{path}】success. ")
        path = self.ml_config.f_get_save_path(FileEnum.COEF.value)
        with open(path, mode="w", encoding="utf-8") as f:
            j = json.dumps(self.coef, ensure_ascii=False)
            f.write(j)
        print(f"model save to【{path}】success. ")
        df_card = pd.concat(self.card.values())
        path = self.ml_config.f_get_save_path(FileEnum.CARD.value)
        df_card.to_csv(path)
        print(f"model save to【{path}】success. ")
        path = self.ml_config.f_get_save_path(FileEnum.CARD_CFG.value)
        with open(path, mode="w", encoding="utf-8") as f:
            j = json.dumps(self.card_cfg, ensure_ascii=False)
            f.write(j)
        print(f"model save to【{path}】success. ")

    def model_load(self, path: str, *args, **kwargs):
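        """Load the pickled model and the scorecard CSV from a save directory."""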
        if not os.path.isdir(path):
            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")
        path_model = os.path.join(path, FileEnum.MODEL.value)
        if not os.path.isfile(path_model):
            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_model}】不存在")
        path_card = os.path.join(path, FileEnum.CARD.value)
        if not os.path.isfile(path_card):
            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_card}】不存在")
        with open(path_model, 'rb') as f:
            self.lr = pickle.load(f)
        print(f"model load from【{path_model}】success.")
        df_card = pd.read_csv(path_card)
        variables = df_card["variable"].unique().tolist()
        self.card = {}
        for variable in variables:
            self.card[variable] = df_card[df_card["variable"] == variable]
        print(f"model load from【{path_card}】success.")

    def train_report(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
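        """Build the report metrics: scorecard, coefficients, AUC/KS, score bins,
        PSI and, optionally, rule-adjusted results and a stress test."""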

        def _get_auc_ks(data_y, score, title):
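            """Plot and save the perf_eva chart, returning (auc, ks, image_path)."""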
            perf = sc.perf_eva(data_y, score, title=title, show_plot=True)
            path = self.ml_config.f_get_save_path(f"perf_{title}.png")
            perf["pic"].savefig(path)
            auc = perf["AUC"]
            ks = perf["KS"]
            f_image_crop_white_borders(path, path)
            return auc, ks, path

        def _get_perf(perf_rule=False):
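            """Fill metric_value_dict with AUC/KS, score-bin gains and PSI,
            either for raw scores or (perf_rule=True) rule-adjusted scores."""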
            # Model KS / AUC
            img_path_auc_ks = []
            suffix = ""
            if perf_rule:
                suffix = "-规则"
                train_score = self.score_rule(train_data)
                test_score = self.score_rule(test_data)
            else:
                train_score = self.score(train_data)
                test_score = self.score(test_data)
            train_auc, train_ks, path = _get_auc_ks(train_data[y_column], train_score, f"train{suffix}")
            img_path_auc_ks.append(path)
            test_auc, test_ks, path = _get_auc_ks(test_data[y_column], test_score, f"test{suffix}")
            img_path_auc_ks.append(path)
            df_auc_ks = pd.DataFrame()
            df_auc_ks["样本集"] = ["训练集", "测试集"]
            df_auc_ks["AUC"] = [train_auc, test_auc]
            df_auc_ks["KS"] = [train_ks, test_ks]
            metric_value_dict[f"模型结果{suffix}"] = MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks,
                                                                       image_size=5, table_font_size=10)
            # Score bins of the scorecard
            train_score_bin, score_bins = f_get_model_score_bin(train_data, train_score)
            train_data_gain = f_calcu_model_ks(train_score_bin, y_column, sort_ascending=True)
            img_path_train_gain = self.ml_config.f_get_save_path(f"train_gain{suffix}.png")
            f_df_to_image(train_data_gain, img_path_train_gain)
            metric_value_dict[f"训练集分数分箱{suffix}"] = MetricFucResultEntity(table=train_data_gain,
                                                                          image_path=img_path_train_gain)
            test_score_bin, _ = f_get_model_score_bin(test_data, test_score, score_bins)
            test_data_gain = f_calcu_model_ks(test_score_bin, y_column, sort_ascending=True)
            img_path_test_gain = self.ml_config.f_get_save_path(f"test_gain{suffix}.png")
            f_df_to_image(test_data_gain, img_path_test_gain)
            metric_value_dict[f"测试集分数分箱{suffix}"] = MetricFucResultEntity(table=test_data_gain,
                                                                          image_path=img_path_test_gain)
            # PSI of the model score
            model_psi = f_calcu_model_psi(train_score_bin, test_score_bin)
            img_path_psi = self.ml_config.f_get_save_path(f"model_psi{suffix}.png")
            f_df_to_image(model_psi, img_path_psi)
            metric_value_dict[f"模型稳定性{suffix}"] = MetricFucResultEntity(table=model_psi,
                                                                        value=model_psi["psi"].sum().round(3),
                                                                        image_path=img_path_psi)
            return train_score_bin, test_score_bin
        y_column = self._ml_config.y_column
        stress_test = self.ml_config.stress_test
        stress_sample_times = self.ml_config.stress_sample_times
        stress_bad_rate_list = self.ml_config.stress_bad_rate_list
        train_data = data.train_data
        test_data = data.test_data
        metric_value_dict = {}

        # Scorecard
        df_card = pd.concat(self.card.values())
        img_path_card = self.ml_config.f_get_save_path(f"card.png")
        f_df_to_image(df_card, img_path_card)
        metric_value_dict["评分卡"] = MetricFucResultEntity(table=df_card, image_path=img_path_card)

        # Model coefficients
        coef_table = self.lr.summary().tables[1]
        var_name = coef_table.data[0]
        var_name[0] = "var"
        df_coef = pd.DataFrame(columns=var_name, data=coef_table.data[1:])
        img_path_coef = self.ml_config.f_get_save_path(f"coef.png")
        f_df_to_image(df_coef, img_path_coef)
        metric_value_dict["变量系数"] = MetricFucResultEntity(table=df_coef, image_path=img_path_coef)

        _, test_score_bin = _get_perf()
        if len(self.ml_config.rules) != 0:
            _, test_score_bin = _get_perf(perf_rule=True)

        # Stress test
        if stress_test:
            df_stress = f_stress_test(test_score_bin, sample_times=stress_sample_times,
                                      bad_rate_list=stress_bad_rate_list,
                                      target_column=y_column, score_column=ConstantEnum.SCORE.value)
            img_path_stress = self.ml_config.f_get_save_path(f"stress_test.png")
            f_df_to_image(df_stress, img_path_stress)
            metric_value_dict["压力测试"] = MetricFucResultEntity(table=df_stress, image_path=img_path_stress)

        if self.ml_config.jupyter_print:
            self.jupyter_print(metric_value_dict)
        return metric_value_dict

    def jupyter_print(self, metric_value_dict: Dict[str, MetricFucResultEntity], *args, **kwargs):
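        """Display the report tables and images inside a Jupyter notebook."""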
        from IPython import display
        suffix = "-规则"

        f_display_title(display, "模型结果")
        display.display(metric_value_dict["模型结果"].table)
        f_display_images_by_side(display, metric_value_dict["模型结果"].image_path)
        if len(self.ml_config.rules) != 0:
            print("加入规则后:")
            display.display(metric_value_dict[f"模型结果{suffix}"].table)
            f_display_images_by_side(display, metric_value_dict[f"模型结果{suffix}"].image_path)

        f_display_title(display, "模型变量系数")
        print(self.lr.summary().tables[0])
        display.display(metric_value_dict["变量系数"].table)

        # Model PSI
        f_display_title(display, "模型psi")
        display.display(metric_value_dict["模型稳定性"].table)
        print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
        if len(self.ml_config.rules) != 0:
            print("加入规则后:")
            display.display(metric_value_dict[f"模型稳定性{suffix}"].table)
            print(f"模型psi: {metric_value_dict[f'模型稳定性{suffix}'].value}")

        f_display_title(display, "分数分箱")
        print("训练集-分数分箱")
        display.display(metric_value_dict["训练集分数分箱"].table)
        if len(self.ml_config.rules) != 0:
            print("加入规则后:")
            print(f"训练集-分数分箱")
            display.display(metric_value_dict[f"训练集分数分箱{suffix}"].table)
        print("测试集-分数分箱")
        display.display(metric_value_dict["测试集分数分箱"].table)
        if len(self.ml_config.rules) != 0:
            print("加入规则后:")
            print(f"测试集-分数分箱")
            display.display(metric_value_dict[f"测试集分数分箱{suffix}"].table)

        # Scorecard
        f_display_title(display, "评分卡")
        if len(self.ml_config.rules) != 0:
            print(f"评分卡不包含规则")
        display.display(metric_value_dict["评分卡"].table)

        if "压力测试" in metric_value_dict.keys():
            f_display_title(display, "压力测试")
            display.display(metric_value_dict["压力测试"].table)


if __name__ == "__main__":
    pass
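    # Illustrative usage sketch only. How the ml_config, DataFeatureEntity and
    # DataSplitEntity objects are constructed is defined elsewhere in the project
    # and is assumed here, so the calls below are kept as comments.
    #
    #   model = ModelLr(ml_config)                  # hypothetical: project config object
    #   model.train(train_data, test_data)          # fit the GLM and build the scorecard
    #   report = model.train_report(data_split)     # DataSplitEntity with train/test data
    #   model.model_save()                          # writes model / coef / card / card_cfg files
    #   model.model_load("<save_dir>")              # reload artifacts from a save directory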