# -*- coding: utf-8 -*-
"""
@author: yq
@time: 2024/11/1
@desc:
"""
  7. import json
  8. import os.path
  9. import pickle
  10. from os.path import dirname, realpath
  11. from typing import Dict
  12. import numpy as np
  13. import pandas as pd
  14. import scorecardpy as sc
  15. import statsmodels.api as sm
  16. from commom import f_df_to_image, f_display_images_by_side, GeneralException, f_display_title, \
  17. f_image_crop_white_borders
  18. from entitys import MetricFucResultEntity, DataSplitEntity, DataFeatureEntity
  19. from enums import ContextEnum, ResultCodesEnum, ConstantEnum, FileEnum
  20. from feature import f_get_var_mapping
  21. from init import context
  22. from .model_base import ModelBase
  23. from .model_utils import f_stress_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi, f_add_rules
  24. class ModelLr(ModelBase):
  25. def __init__(self, *args, **kwargs):
  26. super().__init__(*args, **kwargs)
  27. # 报告模板
  28. self._template_path = os.path.join(dirname(dirname(realpath(__file__))), "./template/模型开发报告模板_lr.docx")
  29. self.lr = None
  30. self.card = None
  31. self.card_cfg = None
  32. self.coef = None
  33. self._test_case = None
  34. def get_report_template_path(self):
  35. return self._template_path
  36. def train(self, train_data: DataFeatureEntity, test_data: DataFeatureEntity, *args, **kwargs):
  37. woebin = context.get(ContextEnum.WOEBIN)
  38. data_x = train_data.data_x.copy()
  39. # scorecardpy高版本区分了sklearn与statsmodels,为了后面生成评分卡需要,但是又不需要截距,所以常数项置0
  40. if sc.__version__ > '0.1.9.2':
  41. data_x["const"] = [0] * len(data_x)
  42. family = sm.families.Binomial()
  43. logit = sm.GLM(train_data.data_y, data_x, family=family)
  44. self.lr = logit.fit()
  45. # scorecardpy低版本
  46. if sc.__version__ <= '0.1.9.2':
  47. self.lr.coef_ = [list(self.lr.summary2().tables[1].loc[:, 'Coef.'])]
  48. self.lr.intercept_ = [0]
  49. if len(self.lr.coef_[0]) != len(data_x.columns):
  50. raise GeneralException(ResultCodesEnum.SYSTEM_ERROR, message=f"lr模型coef系数长度与x_columns长度不一致。")
  51. self.card = sc.scorecard(woebin, self.lr, data_x.columns, points0=600, pdo=50, odds0=train_data.get_odds0())
  52. self.card_cfg = {"points0": 600, "pdo": 50, "odds0": train_data.get_odds0()}
  53. coef_table = self.lr.summary2().tables[1]
  54. self.coef = dict(zip(coef_table.index, coef_table['Coef.']))
  55. def prob(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
  56. # scorecardpy高版本
  57. if sc.__version__ > '0.1.9.2':
  58. x = x.copy()
  59. x["const"] = [0] * len(x)
  60. return np.array(self.lr.predict(x))
  61. def score(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
  62. return np.array(sc.scorecard_ply(x, self.card, print_step=0)["score"])
  63. def score_rule(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
  64. x[ConstantEnum.SCORE.value] = self.score(x)
  65. x = f_add_rules(x, self.ml_config.rules)
  66. return np.array(x[ConstantEnum.SCORE.value])
  67. def model_save(self):
  68. if self.lr is None:
  69. GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
  70. if self.card is None:
  71. GeneralException(ResultCodesEnum.NOT_FOUND, message=f"card不存在")
  72. path = self.ml_config.f_get_save_path(FileEnum.MODEL.value)
  73. self.lr.save(path)
  74. print(f"model save to【{path}】success. ")
  75. path = self.ml_config.f_get_save_path(FileEnum.COEF.value)
  76. with open(path, mode="w", encoding="utf-8") as f:
  77. j = json.dumps(self.coef, ensure_ascii=False)
  78. f.write(j)
  79. print(f"model save to【{path}】success. ")
  80. df_card = pd.concat(self.card.values())
  81. path = self.ml_config.f_get_save_path(FileEnum.CARD.value)
  82. df_card.to_csv(path)
  83. print(f"model save to【{path}】success. ")
  84. path = self.ml_config.f_get_save_path(FileEnum.CARD_CFG.value)
  85. with open(path, mode="w", encoding="utf-8") as f:
  86. j = json.dumps(self.card_cfg, ensure_ascii=False)
  87. f.write(j)
  88. print(f"model save to【{path}】success. ")
  89. woebin = context.get(ContextEnum.WOEBIN)
  90. df_woebin = pd.concat(woebin.values())
  91. df_var_mapping = f_get_var_mapping(df_woebin, df_card, columns_anns=self.ml_config.columns_anns)
  92. path = self.ml_config.f_get_save_path(FileEnum.VAR_MAPPING.value)
  93. df_var_mapping.to_csv(path, encoding="utf-8")
  94. print(f"model save to【{path}】success. ")
  95. path = self.ml_config.f_get_save_path(FileEnum.TEST_CASE.value)
  96. self._test_case.to_csv(path, encoding="utf-8")
  97. print(f"test case save to【{path}】success. ")
  98. def model_load(self, path: str, *args, **kwargs):
  99. if not os.path.isdir(path):
  100. raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")
  101. path_model = os.path.join(path, FileEnum.MODEL.value)
  102. if not os.path.isfile(path_model):
  103. raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_model}】不存在")
  104. path_card = os.path.join(path, FileEnum.CARD.value)
  105. if not os.path.isfile(path_card):
  106. raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_card}】不存在")
  107. with open(path_model, 'rb') as f:
  108. self.lr = pickle.load(f)
  109. print(f"model load from【{path_model}】success.")
  110. df_card = pd.read_csv(path_card)
  111. variables = df_card["variable"].unique().tolist()
  112. self.card = {}
  113. for variable in variables:
  114. self.card[variable] = df_card[df_card["variable"] == variable]
  115. print(f"model load from【{path_card}】success.")
  116. def train_report(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
  117. def _get_auc_ks(data_y, score, title):
  118. perf = sc.perf_eva(data_y, score, title=title, show_plot=True)
  119. path = self.ml_config.f_get_save_path(f"perf_{title}.png")
  120. perf["pic"].savefig(path)
  121. auc = perf["AUC"]
  122. ks = perf["KS"]
  123. f_image_crop_white_borders(path, path)
  124. return auc, ks, path
  125. def _get_perf(perf_rule=False):
  126. # 模型ks auc
  127. img_path_auc_ks = []
  128. suffix = ""
  129. if perf_rule:
  130. suffix = "-规则"
  131. train_score = self.score_rule(train_data)
  132. test_score = self.score_rule(test_data)
  133. else:
  134. train_score = self.score(train_data)
  135. test_score = self.score(test_data)
  136. train_auc, train_ks, path = _get_auc_ks(train_data[y_column], train_score, f"train{suffix}")
  137. img_path_auc_ks.append(path)
  138. test_auc, test_ks, path = _get_auc_ks(test_data[y_column], test_score, f"test{suffix}")
  139. img_path_auc_ks.append(path)
  140. df_auc_ks = pd.DataFrame()
  141. df_auc_ks["样本集"] = ["训练集", "测试集"]
  142. df_auc_ks["AUC"] = [train_auc, test_auc]
  143. df_auc_ks["KS"] = [train_ks, test_ks]
  144. metric_value_dict[f"模型结果{suffix}"] = MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks,
  145. image_size=5, table_font_size=10)
  146. # 评分卡分箱
  147. train_score_bin, score_bins = f_get_model_score_bin(train_data, train_score)
  148. train_data_gain = f_calcu_model_ks(train_score_bin, y_column, sort_ascending=True)
  149. img_path_train_gain = self.ml_config.f_get_save_path(f"train_gain{suffix}.png")
  150. f_df_to_image(train_data_gain, img_path_train_gain)
  151. metric_value_dict[f"训练集分数分箱{suffix}"] = MetricFucResultEntity(table=train_data_gain,
  152. image_path=img_path_train_gain)
  153. test_score_bin, _ = f_get_model_score_bin(test_data, test_score, score_bins)
  154. test_data_gain = f_calcu_model_ks(test_score_bin, y_column, sort_ascending=True)
  155. img_path_test_gain = self.ml_config.f_get_save_path(f"test_gain{suffix}.png")
  156. f_df_to_image(test_data_gain, img_path_test_gain)
  157. metric_value_dict[f"测试集分数分箱{suffix}"] = MetricFucResultEntity(table=test_data_gain,
  158. image_path=img_path_test_gain)
  159. # 模型分psi
  160. model_psi = f_calcu_model_psi(train_score_bin, test_score_bin)
  161. img_path_psi = self.ml_config.f_get_save_path(f"model_psi{suffix}.png")
  162. f_df_to_image(model_psi, img_path_psi)
  163. metric_value_dict[f"模型稳定性{suffix}"] = MetricFucResultEntity(table=model_psi,
  164. value=model_psi["psi"].sum().round(3),
  165. image_path=img_path_psi)
  166. return train_score_bin, test_score_bin
  167. y_column = self._ml_config.y_column
  168. stress_test = self.ml_config.stress_test
  169. stress_sample_times = self.ml_config.stress_sample_times
  170. stress_bad_rate_list = self.ml_config.stress_bad_rate_list
  171. train_data = data.train_data
  172. test_data = data.test_data
  173. metric_value_dict = {}
  174. # 评分卡
  175. df_card = pd.concat(self.card.values())
  176. df_card.reset_index(drop=True, inplace=True)
  177. img_path_card = self.ml_config.f_get_save_path(f"card.png")
  178. f_df_to_image(df_card, img_path_card)
  179. metric_value_dict["评分卡"] = MetricFucResultEntity(table=df_card, image_path=img_path_card)
  180. # 模型系数
  181. coef_table = self.lr.summary().tables[1]
  182. var_name = coef_table.data[0]
  183. var_name[0] = "var"
  184. df_coef = pd.DataFrame(columns=var_name, data=coef_table.data[1:])
  185. img_path_coef = self.ml_config.f_get_save_path(f"coef.png")
  186. f_df_to_image(df_coef, img_path_coef)
  187. metric_value_dict["变量系数"] = MetricFucResultEntity(table=df_coef, image_path=img_path_coef)
  188. _, test_score_bin = _get_perf()
  189. if len(self.ml_config.rules) != 0:
  190. _, test_score_bin = _get_perf(perf_rule=True)
  191. # 压力测试
  192. if stress_test:
  193. df_stress = f_stress_test(test_score_bin, sample_times=stress_sample_times,
  194. bad_rate_list=stress_bad_rate_list,
  195. target_column=y_column, score_column=ConstantEnum.SCORE.value)
  196. img_path_stress = self.ml_config.f_get_save_path(f"stress_test.png")
  197. f_df_to_image(df_stress, img_path_stress)
  198. metric_value_dict["压力测试"] = MetricFucResultEntity(table=df_stress, image_path=img_path_stress)
  199. if self.ml_config.jupyter_print:
  200. self.jupyter_print(metric_value_dict)
  201. # 测试案例
  202. self._test_case = data.test_data.copy()
  203. if len(self.ml_config.rules) != 0:
  204. test_score = self.score_rule(self._test_case)
  205. else:
  206. test_score = self.score(self._test_case)
  207. self._test_case["score"] = test_score
  208. return metric_value_dict
  209. def jupyter_print(self, metric_value_dict=Dict[str, MetricFucResultEntity], *args, **kwargs):
  210. from IPython import display
  211. suffix = "-规则"
  212. f_display_title(display, "模型结果")
  213. display.display(metric_value_dict["模型结果"].table)
  214. f_display_images_by_side(display, metric_value_dict["模型结果"].image_path)
  215. if len(self.ml_config.rules) != 0:
  216. print("加入规则后:")
  217. display.display(metric_value_dict[f"模型结果{suffix}"].table)
  218. f_display_images_by_side(display, metric_value_dict[f"模型结果{suffix}"].image_path)
  219. f_display_title(display, "模型变量系数")
  220. print(self.lr.summary().tables[0])
  221. display.display(metric_value_dict["变量系数"].table)
  222. # 模型psi
  223. f_display_title(display, "模型psi")
  224. display.display(metric_value_dict["模型稳定性"].table)
  225. print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
  226. if len(self.ml_config.rules) != 0:
  227. print("加入规则后:")
  228. display.display(metric_value_dict[f"模型稳定性{suffix}"].table)
  229. print(f"模型psi: {metric_value_dict[f'模型稳定性{suffix}'].value}")
  230. f_display_title(display, "分数分箱")
  231. print("训练集-分数分箱")
  232. display.display(metric_value_dict["训练集分数分箱"].table)
  233. if len(self.ml_config.rules) != 0:
  234. print("加入规则后:")
  235. print(f"训练集-分数分箱")
  236. display.display(metric_value_dict[f"训练集分数分箱{suffix}"].table)
  237. print("测试集-分数分箱")
  238. display.display(metric_value_dict["测试集分数分箱"].table)
  239. if len(self.ml_config.rules) != 0:
  240. print("加入规则后:")
  241. print(f"测试集-分数分箱")
  242. display.display(metric_value_dict[f"测试集分数分箱{suffix}"].table)
  243. # 评分卡
  244. f_display_title(display, "评分卡")
  245. if len(self.ml_config.rules) != 0:
  246. print(f"评分卡不包含规则")
  247. display.display(metric_value_dict["评分卡"].table)
  248. if "压力测试" in metric_value_dict.keys():
  249. f_display_title(display, "压力测试")
  250. display.display(metric_value_dict["压力测试"].table)
  251. if __name__ == "__main__":
  252. pass