# trainer.py
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author: yq
  4. @time: 2025/2/27
  5. @desc:
  6. """
  7. import json
  8. import math
  9. import os
  10. import re
  11. from os.path import dirname, realpath
  12. from typing import Dict, List
  13. import matplotlib.pyplot as plt
  14. import numpy as np
  15. import pandas as pd
  16. import scorecardpy as sc
  17. import torch
  18. import torch.nn as nn
  19. import torch.optim as optim
  20. from tqdm import tqdm
  21. from commom import GeneralException, f_image_crop_white_borders, f_df_to_image, f_display_title, \
  22. f_display_images_by_side
  23. from entitys import DataSplitEntity, OnlineLearningConfigEntity, MetricFucResultEntity
  24. from enums import ResultCodesEnum, ConstantEnum, ContextEnum, FileEnum
  25. from feature import f_woebin_load
  26. from init import init, context
  27. from model import f_get_model_score_bin, f_calcu_model_ks, f_stress_test, f_calcu_model_psi
  28. from monitor import ReportWord
  29. from .utils import LR
  30. init()
  31. class OnlineLearningTrainer:
  32. def __init__(self, data: DataSplitEntity = None, ol_config: OnlineLearningConfigEntity = None, *args, **kwargs):
  33. if ol_config is not None:
  34. self._ol_config = ol_config
  35. else:
  36. self._ol_config = OnlineLearningConfigEntity(*args, **kwargs)
  37. self._data = data
  38. self._columns = None
  39. self._model_original: LR
  40. self._model_optimized: LR
  41. self._df_param_optimized = None
  42. self.sc_woebin = None
  43. self.card_cfg = None
  44. self.card = None
  45. # 报告模板
  46. self._template_path = os.path.join(dirname(dirname(realpath(__file__))),
  47. "./template/OnlineLearning报告模板_lr.docx")
  48. self._init(self._ol_config.path_resources)
  49. def _init(self, path: str):
  50. if not os.path.isdir(path):
  51. raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"【{path}】不是文件夹")
  52. path_coef = os.path.join(path, FileEnum.COEF.value)
  53. if not os.path.isfile(path_coef):
  54. raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型系数文件【{path_coef}】不存在")
  55. with open(path_coef, mode="r", encoding="utf-8") as f:
  56. coef = json.loads(f.read())
  57. print(f"coef load from【{path_coef}】success.")
  58. path_card_cfg = os.path.join(path, FileEnum.CARD_CFG.value)
  59. if os.path.isfile(path_card_cfg):
  60. with open(path_card_cfg, mode="r", encoding="utf-8") as f:
  61. self.card_cfg = json.loads(f.read())
  62. print(f"{FileEnum.CARD_CFG.value} load from【{path_card_cfg}】success.")
  63. self._columns = list(coef.keys())
  64. # 排个序,防止因为顺序原因导致的可能的bug
  65. self._columns.sort()
  66. weight = [coef[k] for k in self._columns]
  67. self._model_original = LR(nn.Parameter(torch.tensor(np.array(weight))))
  68. self._model_optimized = LR(nn.Parameter(torch.tensor(np.array(weight))))
  69. self._columns = [re.sub('_woe$', '', i) for i in self._columns]
  70. # 剔除常数项,因为woe编码里没有常数项
  71. self._columns_intercept_remove = self._columns.copy()
  72. if ConstantEnum.INTERCEPT.value in self._columns_intercept_remove:
  73. self._columns_intercept_remove.remove(ConstantEnum.INTERCEPT.value)
  74. # woe编码后带_woe后缀
  75. self._columns_woe = [f"{i}_woe" for i in self._columns]
  76. self.sc_woebin = f_woebin_load(path)
  77. for k in self._columns_intercept_remove:
  78. if k not in self.sc_woebin.keys():
  79. GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"模型变量【{k}】在woe特征里不存在")
  80. def _feature_generate(self, data: pd.DataFrame) -> pd.DataFrame:
  81. data_woe = sc.woebin_ply(data[self._columns_intercept_remove], self.sc_woebin, print_info=False)
  82. data_woe[f"{ConstantEnum.INTERCEPT.value}_woe"] = [1] * len(data_woe)
  83. return data_woe[self._columns_woe].to_numpy()
  84. def _f_get_best_model(self, df_param: pd.DataFrame, epoch: int = None) -> LR:
  85. if epoch is None:
  86. df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
  87. print(f"选择最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
  88. weight = list(df_param_sort.iloc[0])
  89. else:
  90. print(f"选择epoch:【{epoch}】的参数:\n{df_param[df_param['epoch'] == epoch].iloc[0].to_dict()}")
  91. weight = list(df_param[df_param["epoch"] == epoch].iloc[0])
  92. weight = nn.Parameter(torch.tensor(np.array(weight[0:-5])))
  93. return LR(weight)
  94. def _f_get_scorecard(self, ):
  95. class M:
  96. def __init__(self, ):
  97. pass
  98. m = M()
  99. m.coef_ = [self._model_optimized.linear.weight.tolist()]
  100. m.intercept_ = [0]
  101. self.card = sc.scorecard(self.sc_woebin, m, self._columns_woe, **self.card_cfg)
  102. def _f_get_metric_auc_ks(self, model_type: str):
  103. def _get_auc_ks(data, title):
  104. y = data[self._ol_config.y_column]
  105. y_prob = self.prob(data, model)
  106. perf = sc.perf_eva(y, y_prob, title=f"{title}", show_plot=True)
  107. path = self._ol_config.f_get_save_path(f"perf_{title}.png")
  108. perf["pic"].savefig(path)
  109. auc = perf["AUC"]
  110. ks = perf["KS"]
  111. f_image_crop_white_borders(path, path)
  112. return auc, ks, path
  113. train_data = self._data.train_data
  114. test_data = self._data.test_data
  115. data = self._data.data
  116. model = self._model_optimized
  117. if model_type != "新模型":
  118. model = self._model_original
  119. img_path_auc_ks = []
  120. auc, ks, path = _get_auc_ks(data, f"{model_type}-建模数据")
  121. img_path_auc_ks.append(path)
  122. train_auc, train_ks, path = _get_auc_ks(train_data, f"{model_type}-训练集")
  123. img_path_auc_ks.append(path)
  124. test_auc, test_ks, path = _get_auc_ks(test_data, f"{model_type}-测试集")
  125. img_path_auc_ks.append(path)
  126. df_auc_ks = pd.DataFrame()
  127. df_auc_ks["样本集"] = ["建模数据", "训练集", "测试集"]
  128. df_auc_ks["AUC"] = [auc, train_auc, test_auc]
  129. df_auc_ks["KS"] = [ks, train_ks, test_ks]
  130. return MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks, image_size=5, table_font_size=10)
  131. def _f_get_metric_trend(self, ):
  132. y_column = self._ol_config.y_column
  133. data = self._data.data
  134. # 建模样本变量趋势
  135. breaks_list = {}
  136. special_values = {}
  137. for column, bin in self.sc_woebin.items():
  138. breaks_list[column] = list(bin[bin["is_special_values"] == False]['breaks'])
  139. sv = list(bin[bin["is_special_values"] == True]['breaks'])
  140. if len(sv) > 0:
  141. special_values[column] = sv
  142. woebin = sc.woebin(data[self._columns_intercept_remove + [y_column]], y=y_column, breaks_list=breaks_list,
  143. special_values=special_values, print_info=False)
  144. imgs_path = []
  145. for k, df_bin in woebin.items():
  146. sc.woebin_plot(df_bin)
  147. path = self._ol_config.f_get_save_path(f"trend_{k}.png")
  148. plt.savefig(path)
  149. imgs_path.append(path)
  150. return MetricFucResultEntity(image_path=imgs_path, image_size=4)
  151. def _f_get_metric_coef(self, ):
  152. columns_anns = self._ol_config.columns_anns
  153. df = pd.DataFrame()
  154. df["变量"] = self._columns
  155. df["原变量WOE拟合系数"] = [round(i, 4) for i in self._model_original.linear.weight.tolist()]
  156. df["新变量WOE拟合系数"] = [round(i, 4) for i in self._model_optimized.linear.weight.tolist()]
  157. anns = [columns_anns.get(column, "-") for column in self._columns]
  158. df["释义"] = anns
  159. img_path_coef = self._ol_config.f_get_save_path(f"coef.png")
  160. f_df_to_image(df, img_path_coef)
  161. return MetricFucResultEntity(table=df, image_path=img_path_coef)
  162. def _f_get_metric_gain(self, model_type: str):
  163. y_column = self._ol_config.y_column
  164. data = self._data.data
  165. model = self._model_optimized
  166. if model_type != "新模型":
  167. model = self._model_original
  168. score = self.prob(data, model)
  169. score_bin, _ = f_get_model_score_bin(data, score)
  170. gain = f_calcu_model_ks(score_bin, y_column, sort_ascending=False)
  171. img_path_gain = self._ol_config.f_get_save_path(f"{model_type}-gain.png")
  172. f_df_to_image(gain, img_path_gain)
  173. return MetricFucResultEntity(table=gain, image_path=img_path_gain)
  174. def _f_get_stress_test(self, ):
  175. stress_sample_times = self._ol_config.stress_sample_times
  176. stress_bad_rate_list = self._ol_config.stress_bad_rate_list
  177. y_column = self._ol_config.y_column
  178. data = self._data.data
  179. score = self.prob(data, self._model_optimized)
  180. score_bin, _ = f_get_model_score_bin(data, score)
  181. df_stress = f_stress_test(score_bin, sample_times=stress_sample_times, bad_rate_list=stress_bad_rate_list,
  182. target_column=y_column, score_column=ConstantEnum.SCORE.value, sort_ascending=False)
  183. img_path_stress = self._ol_config.f_get_save_path(f"stress.png")
  184. f_df_to_image(df_stress, img_path_stress)
  185. return MetricFucResultEntity(table=df_stress, image_path=img_path_stress)
  186. def prob(self, x: pd.DataFrame, model=None):
  187. if model is None:
  188. model = self._model_optimized
  189. model.eval()
  190. with torch.no_grad():
  191. x = torch.tensor(self._feature_generate(x), dtype=torch.float64)
  192. y_prob = model(x)
  193. y_prob = y_prob.detach().numpy()
  194. return y_prob
  195. def score(self, x: pd.DataFrame) -> np.array:
  196. return np.array(sc.scorecard_ply(x, self.card, print_step=0)["score"])
  197. def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None) -> pd.DataFrame:
  198. y1 = self.prob(x1)
  199. y2 = self.prob(x2)
  200. x1_score_bin, score_bins = f_get_model_score_bin(x1, y1, points)
  201. x2_score_bin, _ = f_get_model_score_bin(x2, y2, score_bins)
  202. model_psi = f_calcu_model_psi(x1_score_bin, x2_score_bin, sort_ascending=False)
  203. print(f"模型psi: {model_psi['psi'].sum()}")
  204. return model_psi
  205. def train(self, ):
  206. def _get_param_optimized(model: LR, epoch):
  207. model.eval()
  208. with torch.no_grad():
  209. y_prob = model(test_x)
  210. loss = criterion(y_prob, torch.tensor(test_y.to_numpy(), dtype=torch.float64))
  211. loss_test = loss.detach().item()
  212. y_prob = y_prob.detach().numpy()
  213. perf = sc.perf_eva(test_y, y_prob, show_plot=False)
  214. auc = perf["AUC"]
  215. ks = perf["KS"]
  216. row = model.linear.weight.tolist() + [auc, ks, epoch + 1, loss_train, loss_test]
  217. return dict(zip(df_param_columns, row))
  218. epochs = self._ol_config.epochs
  219. batch_size = self._ol_config.batch_size
  220. train_data = self._data.train_data
  221. test_data = self._data.test_data
  222. train_x = self._feature_generate(train_data)
  223. train_y = train_data[self._ol_config.y_column].to_numpy()
  224. test_x = torch.tensor(self._feature_generate(test_data), dtype=torch.float64)
  225. test_y = test_data[self._ol_config.y_column]
  226. criterion = nn.BCELoss()
  227. optimizer = optim.Adam(self._model_optimized.parameters(), lr=self._ol_config.lr)
  228. df_param_columns = self._columns + ["auc_test", "ks_test", "epoch", "loss_train", "loss_test"]
  229. self._df_param_optimized = pd.DataFrame(columns=df_param_columns)
  230. # 优化前
  231. loss_train = 0
  232. self._df_param_optimized.loc[len(self._df_param_optimized)] = _get_param_optimized(self._model_original, -1)
  233. for epoch in tqdm(range(epochs)):
  234. data_len = len(train_x)
  235. for i in range(math.ceil(data_len / batch_size)):
  236. train_x_batch = torch.tensor(train_x[i * batch_size:(i + 1) * batch_size], dtype=torch.float64)
  237. train_y_batch = torch.tensor(train_y[i * batch_size:(i + 1) * batch_size], dtype=torch.float64)
  238. self._model_optimized.train()
  239. optimizer.zero_grad()
  240. y_prob = self._model_optimized(train_x_batch)
  241. loss = criterion(y_prob, train_y_batch)
  242. loss.backward()
  243. optimizer.step()
  244. loss_train = loss.detach().item()
  245. # 测试集评估
  246. self._df_param_optimized.loc[len(self._df_param_optimized)] = _get_param_optimized(self._model_optimized, epoch)
  247. def save(self):
  248. self._ol_config.config_save()
  249. if self.sc_woebin is None:
  250. GeneralException(ResultCodesEnum.NOT_FOUND, message=f"feature不存在")
  251. df_woebin = pd.concat(self.sc_woebin.values())
  252. path = self._ol_config.f_get_save_path(FileEnum.FEATURE.value)
  253. df_woebin.to_csv(path)
  254. print(f"feature save to【{path}】success. ")
  255. if self._model_optimized is None:
  256. GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
  257. path = self._ol_config.f_get_save_path(FileEnum.COEF.value)
  258. with open(path, mode="w", encoding="utf-8") as f:
  259. coef = dict(zip(self._columns, self._model_optimized.linear.weight.tolist()))
  260. j = json.dumps(coef, ensure_ascii=False)
  261. f.write(j)
  262. print(f"model save to【{path}】success. ")
  263. if self.card is not None:
  264. df_card = pd.concat(self.card.values())
  265. path = self._ol_config.f_get_save_path(FileEnum.CARD.value)
  266. df_card.to_csv(path)
  267. print(f"model save to【{path}】success. ")
  268. @staticmethod
  269. def load(path: str):
  270. ol_config = OnlineLearningConfigEntity.from_config(path)
  271. ol_config._path_resources = path
  272. return OnlineLearningTrainer(ol_config=ol_config)
  273. def report(self, epoch: int = None):
  274. self._model_optimized = self._f_get_best_model(self._df_param_optimized, epoch)
  275. if self._ol_config.jupyter_print:
  276. from IPython import display
  277. f_display_title(display, "模型系数优化过程")
  278. display.display(self._df_param_optimized)
  279. metric_value_dict = {}
  280. # 评分卡
  281. if not self.card_cfg is None:
  282. self._f_get_scorecard()
  283. df_card = pd.concat(self.card.values())
  284. img_path_card = self._ol_config.f_get_save_path(f"card.png")
  285. f_df_to_image(df_card, img_path_card)
  286. metric_value_dict["评分卡"] = MetricFucResultEntity(table=df_card, image_path=img_path_card)
  287. # 样本分布
  288. metric_value_dict["样本分布"] = MetricFucResultEntity(table=self._data.get_distribution(self._ol_config.y_column),
  289. table_font_size=10, table_cell_width=3)
  290. # 模型结果对比
  291. metric_value_dict[f"模型结果-新模型"] = self._f_get_metric_auc_ks("新模型")
  292. metric_value_dict[f"模型结果-原模型"] = self._f_get_metric_auc_ks("原模型")
  293. # 变量趋势
  294. metric_value_dict["变量趋势-建模数据"] = self._f_get_metric_trend()
  295. # 模型系数对比
  296. metric_value_dict["模型系数"] = self._f_get_metric_coef()
  297. # 分数分箱
  298. metric_value_dict["分数分箱-建模数据-新模型"] = self._f_get_metric_gain("新模型")
  299. metric_value_dict["分数分箱-建模数据-原模型"] = self._f_get_metric_gain("原模型")
  300. # 压力测试
  301. if self._ol_config.stress_test:
  302. metric_value_dict["压力测试"] = self._f_get_stress_test()
  303. if self._ol_config.jupyter_print:
  304. self.jupyter_print(metric_value_dict)
  305. save_path = self._ol_config.f_get_save_path("OnlineLearning报告.docx")
  306. ReportWord.generate_report(metric_value_dict, self._template_path, save_path=save_path)
  307. print(f"模型报告文件储存路径:{save_path}")
  308. def jupyter_print(self, metric_value_dict=Dict[str, MetricFucResultEntity]):
  309. from IPython import display
  310. f_display_title(display, "样本分布")
  311. display.display(metric_value_dict["样本分布"].table)
  312. f_display_title(display, "模型结果")
  313. print(f"原模型")
  314. display.display(metric_value_dict["模型结果-原模型"].table)
  315. f_display_images_by_side(display, metric_value_dict["模型结果-原模型"].image_path)
  316. print(f"新模型")
  317. display.display(metric_value_dict["模型结果-新模型"].table)
  318. f_display_images_by_side(display, metric_value_dict["模型结果-新模型"].image_path)
  319. f_display_title(display, "模型系数")
  320. display.display(metric_value_dict["模型系数"].table)
  321. f_display_title(display, "分数分箱")
  322. print(f"建模数据上分数分箱")
  323. print(f"原模型")
  324. display.display(metric_value_dict["分数分箱-建模数据-原模型"].table)
  325. print(f"新模型")
  326. display.display(metric_value_dict["分数分箱-建模数据-新模型"].table)
  327. f_display_title(display, "变量趋势")
  328. print(f"建模数据上变量趋势")
  329. f_display_images_by_side(display, metric_value_dict["变量趋势-建模数据"].image_path)
  330. if "压力测试" in metric_value_dict.keys():
  331. f_display_title(display, "压力测试")
  332. display.display(metric_value_dict["压力测试"].table)
  333. # 评分卡
  334. if "评分卡" in metric_value_dict.keys():
  335. f_display_title(display, "评分卡")
  336. display.display(metric_value_dict["评分卡"].table)
  337. if __name__ == "__main__":
  338. pass