trainer.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author: yq
  4. @time: 2025/2/27
  5. @desc:
  6. """
  7. import json
  8. import math
  9. import os
  10. import re
  11. from os.path import dirname, realpath
  12. from typing import Dict, List
  13. import matplotlib.pyplot as plt
  14. import numpy as np
  15. import pandas as pd
  16. import scorecardpy as sc
  17. import torch
  18. import torch.nn as nn
  19. import torch.optim as optim
  20. from tqdm import tqdm
  21. from commom import GeneralException, f_image_crop_white_borders, f_df_to_image, f_display_title, \
  22. f_display_images_by_side
  23. from entitys import DataSplitEntity, OnlineLearningConfigEntity, MetricFucResultEntity
  24. from enums import ResultCodesEnum, ConstantEnum, ContextEnum, FileEnum
  25. from feature import f_woebin_load
  26. from init import init, context
  27. from model import f_get_model_score_bin, f_calcu_model_ks, f_stress_test, f_calcu_model_psi
  28. from monitor import ReportWord
  29. from .utils import LR
# Initialize the project-wide context/configuration before the trainer class is defined.
init()
class OnlineLearningTrainer:
    """Online-learning trainer for a WOE-encoded logistic-regression scorecard model."""

    def __init__(self, data: DataSplitEntity = None, ol_config: OnlineLearningConfigEntity = None, *args, **kwargs):
        # Use the provided config, or build one from the remaining arguments.
        if ol_config is not None:
            self._ol_config = ol_config
        else:
            self._ol_config = OnlineLearningConfigEntity(*args, **kwargs)
        self._data = data
        self._columns = None
        # Declared (annotation only, not assigned); populated by _init() below.
        self._model_original: LR
        self._model_optimized: LR
        self.sc_woebin = None
        self.card_cfg = None
        self.card = None
        # Report template (docx) shipped relative to the package root.
        self._template_path = os.path.join(dirname(dirname(realpath(__file__))),
                                           "./template/OnlineLearning报告模板_lr.docx")
        self._init(self._ol_config.path_resources)
  48. def _init(self, path: str):
  49. if not os.path.isdir(path):
  50. raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"【{path}】不是文件夹")
  51. path_coef = os.path.join(path, FileEnum.COEF.value)
  52. if not os.path.isfile(path_coef):
  53. raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型系数文件【{path_coef}】不存在")
  54. with open(path_coef, mode="r", encoding="utf-8") as f:
  55. coef = json.loads(f.read())
  56. print(f"coef load from【{path_coef}】success.")
  57. path_card_cfg = os.path.join(path, FileEnum.CARD_CFG.value)
  58. if os.path.isfile(path_card_cfg):
  59. with open(path_card_cfg, mode="r", encoding="utf-8") as f:
  60. self.card_cfg = json.loads(f.read())
  61. print(f"{FileEnum.CARD_CFG.value} load from【{path_card_cfg}】success.")
  62. self._columns = list(coef.keys())
  63. # 排个序,防止因为顺序原因导致的可能的bug
  64. self._columns.sort()
  65. weight = [coef[k] for k in self._columns]
  66. self._model_original = LR(nn.Parameter(torch.tensor(np.array(weight))))
  67. self._model_optimized = LR(nn.Parameter(torch.tensor(np.array(weight))))
  68. self._columns = [re.sub('_woe$', '', i) for i in self._columns]
  69. # 剔除常数项,因为woe编码里没有常数项
  70. self._columns_intercept_remove = self._columns.copy()
  71. if ConstantEnum.INTERCEPT.value in self._columns_intercept_remove:
  72. self._columns_intercept_remove.remove(ConstantEnum.INTERCEPT.value)
  73. # woe编码后带_woe后缀
  74. self._columns_woe = [f"{i}_woe" for i in self._columns]
  75. self.sc_woebin = f_woebin_load(path)
  76. for k in self._columns_intercept_remove:
  77. if k not in self.sc_woebin.keys():
  78. GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"模型变量【{k}】在woe特征里不存在")
  79. def _feature_generate(self, data: pd.DataFrame) -> pd.DataFrame:
  80. data_woe = sc.woebin_ply(data[self._columns_intercept_remove], self.sc_woebin, print_info=False)
  81. data_woe[f"{ConstantEnum.INTERCEPT.value}_woe"] = [1] * len(data_woe)
  82. return data_woe[self._columns_woe].to_numpy()
  83. def _f_get_best_model(self, df_param: pd.DataFrame, epoch: int = None) -> LR:
  84. if epoch is None:
  85. df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
  86. print(f"选择最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
  87. weight = list(df_param_sort.iloc[0])
  88. else:
  89. print(f"选择epoch:【{epoch}】的参数:\n{df_param[df_param['epoch'] == epoch].iloc[0].to_dict()}")
  90. weight = list(df_param[df_param["epoch"] == epoch].iloc[0])
  91. weight = nn.Parameter(torch.tensor(np.array(weight[0:-5])))
  92. return LR(weight)
  93. def _f_get_scorecard(self, ):
  94. class M:
  95. def __init__(self, ):
  96. pass
  97. m = M()
  98. m.coef_ = [self._model_optimized.linear.weight.tolist()]
  99. m.intercept_ = [0]
  100. self.card = sc.scorecard(self.sc_woebin, m, self._columns_woe, **self.card_cfg)
  101. def _f_get_metric_auc_ks(self, model_type: str):
  102. def _get_auc_ks(data, title):
  103. y = data[self._ol_config.y_column]
  104. y_prob = self.prob(data, model)
  105. perf = sc.perf_eva(y, y_prob, title=f"{title}", show_plot=True)
  106. path = self._ol_config.f_get_save_path(f"perf_{title}.png")
  107. perf["pic"].savefig(path)
  108. auc = perf["AUC"]
  109. ks = perf["KS"]
  110. f_image_crop_white_borders(path, path)
  111. return auc, ks, path
  112. train_data = self._data.train_data
  113. test_data = self._data.test_data
  114. data = self._data.data
  115. model = self._model_optimized
  116. if model_type != "新模型":
  117. model = self._model_original
  118. img_path_auc_ks = []
  119. auc, ks, path = _get_auc_ks(data, f"{model_type}-建模数据")
  120. img_path_auc_ks.append(path)
  121. train_auc, train_ks, path = _get_auc_ks(train_data, f"{model_type}-训练集")
  122. img_path_auc_ks.append(path)
  123. test_auc, test_ks, path = _get_auc_ks(test_data, f"{model_type}-测试集")
  124. img_path_auc_ks.append(path)
  125. df_auc_ks = pd.DataFrame()
  126. df_auc_ks["样本集"] = ["建模数据", "训练集", "测试集"]
  127. df_auc_ks["AUC"] = [auc, train_auc, test_auc]
  128. df_auc_ks["KS"] = [ks, train_ks, test_ks]
  129. return MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks, image_size=5, table_font_size=10)
  130. def _f_get_metric_trend(self, ):
  131. y_column = self._ol_config.y_column
  132. data = self._data.data
  133. # 建模样本变量趋势
  134. breaks_list = {}
  135. special_values = {}
  136. for column, bin in self.sc_woebin.items():
  137. breaks_list[column] = list(bin[bin["is_special_values"] == False]['breaks'])
  138. sv = list(bin[bin["is_special_values"] == True]['breaks'])
  139. if len(sv) > 0:
  140. special_values[column] = sv
  141. woebin = sc.woebin(data[self._columns_intercept_remove + [y_column]], y=y_column, breaks_list=breaks_list,
  142. special_values=special_values, print_info=False)
  143. imgs_path = []
  144. for k, df_bin in woebin.items():
  145. sc.woebin_plot(df_bin)
  146. path = self._ol_config.f_get_save_path(f"trend_{k}.png")
  147. plt.savefig(path)
  148. imgs_path.append(path)
  149. return MetricFucResultEntity(image_path=imgs_path, image_size=4)
  150. def _f_get_metric_coef(self, ):
  151. columns_anns = self._ol_config.columns_anns
  152. df = pd.DataFrame()
  153. df["变量"] = self._columns
  154. df["原变量WOE拟合系数"] = [round(i, 4) for i in self._model_original.linear.weight.tolist()]
  155. df["新变量WOE拟合系数"] = [round(i, 4) for i in self._model_optimized.linear.weight.tolist()]
  156. anns = [columns_anns.get(column, "-") for column in self._columns]
  157. df["释义"] = anns
  158. img_path_coef = self._ol_config.f_get_save_path(f"coef.png")
  159. f_df_to_image(df, img_path_coef)
  160. return MetricFucResultEntity(table=df, image_path=img_path_coef)
  161. def _f_get_metric_gain(self, model_type: str):
  162. y_column = self._ol_config.y_column
  163. data = self._data.data
  164. model = self._model_optimized
  165. if model_type != "新模型":
  166. model = self._model_original
  167. score = self.prob(data, model)
  168. score_bin, _ = f_get_model_score_bin(data, score)
  169. gain = f_calcu_model_ks(score_bin, y_column, sort_ascending=False)
  170. img_path_gain = self._ol_config.f_get_save_path(f"{model_type}-gain.png")
  171. f_df_to_image(gain, img_path_gain)
  172. return MetricFucResultEntity(table=gain, image_path=img_path_gain)
  173. def _f_get_stress_test(self, ):
  174. stress_sample_times = self._ol_config.stress_sample_times
  175. stress_bad_rate_list = self._ol_config.stress_bad_rate_list
  176. y_column = self._ol_config.y_column
  177. data = self._data.data
  178. score = self.prob(data, self._model_optimized)
  179. score_bin, _ = f_get_model_score_bin(data, score)
  180. df_stress = f_stress_test(score_bin, sample_times=stress_sample_times, bad_rate_list=stress_bad_rate_list,
  181. target_column=y_column, score_column=ConstantEnum.SCORE.value, sort_ascending=False)
  182. img_path_stress = self._ol_config.f_get_save_path(f"stress.png")
  183. f_df_to_image(df_stress, img_path_stress)
  184. return MetricFucResultEntity(table=df_stress, image_path=img_path_stress)
  185. def prob(self, x: pd.DataFrame, model=None):
  186. if model is None:
  187. model = self._model_optimized
  188. model.eval()
  189. with torch.no_grad():
  190. x = torch.tensor(self._feature_generate(x), dtype=torch.float64)
  191. y_prob = model(x)
  192. y_prob = y_prob.detach().numpy()
  193. return y_prob
  194. def score(self, x: pd.DataFrame) -> np.array:
  195. return np.array(sc.scorecard_ply(x, self.card, print_step=0)["score"])
  196. def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None) -> pd.DataFrame:
  197. y1 = self.prob(x1)
  198. y2 = self.prob(x2)
  199. x1_score_bin, score_bins = f_get_model_score_bin(x1, y1, points)
  200. x2_score_bin, _ = f_get_model_score_bin(x2, y2, score_bins)
  201. model_psi = f_calcu_model_psi(x1_score_bin, x2_score_bin, sort_ascending=False)
  202. print(f"模型psi: {model_psi['psi'].sum()}")
  203. return model_psi
    def train(self, ):
        """Fine-tune the optimized model on the train split with BCE loss.

        Records a per-epoch row (weights + test AUC/KS/losses) into a DataFrame
        and stores it in the shared context under PARAM_OPTIMIZED.
        """

        def _get_param_optimized(model: LR, epoch):
            # Evaluate *model* on the test set and return one history row.
            # Reads test_x/test_y/criterion/loss_train/df_param_columns from
            # the enclosing scope.
            model.eval()
            with torch.no_grad():
                y_prob = model(test_x)
                loss = criterion(y_prob, torch.tensor(test_y.to_numpy(), dtype=torch.float64))
                loss_test = loss.detach().item()
                y_prob = y_prob.detach().numpy()
                perf = sc.perf_eva(test_y, y_prob, show_plot=False)
                auc = perf["AUC"]
                ks = perf["KS"]
                # epoch is stored 1-based (the pre-training baseline passes -1 -> 0).
                row = model.linear.weight.tolist() + [auc, ks, epoch + 1, loss_train, loss_test]
                return dict(zip(df_param_columns, row))

        epochs = self._ol_config.epochs
        batch_size = self._ol_config.batch_size
        train_data = self._data.train_data
        test_data = self._data.test_data
        train_x = self._feature_generate(train_data)
        train_y = train_data[self._ol_config.y_column].to_numpy()
        test_x = torch.tensor(self._feature_generate(test_data), dtype=torch.float64)
        test_y = test_data[self._ol_config.y_column]
        criterion = nn.BCELoss()
        optimizer = optim.Adam(self._model_optimized.parameters(), lr=self._ol_config.lr)
        df_param_columns = self._columns + ["auc_test", "ks_test", "epoch", "loss_train", "loss_test"]
        df_param = pd.DataFrame(columns=df_param_columns)
        # Baseline row: metrics of the original model before any optimization.
        # loss_train must be defined before the first _get_param_optimized call.
        loss_train = 0
        df_param.loc[len(df_param)] = _get_param_optimized(self._model_original, -1)
        for epoch in tqdm(range(epochs)):
            data_len = len(train_x)
            for i in range(math.ceil(data_len / batch_size)):
                train_x_batch = torch.tensor(train_x[i * batch_size:(i + 1) * batch_size], dtype=torch.float64)
                train_y_batch = torch.tensor(train_y[i * batch_size:(i + 1) * batch_size], dtype=torch.float64)
                self._model_optimized.train()
                optimizer.zero_grad()
                y_prob = self._model_optimized(train_x_batch)
                loss = criterion(y_prob, train_y_batch)
                loss.backward()
                optimizer.step()
                loss_train = loss.detach().item()
            # Evaluate on the test set after each epoch.
            df_param.loc[len(df_param)] = _get_param_optimized(self._model_optimized, epoch)
        context.set(ContextEnum.PARAM_OPTIMIZED, df_param)
  247. def save(self):
  248. self._ol_config.config_save()
  249. if self.sc_woebin is None:
  250. GeneralException(ResultCodesEnum.NOT_FOUND, message=f"feature不存在")
  251. df_woebin = pd.concat(self.sc_woebin.values())
  252. path = self._ol_config.f_get_save_path(FileEnum.FEATURE.value)
  253. df_woebin.to_csv(path)
  254. print(f"feature save to【{path}】success. ")
  255. if self._model_optimized is None:
  256. GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
  257. path = self._ol_config.f_get_save_path(FileEnum.COEF.value)
  258. with open(path, mode="w", encoding="utf-8") as f:
  259. coef = dict(zip(self._columns, self._model_optimized.linear.weight.tolist()))
  260. j = json.dumps(coef, ensure_ascii=False)
  261. f.write(j)
  262. print(f"model save to【{path}】success. ")
  263. if self.card is not None:
  264. df_card = pd.concat(self.card.values())
  265. path = self._ol_config.f_get_save_path(FileEnum.CARD.value)
  266. df_card.to_csv(path)
  267. print(f"model save to【{path}】success. ")
  268. @staticmethod
  269. def load(path: str):
  270. ol_config = OnlineLearningConfigEntity.from_config(path)
  271. ol_config._path_resources = path
  272. return OnlineLearningTrainer(ol_config=ol_config)
  273. def report(self, epoch: int = None):
  274. df_param = context.get(ContextEnum.PARAM_OPTIMIZED)
  275. self._model_optimized = self._f_get_best_model(df_param, epoch)
  276. if self._ol_config.jupyter_print:
  277. from IPython import display
  278. f_display_title(display, "模型系数优化过程")
  279. display.display(df_param)
  280. metric_value_dict = {}
  281. # 评分卡
  282. if not self.card_cfg is None:
  283. self._f_get_scorecard()
  284. df_card = pd.concat(self.card.values())
  285. img_path_card = self._ol_config.f_get_save_path(f"card.png")
  286. f_df_to_image(df_card, img_path_card)
  287. metric_value_dict["评分卡"] = MetricFucResultEntity(table=df_card, image_path=img_path_card)
  288. # 样本分布
  289. metric_value_dict["样本分布"] = MetricFucResultEntity(table=self._data.get_distribution(self._ol_config.y_column),
  290. table_font_size=10, table_cell_width=3)
  291. # 模型结果对比
  292. metric_value_dict[f"模型结果-新模型"] = self._f_get_metric_auc_ks("新模型")
  293. metric_value_dict[f"模型结果-原模型"] = self._f_get_metric_auc_ks("原模型")
  294. # 变量趋势
  295. metric_value_dict["变量趋势-建模数据"] = self._f_get_metric_trend()
  296. # 模型系数对比
  297. metric_value_dict["模型系数"] = self._f_get_metric_coef()
  298. # 分数分箱
  299. metric_value_dict["分数分箱-建模数据-新模型"] = self._f_get_metric_gain("新模型")
  300. metric_value_dict["分数分箱-建模数据-原模型"] = self._f_get_metric_gain("原模型")
  301. # 压力测试
  302. if self._ol_config.stress_test:
  303. metric_value_dict["压力测试"] = self._f_get_stress_test()
  304. if self._ol_config.jupyter_print:
  305. self.jupyter_print(metric_value_dict)
  306. save_path = self._ol_config.f_get_save_path("OnlineLearning报告.docx")
  307. ReportWord.generate_report(metric_value_dict, self._template_path, save_path=save_path)
  308. print(f"模型报告文件储存路径:{save_path}")
  309. def jupyter_print(self, metric_value_dict=Dict[str, MetricFucResultEntity]):
  310. from IPython import display
  311. f_display_title(display, "样本分布")
  312. display.display(metric_value_dict["样本分布"].table)
  313. f_display_title(display, "模型结果")
  314. print(f"原模型")
  315. display.display(metric_value_dict["模型结果-原模型"].table)
  316. f_display_images_by_side(display, metric_value_dict["模型结果-原模型"].image_path)
  317. print(f"新模型")
  318. display.display(metric_value_dict["模型结果-新模型"].table)
  319. f_display_images_by_side(display, metric_value_dict["模型结果-新模型"].image_path)
  320. f_display_title(display, "模型系数")
  321. display.display(metric_value_dict["模型系数"].table)
  322. f_display_title(display, "分数分箱")
  323. print(f"建模数据上分数分箱")
  324. print(f"原模型")
  325. display.display(metric_value_dict["分数分箱-建模数据-原模型"].table)
  326. print(f"新模型")
  327. display.display(metric_value_dict["分数分箱-建模数据-新模型"].table)
  328. f_display_title(display, "变量趋势")
  329. print(f"建模数据上变量趋势")
  330. f_display_images_by_side(display, metric_value_dict["变量趋势-建模数据"].image_path)
  331. if "压力测试" in metric_value_dict.keys():
  332. f_display_title(display, "压力测试")
  333. display.display(metric_value_dict["压力测试"].table)
  334. # 评分卡
  335. if "评分卡" in metric_value_dict.keys():
  336. f_display_title(display, "评分卡")
  337. display.display(metric_value_dict["评分卡"].table)
# Module is intended to be imported; no standalone CLI behavior.
if __name__ == "__main__":
    pass