# -*- coding: utf-8 -*-
"""
@author: yq
@time: 2025/2/27
@desc: Online-learning trainer for a WOE/LR scorecard model: fine-tunes the
       coefficients on new data and renders an evaluation report.
"""
import json
import math
import os
import re
from os.path import dirname, realpath
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scorecardpy as sc
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

from commom import GeneralException, f_image_crop_white_borders, f_df_to_image, f_display_title, \
    f_display_images_by_side
from entitys import DataSplitEntity, OnlineLearningConfigEntity, MetricFucResultEntity
from enums import ResultCodesEnum, ConstantEnum, ContextEnum
from feature import f_woebin_load
from init import init, context
from model import f_get_model_score_bin, f_calcu_model_ks, f_stress_test, f_calcu_model_psi
from monitor import ReportWord
from .utils import LR

init()
  31. class OnlineLearningTrainer:
  32. def __init__(self, data: DataSplitEntity = None, ol_config: OnlineLearningConfigEntity = None, *args, **kwargs):
  33. if ol_config is not None:
  34. self._ol_config = ol_config
  35. else:
  36. self._ol_config = OnlineLearningConfigEntity(*args, **kwargs)
  37. self._data = data
  38. self._columns = None
  39. self._model_original: LR
  40. self._model_optimized: LR
  41. self.sc_woebin = None
  42. # 报告模板
  43. self._template_path = os.path.join(dirname(dirname(realpath(__file__))),
  44. "./template/OnlineLearning报告模板_lr.docx")
  45. self._init(self._ol_config.path_resources)
  46. def _init(self, path: str):
  47. if not os.path.isdir(path):
  48. raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"【{path}】不是文件夹")
  49. path_coef = os.path.join(path, "coef.dict")
  50. if not os.path.isfile(path_coef):
  51. raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型系数文件【{path_coef}】不存在")
  52. with open(path_coef, mode="r", encoding="utf-8") as f:
  53. coef = json.loads(f.read())
  54. print(f"coef load from【{path_coef}】success.")
  55. self._columns = list(coef.keys())
  56. # 排个序,防止因为顺序原因导致的可能的bug
  57. self._columns.sort()
  58. weight = [coef[k] for k in self._columns]
  59. self._model_original = LR(nn.Parameter(torch.tensor(np.array(weight))))
  60. self._model_optimized = LR(nn.Parameter(torch.tensor(np.array(weight))))
  61. self._columns = [re.sub('_woe$', '', i) for i in self._columns]
  62. # 剔除常数项,因为woe编码里没有常数项
  63. self._columns_intercept_remove = self._columns.copy()
  64. if ConstantEnum.INTERCEPT.value in self._columns_intercept_remove:
  65. self._columns_intercept_remove.remove(ConstantEnum.INTERCEPT.value)
  66. # woe编码后带_woe后缀
  67. self._columns_woe = [f"{i}_woe" for i in self._columns]
  68. self.sc_woebin = f_woebin_load(path)
  69. for k in self._columns_intercept_remove:
  70. if k not in self.sc_woebin.keys():
  71. GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"模型变量【{k}】在woe特征里不存在")
  72. def _feature_generate(self, data: pd.DataFrame) -> pd.DataFrame:
  73. data_woe = sc.woebin_ply(data[self._columns_intercept_remove], self.sc_woebin, print_info=False)
  74. data_woe[f"{ConstantEnum.INTERCEPT.value}_woe"] = [1] * len(data_woe)
  75. return data_woe[self._columns_woe].to_numpy()
  76. def _f_get_best_model(self, df_param: pd.DataFrame) -> LR:
  77. df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
  78. print(f"最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
  79. weight = list(df_param_sort.iloc[0])
  80. weight = nn.Parameter(torch.tensor(np.array(weight[0:-5])))
  81. return LR(weight)
  82. def _f_get_metric_auc_ks(self, model_type: str):
  83. def _get_auc_ks(data, title):
  84. y = data[self._ol_config.y_column]
  85. y_prob = self.prob(data, model)
  86. perf = sc.perf_eva(y, y_prob, title=f"{title}", show_plot=True)
  87. path = self._ol_config.f_get_save_path(f"perf_{title}.png")
  88. perf["pic"].savefig(path)
  89. auc = perf["AUC"]
  90. ks = perf["KS"]
  91. f_image_crop_white_borders(path, path)
  92. return auc, ks, path
  93. train_data = self._data.train_data
  94. test_data = self._data.test_data
  95. data = pd.concat((train_data, test_data))
  96. model = self._model_optimized
  97. if model_type != "新模型":
  98. model = self._model_original
  99. img_path_auc_ks = []
  100. auc, ks, path = _get_auc_ks(data, f"{model_type}-建模数据")
  101. img_path_auc_ks.append(path)
  102. train_auc, train_ks, path = _get_auc_ks(train_data, f"{model_type}-训练集")
  103. img_path_auc_ks.append(path)
  104. test_auc, test_ks, path = _get_auc_ks(test_data, f"{model_type}-测试集")
  105. img_path_auc_ks.append(path)
  106. df_auc_ks = pd.DataFrame()
  107. df_auc_ks["样本集"] = ["建模数据", "训练集", "测试集"]
  108. df_auc_ks["AUC"] = [auc, train_auc, test_auc]
  109. df_auc_ks["KS"] = [ks, train_ks, test_ks]
  110. return MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks, image_size=5, table_font_size=10)
  111. def _f_get_metric_trend(self, ):
  112. train_data = self._data.train_data
  113. test_data = self._data.test_data
  114. y_column = self._ol_config.y_column
  115. data = pd.concat((train_data, test_data))
  116. # 建模样本变量趋势
  117. breaks_list = {}
  118. special_values = {}
  119. for column, bin in self.sc_woebin.items():
  120. breaks_list[column] = list(bin[bin["is_special_values"] == False]['breaks'])
  121. sv = list(bin[bin["is_special_values"] == True]['breaks'])
  122. if len(sv) > 0:
  123. special_values[column] = sv
  124. woebin = sc.woebin(data[self._columns_intercept_remove + [y_column]], y=y_column, breaks_list=breaks_list,
  125. special_values=special_values, print_info=False)
  126. imgs_path = []
  127. for k, df_bin in woebin.items():
  128. sc.woebin_plot(df_bin)
  129. path = self._ol_config.f_get_save_path(f"trend_{k}.png")
  130. plt.savefig(path)
  131. imgs_path.append(path)
  132. return MetricFucResultEntity(image_path=imgs_path, image_size=4)
  133. def _f_get_metric_coef(self, ):
  134. columns_anns = self._ol_config.columns_anns
  135. df = pd.DataFrame()
  136. df["变量"] = self._columns
  137. df["原变量WOE拟合系数"] = [round(i, 4) for i in self._model_original.linear.weight.tolist()]
  138. df["新变量WOE拟合系数"] = [round(i, 4) for i in self._model_optimized.linear.weight.tolist()]
  139. anns = [columns_anns.get(column, "-") for column in self._columns]
  140. df["释义"] = anns
  141. img_path_coef = self._ol_config.f_get_save_path(f"coef.png")
  142. f_df_to_image(df, img_path_coef)
  143. return MetricFucResultEntity(table=df, image_path=img_path_coef)
  144. def _f_get_metric_gain(self, model_type: str):
  145. train_data = self._data.train_data
  146. test_data = self._data.test_data
  147. y_column = self._ol_config.y_column
  148. data = pd.concat((train_data, test_data))
  149. model = self._model_optimized
  150. if model_type != "新模型":
  151. model = self._model_original
  152. score = self.prob(data, model)
  153. score_bin, _ = f_get_model_score_bin(data, score)
  154. gain = f_calcu_model_ks(score_bin, y_column, sort_ascending=False)
  155. img_path_gain = self._ol_config.f_get_save_path(f"{model_type}-gain.png")
  156. f_df_to_image(gain, img_path_gain)
  157. return MetricFucResultEntity(table=gain, image_path=img_path_gain)
  158. def _f_get_stress_test(self, ):
  159. stress_sample_times = self._ol_config.stress_sample_times
  160. stress_bad_rate_list = self._ol_config.stress_bad_rate_list
  161. train_data = self._data.train_data
  162. test_data = self._data.test_data
  163. y_column = self._ol_config.y_column
  164. data = pd.concat((train_data, test_data))
  165. score = self.prob(data, self._model_optimized)
  166. score_bin, _ = f_get_model_score_bin(data, score)
  167. df_stress = f_stress_test(score_bin, sample_times=stress_sample_times, bad_rate_list=stress_bad_rate_list,
  168. target_column=y_column, score_column=ConstantEnum.SCORE.value, sort_ascending=False)
  169. img_path_stress = self._ol_config.f_get_save_path(f"stress.png")
  170. f_df_to_image(df_stress, img_path_stress)
  171. return MetricFucResultEntity(table=df_stress, image_path=img_path_stress)
  172. def prob(self, x: pd.DataFrame, model=None):
  173. if model is None:
  174. model = self._model_optimized
  175. model.eval()
  176. with torch.no_grad():
  177. x = torch.tensor(self._feature_generate(x), dtype=torch.float64)
  178. y_prob = model(x)
  179. y_prob = y_prob.detach().numpy()
  180. return y_prob
  181. def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None) -> pd.DataFrame:
  182. y1 = self.prob(x1)
  183. y2 = self.prob(x2)
  184. x1_score_bin, score_bins = f_get_model_score_bin(x1, y1, points)
  185. x2_score_bin, _ = f_get_model_score_bin(x2, y2, score_bins)
  186. model_psi = f_calcu_model_psi(x1_score_bin, x2_score_bin, sort_ascending=False)
  187. print(f"模型psi: {model_psi['psi'].sum()}")
  188. return model_psi
  189. def train(self, ):
  190. epochs = self._ol_config.epochs
  191. batch_size = self._ol_config.batch_size
  192. train_data = self._data.train_data
  193. test_data = self._data.test_data
  194. train_x = self._feature_generate(train_data)
  195. train_y = train_data[self._ol_config.y_column].to_numpy()
  196. test_x = torch.tensor(self._feature_generate(test_data), dtype=torch.float64)
  197. test_y = test_data[self._ol_config.y_column]
  198. criterion = nn.BCELoss()
  199. optimizer = optim.Adam(self._model_optimized.parameters(), lr=self._ol_config.lr)
  200. df_param_columns = self._columns + ["auc_test", "ks_test", "epoch", "loss_train", "loss_test"]
  201. df_param = pd.DataFrame(columns=df_param_columns)
  202. for epoch in tqdm(range(epochs)):
  203. data_len = len(train_x)
  204. loss_train = 0
  205. for i in range(math.ceil(data_len / batch_size)):
  206. train_x_batch = torch.tensor(train_x[i * batch_size:(i + 1) * batch_size], dtype=torch.float64)
  207. train_y_batch = torch.tensor(train_y[i * batch_size:(i + 1) * batch_size], dtype=torch.float64)
  208. self._model_optimized.train()
  209. optimizer.zero_grad()
  210. y_prob = self._model_optimized(train_x_batch)
  211. loss = criterion(y_prob, train_y_batch)
  212. loss.backward()
  213. optimizer.step()
  214. loss_train = loss.detach().item()
  215. # 测试集评估
  216. self._model_optimized.eval()
  217. with torch.no_grad():
  218. y_prob = self._model_optimized(test_x)
  219. loss = criterion(y_prob, torch.tensor(test_y.to_numpy(), dtype=torch.float64))
  220. loss_test = loss.detach().item()
  221. y_prob = y_prob.detach().numpy()
  222. perf = sc.perf_eva(test_y, y_prob, show_plot=False)
  223. auc = perf["AUC"]
  224. ks = perf["KS"]
  225. row = self._model_optimized.linear.weight.tolist() + [auc, ks, epoch + 1, loss_train, loss_test]
  226. df_param.loc[len(df_param)] = dict(zip(df_param_columns, row))
  227. # print(f"epoch:{epoch + 1} auc:{auc} ks:{ks}")
  228. self._model_optimized = self._f_get_best_model(df_param)
  229. context.set(ContextEnum.PARAM_OPTIMIZED, df_param)
  230. def save(self):
  231. self._ol_config.config_save()
  232. if self.sc_woebin is None:
  233. GeneralException(ResultCodesEnum.NOT_FOUND, message=f"feature不存在")
  234. df_woebin = pd.concat(self.sc_woebin.values())
  235. path = self._ol_config.f_get_save_path(f"feature.csv")
  236. df_woebin.to_csv(path)
  237. print(f"feature save to【{path}】success. ")
  238. if self._model_optimized is None:
  239. GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
  240. path = self._ol_config.f_get_save_path("coef.dict")
  241. with open(path, mode="w", encoding="utf-8") as f:
  242. coef = dict(zip(self._columns, self._model_optimized.linear.weight.tolist()))
  243. j = json.dumps(coef, ensure_ascii=False)
  244. f.write(j)
  245. print(f"model save to【{path}】success. ")
  246. @staticmethod
  247. def load(path: str):
  248. ol_config = OnlineLearningConfigEntity.from_config(path)
  249. ol_config._path_resources = path
  250. return OnlineLearningTrainer(ol_config=ol_config)
  251. def report(self, ):
  252. metric_value_dict = {}
  253. # 样本分布
  254. metric_value_dict["样本分布"] = MetricFucResultEntity(table=self._data.get_distribution(self._ol_config.y_column),
  255. table_font_size=10, table_cell_width=3)
  256. # 模型结果对比
  257. metric_value_dict[f"模型结果-新模型"] = self._f_get_metric_auc_ks("新模型")
  258. metric_value_dict[f"模型结果-原模型"] = self._f_get_metric_auc_ks("原模型")
  259. # 变量趋势
  260. metric_value_dict["变量趋势-建模数据"] = self._f_get_metric_trend()
  261. # 模型系数对比
  262. metric_value_dict["模型系数"] = self._f_get_metric_coef()
  263. # 分数分箱
  264. metric_value_dict["分数分箱-建模数据-新模型"] = self._f_get_metric_gain("新模型")
  265. metric_value_dict["分数分箱-建模数据-原模型"] = self._f_get_metric_gain("原模型")
  266. # 压力测试
  267. if self._ol_config.stress_test:
  268. metric_value_dict["压力测试"] = self._f_get_stress_test()
  269. if self._ol_config.jupyter_print:
  270. self.jupyter_print(metric_value_dict)
  271. save_path = self._ol_config.f_get_save_path("OnlineLearning报告.docx")
  272. ReportWord.generate_report(metric_value_dict, self._template_path, save_path=save_path)
  273. print(f"模型报告文件储存路径:{save_path}")
  274. def jupyter_print(self, metric_value_dict=Dict[str, MetricFucResultEntity]):
  275. from IPython import display
  276. df_param = context.get(ContextEnum.PARAM_OPTIMIZED)
  277. f_display_title(display, "样本分布")
  278. display.display(metric_value_dict["样本分布"].table)
  279. f_display_title(display, "模型结果")
  280. print(f"原模型")
  281. display.display(metric_value_dict["模型结果-原模型"].table)
  282. f_display_images_by_side(display, metric_value_dict["模型结果-原模型"].image_path)
  283. print(f"新模型")
  284. display.display(metric_value_dict["模型结果-新模型"].table)
  285. f_display_images_by_side(display, metric_value_dict["模型结果-新模型"].image_path)
  286. f_display_title(display, "模型系数")
  287. display.display(metric_value_dict["模型系数"].table)
  288. f_display_title(display, "分数分箱")
  289. print(f"建模数据上分数分箱")
  290. print(f"原模型")
  291. display.display(metric_value_dict["分数分箱-建模数据-原模型"].table)
  292. print(f"新模型")
  293. display.display(metric_value_dict["分数分箱-建模数据-新模型"].table)
  294. f_display_title(display, "变量趋势")
  295. print(f"建模数据上变量趋势")
  296. f_display_images_by_side(display, metric_value_dict["变量趋势-建模数据"].image_path)
  297. if "压力测试" in metric_value_dict.keys():
  298. f_display_title(display, "压力测试")
  299. display.display(metric_value_dict["压力测试"].table)
  300. f_display_title(display, "系数优化过程")
  301. display.display(df_param)
  302. if __name__ == "__main__":
  303. pass