# trainer_xgb.py
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author: yq
  4. @time: 2025/2/27
  5. @desc:
  6. """
  7. import os
  8. from os.path import dirname, realpath
  9. from typing import Dict, List
  10. import joblib
  11. import pandas as pd
  12. import scorecardpy as sc
  13. import xgboost as xgb
  14. from pypmml import Model
  15. from sklearn2pmml import PMMLPipeline, sklearn2pmml
  16. from tqdm import tqdm
  17. from commom import GeneralException, f_image_crop_white_borders, f_df_to_image, f_display_title, \
  18. f_display_images_by_side, silent_print, df_print_nolimit
  19. from entitys import DataSplitEntity, OnlineLearningConfigEntity, MetricFucResultEntity
  20. from enums import ResultCodesEnum, ConstantEnum, FileEnum
  21. from init import init
  22. from model import f_get_model_score_bin, f_calcu_model_ks, f_stress_test, f_calcu_model_psi, Xtransformer_fit, \
  23. Xtransform, fit
  24. init()
  25. class OnlineLearningTrainerXgb:
  26. def __init__(self, data: DataSplitEntity = None, ol_config: OnlineLearningConfigEntity = None, *args, **kwargs):
  27. # 覆写方法
  28. PMMLPipeline.Xtransformer_fit = Xtransformer_fit
  29. PMMLPipeline.Xtransform = Xtransform
  30. PMMLPipeline.fit = fit
  31. if ol_config is not None:
  32. self._ol_config = ol_config
  33. else:
  34. self._ol_config = OnlineLearningConfigEntity(*args, **kwargs)
  35. self._data = data
  36. self._df_param_optimized = None
  37. self._model_optimized_list = []
  38. self._pipeline_original: PMMLPipeline
  39. self._pipeline_optimized: PMMLPipeline
  40. self.model_optimized: xgb.XGBClassifier
  41. # 报告模板
  42. self._template_path = os.path.join(dirname(dirname(realpath(__file__))),
  43. "./template/OnlineLearning报告模板_xgb.docx")
  44. self._init(self._ol_config.path_resources)
  45. def _init(self, path: str):
  46. if not os.path.isdir(path):
  47. raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"【{path}】不是文件夹")
  48. path_model = os.path.join(path, FileEnum.PIPELINE_XGB.value)
  49. if not os.path.isfile(path_model):
  50. raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_model}】不存在")
  51. self._pipeline_original = joblib.load(path_model)
  52. self._pipeline_optimized = joblib.load(path_model)
  53. print(f"model load from【{path_model}】success.")
  54. path_model = os.path.join(path, FileEnum.MODEL_XGB.value)
  55. if os.path.isfile(path_model):
  56. model = xgb.XGBClassifier()
  57. model.load_model(path_model)
  58. self._pipeline_optimized.steps[-1] = ("classifier", model)
  59. print(f"model load from【{path_model}】success.")
  60. def _f_rewrite_pmml(self, path_pmml: str):
  61. with open(path_pmml, mode="r", encoding="utf-8") as f:
  62. pmml = f.read()
  63. pmml = pmml.replace('optype="categorical" dataType="double"', 'optype="categorical" dataType="string"')
  64. with open(path_pmml, mode="w", encoding="utf-8") as f:
  65. f.write(pmml)
  66. f.flush()
  67. def _f_get_best_model(self, df_param: pd.DataFrame, ntree: int = None):
  68. if ntree is None:
  69. df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
  70. print(f"选择最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
  71. self._train(int(df_param_sort.iloc[0][3]))
  72. else:
  73. print(f"选择ntree:【{ntree}】的参数:\n{df_param[df_param['ntree'] == ntree].iloc[0].to_dict()}")
  74. self._train(ntree)
  75. if self._ol_config.save_pmml:
  76. data = self._data.data
  77. path_pmml = self._ol_config.f_get_save_path(FileEnum.PMML_XGB.value)
  78. # pipeline = make_pmml_pipeline(self.model)
  79. sklearn2pmml(self._pipeline_optimized, path_pmml, with_repr=True, )
  80. self._f_rewrite_pmml(path_pmml)
  81. print(f"model save to【{path_pmml}】success. ")
  82. # pmml与原生模型结果一致性校验
  83. model_pmml = Model.fromFile(path_pmml)
  84. prob_pmml = model_pmml.predict(data)["probability(1)"]
  85. prob_pipeline = self._pipeline_optimized.predict_proba(data)[:, 1]
  86. diff = pd.DataFrame()
  87. diff["prob_pmml"] = prob_pmml
  88. diff["prob_pipeline"] = prob_pipeline
  89. diff["diff"] = diff["prob_pmml"] - diff["prob_pipeline"]
  90. diff["diff_format"] = diff["diff"].apply(lambda x: 1 if abs(x) < 0.001 else 0)
  91. print(f"pmml模型结果一致率(误差小于0.001):{len(diff) / diff['diff_format'].sum().round(3) * 100}%")
  92. def _f_get_metric_auc_ks(self, model_type: str):
  93. def _get_auc_ks(data, title):
  94. y = data[self._ol_config.y_column]
  95. y_prob = self.prob(data, model)
  96. perf = sc.perf_eva(y, y_prob, title=f"{title}", show_plot=True)
  97. path = self._ol_config.f_get_save_path(f"perf_{title}.png")
  98. perf["pic"].savefig(path)
  99. auc = perf["AUC"]
  100. ks = perf["KS"]
  101. f_image_crop_white_borders(path, path)
  102. return auc, ks, path
  103. train_data = self._data.train_data
  104. test_data = self._data.test_data
  105. data = self._data.data
  106. model = self._pipeline_optimized
  107. if model_type != "新模型":
  108. model = self._pipeline_original
  109. img_path_auc_ks = []
  110. auc, ks, path = _get_auc_ks(data, f"{model_type}-建模数据")
  111. img_path_auc_ks.append(path)
  112. train_auc, train_ks, path = _get_auc_ks(train_data, f"{model_type}-训练集")
  113. img_path_auc_ks.append(path)
  114. test_auc, test_ks, path = _get_auc_ks(test_data, f"{model_type}-测试集")
  115. img_path_auc_ks.append(path)
  116. df_auc_ks = pd.DataFrame()
  117. df_auc_ks["样本集"] = ["建模数据", "训练集", "测试集"]
  118. df_auc_ks["AUC"] = [auc, train_auc, test_auc]
  119. df_auc_ks["KS"] = [ks, train_ks, test_ks]
  120. return MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks, image_size=5, table_font_size=10)
  121. def _f_get_metric_gain(self, model_type: str):
  122. y_column = self._ol_config.y_column
  123. data = self._data.data
  124. model = self._pipeline_optimized
  125. if model_type != "新模型":
  126. model = self._pipeline_original
  127. score = self.prob(data, model)
  128. score_bin, _ = f_get_model_score_bin(data, score)
  129. gain = f_calcu_model_ks(score_bin, y_column, sort_ascending=False)
  130. img_path_gain = self._ol_config.f_get_save_path(f"{model_type}-gain.png")
  131. f_df_to_image(gain, img_path_gain)
  132. return MetricFucResultEntity(table=gain, image_path=img_path_gain)
  133. def _f_get_stress_test(self, ):
  134. stress_sample_times = self._ol_config.stress_sample_times
  135. stress_bad_rate_list = self._ol_config.stress_bad_rate_list
  136. y_column = self._ol_config.y_column
  137. data = self._data.data
  138. score = self.prob(data, self._pipeline_optimized)
  139. score_bin, _ = f_get_model_score_bin(data, score)
  140. df_stress = f_stress_test(score_bin, sample_times=stress_sample_times, bad_rate_list=stress_bad_rate_list,
  141. target_column=y_column, score_column=ConstantEnum.SCORE.value, sort_ascending=False)
  142. img_path_stress = self._ol_config.f_get_save_path(f"stress.png")
  143. f_df_to_image(df_stress, img_path_stress)
  144. return MetricFucResultEntity(table=df_stress, image_path=img_path_stress)
  145. def prob(self, x: pd.DataFrame, pipeline=None, ntree_limit=None):
  146. if pipeline is None:
  147. pipeline = self._pipeline_optimized
  148. y_prob = pipeline.predict_proba(x, ntree_limit=ntree_limit)[:, 1]
  149. return y_prob
  150. def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None, print_sum=True,
  151. ntree_limit=None) -> pd.DataFrame:
  152. y1 = self.prob(x1, ntree_limit=ntree_limit)
  153. y2 = self.prob(x2, ntree_limit=ntree_limit)
  154. x1_score_bin, score_bins = f_get_model_score_bin(x1, y1, points)
  155. x2_score_bin, _ = f_get_model_score_bin(x2, y2, score_bins)
  156. model_psi = f_calcu_model_psi(x1_score_bin, x2_score_bin, sort_ascending=False)
  157. if print_sum:
  158. print(f"模型psi: {model_psi['psi'].sum()}")
  159. return model_psi
  160. def _train(self, n_estimators: int = None):
  161. y_column = self._ol_config.y_column
  162. train_data = self._data.train_data
  163. params_xgb = self._ol_config.params_xgb
  164. model_original: xgb.XGBClassifier = self._pipeline_original.steps[-1][1]
  165. ntree = model_original.n_estimators if model_original.best_ntree_limit is None else model_original.best_ntree_limit
  166. if params_xgb.get("oltype") == "tree_refresh":
  167. self.model_optimized = xgb.XGBClassifier(
  168. n_estimators=n_estimators if n_estimators else ntree,
  169. reg_alpha=params_xgb.get("alpha"),
  170. reg_lambda=params_xgb.get("lambda"),
  171. importance_type='weight',
  172. updater="refresh",
  173. process_type="update",
  174. refresh_leaf=True,
  175. **params_xgb,
  176. )
  177. else:
  178. self.model_optimized = xgb.XGBClassifier(
  179. n_estimators=n_estimators if n_estimators else params_xgb.get("num_boost_round"),
  180. reg_alpha=params_xgb.get("alpha"),
  181. reg_lambda=params_xgb.get("lambda"),
  182. importance_type='weight',
  183. **params_xgb,
  184. )
  185. self._pipeline_optimized.steps[-1] = ("classifier", self.model_optimized)
  186. with silent_print():
  187. self._pipeline_optimized.fit(train_data, train_data[y_column],
  188. classifier__verbose=False,
  189. classifier__xgb_model=model_original.get_booster(),
  190. )
  191. return ntree
  192. def train(self, ):
  193. y_column = self._ol_config.y_column
  194. params_xgb = self._ol_config.params_xgb
  195. train_data = self._data.train_data
  196. test_data = self._data.test_data
  197. df_param_columns = ["auc_test", "ks_test", "psi", "ntree"]
  198. self._df_param_optimized = pd.DataFrame(columns=df_param_columns)
  199. ntree = self._train()
  200. print(f"原模型一共有【{ntree}】棵树")
  201. # 迭代效果回溯
  202. if params_xgb.get("oltype") == "tree_refresh":
  203. print("更新原模型模式")
  204. iteration_n = ntree
  205. else:
  206. print("原模型基础上新增树模式")
  207. iteration_n = params_xgb.get("num_boost_round")
  208. for n in tqdm(range(iteration_n)):
  209. if params_xgb.get("oltype") == "tree_refresh":
  210. ntree_limit = n + 1
  211. else:
  212. ntree_limit = ntree + n + 1
  213. test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=ntree_limit)[:, 1]
  214. test_y = test_data[y_column]
  215. psi = round(self.psi(train_data, test_data, print_sum=False, ntree_limit=ntree_limit)['psi'].sum(), 3)
  216. # auc_test = roc_auc_score(test_y, test_y_prob)
  217. # auc_test = round(auc_test, 4)
  218. # df = pd.DataFrame({'label': test_y, 'pred': test_y_prob})
  219. # dfkslift = eva_dfkslift(df)
  220. # ks_test = round(dfkslift["ks"].max(), 4)
  221. perf = sc.perf_eva(test_y, test_y_prob, show_plot=False)
  222. auc_test = perf["AUC"]
  223. ks_test = perf["KS"]
  224. row = dict(zip(df_param_columns, [auc_test, ks_test, psi, n + 1]))
  225. self._df_param_optimized.loc[len(self._df_param_optimized)] = row
  226. def save(self):
  227. self._ol_config.config_save()
  228. if self._pipeline_optimized is None:
  229. GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
  230. path_model = self._ol_config.f_get_save_path(FileEnum.PIPELINE_XGB.value)
  231. joblib.dump(self._pipeline_optimized, path_model)
  232. print(f"model save to【{path_model}】success. ")
  233. # 在xgb的增量学习下直接保存pipeline会出错,所以这里需要单独保存xgb model,然后进行复原
  234. path_model = self._ol_config.f_get_save_path(FileEnum.MODEL_XGB.value)
  235. self.model_optimized.save_model(path_model)
  236. print(f"model save to【{path_model}】success. ")
  237. @staticmethod
  238. def load(path: str):
  239. ol_config = OnlineLearningConfigEntity.from_config(path)
  240. ol_config._path_resources = path
  241. return OnlineLearningTrainerXgb(ol_config=ol_config)
  242. def report(self, ntree: int = None):
  243. train_data = self._data.train_data
  244. test_data = self._data.test_data
  245. self._f_get_best_model(self._df_param_optimized, ntree)
  246. if self._ol_config.jupyter_print:
  247. from IPython import display
  248. f_display_title(display, "模型优化过程")
  249. with df_print_nolimit():
  250. display.display(self._df_param_optimized)
  251. metric_value_dict = {}
  252. # 样本分布
  253. metric_value_dict["样本分布"] = MetricFucResultEntity(table=self._data.get_distribution(self._ol_config.y_column),
  254. table_font_size=10, table_cell_width=3)
  255. # 模型结果对比
  256. metric_value_dict[f"模型结果-新模型"] = self._f_get_metric_auc_ks("新模型")
  257. metric_value_dict[f"模型结果-原模型"] = self._f_get_metric_auc_ks("原模型")
  258. # 模型分psi
  259. model_psi = self.psi(train_data, test_data, print_sum=False)
  260. img_path_psi = self._ol_config.f_get_save_path(f"model_psi.png")
  261. f_df_to_image(model_psi, img_path_psi)
  262. metric_value_dict[f"模型稳定性"] = MetricFucResultEntity(table=model_psi,
  263. value=model_psi["psi"].sum().round(3),
  264. image_path=img_path_psi)
  265. # 分数分箱
  266. metric_value_dict["分数分箱-建模数据-新模型"] = self._f_get_metric_gain("新模型")
  267. metric_value_dict["分数分箱-建模数据-原模型"] = self._f_get_metric_gain("原模型")
  268. # 压力测试
  269. if self._ol_config.stress_test:
  270. metric_value_dict["压力测试"] = self._f_get_stress_test()
  271. if self._ol_config.jupyter_print:
  272. self.jupyter_print(metric_value_dict)
  273. # save_path = self._ol_config.f_get_save_path("OnlineLearning报告.docx")
  274. # ReportWord.generate_report(metric_value_dict, self._template_path, save_path=save_path)
  275. # print(f"模型报告文件储存路径:{save_path}")
  276. def jupyter_print(self, metric_value_dict=Dict[str, MetricFucResultEntity]):
  277. from IPython import display
  278. f_display_title(display, "样本分布")
  279. display.display(metric_value_dict["样本分布"].table)
  280. f_display_title(display, "模型结果")
  281. print(f"原模型")
  282. display.display(metric_value_dict["模型结果-原模型"].table)
  283. f_display_images_by_side(display, metric_value_dict["模型结果-原模型"].image_path)
  284. print(f"新模型")
  285. display.display(metric_value_dict["模型结果-新模型"].table)
  286. f_display_images_by_side(display, metric_value_dict["模型结果-新模型"].image_path)
  287. # 模型psi
  288. f_display_title(display, "模型psi")
  289. display.display(metric_value_dict["模型稳定性"].table)
  290. print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
  291. f_display_title(display, "分数分箱")
  292. print(f"建模数据上分数分箱")
  293. print(f"原模型")
  294. display.display(metric_value_dict["分数分箱-建模数据-原模型"].table)
  295. print(f"新模型")
  296. display.display(metric_value_dict["分数分箱-建模数据-新模型"].table)
  297. if "压力测试" in metric_value_dict.keys():
  298. f_display_title(display, "压力测试")
  299. display.display(metric_value_dict["压力测试"].table)
  300. if __name__ == "__main__":
  301. pass