# trainer_xgb.py
# -*- coding: utf-8 -*-
"""
@author: yq
@time: 2025/2/27
@desc: online-learning (incremental refresh) trainer for an existing XGBoost pipeline
"""
import os
from os.path import dirname, realpath
from typing import Dict, List

import joblib
import pandas as pd
import scorecardpy as sc
import xgboost as xgb
from pypmml import Model
from sklearn2pmml import PMMLPipeline, sklearn2pmml
from tqdm import tqdm

from commom import GeneralException, f_image_crop_white_borders, f_df_to_image, f_display_title, \
    f_display_images_by_side, silent_print, df_print_nolimit
from entitys import DataSplitEntity, OnlineLearningConfigEntity, MetricFucResultEntity
from enums import ResultCodesEnum, ConstantEnum, FileEnum
from init import init
from model import f_get_model_score_bin, f_calcu_model_ks, f_stress_test, f_calcu_model_psi, Xtransformer_fit, \
    Xtransform, fit

init()

class OnlineLearningTrainerXgb:

    def __init__(self, data: DataSplitEntity = None, ol_config: OnlineLearningConfigEntity = None, *args, **kwargs):
        # Override PMMLPipeline methods
        PMMLPipeline.Xtransformer_fit = Xtransformer_fit
        PMMLPipeline.Xtransform = Xtransform
        PMMLPipeline.fit = fit
        if ol_config is not None:
            self._ol_config = ol_config
        else:
            self._ol_config = OnlineLearningConfigEntity(*args, **kwargs)
        self._data = data
        self._df_param_optimized = None
        self._model_optimized_list = []
        self._pipeline_original: PMMLPipeline
        self._pipeline_optimized: PMMLPipeline
        # Report template
        self._template_path = os.path.join(dirname(dirname(realpath(__file__))),
                                           "./template/OnlineLearning报告模板_xgb.docx")
        self._init(self._ol_config.path_resources)

    def _init(self, path: str):
        if not os.path.isdir(path):
            raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"【{path}】不是文件夹")
        path_model = os.path.join(path, FileEnum.MODEL.value)
        if not os.path.isfile(path_model):
            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_model}】不存在")
        self._pipeline_original = joblib.load(path_model)
        self._pipeline_optimized = joblib.load(path_model)
        print(f"model load from【{path_model}】success.")

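    # Note: sklearn2pmml can emit categorical DataFields with dataType="double";
    # the rewrite below switches them to dataType="string". The motivation is
    # inferred from the replacement itself (keeping pypmml's handling of
    # category levels in line with the pipeline), not documented in the source.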
    def _f_rewrite_pmml(self, path_pmml: str):
        with open(path_pmml, mode="r", encoding="utf-8") as f:
            pmml = f.read()
        pmml = pmml.replace('optype="categorical" dataType="double"', 'optype="categorical" dataType="string"')
        with open(path_pmml, mode="w", encoding="utf-8") as f:
            f.write(pmml)
            f.flush()

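    # Select which refreshed model to keep: either the ntree passed in, or the
    # row of df_param with the best test KS (then AUC). The model is retrained
    # at that tree count and, if configured, exported to PMML and cross-checked
    # against the sklearn pipeline.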
    def _f_get_best_model(self, df_param: pd.DataFrame, ntree: int = None):
        if ntree is None:
            df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
            print(f"选择最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
            self._train(int(df_param_sort.iloc[0]["ntree"]))
        else:
            print(f"选择ntree:【{ntree}】的参数:\n{df_param[df_param['ntree'] == ntree].iloc[0].to_dict()}")
            self._train(ntree)
        if self._ol_config.save_pmml:
            data = self._data.data
            path_pmml = self._ol_config.f_get_save_path(FileEnum.PMML.value)
            # pipeline = make_pmml_pipeline(self.model)
            sklearn2pmml(self._pipeline_optimized, path_pmml, with_repr=True)
            self._f_rewrite_pmml(path_pmml)
            print(f"model save to【{path_pmml}】success. ")
            # Consistency check between the PMML model and the native pipeline
            model_pmml = Model.fromFile(path_pmml)
            prob_pmml = model_pmml.predict(data)["probability(1)"]
            prob_pipeline = self._pipeline_optimized.predict_proba(data)[:, 1]
            diff = pd.DataFrame()
            diff["prob_pmml"] = prob_pmml
            diff["prob_pipeline"] = prob_pipeline
            diff["diff"] = diff["prob_pmml"] - diff["prob_pipeline"]
            diff["diff_format"] = diff["diff"].apply(lambda x: 1 if abs(x) < 0.001 else 0)
            print(f"pmml模型结果一致率(误差小于0.001):{round(diff['diff_format'].sum() / len(diff) * 100, 3)}%")

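    # AUC/KS evaluation (via scorecardpy.perf_eva) on the full modeling data,
    # the train set and the test set, for either the refreshed pipeline
    # ("新模型") or the original one; the cropped performance plots are saved
    # next to the summary table.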
    def _f_get_metric_auc_ks(self, model_type: str):
        def _get_auc_ks(data, title):
            y = data[self._ol_config.y_column]
            y_prob = self.prob(data, model)
            perf = sc.perf_eva(y, y_prob, title=f"{title}", show_plot=True)
            path = self._ol_config.f_get_save_path(f"perf_{title}.png")
            perf["pic"].savefig(path)
            auc = perf["AUC"]
            ks = perf["KS"]
            f_image_crop_white_borders(path, path)
            return auc, ks, path

        train_data = self._data.train_data
        test_data = self._data.test_data
        data = self._data.data
        model = self._pipeline_optimized
        if model_type != "新模型":
            model = self._pipeline_original
        img_path_auc_ks = []
        auc, ks, path = _get_auc_ks(data, f"{model_type}-建模数据")
        img_path_auc_ks.append(path)
        train_auc, train_ks, path = _get_auc_ks(train_data, f"{model_type}-训练集")
        img_path_auc_ks.append(path)
        test_auc, test_ks, path = _get_auc_ks(test_data, f"{model_type}-测试集")
        img_path_auc_ks.append(path)
        df_auc_ks = pd.DataFrame()
        df_auc_ks["样本集"] = ["建模数据", "训练集", "测试集"]
        df_auc_ks["AUC"] = [auc, train_auc, test_auc]
        df_auc_ks["KS"] = [ks, train_ks, test_ks]
        return MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks, image_size=5, table_font_size=10)

    def _f_get_metric_gain(self, model_type: str):
        y_column = self._ol_config.y_column
        data = self._data.data
        model = self._pipeline_optimized
        if model_type != "新模型":
            model = self._pipeline_original
        score = self.prob(data, model)
        score_bin, _ = f_get_model_score_bin(data, score)
        gain = f_calcu_model_ks(score_bin, y_column, sort_ascending=False)
        img_path_gain = self._ol_config.f_get_save_path(f"{model_type}-gain.png")
        f_df_to_image(gain, img_path_gain)
        return MetricFucResultEntity(table=gain, image_path=img_path_gain)

    def _f_get_stress_test(self):
        stress_sample_times = self._ol_config.stress_sample_times
        stress_bad_rate_list = self._ol_config.stress_bad_rate_list
        y_column = self._ol_config.y_column
        data = self._data.data
        score = self.prob(data, self._pipeline_optimized)
        score_bin, _ = f_get_model_score_bin(data, score)
        df_stress = f_stress_test(score_bin, sample_times=stress_sample_times, bad_rate_list=stress_bad_rate_list,
                                  target_column=y_column, score_column=ConstantEnum.SCORE.value, sort_ascending=False)
        img_path_stress = self._ol_config.f_get_save_path(f"stress.png")
        f_df_to_image(df_stress, img_path_stress)
        return MetricFucResultEntity(table=df_stress, image_path=img_path_stress)

    def prob(self, x: pd.DataFrame, pipeline=None, ntree_limit=None):
        if pipeline is None:
            pipeline = self._pipeline_optimized
        y_prob = pipeline.predict_proba(x, ntree_limit=ntree_limit)[:, 1]
        return y_prob

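    # Model-score PSI: score both samples, bin x1's scores (or use the supplied
    # points), apply the same bin edges to x2, and compute the population
    # stability index per score bin.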
    def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None, print_sum=True,
            ntree_limit=None) -> pd.DataFrame:
        y1 = self.prob(x1, ntree_limit=ntree_limit)
        y2 = self.prob(x2, ntree_limit=ntree_limit)
        x1_score_bin, score_bins = f_get_model_score_bin(x1, y1, points)
        x2_score_bin, _ = f_get_model_score_bin(x2, y2, score_bins)
        model_psi = f_calcu_model_psi(x1_score_bin, x2_score_bin, sort_ascending=False)
        if print_sum:
            print(f"模型psi: {model_psi['psi'].sum()}")
        return model_psi

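    # Incremental refresh: the new XGBClassifier is configured with
    # process_type="update" and updater="refresh", so fitting it with
    # classifier__xgb_model=<original booster> keeps the original tree
    # structures and only re-estimates node statistics / leaf values
    # (refresh_leaf=True) on the new training data.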
    def _train(self, n_estimators: int = None):
        y_column = self._ol_config.y_column
        train_data = self._data.train_data
        model_original: xgb.XGBClassifier = self._pipeline_original.steps[-1][1]
        ntree = model_original.n_estimators if model_original.best_ntree_limit is None else model_original.best_ntree_limit
        model_optimized = xgb.XGBClassifier(
            n_estimators=n_estimators if n_estimators else ntree,
            updater="refresh",
            process_type="update",
            refresh_leaf=True,
            learning_rate=self._ol_config.lr,
            random_state=self._ol_config.random_state,
        )
        self._pipeline_optimized.steps[-1] = ("classifier", model_optimized)
        with silent_print():
            self._pipeline_optimized.fit(train_data, train_data[y_column],
                                         classifier__verbose=False,
                                         classifier__xgb_model=model_original.get_booster(),
                                         )
        return ntree

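    # Refresh every tree of the original booster on the new training data, then
    # sweep ntree_limit from 1 to the total tree count, recording test AUC/KS
    # and train-vs-test score PSI for each tree prefix in
    # self._df_param_optimized; report()/_f_get_best_model later picks the best
    # prefix from this table.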
    def train(self):
        y_column = self._ol_config.y_column
        train_data = self._data.train_data
        test_data = self._data.test_data
        df_param_columns = ["auc_test", "ks_test", "psi", "ntree"]
        self._df_param_optimized = pd.DataFrame(columns=df_param_columns)
        ntree = self._train()
        print(f"原模型一共有【{ntree}】棵树")
        for n in tqdm(range(ntree)):
            n = n + 1
            test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=n)[:, 1]
            test_y = test_data[y_column]
            psi = round(self.psi(train_data, test_data, print_sum=False, ntree_limit=n)['psi'].sum(), 3)
            # auc_test = roc_auc_score(test_y, test_y_prob)
            # auc_test = round(auc_test, 4)
            # df = pd.DataFrame({'label': test_y, 'pred': test_y_prob})
            # dfkslift = eva_dfkslift(df)
            # ks_test = round(dfkslift["ks"].max(), 4)
            perf = sc.perf_eva(test_y, test_y_prob, show_plot=False)
            auc_test = perf["AUC"]
            ks_test = perf["KS"]
            row = dict(zip(df_param_columns, [auc_test, ks_test, psi, n]))
            self._df_param_optimized.loc[len(self._df_param_optimized)] = row

    def save(self):
        self._ol_config.config_save()
        if self._pipeline_optimized is None:
            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
        path_model = self._ol_config.f_get_save_path(FileEnum.MODEL.value)
        joblib.dump(self._pipeline_optimized, path_model)
        print(f"model save to【{path_model}】success. ")

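    # Counterpart of save(): rebuild a trainer from a saved config directory;
    # _init() then reloads the persisted pipeline from that same path.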
    @staticmethod
    def load(path: str):
        ol_config = OnlineLearningConfigEntity.from_config(path)
        ol_config._path_resources = path
        return OnlineLearningTrainerXgb(ol_config=ol_config)

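    # Assemble the report: retrain at the chosen/best ntree, then collect the
    # metric tables and images (sample distribution, new vs. original model
    # AUC/KS, model-score PSI, gain tables, optional stress test) rendered by
    # jupyter_print(); Word report generation is currently commented out.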
    def report(self, ntree: int = None):
        train_data = self._data.train_data
        test_data = self._data.test_data
        self._f_get_best_model(self._df_param_optimized, ntree)
        if self._ol_config.jupyter_print:
            from IPython import display
            f_display_title(display, "模型优化过程")
            with df_print_nolimit():
                display.display(self._df_param_optimized)
        metric_value_dict = {}
        # Sample distribution
        metric_value_dict["样本分布"] = MetricFucResultEntity(table=self._data.get_distribution(self._ol_config.y_column),
                                                          table_font_size=10, table_cell_width=3)
        # Model result comparison
        metric_value_dict[f"模型结果-新模型"] = self._f_get_metric_auc_ks("新模型")
        metric_value_dict[f"模型结果-原模型"] = self._f_get_metric_auc_ks("原模型")
        # Model score PSI
        model_psi = self.psi(train_data, test_data, print_sum=False)
        img_path_psi = self._ol_config.f_get_save_path(f"model_psi.png")
        f_df_to_image(model_psi, img_path_psi)
        metric_value_dict[f"模型稳定性"] = MetricFucResultEntity(table=model_psi,
                                                           value=model_psi["psi"].sum().round(3),
                                                           image_path=img_path_psi)
        # Score binning
        metric_value_dict["分数分箱-建模数据-新模型"] = self._f_get_metric_gain("新模型")
        metric_value_dict["分数分箱-建模数据-原模型"] = self._f_get_metric_gain("原模型")
        # Stress test
        if self._ol_config.stress_test:
            metric_value_dict["压力测试"] = self._f_get_stress_test()
        if self._ol_config.jupyter_print:
            self.jupyter_print(metric_value_dict)
        # save_path = self._ol_config.f_get_save_path("OnlineLearning报告.docx")
        # ReportWord.generate_report(metric_value_dict, self._template_path, save_path=save_path)
        # print(f"模型报告文件储存路径:{save_path}")

    def jupyter_print(self, metric_value_dict: Dict[str, MetricFucResultEntity]):
        from IPython import display
        f_display_title(display, "样本分布")
        display.display(metric_value_dict["样本分布"].table)
        f_display_title(display, "模型结果")
        print(f"原模型")
        display.display(metric_value_dict["模型结果-原模型"].table)
        f_display_images_by_side(display, metric_value_dict["模型结果-原模型"].image_path)
        print(f"新模型")
        display.display(metric_value_dict["模型结果-新模型"].table)
        f_display_images_by_side(display, metric_value_dict["模型结果-新模型"].image_path)
        # Model PSI
        f_display_title(display, "模型psi")
        display.display(metric_value_dict["模型稳定性"].table)
        print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
        f_display_title(display, "分数分箱")
        print(f"建模数据上分数分箱")
        print(f"原模型")
        display.display(metric_value_dict["分数分箱-建模数据-原模型"].table)
        print(f"新模型")
        display.display(metric_value_dict["分数分箱-建模数据-新模型"].table)
        if "压力测试" in metric_value_dict.keys():
            f_display_title(display, "压力测试")
            display.display(metric_value_dict["压力测试"].table)

if __name__ == "__main__":
    pass
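
# ---------------------------------------------------------------------------
# Minimal usage sketch (kept as comments so importing this module stays
# side-effect free). The exact constructor arguments of DataSplitEntity and
# OnlineLearningConfigEntity shown here are assumptions for illustration and
# may differ from the real entity definitions:
#
#   data = DataSplitEntity(train_data=df_train, test_data=df_test)
#   trainer = OnlineLearningTrainerXgb(data=data,
#                                      path_resources="./model_v1",   # folder with the original model file
#                                      y_column="target",
#                                      lr=0.05)
#   trainer.train()    # refresh leaf values and sweep tree counts
#   trainer.report()   # pick the best ntree and print the evaluation tables
#   trainer.save()     # persist the refreshed pipeline and config
# ---------------------------------------------------------------------------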