# -*- coding: utf-8 -*-
"""
@author: yq
@time: 2025/2/27
@desc:
"""
import os
from os.path import dirname, realpath
from typing import Dict, List

import joblib
import pandas as pd
import scorecardpy as sc
import xgboost as xgb
from sklearn2pmml import PMMLPipeline
from tqdm import tqdm

from commom import GeneralException, f_image_crop_white_borders, f_df_to_image, f_display_title, \
    f_display_images_by_side
from entitys import DataSplitEntity, OnlineLearningConfigEntity, MetricFucResultEntity
from enums import ResultCodesEnum, ConstantEnum, FileEnum
from init import init
from model import f_get_model_score_bin, f_calcu_model_ks, f_stress_test, f_calcu_model_psi, Xtransformer_fit, \
    Xtransform, fit

init()
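

# OnlineLearningTrainerXgb refits an already-trained XGBoost PMMLPipeline on new data:
# it reloads the serialized pipeline, refreshes the existing trees with XGBoost's
# "refresh" updater (no new tree structures are grown), sweeps candidate tree counts
# on the test set, and reports AUC/KS, score bins, PSI and an optional stress test.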
class OnlineLearningTrainerXgb:

    def __init__(self, data: DataSplitEntity = None, ol_config: OnlineLearningConfigEntity = None, *args, **kwargs):
        # Override PMMLPipeline methods with the project's custom transform/fit logic
        PMMLPipeline.Xtransformer_fit = Xtransformer_fit
        PMMLPipeline.Xtransform = Xtransform
        PMMLPipeline.fit = fit

        if ol_config is not None:
            self._ol_config = ol_config
        else:
            self._ol_config = OnlineLearningConfigEntity(*args, **kwargs)
        self._data = data
        self._df_param_optimized = None
        self._model_optimized_list = []
        self._pipeline_original: PMMLPipeline
        self._pipeline_optimized: PMMLPipeline
        # Report template
        self._template_path = os.path.join(dirname(dirname(realpath(__file__))),
                                           "./template/OnlineLearning报告模板_xgb.docx")
        self._init(self._ol_config.path_resources)

    def _init(self, path: str):
        if not os.path.isdir(path):
            raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"【{path}】不是文件夹")
        path_model = os.path.join(path, FileEnum.MODEL.value)
        if not os.path.isfile(path_model):
            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_model}】不存在")
        # Keep one untouched copy of the pipeline and one copy that will be refreshed.
        self._pipeline_original = joblib.load(path_model)
        self._pipeline_optimized = joblib.load(path_model)
        print(f"model load from【{path_model}】success.")

    def _f_get_best_model(self, df_param: pd.DataFrame, ntree: int = None):
        if ntree is None:
            # Pick the tree count with the best test KS (test AUC as tie-breaker).
            df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
            print(f"选择最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
            self._train(int(df_param_sort.iloc[0]["ntree"]))
        else:
            print(f"选择ntree:【{ntree}】的参数:\n{df_param[df_param['ntree'] == ntree].iloc[0].to_dict()}")
            self._train(ntree)

    def _f_get_metric_auc_ks(self, model_type: str):
        def _get_auc_ks(data, title):
            y = data[self._ol_config.y_column]
            y_prob = self.prob(data, model)
            perf = sc.perf_eva(y, y_prob, title=title, show_plot=True)
            path = self._ol_config.f_get_save_path(f"perf_{title}.png")
            perf["pic"].savefig(path)
            auc = perf["AUC"]
            ks = perf["KS"]
            f_image_crop_white_borders(path, path)
            return auc, ks, path

        train_data = self._data.train_data
        test_data = self._data.test_data
        data = self._data.data

        model = self._pipeline_optimized
        if model_type != "新模型":
            model = self._pipeline_original

        img_path_auc_ks = []
        auc, ks, path = _get_auc_ks(data, f"{model_type}-建模数据")
        img_path_auc_ks.append(path)
        train_auc, train_ks, path = _get_auc_ks(train_data, f"{model_type}-训练集")
        img_path_auc_ks.append(path)
        test_auc, test_ks, path = _get_auc_ks(test_data, f"{model_type}-测试集")
        img_path_auc_ks.append(path)

        df_auc_ks = pd.DataFrame()
        df_auc_ks["样本集"] = ["建模数据", "训练集", "测试集"]
        df_auc_ks["AUC"] = [auc, train_auc, test_auc]
        df_auc_ks["KS"] = [ks, train_ks, test_ks]

        return MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks, image_size=5, table_font_size=10)

    def _f_get_metric_gain(self, model_type: str):
        y_column = self._ol_config.y_column
        data = self._data.data

        model = self._pipeline_optimized
        if model_type != "新模型":
            model = self._pipeline_original

        score = self.prob(data, model)
        score_bin, _ = f_get_model_score_bin(data, score)
        gain = f_calcu_model_ks(score_bin, y_column, sort_ascending=False)
        img_path_gain = self._ol_config.f_get_save_path(f"{model_type}-gain.png")
        f_df_to_image(gain, img_path_gain)
        return MetricFucResultEntity(table=gain, image_path=img_path_gain)

    def _f_get_stress_test(self):
        stress_sample_times = self._ol_config.stress_sample_times
        stress_bad_rate_list = self._ol_config.stress_bad_rate_list
        y_column = self._ol_config.y_column
        data = self._data.data

        score = self.prob(data, self._pipeline_optimized)
        score_bin, _ = f_get_model_score_bin(data, score)
        df_stress = f_stress_test(score_bin, sample_times=stress_sample_times, bad_rate_list=stress_bad_rate_list,
                                  target_column=y_column, score_column=ConstantEnum.SCORE.value, sort_ascending=False)
        img_path_stress = self._ol_config.f_get_save_path("stress.png")
        f_df_to_image(df_stress, img_path_stress)
        return MetricFucResultEntity(table=df_stress, image_path=img_path_stress)

    def prob(self, x: pd.DataFrame, pipeline=None):
        if pipeline is None:
            pipeline = self._pipeline_optimized
        y_prob = pipeline.predict_proba(x)[:, 1]
        return y_prob
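
    # psi(): score both samples with the optimized pipeline, bin x1's scores, apply the
    # same bins to x2, and sum the per-bin population stability index between them.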
    def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None) -> pd.DataFrame:
        y1 = self.prob(x1)
        y2 = self.prob(x2)
        x1_score_bin, score_bins = f_get_model_score_bin(x1, y1, points)
        x2_score_bin, _ = f_get_model_score_bin(x2, y2, score_bins)
        model_psi = f_calcu_model_psi(x1_score_bin, x2_score_bin, sort_ascending=False)
        print(f"模型psi: {model_psi['psi'].sum()}")
        return model_psi
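
    # _train(): swap the pipeline's final step for an XGBClassifier configured with
    # process_type="update" and updater="refresh", so fitting refreshes the leaf values of
    # the original booster's trees on the new training data instead of growing new trees;
    # the original booster is passed via classifier__xgb_model as the warm start.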
    def _train(self, n_estimators: int = None):
        y_column = self._ol_config.y_column
        train_data = self._data.train_data

        model_original: xgb.XGBClassifier = self._pipeline_original.steps[-1][1]
        ntree = model_original.n_estimators if model_original.best_ntree_limit is None else model_original.best_ntree_limit
        model_optimized = xgb.XGBClassifier(
            n_estimators=n_estimators if n_estimators else ntree,
            updater="refresh",
            process_type="update",
            refresh_leaf=True,
            learning_rate=self._ol_config.lr,
            random_state=self._ol_config.random_state,
        )
        self._pipeline_optimized.steps[-1] = ("classifier", model_optimized)
        self._pipeline_optimized.fit(train_data, train_data[y_column],
                                     classifier__verbose=False,
                                     classifier__xgb_model=model_original.get_booster(),
                                     )
        return ntree
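
    # train(): refresh all trees on the new data first (self._train()), then for each
    # tree-count prefix 1..ntree score the test set and record AUC/KS in
    # self._df_param_optimized so that _f_get_best_model() can later pick the best ntree.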
    def train(self):
        y_column = self._ol_config.y_column
        test_data = self._data.test_data

        df_param_columns = ["auc_test", "ks_test", "ntree"]
        self._df_param_optimized = pd.DataFrame(columns=df_param_columns)
        ntree = self._train()
        print(f"原模型一共有【{ntree}】棵树")
        for n in tqdm(range(1, ntree + 1)):
            test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=n)[:, 1]
            test_y = test_data[y_column]
            perf = sc.perf_eva(test_y, test_y_prob, show_plot=False)
            auc_test = perf["AUC"]
            ks_test = perf["KS"]
            row = dict(zip(df_param_columns, [auc_test, ks_test, n]))
            self._df_param_optimized.loc[len(self._df_param_optimized)] = row

    def save(self):
        self._ol_config.config_save()
        if self._pipeline_optimized is None:
            raise GeneralException(ResultCodesEnum.NOT_FOUND, message="模型不存在")
        path_model = self._ol_config.f_get_save_path(FileEnum.MODEL.value)
        joblib.dump(self._pipeline_optimized, path_model)
        print(f"model save to【{path_model}】success. ")

    @staticmethod
    def load(path: str):
        ol_config = OnlineLearningConfigEntity.from_config(path)
        ol_config._path_resources = path
        return OnlineLearningTrainerXgb(ol_config=ol_config)
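
    # report(): refit with the selected (or best) tree count, then assemble the metric
    # tables (sample distribution, new vs. original AUC/KS, score bins, optional stress
    # test) keyed for jupyter_print() and the Word report template.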
    def report(self, ntree: int = None):
        self._f_get_best_model(self._df_param_optimized, ntree)

        if self._ol_config.jupyter_print:
            from IPython import display
            f_display_title(display, "模型优化过程")
            display.display(self._df_param_optimized)

        metric_value_dict = {}
        # Sample distribution
        metric_value_dict["样本分布"] = MetricFucResultEntity(table=self._data.get_distribution(self._ol_config.y_column),
                                                          table_font_size=10, table_cell_width=3)
        # Model comparison: new vs. original
        metric_value_dict["模型结果-新模型"] = self._f_get_metric_auc_ks("新模型")
        metric_value_dict["模型结果-原模型"] = self._f_get_metric_auc_ks("原模型")
        # Score bins
        metric_value_dict["分数分箱-建模数据-新模型"] = self._f_get_metric_gain("新模型")
        metric_value_dict["分数分箱-建模数据-原模型"] = self._f_get_metric_gain("原模型")
        # Stress test
        if self._ol_config.stress_test:
            metric_value_dict["压力测试"] = self._f_get_stress_test()

        if self._ol_config.jupyter_print:
            self.jupyter_print(metric_value_dict)

        # save_path = self._ol_config.f_get_save_path("OnlineLearning报告.docx")
        # ReportWord.generate_report(metric_value_dict, self._template_path, save_path=save_path)
        # print(f"模型报告文件储存路径:{save_path}")

    def jupyter_print(self, metric_value_dict: Dict[str, MetricFucResultEntity]):
        from IPython import display

        f_display_title(display, "样本分布")
        display.display(metric_value_dict["样本分布"].table)

        f_display_title(display, "模型结果")
        print("原模型")
        display.display(metric_value_dict["模型结果-原模型"].table)
        f_display_images_by_side(display, metric_value_dict["模型结果-原模型"].image_path)
        print("新模型")
        display.display(metric_value_dict["模型结果-新模型"].table)
        f_display_images_by_side(display, metric_value_dict["模型结果-新模型"].image_path)

        f_display_title(display, "分数分箱")
        print("建模数据上分数分箱")
        print("原模型")
        display.display(metric_value_dict["分数分箱-建模数据-原模型"].table)
        print("新模型")
        display.display(metric_value_dict["分数分箱-建模数据-新模型"].table)

        if "压力测试" in metric_value_dict:
            f_display_title(display, "压力测试")
            display.display(metric_value_dict["压力测试"].table)


if __name__ == "__main__":
    pass
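    # Minimal usage sketch (hypothetical: the DataSplitEntity / OnlineLearningConfigEntity
    # construction details below are assumptions, not taken from this file):
    #
    # data = DataSplitEntity(...)                    # pre-split modelling/train/test data
    # ol_config = OnlineLearningConfigEntity(...)    # must point at the resources dir holding the serialized MODEL file
    # trainer = OnlineLearningTrainerXgb(data=data, ol_config=ol_config)
    # trainer.train()    # refresh trees, then sweep 1..ntree and record test AUC/KS
    # trainer.report()   # pick the best ntree, refit, and display the metric tables
    # trainer.save()     # persist the optimized pipeline next to the saved config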