model_xgb.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author: yq
  4. @time: 2024/11/1
  5. @desc:
  6. """
  7. import json
  8. import os.path
  9. from os.path import dirname, realpath
  10. from typing import Dict
  11. import joblib
  12. import numpy as np
  13. import pandas as pd
  14. import scorecardpy as sc
  15. import xgboost as xgb
  16. from pypmml import Model
  17. from sklearn.preprocessing import OneHotEncoder
  18. from sklearn2pmml import sklearn2pmml, PMMLPipeline
  19. from sklearn2pmml.preprocessing import CutTransformer
  20. from sklearn_pandas import DataFrameMapper
  21. from commom import f_df_to_image, f_display_images_by_side, GeneralException, f_display_title, \
  22. f_image_crop_white_borders
  23. from entitys import MetricFucResultEntity, DataSplitEntity, DataFeatureEntity
  24. from enums import ResultCodesEnum, ConstantEnum, FileEnum, ContextEnum
  25. from init import context
  26. from .model_base import ModelBase
  27. from .model_utils import f_stress_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
  28. from .pipeline_xgb_util import fit, Xtransform, Xtransformer_fit
class ModelXgb(ModelBase):
    """XGBoost scorecard model wrapped in a sklearn2pmml PMMLPipeline.

    Trains an XGBClassifier behind a DataFrameMapper preprocessing step,
    can export the fitted pipeline to PMML, and produces the artifacts for
    the model-development report.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Override PMMLPipeline methods with the project's hooks so the
        # pipeline can fit the mapper on the full dataset and transform raw
        # frames before handing them to the classifier (see pipeline_xgb_util).
        PMMLPipeline.Xtransformer_fit = Xtransformer_fit
        PMMLPipeline.Xtransform = Xtransform
        PMMLPipeline.fit = fit
        # Report template (docx) shipped alongside the package.
        self._template_path = os.path.join(dirname(dirname(realpath(__file__))), "./template/模型开发报告模板_xgb.docx")
        self.pipeline: PMMLPipeline
        # NOTE(review): this binds the XGBClassifier *class*, not an instance;
        # train() replaces it with a fitted instance.
        self.model = xgb.XGBClassifier
        # Scored copy of the test set, captured by train_report() and
        # persisted by model_save().
        self._test_case = None
  41. def _f_rewrite_pmml(self, path_pmml: str):
  42. with open(path_pmml, mode="r", encoding="utf-8") as f:
  43. pmml = f.read()
  44. pmml = pmml.replace('optype="categorical" dataType="double"', 'optype="categorical" dataType="string"')
  45. with open(path_pmml, mode="w", encoding="utf-8") as f:
  46. f.write(pmml)
  47. f.flush()
  48. def get_report_template_path(self):
  49. return self._template_path
  50. def train(self, train_data: DataFeatureEntity, test_data: DataFeatureEntity, *args, **kwargs):
  51. print(f"{'-' * 50}开始训练{'-' * 50}")
  52. params_xgb = self.ml_config.params_xgb
  53. y_column = self._ml_config.y_column
  54. # 选定的变量
  55. x_columns_selected = context.get(ContextEnum.XGB_COLUMNS_SELECTED)
  56. # 包含了未选定的变量
  57. num_columns = context.get(ContextEnum.XGB_COLUMNS_NUM)
  58. points_dict: dict = context.get(ContextEnum.XGB_POINTS)
  59. data: DataSplitEntity = kwargs["data"]
  60. train_data_raw = data.train_data
  61. test_data_raw = data.test_data
  62. # xgb原生接口训练
  63. # dtrain = xgb.DMatrix(data=train_data.data_x, label=train_data.data_y)
  64. # dtest = xgb.DMatrix(data=test_data.data_x, label=test_data.data_y)
  65. # self.model = xgb.train(
  66. # params_xgb,
  67. # dtrain=dtrain,
  68. # evals=[(dtrain, 'train'), (dtest, 'test')],
  69. # num_boost_round=params_xgb.get("num_boost_round"),
  70. # early_stopping_rounds=params_xgb.get("early_stopping_rounds"),
  71. # verbose_eval=params_xgb.get("verbose_eval")
  72. # )
  73. # xgb二次封装为sklearn接口
  74. self.model = xgb.XGBClassifier(objective=params_xgb.get("objective"),
  75. n_estimators=params_xgb.get("num_boost_round"),
  76. max_depth=params_xgb.get("max_depth"),
  77. learning_rate=params_xgb.get("learning_rate"),
  78. random_state=params_xgb.get("random_state"),
  79. reg_alpha=params_xgb.get("alpha"),
  80. subsample=params_xgb.get("subsample"),
  81. colsample_bytree=params_xgb.get("colsample_bytree"),
  82. importance_type='weight'
  83. )
  84. # self.model.fit(X=train_data.data_x, y=train_data.data_y,
  85. # eval_set=[(train_data.data_x, train_data.data_y), (test_data.data_x, test_data.data_y)],
  86. # eval_metric=params_xgb.get("eval_metric"),
  87. # early_stopping_rounds=params_xgb.get("early_stopping_rounds"),
  88. # verbose=params_xgb.get("verbose_eval"),
  89. # )
  90. str_columns_selected = [i for i in x_columns_selected if i not in num_columns]
  91. mapper = [(str_columns_selected, OneHotEncoder())]
  92. for column in x_columns_selected:
  93. if column in str_columns_selected:
  94. continue
  95. # 粗分箱
  96. if column in points_dict.keys():
  97. points = [-np.inf] + points_dict[column] + [np.inf]
  98. labels = [ConstantEnum.XGB_BIN_LOWEST.value] + points_dict[column]
  99. mapper.append((column, CutTransformer(points, right=False, labels=labels)))
  100. else:
  101. mapper.append((column, None))
  102. mapper = DataFrameMapper(mapper)
  103. self.pipeline = PMMLPipeline([("mapper", mapper), ("classifier", self.model)])
  104. self.pipeline.Xtransformer_fit(data.data, data.data[y_column])
  105. self.pipeline.fit(train_data_raw, train_data_raw[y_column],
  106. classifier__eval_set=[
  107. (self.pipeline.Xtransform(train_data_raw), train_data_raw[y_column]),
  108. (self.pipeline.Xtransform(test_data_raw), test_data_raw[y_column])
  109. ],
  110. classifier__eval_metric=params_xgb.get("eval_metric"),
  111. classifier__early_stopping_rounds=params_xgb.get("early_stopping_rounds"),
  112. classifier__verbose=params_xgb.get("verbose_eval"),
  113. )
  114. if params_xgb.get("save_pmml"):
  115. path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
  116. # pipeline = make_pmml_pipeline(self.model)
  117. sklearn2pmml(self.pipeline, path_pmml, with_repr=True, )
  118. self._f_rewrite_pmml(path_pmml)
  119. print(f"model save to【{path_pmml}】success. ")
  120. # pmml与原生模型结果一致性校验
  121. model_pmml = Model.fromFile(path_pmml)
  122. prob_pmml = model_pmml.predict(data.data)["probability(1)"]
  123. prob_pipeline = self.pipeline.predict_proba(data.data)[:, 1]
  124. diff = pd.DataFrame()
  125. diff["prob_pmml"] = prob_pmml
  126. diff["prob_pipeline"] = prob_pipeline
  127. diff["diff"] = diff["prob_pmml"] - diff["prob_pipeline"]
  128. diff["diff_format"] = diff["diff"].apply(lambda x: 1 if abs(x) < 0.001 else 0)
  129. print(f"pmml模型结果一致率(误差小于0.001):{len(diff) / diff['diff_format'].sum().round(3) * 100}%")
  130. if params_xgb.get("trees_print"):
  131. trees = self.model.get_booster().get_dump()
  132. for i, tree in enumerate(trees):
  133. if i < self.model.best_ntree_limit:
  134. print(f"Tree {i}:")
  135. print(tree)
  136. def prob(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
  137. # prob = self.model.predict_proba(x)[:, 1]
  138. prob = self.pipeline.predict_proba(x)[:, 1]
  139. return prob
    def score(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
        """Scorecard-style score. Not implemented for the xgb model; use prob()."""
        pass
    def score_rule(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
        """Rule-based score. Not implemented for the xgb model."""
        pass
  144. def model_save(self):
  145. if self.pipeline is None:
  146. GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
  147. path_model = self.ml_config.f_get_save_path(FileEnum.MODEL.value)
  148. # self.model.save_model(path_model)
  149. joblib.dump(self.pipeline, path_model)
  150. print(f"model save to【{path_model}】success. ")
  151. path = self.ml_config.f_get_save_path(FileEnum.TEST_CASE.value)
  152. self._test_case.to_csv(path, encoding="utf-8")
  153. print(f"test case save to【{path}】success. ")
  154. def model_load(self, path: str, *args, **kwargs):
  155. if not os.path.isdir(path):
  156. raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")
  157. path_model = os.path.join(path, FileEnum.MODEL.value)
  158. if not os.path.isfile(path_model):
  159. raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_model}】不存在")
  160. # self.model = xgb.XGBClassifier()
  161. # self.model.load_model(path_model)
  162. self.pipeline = joblib.load(path_model)
  163. print(f"model load from【{path_model}】success.")
    def train_report(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
        """Build the model-development report metrics.

        Produces AUC/KS charts, score-bin gain tables, a model-score PSI
        table, and (optionally) a stress-test table; every table/figure is
        saved to disk and collected into the returned metric dict. Also
        captures a scored copy of the test set into self._test_case for
        model_save().

        Args:
            data: raw train/test split consumed by the fitted pipeline.

        Returns:
            Mapping of metric name -> MetricFucResultEntity.
        """

        def _get_auc_ks(data_y, score, title):
            # Plot the perf chart via scorecardpy, save and crop the image,
            # and return (auc, ks, image_path).
            perf = sc.perf_eva(data_y, score, title=title, show_plot=True)
            path = self.ml_config.f_get_save_path(f"perf_{title}.png")
            perf["pic"].savefig(path)
            auc = perf["AUC"]
            ks = perf["KS"]
            f_image_crop_white_borders(path, path)
            return auc, ks, path

        def _get_perf():
            # Model KS / AUC on train and test.
            img_path_auc_ks = []
            train_score = self.prob(train_data)
            test_score = self.prob(test_data)
            train_auc, train_ks, path = _get_auc_ks(train_data[y_column], train_score, f"train")
            img_path_auc_ks.append(path)
            test_auc, test_ks, path = _get_auc_ks(test_data[y_column], test_score, f"test")
            img_path_auc_ks.append(path)
            df_auc_ks = pd.DataFrame()
            df_auc_ks["样本集"] = ["训练集", "测试集"]
            df_auc_ks["AUC"] = [train_auc, test_auc]
            df_auc_ks["KS"] = [train_ks, test_ks]
            metric_value_dict[f"模型结果"] = MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks,
                                                               image_size=5, table_font_size=10)
            # Score binning (scorecard bins) and gain table on the train set.
            train_score_bin, score_bins = f_get_model_score_bin(train_data, train_score)
            train_data_gain = f_calcu_model_ks(train_score_bin, y_column, sort_ascending=False)
            img_path_train_gain = self.ml_config.f_get_save_path(f"train_gain.png")
            f_df_to_image(train_data_gain, img_path_train_gain)
            metric_value_dict[f"训练集分数分箱"] = MetricFucResultEntity(table=train_data_gain,
                                                                  image_path=img_path_train_gain)
            # Reuse the training-set bin edges on the test set for comparability.
            test_score_bin, _ = f_get_model_score_bin(test_data, test_score, score_bins)
            test_data_gain = f_calcu_model_ks(test_score_bin, y_column, sort_ascending=False)
            img_path_test_gain = self.ml_config.f_get_save_path(f"test_gain.png")
            f_df_to_image(test_data_gain, img_path_test_gain)
            metric_value_dict[f"测试集分数分箱"] = MetricFucResultEntity(table=test_data_gain,
                                                                  image_path=img_path_test_gain)
            # Model-score PSI between train and test (stability).
            model_psi = f_calcu_model_psi(train_score_bin, test_score_bin, sort_ascending=False)
            img_path_psi = self.ml_config.f_get_save_path(f"model_psi.png")
            f_df_to_image(model_psi, img_path_psi)
            metric_value_dict[f"模型稳定性"] = MetricFucResultEntity(table=model_psi,
                                                               value=model_psi["psi"].sum().round(3),
                                                               image_path=img_path_psi)
            return train_score_bin, test_score_bin

        y_column = self._ml_config.y_column
        stress_test = self.ml_config.stress_test
        stress_sample_times = self.ml_config.stress_sample_times
        stress_bad_rate_list = self.ml_config.stress_bad_rate_list
        train_data = data.train_data
        test_data = data.test_data
        metric_value_dict = {}
        # Hyper-parameters actually used by the fitted model.
        metric_value_dict["模型超参数"] = MetricFucResultEntity(
            value=json.dumps(self.model.get_xgb_params(), ensure_ascii=False, indent=2))
        _, test_score_bin = _get_perf()
        # Stress test (optional): resample the test set at higher bad rates.
        if stress_test:
            df_stress = f_stress_test(test_score_bin, sample_times=stress_sample_times,
                                      bad_rate_list=stress_bad_rate_list,
                                      target_column=y_column, score_column=ConstantEnum.SCORE.value,
                                      sort_ascending=False)
            img_path_stress = self.ml_config.f_get_save_path(f"stress_test.png")
            f_df_to_image(df_stress, img_path_stress)
            metric_value_dict["压力测试"] = MetricFucResultEntity(table=df_stress, image_path=img_path_stress)
        if self.ml_config.jupyter_print:
            self.jupyter_print(metric_value_dict)
        # Capture the scored test set as a regression test case for model_save().
        self._test_case = data.test_data.copy()
        test_score = self.prob(test_data)
        self._test_case["score"] = test_score
        return metric_value_dict
  235. def jupyter_print(self, metric_value_dict=Dict[str, MetricFucResultEntity], *args, **kwargs):
  236. from IPython import display
  237. f_display_title(display, "模型结果")
  238. display.display(metric_value_dict["模型结果"].table)
  239. f_display_images_by_side(display, metric_value_dict["模型结果"].image_path)
  240. f_display_title(display, "模型超参数")
  241. print(metric_value_dict["模型超参数"].value)
  242. # 模型psi
  243. f_display_title(display, "模型psi")
  244. display.display(metric_value_dict["模型稳定性"].table)
  245. print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
  246. f_display_title(display, "分数分箱")
  247. print("训练集-分数分箱")
  248. display.display(metric_value_dict["训练集分数分箱"].table)
  249. print("测试集-分数分箱")
  250. display.display(metric_value_dict["测试集分数分箱"].table)
  251. if "压力测试" in metric_value_dict.keys():
  252. f_display_title(display, "压力测试")
  253. display.display(metric_value_dict["压力测试"].table)
if __name__ == "__main__":
    # Module is intended to be imported; no standalone entry point.
    pass