|
@@ -0,0 +1,218 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+"""
|
|
|
+@author: yq
|
|
|
+@time: 2024/11/1
|
|
|
+@desc:
|
|
|
+"""
|
|
|
+import os.path
|
|
|
+from os.path import dirname, realpath
|
|
|
+from typing import Dict
|
|
|
+
|
|
|
+import numpy as np
|
|
|
+import pandas as pd
|
|
|
+import scorecardpy as sc
|
|
|
+import xgboost as xgb
|
|
|
+from sklearn2pmml import sklearn2pmml, make_pmml_pipeline
|
|
|
+
|
|
|
+from commom import f_df_to_image, f_display_images_by_side, GeneralException, f_display_title, \
|
|
|
+ f_image_crop_white_borders
|
|
|
+from config import BaseConfig
|
|
|
+from entitys import MetricFucResultEntity, DataSplitEntity, DataFeatureEntity
|
|
|
+from enums import ResultCodesEnum, ConstantEnum, FileEnum
|
|
|
+from .model_base import ModelBase
|
|
|
+from .model_utils import f_stress_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
|
|
|
+
|
|
|
+
|
|
|
class ModelXgb(ModelBase):
    """XGBoost classifier implementation of ``ModelBase``.

    Wraps ``xgb.XGBClassifier``: fits it with the hyper-parameters found in
    ``self.ml_config.params_xgb``, exposes the class-1 probability, saves the fitted
    model (native xgboost file plus a PMML export), reloads it, and builds the metric
    tables/images returned by :meth:`train_report`.
    """

    def __init__(self, *args, **kwargs):
        """Forward every argument to ``ModelBase`` and start without a fitted model."""
        super().__init__(*args, **kwargs)
        # Report template.
        # NOTE(review): the file name ends in "_lr" although this is the XGBoost model;
        # confirm that sharing this template is intended.
        self._template_path = os.path.join(dirname(dirname(realpath(__file__))), "./template/模型开发报告模板_lr.docx")
        self.model = None

    def get_report_template_path(self):
        """Return the path of the report template chosen in ``__init__``."""
        return self._template_path

    def train(self, train_data: DataFeatureEntity, test_data: DataFeatureEntity, *args, **kwargs):
        """Fit the classifier and cache the train/test probabilities.

        The estimator is configured from ``self.ml_config.params_xgb`` and fitted on
        ``train_data``; both data sets are handed to ``fit`` as ``eval_set``. When the
        ``trees_print`` option is truthy the dumped trees are printed. The class-1
        probabilities of both sets are stored in ``self._train_score`` and
        ``self._test_score``.

        Args:
            train_data: Features (``data_x``) and labels (``data_y``) used for fitting.
            test_data: Features and labels used as the second evaluation set.
        """
        print(f"{'-' * 50}开始训练{'-' * 50}")
        params_xgb = self.ml_config.params_xgb

        # The scikit-learn wrapper is used instead of the native ``xgb.train`` API;
        # presumably so that model_save() can export the estimator through sklearn2pmml.
        self.model = xgb.XGBClassifier(
            objective=params_xgb.get("objective"),
            n_estimators=params_xgb.get("num_boost_round"),
            max_depth=params_xgb.get("max_depth"),
            learning_rate=params_xgb.get("learning_rate"),
            random_state=params_xgb.get("random_state"),
            reg_alpha=params_xgb.get("alpha"),
            subsample=params_xgb.get("subsample"),
            colsample_bytree=params_xgb.get("colsample_bytree"),
            importance_type='weight',
        )

        # NOTE(review): ``eval_metric`` / ``early_stopping_rounds`` are passed to fit();
        # newer xgboost releases expect them on the constructor instead. Confirm the
        # xgboost version this project pins before upgrading.
        self.model.fit(
            X=train_data.data_x,
            y=train_data.data_y,
            eval_set=[(train_data.data_x, train_data.data_y), (test_data.data_x, test_data.data_y)],
            eval_metric=params_xgb.get("eval_metric"),
            early_stopping_rounds=params_xgb.get("early_stopping_rounds"),
            verbose=params_xgb.get("verbose_eval"),
        )

        if params_xgb.get("trees_print"):
            trees = self.model.get_booster().get_dump()
            # ``best_ntree_limit`` may be missing (e.g. no early stopping, or an xgboost
            # release without the attribute); dump every tree in that case instead of
            # failing after a successful fit.
            tree_limit = getattr(self.model, "best_ntree_limit", None)
            if tree_limit is None:
                tree_limit = len(trees)
            for i, tree in enumerate(trees[:tree_limit]):
                print(f"Tree {i}:")
                print(tree)

        self._train_score = self.prob(train_data.data_x)
        self._test_score = self.prob(test_data.data_x)

    def prob(self, x: pd.DataFrame, *args, **kwargs) -> np.ndarray:
        """Return column 1 of ``predict_proba`` (the class-1 probability) for each row of ``x``."""
        return self.model.predict_proba(x)[:, 1]

    def score(self, x: pd.DataFrame, *args, **kwargs) -> np.ndarray:
        """Placeholder: not implemented for this model, currently returns ``None``."""
        pass

    def score_rule(self, x: pd.DataFrame, *args, **kwargs) -> np.ndarray:
        """Placeholder: not implemented for this model, currently returns ``None``."""
        pass

    def model_save(self):
        """Save the fitted model in xgboost's own format and as a PMML file.

        Both target paths come from ``self.ml_config.f_get_save_path``.

        Raises:
            GeneralException: With ``ResultCodesEnum.NOT_FOUND`` when no model has been
                trained or loaded yet.
        """
        if self.model is None:
            # The exception has to be raised; merely constructing it lets execution fall
            # through to ``self.model.save_model`` and fail with an unrelated error.
            raise GeneralException(ResultCodesEnum.NOT_FOUND, message="模型不存在")

        path_model = self.ml_config.f_get_save_path(FileEnum.MODEL.value)
        self.model.save_model(path_model)
        print(f"model save to【{path_model}】success. ")

        path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
        pipeline = make_pmml_pipeline(self.model)
        sklearn2pmml(pipeline, path_pmml, with_repr=True, java_home=BaseConfig.java_home)

    def model_load(self, path: str, *args, **kwargs):
        """Load a previously saved model from the directory ``path``.

        Args:
            path: Directory expected to contain the file named ``FileEnum.MODEL.value``.

        Raises:
            GeneralException: With ``ResultCodesEnum.NOT_FOUND`` when ``path`` is not a
                directory or the model file is missing.
        """
        if not os.path.isdir(path):
            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")
        path_model = os.path.join(path, FileEnum.MODEL.value)
        if not os.path.isfile(path_model):
            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_model}】不存在")

        self.model = xgb.XGBClassifier()
        self.model.load_model(path_model)

        print(f"model load from【{path_model}】success.")

    def train_report(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
        """Build the metric tables and images for the training report.

        Reads ``self._train_score`` / ``self._test_score``, which :meth:`train` sets, so
        the model has to be trained on the same data first.

        Args:
            data: Provides ``train_data`` and ``test_data``; both are indexed with the
                configured label column.

        Returns:
            A dict with the entries "模型结果", "训练集分数分箱", "测试集分数分箱" and
            "模型稳定性", plus "压力测试" when ``ml_config.stress_test`` is truthy.
        """
        # Use the same ``ml_config`` accessor as the rest of the class.
        y_column = self.ml_config.y_column
        stress_test = self.ml_config.stress_test
        stress_sample_times = self.ml_config.stress_sample_times
        stress_bad_rate_list = self.ml_config.stress_bad_rate_list

        train_data = data.train_data
        test_data = data.test_data

        metric_value_dict = {}

        def _get_auc_ks(data_y, score, title):
            """Run ``sc.perf_eva``, save its figure as ``perf_<title>.png``, return (auc, ks, path)."""
            perf = sc.perf_eva(data_y, score, title=title, show_plot=True)
            path = self.ml_config.f_get_save_path(f"perf_{title}.png")
            perf["pic"].savefig(path)
            # Crop in place: source and destination are the same file.
            f_image_crop_white_borders(path, path)
            return perf["AUC"], perf["KS"], path

        def _get_perf():
            """Fill ``metric_value_dict`` with AUC/KS, score-bin and PSI results.

            Returns the train and test score-bin frames.
            """
            train_score = self._train_score
            test_score = self._test_score

            # Model KS / AUC.
            train_auc, train_ks, train_img = _get_auc_ks(train_data[y_column], train_score, "train")
            test_auc, test_ks, test_img = _get_auc_ks(test_data[y_column], test_score, "test")
            img_path_auc_ks = [train_img, test_img]

            df_auc_ks = pd.DataFrame()
            df_auc_ks["样本集"] = ["训练集", "测试集"]
            df_auc_ks["AUC"] = [train_auc, test_auc]
            df_auc_ks["KS"] = [train_ks, test_ks]
            metric_value_dict["模型结果"] = MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks,
                                                              image_size=5, table_font_size=10)

            # Score binning on the training set.
            train_score_bin, score_bins = f_get_model_score_bin(train_data, train_score)
            train_data_gain = f_calcu_model_ks(train_score_bin, y_column, sort_ascending=False)
            img_path_train_gain = self.ml_config.f_get_save_path("train_gain.png")
            f_df_to_image(train_data_gain, img_path_train_gain)
            metric_value_dict["训练集分数分箱"] = MetricFucResultEntity(table=train_data_gain,
                                                                 image_path=img_path_train_gain)

            # The bins returned for the training set are passed back in for the test
            # set; presumably so both sets share the same bin edges.
            test_score_bin, _ = f_get_model_score_bin(test_data, test_score, score_bins)
            test_data_gain = f_calcu_model_ks(test_score_bin, y_column, sort_ascending=False)
            img_path_test_gain = self.ml_config.f_get_save_path("test_gain.png")
            f_df_to_image(test_data_gain, img_path_test_gain)
            metric_value_dict["测试集分数分箱"] = MetricFucResultEntity(table=test_data_gain,
                                                                 image_path=img_path_test_gain)

            # Model score PSI between train and test.
            model_psi = f_calcu_model_psi(train_score_bin, test_score_bin, sort_ascending=False)
            img_path_psi = self.ml_config.f_get_save_path("model_psi.png")
            f_df_to_image(model_psi, img_path_psi)
            metric_value_dict["模型稳定性"] = MetricFucResultEntity(table=model_psi,
                                                               value=model_psi["psi"].sum().round(3),
                                                               image_path=img_path_psi)
            return train_score_bin, test_score_bin

        _, test_score_bin = _get_perf()

        # Stress test on the binned test scores.
        if stress_test:
            df_stress = f_stress_test(test_score_bin, sample_times=stress_sample_times,
                                      bad_rate_list=stress_bad_rate_list,
                                      target_column=y_column, score_column=ConstantEnum.SCORE.value,
                                      sort_ascending=False)

            img_path_stress = self.ml_config.f_get_save_path("stress_test.png")
            f_df_to_image(df_stress, img_path_stress)
            metric_value_dict["压力测试"] = MetricFucResultEntity(table=df_stress, image_path=img_path_stress)

        if self.ml_config.jupyter_print:
            self.jupyter_print(metric_value_dict)

        return metric_value_dict

    def jupyter_print(self, metric_value_dict: Dict[str, MetricFucResultEntity], *args, **kwargs):
        """Display the report metrics through IPython.

        Args:
            metric_value_dict: Result of :meth:`train_report`. The entries "模型结果",
                "模型稳定性", "训练集分数分箱" and "测试集分数分箱" are required;
                "压力测试" is shown only when present.
        """
        # Imported here rather than at module level, so IPython is only needed when
        # this method actually runs.
        from IPython import display

        f_display_title(display, "模型结果")
        display.display(metric_value_dict["模型结果"].table)
        f_display_images_by_side(display, metric_value_dict["模型结果"].image_path)

        # Model PSI.
        f_display_title(display, "模型psi")
        display.display(metric_value_dict["模型稳定性"].table)
        print(f"模型psi: {metric_value_dict['模型稳定性'].value}")

        f_display_title(display, "分数分箱")
        print("训练集-分数分箱")
        display.display(metric_value_dict["训练集分数分箱"].table)
        print("测试集-分数分箱")
        display.display(metric_value_dict["测试集分数分箱"].table)

        if "压力测试" in metric_value_dict:
            f_display_title(display, "压力测试")
            display.display(metric_value_dict["压力测试"].table)
|
|
|
+
|
|
|
+
|
|
|
# Nothing is executed when this file is run directly; the guard body is a no-op.
if __name__ == "__main__":
    pass
|