|
@@ -0,0 +1,267 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+"""
|
|
|
+@author: yq
|
|
|
+@time: 2025/2/27
|
|
|
+@desc:
|
|
|
+"""
|
|
|
+import os
|
|
|
+from os.path import dirname, realpath
|
|
|
+from typing import Dict, List
|
|
|
+
|
|
|
+import joblib
|
|
|
+import pandas as pd
|
|
|
+import scorecardpy as sc
|
|
|
+import xgboost as xgb
|
|
|
+from sklearn2pmml import PMMLPipeline
|
|
|
+from tqdm import tqdm
|
|
|
+
|
|
|
+from commom import GeneralException, f_image_crop_white_borders, f_df_to_image, f_display_title, \
|
|
|
+ f_display_images_by_side
|
|
|
+from entitys import DataSplitEntity, OnlineLearningConfigEntity, MetricFucResultEntity
|
|
|
+from enums import ResultCodesEnum, ConstantEnum, FileEnum
|
|
|
+from init import init
|
|
|
+from model import f_get_model_score_bin, f_calcu_model_ks, f_stress_test, f_calcu_model_psi, Xtransformer_fit, \
|
|
|
+ Xtransform, fit
|
|
|
+
|
|
|
+init()
|
|
|
+
|
|
|
+
|
|
|
class OnlineLearningTrainerXgb:
    """Online-learning trainer that refreshes a persisted XGBoost PMML pipeline
    on new data and produces comparison reports (new vs. original model)."""

    def __init__(self, data: DataSplitEntity = None, ol_config: OnlineLearningConfigEntity = None, *args, **kwargs):
        # Monkey-patch PMMLPipeline so fit/transform follow the project's
        # custom online-learning implementations (from the `model` module).
        PMMLPipeline.Xtransformer_fit = Xtransformer_fit
        PMMLPipeline.Xtransform = Xtransform
        PMMLPipeline.fit = fit

        # Either take a ready-made config entity or build one from kwargs.
        if ol_config is not None:
            self._ol_config = ol_config
        else:
            self._ol_config = OnlineLearningConfigEntity(*args, **kwargs)

        self._data = data
        # Filled by train(): one row (auc_test, ks_test, ntree) per tree prefix.
        self._df_param_optimized = None
        self._model_optimized_list = []
        # Declared here, assigned in _init() from the saved model file.
        self._pipeline_original: PMMLPipeline
        self._pipeline_optimized: PMMLPipeline

        # Report template (docx) used by the (currently disabled) word report.
        self._template_path = os.path.join(dirname(dirname(realpath(__file__))),
                                           "./template/OnlineLearning报告模板_xgb.docx")

        self._init(self._ol_config.path_resources)
|
|
|
+
|
|
|
+ def _init(self, path: str):
|
|
|
+ if not os.path.isdir(path):
|
|
|
+ raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"【{path}】不是文件夹")
|
|
|
+ path_model = os.path.join(path, FileEnum.MODEL.value)
|
|
|
+ if not os.path.isfile(path_model):
|
|
|
+ raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_model}】不存在")
|
|
|
+
|
|
|
+ self._pipeline_original = joblib.load(path_model)
|
|
|
+ self._pipeline_optimized = joblib.load(path_model)
|
|
|
+ print(f"model load from【{path_model}】success.")
|
|
|
+
|
|
|
+ def _f_get_best_model(self, df_param: pd.DataFrame, ntree: int = None):
|
|
|
+ if ntree is None:
|
|
|
+ df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
|
|
|
+ print(f"选择最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
|
|
|
+ self._train(df_param_sort.iloc[0][2])
|
|
|
+ else:
|
|
|
+ print(f"选择ntree:【{ntree}】的参数:\n{df_param[df_param['ntree'] == ntree].iloc[0].to_dict()}")
|
|
|
+ self._train(ntree)
|
|
|
+
|
|
|
    def _f_get_metric_auc_ks(self, model_type: str):
        """Compute AUC/KS on the full, train and test sets for one pipeline and
        save the perf plots to disk.

        model_type: "新模型" selects the optimized pipeline; any other value
        selects the original pipeline. The string is also used in plot titles
        and file names.
        Returns a MetricFucResultEntity with the AUC/KS table and image paths.
        """
        def _get_auc_ks(data, title):
            # `model` is closed over from the enclosing scope below.
            y = data[self._ol_config.y_column]
            y_prob = self.prob(data, model)
            perf = sc.perf_eva(y, y_prob, title=f"{title}", show_plot=True)
            path = self._ol_config.f_get_save_path(f"perf_{title}.png")
            perf["pic"].savefig(path)
            auc = perf["AUC"]
            ks = perf["KS"]
            # Trim surrounding whitespace so images align side-by-side in reports.
            f_image_crop_white_borders(path, path)
            return auc, ks, path

        train_data = self._data.train_data
        test_data = self._data.test_data
        data = self._data.data
        model = self._pipeline_optimized
        if model_type != "新模型":
            model = self._pipeline_original

        img_path_auc_ks = []
        auc, ks, path = _get_auc_ks(data, f"{model_type}-建模数据")
        img_path_auc_ks.append(path)
        train_auc, train_ks, path = _get_auc_ks(train_data, f"{model_type}-训练集")
        img_path_auc_ks.append(path)
        test_auc, test_ks, path = _get_auc_ks(test_data, f"{model_type}-测试集")
        img_path_auc_ks.append(path)

        df_auc_ks = pd.DataFrame()
        df_auc_ks["样本集"] = ["建模数据", "训练集", "测试集"]
        df_auc_ks["AUC"] = [auc, train_auc, test_auc]
        df_auc_ks["KS"] = [ks, train_ks, test_ks]

        return MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks, image_size=5, table_font_size=10)
|
|
|
+
|
|
|
    def _f_get_metric_gain(self, model_type: str):
        """Build the score-bin gain/KS table on the full dataset for one model
        and save it as an image.

        model_type: "新模型" selects the optimized pipeline; any other value
        selects the original pipeline.
        Returns a MetricFucResultEntity with the gain table and image path.
        """
        y_column = self._ol_config.y_column
        data = self._data.data

        model = self._pipeline_optimized
        if model_type != "新模型":
            model = self._pipeline_original

        score = self.prob(data, model)
        # Bin on the model score, then compute per-bin KS/gain statistics.
        score_bin, _ = f_get_model_score_bin(data, score)
        gain = f_calcu_model_ks(score_bin, y_column, sort_ascending=False)
        img_path_gain = self._ol_config.f_get_save_path(f"{model_type}-gain.png")
        f_df_to_image(gain, img_path_gain)

        return MetricFucResultEntity(table=gain, image_path=img_path_gain)
|
|
|
+
|
|
|
    def _f_get_stress_test(self, ):
        """Run the configured stress test on the optimized model's score bins
        and save the resulting table as an image.

        Sampling counts and bad-rate scenarios come from the config entity;
        the actual resampling logic lives in `f_stress_test` (model module).
        Returns a MetricFucResultEntity with the stress table and image path.
        """
        stress_sample_times = self._ol_config.stress_sample_times
        stress_bad_rate_list = self._ol_config.stress_bad_rate_list
        y_column = self._ol_config.y_column
        data = self._data.data
        score = self.prob(data, self._pipeline_optimized)
        score_bin, _ = f_get_model_score_bin(data, score)
        df_stress = f_stress_test(score_bin, sample_times=stress_sample_times, bad_rate_list=stress_bad_rate_list,
                                  target_column=y_column, score_column=ConstantEnum.SCORE.value, sort_ascending=False)

        img_path_stress = self._ol_config.f_get_save_path(f"stress.png")
        f_df_to_image(df_stress, img_path_stress)
        return MetricFucResultEntity(table=df_stress, image_path=img_path_stress)
|
|
|
+
|
|
|
+ def prob(self, x: pd.DataFrame, pipeline=None):
|
|
|
+ if pipeline is None:
|
|
|
+ pipeline = self._pipeline_optimized
|
|
|
+ y_prob = pipeline.predict_proba(x)[:, 1]
|
|
|
+ return y_prob
|
|
|
+
|
|
|
+ def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None) -> pd.DataFrame:
|
|
|
+ y1 = self.prob(x1)
|
|
|
+ y2 = self.prob(x2)
|
|
|
+ x1_score_bin, score_bins = f_get_model_score_bin(x1, y1, points)
|
|
|
+ x2_score_bin, _ = f_get_model_score_bin(x2, y2, score_bins)
|
|
|
+ model_psi = f_calcu_model_psi(x1_score_bin, x2_score_bin, sort_ascending=False)
|
|
|
+ print(f"模型psi: {model_psi['psi'].sum()}")
|
|
|
+ return model_psi
|
|
|
+
|
|
|
    def _train(self, n_estimators: int = None):
        """Refresh-train the optimized pipeline starting from the original booster.

        Uses xgboost's process_type="update" with updater="refresh": the tree
        structure of the original model is kept and leaf values are refreshed
        on the training data (refresh_leaf=True).

        n_estimators: number of trees to refresh; defaults to the original
            model's tree count.
        Returns the original model's tree count (ntree).
        """
        y_column = self._ol_config.y_column
        train_data = self._data.train_data

        # Last pipeline step is assumed to be the XGBClassifier.
        model_original: xgb.XGBClassifier = self._pipeline_original.steps[-1][1]
        # NOTE(review): best_ntree_limit was removed in xgboost >= 2.0 — confirm
        # the pinned xgboost version still exposes this attribute.
        ntree = model_original.n_estimators if model_original.best_ntree_limit is None else model_original.best_ntree_limit
        model_optimized = xgb.XGBClassifier(
            n_estimators=n_estimators if n_estimators else ntree,
            updater="refresh",
            process_type="update",
            refresh_leaf=True,
            learning_rate=self._ol_config.lr,
            random_state=self._ol_config.random_state,
        )
        # Swap in the fresh classifier step, then fit from the original booster
        # (classifier__xgb_model) so only leaf weights get updated.
        self._pipeline_optimized.steps[-1] = ("classifier", model_optimized)
        self._pipeline_optimized.fit(train_data, train_data[y_column],
                                     classifier__verbose=False,
                                     classifier__xgb_model=model_original.get_booster(),
                                     )
        return ntree
|
|
|
+
|
|
|
    def train(self, ):
        """Refresh the model, then evaluate AUC/KS on the test set for every
        tree-count prefix 1..ntree, filling self._df_param_optimized so that
        report()/_f_get_best_model can later pick the best round count."""
        y_column = self._ol_config.y_column
        test_data = self._data.test_data

        df_param_columns = ["auc_test", "ks_test", "ntree"]
        self._df_param_optimized = pd.DataFrame(columns=df_param_columns)
        ntree = self._train()
        print(f"原模型一共有【{ntree}】棵树")
        for n in tqdm(range(ntree)):
            # Evaluate using only the first n trees (1-based).
            n = n + 1
            # NOTE(review): predict_proba's ntree_limit kwarg is deprecated in
            # newer xgboost (replaced by iteration_range) — confirm the pinned
            # version accepts it.
            test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=n)[:, 1]
            test_y = test_data[y_column]
            perf = sc.perf_eva(test_y, test_y_prob, show_plot=False)
            auc_test = perf["AUC"]
            ks_test = perf["KS"]
            row = dict(zip(df_param_columns, [auc_test, ks_test, n]))
            self._df_param_optimized.loc[len(self._df_param_optimized)] = row
|
|
|
+
|
|
|
+ def save(self):
|
|
|
+ self._ol_config.config_save()
|
|
|
+
|
|
|
+ if self._pipeline_optimized is None:
|
|
|
+ GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
|
|
|
+
|
|
|
+ path_model = self._ol_config.f_get_save_path(FileEnum.MODEL.value)
|
|
|
+ joblib.dump(self._pipeline_optimized, path_model)
|
|
|
+ print(f"model save to【{path_model}】success. ")
|
|
|
+
|
|
|
    @staticmethod
    def load(path: str):
        """Alternate constructor: build a trainer from a saved config directory.

        NOTE(review): writes the entity's private attribute _path_resources
        directly — consider exposing a setter or constructor argument instead.
        """
        ol_config = OnlineLearningConfigEntity.from_config(path)
        ol_config._path_resources = path
        return OnlineLearningTrainerXgb(ol_config=ol_config)
|
|
|
+
|
|
|
    def report(self, ntree: int = None):
        """Select the best (or caller-specified) tree count, then assemble all
        report metrics; optionally pretty-print them in Jupyter.

        ntree: explicit tree count to use; None picks the best row from the
            optimization table built by train().
        """
        self._f_get_best_model(self._df_param_optimized, ntree)

        if self._ol_config.jupyter_print:
            from IPython import display
            f_display_title(display, "模型优化过程")
            display.display(self._df_param_optimized)

        metric_value_dict = {}

        # Sample distribution
        metric_value_dict["样本分布"] = MetricFucResultEntity(table=self._data.get_distribution(self._ol_config.y_column),
                                                          table_font_size=10, table_cell_width=3)

        # Model results: new vs. original
        metric_value_dict[f"模型结果-新模型"] = self._f_get_metric_auc_ks("新模型")
        metric_value_dict[f"模型结果-原模型"] = self._f_get_metric_auc_ks("原模型")

        # Score binning
        metric_value_dict["分数分箱-建模数据-新模型"] = self._f_get_metric_gain("新模型")
        metric_value_dict["分数分箱-建模数据-原模型"] = self._f_get_metric_gain("原模型")

        # Stress test (only when enabled in config)
        if self._ol_config.stress_test:
            metric_value_dict["压力测试"] = self._f_get_stress_test()

        if self._ol_config.jupyter_print:
            self.jupyter_print(metric_value_dict)

        # Word-report generation is currently disabled:
        # save_path = self._ol_config.f_get_save_path("OnlineLearning报告.docx")
        # ReportWord.generate_report(metric_value_dict, self._template_path, save_path=save_path)
        # print(f"模型报告文件储存路径:{save_path}")
|
|
|
+
|
|
|
+ def jupyter_print(self, metric_value_dict=Dict[str, MetricFucResultEntity]):
|
|
|
+ from IPython import display
|
|
|
+
|
|
|
+ f_display_title(display, "样本分布")
|
|
|
+ display.display(metric_value_dict["样本分布"].table)
|
|
|
+
|
|
|
+ f_display_title(display, "模型结果")
|
|
|
+ print(f"原模型")
|
|
|
+ display.display(metric_value_dict["模型结果-原模型"].table)
|
|
|
+ f_display_images_by_side(display, metric_value_dict["模型结果-原模型"].image_path)
|
|
|
+ print(f"新模型")
|
|
|
+ display.display(metric_value_dict["模型结果-新模型"].table)
|
|
|
+ f_display_images_by_side(display, metric_value_dict["模型结果-新模型"].image_path)
|
|
|
+
|
|
|
+ f_display_title(display, "分数分箱")
|
|
|
+ print(f"建模数据上分数分箱")
|
|
|
+ print(f"原模型")
|
|
|
+ display.display(metric_value_dict["分数分箱-建模数据-原模型"].table)
|
|
|
+ print(f"新模型")
|
|
|
+ display.display(metric_value_dict["分数分箱-建模数据-新模型"].table)
|
|
|
+
|
|
|
+ if "压力测试" in metric_value_dict.keys():
|
|
|
+ f_display_title(display, "压力测试")
|
|
|
+ display.display(metric_value_dict["压力测试"].table)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Module is intended to be imported; no CLI entry point.
    pass
|