Browse Source

add: xgb增量学习原模型基础上新增树模式

yq 1 day ago
parent
commit
3606fc6f99
6 changed files with 142 additions and 31 deletions
  1. 1 0
      entitys/ol_config_entity.py
  2. 1 1
      model/model_xgb.py
  3. 19 0
      ol_test_xgb.py
  4. 59 11
      online_learning/trainer_xgb.py
  5. 55 12
      online_learning_demo.ipynb
  6. 7 7
      train_test_xgb.py

+ 1 - 0
entitys/ol_config_entity.py

@@ -139,6 +139,7 @@ class OnlineLearningConfigEntity():
             'trees_print': False,
             # tree_add tree_refresh
             'oltype': "refresh",
+            'add_columns': []
         }
         params.update(self._params_xgb)
 

+ 1 - 1
model/model_xgb.py

@@ -141,7 +141,7 @@ class ModelXgb(ModelBase):
             diff["prob_pipeline"] = prob_pipeline
             diff["diff"] = diff["prob_pmml"] - diff["prob_pipeline"]
             diff["diff_format"] = diff["diff"].apply(lambda x: 1 if abs(x) < 0.001 else 0)
-            print(f"pmml模型结果一致率(误差小于0.001):{len(diff) / diff['diff_format'].sum().round(3) * 100}%")
+            print(f"pmml模型结果一致率(误差小于0.001):{(diff['diff_format'].sum() / len(diff)).round(3) * 100}%")
 
         if params_xgb.get("trees_print"):
             trees = self.model.get_booster().get_dump()

+ 19 - 0
ol_test_xgb.py

@@ -40,6 +40,25 @@ if __name__ == "__main__":
         "stress_sample_times": 10,
         "columns_anns": {
             "age_in_years": "年龄"
+        },
+        "params_xgb": {
+            'objective': 'binary:logistic',
+            'eval_metric': 'auc',
+            'learning_rate': 0.1,
+            'max_depth': 3,
+            'subsample': None,
+            'colsample_bytree': None,
+            'alpha': 0,
+            'lambda': 1,
+            'num_boost_round': 7,
+            'early_stopping_rounds': 20,
+            'verbose_eval': 10,
+            'random_state': 2025,
+            'save_pmml': True,
+            'trees_print': False,
+            # tree_refresh tree_add
+            'oltype': "tree_add",
+            'add_columns': ['age_in_years'],
         }
     }
 

+ 59 - 11
online_learning/trainer_xgb.py

@@ -12,8 +12,11 @@ import joblib
 import pandas as pd
 import scorecardpy as sc
 import xgboost as xgb
+from pandas.core.dtypes.common import is_numeric_dtype
 from pypmml import Model
+from sklearn.preprocessing import OneHotEncoder
 from sklearn2pmml import PMMLPipeline, sklearn2pmml
+from sklearn_pandas import DataFrameMapper
 from tqdm import tqdm
 
 from commom import GeneralException, f_image_crop_white_borders, f_df_to_image, f_display_title, \
@@ -81,7 +84,7 @@ class OnlineLearningTrainerXgb:
         if ntree is None:
             df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
             print(f"选择最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
-            self._train(int(df_param_sort.iloc[0][3]))
+            self._train(int(df_param_sort.iloc[0][5]))
         else:
             print(f"选择ntree:【{ntree}】的参数:\n{df_param[df_param['ntree'] == ntree].iloc[0].to_dict()}")
             self._train(ntree)
@@ -96,13 +99,14 @@ class OnlineLearningTrainerXgb:
             # pmml与原生模型结果一致性校验
             model_pmml = Model.fromFile(path_pmml)
             prob_pmml = model_pmml.predict(data)["probability(1)"]
-            prob_pipeline = self._pipeline_optimized.predict_proba(data)[:, 1]
+            with silent_print():
+                prob_pipeline = self._pipeline_optimized.predict_proba(data)[:, 1]
             diff = pd.DataFrame()
             diff["prob_pmml"] = prob_pmml
             diff["prob_pipeline"] = prob_pipeline
             diff["diff"] = diff["prob_pmml"] - diff["prob_pipeline"]
             diff["diff_format"] = diff["diff"].apply(lambda x: 1 if abs(x) < 0.001 else 0)
-            print(f"pmml模型结果一致率(误差小于0.001):{len(diff) / diff['diff_format'].sum().round(3) * 100}%")
+            print(f"pmml模型结果一致率(误差小于0.001):{(diff['diff_format'].sum() / len(diff)).round(3) * 100}%")
 
     def _f_get_metric_auc_ks(self, model_type: str):
         def _get_auc_ks(data, title):
@@ -171,7 +175,8 @@ class OnlineLearningTrainerXgb:
     def prob(self, x: pd.DataFrame, pipeline=None, ntree_limit=None):
         if pipeline is None:
             pipeline = self._pipeline_optimized
-        y_prob = pipeline.predict_proba(x, ntree_limit=ntree_limit)[:, 1]
+        with silent_print():
+            y_prob = pipeline.predict_proba(x, ntree_limit=ntree_limit)[:, 1]
         return y_prob
 
     def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None, print_sum=True,
@@ -204,6 +209,34 @@ class OnlineLearningTrainerXgb:
                 **params_xgb,
             )
         else:
+            # 处理新增的变量
+            add_columns = params_xgb.get("add_columns")
+            num_columns = []
+            for x_column in add_columns:
+                if is_numeric_dtype(train_data[x_column]):
+                    num_columns.append(x_column)
+            str_columns = [i for i in add_columns if i not in num_columns]
+            mapper_new = []
+            if len(str_columns) > 0:
+                mapper_new.append((str_columns, OneHotEncoder()))
+            for x_column in num_columns:
+                mapper_new.append((x_column, None))
+            mapper_new = DataFrameMapper(mapper_new)
+            mapper_new.fit(self._data.data)
+            features_new = mapper_new.features
+            built_features_new = mapper_new.built_features
+
+            # 合并特征处理器
+            mapper_old: list = self._pipeline_optimized.steps[0][1]
+            features_old = mapper_old.features
+            features_old.extend(features_new)
+            built_features_old = mapper_old.built_features
+            built_features_old.extend(built_features_new)
+            mapper_old.features = features_old
+            mapper_old.built_features = built_features_old
+            self._pipeline_optimized.steps[0] = ("mapper", mapper_old)
+
+            # 模型初始化
             self.model_optimized = xgb.XGBClassifier(
                 n_estimators=n_estimators if n_estimators else params_xgb.get("num_boost_round"),
                 reg_alpha=params_xgb.get("alpha"),
@@ -212,11 +245,19 @@ class OnlineLearningTrainerXgb:
                 **params_xgb,
             )
         self._pipeline_optimized.steps[-1] = ("classifier", self.model_optimized)
+
+        feature_names_old = model_original.get_booster().feature_names
+        data_transform = self._pipeline_optimized.Xtransform(self._data.data)
+        feature_names_new = [f"f{i}" for i in range(data_transform.shape[1])]
+        model_original.get_booster().feature_names = feature_names_new
+
         with silent_print():
             self._pipeline_optimized.fit(train_data, train_data[y_column],
                                          classifier__verbose=False,
                                          classifier__xgb_model=model_original.get_booster(),
                                          )
+        model_original.get_booster().feature_names = feature_names_old
+
         return ntree
 
     def train(self, ):
@@ -225,7 +266,7 @@ class OnlineLearningTrainerXgb:
         train_data = self._data.train_data
         test_data = self._data.test_data
 
-        df_param_columns = ["auc_test", "ks_test", "psi", "ntree"]
+        df_param_columns = ["auc_train", "ks_train", "auc_test", "ks_test", "psi", "ntree"]
         self._df_param_optimized = pd.DataFrame(columns=df_param_columns)
         ntree = self._train()
         print(f"原模型一共有【{ntree}】棵树")
@@ -241,9 +282,11 @@ class OnlineLearningTrainerXgb:
                 ntree_limit = n + 1
             else:
                 ntree_limit = ntree + n + 1
-            test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=ntree_limit)[:, 1]
+            with silent_print():
+                train_y_prob = self._pipeline_optimized.predict_proba(train_data, ntree_limit=ntree_limit)[:, 1]
+                test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=ntree_limit)[:, 1]
+            train_y = train_data[y_column]
             test_y = test_data[y_column]
-
             psi = round(self.psi(train_data, test_data, print_sum=False, ntree_limit=ntree_limit)['psi'].sum(), 3)
 
             # auc_test = roc_auc_score(test_y, test_y_prob)
@@ -252,10 +295,15 @@ class OnlineLearningTrainerXgb:
             # dfkslift = eva_dfkslift(df)
             # ks_test = round(dfkslift["ks"].max(), 4)
 
+            perf = sc.perf_eva(train_y, train_y_prob, show_plot=False)
+            auc_train = perf["AUC"]
+            ks_train = perf["KS"]
+
             perf = sc.perf_eva(test_y, test_y_prob, show_plot=False)
             auc_test = perf["AUC"]
             ks_test = perf["KS"]
-            row = dict(zip(df_param_columns, [auc_test, ks_test, psi, n + 1]))
+
+            row = dict(zip(df_param_columns, [auc_train, ks_train, auc_test, ks_test, psi, n + 1]))
             self._df_param_optimized.loc[len(self._df_param_optimized)] = row
 
     def save(self):
@@ -268,9 +316,9 @@ class OnlineLearningTrainerXgb:
         joblib.dump(self._pipeline_optimized, path_model)
         print(f"model save to【{path_model}】success. ")
         # 在xgb的增量学习下直接保存pipeline会出错,所以这里需要单独保存xgb model,然后进行复原
-        path_model = self._ol_config.f_get_save_path(FileEnum.MODEL_XGB.value)
-        self.model_optimized.save_model(path_model)
-        print(f"model save to【{path_model}】success. ")
+        # path_model = self._ol_config.f_get_save_path(FileEnum.MODEL_XGB.value)
+        # self.model_optimized.save_model(path_model)
+        # print(f"model save to【{path_model}】success. ")
 
     @staticmethod
     def load(path: str):

+ 55 - 12
online_learning_demo.ipynb

@@ -3,7 +3,11 @@
   {
    "cell_type": "markdown",
    "id": "38ecba89",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     "# lr"
    ]
@@ -12,7 +16,11 @@
    "cell_type": "code",
    "execution_count": 2,
    "id": "4807cd30",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -50,7 +58,10 @@
    "execution_count": 3,
    "id": "20ee7fc7",
    "metadata": {
-    "code_folding": []
+    "code_folding": [],
+    "pycharm": {
+     "name": "#%%\n"
+    }
    },
    "outputs": [
     {
@@ -2242,7 +2253,11 @@
   {
    "cell_type": "markdown",
    "id": "1c57b8b9",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     "## 加载模型"
    ]
@@ -2251,7 +2266,11 @@
    "cell_type": "code",
    "execution_count": 6,
    "id": "34773917",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -2286,7 +2305,11 @@
   {
    "cell_type": "markdown",
    "id": "a6e8bb2c",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     "## 计算psi"
    ]
@@ -2295,7 +2318,11 @@
    "cell_type": "code",
    "execution_count": 4,
    "id": "97e6c7fd",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -2459,7 +2486,11 @@
   {
    "cell_type": "markdown",
    "id": "0cc6d099",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     "# xgb"
    ]
@@ -2468,7 +2499,11 @@
    "cell_type": "code",
    "execution_count": 4,
    "id": "95af6493",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -4209,7 +4244,11 @@
    "cell_type": "code",
    "execution_count": 5,
    "id": "28a40327",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -4244,7 +4283,11 @@
    "cell_type": "code",
    "execution_count": 6,
    "id": "8f8addf4",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -4448,4 +4491,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}

+ 7 - 7
train_test_xgb.py

@@ -35,7 +35,7 @@ if __name__ == "__main__":
         # jupyter下输出内容
         "jupyter_print": True,
         # 是否开启粗分箱
-        "format_bin": True,
+        "format_bin": False,
         "max_feature_num": 5,
         # 压力测试
         "stress_test": False,
@@ -44,17 +44,17 @@ if __name__ == "__main__":
         # y
         "y_column": "creditability",
         # 参与建模的候选变量
-        # "x_columns": [
-        # "duration_in_month",
-        # "credit_amount",
-        # "age_in_years",
+        "x_columns": [
+        "duration_in_month",
+        "credit_amount",
+        "age_in_years",
         # "purpose",
         # "credit_history",
         # "random",
-
+        #
         # "credit_amount_corr1",
         # "credit_amount_corr2",
-        #   ],
+          ],
         # 变量释义
         "columns_anns": {
             "age_in_years": "年龄",