Преглед на файлове

add: xgb增量学习原模型基础上新增树模式

yq преди 2 дни
родител
ревизия
c834a6bceb
променени са 4 файла, в които са добавени 72 реда и са изтрити 28 реда
  1. 3 2
      entitys/ml_config_entity.py
  2. 27 3
      entitys/ol_config_entity.py
  3. 4 8
      model/model_xgb.py
  4. 38 15
      online_learning/trainer_xgb.py

+ 3 - 2
entitys/ml_config_entity.py

@@ -153,8 +153,9 @@ class MlConfigEntity():
             'max_depth': 3,
             'subsample': None,
             'colsample_bytree': None,
-            'alpha': None,
-            'num_boost_round': 500,
+            'alpha': 0,
+            'lambda': 1,
+            'num_boost_round': 100,
             'early_stopping_rounds': 20,
             'verbose_eval': 10,
             'random_state': 2025,

+ 27 - 3
entitys/ol_config_entity.py

@@ -29,6 +29,7 @@ class OnlineLearningConfigEntity():
                  stress_test=False,
                  stress_sample_times=100,
                  stress_bad_rate_list: List[float] = [],
+                 params_xgb={},
                  *args, **kwargs):
 
         self._path_resources = path_resources
@@ -53,15 +54,14 @@ class OnlineLearningConfigEntity():
 
         self._save_pmml = save_pmml
 
-        # 是否开启下输出内容
         self._stress_test = stress_test
 
-        # jupyter下输出内容
         self._stress_sample_times = stress_sample_times
 
-        # jupyter下输出内容
         self._stress_bad_rate_list = stress_bad_rate_list
 
+        self._params_xgb = params_xgb
+
         if self._project_name is None or len(self._project_name) == 0:
             self._base_dir = os.path.join(BaseConfig.train_path, f"{f_get_datetime()}")
         else:
@@ -120,6 +120,30 @@ class OnlineLearningConfigEntity():
     def stress_bad_rate_list(self):
         return self._stress_bad_rate_list
 
+    @property
+    def params_xgb(self):
+        params = {
+            'objective': 'binary:logistic',
+            'eval_metric': 'auc',
+            'learning_rate': 0.1,
+            'max_depth': 3,
+            'subsample': None,
+            'colsample_bytree': None,
+            'alpha': 0,
+            'lambda': 1,
+            'num_boost_round': 100,
+            'early_stopping_rounds': 20,
+            'verbose_eval': 10,
+            'random_state': 2025,
+            'save_pmml': True,
+            'trees_print': False,
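+            # oltype: "tree_refresh" refreshes the original model's trees; any other value (including the default below) selects the add-new-trees mode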
+            'oltype': "refresh",
+        }
+        params.update(self._params_xgb)
+
+        return params
+
     @staticmethod
     def from_config(config_path: str):
         """

+ 4 - 8
model/model_xgb.py

@@ -82,15 +82,11 @@ class ModelXgb(ModelBase):
         # )
 
         # xgb二次封装为sklearn接口
-        self.model = xgb.XGBClassifier(objective=params_xgb.get("objective"),
-                                       n_estimators=params_xgb.get("num_boost_round"),
-                                       max_depth=params_xgb.get("max_depth"),
-                                       learning_rate=params_xgb.get("learning_rate"),
-                                       random_state=params_xgb.get("random_state"),
+        self.model = xgb.XGBClassifier(n_estimators=params_xgb.get("num_boost_round"),
                                        reg_alpha=params_xgb.get("alpha"),
-                                       subsample=params_xgb.get("subsample"),
-                                       colsample_bytree=params_xgb.get("colsample_bytree"),
-                                       importance_type='weight'
+                                       reg_lambda=params_xgb.get("lambda"),
+                                       importance_type='weight',
+                                       **params_xgb
                                        )
 
         # self.model.fit(X=train_data.data_x, y=train_data.data_y,

+ 38 - 15
online_learning/trainer_xgb.py

@@ -67,7 +67,7 @@ class OnlineLearningTrainerXgb:
             model = xgb.XGBClassifier()
             model.load_model(path_model)
             self._pipeline_optimized.steps[-1] = ("classifier", model)
-        print(f"model load from【{path_model}】success.")
+            print(f"model load from【{path_model}】success.")
 
     def _f_rewrite_pmml(self, path_pmml: str):
         with open(path_pmml, mode="r", encoding="utf-8") as f:
@@ -81,7 +81,7 @@ class OnlineLearningTrainerXgb:
         if ntree is None:
             df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
             print(f"选择最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
-            self._train(int(df_param_sort.iloc[0][2]))
+            self._train(int(df_param_sort.iloc[0][3]))
         else:
             print(f"选择ntree:【{ntree}】的参数:\n{df_param[df_param['ntree'] == ntree].iloc[0].to_dict()}")
             self._train(ntree)
@@ -188,17 +188,29 @@ class OnlineLearningTrainerXgb:
     def _train(self, n_estimators: int = None):
         y_column = self._ol_config.y_column
         train_data = self._data.train_data
+        params_xgb = self._ol_config.params_xgb
 
         model_original: xgb.XGBClassifier = self._pipeline_original.steps[-1][1]
         ntree = model_original.n_estimators if model_original.best_ntree_limit is None else model_original.best_ntree_limit
-        self.model_optimized = xgb.XGBClassifier(
-            n_estimators=n_estimators if n_estimators else ntree,
-            updater="refresh",
-            process_type="update",
-            refresh_leaf=True,
-            learning_rate=self._ol_config.lr,
-            random_state=self._ol_config.random_state,
-        )
+        if params_xgb.get("oltype") == "tree_refresh":
+            self.model_optimized = xgb.XGBClassifier(
+                n_estimators=n_estimators if n_estimators else ntree,
+                reg_alpha=params_xgb.get("alpha"),
+                reg_lambda=params_xgb.get("lambda"),
+                importance_type='weight',
+                updater="refresh",
+                process_type="update",
+                refresh_leaf=True,
+                **params_xgb,
+            )
+        else:
+            self.model_optimized = xgb.XGBClassifier(
+                n_estimators=n_estimators if n_estimators else params_xgb.get("num_boost_round"),
+                reg_alpha=params_xgb.get("alpha"),
+                reg_lambda=params_xgb.get("lambda"),
+                importance_type='weight',
+                **params_xgb,
+            )
         self._pipeline_optimized.steps[-1] = ("classifier", self.model_optimized)
         with silent_print():
             self._pipeline_optimized.fit(train_data, train_data[y_column],
@@ -209,6 +221,7 @@ class OnlineLearningTrainerXgb:
 
     def train(self, ):
         y_column = self._ol_config.y_column
+        params_xgb = self._ol_config.params_xgb
         train_data = self._data.train_data
         test_data = self._data.test_data
 
@@ -216,12 +229,22 @@ class OnlineLearningTrainerXgb:
         self._df_param_optimized = pd.DataFrame(columns=df_param_columns)
         ntree = self._train()
         print(f"原模型一共有【{ntree}】棵树")
-        for n in tqdm(range(ntree)):
-            n = n + 1
-            test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=n)[:, 1]
+        # 迭代效果回溯
+        if params_xgb.get("oltype") == "tree_refresh":
+            print("更新原模型模式")
+            iteration_n = ntree
+        else:
+            print("原模型基础上新增树模式")
+            iteration_n = params_xgb.get("num_boost_round")
+        for n in tqdm(range(iteration_n)):
+            if params_xgb.get("oltype") == "tree_refresh":
+                ntree_limit = n + 1
+            else:
+                ntree_limit = ntree + n + 1
+            test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=ntree_limit)[:, 1]
             test_y = test_data[y_column]
 
-            psi = round(self.psi(train_data, test_data, print_sum=False, ntree_limit=n)['psi'].sum(), 3)
+            psi = round(self.psi(train_data, test_data, print_sum=False, ntree_limit=ntree_limit)['psi'].sum(), 3)
 
             # auc_test = roc_auc_score(test_y, test_y_prob)
             # auc_test = round(auc_test, 4)
@@ -232,7 +255,7 @@ class OnlineLearningTrainerXgb:
             perf = sc.perf_eva(test_y, test_y_prob, show_plot=False)
             auc_test = perf["AUC"]
             ks_test = perf["KS"]
-            row = dict(zip(df_param_columns, [auc_test, ks_test, psi, n]))
+            row = dict(zip(df_param_columns, [auc_test, ks_test, psi, n + 1]))
             self._df_param_optimized.loc[len(self._df_param_optimized)] = row
 
     def save(self):