há 2 dias atrás · c834a6bceb
--- a/entitys/ml_config_entity.py
+++ b/entitys/ml_config_entity.py
@@ -153,8 +153,9 @@ class MlConfigEntity():
 
				             'max_depth': 3,
			
 
				             'subsample': None,
			
 
				             'colsample_bytree': None,
			
 
				-            'alpha': None,
			
 
				-            'num_boost_round': 500,
			
 
				+            'alpha': 0,
			
 
				+            'lambda': 1,
			
 
				+            'num_boost_round': 100,
			
 
				             'early_stopping_rounds': 20,
			
 
				             'verbose_eval': 10,
			
 
				             'random_state': 2025,
			
--- a/entitys/ol_config_entity.py
+++ b/entitys/ol_config_entity.py
@@ -29,6 +29,7 @@ class OnlineLearningConfigEntity():
 
				                  stress_test=False,
			
 
				                  stress_sample_times=100,
			
 
				                  stress_bad_rate_list: List[float] = [],
			
 
				+                 params_xgb={},
			
 
				                  *args, **kwargs):
			
 
				 
			
 
				         self._path_resources = path_resources
			
@@ -53,15 +54,14 @@ class OnlineLearningConfigEntity():
 
				 
			
 
				         self._save_pmml = save_pmml
			
 
				 
			
 
				-        # 是否开启下输出内容
			
 
				         self._stress_test = stress_test
			
 
				 
			
 
				-        # jupyter下输出内容
			
 
				         self._stress_sample_times = stress_sample_times
			
 
				 
			
 
				-        # jupyter下输出内容
			
 
				         self._stress_bad_rate_list = stress_bad_rate_list
			
 
				 
			
 
				+        self._params_xgb = params_xgb
			
 
				+
			
 
				         if self._project_name is None or len(self._project_name) == 0:
			
 
				             self._base_dir = os.path.join(BaseConfig.train_path, f"{f_get_datetime()}")
			
 
				         else:
			
@@ -120,6 +120,30 @@ class OnlineLearningConfigEntity():
 
				     def stress_bad_rate_list(self):
			
 
				         return self._stress_bad_rate_list
			
 
				 
			
 
				+    @property
			
 
				+    def params_xgb(self):
			
 
				+        params = {
			
 
				+            'objective': 'binary:logistic',
			
 
				+            'eval_metric': 'auc',
			
 
				+            'learning_rate': 0.1,
			
 
				+            'max_depth': 3,
			
 
				+            'subsample': None,
			
 
				+            'colsample_bytree': None,
			
 
				+            'alpha': 0,
			
 
				+            'lambda': 1,
			
 
				+            'num_boost_round': 100,
			
 
				+            'early_stopping_rounds': 20,
			
 
				+            'verbose_eval': 10,
			
 
				+            'random_state': 2025,
			
 
				+            'save_pmml': True,
			
 
				+            'trees_print': False,
			
 
				+            # tree_add tree_refresh
			
 
				+            'oltype': "refresh",
			
 
				+        }
			
 
				+        params.update(self._params_xgb)
			
 
				+
			
 
				+        return params
			
 
				+
			
 
				     @staticmethod
			
 
				     def from_config(config_path: str):
			
 
				         """
			
--- a/model/model_xgb.py
+++ b/model/model_xgb.py
@@ -82,15 +82,11 @@ class ModelXgb(ModelBase):
 
				         # )
			
 
				 
			
 
				         # xgb二次封装为sklearn接口
			
 
				-        self.model = xgb.XGBClassifier(objective=params_xgb.get("objective"),
			
 
				-                                       n_estimators=params_xgb.get("num_boost_round"),
			
 
				-                                       max_depth=params_xgb.get("max_depth"),
			
 
				-                                       learning_rate=params_xgb.get("learning_rate"),
			
 
				-                                       random_state=params_xgb.get("random_state"),
			
 
				+        self.model = xgb.XGBClassifier(n_estimators=params_xgb.get("num_boost_round"),
			
 
				                                        reg_alpha=params_xgb.get("alpha"),
			
 
				-                                       subsample=params_xgb.get("subsample"),
			
 
				-                                       colsample_bytree=params_xgb.get("colsample_bytree"),
			
 
				-                                       importance_type='weight'
			
 
				+                                       reg_lambda=params_xgb.get("lambda"),
			
 
				+                                       importance_type='weight',
			
 
				+                                       **params_xgb
			
 
				                                        )
			
 
				 
			
 
				         # self.model.fit(X=train_data.data_x, y=train_data.data_y,
			
--- a/online_learning/trainer_xgb.py
+++ b/online_learning/trainer_xgb.py
@@ -67,7 +67,7 @@ class OnlineLearningTrainerXgb:
 
				             model = xgb.XGBClassifier()
			
 
				             model.load_model(path_model)
			
 
				             self._pipeline_optimized.steps[-1] = ("classifier", model)
			
 
				-        print(f"model load from【{path_model}】success.")
			
 
				+            print(f"model load from【{path_model}】success.")
			
 
				 
			
 
				     def _f_rewrite_pmml(self, path_pmml: str):
			
 
				         with open(path_pmml, mode="r", encoding="utf-8") as f:
			
@@ -81,7 +81,7 @@ class OnlineLearningTrainerXgb:
 
				         if ntree is None:
			
 
				             df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
			
 
				             print(f"选择最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
			
 
				-            self._train(int(df_param_sort.iloc[0][2]))
			
 
				+            self._train(int(df_param_sort.iloc[0][3]))
			
 
				         else:
			
 
				             print(f"选择ntree:【{ntree}】的参数:\n{df_param[df_param['ntree'] == ntree].iloc[0].to_dict()}")
			
 
				             self._train(ntree)
			
@@ -188,17 +188,29 @@ class OnlineLearningTrainerXgb:
 
				     def _train(self, n_estimators: int = None):
			
 
				         y_column = self._ol_config.y_column
			
 
				         train_data = self._data.train_data
			
 
				+        params_xgb = self._ol_config.params_xgb
			
 
				 
			
 
				         model_original: xgb.XGBClassifier = self._pipeline_original.steps[-1][1]
			
 
				         ntree = model_original.n_estimators if model_original.best_ntree_limit is None else model_original.best_ntree_limit
			
 
				-        self.model_optimized = xgb.XGBClassifier(
			
 
				-            n_estimators=n_estimators if n_estimators else ntree,
			
 
				-            updater="refresh",
			
 
				-            process_type="update",
			
 
				-            refresh_leaf=True,
			
 
				-            learning_rate=self._ol_config.lr,
			
 
				-            random_state=self._ol_config.random_state,
			
 
				-        )
			
 
				+        if params_xgb.get("oltype") == "tree_refresh":
			
 
				+            self.model_optimized = xgb.XGBClassifier(
			
 
				+                n_estimators=n_estimators if n_estimators else ntree,
			
 
				+                reg_alpha=params_xgb.get("alpha"),
			
 
				+                reg_lambda=params_xgb.get("lambda"),
			
 
				+                importance_type='weight',
			
 
				+                updater="refresh",
			
 
				+                process_type="update",
			
 
				+                refresh_leaf=True,
			
 
				+                **params_xgb,
			
 
				+            )
			
 
				+        else:
			
 
				+            self.model_optimized = xgb.XGBClassifier(
			
 
				+                n_estimators=n_estimators if n_estimators else params_xgb.get("num_boost_round"),
			
 
				+                reg_alpha=params_xgb.get("alpha"),
			
 
				+                reg_lambda=params_xgb.get("lambda"),
			
 
				+                importance_type='weight',
			
 
				+                **params_xgb,
			
 
				+            )
			
 
				         self._pipeline_optimized.steps[-1] = ("classifier", self.model_optimized)
			
 
				         with silent_print():
			
 
				             self._pipeline_optimized.fit(train_data, train_data[y_column],
			
@@ -209,6 +221,7 @@ class OnlineLearningTrainerXgb:
 
				 
			
 
				     def train(self, ):
			
 
				         y_column = self._ol_config.y_column
			
 
				+        params_xgb = self._ol_config.params_xgb
			
 
				         train_data = self._data.train_data
			
 
				         test_data = self._data.test_data
			
 
				 
			
@@ -216,12 +229,22 @@ class OnlineLearningTrainerXgb:
 
				         self._df_param_optimized = pd.DataFrame(columns=df_param_columns)
			
 
				         ntree = self._train()
			
 
				         print(f"原模型一共有【{ntree}】棵树")
			
 
				-        for n in tqdm(range(ntree)):
			
 
				-            n = n + 1
			
 
				-            test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=n)[:, 1]
			
 
				+        # 迭代效果回溯
			
 
				+        if params_xgb.get("oltype") == "tree_refresh":
			
 
				+            print("更新原模型模式")
			
 
				+            iteration_n = ntree
			
 
				+        else:
			
 
				+            print("原模型基础上新增树模式")
			
 
				+            iteration_n = params_xgb.get("num_boost_round")
			
 
				+        for n in tqdm(range(iteration_n)):
			
 
				+            if params_xgb.get("oltype") == "tree_refresh":
			
 
				+                ntree_limit = n + 1
			
 
				+            else:
			
 
				+                ntree_limit = ntree + n + 1
			
 
				+            test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=ntree_limit)[:, 1]
			
 
				             test_y = test_data[y_column]
			
 
				 
			
 
				-            psi = round(self.psi(train_data, test_data, print_sum=False, ntree_limit=n)['psi'].sum(), 3)
			
 
				+            psi = round(self.psi(train_data, test_data, print_sum=False, ntree_limit=ntree_limit)['psi'].sum(), 3)
			
 
				 
			
 
				             # auc_test = roc_auc_score(test_y, test_y_prob)
			
 
				             # auc_test = round(auc_test, 4)
			
@@ -232,7 +255,7 @@ class OnlineLearningTrainerXgb:
 
				             perf = sc.perf_eva(test_y, test_y_prob, show_plot=False)
			
 
				             auc_test = perf["AUC"]
			
 
				             ks_test = perf["KS"]
			
 
				-            row = dict(zip(df_param_columns, [auc_test, ks_test, psi, n]))
			
 
				+            row = dict(zip(df_param_columns, [auc_test, ks_test, psi, n + 1]))
			
 
				             self._df_param_optimized.loc[len(self._df_param_optimized)] = row
			
 
				 
			
 
				     def save(self):