|
@@ -67,7 +67,7 @@ class OnlineLearningTrainerXgb:
|
|
|
model = xgb.XGBClassifier()
|
|
|
model.load_model(path_model)
|
|
|
self._pipeline_optimized.steps[-1] = ("classifier", model)
|
|
|
- print(f"model load from【{path_model}】success.")
|
|
|
+ print(f"model load from【{path_model}】success.")
|
|
|
|
|
|
def _f_rewrite_pmml(self, path_pmml: str):
|
|
|
with open(path_pmml, mode="r", encoding="utf-8") as f:
|
|
@@ -81,7 +81,7 @@ class OnlineLearningTrainerXgb:
|
|
|
if ntree is None:
|
|
|
df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
|
|
|
print(f"选择最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
|
|
|
- self._train(int(df_param_sort.iloc[0][2]))
|
|
|
+ self._train(int(df_param_sort.iloc[0][3]))
|
|
|
else:
|
|
|
print(f"选择ntree:【{ntree}】的参数:\n{df_param[df_param['ntree'] == ntree].iloc[0].to_dict()}")
|
|
|
self._train(ntree)
|
|
@@ -188,17 +188,29 @@ class OnlineLearningTrainerXgb:
|
|
|
def _train(self, n_estimators: int = None):
|
|
|
y_column = self._ol_config.y_column
|
|
|
train_data = self._data.train_data
|
|
|
+ params_xgb = self._ol_config.params_xgb
|
|
|
|
|
|
model_original: xgb.XGBClassifier = self._pipeline_original.steps[-1][1]
|
|
|
ntree = model_original.n_estimators if model_original.best_ntree_limit is None else model_original.best_ntree_limit
|
|
|
- self.model_optimized = xgb.XGBClassifier(
|
|
|
- n_estimators=n_estimators if n_estimators else ntree,
|
|
|
- updater="refresh",
|
|
|
- process_type="update",
|
|
|
- refresh_leaf=True,
|
|
|
- learning_rate=self._ol_config.lr,
|
|
|
- random_state=self._ol_config.random_state,
|
|
|
- )
|
|
|
+ if params_xgb.get("oltype") == "tree_refresh":
|
|
|
+ self.model_optimized = xgb.XGBClassifier(
|
|
|
+ n_estimators=n_estimators if n_estimators else ntree,
|
|
|
+ reg_alpha=params_xgb.get("alpha"),
|
|
|
+ reg_lambda=params_xgb.get("lambda"),
|
|
|
+ importance_type='weight',
|
|
|
+ updater="refresh",
|
|
|
+ process_type="update",
|
|
|
+ refresh_leaf=True,
|
|
|
+ **params_xgb,
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ self.model_optimized = xgb.XGBClassifier(
|
|
|
+ n_estimators=n_estimators if n_estimators else params_xgb.get("num_boost_round"),
|
|
|
+ reg_alpha=params_xgb.get("alpha"),
|
|
|
+ reg_lambda=params_xgb.get("lambda"),
|
|
|
+ importance_type='weight',
|
|
|
+ **params_xgb,
|
|
|
+ )
|
|
|
self._pipeline_optimized.steps[-1] = ("classifier", self.model_optimized)
|
|
|
with silent_print():
|
|
|
self._pipeline_optimized.fit(train_data, train_data[y_column],
|
|
@@ -209,6 +221,7 @@ class OnlineLearningTrainerXgb:
|
|
|
|
|
|
def train(self, ):
|
|
|
y_column = self._ol_config.y_column
|
|
|
+ params_xgb = self._ol_config.params_xgb
|
|
|
train_data = self._data.train_data
|
|
|
test_data = self._data.test_data
|
|
|
|
|
@@ -216,12 +229,22 @@ class OnlineLearningTrainerXgb:
|
|
|
self._df_param_optimized = pd.DataFrame(columns=df_param_columns)
|
|
|
ntree = self._train()
|
|
|
print(f"原模型一共有【{ntree}】棵树")
|
|
|
- for n in tqdm(range(ntree)):
|
|
|
- n = n + 1
|
|
|
- test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=n)[:, 1]
|
|
|
+ # 迭代效果回溯
|
|
|
+ if params_xgb.get("oltype") == "tree_refresh":
|
|
|
+ print("更新原模型模式")
|
|
|
+ iteration_n = ntree
|
|
|
+ else:
|
|
|
+ print("原模型基础上新增树模式")
|
|
|
+ iteration_n = params_xgb.get("num_boost_round")
|
|
|
+ for n in tqdm(range(iteration_n)):
|
|
|
+ if params_xgb.get("oltype") == "tree_refresh":
|
|
|
+ ntree_limit = n + 1
|
|
|
+ else:
|
|
|
+ ntree_limit = ntree + n + 1
|
|
|
+ test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=ntree_limit)[:, 1]
|
|
|
test_y = test_data[y_column]
|
|
|
|
|
|
- psi = round(self.psi(train_data, test_data, print_sum=False, ntree_limit=n)['psi'].sum(), 3)
|
|
|
+ psi = round(self.psi(train_data, test_data, print_sum=False, ntree_limit=ntree_limit)['psi'].sum(), 3)
|
|
|
|
|
|
# auc_test = roc_auc_score(test_y, test_y_prob)
|
|
|
# auc_test = round(auc_test, 4)
|
|
@@ -232,7 +255,7 @@ class OnlineLearningTrainerXgb:
|
|
|
perf = sc.perf_eva(test_y, test_y_prob, show_plot=False)
|
|
|
auc_test = perf["AUC"]
|
|
|
ks_test = perf["KS"]
|
|
|
- row = dict(zip(df_param_columns, [auc_test, ks_test, psi, n]))
|
|
|
+ row = dict(zip(df_param_columns, [auc_test, ks_test, psi, n + 1]))
|
|
|
self._df_param_optimized.loc[len(self._df_param_optimized)] = row
|
|
|
|
|
|
def save(self):
|