|
@@ -12,8 +12,11 @@ import joblib
|
|
|
import pandas as pd
|
|
|
import scorecardpy as sc
|
|
|
import xgboost as xgb
|
|
|
+from pandas.core.dtypes.common import is_numeric_dtype
|
|
|
from pypmml import Model
|
|
|
+from sklearn.preprocessing import OneHotEncoder
|
|
|
from sklearn2pmml import PMMLPipeline, sklearn2pmml
|
|
|
+from sklearn_pandas import DataFrameMapper
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
from commom import GeneralException, f_image_crop_white_borders, f_df_to_image, f_display_title, \
|
|
@@ -81,7 +84,7 @@ class OnlineLearningTrainerXgb:
|
|
|
if ntree is None:
|
|
|
df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
|
|
|
print(f"选择最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
|
|
|
- self._train(int(df_param_sort.iloc[0][3]))
|
|
|
+ self._train(int(df_param_sort.iloc[0][5]))
|
|
|
else:
|
|
|
print(f"选择ntree:【{ntree}】的参数:\n{df_param[df_param['ntree'] == ntree].iloc[0].to_dict()}")
|
|
|
self._train(ntree)
|
|
@@ -96,13 +99,14 @@ class OnlineLearningTrainerXgb:
|
|
|
# pmml与原生模型结果一致性校验
|
|
|
model_pmml = Model.fromFile(path_pmml)
|
|
|
prob_pmml = model_pmml.predict(data)["probability(1)"]
|
|
|
- prob_pipeline = self._pipeline_optimized.predict_proba(data)[:, 1]
|
|
|
+ with silent_print():
|
|
|
+ prob_pipeline = self._pipeline_optimized.predict_proba(data)[:, 1]
|
|
|
diff = pd.DataFrame()
|
|
|
diff["prob_pmml"] = prob_pmml
|
|
|
diff["prob_pipeline"] = prob_pipeline
|
|
|
diff["diff"] = diff["prob_pmml"] - diff["prob_pipeline"]
|
|
|
diff["diff_format"] = diff["diff"].apply(lambda x: 1 if abs(x) < 0.001 else 0)
|
|
|
- print(f"pmml模型结果一致率(误差小于0.001):{len(diff) / diff['diff_format'].sum().round(3) * 100}%")
|
|
|
+ print(f"pmml模型结果一致率(误差小于0.001):{(diff['diff_format'].sum() / len(diff)).round(3) * 100}%")
|
|
|
|
|
|
def _f_get_metric_auc_ks(self, model_type: str):
|
|
|
def _get_auc_ks(data, title):
|
|
@@ -171,7 +175,8 @@ class OnlineLearningTrainerXgb:
|
|
|
def prob(self, x: pd.DataFrame, pipeline=None, ntree_limit=None):
|
|
|
if pipeline is None:
|
|
|
pipeline = self._pipeline_optimized
|
|
|
- y_prob = pipeline.predict_proba(x, ntree_limit=ntree_limit)[:, 1]
|
|
|
+ with silent_print():
|
|
|
+ y_prob = pipeline.predict_proba(x, ntree_limit=ntree_limit)[:, 1]
|
|
|
return y_prob
|
|
|
|
|
|
def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None, print_sum=True,
|
|
@@ -204,6 +209,34 @@ class OnlineLearningTrainerXgb:
|
|
|
**params_xgb,
|
|
|
)
|
|
|
else:
|
|
|
+ # 处理新增的变量
|
|
|
+ add_columns = params_xgb.get("add_columns")
|
|
|
+ num_columns = []
|
|
|
+ for x_column in add_columns:
|
|
|
+ if is_numeric_dtype(train_data[x_column]):
|
|
|
+ num_columns.append(x_column)
|
|
|
+ str_columns = [i for i in add_columns if i not in num_columns]
|
|
|
+ mapper_new = []
|
|
|
+ if len(str_columns) > 0:
|
|
|
+ mapper_new.append((str_columns, OneHotEncoder()))
|
|
|
+ for x_column in num_columns:
|
|
|
+ mapper_new.append((x_column, None))
|
|
|
+ mapper_new = DataFrameMapper(mapper_new)
|
|
|
+ mapper_new.fit(self._data.data)
|
|
|
+ features_new = mapper_new.features
|
|
|
+ built_features_new = mapper_new.built_features
|
|
|
+
|
|
|
+ # 合并特征处理器
|
|
|
+ mapper_old: list = self._pipeline_optimized.steps[0][1]
|
|
|
+ features_old = mapper_old.features
|
|
|
+ features_old.extend(features_new)
|
|
|
+ built_features_old = mapper_old.built_features
|
|
|
+ built_features_old.extend(built_features_new)
|
|
|
+ mapper_old.features = features_old
|
|
|
+ mapper_old.built_features = built_features_old
|
|
|
+ self._pipeline_optimized.steps[0] = ("mapper", mapper_old)
|
|
|
+
|
|
|
+ # 模型初始化
|
|
|
self.model_optimized = xgb.XGBClassifier(
|
|
|
n_estimators=n_estimators if n_estimators else params_xgb.get("num_boost_round"),
|
|
|
reg_alpha=params_xgb.get("alpha"),
|
|
@@ -212,11 +245,19 @@ class OnlineLearningTrainerXgb:
|
|
|
**params_xgb,
|
|
|
)
|
|
|
self._pipeline_optimized.steps[-1] = ("classifier", self.model_optimized)
|
|
|
+
|
|
|
+ feature_names_old = model_original.get_booster().feature_names
|
|
|
+ data_transform = self._pipeline_optimized.Xtransform(self._data.data)
|
|
|
+ feature_names_new = [f"f{i}" for i in range(data_transform.shape[1])]
|
|
|
+ model_original.get_booster().feature_names = feature_names_new
|
|
|
+
|
|
|
with silent_print():
|
|
|
self._pipeline_optimized.fit(train_data, train_data[y_column],
|
|
|
classifier__verbose=False,
|
|
|
classifier__xgb_model=model_original.get_booster(),
|
|
|
)
|
|
|
+ model_original.get_booster().feature_names = feature_names_old
|
|
|
+
|
|
|
return ntree
|
|
|
|
|
|
def train(self, ):
|
|
@@ -225,7 +266,7 @@ class OnlineLearningTrainerXgb:
|
|
|
train_data = self._data.train_data
|
|
|
test_data = self._data.test_data
|
|
|
|
|
|
- df_param_columns = ["auc_test", "ks_test", "psi", "ntree"]
|
|
|
+ df_param_columns = ["auc_train", "ks_train", "auc_test", "ks_test", "psi", "ntree"]
|
|
|
self._df_param_optimized = pd.DataFrame(columns=df_param_columns)
|
|
|
ntree = self._train()
|
|
|
print(f"原模型一共有【{ntree}】棵树")
|
|
@@ -241,9 +282,11 @@ class OnlineLearningTrainerXgb:
|
|
|
ntree_limit = n + 1
|
|
|
else:
|
|
|
ntree_limit = ntree + n + 1
|
|
|
- test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=ntree_limit)[:, 1]
|
|
|
+ with silent_print():
|
|
|
+ train_y_prob = self._pipeline_optimized.predict_proba(train_data, ntree_limit=ntree_limit)[:, 1]
|
|
|
+ test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=ntree_limit)[:, 1]
|
|
|
+ train_y = train_data[y_column]
|
|
|
test_y = test_data[y_column]
|
|
|
-
|
|
|
psi = round(self.psi(train_data, test_data, print_sum=False, ntree_limit=ntree_limit)['psi'].sum(), 3)
|
|
|
|
|
|
# auc_test = roc_auc_score(test_y, test_y_prob)
|
|
@@ -252,10 +295,15 @@ class OnlineLearningTrainerXgb:
|
|
|
# dfkslift = eva_dfkslift(df)
|
|
|
# ks_test = round(dfkslift["ks"].max(), 4)
|
|
|
|
|
|
+ perf = sc.perf_eva(train_y, train_y_prob, show_plot=False)
|
|
|
+ auc_train = perf["AUC"]
|
|
|
+ ks_train = perf["KS"]
|
|
|
+
|
|
|
perf = sc.perf_eva(test_y, test_y_prob, show_plot=False)
|
|
|
auc_test = perf["AUC"]
|
|
|
ks_test = perf["KS"]
|
|
|
- row = dict(zip(df_param_columns, [auc_test, ks_test, psi, n + 1]))
|
|
|
+
|
|
|
+ row = dict(zip(df_param_columns, [auc_train, ks_train, auc_test, ks_test, psi, n + 1]))
|
|
|
self._df_param_optimized.loc[len(self._df_param_optimized)] = row
|
|
|
|
|
|
def save(self):
|
|
@@ -268,9 +316,9 @@ class OnlineLearningTrainerXgb:
|
|
|
joblib.dump(self._pipeline_optimized, path_model)
|
|
|
print(f"model save to【{path_model}】success. ")
|
|
|
# 在xgb的增量学习下直接保存pipeline会出错,所以这里需要单独保存xgb model,然后进行复原
|
|
|
- path_model = self._ol_config.f_get_save_path(FileEnum.MODEL_XGB.value)
|
|
|
- self.model_optimized.save_model(path_model)
|
|
|
- print(f"model save to【{path_model}】success. ")
|
|
|
+ # path_model = self._ol_config.f_get_save_path(FileEnum.MODEL_XGB.value)
|
|
|
+ # self.model_optimized.save_model(path_model)
|
|
|
+ # print(f"model save to【{path_model}】success. ")
|
|
|
|
|
|
@staticmethod
|
|
|
def load(path: str):
|