
add: online learning PSI output

yq 14 hours ago
parent
commit
7f6b84e87a
2 changed files with 59 additions and 17 deletions
  1. online_learning/trainer_lr.py   +29 -9
  2. online_learning/trainer_xgb.py  +30 -8

online_learning/trainer_lr.py  +29 -9

@@ -23,9 +23,9 @@ from tqdm import tqdm
 from commom import GeneralException, f_image_crop_white_borders, f_df_to_image, f_display_title, \
     f_display_images_by_side
 from entitys import DataSplitEntity, OnlineLearningConfigEntity, MetricFucResultEntity
-from enums import ResultCodesEnum, ConstantEnum, ContextEnum, FileEnum
+from enums import ResultCodesEnum, ConstantEnum, FileEnum
 from feature import f_woebin_load
-from init import init, context
+from init import init
 from model import f_get_model_score_bin, f_calcu_model_ks, f_stress_test, f_calcu_model_psi
 from monitor import ReportWord
 from .utils import LR
@@ -102,7 +102,7 @@ class OnlineLearningTrainerLr:
         else:
             print(f"选择epoch:【{epoch}】的参数:\n{df_param[df_param['epoch'] == epoch].iloc[0].to_dict()}")
             weight = list(df_param[df_param["epoch"] == epoch].iloc[0])
-        weight = nn.Parameter(torch.tensor(np.array(weight[0:-5])))
+        weight = nn.Parameter(torch.tensor(np.array(weight[0:-6])))
         return LR(weight)
 
     def _f_get_scorecard(self, ):
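Note on the slice change above: the parameter table written by train() now ends each row with six bookkeeping columns ("auc_test", "ks_test", "psi", "epoch", "loss_train", "loss_test") instead of five, so one extra trailing element has to be dropped before the remaining values can be wrapped as LR weights. A minimal sketch of that row layout (feature names and numbers here are illustrative, not from the repository):

# Illustrative only: how a df_param row is laid out after this commit.
feature_columns = ["x1_woe", "x2_woe", "x3_woe"]            # placeholder feature names
df_param_columns = feature_columns + ["auc_test", "ks_test", "psi",
                                      "epoch", "loss_train", "loss_test"]
row = [0.12, -0.34, 0.56, 0.71, 0.33, 0.02, 5, 0.48, 0.51]  # made-up values
weight = row[0:-6]   # -> [0.12, -0.34, 0.56]: per-feature weights only
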
@@ -227,13 +227,14 @@ class OnlineLearningTrainerLr:
     def score(self, x: pd.DataFrame) -> np.array:
         return np.array(sc.scorecard_ply(x, self.card, print_step=0)["score"])
 
-    def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None) -> pd.DataFrame:
+    def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None, print_sum=True) -> pd.DataFrame:
         y1 = self.prob(x1)
         y2 = self.prob(x2)
         x1_score_bin, score_bins = f_get_model_score_bin(x1, y1, points)
         x2_score_bin, _ = f_get_model_score_bin(x2, y2, score_bins)
         model_psi = f_calcu_model_psi(x1_score_bin, x2_score_bin, sort_ascending=False)
-        print(f"模型psi: {model_psi['psi'].sum()}")
+        if print_sum:
+            print(f"模型psi: {model_psi['psi'].sum()}")
         return model_psi
 
     def train(self, ):
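f_calcu_model_psi is a project helper whose internals are not part of this diff; the population stability index it reports is conventionally the sum over score bins of (p1 - p2) * ln(p1 / p2), where p1 and p2 are the bin proportions in the two samples. A standalone sketch of that calculation, for orientation only (not the project's implementation):

import numpy as np
import pandas as pd

def psi_from_scores(s1: pd.Series, s2: pd.Series, n_bins: int = 10) -> float:
    # Quantile bins come from the first sample and are reused for the second,
    # mirroring how score_bins from x1 are passed on to x2 above.
    edges = np.unique(np.quantile(s1, np.linspace(0, 1, n_bins + 1)))
    edges[0], edges[-1] = -np.inf, np.inf              # catch out-of-range scores
    p1 = pd.cut(s1, edges).value_counts(normalize=True, sort=False)
    p2 = pd.cut(s2, edges).value_counts(normalize=True, sort=False)
    p1, p2 = p1.clip(lower=1e-6), p2.clip(lower=1e-6)  # avoid log(0) on empty bins
    return float(((p1 - p2) * np.log(p1 / p2)).sum())
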
@@ -247,7 +248,8 @@ class OnlineLearningTrainerLr:
                 perf = sc.perf_eva(test_y, y_prob, show_plot=False)
                 auc = perf["AUC"]
                 ks = perf["KS"]
-                row = model.linear.weight.tolist() + [auc, ks, epoch + 1, loss_train, loss_test]
+                psi = round(self.psi(train_data, test_data, print_sum=False)['psi'].sum(), 3)
+                row = model.linear.weight.tolist() + [auc, ks, psi, epoch + 1, loss_train, loss_test]
                 return dict(zip(df_param_columns, row))
 
         epochs = self._ol_config.epochs
@@ -262,9 +264,9 @@ class OnlineLearningTrainerLr:
         criterion = nn.BCELoss()
         optimizer = optim.Adam(self._model_optimized.parameters(), lr=self._ol_config.lr)
 
-        df_param_columns = self._columns + ["auc_test", "ks_test", "epoch", "loss_train", "loss_test"]
+        df_param_columns = self._columns + ["auc_test", "ks_test", "psi", "epoch", "loss_train", "loss_test"]
         self._df_param_optimized = pd.DataFrame(columns=df_param_columns)
-        
+
         # 优化前
         loss_train = 0
         self._df_param_optimized.loc[len(self._df_param_optimized)] = _get_param_optimized(self._model_original, -1)
@@ -281,7 +283,8 @@ class OnlineLearningTrainerLr:
                 optimizer.step()
                 loss_train = loss.detach().item()
             # 测试集评估
-            self._df_param_optimized.loc[len(self._df_param_optimized)] = _get_param_optimized(self._model_optimized, epoch)
+            self._df_param_optimized.loc[len(self._df_param_optimized)] = _get_param_optimized(self._model_optimized,
+                                                                                               epoch)
 
     def save(self):
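With the extra column, every optimization epoch now records the train-vs-test score PSI next to AUC and KS, so epoch selection can weigh stability as well as discrimination. One possible way to use the table (a sketch only; how _f_get_best_model actually ranks epochs is not shown in this diff, and the 0.1 cutoff is an arbitrary example):

# Assumes `trainer` is an OnlineLearningTrainerLr on which train() has already run.
df_param = trainer._df_param_optimized
stable = df_param[df_param["psi"] < 0.1]                       # example stability cutoff
best_epoch = int(stable.sort_values("ks_test", ascending=False).iloc[0]["epoch"])
trainer.report(epoch=best_epoch)
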
 
@@ -316,6 +319,10 @@ class OnlineLearningTrainerLr:
         return OnlineLearningTrainerLr(ol_config=ol_config)
 
     def report(self, epoch: int = None):
+
+        train_data = self._data.train_data
+        test_data = self._data.test_data
+
         self._model_optimized = self._f_get_best_model(self._df_param_optimized, epoch)
 
         if self._ol_config.jupyter_print:
@@ -347,6 +354,14 @@ class OnlineLearningTrainerLr:
         # 模型系数对比
         metric_value_dict["模型系数"] = self._f_get_metric_coef()
 
+        # 模型分psi
+        model_psi = self.psi(train_data, test_data, print_sum=False)
+        img_path_psi = self._ol_config.f_get_save_path(f"model_psi.png")
+        f_df_to_image(model_psi, img_path_psi)
+        metric_value_dict[f"模型稳定性"] = MetricFucResultEntity(table=model_psi,
+                                                            value=model_psi["psi"].sum().round(3),
+                                                            image_path=img_path_psi)
+
         # 分数分箱
         metric_value_dict["分数分箱-建模数据-新模型"] = self._f_get_metric_gain("新模型")
         metric_value_dict["分数分箱-建模数据-原模型"] = self._f_get_metric_gain("原模型")
@@ -379,6 +394,11 @@ class OnlineLearningTrainerLr:
         f_display_title(display, "模型系数")
         display.display(metric_value_dict["模型系数"].table)
 
+        # 模型psi
+        f_display_title(display, "模型psi")
+        display.display(metric_value_dict["模型稳定性"].table)
+        print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
+
         f_display_title(display, "分数分箱")
         print(f"建模数据上分数分箱")
         print(f"原模型")

online_learning/trainer_xgb.py  +30 -8

@@ -134,19 +134,21 @@ class OnlineLearningTrainerXgb:
         f_df_to_image(df_stress, img_path_stress)
         return MetricFucResultEntity(table=df_stress, image_path=img_path_stress)
 
-    def prob(self, x: pd.DataFrame, pipeline=None):
+    def prob(self, x: pd.DataFrame, pipeline=None, ntree_limit=None):
         if pipeline is None:
             pipeline = self._pipeline_optimized
-        y_prob = pipeline.predict_proba(x)[:, 1]
+        y_prob = pipeline.predict_proba(x, ntree_limit=ntree_limit)[:, 1]
         return y_prob
 
-    def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None) -> pd.DataFrame:
-        y1 = self.prob(x1)
-        y2 = self.prob(x2)
+    def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None, print_sum=True,
+            ntree_limit=None) -> pd.DataFrame:
+        y1 = self.prob(x1, ntree_limit=ntree_limit)
+        y2 = self.prob(x2, ntree_limit=ntree_limit)
         x1_score_bin, score_bins = f_get_model_score_bin(x1, y1, points)
         x2_score_bin, _ = f_get_model_score_bin(x2, y2, score_bins)
         model_psi = f_calcu_model_psi(x1_score_bin, x2_score_bin, sort_ascending=False)
-        print(f"模型psi: {model_psi['psi'].sum()}")
+        if print_sum:
+            print(f"模型psi: {model_psi['psi'].sum()}")
         return model_psi
 
     def _train(self, n_estimators: int = None):
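The ntree_limit argument threaded through prob() lets psi() be evaluated with only the first n boosted trees, matching how the training loop scores each candidate tree count. One caveat worth hedging: ntree_limit belongs to the older XGBoost scikit-learn API; it was deprecated around XGBoost 1.4 and removed in 2.0 in favour of iteration_range, so on newer versions something like the sketch below would be the rough equivalent (assuming the pipeline forwards keyword arguments to the underlying XGBClassifier, as it evidently does for ntree_limit):

# Hypothetical adaptation for XGBoost >= 2.0, not part of this commit.
y_prob = pipeline.predict_proba(x, iteration_range=(0, n))[:, 1]  # use trees [0, n)
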
@@ -173,9 +175,10 @@ class OnlineLearningTrainerXgb:
 
     def train(self, ):
         y_column = self._ol_config.y_column
+        train_data = self._data.train_data
         test_data = self._data.test_data
 
-        df_param_columns = ["auc_test", "ks_test", "ntree"]
+        df_param_columns = ["auc_test", "ks_test", "psi", "ntree"]
         self._df_param_optimized = pd.DataFrame(columns=df_param_columns)
         ntree = self._train()
         print(f"原模型一共有【{ntree}】棵树")
@@ -184,6 +187,8 @@ class OnlineLearningTrainerXgb:
             test_y_prob = self._pipeline_optimized.predict_proba(test_data, ntree_limit=n)[:, 1]
             test_y = test_data[y_column]
 
+            psi = round(self.psi(train_data, test_data, print_sum=False, ntree_limit=n)['psi'].sum(), 3)
+
             # auc_test = roc_auc_score(test_y, test_y_prob)
             # auc_test = round(auc_test, 4)
             # df = pd.DataFrame({'label': test_y, 'pred': test_y_prob})
@@ -193,7 +198,7 @@ class OnlineLearningTrainerXgb:
             perf = sc.perf_eva(test_y, test_y_prob, show_plot=False)
             auc_test = perf["AUC"]
             ks_test = perf["KS"]
-            row = dict(zip(df_param_columns, [auc_test, ks_test, n]))
+            row = dict(zip(df_param_columns, [auc_test, ks_test, psi, n]))
             self._df_param_optimized.loc[len(self._df_param_optimized)] = row
 
     def save(self):
@@ -213,6 +218,10 @@ class OnlineLearningTrainerXgb:
         return OnlineLearningTrainerXgb(ol_config=ol_config)
 
     def report(self, ntree: int = None):
+
+        train_data = self._data.train_data
+        test_data = self._data.test_data
+
         self._f_get_best_model(self._df_param_optimized, ntree)
 
         if self._ol_config.jupyter_print:
@@ -231,6 +240,14 @@ class OnlineLearningTrainerXgb:
         metric_value_dict[f"模型结果-新模型"] = self._f_get_metric_auc_ks("新模型")
         metric_value_dict[f"模型结果-原模型"] = self._f_get_metric_auc_ks("原模型")
 
+        # 模型分psi
+        model_psi = self.psi(train_data, test_data, print_sum=False)
+        img_path_psi = self._ol_config.f_get_save_path(f"model_psi.png")
+        f_df_to_image(model_psi, img_path_psi)
+        metric_value_dict[f"模型稳定性"] = MetricFucResultEntity(table=model_psi,
+                                                            value=model_psi["psi"].sum().round(3),
+                                                            image_path=img_path_psi)
+
         # 分数分箱
         metric_value_dict["分数分箱-建模数据-新模型"] = self._f_get_metric_gain("新模型")
         metric_value_dict["分数分箱-建模数据-原模型"] = self._f_get_metric_gain("原模型")
@@ -260,6 +277,11 @@ class OnlineLearningTrainerXgb:
         display.display(metric_value_dict["模型结果-新模型"].table)
         f_display_images_by_side(display, metric_value_dict["模型结果-新模型"].image_path)
 
+        # 模型psi
+        f_display_title(display, "模型psi")
+        display.display(metric_value_dict["模型稳定性"].table)
+        print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
+
         f_display_title(display, "分数分箱")
         print(f"建模数据上分数分箱")
         print(f"原模型")