Explorar el Código

bugfix: bug修复

yq hace 1 mes
padre
commit
4dea621fc7
Se han modificado 3 ficheros con 33 adiciones y 47 borrados
  1. + 0 - 2
      feature/woe/entity.py
  2. + 29 - 41
      feature/woe/strategy_woe.py
  3. + 4 - 4
      feature/woe/utils.py

+ 0 - 2
feature/woe/entity.py

@@ -28,7 +28,6 @@ class BinInfo():
                  is_qualified_trend_nsv: int = None,
                  psi: float = None,
                  is_qualified_psi: int = None,
-                 vif: float = None,
                  ):
         self.x_column = x_column
         self.bin_num = bin_num
@@ -44,7 +43,6 @@ class BinInfo():
         self.is_qualified_trend_nsv = is_qualified_trend_nsv
         self.psi = psi
         self.is_qualified_psi = is_qualified_psi
-        self.vif = vif
 
     def to_dict(self):
         return self.__dict__

+ 29 - 41
feature/woe/strategy_woe.py

@@ -287,7 +287,7 @@ class StrategyWoe(FeatureStrategyBase):
                                special_values=special_values, breaks_list=breaks_list, print_info=False)
 
         for column, bin in bins_train.items():
-            breaks_list[column] = list(bin[bin["is_special_values"]==False]['breaks'])
+            breaks_list[column] = list(bin[bin["is_special_values"] == False]['breaks'])
 
         bins_test = sc.woebin(test_data[x_columns + [y_column]], y=y_column,
                               special_values=special_values, breaks_list=breaks_list, print_info=False)
@@ -371,18 +371,15 @@ class StrategyWoe(FeatureStrategyBase):
         x_columns = list(bin_info_dict.keys())
         sc_woebin = self._f_get_sc_woebin(train_data, bin_info_dict)
         train_woe = sc.woebin_ply(train_data[x_columns], sc_woebin, print_info=False)
-        vif_df = f_get_vif(train_woe)
-        if vif_df is None:
+        df_vif = f_get_vif(train_woe)
+        if df_vif is None:
             return bin_info_dict
 
         filter_vif_overview = ""
         filter_vif_detail = []
-        for _, row in vif_df.iterrows():
+        for _, row in df_vif.iterrows():
             column = row["变量"]
             vif = row["vif"]
-            bin_info = bin_info_dict[column]
-            bin_info.vif = vif
-            bin_info_dict[column] = bin_info
             if vif < vif_threshold or self.ml_config.is_include(column):
                 continue
             filter_vif_overview = f"{filter_vif_overview}{column} 因为vif【{vif}】大于阈值被剔除\n"
@@ -477,28 +474,32 @@ class StrategyWoe(FeatureStrategyBase):
         # 样本分布
         metric_value_dict["样本分布"] = MetricFucResultEntity(table=data.get_distribution(y_column), table_font_size=10,
                                                           table_cell_width=3)
+
+        # 变量相关性
+        sc_woebin_train = self._f_get_sc_woebin(train_data, bin_info_filtered)
+        train_woe = sc.woebin_ply(train_data[x_columns], sc_woebin_train, print_info=False)
+        img_path_corr = self._f_get_img_corr(train_woe)
+        metric_value_dict["变量相关性"] = MetricFucResultEntity(image_path=img_path_corr)
+
         # 变量iv、psi、vif
         df_iv_psi_vif = pd.DataFrame()
         train_iv = [bin_info_filtered[column].train_iv for column in x_columns]
         psi = [bin_info_filtered[column].psi for column in x_columns]
-        vif = [bin_info_filtered[column].vif for column in x_columns]
         anns = [columns_anns.get(column, "-") for column in x_columns]
         df_iv_psi_vif["变量"] = x_columns
         df_iv_psi_vif["iv"] = train_iv
         df_iv_psi_vif["psi"] = psi
-        df_iv_psi_vif["vif"] = vif
+
+        df_vif = f_get_vif(train_woe)
+        if df_vif is not None:
+            df_iv_psi_vif = pd.merge(df_iv_psi_vif, df_vif, on="变量", how="left")
+
         df_iv_psi_vif["释义"] = anns
         df_iv_psi_vif.sort_values(by=["iv"], ascending=[False], inplace=True)
         img_path_iv = self.ml_config.f_get_save_path(f"iv.png")
         f_df_to_image(df_iv_psi_vif, img_path_iv)
         metric_value_dict["变量iv"] = MetricFucResultEntity(table=df_iv_psi_vif, image_path=img_path_iv)
 
-        # 变量相关性
-        sc_woebin_train = self._f_get_sc_woebin(train_data, bin_info_filtered)
-        train_woe = sc.woebin_ply(train_data[x_columns], sc_woebin_train, print_info=False)
-        img_path_corr = self._f_get_img_corr(train_woe)
-        metric_value_dict["变量相关性"] = MetricFucResultEntity(image_path=img_path_corr)
-
         # 变量趋势-训练集
         imgs_path_trend_train = self._f_get_img_trend(sc_woebin_train, x_columns, "train")
         metric_value_dict["变量趋势-训练集"] = MetricFucResultEntity(image_path=imgs_path_trend_train, image_size=4)
@@ -550,6 +551,14 @@ class StrategyWoe(FeatureStrategyBase):
                     detail_print(column)
                     detail_print(challenger_columns)
 
+        def filter_print(filter, title, notes=""):
+            f_display_title(display, title)
+            print(notes)
+            print(filter.get("overview"))
+            detail = filter.get("detail")
+            if detail is not None:
+                detail_print(detail)
+
         train_data = data.train_data
         test_data = data.test_data
 
@@ -585,29 +594,8 @@ class StrategyWoe(FeatureStrategyBase):
         detail_print(list(bin_info_filtered.keys()))
 
         # 打印fast_filter筛选情况
-        f_display_title(display, "快速筛选过程")
-        print(filter_fast.get("overview"))
-
-        # 打印filter_numeric筛选情况
-        f_display_title(display, "数值变量筛选过程")
-        print(filter_numeric.get("overview"))
-        detail = filter_numeric.get("detail")
-        detail_print(detail)
-
-        # 打印filter_corr筛选情况
-        f_display_title(display, "相关性筛选过程")
-        print(filter_corr.get("overview"))
-        detail = filter_corr.get("detail")
-        detail_print(detail)
-
-        # 打印filter_vif筛选情况
-        f_display_title(display, "vif筛选过程")
-        print(filter_vif.get("overview"))
-        detail = filter_vif.get("detail")
-        detail_print(detail)
-
-        # 打印ivtop筛选情况
-        f_display_title(display, "ivtop筛选过程")
-        print(filter_ivtop.get("overview"))
-        detail = filter_ivtop.get("detail")
-        detail_print(detail)
+        filter_print(filter_fast, "快速筛选过程", "剔除train_iv小于阈值")
+        filter_print(filter_numeric, "数值变量筛选过程")
+        filter_print(filter_corr, "相关性筛选过程")
+        filter_print(filter_vif, "vif筛选过程")
+        filter_print(filter_ivtop, "ivtop筛选过程", "iv = train_iv + test_iv")

+ 4 - 4
feature/woe/utils.py

@@ -129,7 +129,7 @@ def f_get_vif(data: pd.DataFrame) -> Union[pd.DataFrame, None]:
     if len(data.columns.to_list()) <= 1:
         return None
     vif_v = [round(vif(data.values, data.columns.get_loc(i)), 3) for i in data.columns]
-    vif_df = pd.DataFrame()
-    vif_df["变量"] = [column.replace("_woe", "") for column in data.columns]
-    vif_df['vif'] = vif_v
-    return vif_df
+    df_vif = pd.DataFrame()
+    df_vif["变量"] = [column.replace("_woe", "") for column in data.columns]
+    df_vif['vif'] = vif_v
+    return df_vif