|
@@ -287,7 +287,7 @@ class StrategyWoe(FeatureStrategyBase):
|
|
|
special_values=special_values, breaks_list=breaks_list, print_info=False)
|
|
|
|
|
|
for column, bin in bins_train.items():
|
|
|
- breaks_list[column] = list(bin[bin["is_special_values"]==False]['breaks'])
|
|
|
+ breaks_list[column] = list(bin[bin["is_special_values"] == False]['breaks'])
|
|
|
|
|
|
bins_test = sc.woebin(test_data[x_columns + [y_column]], y=y_column,
|
|
|
special_values=special_values, breaks_list=breaks_list, print_info=False)
|
|
@@ -371,18 +371,15 @@ class StrategyWoe(FeatureStrategyBase):
|
|
|
x_columns = list(bin_info_dict.keys())
|
|
|
sc_woebin = self._f_get_sc_woebin(train_data, bin_info_dict)
|
|
|
train_woe = sc.woebin_ply(train_data[x_columns], sc_woebin, print_info=False)
|
|
|
- vif_df = f_get_vif(train_woe)
|
|
|
- if vif_df is None:
|
|
|
+ df_vif = f_get_vif(train_woe)
|
|
|
+ if df_vif is None:
|
|
|
return bin_info_dict
|
|
|
|
|
|
filter_vif_overview = ""
|
|
|
filter_vif_detail = []
|
|
|
- for _, row in vif_df.iterrows():
|
|
|
+ for _, row in df_vif.iterrows():
|
|
|
column = row["变量"]
|
|
|
vif = row["vif"]
|
|
|
- bin_info = bin_info_dict[column]
|
|
|
- bin_info.vif = vif
|
|
|
- bin_info_dict[column] = bin_info
|
|
|
if vif < vif_threshold or self.ml_config.is_include(column):
|
|
|
continue
|
|
|
filter_vif_overview = f"{filter_vif_overview}{column} 因为vif【{vif}】大于阈值被剔除\n"
|
|
@@ -477,28 +474,32 @@ class StrategyWoe(FeatureStrategyBase):
|
|
|
# 样本分布
|
|
|
metric_value_dict["样本分布"] = MetricFucResultEntity(table=data.get_distribution(y_column), table_font_size=10,
|
|
|
table_cell_width=3)
|
|
|
+
|
|
|
+ # 变量相关性
|
|
|
+ sc_woebin_train = self._f_get_sc_woebin(train_data, bin_info_filtered)
|
|
|
+ train_woe = sc.woebin_ply(train_data[x_columns], sc_woebin_train, print_info=False)
|
|
|
+ img_path_corr = self._f_get_img_corr(train_woe)
|
|
|
+ metric_value_dict["变量相关性"] = MetricFucResultEntity(image_path=img_path_corr)
|
|
|
+
|
|
|
# 变量iv、psi、vif
|
|
|
df_iv_psi_vif = pd.DataFrame()
|
|
|
train_iv = [bin_info_filtered[column].train_iv for column in x_columns]
|
|
|
psi = [bin_info_filtered[column].psi for column in x_columns]
|
|
|
- vif = [bin_info_filtered[column].vif for column in x_columns]
|
|
|
anns = [columns_anns.get(column, "-") for column in x_columns]
|
|
|
df_iv_psi_vif["变量"] = x_columns
|
|
|
df_iv_psi_vif["iv"] = train_iv
|
|
|
df_iv_psi_vif["psi"] = psi
|
|
|
- df_iv_psi_vif["vif"] = vif
|
|
|
+
|
|
|
+ df_vif = f_get_vif(train_woe)
|
|
|
+ if df_vif is not None:
|
|
|
+ df_iv_psi_vif = pd.merge(df_iv_psi_vif, df_vif, on="变量", how="left")
|
|
|
+
|
|
|
df_iv_psi_vif["释义"] = anns
|
|
|
df_iv_psi_vif.sort_values(by=["iv"], ascending=[False], inplace=True)
|
|
|
img_path_iv = self.ml_config.f_get_save_path(f"iv.png")
|
|
|
f_df_to_image(df_iv_psi_vif, img_path_iv)
|
|
|
metric_value_dict["变量iv"] = MetricFucResultEntity(table=df_iv_psi_vif, image_path=img_path_iv)
|
|
|
|
|
|
- # 变量相关性
|
|
|
- sc_woebin_train = self._f_get_sc_woebin(train_data, bin_info_filtered)
|
|
|
- train_woe = sc.woebin_ply(train_data[x_columns], sc_woebin_train, print_info=False)
|
|
|
- img_path_corr = self._f_get_img_corr(train_woe)
|
|
|
- metric_value_dict["变量相关性"] = MetricFucResultEntity(image_path=img_path_corr)
|
|
|
-
|
|
|
# 变量趋势-训练集
|
|
|
imgs_path_trend_train = self._f_get_img_trend(sc_woebin_train, x_columns, "train")
|
|
|
metric_value_dict["变量趋势-训练集"] = MetricFucResultEntity(image_path=imgs_path_trend_train, image_size=4)
|
|
@@ -550,6 +551,14 @@ class StrategyWoe(FeatureStrategyBase):
|
|
|
detail_print(column)
|
|
|
detail_print(challenger_columns)
|
|
|
|
|
|
+ def filter_print(filter, title, notes=""):
|
|
|
+ f_display_title(display, title)
|
|
|
+ print(notes)
|
|
|
+ print(filter.get("overview"))
|
|
|
+ detail = filter.get("detail")
|
|
|
+ if detail is not None:
|
|
|
+ detail_print(detail)
|
|
|
+
|
|
|
train_data = data.train_data
|
|
|
test_data = data.test_data
|
|
|
|
|
@@ -585,29 +594,8 @@ class StrategyWoe(FeatureStrategyBase):
|
|
|
detail_print(list(bin_info_filtered.keys()))
|
|
|
|
|
|
# 打印fast_filter筛选情况
|
|
|
- f_display_title(display, "快速筛选过程")
|
|
|
- print(filter_fast.get("overview"))
|
|
|
-
|
|
|
- # 打印filter_numeric筛选情况
|
|
|
- f_display_title(display, "数值变量筛选过程")
|
|
|
- print(filter_numeric.get("overview"))
|
|
|
- detail = filter_numeric.get("detail")
|
|
|
- detail_print(detail)
|
|
|
-
|
|
|
- # 打印filter_corr筛选情况
|
|
|
- f_display_title(display, "相关性筛选过程")
|
|
|
- print(filter_corr.get("overview"))
|
|
|
- detail = filter_corr.get("detail")
|
|
|
- detail_print(detail)
|
|
|
-
|
|
|
- # 打印filter_vif筛选情况
|
|
|
- f_display_title(display, "vif筛选过程")
|
|
|
- print(filter_vif.get("overview"))
|
|
|
- detail = filter_vif.get("detail")
|
|
|
- detail_print(detail)
|
|
|
-
|
|
|
- # 打印ivtop筛选情况
|
|
|
- f_display_title(display, "ivtop筛选过程")
|
|
|
- print(filter_ivtop.get("overview"))
|
|
|
- detail = filter_ivtop.get("detail")
|
|
|
- detail_print(detail)
|
|
|
+ filter_print(filter_fast, "快速筛选过程", "剔除train_iv小于阈值")
|
|
|
+ filter_print(filter_numeric, "数值变量筛选过程")
|
|
|
+ filter_print(filter_corr, "相关性筛选过程")
|
|
|
+ filter_print(filter_vif, "vif筛选过程")
|
|
|
+ filter_print(filter_ivtop, "ivtop筛选过程", "iv = train_iv + test_iv")
|