|
@@ -4,7 +4,6 @@
|
|
|
@time: 2024/1/2
|
|
|
@desc: iv值及单调性筛选类
|
|
|
"""
|
|
|
-import time
|
|
|
from itertools import combinations_with_replacement
|
|
|
from typing import List, Dict
|
|
|
|
|
@@ -68,7 +67,7 @@ class StrategyIv(FilterStrategyBase):
|
|
|
for column, candidate in candidate_dict.items():
|
|
|
breaks_list[column] = candidate.breaks_list
|
|
|
bins = sc.woebin(data[x_columns_candidate + [y_column]], y=y_column, breaks_list=breaks_list,
|
|
|
- special_values=special_values)
|
|
|
+ special_values=special_values, print_info=False)
|
|
|
return bins
|
|
|
|
|
|
def _f_corr_filter(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity]) -> List[str]:
|
|
@@ -78,7 +77,7 @@ class StrategyIv(FilterStrategyBase):
|
|
|
x_columns_candidate = list(candidate_dict.keys())
|
|
|
|
|
|
bins = self._f_get_bins_by_breaks(train_data, candidate_dict)
|
|
|
- train_woe = sc.woebin_ply(train_data[x_columns_candidate], bins)
|
|
|
+ train_woe = sc.woebin_ply(train_data[x_columns_candidate], bins, print_info=False)
|
|
|
corr_df = f_get_corr(train_woe)
|
|
|
corr_dict = corr_df.to_dict()
|
|
|
for column, corr in corr_dict.items():
|
|
@@ -114,14 +113,14 @@ class StrategyIv(FilterStrategyBase):
|
|
|
x_columns_candidate.remove(y_column)
|
|
|
|
|
|
bins_train = sc.woebin(train_data[x_columns_candidate + [y_column]], y=y_column, bin_num_limit=5,
|
|
|
- special_values=special_values, breaks_list=breaks_list)
|
|
|
+ special_values=special_values, breaks_list=breaks_list, print_info=False)
|
|
|
|
|
|
for column, bin in bins_train.items():
|
|
|
breaks_list[column] = list(bin['breaks'])
|
|
|
bins_test = None
|
|
|
if test_data is not None and len(test_data) != 0:
|
|
|
bins_test = sc.woebin(test_data[x_columns_candidate + [y_column]], y=y_column,
|
|
|
- special_values=special_values, breaks_list=breaks_list)
|
|
|
+ special_values=special_values, breaks_list=breaks_list, print_info=False)
|
|
|
bins_iv_dict = {}
|
|
|
for column, bin_train in bins_train.items():
|
|
|
train_iv = bin_train['total_iv'][0]
|
|
@@ -334,19 +333,19 @@ class StrategyIv(FilterStrategyBase):
|
|
|
x_columns_candidate = list(candidate_dict.keys())
|
|
|
bins = self._f_get_bins_by_breaks(train_data, candidate_dict)
|
|
|
|
|
|
- train_woe = sc.woebin_ply(train_data[x_columns_candidate], bins)
|
|
|
+ train_woe = sc.woebin_ply(train_data[x_columns_candidate], bins, print_info=False)
|
|
|
train_data_feature = DataFeatureEntity(pd.concat((train_woe, train_data[y_column]), axis=1),
|
|
|
train_woe.columns.tolist(), y_column)
|
|
|
|
|
|
val_data_feature = None
|
|
|
if val_data is not None and len(val_data) != 0:
|
|
|
- val_woe = sc.woebin_ply(val_data[x_columns_candidate], bins)
|
|
|
+ val_woe = sc.woebin_ply(val_data[x_columns_candidate], bins, print_info=False)
|
|
|
val_data_feature = DataFeatureEntity(pd.concat((val_woe, val_data[y_column]), axis=1),
|
|
|
train_woe.columns.tolist(), y_column)
|
|
|
|
|
|
test_data_feature = None
|
|
|
if test_data is not None and len(test_data) != 0:
|
|
|
- test_woe = sc.woebin_ply(test_data[x_columns_candidate], bins)
|
|
|
+ test_woe = sc.woebin_ply(test_data[x_columns_candidate], bins, print_info=False)
|
|
|
test_data_feature = DataFeatureEntity(pd.concat((test_woe, test_data[y_column]), axis=1),
|
|
|
train_woe.columns.tolist(), y_column)
|
|
|
return DataPreparedEntity(train_data_feature, val_data_feature, test_data_feature, bins=bins,
|
|
@@ -386,21 +385,30 @@ class StrategyIv(FilterStrategyBase):
|
|
|
image_path_list = self._f_save_var_trend(train_bins, x_columns_candidate, "train")
|
|
|
metric_value_dict["变量趋势-训练集"] = MetricFucEntity(image_path=image_path_list, image_size=4)
|
|
|
# 变量有效性
|
|
|
- train_woe = sc.woebin_ply(train_data[x_columns_candidate], train_bins)
|
|
|
+ train_woe = sc.woebin_ply(train_data[x_columns_candidate], train_bins, print_info=False)
|
|
|
var_corr_image_path = self._f_get_var_corr_image(train_woe)
|
|
|
# vif
|
|
|
vif_df = f_get_ivf(train_woe)
|
|
|
metric_value_dict["变量有效性"] = MetricFucEntity(image_path=var_corr_image_path, table=vif_df)
|
|
|
|
|
|
- time.sleep(3)
|
|
|
if jupyter:
|
|
|
from IPython import display
|
|
|
+
|
|
|
display.display(metric_value_dict["样本分布"].table)
|
|
|
+ # 打印变量iv
|
|
|
display.display(metric_value_dict["变量iv"].table)
|
|
|
+ # 打印变量相关性
|
|
|
f_display_images_by_side(metric_value_dict["变量有效性"].image_path, display, width=800)
|
|
|
+ # 打印变量趋势
|
|
|
f_display_images_by_side(metric_value_dict["变量趋势-训练集"].image_path, display, title="变量趋势训练集")
|
|
|
metric_test = metric_value_dict.get("变量趋势-测试集")
|
|
|
if metric_test is not None:
|
|
|
f_display_images_by_side(metric_test.image_path, display, title="变量趋势测试集")
|
|
|
+ # 打印breaks_list
|
|
|
+ breaks_list = {}
|
|
|
+ for x_column, feature in candidate_dict.items():
|
|
|
+ breaks_list[x_column] = feature.breaks_list
|
|
|
+ print("变量切分点:")
|
|
|
+ print(breaks_list)
|
|
|
|
|
|
return metric_value_dict
|