|
@@ -61,6 +61,25 @@ class StrategyWoe(FeatureStrategyBase):
|
|
|
imgs_path.append(path)
|
|
|
return imgs_path
|
|
|
|
|
|
def _f_best_bins_print(self, display, data: DataSplitEntity, column: str, homo_bin_info: HomologousBinInfo):
    """Print the recommended cut points of ``column`` and show trend images.

    A header line is printed first. Then, for every candidate returned by
    ``homo_bin_info.get_best_bins()``, the candidate's ``points`` are printed
    as JSON and one trend image is requested for the training data and one
    for the test data. After all candidates are processed, the collected
    image paths are handed to ``f_display_images_by_side`` so that the
    training images and the test images are shown next to each other.

    Args:
        display: Display handle forwarded unchanged to
            ``f_display_images_by_side``.
        data: Data split; its ``train_data`` and ``test_data`` attributes
            are read.
        column: Name of the variable being analysed.
        homo_bin_info: Source of the candidate binnings, obtained through
            its ``get_best_bins()`` method.
    """
    print(f"-----【{column}】不同分箱数下变量的推荐切分点-----")
    imgs_path_trend_train = []
    imgs_path_trend_test = []
    for bin_info in homo_bin_info.get_best_bins():
        print(json.dumps(bin_info.points, ensure_ascii=False, cls=NumpyEncoder))
        # Built once per candidate: the same cut-point suffix goes into both
        # the train and the test label passed to ``_f_get_img_trend``.
        breaks_suffix = "_".join(str(point) for point in bin_info.points)

        sc_woebin_train = self._f_get_sc_woebin(data.train_data, {column: bin_info})
        train_paths = self._f_get_img_trend(sc_woebin_train, [column],
                                            f"train_{column}_{breaks_suffix}")
        # Only one column is requested, so only the first returned entry is kept.
        imgs_path_trend_train.append(train_paths[0])

        sc_woebin_test = self._f_get_sc_woebin(data.test_data, {column: bin_info})
        test_paths = self._f_get_img_trend(sc_woebin_test, [column],
                                           f"test_{column}_{breaks_suffix}")
        imgs_path_trend_test.append(test_paths[0])

    # Display happens once, after every candidate has produced its images.
    f_display_images_by_side(display, imgs_path_trend_train, title="训练集",
                             image_path_list2=imgs_path_trend_test, title2="测试集")
|
|
|
+
|
|
|
def _f_get_sc_woebin(self, data: pd.DataFrame, bin_info_dict: Dict[str, BinInfo]) -> Dict[str, pd.DataFrame]:
|
|
|
y_column = self.ml_config.y_column
|
|
|
special_values = self.ml_config.special_values
|
|
@@ -444,36 +463,15 @@ class StrategyWoe(FeatureStrategyBase):
|
|
|
from IPython import display
|
|
|
|
|
|
if is_numeric_dtype(data.train_data[column]):
|
|
|
- train_data = data.train_data
|
|
|
- test_data = data.test_data
|
|
|
format_bin_mlcfg = self.ml_config.format_bin
|
|
|
if format_bin is not None:
|
|
|
self.ml_config._format_bin = format_bin
|
|
|
homo_bin_info_numeric: HomologousBinInfo = self._handle_numeric(data, column)
|
|
|
-
|
|
|
- bins_info = homo_bin_info_numeric.get_best_bins()
|
|
|
- print(f"-----【{column}】不同分箱数下变量的推荐切分点-----")
|
|
|
- imgs_path_trend_train = []
|
|
|
- imgs_path_trend_test = []
|
|
|
- for bin_info in bins_info:
|
|
|
- print(json.dumps(bin_info.points, ensure_ascii=False, cls=NumpyEncoder))
|
|
|
- breaks_list = [str(i) for i in bin_info.points]
|
|
|
- sc_woebin_train = self._f_get_sc_woebin(train_data, {column: bin_info})
|
|
|
- image_path = self._f_get_img_trend(sc_woebin_train, [column],
|
|
|
- f"train_{column}_{'_'.join(breaks_list)}")
|
|
|
- imgs_path_trend_train.append(image_path[0])
|
|
|
- sc_woebin_test = self._f_get_sc_woebin(test_data, {column: bin_info})
|
|
|
- image_path = self._f_get_img_trend(sc_woebin_test, [column],
|
|
|
- f"test_{column}_{'_'.join(breaks_list)}")
|
|
|
- imgs_path_trend_test.append(image_path[0])
|
|
|
- f_display_images_by_side(display, imgs_path_trend_train, title=f"训练集",
|
|
|
- image_path_list2=imgs_path_trend_test, title2="测试集")
|
|
|
+ self._f_best_bins_print(display, data, column, homo_bin_info_numeric)
|
|
|
self.ml_config._format_bin = format_bin_mlcfg
|
|
|
-
|
|
|
else:
|
|
|
print("只能针对数值型变量进行分析。")
|
|
|
|
|
|
-
|
|
|
def feature_save(self, *args, **kwargs):
|
|
|
if self.sc_woebin is None:
|
|
|
GeneralException(ResultCodesEnum.NOT_FOUND, message=f"feature不存在")
|
|
@@ -564,29 +562,12 @@ class StrategyWoe(FeatureStrategyBase):
|
|
|
def detail_print(detail):
|
|
|
if isinstance(detail, str):
|
|
|
detail = [detail]
|
|
|
-
|
|
|
if isinstance(detail, list):
|
|
|
for column in detail:
|
|
|
homo_bin_info_numeric = homo_bin_info_numeric_set.get(column)
|
|
|
if homo_bin_info_numeric is None:
|
|
|
continue
|
|
|
- bins_info = homo_bin_info_numeric.get_best_bins()
|
|
|
- print(f"-----【{column}】不同分箱数下变量的推荐切分点-----")
|
|
|
- imgs_path_trend_train = []
|
|
|
- imgs_path_trend_test = []
|
|
|
- for bin_info in bins_info:
|
|
|
- print(json.dumps(bin_info.points, ensure_ascii=False, cls=NumpyEncoder))
|
|
|
- breaks_list = [str(i) for i in bin_info.points]
|
|
|
- sc_woebin_train = self._f_get_sc_woebin(train_data, {column: bin_info})
|
|
|
- image_path = self._f_get_img_trend(sc_woebin_train, [column],
|
|
|
- f"train_{column}_{'_'.join(breaks_list)}")
|
|
|
- imgs_path_trend_train.append(image_path[0])
|
|
|
- sc_woebin_test = self._f_get_sc_woebin(test_data, {column: bin_info})
|
|
|
- image_path = self._f_get_img_trend(sc_woebin_test, [column],
|
|
|
- f"test_{column}_{'_'.join(breaks_list)}")
|
|
|
- imgs_path_trend_test.append(image_path[0])
|
|
|
- f_display_images_by_side(display, imgs_path_trend_train, title=f"训练集",
|
|
|
- image_path_list2=imgs_path_trend_test, title2="测试集")
|
|
|
+ self._f_best_bins_print(display, data, column, homo_bin_info_numeric)
|
|
|
if isinstance(detail, dict):
|
|
|
for column, challenger_columns in detail.items():
|
|
|
print(f"-----相关性筛选保留的【{column}】-----")
|
|
@@ -601,9 +582,6 @@ class StrategyWoe(FeatureStrategyBase):
|
|
|
if detail is not None and self.ml_config.bin_detail_print:
|
|
|
detail_print(detail)
|
|
|
|
|
|
- train_data = data.train_data
|
|
|
- test_data = data.test_data
|
|
|
-
|
|
|
bin_info_filtered: Dict[str, BinInfo] = context.get(ContextEnum.BIN_INFO_FILTERED)
|
|
|
homo_bin_info_numeric_set: Dict[str, HomologousBinInfo] = context.get(ContextEnum.HOMO_BIN_INFO_NUMERIC_SET)
|
|
|
filter_fast = context.get(ContextEnum.FILTER_FAST)
|