|
@@ -6,7 +6,7 @@
|
|
|
"""
|
|
|
import json
|
|
|
from itertools import combinations_with_replacement
|
|
|
-from typing import List, Dict
|
|
|
+from typing import List, Dict, Tuple
|
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
import numpy as np
|
|
@@ -16,9 +16,9 @@ import seaborn as sns
|
|
|
from pandas.core.dtypes.common import is_numeric_dtype
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
-from commom import f_display_images_by_side
|
|
|
+from commom import f_display_images_by_side, NumpyEncoder
|
|
|
from entitys import DataSplitEntity, CandidateFeatureEntity, DataPreparedEntity, DataFeatureEntity, MetricFucEntity
|
|
|
-from .feature_utils import f_judge_monto, f_get_corr, f_get_ivf, f_format_bin
|
|
|
+from .feature_utils import f_judge_monto, f_get_corr, f_get_ivf, f_format_bin, f_monto_contrast
|
|
|
from .filter_strategy_base import FilterStrategyBase
|
|
|
|
|
|
|
|
@@ -74,6 +74,7 @@ class StrategyIv(FilterStrategyBase):
|
|
|
def _f_corr_filter(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity]) -> List[str]:
|
|
|
# 相关性剔除变量
|
|
|
corr_threshold = self.data_process_config.corr_threshold
|
|
|
+ breaks_list = self.data_process_config.breaks_list
|
|
|
train_data = data.train_data
|
|
|
x_columns_candidate = list(candidate_dict.keys())
|
|
|
|
|
@@ -93,9 +94,11 @@ class StrategyIv(FilterStrategyBase):
|
|
|
iv_max = candidate_dict[column].iv_max
|
|
|
challenger_iv_max = candidate_dict[challenger_column].iv_max
|
|
|
if iv_max > challenger_iv_max:
|
|
|
- x_columns_candidate.remove(challenger_column)
|
|
|
+ if challenger_column not in breaks_list.keys():
|
|
|
+ x_columns_candidate.remove(challenger_column)
|
|
|
else:
|
|
|
- x_columns_candidate.remove(column)
|
|
|
+ if column not in breaks_list.keys():
|
|
|
+ x_columns_candidate.remove(column)
|
|
|
break
|
|
|
return x_columns_candidate
|
|
|
|
|
@@ -145,6 +148,7 @@ class StrategyIv(FilterStrategyBase):
|
|
|
sample_rate = self.data_process_config.sample_rate
|
|
|
format_bin = self.data_process_config.format_bin
|
|
|
pos_neg_cnt = self.data_process_config.pos_neg_cnt
|
|
|
+ monto_contrast_change_cnt = self.data_process_config.monto_contrast_change_cnt
|
|
|
|
|
|
def _n0(x):
|
|
|
return sum(x == 0)
|
|
@@ -194,11 +198,15 @@ class StrategyIv(FilterStrategyBase):
|
|
|
bins['is_special_values'] = [False] * len(bins)
|
|
|
return bins
|
|
|
|
|
|
- def _calculation_iv(bins, judge_monto=True, pos_neg_cnt=1):
|
|
|
+ def _get_badprob(bins):
|
|
|
bins['count'] = bins['good'] + bins['bad']
|
|
|
bins['badprob'] = bins['bad'] / bins['count']
|
|
|
- # 单调性判断
|
|
|
bad_prob = bins[bins['is_special_values'] == False]['badprob'].values.tolist()
|
|
|
+ return bad_prob
|
|
|
+
|
|
|
+ def _calculation_iv(bins, judge_monto=True, pos_neg_cnt=1):
|
|
|
+ # 单调性判断
|
|
|
+ bad_prob = _get_badprob(bins)
|
|
|
if judge_monto and not f_judge_monto(bad_prob, pos_neg_cnt):
|
|
|
return -1
|
|
|
# 计算iv
|
|
@@ -260,6 +268,8 @@ class StrategyIv(FilterStrategyBase):
|
|
|
if point_list_cache not in points_list:
|
|
|
points_list.append(point_list_cache)
|
|
|
# IV与单调性过滤
|
|
|
+ # 获取2 - 5 箱的情况下最佳分箱
|
|
|
+ bins_enum = {}
|
|
|
iv_max = 0
|
|
|
breaks_list_target = None
|
|
|
judge_monto = True
|
|
@@ -271,6 +281,10 @@ class StrategyIv(FilterStrategyBase):
|
|
|
if test_data_filter is not None:
|
|
|
test_sv_bin_list = _get_sv_bins(test_data, x_column, y_column, special_values)
|
|
|
for point_list in points_list:
|
|
|
+ is_discard = 0
|
|
|
+ discard_reason = ""
|
|
|
+ is_monto = 1
|
|
|
+ is_monto_contrast = 1
|
|
|
train_bins = _get_bins(train_data_filter, x_column, y_column, point_list)
|
|
|
# 与special_values合并计算iv
|
|
|
for sv_bin in train_sv_bin_list:
|
|
@@ -279,7 +293,9 @@ class StrategyIv(FilterStrategyBase):
|
|
|
train_iv = _calculation_iv(train_bins, judge_monto, pos_neg_cnt)
|
|
|
# 只限制训练集的单调性与iv值大小
|
|
|
if train_iv < iv_threshold:
|
|
|
- continue
|
|
|
+ discard_reason = f"训练集iv小于阈值{iv_threshold}"
|
|
|
+ is_discard = 1
|
|
|
+ is_monto = 0
|
|
|
|
|
|
test_iv = 0
|
|
|
if test_data_filter is not None:
|
|
@@ -287,22 +303,59 @@ class StrategyIv(FilterStrategyBase):
|
|
|
for sv_bin in test_sv_bin_list:
|
|
|
test_bins = pd.concat((test_bins, sv_bin))
|
|
|
test_iv = _calculation_iv(test_bins, judge_monto, pos_neg_cnt)
|
|
|
+ # 趋势一致性判断
|
|
|
+ train_bad_prob = _get_badprob(train_bins)
|
|
|
+ test_bad_prob = _get_badprob(test_bins)
|
|
|
+ if not f_monto_contrast(train_bad_prob, test_bad_prob, monto_contrast_change_cnt) \
|
|
|
+ and len(breaks_list) == 0:
|
|
|
+ discard_reason = f"变量趋势一致性不够"
|
|
|
+ is_discard = 1
|
|
|
+ is_monto_contrast = 0
|
|
|
+
|
|
|
iv = train_iv + test_iv
|
|
|
- if iv > iv_max:
|
|
|
+
|
|
|
+ if len(breaks_list) == 0:
|
|
|
+ bin_num = len(point_list) + 1
|
|
|
+ if bin_num not in bins_enum.keys():
|
|
|
+ bins_enum[bin_num] = []
|
|
|
+ bins_enum[bin_num].append({
|
|
|
+ "is_discard": is_discard,
|
|
|
+ "is_monto": is_monto,
|
|
|
+ "is_monto_contrast": is_monto_contrast,
|
|
|
+ "discard_reason": discard_reason,
|
|
|
+ "point_list": point_list,
|
|
|
+ "iv": iv,
|
|
|
+ })
|
|
|
+
|
|
|
+ if iv > iv_max and not is_discard:
|
|
|
iv_max = iv
|
|
|
breaks_list_target = point_list
|
|
|
|
|
|
- return iv_max, breaks_list_target
|
|
|
+ # 各个分箱数下的最佳分箱点
|
|
|
+ bins_enum_best_point = []
|
|
|
+ for k, v in bins_enum.items():
|
|
|
+ df_bin_enum = pd.DataFrame(data=v)
|
|
|
+ df_bin_enum.sort_values(by=["is_discard", "is_monto", "is_monto_contrast", "iv"],
|
|
|
+ ascending=[True, False, False, False], inplace=True)
|
|
|
+ bins_enum_best_point.append(df_bin_enum.iloc[0]["point_list"])
|
|
|
+
|
|
|
+ return iv_max, breaks_list_target, bins_enum_best_point
|
|
|
|
|
|
- def filter(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, CandidateFeatureEntity]:
|
|
|
+ def filter(self, data: DataSplitEntity, *args, **kwargs) -> Tuple[
|
|
|
+ Dict[str, CandidateFeatureEntity], Dict[str, List[CandidateFeatureEntity]]]:
|
|
|
# 粗筛
|
|
|
bins_iv_dict = self._f_wide_filter(data)
|
|
|
x_columns_candidate = list(bins_iv_dict.keys())
|
|
|
candidate_num = self.data_process_config.candidate_num
|
|
|
candidate_dict: Dict[str, CandidateFeatureEntity] = {}
|
|
|
+ numeric_candidate_dict_all: Dict[str, List[CandidateFeatureEntity]] = {}
|
|
|
for x_column in tqdm(x_columns_candidate):
|
|
|
if is_numeric_dtype(data.train_data[x_column]):
|
|
|
- iv_max, breaks_list = self._f_get_best_bins_numeric(data, x_column)
|
|
|
+ iv_max, breaks_list, bins_enum_best_point = self._f_get_best_bins_numeric(data, x_column)
|
|
|
+ if len(bins_enum_best_point) != 0 :
|
|
|
+ numeric_candidate_dict_all[x_column] = []
|
|
|
+ for point in bins_enum_best_point:
|
|
|
+ numeric_candidate_dict_all[x_column].append(CandidateFeatureEntity(x_column, point, 0))
|
|
|
if breaks_list is None:
|
|
|
continue
|
|
|
candidate_dict[x_column] = CandidateFeatureEntity(x_column, breaks_list, iv_max)
|
|
@@ -323,7 +376,7 @@ class StrategyIv(FilterStrategyBase):
|
|
|
candidate_dict = {}
|
|
|
for candidate in candidate_list:
|
|
|
candidate_dict[candidate.x_column] = candidate
|
|
|
- return candidate_dict
|
|
|
+ return candidate_dict, numeric_candidate_dict_all
|
|
|
|
|
|
def feature_generate(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity], *args,
|
|
|
**kwargs) -> DataPreparedEntity:
|
|
@@ -353,6 +406,7 @@ class StrategyIv(FilterStrategyBase):
|
|
|
data_split_original=data)
|
|
|
|
|
|
def feature_report(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity],
|
|
|
+ numeric_candidate_dict_all: Dict[str, List[CandidateFeatureEntity]],
|
|
|
*args, **kwargs) -> Dict[str, MetricFucEntity]:
|
|
|
y_column = self.data_process_config.y_column
|
|
|
jupyter = self.data_process_config.jupyter
|
|
@@ -417,6 +471,28 @@ class StrategyIv(FilterStrategyBase):
|
|
|
for x_column, feature in candidate_dict.items():
|
|
|
breaks_list[x_column] = feature.breaks_list
|
|
|
print("变量切分点:")
|
|
|
- print(json.dumps(breaks_list, ensure_ascii=False, indent=2))
|
|
|
+ print(json.dumps(breaks_list, ensure_ascii=False, indent=2, cls=NumpyEncoder))
|
|
|
+
|
|
|
+ # 打印所有变量的推荐切分点
|
|
|
+ print("-----不同分箱数下变量的推荐切分点-----")
|
|
|
+ for x_column, features in numeric_candidate_dict_all.items():
|
|
|
+ print(f"-----【{x_column}】-----")
|
|
|
+ var_trend_images_train = []
|
|
|
+ var_trend_images_test = []
|
|
|
+ for feature in features:
|
|
|
+ var_breaks_list = [str(i) for i in feature.breaks_list]
|
|
|
+ var_trend_bins_train = self._f_get_bins_by_breaks(train_data, {x_column: feature})
|
|
|
+ image_path = self._f_save_var_trend(var_trend_bins_train, [x_column],
|
|
|
+ f"train_{x_column}_{'_'.join(var_breaks_list)}")
|
|
|
+ var_trend_images_train.append(image_path[0])
|
|
|
+ if metric_test is not None:
|
|
|
+ var_trend_bins_test = self._f_get_bins_by_breaks(test_data, {x_column: feature})
|
|
|
+ image_path = self._f_save_var_trend(var_trend_bins_test, [x_column],
|
|
|
+ f"test_{x_column}_{'_'.join(var_breaks_list)}")
|
|
|
+ var_trend_images_test.append(image_path[0])
|
|
|
+
|
|
|
+ f_display_images_by_side(var_trend_images_train, display, title=f"训练集",
|
|
|
+ image_path_list2=var_trend_images_test,
|
|
|
+ title2="测试集")
|
|
|
|
|
|
return metric_value_dict
|