|
@@ -102,24 +102,24 @@ class StrategyIv(FilterStrategyBase):
|
|
|
train_data = data.train_data
|
|
|
test_data = data.test_data
|
|
|
special_values = self.data_process_config.special_values
|
|
|
+ breaks_list = self.data_process_config.breaks_list.copy()
|
|
|
y_column = self.data_process_config.y_column
|
|
|
iv_threshold_wide = self.data_process_config.iv_threshold_wide
|
|
|
x_columns_candidate = self.data_process_config.x_columns_candidate
|
|
|
if x_columns_candidate is None or len(x_columns_candidate) == 0:
|
|
|
x_columns_candidate = train_data.columns.tolist()
|
|
|
+ if y_column in x_columns_candidate:
|
|
|
x_columns_candidate.remove(y_column)
|
|
|
|
|
|
- bins_train = sc.woebin(train_data[x_columns_candidate + [y_column]], y=y_column, special_values=special_values,
|
|
|
- bin_num_limit=5)
|
|
|
+ bins_train = sc.woebin(train_data[x_columns_candidate + [y_column]], y=y_column, bin_num_limit=5,
|
|
|
+ special_values=special_values, breaks_list=breaks_list)
|
|
|
|
|
|
- breaks_list = {}
|
|
|
for column, bin in bins_train.items():
|
|
|
breaks_list[column] = list(bin['breaks'])
|
|
|
bins_test = None
|
|
|
if test_data is not None and len(test_data) != 0:
|
|
|
- bins_test = sc.woebin(test_data[x_columns_candidate + [y_column]], y=y_column, breaks_list=breaks_list,
|
|
|
- special_values=special_values
|
|
|
- )
|
|
|
+ bins_test = sc.woebin(test_data[x_columns_candidate + [y_column]], y=y_column,
|
|
|
+ special_values=special_values, breaks_list=breaks_list)
|
|
|
bins_iv_dict = {}
|
|
|
for column, bin_train in bins_train.items():
|
|
|
train_iv = bin_train['total_iv'][0]
|
|
@@ -138,9 +138,11 @@ class StrategyIv(FilterStrategyBase):
|
|
|
interval = self.data_process_config.bin_search_interval
|
|
|
iv_threshold = self.data_process_config.iv_threshold
|
|
|
special_values = self.data_process_config.get_special_values(x_column)
|
|
|
+ breaks_list = self.data_process_config.get_breaks_list(x_column)
|
|
|
y_column = self.data_process_config.y_column
|
|
|
sample_rate = self.data_process_config.sample_rate
|
|
|
format_bin = self.data_process_config.format_bin
|
|
|
+ pos_neg_cnt = self.data_process_config.pos_neg_cnt
|
|
|
|
|
|
# Helper: sums the result of the comparison `x == 0`, i.e. the number of
# entries of x that are equal to zero.
# NOTE(review): presumably x is an array-like such as a pandas Series, so that
# `x == 0` is an element-wise boolean result — confirm against the caller.
def _n0(x):
|
|
|
return sum(x == 0)
|
|
@@ -190,12 +192,12 @@ class StrategyIv(FilterStrategyBase):
|
|
|
bins['is_special_values'] = [False] * len(bins)
|
|
|
return bins
|
|
|
|
|
|
- def _calculation_iv(bins):
|
|
|
+ def _calculation_iv(bins, judge_monto=True, pos_neg_cnt=1):
|
|
|
bins['count'] = bins['good'] + bins['bad']
|
|
|
bins['badprob'] = bins['bad'] / bins['count']
|
|
|
# 单调性判断
|
|
|
bad_prob = bins[bins['is_special_values'] == False]['badprob'].values.tolist()
|
|
|
- if not f_judge_monto(bad_prob):
|
|
|
+ if judge_monto and not f_judge_monto(bad_prob, pos_neg_cnt):
|
|
|
return -1
|
|
|
# 计算iv
|
|
|
infovalue = pd.DataFrame({'good': bins['good'], 'bad': bins['bad']}) \
|
|
@@ -257,7 +259,11 @@ class StrategyIv(FilterStrategyBase):
|
|
|
points_list.append(point_list_cache)
|
|
|
# IV与单调性过滤
|
|
|
iv_max = 0
|
|
|
- breaks_list = []
|
|
|
+ breaks_list_target = None
|
|
|
+ judge_monto = True
|
|
|
+ if len(breaks_list) != 0:
|
|
|
+ points_list = [breaks_list]
|
|
|
+ judge_monto = False
|
|
|
train_sv_bin_list = _get_sv_bins(train_data, x_column, y_column, special_values)
|
|
|
test_sv_bin_list = None
|
|
|
if test_data_filter is not None:
|
|
@@ -267,7 +273,8 @@ class StrategyIv(FilterStrategyBase):
|
|
|
# 与special_values合并计算iv
|
|
|
for sv_bin in train_sv_bin_list:
|
|
|
train_bins = pd.concat((train_bins, sv_bin))
|
|
|
- train_iv = _calculation_iv(train_bins)
|
|
|
+ # _calculation_iv包含了单调性判断,并排除了特殊值
|
|
|
+ train_iv = _calculation_iv(train_bins, judge_monto, pos_neg_cnt)
|
|
|
# 只限制训练集的单调性与iv值大小
|
|
|
if train_iv < iv_threshold:
|
|
|
continue
|
|
@@ -277,13 +284,13 @@ class StrategyIv(FilterStrategyBase):
|
|
|
test_bins = _get_bins(test_data_filter, x_column, y_column, point_list)
|
|
|
for sv_bin in test_sv_bin_list:
|
|
|
test_bins = pd.concat((test_bins, sv_bin))
|
|
|
- test_iv = _calculation_iv(test_bins)
|
|
|
+ test_iv = _calculation_iv(test_bins, judge_monto, pos_neg_cnt)
|
|
|
iv = train_iv + test_iv
|
|
|
if iv > iv_max:
|
|
|
iv_max = iv
|
|
|
- breaks_list = point_list
|
|
|
+ breaks_list_target = point_list
|
|
|
|
|
|
- return iv_max, breaks_list
|
|
|
+ return iv_max, breaks_list_target
|
|
|
|
|
|
def filter(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, CandidateFeatureEntity]:
|
|
|
# 粗筛
|
|
@@ -294,6 +301,8 @@ class StrategyIv(FilterStrategyBase):
|
|
|
for x_column in tqdm(x_columns_candidate):
|
|
|
if is_numeric_dtype(data.train_data[x_column]):
|
|
|
iv_max, breaks_list = self._f_get_best_bins_numeric(data, x_column)
|
|
|
+ if breaks_list is None:
|
|
|
+ continue
|
|
|
candidate_dict[x_column] = CandidateFeatureEntity(x_column, breaks_list, iv_max)
|
|
|
else:
|
|
|
# 字符型暂时用scorecardpy来处理
|