|
@@ -5,13 +5,14 @@
|
|
|
@desc: iv值及单调性筛选类
|
|
|
"""
|
|
|
from itertools import combinations_with_replacement
|
|
|
-from typing import List
|
|
|
+from typing import List, Dict
|
|
|
|
|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
+import scorecardpy as sc
|
|
|
|
|
|
-from entitys import DataSplitEntity, CandidateFeatureEntity, DataProcessConfigEntity
|
|
|
-from .feature_utils import f_judge_monto
|
|
|
+from entitys import DataSplitEntity, CandidateFeatureEntity
|
|
|
+from .feature_utils import f_judge_monto, f_get_corr
|
|
|
from .filter_strategy_base import FilterStrategyBase
|
|
|
|
|
|
|
|
@@ -20,6 +21,50 @@ class StrategyIv(FilterStrategyBase):
|
|
|
def __init__(self, *args, **kwargs):
    """Forward all positional and keyword arguments to the parent initializer."""
    super().__init__(*args, **kwargs)
|
|
|
|
|
|
+ def _f_corr_filter(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity]) -> List[str]:
|
|
|
+ # 相关性剔除变量
|
|
|
+ corr_threshold = self.data_process_config.corr_threshold
|
|
|
+ train_data = data.train_data
|
|
|
+ x_columns_candidate = list(candidate_dict.keys())
|
|
|
+ corr_df = f_get_corr(train_data[x_columns_candidate])
|
|
|
+ corr_dict = corr_df.to_dict()
|
|
|
+ for column, corr in corr_dict.items():
|
|
|
+ if column not in x_columns_candidate:
|
|
|
+ continue
|
|
|
+ for challenger_column, challenger_corr in corr.items():
|
|
|
+ if challenger_corr < corr_threshold or column == challenger_column \
|
|
|
+ or challenger_column not in x_columns_candidate:
|
|
|
+ continue
|
|
|
+ iv_max = candidate_dict[column].iv_max
|
|
|
+ challenger_iv_max = candidate_dict[challenger_column].iv_max
|
|
|
+ if iv_max > challenger_iv_max:
|
|
|
+ x_columns_candidate.remove(challenger_column)
|
|
|
+ else:
|
|
|
+ x_columns_candidate.remove(column)
|
|
|
+ break
|
|
|
+ return x_columns_candidate
|
|
|
+
|
|
|
def _f_wide_filter(self, data: DataSplitEntity) -> List[str]:
    """Coarse screening: keep the columns whose total IV reaches the wide threshold.

    The training data is binned with ``sc.woebin``; a column is kept unless
    its ``total_iv`` is below ``data_process_config.iv_threshold_wide``.

    Args:
        data: Data split whose ``train_data`` is binned.

    Returns:
        Names of the retained columns, in the order ``sc.woebin`` reports them.
    """
    train_data = data.train_data
    config = self.data_process_config
    y_column = config.y_column
    iv_threshold_wide = config.iv_threshold_wide

    x_columns_candidate = config.x_columns_candidate
    if x_columns_candidate is None or len(x_columns_candidate) == 0:
        # Default to every column except the target. list.remove() mutates
        # in place and returns None, so build the list explicitly instead.
        x_columns_candidate = [c for c in train_data.columns.tolist() if c != y_column]

    bins = sc.woebin(train_data[list(x_columns_candidate) + [y_column]], y=y_column)

    columns = []
    for column, bin_df in bins.items():
        total_iv = bin_df['total_iv'][0]
        if total_iv < iv_threshold_wide:
            continue
        columns.append(column)
    return columns
|
|
|
+
|
|
|
def _f_get_best_bins(self, data: DataSplitEntity, x_column: str):
|
|
|
# 贪婪搜索【训练集】及【测试集】加起来【iv】值最高的且【单调】的分箱
|
|
|
interval = self.data_process_config.bin_search_interval
|
|
@@ -154,13 +199,23 @@ class StrategyIv(FilterStrategyBase):
|
|
|
|
|
|
return iv_max, breaks_list
|
|
|
|
|
|
def filter(self, data: DataSplitEntity, *args, **kwargs) -> List[CandidateFeatureEntity]:
    """Select the top candidate features by IV after coarse and correlation screening.

    Steps: coarse screening via ``_f_wide_filter``, best-bin search per
    column via ``_f_get_best_bins``, correlation pruning via
    ``_f_corr_filter``, then ranking by ``iv_max`` in descending order.

    Args:
        data: Data split passed through to every screening step.

    Returns:
        The first ``data_process_config.candidate_num`` surviving candidates,
        highest ``iv_max`` first.
    """
    # Coarse screening
    wide_columns = self._f_wide_filter(data)
    candidate_num = self.data_process_config.candidate_num

    candidate_dict: Dict[str, CandidateFeatureEntity] = {}
    for x_column in wide_columns:
        iv_max, breaks_list = self._f_get_best_bins(data, x_column)
        candidate_dict[x_column] = CandidateFeatureEntity(x_column, breaks_list, iv_max)

    # Further pruning by correlation
    kept_columns = self._f_corr_filter(data, candidate_dict)
    survivors = [
        entity for name, entity in candidate_dict.items() if name in kept_columns
    ]

    ranked = sorted(survivors, key=lambda entity: entity.iv_max, reverse=True)
    return ranked[:candidate_num]
|