Explorar o código

add: 数据处理新增breaks_list

yq hai 4 meses
pai
achega
c941848ffe

+ 6 - 0
config/data_process_config_template.json

@@ -4,6 +4,12 @@
   "feature_search_strategy": "iv",
   "x_candidate_num": 10,
   "special_values": null,
+  "breaks_list": {
+    "duration_in_month": [13, 17,  47],
+    "credit_amount": [2001, 3000, 4000, 5000,  10000],
+    "age_in_years": [35, 50],
+    "purpose": ["car (used)%,%others","radio/television%,%retraining","furniture/equipment%,%business","repairs%,%domestic appliances%,%education%,%car (new)"]
+  },
   "format_bin": false,
   "y_column": "creditability",
   "x_columns_candidate": [

+ 25 - 1
entitys/data_process_config_entity.py

@@ -18,7 +18,11 @@ class DataProcessConfigEntity():
                  split_method: str = None, feature_search_strategy: str = 'iv', bin_search_interval: float = 0.05,
                  iv_threshold: float = 0.03, iv_threshold_wide: float = 0.05, corr_threshold: float = 0.4,
                  sample_rate: float = 0.1, x_candidate_num: int = 10, special_values: Union[dict, list, str] = None,
-                 project_name: str = None, format_bin: str = False, *args, **kwargs):
+                 project_name: str = None, format_bin: str = False, breaks_list: dict = None, pos_neg_cnt=1,
+                 *args, **kwargs):
+
+        # Number of monotonicity direction changes allowed
+        self._pos_neg_cnt = pos_neg_cnt
 
         # Whether to enable coarse binning
         self._format_bin = format_bin
@@ -58,6 +62,8 @@ class DataProcessConfigEntity():
 
         self._special_values = special_values
 
+        self._breaks_list = breaks_list
+
         # Correlation threshold between variables
         self._corr_threshold = corr_threshold
 
@@ -75,6 +81,9 @@ class DataProcessConfigEntity():
     def base_dir(self):
         return self._base_dir
 
+    @property
+    def pos_neg_cnt(self):
+        return self._pos_neg_cnt
 
     @property
     def format_bin(self):
@@ -153,6 +162,21 @@ class DataProcessConfigEntity():
             return self._special_values.get(column, [])
         return []
 
+    @property
+    def breaks_list(self):
+        if self._breaks_list is None:
+            return {}
+        if isinstance(self._breaks_list, dict):
+            return self._breaks_list
+        return {}
+
+    def get_breaks_list(self, column: str = None):
+        if self._breaks_list is None or len(self._breaks_list) == 0:
+            return []
+        if isinstance(self._breaks_list, dict) and column is not None:
+            return self._breaks_list.get(column, [])
+        return []
+
     def f_get_save_path(self, file_name: str) -> str:
         path = os.path.join(self._base_dir, file_name)
         return path

+ 22 - 13
feature/strategy_iv.py

@@ -102,24 +102,24 @@ class StrategyIv(FilterStrategyBase):
         train_data = data.train_data
         test_data = data.test_data
         special_values = self.data_process_config.special_values
+        breaks_list = self.data_process_config.breaks_list.copy()
         y_column = self.data_process_config.y_column
         iv_threshold_wide = self.data_process_config.iv_threshold_wide
         x_columns_candidate = self.data_process_config.x_columns_candidate
         if x_columns_candidate is None or len(x_columns_candidate) == 0:
             x_columns_candidate = train_data.columns.tolist()
+        if y_column in x_columns_candidate:
             x_columns_candidate.remove(y_column)
 
-        bins_train = sc.woebin(train_data[x_columns_candidate + [y_column]], y=y_column, special_values=special_values,
-                               bin_num_limit=5)
+        bins_train = sc.woebin(train_data[x_columns_candidate + [y_column]], y=y_column, bin_num_limit=5,
+                               special_values=special_values, breaks_list=breaks_list)
 
-        breaks_list = {}
         for column, bin in bins_train.items():
             breaks_list[column] = list(bin['breaks'])
         bins_test = None
         if test_data is not None and len(test_data) != 0:
-            bins_test = sc.woebin(test_data[x_columns_candidate + [y_column]], y=y_column, breaks_list=breaks_list,
-                                  special_values=special_values
-                                  )
+            bins_test = sc.woebin(test_data[x_columns_candidate + [y_column]], y=y_column,
+                                  special_values=special_values, breaks_list=breaks_list)
         bins_iv_dict = {}
         for column, bin_train in bins_train.items():
             train_iv = bin_train['total_iv'][0]
@@ -138,9 +138,11 @@ class StrategyIv(FilterStrategyBase):
         interval = self.data_process_config.bin_search_interval
         iv_threshold = self.data_process_config.iv_threshold
         special_values = self.data_process_config.get_special_values(x_column)
+        breaks_list = self.data_process_config.get_breaks_list(x_column)
         y_column = self.data_process_config.y_column
         sample_rate = self.data_process_config.sample_rate
         format_bin = self.data_process_config.format_bin
+        pos_neg_cnt = self.data_process_config.pos_neg_cnt
 
         def _n0(x):
             return sum(x == 0)
@@ -190,12 +192,12 @@ class StrategyIv(FilterStrategyBase):
             bins['is_special_values'] = [False] * len(bins)
             return bins
 
-        def _calculation_iv(bins):
+        def _calculation_iv(bins, judge_monto=True, pos_neg_cnt=1):
             bins['count'] = bins['good'] + bins['bad']
             bins['badprob'] = bins['bad'] / bins['count']
             # Monotonicity check
             bad_prob = bins[bins['is_special_values'] == False]['badprob'].values.tolist()
-            if not f_judge_monto(bad_prob):
+            if judge_monto and not f_judge_monto(bad_prob, pos_neg_cnt):
                 return -1
             # Compute IV
             infovalue = pd.DataFrame({'good': bins['good'], 'bad': bins['bad']}) \
@@ -257,7 +259,11 @@ class StrategyIv(FilterStrategyBase):
                 points_list.append(point_list_cache)
         # Filter by IV and monotonicity
         iv_max = 0
-        breaks_list = []
+        breaks_list_target = None
+        judge_monto = True
+        if len(breaks_list) != 0:
+            points_list = [breaks_list]
+            judge_monto = False
         train_sv_bin_list = _get_sv_bins(train_data, x_column, y_column, special_values)
         test_sv_bin_list = None
         if test_data_filter is not None:
@@ -267,7 +273,8 @@ class StrategyIv(FilterStrategyBase):
             # Merge with the special_values bins before computing IV
             for sv_bin in train_sv_bin_list:
                 train_bins = pd.concat((train_bins, sv_bin))
-            train_iv = _calculation_iv(train_bins)
+            # _calculation_iv includes the monotonicity check and excludes special values from it
+            train_iv = _calculation_iv(train_bins, judge_monto, pos_neg_cnt)
             # Only the training set is constrained on monotonicity and IV magnitude
             if train_iv < iv_threshold:
                 continue
@@ -277,13 +284,13 @@ class StrategyIv(FilterStrategyBase):
                 test_bins = _get_bins(test_data_filter, x_column, y_column, point_list)
                 for sv_bin in test_sv_bin_list:
                     test_bins = pd.concat((test_bins, sv_bin))
-                test_iv = _calculation_iv(test_bins)
+                test_iv = _calculation_iv(test_bins, judge_monto, pos_neg_cnt)
             iv = train_iv + test_iv
             if iv > iv_max:
                 iv_max = iv
-                breaks_list = point_list
+                breaks_list_target = point_list
 
-        return iv_max, breaks_list
+        return iv_max, breaks_list_target
 
     def filter(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, CandidateFeatureEntity]:
         # Coarse screening
@@ -294,6 +301,8 @@ class StrategyIv(FilterStrategyBase):
         for x_column in tqdm(x_columns_candidate):
             if is_numeric_dtype(data.train_data[x_column]):
                 iv_max, breaks_list = self._f_get_best_bins_numeric(data, x_column)
+                if breaks_list is None:
+                    continue
                 candidate_dict[x_column] = CandidateFeatureEntity(x_column, breaks_list, iv_max)
             else:
                 # String-typed columns are handled by scorecardpy for now

+ 0 - 2
train_test.py

@@ -23,8 +23,6 @@ if __name__ == "__main__":
     # Feature processing
     ## Get the feature-filtering strategy
     filter_strategy_clazz = FilterStrategyFactory.get_strategy("iv")
-    ## 可传入参数
-    # filter_strategy = filter_strategy_clazz(y_column="creditability")
     ## Can also be loaded from a config file
     filter_strategy = filter_strategy_clazz(DataProcessConfigEntity.from_config('./config/data_process_config_template.json'))