ソースを参照

add: 粗分箱

yq 4 ヶ月 前
コミット
057cb1a149

+ 1 - 0
config/data_process_config_template.json

@@ -4,6 +4,7 @@
   "feature_search_strategy": "iv",
   "x_candidate_num": 10,
   "special_values": null,
+  "format_bin": false,
   "y_column": "creditability",
   "x_columns_candidate": [
     "duration_in_month",

+ 9 - 1
entitys/data_process_config_entity.py

@@ -18,7 +18,10 @@ class DataProcessConfigEntity():
                  split_method: str = None, feature_search_strategy: str = 'iv', bin_search_interval: float = 0.05,
                  iv_threshold: float = 0.03, iv_threshold_wide: float = 0.05, corr_threshold: float = 0.4,
                  sample_rate: float = 0.1, x_candidate_num: int = 10, special_values: Union[dict, list, str] = None,
-                 project_name: str = None, *args, **kwargs):
+                 project_name: str = None, format_bin: str = False, *args, **kwargs):
+
+        # 是否启用粗分箱
+        self._format_bin = format_bin
 
         # 项目名称,和缓存路径有关
         self._project_name = project_name
@@ -72,6 +75,11 @@ class DataProcessConfigEntity():
     def base_dir(self):
         return self._base_dir
 
+
+    @property
+    def format_bin(self):
+        return self._format_bin
+
     @property
     def project_name(self):
         return self._project_name

+ 65 - 0
feature/feature_utils.py

@@ -9,6 +9,71 @@ import pandas as pd
 import scorecardpy as sc
 from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
 
# Standard grids of "round" split points used by coarse binning.
# Each entry maps a grid name to an ascending sequence of nodes.
FORMAT_DICT = {
    # Ratio-like features, -1 .. 1 in steps of 0.1.
    # Built with linspace + round instead of arange with a float step:
    # arange accumulates rounding error, which left the top node slightly
    # below 1.0 (so a 90th percentile of exactly 1.0 did not fit this grid)
    # and the middle node at about -2.2e-16 instead of 0.
    "bin_rate1": np.round(np.linspace(-1, 1, 21), 1),

    # Count-like features 1: 0 .. 10
    "bin_cnt1": np.arange(0, 11, 1),
    # Count-like features 2: 0 .. 20
    "bin_cnt2": [0, 1, 2, 3, 4, 5, 8, 10, 15, 20],
    # Count-like features 3: 0 .. 50
    "bin_cnt3": [0, 2, 4, 6, 8, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    # Count-like features 4: 0 .. 100
    "bin_cnt4": [0, 3, 6, 10, 15, 20, 30, 40, 50, 100],

    # Amount-like features 1: 0 .. 10,000
    "bin_amt1": np.arange(0, 1.1e4, 1e3),
    # Amount-like features 2: 0 .. 50,000
    "bin_amt2": np.arange(0, 5.5e4, 5e3),
    # Amount-like features 3: 0 .. 100,000
    "bin_amt3": np.arange(0, 11e4, 1e4),
    # Amount-like features 4: 0 .. 200,000
    "bin_amt4": [0, 1e4, 2e4, 3e4, 4e4, 5e4, 8e4, 10e4, 15e4, 20e4],
    # Amount-like features 5: 0 .. 1,000,000
    "bin_amt5": [0, 5e4, 10e4, 15e4, 20e4, 25e4, 30e4, 40e4, 50e4, 100e4],

    # Age-like features
    "bin_age": [20, 25, 30, 35, 40, 45, 50, 55, 60, 65],
}
+
+
# Coarse binning: snap a raw split point onto a standard grid of round nodes.
def f_format_bin(data_describe: pd.Series, raw_v, format_dict: dict = None):
    """Snap ``raw_v`` to the nearest node of the best-fitting standard grid.

    A grid fits when the 10th and 90th percentiles found in ``data_describe``
    both lie inside ``[min(grid), max(grid)]``. Among the fitting grids the one
    with the smallest maximum is used; when maxima are equal the entry that
    comes first in the mapping is kept.

    Args:
        data_describe: Series that has the labels ``"10%"`` and ``"90%"``.
        raw_v: Raw split point to standardise.
        format_dict: Optional mapping of grid name to a sequence of nodes in
            ascending order. When omitted, the module-level ``FORMAT_DICT``
            is used.

    Returns:
        ``raw_v`` unchanged when no grid fits. Otherwise the grid node closest
        to ``raw_v`` (the lower node on an exact tie); values outside the grid
        are clamped to its first or last node.
    """
    grids = FORMAT_DICT if format_dict is None else format_dict
    percent10 = data_describe["10%"]
    percent90 = data_describe["90%"]

    # Pick the tightest grid that still covers the 10%..90% range.
    chosen_nodes = None
    chosen_max = None
    for nodes in grids.values():
        grid_min = min(nodes)
        grid_max = max(nodes)
        # Written as a positive test so NaN percentiles never select a grid.
        if not (percent10 >= grid_min and percent90 <= grid_max):
            continue
        if chosen_max is None or chosen_max > grid_max:
            chosen_nodes = nodes
            chosen_max = grid_max

    if chosen_nodes is None:
        return raw_v

    # Values outside the grid are clamped to its end nodes.
    if raw_v > chosen_nodes[-1]:
        return chosen_nodes[-1]
    if raw_v < chosen_nodes[0]:
        return chosen_nodes[0]

    # Inside the grid: nearest neighbour, ties go to the lower node.
    for v_left, v_right in zip(chosen_nodes[:-1], chosen_nodes[1:]):
        if v_left <= raw_v <= v_right:
            return v_right if (raw_v - v_left) > (v_right - raw_v) else v_left

    # Only reached when raw_v compares false against everything (e.g. NaN).
    return raw_v
+
 
 # 此函数判断list的单调性,允许至多N次符号变化
 def f_judge_monto(bd_list: list, pos_neg_cnt: int = 1) -> int:

+ 5 - 1
feature/strategy_iv.py

@@ -16,7 +16,7 @@ from pandas.core.dtypes.common import is_numeric_dtype
 from tqdm import tqdm
 
 from entitys import DataSplitEntity, CandidateFeatureEntity, DataPreparedEntity, DataFeatureEntity, MetricFucEntity
-from .feature_utils import f_judge_monto, f_get_corr, f_get_ivf
+from .feature_utils import f_judge_monto, f_get_corr, f_get_ivf, f_format_bin
 from .filter_strategy_base import FilterStrategyBase
 
 
@@ -140,6 +140,7 @@ class StrategyIv(FilterStrategyBase):
         special_values = self.data_process_config.get_special_values(x_column)
         y_column = self.data_process_config.y_column
         sample_rate = self.data_process_config.sample_rate
+        format_bin = self.data_process_config.format_bin
 
         def _n0(x):
             return sum(x == 0)
@@ -219,6 +220,7 @@ class StrategyIv(FilterStrategyBase):
         train_data_filter = train_data[~train_data[x_column].isin(special_values)]
         train_data_filter = train_data_filter.sort_values(by=x_column, ascending=True)
         train_data_x = train_data_filter[x_column]
+        train_data_x_describe = train_data_x.describe(percentiles=[0.1, 0.9])
 
         test_data = data.test_data
         test_data_filter = None
@@ -247,6 +249,8 @@ class StrategyIv(FilterStrategyBase):
             point_percentile_list = [sum(distribute[0:idx + 1]) * interval for idx, _ in enumerate(distribute[0:-1])]
             for point_percentile in point_percentile_list:
                 point = train_data_x.iloc[int(len(train_data_x) * point_percentile)]
+                if format_bin:
+                    point = f_format_bin(train_data_x_describe, point)
                 if point not in point_list_cache:
                     point_list_cache.append(point)
             if point_list_cache not in points_list: