7 ヶ月前 · 057cb1a149
--- a/config/data_process_config_template.json
+++ b/config/data_process_config_template.json
@@ -4,6 +4,7 @@
 
				   "feature_search_strategy": "iv",
			
 
				   "x_candidate_num": 10,
			
 
				   "special_values": null,
			
 
				+  "format_bin": false,
			
 
				   "y_column": "creditability",
			
 
				   "x_columns_candidate": [
			
 
				     "duration_in_month",
			
--- a/entitys/data_process_config_entity.py
+++ b/entitys/data_process_config_entity.py
@@ -18,7 +18,10 @@ class DataProcessConfigEntity():
 
				                  split_method: str = None, feature_search_strategy: str = 'iv', bin_search_interval: float = 0.05,
			
 
				                  iv_threshold: float = 0.03, iv_threshold_wide: float = 0.05, corr_threshold: float = 0.4,
			
 
				                  sample_rate: float = 0.1, x_candidate_num: int = 10, special_values: Union[dict, list, str] = None,
			
 
				-                 project_name: str = None, *args, **kwargs):
			
 
				+                 project_name: str = None, format_bin: str = False, *args, **kwargs):
			
 
				+
			
 
				+        # 是否启用粗分箱
			
 
				+        self._format_bin = format_bin
			
 
				 
			
 
				         # 项目名称，和缓存路径有关
			
 
				         self._project_name = project_name
			
@@ -72,6 +75,11 @@ class DataProcessConfigEntity():
 
				     def base_dir(self):
			
 
				         return self._base_dir
			
 
				 
			
 
				+
			
 
				    @property
				    def format_bin(self) -> bool:
				        """Return the coarse-binning switch that was passed to the constructor."""
				        return self._format_bin
			
 
				+
			
 
				     @property
			
 
				     def project_name(self):
			
 
				         return self._project_name
			
--- a/feature/feature_utils.py
+++ b/feature/feature_utils.py
@@ -9,6 +9,71 @@ import pandas as pd
 
				 import scorecardpy as sc
			
 
				 from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
			
 
				 
			
 
				# Standardized grids of candidate split points used for coarse binning.
				# Each grid is listed in ascending order: f_format_bin clamps out-of-range
				# values to a grid's first/last entry, so the order matters.
				FORMAT_DICT = {
				    # Ratio features: -1 to 1 in steps of 0.1.
				    # Built from integers and divided once, so every node (0.0 and both
				    # endpoints included) is the exact float for its decimal value. A float
				    # arange with step 0.1 accumulates rounding error and can leave the last
				    # node slightly below 1.0, which breaks range checks against the grid.
				    "bin_rate1": np.arange(-10, 11) / 10,

				    # Count features, grid 1: 0 to 10
				    "bin_cnt1": np.arange(0, 11, 1),
				    # Count features, grid 2: 0 to 20
				    "bin_cnt2": [0, 1, 2, 3, 4, 5, 8, 10, 15, 20],
				    # Count features, grid 3: 0 to 50
				    "bin_cnt3": [0, 2, 4, 6, 8, 10, 15, 20, 25, 30, 35, 40, 45, 50],
				    # Count features, grid 4: 0 to 100
				    "bin_cnt4": [0, 3, 6, 10, 15, 20, 30, 40, 50, 100],

				    # Amount features, grid 1: 0 to 10,000
				    "bin_amt1": np.arange(0, 1.1e4, 1e3),
				    # Amount features, grid 2: 0 to 50,000
				    "bin_amt2": np.arange(0, 5.5e4, 5e3),
				    # Amount features, grid 3: 0 to 100,000
				    "bin_amt3": np.arange(0, 11e4, 1e4),
				    # Amount features, grid 4: 0 to 200,000
				    "bin_amt4": [0, 1e4, 2e4, 3e4, 4e4, 5e4, 8e4, 10e4, 15e4, 20e4],
				    # Amount features, grid 5: 0 to 1,000,000
				    "bin_amt5": [0, 5e4, 10e4, 15e4, 20e4, 25e4, 30e4, 40e4, 50e4, 100e4],

				    # Age features
				    "bin_age": [20, 25, 30, 35, 40, 45, 50, 55, 60, 65],
				}
			
 
				+
			
 
				+
			
 
				+# 粗分箱
			
 
				def f_format_bin(data_describe: pd.Series, raw_v, format_dict: dict = None):
				    """Snap a raw split point onto the nearest node of a standardized grid.

				    Coarse binning: among the grids in ``format_dict``, those whose range
				    covers the feature's 10th-90th percentile span are candidates, and the
				    candidate with the smallest upper bound is used. ``raw_v`` is then moved
				    to the closer of its two neighbouring nodes (the lower node wins a tie).
				    Values outside the chosen grid are clamped to its first/last node.

				    Args:
				        data_describe: Summary statistics holding at least the "10%" and
				            "90%" entries.
				        raw_v: The raw split point to standardize.
				        format_dict: Optional mapping of grid name to an ascending sequence
				            of nodes. Defaults to the module-level ``FORMAT_DICT``.

				    Returns:
				        The grid node chosen for ``raw_v``, or ``raw_v`` unchanged when no
				        grid covers the percentile span.
				    """
				    if format_dict is None:
				        format_dict = FORMAT_DICT

				    percent10 = data_describe["10%"]
				    percent90 = data_describe["90%"]

				    # Pick the tightest grid: smallest maximum among the grids that cover
				    # the [10%, 90%] span. The first grid found wins when maxima are equal.
				    best_nodes = None
				    best_max = None
				    for nodes in format_dict.values():
				        if len(nodes) == 0:
				            # An empty grid cannot cover anything; skip it rather than
				            # letting min()/max() raise.
				            continue
				        grid_min = min(nodes)
				        grid_max = max(nodes)
				        if percent10 >= grid_min and percent90 <= grid_max:
				            if best_max is None or best_max > grid_max:
				                best_nodes = nodes
				                best_max = grid_max

				    if best_nodes is None:
				        return raw_v

				    # Out-of-range values are clamped to the ends of the grid.
				    lowest = best_nodes[0]
				    highest = best_nodes[-1]
				    if raw_v > highest:
				        return highest
				    if raw_v < lowest:
				        return lowest

				    # Nearest-node rule inside the grid; a tie goes to the lower node.
				    for left, right in zip(best_nodes[:-1], best_nodes[1:]):
				        if left <= raw_v <= right:
				            return right if (raw_v - left) > (right - raw_v) else left

				    # Reached only for values that compare with nothing (e.g. NaN) or a
				    # single-node grid that raw_v already equals.
				    return raw_v
			
 
				+
			
 
				 
			
 
				 # 此函数判断list的单调性，允许至多N次符号变化
			
 
				 def f_judge_monto(bd_list: list, pos_neg_cnt: int = 1) -> int:
			
--- a/feature/strategy_iv.py
+++ b/feature/strategy_iv.py
@@ -16,7 +16,7 @@ from pandas.core.dtypes.common import is_numeric_dtype
 
				 from tqdm import tqdm
			
 
				 
			
 
				 from entitys import DataSplitEntity, CandidateFeatureEntity, DataPreparedEntity, DataFeatureEntity, MetricFucEntity
			
 
				-from .feature_utils import f_judge_monto, f_get_corr, f_get_ivf
			
 
				+from .feature_utils import f_judge_monto, f_get_corr, f_get_ivf, f_format_bin
			
 
				 from .filter_strategy_base import FilterStrategyBase
			
 
				 
			
 
				 
			
@@ -140,6 +140,7 @@ class StrategyIv(FilterStrategyBase):
 
				         special_values = self.data_process_config.get_special_values(x_column)
			
 
				         y_column = self.data_process_config.y_column
			
 
				         sample_rate = self.data_process_config.sample_rate
			
 
				+        format_bin = self.data_process_config.format_bin
			
 
				 
			
 
				         def _n0(x):
			
 
				             return sum(x == 0)
			
@@ -219,6 +220,7 @@ class StrategyIv(FilterStrategyBase):
 
				         train_data_filter = train_data[~train_data[x_column].isin(special_values)]
			
 
				         train_data_filter = train_data_filter.sort_values(by=x_column, ascending=True)
			
 
				         train_data_x = train_data_filter[x_column]
			
 
				+        train_data_x_describe = train_data_x.describe(percentiles=[0.1, 0.9])
			
 
				 
			
 
				         test_data = data.test_data
			
 
				         test_data_filter = None
			
@@ -247,6 +249,8 @@ class StrategyIv(FilterStrategyBase):
 
				             point_percentile_list = [sum(distribute[0:idx + 1]) * interval for idx, _ in enumerate(distribute[0:-1])]
			
 
				             for point_percentile in point_percentile_list:
			
 
				                 point = train_data_x.iloc[int(len(train_data_x) * point_percentile)]
			
 
				+                if format_bin:
			
 
				+                    point = f_format_bin(train_data_x_describe, point)
			
 
				                 if point not in point_list_cache:
			
 
				                     point_list_cache.append(point)
			
 
				             if point_list_cache not in points_list: