Procházet zdrojové kódy

add: 变量趋势一致性判断及输出更多的变量切分点

yq před 2 měsíci
rodič
revize
6f222b4d1a

+ 2 - 2
commom/__init__.py

@@ -8,8 +8,8 @@ from .logger import get_logger
 from .placeholder_func import f_fill_placeholder
 from .user_exceptions import GeneralException
 from .utils import f_get_clazz_in_module, f_clazz_to_json, f_get_date, f_get_datetime, f_save_train_df, f_format_float, \
-    f_df_to_image, f_display_images_by_side
+    f_df_to_image, f_display_images_by_side, NumpyEncoder
 
 __all__ = ['f_get_clazz_in_module', 'f_clazz_to_json', 'GeneralException', 'get_logger', 'f_fill_placeholder',
            'f_get_date', 'f_get_datetime', 'f_save_train_df', 'f_format_float', 'f_df_to_image',
-           'f_display_images_by_side']
+           'f_display_images_by_side', 'NumpyEncoder']

+ 12 - 0
commom/utils.py

@@ -12,6 +12,7 @@ import os
 from json import JSONEncoder
 from typing import Union
 
+import numpy as np
 import pandas as pd
 import pytz
 
@@ -116,3 +117,14 @@ def f_display_images_by_side(image_path_list, display, title: str = "", width: i
 class f_clazz_to_json(JSONEncoder):
     """JSON encoder that serializes objects the default encoder rejects via their ``__dict__``."""
     def default(self, o):
         # ``default`` is the JSONEncoder hook for values it cannot encode natively;
         # an object without a ``__dict__`` raises AttributeError here.
         return o.__dict__
+
+
+class NumpyEncoder(JSONEncoder):
+    """JSON encoder that converts NumPy scalars and arrays to plain Python values.
+
+    Intended to be passed as ``cls=NumpyEncoder`` to ``json.dumps``.
+    NOTE(review): ``np.bool_`` is not handled by any branch below - confirm
+    whether boolean NumPy values can reach this encoder.
+    """
+    def default(self, obj):
+        # NumPy integer scalars -> built-in int.
+        if isinstance(obj, np.integer):
+            return int(obj)
+        # NumPy floating scalars -> built-in float.
+        if isinstance(obj, np.floating):
+            return float(obj)
+        # Arrays -> (possibly nested) Python lists.
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        # Everything else is delegated to the base implementation.
+        return super(NumpyEncoder, self).default(obj)

+ 1 - 1
entitys/data_feaure_entity.py

@@ -25,7 +25,7 @@ class CandidateFeatureEntity():
         return self._x_column
 
     @property
-    def breaks_list(self):
+    def breaks_list(self) -> list:
         return self._breaks_list
 
     @property

+ 9 - 1
entitys/data_process_config_entity.py

@@ -19,7 +19,8 @@ class DataProcessConfigEntity():
                  iv_threshold: float = 0.03, iv_threshold_wide: float = 0.05, corr_threshold: float = 0.4,
                  sample_rate: float = 0.1, x_candidate_num: int = 10, special_values: Union[dict, list, str] = None,
                  project_name: str = None, format_bin: str = False, breaks_list: dict = None, pos_neg_cnt=1,
-                 jupyter=False, strees=False, strees_sample_times=100, strees_bad_rate_list=[], *args, **kwargs):
+                 monto_contrast_change_cnt=0, jupyter=False, strees=False, strees_sample_times=100,
+                 strees_bad_rate_list: List[float] = [], *args, **kwargs):
 
         # 是否开启下输出内容
         self._strees = strees
@@ -36,6 +37,9 @@ class DataProcessConfigEntity():
         # 单调性允许变化次数
         self._pos_neg_cnt = pos_neg_cnt
 
+        # 变量趋势一致性允许变化次数
+        self._monto_contrast_change_cnt = monto_contrast_change_cnt
+
         # 是否启用粗分箱
         self._format_bin = format_bin
 
@@ -114,6 +118,10 @@ class DataProcessConfigEntity():
     def pos_neg_cnt(self):
         return self._pos_neg_cnt
 
+    @property
+    def monto_contrast_change_cnt(self):
+        """Return the allowed number of trend-direction mismatches for the variable trend-consistency check."""
+        return self._monto_contrast_change_cnt
+
     @property
     def format_bin(self):
         """Return the flag that tells whether coarse binning is enabled."""
         return self._format_bin

+ 18 - 0
feature/feature_utils.py

@@ -101,6 +101,24 @@ def f_judge_monto(bd_list: list, pos_neg_cnt: int = 1) -> bool:
     return False
 
 
+# Variable trend-consistency check
+def f_monto_contrast(train_bd_list: list, test_bd_list: list, monto_contrast_change_cnt: int = 0) -> bool:
+    """Check whether two value sequences move in the same direction step by step.
+
+    Each list is reduced to the sign of its consecutive differences
+    (``1`` when the next value is greater or equal, ``-1`` otherwise), and the
+    two sign sequences are compared position by position.
+
+    :param train_bd_list: values for the first data set, one per bin
+        (presumably per-bin bad rates - confirm against callers).
+    :param test_bd_list: values for the second data set, same length expected.
+    :param monto_contrast_change_cnt: maximum number of positions at which the
+        two direction sequences may differ.
+    :return: ``False`` when the lists differ in length, have fewer than two
+        elements, or disagree at more than ``monto_contrast_change_cnt``
+        positions; ``True`` otherwise.
+    """
+    # Sequences of different length, or too short to have a direction, never match.
+    if len(train_bd_list) != len(test_bd_list) or len(train_bd_list) < 2 or len(test_bd_list) < 2:
+        return False
+
+    # Consecutive differences, then collapsed to a direction: a flat step (0) counts as +1.
+    train_monto = np.array(train_bd_list[1:]) - np.array(train_bd_list[0:-1])
+    train_monto = np.where(train_monto >= 0, 1, -1)
+
+    test_monto = np.array(test_bd_list[1:]) - np.array(test_bd_list[0:-1])
+    test_monto = np.where(test_monto >= 0, 1, -1)
+
+    # A non-zero entry marks a step where the two directions disagree.
+    contrast = train_monto - test_monto
+    if len(contrast[contrast != 0]) > monto_contrast_change_cnt:
+        return False
+
+    return True
+
+
 def f_get_corr(data: pd.DataFrame, meth: str = 'spearman') -> pd.DataFrame:
     """Return the pairwise column correlation matrix of ``data``, using ``meth`` (default ``'spearman'``) as the correlation method."""
     return data.corr(method=meth)
 

+ 2 - 2
feature/filter_strategy_base.py

@@ -5,7 +5,7 @@
 @desc: 特征筛选基类
 """
 import abc
-from typing import Dict
+from typing import Dict, Tuple, List
 
 from entitys import DataProcessConfigEntity, DataPreparedEntity, CandidateFeatureEntity, MetricFucEntity
 from init import warning_ignore
@@ -28,7 +28,7 @@ class FilterStrategyBase(metaclass=abc.ABCMeta):
         return self._data_process_config
 
     @abc.abstractmethod
-    def filter(self, *args, **kwargs) -> Dict[str, CandidateFeatureEntity]:
+    def filter(self, *args, **kwargs) -> Tuple[Dict[str, CandidateFeatureEntity], Dict[str, List[CandidateFeatureEntity]]]:
         """
         特征筛选
         """

+ 90 - 14
feature/strategy_iv.py

@@ -6,7 +6,7 @@
 """
 import json
 from itertools import combinations_with_replacement
-from typing import List, Dict
+from typing import List, Dict, Tuple
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -16,9 +16,9 @@ import seaborn as sns
 from pandas.core.dtypes.common import is_numeric_dtype
 from tqdm import tqdm
 
-from commom import f_display_images_by_side
+from commom import f_display_images_by_side, NumpyEncoder
 from entitys import DataSplitEntity, CandidateFeatureEntity, DataPreparedEntity, DataFeatureEntity, MetricFucEntity
-from .feature_utils import f_judge_monto, f_get_corr, f_get_ivf, f_format_bin
+from .feature_utils import f_judge_monto, f_get_corr, f_get_ivf, f_format_bin, f_monto_contrast
 from .filter_strategy_base import FilterStrategyBase
 
 
@@ -74,6 +74,7 @@ class StrategyIv(FilterStrategyBase):
     def _f_corr_filter(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity]) -> List[str]:
         # 相关性剔除变量
         corr_threshold = self.data_process_config.corr_threshold
+        breaks_list = self.data_process_config.breaks_list
         train_data = data.train_data
         x_columns_candidate = list(candidate_dict.keys())
 
@@ -93,9 +94,11 @@ class StrategyIv(FilterStrategyBase):
                 iv_max = candidate_dict[column].iv_max
                 challenger_iv_max = candidate_dict[challenger_column].iv_max
                 if iv_max > challenger_iv_max:
-                    x_columns_candidate.remove(challenger_column)
+                    if challenger_column not in breaks_list.keys():
+                        x_columns_candidate.remove(challenger_column)
                 else:
-                    x_columns_candidate.remove(column)
+                    if column not in breaks_list.keys():
+                        x_columns_candidate.remove(column)
                     break
         return x_columns_candidate
 
@@ -145,6 +148,7 @@ class StrategyIv(FilterStrategyBase):
         sample_rate = self.data_process_config.sample_rate
         format_bin = self.data_process_config.format_bin
         pos_neg_cnt = self.data_process_config.pos_neg_cnt
+        monto_contrast_change_cnt = self.data_process_config.monto_contrast_change_cnt
 
         def _n0(x):
             return sum(x == 0)
@@ -194,11 +198,15 @@ class StrategyIv(FilterStrategyBase):
             bins['is_special_values'] = [False] * len(bins)
             return bins
 
-        def _calculation_iv(bins, judge_monto=True, pos_neg_cnt=1):
+        def _get_badprob(bins):
             bins['count'] = bins['good'] + bins['bad']
             bins['badprob'] = bins['bad'] / bins['count']
-            # 单调性判断
             bad_prob = bins[bins['is_special_values'] == False]['badprob'].values.tolist()
+            return bad_prob
+
+        def _calculation_iv(bins, judge_monto=True, pos_neg_cnt=1):
+            # 单调性判断
+            bad_prob = _get_badprob(bins)
             if judge_monto and not f_judge_monto(bad_prob, pos_neg_cnt):
                 return -1
             # 计算iv
@@ -260,6 +268,8 @@ class StrategyIv(FilterStrategyBase):
             if point_list_cache not in points_list:
                 points_list.append(point_list_cache)
         # IV与单调性过滤
+        # 获取2 - 5 箱的情况下最佳分箱
+        bins_enum = {}
         iv_max = 0
         breaks_list_target = None
         judge_monto = True
@@ -271,6 +281,10 @@ class StrategyIv(FilterStrategyBase):
         if test_data_filter is not None:
             test_sv_bin_list = _get_sv_bins(test_data, x_column, y_column, special_values)
         for point_list in points_list:
+            is_discard = 0
+            discard_reason = ""
+            is_monto = 1
+            is_monto_contrast = 1
             train_bins = _get_bins(train_data_filter, x_column, y_column, point_list)
             # 与special_values合并计算iv
             for sv_bin in train_sv_bin_list:
@@ -279,7 +293,9 @@ class StrategyIv(FilterStrategyBase):
             train_iv = _calculation_iv(train_bins, judge_monto, pos_neg_cnt)
             # 只限制训练集的单调性与iv值大小
             if train_iv < iv_threshold:
-                continue
+                discard_reason = f"训练集iv小于阈值{iv_threshold}"
+                is_discard = 1
+                is_monto = 0
 
             test_iv = 0
             if test_data_filter is not None:
@@ -287,22 +303,59 @@ class StrategyIv(FilterStrategyBase):
                 for sv_bin in test_sv_bin_list:
                     test_bins = pd.concat((test_bins, sv_bin))
                 test_iv = _calculation_iv(test_bins, judge_monto, pos_neg_cnt)
+                # 趋势一致性判断
+                train_bad_prob = _get_badprob(train_bins)
+                test_bad_prob = _get_badprob(test_bins)
+                if not f_monto_contrast(train_bad_prob, test_bad_prob, monto_contrast_change_cnt) \
+                        and len(breaks_list) == 0:
+                    discard_reason = f"变量趋势一致性不够"
+                    is_discard = 1
+                    is_monto_contrast = 0
+
             iv = train_iv + test_iv
-            if iv > iv_max:
+
+            if len(breaks_list) == 0:
+                bin_num = len(point_list) + 1
+                if bin_num not in bins_enum.keys():
+                    bins_enum[bin_num] = []
+                bins_enum[bin_num].append({
+                    "is_discard": is_discard,
+                    "is_monto": is_monto,
+                    "is_monto_contrast": is_monto_contrast,
+                    "discard_reason": discard_reason,
+                    "point_list": point_list,
+                    "iv": iv,
+                })
+
+            if iv > iv_max and not is_discard:
                 iv_max = iv
                 breaks_list_target = point_list
 
-        return iv_max, breaks_list_target
+        # 各个分箱数下的最佳分箱点
+        bins_enum_best_point = []
+        for k, v in bins_enum.items():
+            df_bin_enum = pd.DataFrame(data=v)
+            df_bin_enum.sort_values(by=["is_discard", "is_monto", "is_monto_contrast", "iv"],
+                                    ascending=[True, False, False, False], inplace=True)
+            bins_enum_best_point.append(df_bin_enum.iloc[0]["point_list"])
+
+        return iv_max, breaks_list_target, bins_enum_best_point
 
-    def filter(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, CandidateFeatureEntity]:
+    def filter(self, data: DataSplitEntity, *args, **kwargs) -> Tuple[
+        Dict[str, CandidateFeatureEntity], Dict[str, List[CandidateFeatureEntity]]]:
         # 粗筛
         bins_iv_dict = self._f_wide_filter(data)
         x_columns_candidate = list(bins_iv_dict.keys())
         candidate_num = self.data_process_config.candidate_num
         candidate_dict: Dict[str, CandidateFeatureEntity] = {}
+        numeric_candidate_dict_all: Dict[str, List[CandidateFeatureEntity]] = {}
         for x_column in tqdm(x_columns_candidate):
             if is_numeric_dtype(data.train_data[x_column]):
-                iv_max, breaks_list = self._f_get_best_bins_numeric(data, x_column)
+                iv_max, breaks_list, bins_enum_best_point = self._f_get_best_bins_numeric(data, x_column)
+                if len(bins_enum_best_point) != 0 :
+                    numeric_candidate_dict_all[x_column] = []
+                    for point in bins_enum_best_point:
+                        numeric_candidate_dict_all[x_column].append(CandidateFeatureEntity(x_column, point, 0))
                 if breaks_list is None:
                     continue
                 candidate_dict[x_column] = CandidateFeatureEntity(x_column, breaks_list, iv_max)
@@ -323,7 +376,7 @@ class StrategyIv(FilterStrategyBase):
         candidate_dict = {}
         for candidate in candidate_list:
             candidate_dict[candidate.x_column] = candidate
-        return candidate_dict
+        return candidate_dict, numeric_candidate_dict_all
 
     def feature_generate(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity], *args,
                          **kwargs) -> DataPreparedEntity:
@@ -353,6 +406,7 @@ class StrategyIv(FilterStrategyBase):
                                   data_split_original=data)
 
     def feature_report(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity],
+                       numeric_candidate_dict_all: Dict[str, List[CandidateFeatureEntity]],
                        *args, **kwargs) -> Dict[str, MetricFucEntity]:
         y_column = self.data_process_config.y_column
         jupyter = self.data_process_config.jupyter
@@ -417,6 +471,28 @@ class StrategyIv(FilterStrategyBase):
             for x_column, feature in candidate_dict.items():
                 breaks_list[x_column] = feature.breaks_list
             print("变量切分点:")
-            print(json.dumps(breaks_list, ensure_ascii=False, indent=2))
+            print(json.dumps(breaks_list, ensure_ascii=False, indent=2, cls=NumpyEncoder))
+
+            # 打印所有变量的推荐切分点
+            print("-----不同分箱数下变量的推荐切分点-----")
+            for x_column, features in numeric_candidate_dict_all.items():
+                print(f"-----【{x_column}】-----")
+                var_trend_images_train = []
+                var_trend_images_test = []
+                for feature in features:
+                    var_breaks_list = [str(i) for i in feature.breaks_list]
+                    var_trend_bins_train = self._f_get_bins_by_breaks(train_data, {x_column: feature})
+                    image_path = self._f_save_var_trend(var_trend_bins_train, [x_column],
+                                                        f"train_{x_column}_{'_'.join(var_breaks_list)}")
+                    var_trend_images_train.append(image_path[0])
+                    if metric_test is not None:
+                        var_trend_bins_test = self._f_get_bins_by_breaks(test_data, {x_column: feature})
+                        image_path = self._f_save_var_trend(var_trend_bins_test, [x_column],
+                                                            f"test_{x_column}_{'_'.join(var_breaks_list)}")
+                        var_trend_images_test.append(image_path[0])
+
+                f_display_images_by_side(var_trend_images_train, display, title=f"训练集",
+                                         image_path_list2=var_trend_images_test,
+                                         title2="测试集")
 
         return metric_value_dict

+ 2 - 2
trainer/train.py

@@ -25,11 +25,11 @@ class TrainPipeline():
 
     def train(self, ) -> Dict[str, MetricFucEntity]:
         # 处理数据,获取候选特征
-        candidate_feature = self._filter_strategy.filter(self._data)
+        candidate_feature, numeric_candidate_dict_all = self._filter_strategy.filter(self._data)
         # 生成训练数据
         data_prepared = self._filter_strategy.feature_generate(self._data, candidate_feature)
         # 特征信息
-        metric_value_dict_feature = self._filter_strategy.feature_report(self._data, candidate_feature)
+        metric_value_dict_feature = self._filter_strategy.feature_report(self._data, candidate_feature, numeric_candidate_dict_all)
 
         metric_value_dict_train = self._model.train(data_prepared, *data_prepared.args, **data_prepared.kwargs)
         self.metric_value_dict = {**metric_value_dict_feature, **metric_value_dict_train}