فهرست منبع

add: StrategyIv

yq 4 ماه پیش
والد
کامیت
395977cafc

+ 8 - 0
config/data_process_config_template.json

@@ -0,0 +1,8 @@
+{
+  "y_column": "creditability",
+  "x_columns_candidate": [
+    "duration_in_month",
+    "credit_amount"
+  ],
+  "bin_search_interval": 0.05
+}

+ 3 - 2
entitys/__init__.py

@@ -6,14 +6,15 @@
 """
 from .train_config_entity import TrainConfigEntity
 from .data_process_config_entity import DataProcessConfigEntity
-from .data_feaure_entity import DataFeatureEntity, DataSplitEntity, DataPreparedEntity
+from .data_feaure_entity import DataFeatureEntity, DataSplitEntity, DataPreparedEntity, CandidateFeatureEntity
 from .db_config_entity import DbConfigEntity
 from .metric_config_entity import MetricConfigEntity
 from .metric_entity import MetricTrainEntity, MetricFucEntity
 from .monitor_metric_config_entity import MonitorMetricConfigEntity
 
 __all__ = ['DataFeatureEntity', 'DbConfigEntity', 'MetricTrainEntity', 'MonitorMetricConfigEntity', 'MetricConfigEntity',
-           'MetricFucEntity', 'DataSplitEntity', 'DataProcessConfigEntity', 'TrainConfigEntity', 'DataPreparedEntity']
+           'MetricFucEntity', 'DataSplitEntity', 'DataProcessConfigEntity', 'TrainConfigEntity', 'DataPreparedEntity',
+           'CandidateFeatureEntity']
 
 if __name__ == "__main__":
     pass

+ 25 - 6
entitys/data_feaure_entity.py

@@ -4,9 +4,33 @@
 @time: 2024/11/1
 @desc: 
 """
+
 import pandas as pd
 
 
class CandidateFeatureEntity():
    """A feature that survived feature screening.

    Bundles the column name with the binning split points and the best
    information value (IV) found for that column during the search.
    """

    def __init__(self, x_column: str, breaks_list: list = None, iv_max: float = None):
        # State is write-once here and exposed read-only via properties.
        self._x_column = x_column
        self._breaks_list = breaks_list
        self._iv_max = iv_max

    @property
    def iv_max(self):
        """Best IV achieved for this column, or None if not computed."""
        return self._iv_max

    @property
    def breaks_list(self):
        """Bin split points chosen for this column, or None if not computed."""
        return self._breaks_list

    @property
    def x_column(self):
        """Name of the candidate x column."""
        return self._x_column
+
+
 class DataFeatureEntity():
     def __init__(self, data: pd.DataFrame, x_columns: list, y_column: str):
         self._data = data
@@ -52,11 +76,10 @@ class DataPreparedEntity():
 
 
 class DataSplitEntity():
-    def __init__(self, train_data: pd.DataFrame, val_data: pd.DataFrame, test_data: pd.DataFrame, y_column: str):
+    def __init__(self, train_data: pd.DataFrame, val_data: pd.DataFrame, test_data: pd.DataFrame):
         self._train_data = train_data
         self._val_data = val_data
         self._test_data = test_data
-        self._y_column = y_column
 
     @property
     def train_data(self):
@@ -70,10 +93,6 @@ class DataSplitEntity():
     def test_data(self):
         return self._test_data
 
-    @property
-    def y_column(self):
-        return self._y_column
-
 
 if __name__ == "__main__":
     pass

+ 28 - 7
entitys/data_process_config_entity.py

@@ -6,16 +6,16 @@
 """
 import json
 import os
-from typing import List
+from typing import List, Union
 
 from commom import GeneralException
 from enums import ResultCodesEnum
 
 
 class DataProcessConfigEntity():
-    def __init__(self, y_column: str, x_columns_candidate: List[str], fill_method: str, split_method: str,
-                 feature_search_method: str, bin_search_interval: float = 0.05, iv_threshold: float = 0.03,
-                 x_candidate_num: int = 10):
+    def __init__(self, y_column: str, x_columns_candidate: List[str], fill_method: str = None, split_method: str = None,
+                 feature_search_strategy: str = 'iv', bin_search_interval: float = 0.05, iv_threshold: float = 0.03,
+                 x_candidate_num: int = 10, special_values: Union[dict, list] = None):
         # 定义y变量
         self._y_column = y_column
 
@@ -29,7 +29,7 @@ class DataProcessConfigEntity():
         self._split_method = split_method
 
         # 最优特征搜索方法
-        self._feature_search_method = feature_search_method
+        self._feature_search_strategy = feature_search_strategy
 
         # 使用iv筛变量时的阈值
         self._iv_threshold = iv_threshold
@@ -40,10 +40,20 @@ class DataProcessConfigEntity():
         # 最终保留多少x变量
         self._x_candidate_num = x_candidate_num
 
+        self._special_values = special_values
+
+    @property
+    def candidate_num(self):
+        return self._x_candidate_num
+
     @property
     def y_column(self):
         return self._y_column
 
+    @property
+    def x_columns_candidate(self):
+        return self._x_columns_candidate
+
     @property
     def fill_method(self):
         return self._fill_method
@@ -53,8 +63,8 @@ class DataProcessConfigEntity():
         return self._split_method
 
     @property
-    def feature_search_method(self):
-        return self._feature_search_method
+    def feature_search_strategy(self):
+        return self._feature_search_strategy
 
     @property
     def iv_threshold(self):
@@ -64,6 +74,17 @@ class DataProcessConfigEntity():
     def bin_search_interval(self):
         return self._bin_search_interval
 
+    @property
+    def special_values(self):
+        return self._special_values
+
+    def get_special_values(self, column: str = None):
+        if column is None or isinstance(self._special_values, list):
+            return self._special_values
+        if isinstance(self._special_values, dict) and column is not None:
+            return self._special_values.get(column, [])
+        return []
+
     @staticmethod
     def from_config(config_path: str):
         """

+ 2 - 1
enums/__init__.py

@@ -5,7 +5,8 @@
 @desc: 枚举值
 """
 from .bins_strategy_enum import BinsStrategyEnum
+from .filter_strategy_enum import FilterStrategyEnum
 from .placeholder_prefix_enum import PlaceholderPrefixEnum
 from .result_codes_enum import ResultCodesEnum
 
-__all__ = ['ResultCodesEnum', 'PlaceholderPrefixEnum', 'BinsStrategyEnum']
+__all__ = ['ResultCodesEnum', 'PlaceholderPrefixEnum', 'BinsStrategyEnum', 'FilterStrategyEnum']

+ 11 - 0
enums/filter_strategy_enum.py

@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/14
+@desc: 特征筛选策略枚举值
+"""
+from enum import Enum
+
+
class FilterStrategyEnum(Enum):
    """Supported feature-filtering strategies (value matches the config string)."""

    IV = "iv"

+ 2 - 0
feature/__init__.py

@@ -5,5 +5,7 @@
 @desc: 特征挖掘
 """
 
+
+
 if __name__ == "__main__":
     pass

+ 0 - 146
feature/feature_utils.py

@@ -4,14 +4,10 @@
 @time: 2023/12/28
 @desc:  特征工具类
 """
-from itertools import combinations_with_replacement
 
-import numpy as np
 import pandas as pd
-import scorecardpy as sc
 import toad as td
 from sklearn.preprocessing import KBinsDiscretizer
-from tqdm import tqdm
 
 from entitys import DataSplitEntity
 from enums import BinsStrategyEnum
@@ -126,145 +122,3 @@ def f_get_corr(data: DataSplitEntity, meth: str = 'spearman') -> pd.DataFrame:
 
def f_get_ivf(data: DataSplitEntity) -> pd.DataFrame:
    # TODO: unimplemented stub — currently returns None, not a DataFrame
    # as the annotation promises. Presumably meant to compute per-feature
    # IV statistics; confirm intent before calling.
    pass
-
-
-def _f_distribute_balls(balls, boxes):
-    # 计算在 balls - 1 个空位中放入 boxes - 1 个隔板的方法数
-    total_ways = combinations_with_replacement(range(balls + boxes - 1), boxes - 1)
-    distribute_list = []
-    # 遍历所有可能的隔板位置
-    for combo in total_ways:
-        # 根据隔板位置分配球
-        distribution = [0] * boxes
-        start = 0
-        for i, divider in enumerate(combo):
-            distribution[i] = divider - start + 1
-            start = divider + 1
-        distribution[-1] = balls - start  # 最后一个箱子的球数
-        # 确保每个箱子至少有一个球
-        if all(x > 0 for x in distribution):
-            distribute_list.append(distribution)
-    return distribute_list
-
-
-def f_get_best_bins(data: DataSplitEntity, x_column: str, special_values: list = []):
-    # 贪婪搜索【训练集】及【测试集】加起来【iv】值最高的且【单调】的分箱
-    interval = 0.05
-
-    def _n0(x):
-        return sum(x == 0)
-
-    def _n1(x):
-        return sum(x == 1)
-
-    def _get_sv_bins(df, x_column, y_column, special_values):
-        # special_values_bins
-        sv_bin_list = []
-        for special in special_values:
-            dtm = df[df[x_column] == special]
-            if len(dtm) != 0:
-                dtm['bin'] = [str(special)] * len(dtm)
-                binning = dtm.groupby(['bin'], group_keys=False)[y_column].agg(
-                    [_n0, _n1]).reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
-                binning['is_special_values'] = [True] * len(binning)
-                sv_bin_list.append(binning)
-        return sv_bin_list
-
-    def _get_bins(df, x_column, y_column, breaks_list):
-        dtm = pd.DataFrame({'y': df[y_column], 'value': df[x_column]})
-        bstbrks = [-np.inf] + breaks_list + [np.inf]
-        labels = ['[{},{})'.format(bstbrks[i], bstbrks[i + 1]) for i in range(len(bstbrks) - 1)]
-        dtm.loc[:, 'bin'] = pd.cut(dtm['value'], bstbrks, right=False, labels=labels)
-        dtm['bin'] = dtm['bin'].astype(str)
-        bins = dtm.groupby(['bin'], group_keys=False)['y'].agg([_n0, _n1]) \
-            .reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
-        bins['is_special_values'] = [False] * len(bins)
-        return bins
-
-    def _calculation_iv(bins):
-        bins['count'] = bins['good'] + bins['bad']
-        bins['badprob'] = bins['bad'] / bins['count']
-        # 单调性判断
-        bad_prob = bins[bins['is_special_values'] == False]['badprob'].values.tolist()
-        if not f_judge_monto(bad_prob):
-            return -1
-        # 计算iv
-        infovalue = pd.DataFrame({'good': bins['good'], 'bad': bins['bad']}) \
-            .replace(0, 0.9) \
-            .assign(
-            DistrBad=lambda x: x.bad / sum(x.bad),
-            DistrGood=lambda x: x.good / sum(x.good)
-        ) \
-            .assign(iv=lambda x: (x.DistrBad - x.DistrGood) * np.log(x.DistrBad / x.DistrGood)) \
-            .iv
-        bins['bin_iv'] = infovalue
-        bins['total_iv'] = bins['bin_iv'].sum()
-        iv = bins['total_iv'].values[0]
-        return iv
-
-    train_data = data.train_data
-    train_data_filter = train_data[~train_data[x_column].isin(special_values)]
-    train_data_filter = train_data_filter.sort_values(by=x_column, ascending=True)
-    train_data_x = train_data_filter[x_column]
-
-    test_data = data.test_data
-    test_data_filter = None
-    if test_data is not None and len(test_data) != 0:
-        test_data_filter = test_data[~test_data[x_column].isin(special_values)]
-        test_data_filter = test_data_filter.sort_values(by=x_column, ascending=True)
-
-    # 构造数据切分点
-    # 计算 2 - 5 箱的情况
-    distribute_list = []
-    points_list = []
-    for bin_num in list(range(2, 6)):
-        distribute_list.extend(_f_distribute_balls(int(1 / interval), bin_num))
-    for distribute in distribute_list:
-        point_list_cache = []
-        point_percentile_list = [sum(distribute[0:idx + 1]) * interval for idx, _ in enumerate(distribute[0:-1])]
-        for point_percentile in point_percentile_list:
-            point = train_data_x.iloc[int(len(train_data_x) * point_percentile)]
-            if point not in point_list_cache:
-                point_list_cache.append(point)
-        if point_list_cache not in points_list:
-            points_list.append(point_list_cache)
-    # IV与单调性过滤
-    iv_max = 0
-    breaks_list = []
-    train_sv_bin_list = _get_sv_bins(train_data, x_column, data.y_column, special_values)
-    test_sv_bin_list = None
-    if test_data_filter is not None:
-        test_sv_bin_list = _get_sv_bins(test_data, x_column, data.y_column, special_values)
-    for point_list in tqdm(points_list):
-        train_bins = _get_bins(train_data_filter, x_column, data.y_column, point_list)
-        # 与special_values合并计算iv
-        for sv_bin in train_sv_bin_list:
-            train_bins = pd.concat((train_bins, sv_bin))
-        train_iv = _calculation_iv(train_bins)
-        # 只限制训练集的单调性与iv值大小
-        if train_iv < 0.03:
-            continue
-
-        test_iv = 0
-        if test_data_filter is not None:
-            test_bins = _get_bins(test_data_filter, x_column, data.y_column, point_list)
-            for sv_bin in test_sv_bin_list:
-                test_bins = pd.concat((test_bins, sv_bin))
-            test_iv = _calculation_iv(test_bins)
-
-        iv = train_iv + test_iv
-        if iv > iv_max:
-            iv_max = iv
-            breaks_list = point_list
-
-    return iv_max, breaks_list
-
-
-if __name__ == "__main__":
-    dat = sc.germancredit()
-    dat["creditability"] = dat["creditability"].apply(lambda x: 1 if x == "bad" else 0)
-    data = DataSplitEntity(dat[:700], None, dat[700:], "creditability")
-    iv_max, breaks_list = f_get_best_bins(data, "duration_in_month", special_values=[24, 12])
-    print(iv_max, breaks_list)
-
-    pass

+ 23 - 0
feature/filter_strategy_base.py

@@ -0,0 +1,23 @@
+# -*- coding:utf-8 -*-
+"""
+@author: yq
+@time: 2024/1/2
+@desc: 特征筛选基类
+"""
+import abc
+
+from entitys import DataProcessConfigEntity
+
+
class FilterStrategyBase(metaclass=abc.ABCMeta):
    """Abstract base class for feature-filtering strategies.

    A concrete strategy receives the data-process configuration at
    construction time and implements :meth:`filter`.
    """

    def __init__(self, data_process_config: "DataProcessConfigEntity", *args, **kwargs):
        # Stored once; subclasses read it through the read-only property.
        self._data_process_config = data_process_config

    @property
    def data_process_config(self):
        """The configuration this strategy was constructed with."""
        return self._data_process_config

    @abc.abstractmethod
    def filter(self, *args, **kwargs):
        """Select candidate features; must be implemented by subclasses."""
        ...

+ 9 - 0
feature/filter_strategy_factory.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/25
+@desc: 特征筛选策略工厂
+"""
+
+if __name__ == "__main__":
+    pass

+ 166 - 0
feature/strategy_iv.py

@@ -0,0 +1,166 @@
+# -*- coding:utf-8 -*-
+"""
+@author: yq
+@time: 2024/1/2
+@desc: iv值及单调性筛选类
+"""
+from itertools import combinations_with_replacement
+from typing import List
+
+import numpy as np
+import pandas as pd
+
+from entitys import DataSplitEntity, CandidateFeatureEntity, DataProcessConfigEntity
+from .feature_utils import f_judge_monto
+from .filter_strategy_base import FilterStrategyBase
+
+
class StrategyIv(FilterStrategyBase):
    """IV-and-monotonicity feature filter.

    For each candidate column, greedily searches the monotonic binning with
    the highest combined train + test information value (IV), then keeps the
    top ``candidate_num`` columns ranked by that IV.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _f_get_best_bins(self, data: DataSplitEntity, x_column: str):
        """Greedy search for the binning of ``x_column`` that maximizes
        train-IV + test-IV while keeping the train bad-rate monotonic.

        Returns:
            (iv_max, breaks_list): best combined IV and its split points;
            (0, []) when no monotonic binning clears the IV threshold.
        """
        # Progress bar is optional; fall back to a no-op if tqdm is missing.
        # (Fix: this import previously sat in the middle of the function body.)
        try:
            from tqdm import tqdm
        except ImportError:  # pragma: no cover
            def tqdm(iterable):
                return iterable

        interval = self.data_process_config.bin_search_interval
        iv_threshold = self.data_process_config.iv_threshold
        special_values = self.data_process_config.get_special_values(x_column)
        y_column = self.data_process_config.y_column

        def _n0(x):
            # Number of good samples (y == 0) in a bin.
            return sum(x == 0)

        def _n1(x):
            # Number of bad samples (y == 1) in a bin.
            return sum(x == 1)

        def _f_distribute_balls(balls, boxes):
            # Enumerate all ways to split `balls` units into `boxes` positive
            # parts (stars-and-bars, every box non-empty).
            total_ways = combinations_with_replacement(range(balls + boxes - 1), boxes - 1)
            distribute_list = []
            for combo in total_ways:
                distribution = [0] * boxes
                start = 0
                for i, divider in enumerate(combo):
                    distribution[i] = divider - start + 1
                    start = divider + 1
                distribution[-1] = balls - start  # remainder goes to the last box
                # Keep only distributions where every box got at least one unit.
                if all(x > 0 for x in distribution):
                    distribute_list.append(distribution)
            return distribute_list

        def _get_sv_bins(df, x_column, y_column, special_values):
            # Build one standalone bin per special value present in the data.
            sv_bin_list = []
            for special in special_values:
                # .copy(): the filtered frame is a slice; writing 'bin' into it
                # directly triggers pandas' SettingWithCopyWarning.
                dtm = df[df[x_column] == special].copy()
                if len(dtm) != 0:
                    dtm['bin'] = [str(special)] * len(dtm)
                    binning = dtm.groupby(['bin'], group_keys=False)[y_column].agg(
                        [_n0, _n1]).reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
                    binning['is_special_values'] = [True] * len(binning)
                    sv_bin_list.append(binning)
            return sv_bin_list

        def _get_bins(df, x_column, y_column, breaks_list):
            # Bin the regular (non-special) values on the given split points.
            dtm = pd.DataFrame({'y': df[y_column], 'value': df[x_column]})
            bstbrks = [-np.inf] + breaks_list + [np.inf]
            labels = ['[{},{})'.format(bstbrks[i], bstbrks[i + 1]) for i in range(len(bstbrks) - 1)]
            dtm.loc[:, 'bin'] = pd.cut(dtm['value'], bstbrks, right=False, labels=labels)
            dtm['bin'] = dtm['bin'].astype(str)
            bins = dtm.groupby(['bin'], group_keys=False)['y'].agg([_n0, _n1]) \
                .reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
            bins['is_special_values'] = [False] * len(bins)
            return bins

        def _calculation_iv(bins):
            # Returns the total IV of the binning, or -1 when the bad rate is
            # not monotonic across the regular (non-special) bins.
            bins['count'] = bins['good'] + bins['bad']
            bins['badprob'] = bins['bad'] / bins['count']
            bad_prob = bins[~bins['is_special_values']]['badprob'].values.tolist()
            if not f_judge_monto(bad_prob):
                return -1
            # Zero counts are replaced with 0.9 so the log stays well-defined.
            infovalue = pd.DataFrame({'good': bins['good'], 'bad': bins['bad']}) \
                .replace(0, 0.9) \
                .assign(
                DistrBad=lambda x: x.bad / sum(x.bad),
                DistrGood=lambda x: x.good / sum(x.good)
            ) \
                .assign(iv=lambda x: (x.DistrBad - x.DistrGood) * np.log(x.DistrBad / x.DistrGood)) \
                .iv
            bins['bin_iv'] = infovalue
            bins['total_iv'] = bins['bin_iv'].sum()
            iv = bins['total_iv'].values[0]
            return iv

        train_data = data.train_data
        train_data_filter = train_data[~train_data[x_column].isin(special_values)]
        train_data_filter = train_data_filter.sort_values(by=x_column, ascending=True)
        train_data_x = train_data_filter[x_column]

        test_data = data.test_data
        test_data_filter = None
        if test_data is not None and len(test_data) != 0:
            test_data_filter = test_data[~test_data[x_column].isin(special_values)]
            test_data_filter = test_data_filter.sort_values(by=x_column, ascending=True)

        # Build candidate split points for 2 to 5 bins: each way of
        # distributing 1/interval quantile units over the bins becomes one
        # candidate list of quantile cut points taken from the sorted train x.
        distribute_list = []
        points_list = []
        for bin_num in range(2, 6):
            distribute_list.extend(_f_distribute_balls(int(1 / interval), bin_num))
        for distribute in distribute_list:
            point_list_cache = []
            point_percentile_list = [sum(distribute[0:idx + 1]) * interval for idx, _ in enumerate(distribute[0:-1])]
            for point_percentile in point_percentile_list:
                point = train_data_x.iloc[int(len(train_data_x) * point_percentile)]
                if point not in point_list_cache:
                    point_list_cache.append(point)
            if point_list_cache not in points_list:
                points_list.append(point_list_cache)

        # Evaluate candidates. Monotonicity and the IV threshold are enforced
        # on the training set only; the test IV just adds to the score.
        iv_max = 0
        breaks_list = []
        train_sv_bin_list = _get_sv_bins(train_data, x_column, y_column, special_values)
        test_sv_bin_list = None
        if test_data_filter is not None:
            test_sv_bin_list = _get_sv_bins(test_data, x_column, y_column, special_values)
        for point_list in tqdm(points_list):
            train_bins = _get_bins(train_data_filter, x_column, y_column, point_list)
            # Merge in the special-value bins before computing IV.
            for sv_bin in train_sv_bin_list:
                train_bins = pd.concat((train_bins, sv_bin))
            train_iv = _calculation_iv(train_bins)
            # train_iv is -1 for non-monotonic binnings, so this also rejects them.
            if train_iv < iv_threshold:
                continue

            test_iv = 0
            if test_data_filter is not None:
                test_bins = _get_bins(test_data_filter, x_column, y_column, point_list)
                for sv_bin in test_sv_bin_list:
                    test_bins = pd.concat((test_bins, sv_bin))
                test_iv = _calculation_iv(test_bins)

            iv = train_iv + test_iv
            if iv > iv_max:
                iv_max = iv
                breaks_list = point_list

        return iv_max, breaks_list

    def filter(self, data: DataSplitEntity, *args, **kwargs):
        """Score every candidate column by its best-binning IV and return the
        top ``candidate_num`` columns as CandidateFeatureEntity objects,
        sorted by IV descending.
        """
        x_columns_candidate = self.data_process_config.x_columns_candidate
        candidate_num = self.data_process_config.candidate_num
        candidate_list: List[CandidateFeatureEntity] = []
        for x_column in x_columns_candidate:
            iv_max, breaks_list = self._f_get_best_bins(data, x_column)
            candidate_list.append(CandidateFeatureEntity(x_column, breaks_list, iv_max))
        candidate_list.sort(key=lambda x: x.iv_max, reverse=True)

        return candidate_list[:candidate_num]

+ 16 - 0
strategy_test1.py

@@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/1
+@desc: 
+"""
+from entitys import DataSplitEntity, DataProcessConfigEntity
+from feature.strategy_iv import StrategyIv
+
# Smoke test: run the IV filter strategy end-to-end on the German credit data.
if __name__ == "__main__":
    import scorecardpy as sc

    dat = sc.germancredit()
    # Binarize the target: 1 = bad credit, 0 = good credit.
    dat["creditability"] = dat["creditability"].apply(lambda x: 1 if x == "bad" else 0)
    # First 700 rows as train, remainder as test; no validation split.
    data = DataSplitEntity(dat[:700], None, dat[700:])
    strategy = StrategyIv(DataProcessConfigEntity.from_config('./config/data_process_config_template.json'))
    strategy.filter(data)