
modify: code refactoring and optimization

yq, 1 month ago
Parent commit: aa1a2ec910
46 changed files with 1502 additions and 1107 deletions
  1. __init__.py (+4 -4)
  2. app.py (+8 -8)
  3. commom/__init__.py (+2 -2)
  4. commom/traceId_util.py (+1 -0)
  5. commom/utils.py (+8 -3)
  6. config/ml_config_template.json (+3 -4)
  7. config/monitor_config_template.json (+0 -0)
  8. config/train_config_template.json (+0 -3)
  9. data/process/data_process.py (+3 -3)
  10. easy_ml_demo.ipynb (+8 -14)
  11. entitys/__init__.py (+6 -9)
  12. entitys/data_feaure_entity.py (+16 -80)
  13. entitys/metric_config_entity.py (+0 -36)
  14. entitys/metric_entity.py (+27 -1)
  15. entitys/ml_config_entity.py (+113 -59)
  16. entitys/monitor_entity.py (+7 -7)
  17. enums/__init__.py (+3 -3)
  18. enums/bins_strategy_enum.py (+0 -12)
  19. enums/context_enum.py (+18 -0)
  20. enums/feature_strategy_enum.py (+2 -2)
  21. feature/__init__.py (+2 -2)
  22. feature/feature_strategy_base.py (+67 -0)
  23. feature/feature_strategy_factory.py (+6 -6)
  24. feature/filter_strategy_base.py (+0 -49)
  25. feature/strategy_iv.py (+0 -513)
  26. feature/woe/__init__.py (+9 -0)
  27. feature/woe/entity.py (+166 -0)
  28. feature/woe/strategy_woe.py (+611 -0)
  29. feature/woe/utils.py (+40 -38)
  30. init/__init__.py (+42 -1)
  31. metric_test.py (+6 -7)
  32. metric_test2.py (+8 -8)
  33. metrics/metric_base.py (+2 -2)
  34. metrics/metric_by_sql_general.py (+3 -3)
  35. model/model_base.py (+33 -11)
  36. model/model_factory.py (+1 -1)
  37. model/model_lr.py (+166 -116)
  38. model/model_utils.py (+16 -18)
  39. monitor/monitor_metric.py (+6 -6)
  40. monitor/report_generate.py (+5 -5)
  41. pipeline/__init__.py (+2 -2)
  42. pipeline/pipeline.py (+70 -0)
  43. template/模型开发报告模板_lr.docx (BIN)
  44. train_test.py (+5 -17)
  45. trainer/train.py (+0 -45)
  46. webui/utils.py (+7 -7)

+ 4 - 4
__init__.py

@@ -9,14 +9,14 @@ from os.path import dirname, realpath
 
 sys.path.append(dirname(realpath(__file__)))
 
-from feature import FilterStrategyFactory
+from feature import FeatureStrategyFactory
 from model import ModelFactory
-from trainer import TrainPipeline
+from pipeline import Pipeline
 
 from data import DataLoaderMysql
 from entitys import DbConfigEntity, DataSplitEntity
 from monitor import MonitorMetric
 from metrics import MetricBase
 
-__all__ = ['MonitorMetric', 'DataLoaderMysql', 'DbConfigEntity', 'MetricBase', 'FilterStrategyFactory', 'ModelFactory',
-           'TrainPipeline', 'DataSplitEntity']
+__all__ = ['MonitorMetric', 'DataLoaderMysql', 'DbConfigEntity', 'MetricBase', 'FeatureStrategyFactory', 'ModelFactory',
+           'Pipeline', 'DataSplitEntity']
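
With the factory and pipeline renamed, downstream imports change accordingly. A minimal sketch against the new exports (only get_strategy is shown in this commit; anything beyond it is assumed):

```python
# Sketch: consuming the renamed exports; the "woe" key matches FeatureStrategyEnum below.
from feature import FeatureStrategyFactory
from model import ModelFactory
from pipeline import Pipeline

strategy_clazz = FeatureStrategyFactory.get_strategy("woe")
```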

+ 8 - 8
app.py

@@ -48,12 +48,12 @@ with gr.Blocks() as demo:
                         search_strategy = gr.Dropdown(["iv"], value="iv", label="特征搜索策略")
                     with gr.Row():
                         y_column = gr.Dropdown(label="Y标签列", interactive=True, info="其值应该是0或者1")
-                        x_columns_candidate = gr.Dropdown(label="X特征列", multiselect=True, interactive=True,
+                        x_columns = gr.Dropdown(label="X特征列", multiselect=True, interactive=True,
                                                           info="不应包含Y特征列,不选择则使用全部特征")
                     with gr.Row():
-                        x_candidate_num = gr.Number(value=10, label="建模最多保留特征数", info="保留最重要的N个特征",
+                        max_feature_num = gr.Number(value=10, label="建模最多保留特征数", info="保留最重要的N个特征",
                                                     interactive=True)
-                        sample_rate = gr.Slider(0.05, 1, value=0.1, label="分箱组合采样率", info="对2-5箱所有分箱组合进行采样",
+                        bin_sample_rate = gr.Slider(0.05, 1, value=0.1, label="分箱组合采样率", info="对2-5箱所有分箱组合进行采样",
                                                 step=0.01, interactive=True)
                         special_values = gr.Textbox(label="特殊值", placeholder="可以是dict list str格式",
                                                     info="分箱时特殊值会单独一个分箱")
@@ -63,16 +63,16 @@ with gr.Blocks() as demo:
                     train_button = gr.Button("开始训练", variant="primary", elem_id="train_button")
 
                     input_elems.update(
-                        {model_type, search_strategy, y_column, x_columns_candidate, x_candidate_num, sample_rate,
+                        {model_type, search_strategy, y_column, x_columns, max_feature_num, bin_sample_rate,
                          special_values, test_split_strategy, test_split_rate, train_button
                          })
                     elem_dict.update(dict(
                         model_type=model_type,
                         feature_search_strategy=search_strategy,
                         y_column=y_column,
-                        x_columns_candidate=x_columns_candidate,
-                        x_candidate_num=x_candidate_num,
-                        sample_rate=sample_rate,
+                        x_columns=x_columns,
+                        max_feature_num=max_feature_num,
+                        bin_sample_rate=bin_sample_rate,
                         special_values=special_values,
                         test_split_strategy=test_split_strategy,
                         test_split_rate=test_split_rate,
@@ -104,7 +104,7 @@ with gr.Blocks() as demo:
 
                 project_name.change(fn=f_project_is_exist, inputs=input_elems)
                 file_data.upload(fn=f_data_upload, inputs=input_elems, outputs=[data_upload, data_insight, y_column,
-                                                                                x_columns_candidate])
+                                                                                x_columns])
                 train_button.click(fn=f_train, inputs=input_elems,
                                    outputs=[train_progress, auc_df, gallery_auc, download_report])
                 download_report.click(fn=f_download_report, inputs=input_elems, outputs=download_report)

+ 2 - 2
commom/__init__.py

@@ -8,8 +8,8 @@ from .logger import get_logger
 from .placeholder_func import f_fill_placeholder
 from .user_exceptions import GeneralException
 from .utils import f_get_clazz_in_module, f_clazz_to_json, f_get_date, f_get_datetime, f_save_train_df, f_format_float, \
-    f_df_to_image, f_display_images_by_side, NumpyEncoder
+    f_df_to_image, f_display_images_by_side, NumpyEncoder, f_display_title
 
 __all__ = ['f_get_clazz_in_module', 'f_clazz_to_json', 'GeneralException', 'get_logger', 'f_fill_placeholder',
            'f_get_date', 'f_get_datetime', 'f_save_train_df', 'f_format_float', 'f_df_to_image',
-           'f_display_images_by_side', 'NumpyEncoder']
+           'f_display_images_by_side', 'f_display_title', 'NumpyEncoder']

+ 1 - 0
commom/traceId_util.py

@@ -6,6 +6,7 @@
 """
 
 import logging
+
 from contextvars import ContextVar
 
 request_id_context = ContextVar('request_id')

+ 8 - 3
commom/utils.py

@@ -93,27 +93,32 @@ def _f_image_to_base64(image_path):
         return img_str.decode("utf-8")
 
 
-def f_display_images_by_side(image_path_list, display, title: str = "", width: int = 500,
+def f_display_images_by_side(display, image_path_list, title: str = "", width: int = 500,
                              image_path_list2: Union[list, None] = None, title2: str = "", ):
     if isinstance(image_path_list, str):
         image_path_list = [image_path_list]
     # justify-content:space-around; can push images out of bounds in some cases
     html_str = '<div style="display:flex;">'
     if title != "":
-        html_str += '<h3>{}</h3>'.format(title)
+        html_str += '<div>{}</div>'.format(title)
     for image_path in image_path_list:
         html_str += f'<img src="data:image/png;base64,{_f_image_to_base64(image_path)}" style="width:{width}px;"/>'
     html_str += '</div>'
     if not (image_path_list2 is None or len(image_path_list2) == 0):
         html_str += '<div style="display:flex;">'
         if title2 != "":
-            html_str += '<h3>{}</h3>'.format(title2)
+            html_str += '<div>{}</div>'.format(title2)
         for image_path in image_path_list2:
             html_str += f'<img src="data:image/png;base64,{_f_image_to_base64(image_path)}" style="width:{width}px;"/>'
         html_str += '</div>'
     display.display(display.HTML(html_str))
 
 
+def f_display_title(display, title):
+    html_str = f"<h2>{title}</h2>"
+    display.display(display.HTML(html_str))
+
+
 class f_clazz_to_json(JSONEncoder):
     def default(self, o):
         return o.__dict__
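
The display handle now comes first in f_display_images_by_side. A usage sketch of the reordered signature and the new f_display_title helper, assuming an IPython environment and illustrative image paths:

```python
# Sketch: 'display' is the IPython.display module, passed as the first argument.
from IPython import display

from commom import f_display_images_by_side, f_display_title

f_display_title(display, "Variable trends")
f_display_images_by_side(display, ["trend_train.png"], title="train",
                         image_path_list2=["trend_test.png"], title2="test")
```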

+ 3 - 4
config/data_process_config_template.json → config/ml_config_template.json

@@ -1,18 +1,17 @@
 {
-  "sample_rate": 0.01,
+  "bin_sample_rate": 0.01,
   "bin_search_interval": 0.05,
   "feature_search_strategy": "iv",
-  "x_candidate_num": 10,
+  "max_feature_num": 10,
   "special_values": {"age_in_years": [36]},
   "breaks_list": {
     "duration_in_month": [13, 17,  47],
     "credit_amount": [2001, 3000, 4000, 5000,  10000],
-    "age_in_years": [35, 50],
     "purpose": ["car (used)%,%others","radio/television%,%retraining","furniture/equipment%,%business","repairs%,%domestic appliances%,%education%,%car (new)"]
   },
   "format_bin": false,
   "y_column": "creditability",
-  "x_columns_candidate": [
+  "x_columns": [
     "duration_in_month",
     "credit_amount",
     "age_in_years",

+ 0 - 0
config/model_monitor_config_template.json → config/monitor_config_template.json


+ 0 - 3
config/train_config_template.json

@@ -1,3 +0,0 @@
-{
-  "model_type": "lr"
-}

+ 3 - 3
data/process/data_process.py

@@ -8,13 +8,13 @@
 import pandas as pd
 
 from commom import f_save_train_df
-from entitys import DataProcessConfigEntity
+from entitys import MlConfigEntity
 
 
 class DataProcess():
 
-    def __init__(self, data_process_config: DataProcessConfigEntity):
-        self._data_process_config = data_process_config
+    def __init__(self, ml_config: MlConfigEntity):
+        self._ml_config = ml_config
 
     def data_fill(self, df: pd.DataFrame) -> pd.DataFrame:
         """

+ 8 - 14
easy_ml_demo.ipynb

(Diff not shown: the file is too large.)


+ 6 - 9
entitys/__init__.py

@@ -4,17 +4,14 @@
 @time: 2024/10/30
 @desc: data entity classes
 """
-from .data_feaure_entity import DataFeatureEntity, DataSplitEntity, DataPreparedEntity, CandidateFeatureEntity
-from .data_process_config_entity import DataProcessConfigEntity
+from .data_feaure_entity import DataFeatureEntity, DataSplitEntity
 from .db_config_entity import DbConfigEntity
-from .metric_config_entity import MetricConfigEntity
-from .metric_entity import MetricFucEntity
-from .monitor_metric_config_entity import MonitorMetricConfigEntity
-from .train_config_entity import TrainConfigEntity
+from .metric_entity import MetricFucResultEntity, MetricConfigEntity
+from .ml_config_entity import MlConfigEntity
+from .monitor_entity import MonitorConfigEntity
 
-__all__ = ['DataFeatureEntity', 'DbConfigEntity', 'MonitorMetricConfigEntity', 'MetricConfigEntity',
-           'MetricFucEntity', 'DataSplitEntity', 'DataProcessConfigEntity', 'TrainConfigEntity', 'DataPreparedEntity',
-           'CandidateFeatureEntity']
+__all__ = ['DataFeatureEntity', 'DbConfigEntity', 'MonitorConfigEntity', 'MetricConfigEntity', 'MetricFucResultEntity',
+           'DataSplitEntity', 'MlConfigEntity']
 
 if __name__ == "__main__":
     pass

+ 16 - 80
entitys/data_feaure_entity.py

@@ -10,125 +10,61 @@ import pandas as pd
 from commom import f_format_float
 
 
-class CandidateFeatureEntity():
-    """
-    经过特征筛选后的特征信息
-    """
-
-    def __init__(self, x_column: str, breaks_list: list = None, iv_max: float = None):
-        self._x_column = x_column
-        self._breaks_list = breaks_list
-        self._iv_max = iv_max
-
-    @property
-    def x_column(self):
-        return self._x_column
-
-    @property
-    def breaks_list(self) -> list:
-        return self._breaks_list
-
-    @property
-    def iv_max(self):
-        return self._iv_max
-
-
 class DataFeatureEntity():
     """
     Data features are ready
     """
 
-    def __init__(self, data: pd.DataFrame, x_columns: list, y_column: str):
-        self._data = data
-        self._x_columns = x_columns
-        self._y_column = y_column
-
-    @property
-    def data(self):
-        return self._data
+    def __init__(self, data_x: pd.DataFrame, data_y: pd.Series):
+        self._data_x = data_x
+        self._data_y = data_y
 
     @property
     def x_columns(self):
-        return self._x_columns
+        return self._data_x.columns.tolist()
 
     @property
-    def y_column(self):
-        return self._y_column
-
-    def get_Xdata(self):
-        return self._data[self._x_columns]
+    def data_x(self):
+        return self._data_x
 
-    def get_Ydata(self):
-        return self._data[self._y_column]
+    @property
+    def data_y(self):
+        return self._data_y
 
     def get_odds0(self):
-        train_good_len = len(self._data[self._data[self._y_column] == 0])
-        train_bad_len = len(self._data[self._data[self._y_column] == 1])
+        train_good_len = len(self._data_y[self._data_y == 0])
+        train_bad_len = len(self._data_y[self._data_y == 1])
         odds0 = train_bad_len / train_good_len
         return odds0
 
 
-class DataPreparedEntity():
-    """
-    Train/val/test features are ready
-    """
-
-    def __init__(self, train_data: DataFeatureEntity, val_data: DataFeatureEntity, test_data: DataFeatureEntity,
-                 *args, **kwargs):
-        self._train_data = train_data
-        self._val_data = val_data
-        self._test_data = test_data
-        self.args = args
-        self.kwargs = kwargs
-
-    @property
-    def train_data(self):
-        return self._train_data
-
-    @property
-    def val_data(self):
-        return self._val_data
-
-    @property
-    def test_data(self):
-        return self._test_data
-
-
 class DataSplitEntity():
     """
     Initial train/test split of the raw data
     """
 
-    def __init__(self, train_data: pd.DataFrame, val_data: pd.DataFrame = None, test_data: pd.DataFrame = None):
+    def __init__(self, train_data: pd.DataFrame, test_data: pd.DataFrame):
         self._train_data = train_data
-        self._val_data = val_data
         self._test_data = test_data
 
     @property
     def train_data(self):
         return self._train_data
 
-    @property
-    def val_data(self):
-        return self._val_data
-
     @property
     def test_data(self):
         return self._test_data
 
     def get_distribution(self, y_column) -> pd.DataFrame:
         df = pd.DataFrame()
+
         train_data_len = len(self._train_data)
         train_bad_len = len(self._train_data[self._train_data[y_column] == 1])
         train_bad_rate = f"{f_format_float(train_bad_len / train_data_len * 100, 2)}%"
 
-        test_data_len = 0
-        test_bad_len = 0
-        test_bad_rate = "-"
-        if self._test_data is not None:
-            test_data_len = len(self._test_data)
-            test_bad_len = len(self._test_data[self._test_data[y_column] == 1])
-            test_bad_rate = f"{f_format_float(test_bad_len / test_data_len * 100, 2)}%"
+        test_data_len = len(self._test_data)
+        test_bad_len = len(self._test_data[self._test_data[y_column] == 1])
+        test_bad_rate = f"{f_format_float(test_bad_len / test_data_len * 100, 2)}%"
 
         total = train_data_len + test_data_len
         bad_total = train_bad_len + test_bad_len
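
DataFeatureEntity now takes X and y separately, and DataSplitEntity requires a test set. A small self-contained sketch of both:

```python
# Sketch: exercising the slimmed-down entities.
import pandas as pd

from entitys import DataFeatureEntity, DataSplitEntity

df = pd.DataFrame({"f1": [1, 2, 3, 4], "y": [0, 1, 0, 1]})
feature = DataFeatureEntity(data_x=df[["f1"]], data_y=df["y"])
print(feature.x_columns)    # ['f1']
print(feature.get_odds0())  # bad/good = 2/2 = 1.0

split = DataSplitEntity(train_data=df, test_data=df)
print(split.get_distribution("y"))
```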

+ 0 - 36
entitys/metric_config_entity.py

@@ -1,36 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-@author: yq
-@time: 2024/11/1
-@desc: metric configuration
-"""
-
-
-class MetricConfigEntity():
-    def __init__(self, metric_code: str, metric_func: str, *args, **kwargs):
-        self._args = args
-        self._kwargs = kwargs
-        # metric_code is used to look up the placeholder when filling the template
-        self._metric_code = metric_code
-        # metric_func is used to look up the corresponding metric calculation function
-        self._metric_func = metric_func
-
-    @property
-    def args(self):
-        return self._args
-
-    @property
-    def kwargs(self):
-        return self._kwargs
-
-    @property
-    def metric_code(self):
-        return self._metric_code
-
-    @property
-    def metric_func(self):
-        return self._metric_func
-
-
-if __name__ == "__main__":
-    pass

+ 27 - 1
entitys/metric_entity.py

@@ -9,7 +9,7 @@ from typing import Union
 import pandas as pd
 
 
-class MetricFucEntity():
+class MetricFucResultEntity():
     """
     Result class for metric calculation functions
     """
@@ -54,5 +54,31 @@ class MetricFucEntity():
         return self._image_size
 
 
+class MetricConfigEntity():
+    def __init__(self, metric_code: str, metric_func: str, *args, **kwargs):
+        self._args = args
+        self._kwargs = kwargs
+        # metric_code is used to look up the placeholder when filling the template
+        self._metric_code = metric_code
+        # metric_func is used to look up the corresponding metric calculation function
+        self._metric_func = metric_func
+
+    @property
+    def args(self):
+        return self._args
+
+    @property
+    def kwargs(self):
+        return self._kwargs
+
+    @property
+    def metric_code(self):
+        return self._metric_code
+
+    @property
+    def metric_func(self):
+        return self._metric_func
+
+
 if __name__ == "__main__":
     pass
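
MetricConfigEntity now lives next to MetricFucResultEntity and is consumed by MonitorConfigEntity below. A construction sketch; metric_func must name a registered metric function, so "metric_by_sql_general" is assumed from this commit's file list:

```python
# Sketch: a metric config entry as it would be read from the monitor config JSON.
from entitys import MetricConfigEntity

mc = MetricConfigEntity(metric_code="auc_train", metric_func="metric_by_sql_general")
print(mc.metric_code, mc.metric_func)
```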

+ 113 - 59
entitys/data_process_config_entity.py → entitys/ml_config_entity.py

@@ -11,34 +11,73 @@ from typing import List, Union
 from commom import GeneralException, f_get_datetime
 from config import BaseConfig
 from enums import ResultCodesEnum
-
-
-class DataProcessConfigEntity():
-    def __init__(self, y_column: str, x_columns_candidate: List[str] = None, fill_method: str = None, fill_value=None,
-                 split_method: str = None, feature_search_strategy: str = 'iv', bin_search_interval: float = 0.05,
-                 iv_threshold: float = 0.03, iv_threshold_wide: float = 0.05, corr_threshold: float = 0.4,
-                 sample_rate: float = 0.1, x_candidate_num: int = 10, special_values: Union[dict, list, str] = None,
-                 project_name: str = None, format_bin: str = False, breaks_list: dict = None, pos_neg_cnt=1,
-                 monto_contrast_change_cnt=0, jupyter=False, strees=False, strees_sample_times=100,
-                 strees_bad_rate_list: List[float] = [], *args, **kwargs):
+from init import warning_ignore
+
+
+class MlConfigEntity():
+    def __init__(self,
+                 y_column: str,
+                 project_name: str = None,
+                 x_columns: List[str] = [],
+                 columns_exclude: List[str] = [],
+                 columns_include: List[str] = [],
+                 columns_anns: dict = {},
+                 bin_search_interval: float = 0.05,
+                 bin_sample_rate: float = 0.1,
+                 iv_threshold: float = 0.01,
+                 corr_threshold: float = 0.4,
+                 psi_threshold: float = 0.2,
+                 vif_threshold: float = 10,
+                 monto_shift_threshold=1,
+                 trend_shift_threshold=0,
+                 max_feature_num: int = 10,
+                 special_values: Union[dict, list, str] = None,
+                 breaks_list: dict = None,
+                 format_bin: bool = False,
+                 jupyter_print=False,
+                 stress_test=False,
+                 stress_sample_times=100,
+                 stress_bad_rate_list: List[float] = [],
+                 model_type="lr",
+                 feature_strategy="woe",
+                 fill_method: str = None,
+                 fill_value=None,
+                 *args, **kwargs):
+
+        self._model_type = model_type
+
+        self._feature_strategy = feature_strategy
+
+        self._psi_threshold = psi_threshold
+
+        self._vif_threshold = vif_threshold
+
+        # x columns to exclude
+        self._columns_exclude = columns_exclude
+
+        # x columns to force-keep
+        self._columns_include = columns_include
+
+        # column annotations
+        self._columns_anns = columns_anns
 
         # whether to run a stress test
-        self._strees = strees
+        self._stress_test = stress_test

         # number of sampling rounds for the stress test
-        self._strees_sample_times = strees_sample_times
+        self._stress_sample_times = stress_sample_times

         # bad-rate list to simulate in the stress test
-        self._strees_bad_rate_list = strees_bad_rate_list
+        self._stress_bad_rate_list = stress_bad_rate_list

         # whether to print output in Jupyter
-        self._jupyter = jupyter
+        self._jupyter_print = jupyter_print

         # allowed number of monotonicity direction changes
-        self._pos_neg_cnt = pos_neg_cnt
+        self._monto_shift_threshold = monto_shift_threshold

         # allowed number of trend-consistency changes
-        self._monto_contrast_change_cnt = monto_contrast_change_cnt
+        self._trend_shift_threshold = trend_shift_threshold

         # whether to enable coarse binning
         self._format_bin = format_bin
@@ -50,7 +89,7 @@ class DataProcessConfigEntity():
         self._y_column = y_column
 
         # candidate x columns
-        self._x_columns_candidate = x_columns_candidate
+        self._x_columns = x_columns

         # missing-value fill method
         self._fill_method = fill_method
@@ -58,23 +97,14 @@ class DataProcessConfigEntity():
         # missing-value fill value
         self._fill_value = fill_value

-        # data split method
-        self._split_method = split_method
-
-        # optimal feature search strategy
-        self._feature_search_strategy = feature_search_strategy
-
         # IV threshold when filtering variables by IV
         self._iv_threshold = iv_threshold

-        # IV threshold for coarse variable filtering
-        self._iv_threshold_wide = iv_threshold_wide
-
         # data granularity for greedy bin search; should be between 0.01 and 0.1
         self._bin_search_interval = bin_search_interval

         # maximum number of x variables to keep in the end
-        self._x_candidate_num = x_candidate_num
+        self._max_feature_num = max_feature_num
 
         self._special_values = special_values
 
@@ -84,43 +114,64 @@ class DataProcessConfigEntity():
         self._corr_threshold = corr_threshold
 
         # sampling rate for greedy search; only effective for 4 and 5 bins
-        self._sample_rate = sample_rate
+        self._bin_sample_rate = bin_sample_rate
 
         if self._project_name is None or len(self._project_name) == 0:
             self._base_dir = os.path.join(BaseConfig.train_path, f"{f_get_datetime()}")
         else:
             self._base_dir = os.path.join(BaseConfig.train_path, self._project_name)
 
+        self._include = columns_include + list(self.breaks_list.keys())
+
         os.makedirs(self._base_dir, exist_ok=True)
 
+        if self._jupyter_print:
+            warning_ignore()
+
+    @property
+    def model_type(self):
+        return self._model_type
+
+    @property
+    def feature_strategy(self):
+        return self._feature_strategy
+
+    @property
+    def psi_threshold(self):
+        return self._psi_threshold
+
+    @property
+    def vif_threshold(self):
+        return self._vif_threshold
+
     @property
-    def strees(self):
-        return self._strees
+    def stress_test(self):
+        return self._stress_test
 
     @property
-    def strees_sample_times(self):
+    def stress_sample_times(self):
 
-        return self._strees_sample_times
+        return self._stress_sample_times
 
     @property
-    def strees_bad_rate_list(self):
-        return self._strees_bad_rate_list
+    def stress_bad_rate_list(self):
+        return self._stress_bad_rate_list
 
     @property
-    def jupyter(self):
-        return self._jupyter
+    def jupyter_print(self):
+        return self._jupyter_print
 
     @property
     def base_dir(self):
         return self._base_dir
 
     @property
-    def pos_neg_cnt(self):
-        return self._pos_neg_cnt
+    def monto_shift_threshold(self):
+        return self._monto_shift_threshold
 
     @property
-    def monto_contrast_change_cnt(self):
-        return self._monto_contrast_change_cnt
+    def trend_shift_threshold(self):
+        return self._trend_shift_threshold
 
     @property
     def format_bin(self):
@@ -131,44 +182,44 @@ class DataProcessConfigEntity():
         return self._project_name
 
     @property
-    def sample_rate(self):
-        return self._sample_rate
+    def bin_sample_rate(self):
+        return self._bin_sample_rate
 
     @property
     def corr_threshold(self):
         return self._corr_threshold
 
     @property
-    def iv_threshold_wide(self):
-        return self._iv_threshold_wide
-
-    @property
-    def candidate_num(self):
-        return self._x_candidate_num
+    def max_feature_num(self):
+        return self._max_feature_num
 
     @property
     def y_column(self):
         return self._y_column
 
     @property
-    def x_columns_candidate(self):
-        return self._x_columns_candidate
+    def x_columns(self):
+        return self._x_columns
 
     @property
-    def fill_value(self):
-        return self._fill_value
+    def columns_exclude(self):
+        return self._columns_exclude
 
     @property
-    def fill_method(self):
-        return self._fill_method
+    def columns_include(self):
+        return self._columns_include
 
     @property
-    def split_method(self):
-        return self._split_method
+    def columns_anns(self):
+        return self._columns_anns
 
     @property
-    def feature_search_strategy(self):
-        return self._feature_search_strategy
+    def fill_value(self):
+        return self._fill_value
+
+    @property
+    def fill_method(self):
+        return self._fill_method
 
     @property
     def iv_threshold(self):
@@ -214,6 +265,9 @@ class DataProcessConfigEntity():
             return self._breaks_list.get(column, [])
         return []
 
+    def is_include(self, column: str) -> bool:
+        return column in self._include
+
     def f_get_save_path(self, file_name: str) -> str:
         path = os.path.join(self._base_dir, file_name)
         return path
@@ -229,7 +283,7 @@ class DataProcessConfigEntity():
         else:
             raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"指配置文件【{config_path}】不存在")
 
-        return DataProcessConfigEntity(**j)
+        return MlConfigEntity(**j)
 
 
 if __name__ == "__main__":
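
Taken together, the renames map the old knobs onto the new ones: sample_rate → bin_sample_rate, x_candidate_num → max_feature_num, pos_neg_cnt → monto_shift_threshold, monto_contrast_change_cnt → trend_shift_threshold, and strees_* → stress_*. A direct-construction sketch with illustrative values:

```python
# Sketch: direct construction with the renamed parameters.
from entitys import MlConfigEntity

ml_config = MlConfigEntity(
    project_name="demo",
    y_column="creditability",
    x_columns=["duration_in_month", "credit_amount"],
    max_feature_num=10,
    bin_sample_rate=0.1,
    breaks_list={"duration_in_month": [13, 17, 47]},
)
print(ml_config.is_include("duration_in_month"))  # True: it has fixed break points
print(ml_config.max_feature_num)                  # 10
```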

+ 7 - 7
entitys/monitor_metric_config_entity.py → entitys/monitor_entity.py

@@ -2,19 +2,19 @@
 """
 @author: yq
 @time: 2024/11/1
-@desc: metric monitoring configuration
+@desc: collection of common metric entities
 """
 import json
 import os
 from typing import List, Dict
 
 from commom import GeneralException
-from entitys import MetricConfigEntity
 from enums import ResultCodesEnum
 from metrics import f_get_metric_clazz_dict, MetricBase
+from .metric_entity import MetricConfigEntity
 
 
-class MonitorMetricConfigEntity():
+class MonitorConfigEntity():
 
     def __init__(self, metric_config_list: List[MetricConfigEntity], template_path: str):
         self._template_path = template_path
@@ -36,10 +36,10 @@ class MonitorMetricConfigEntity():
             metric_code = metric_config.metric_code
             # the metric function does not exist
             if metric_func_name not in self._metric_clazz_dict.keys():
-                raise GeneralException(ResultCodesEnum.NOT_FOUND, message = f"指标函数【{metric_func_name}】不存在")
+                raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"指标函数【{metric_func_name}】不存在")
             # the metric code is not unique
             if metric_code in metric_dict.keys():
-                raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message = f"指标code【{metric_code}】不唯一")
+                raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"指标code【{metric_code}】不唯一")
             metric_clazz = self._metric_clazz_dict[metric_func_name]
             metric_dict[metric_code] = metric_clazz(*metric_config.args, **metric_config.kwargs)
         return metric_dict
@@ -53,11 +53,11 @@ class MonitorMetricConfigEntity():
             with open(config_path, mode="r", encoding="utf-8") as f:
                 j = json.loads(f.read())
         else:
-            raise GeneralException(ResultCodesEnum.NOT_FOUND, message = f"指标监控配置文件【{config_path}】不存在")
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"指标监控配置文件【{config_path}】不存在")
         metric_config_list = j.get("metric_config_list", [])
         metric_config_list = [MetricConfigEntity(**i) for i in metric_config_list]
         j["metric_config_list"] = metric_config_list
-        return MonitorMetricConfigEntity(**j)
+        return MonitorConfigEntity(**j)
 
 
 if __name__ == "__main__":
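
A construction sketch for the renamed MonitorConfigEntity; the template path is hypothetical and the metric function must exist in the registry returned by f_get_metric_clazz_dict:

```python
# Sketch: MonitorConfigEntity resolves each metric_func against the registry
# and rejects duplicate metric_code values, as shown above.
from entitys import MetricConfigEntity, MonitorConfigEntity

monitor_config = MonitorConfigEntity(
    metric_config_list=[MetricConfigEntity(metric_code="auc", metric_func="metric_by_sql_general")],
    template_path="template/monitor_template.docx",  # hypothetical path
)
```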

+ 3 - 3
enums/__init__.py

@@ -4,10 +4,10 @@
 @time: 2024/10/30
 @desc: enum values
 """
-from .bins_strategy_enum import BinsStrategyEnum
-from .filter_strategy_enum import FilterStrategyEnum
+from .context_enum import ContextEnum
+from .feature_strategy_enum import FeatureStrategyEnum
 from .model_enum import ModelEnum
 from .placeholder_prefix_enum import PlaceholderPrefixEnum
 from .result_codes_enum import ResultCodesEnum
 
-__all__ = ['ResultCodesEnum', 'PlaceholderPrefixEnum', 'BinsStrategyEnum', 'FilterStrategyEnum', 'ModelEnum']
+__all__ = ['ResultCodesEnum', 'PlaceholderPrefixEnum', 'FeatureStrategyEnum', 'ModelEnum', 'ContextEnum']

+ 0 - 12
enums/bins_strategy_enum.py

@@ -1,12 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-@author: yq
-@time: 2024/11/14
-@desc: binning strategy enum values
-"""
-from enum import Enum
-
-
-class BinsStrategyEnum(Enum):
-    QUANTILE = "quantile"
-    WIDTH = "width"

+ 18 - 0
enums/context_enum.py

@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/14
+@desc: context key enum values
+"""
+from enum import Enum
+
+
+class ContextEnum(Enum):
+    BIN_INFO_FILTERED = "bin_info_filtered"
+    HOMO_BIN_INFO_NUMERIC_SET = "homo_bin_info_numeric_set"
+    WOEBIN = "woebin"
+    FILTER_FAST = "filter_fast"
+    FILTER_NUMERIC = "filter_numeric"
+    FILTER_CORR = "filter_corr"
+    FILTER_VIF = "filter_vif"
+    FILTER_IVTOP = "filter_ivtop"

+ 2 - 2
enums/filter_strategy_enum.py → enums/feature_strategy_enum.py

@@ -7,5 +7,5 @@
 from enum import Enum
 
 
-class FilterStrategyEnum(Enum):
-    IV = "iv"
+class FeatureStrategyEnum(Enum):
+    WOE = "woe"

+ 2 - 2
feature/__init__.py

@@ -4,6 +4,6 @@
 @time: 2024/11/1
 @desc: feature mining
 """
-from .filter_strategy_factory import FilterStrategyFactory
+from .feature_strategy_factory import FeatureStrategyFactory
 
-__all__ = ['FilterStrategyFactory']
+__all__ = ['FeatureStrategyFactory']

+ 67 - 0
feature/feature_strategy_base.py

@@ -0,0 +1,67 @@
+# -*- coding:utf-8 -*-
+"""
+@author: yq
+@time: 2024/1/2
+@desc: feature selection base class
+"""
+import abc
+from typing import Dict
+
+import pandas as pd
+
+from entitys import MlConfigEntity, MetricFucResultEntity
+
+
+class FeatureStrategyBase(metaclass=abc.ABCMeta):
+
+    def __init__(self, ml_config: MlConfigEntity = None, *args, **kwargs):
+        if ml_config is not None:
+            self._ml_config = ml_config
+        else:
+            self._ml_config = MlConfigEntity(*args, **kwargs)
+
+    @property
+    def ml_config(self):
+        return self._ml_config
+
+    @abc.abstractmethod
+    def feature_search(self, *args, **kwargs):
+        """
+        Feature selection
+        """
+        pass
+
+    @abc.abstractmethod
+    def feature_generate(self, *args, **kwargs) -> pd.DataFrame:
+        """
+        Feature transformation
+        """
+        pass
+
+    @abc.abstractmethod
+    def feature_save(self, *args, **kwargs):
+        """
+        Save features
+        """
+        pass
+
+    @abc.abstractmethod
+    def feature_load(self, path: str, *args, **kwargs):
+        """
+        Load features
+        """
+        pass
+
+    @abc.abstractmethod
+    def feature_report(self, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
+        """
+        Feature report
+        """
+        pass
+
+    @abc.abstractmethod
+    def jupyter_print(self, *args, **kwargs):
+        """
+        Notebook output
+        """
+        pass
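
Any concrete strategy implements the six hooks above; StrategyWoe in feature/woe/strategy_woe.py is the shipped implementation. A minimal no-op sketch of the contract:

```python
# Sketch: a trivial strategy satisfying the abstract contract; real logic omitted.
from typing import Dict

import pandas as pd

from entitys import MetricFucResultEntity
from feature.feature_strategy_base import FeatureStrategyBase


class StrategyNoop(FeatureStrategyBase):
    def feature_search(self, *args, **kwargs):
        pass  # select nothing

    def feature_generate(self, *args, **kwargs) -> pd.DataFrame:
        return pd.DataFrame()  # no transformed features

    def feature_save(self, *args, **kwargs):
        pass

    def feature_load(self, path: str, *args, **kwargs):
        pass

    def feature_report(self, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
        return {}

    def jupyter_print(self, *args, **kwargs):
        pass


# Kwargs are forwarded to MlConfigEntity, so y_column is required;
# breaks_list={} is passed defensively since it is merged with columns_include at init.
noop = StrategyNoop(y_column="creditability", breaks_list={})
```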

+ 6 - 6
feature/filter_strategy_factory.py → feature/feature_strategy_factory.py

@@ -7,19 +7,19 @@
 from typing import Type
 
 from commom import GeneralException
-from enums import FilterStrategyEnum, ResultCodesEnum
-from .filter_strategy_base import FilterStrategyBase
-from .strategy_iv import StrategyIv
+from enums import FeatureStrategyEnum, ResultCodesEnum
+from .feature_strategy_base import FeatureStrategyBase
+from .woe.strategy_woe import StrategyWoe
 
 strategy_map = {
-    FilterStrategyEnum.IV.value: StrategyIv
+    FeatureStrategyEnum.WOE.value: StrategyWoe
 }
 
 
-class FilterStrategyFactory():
+class FeatureStrategyFactory():
 
     @staticmethod
-    def get_strategy(strategy: str) -> Type[FilterStrategyBase]:
+    def get_strategy(strategy: str) -> Type[FeatureStrategyBase]:
         if strategy not in strategy_map.keys():
             raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"特征搜索策略【{strategy}】不存在")
         strategy = strategy_map.get(strategy)
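
Resolving and instantiating a strategy through the renamed factory; constructor kwargs are forwarded to MlConfigEntity per the base class above:

```python
# Sketch: look up the WOE strategy class and build it from config kwargs.
from enums import FeatureStrategyEnum
from feature import FeatureStrategyFactory

strategy_clazz = FeatureStrategyFactory.get_strategy(FeatureStrategyEnum.WOE.value)
strategy = strategy_clazz(y_column="creditability", breaks_list={})
```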

+ 0 - 49
feature/filter_strategy_base.py

@@ -1,49 +0,0 @@
-# -*- coding:utf-8 -*-
-"""
-@author: yq
-@time: 2024/1/2
-@desc: feature selection base class
-"""
-import abc
-from typing import Dict, Tuple, List
-
-from entitys import DataProcessConfigEntity, DataPreparedEntity, CandidateFeatureEntity, MetricFucEntity
-from init import warning_ignore
-
-
-class FilterStrategyBase(metaclass=abc.ABCMeta):
-
-    def __init__(self, data_process_config: DataProcessConfigEntity = None, *args, **kwargs):
-        if data_process_config is not None:
-            self._data_process_config = data_process_config
-        else:
-            self._data_process_config = DataProcessConfigEntity(*args, **kwargs)
-
-        jupyter = self._data_process_config.jupyter
-        if jupyter:
-            warning_ignore()
-
-    @property
-    def data_process_config(self):
-        return self._data_process_config
-
-    @abc.abstractmethod
-    def filter(self, *args, **kwargs) -> Tuple[Dict[str, CandidateFeatureEntity], Dict[str, List[CandidateFeatureEntity]]]:
-        """
-        Feature selection
-        """
-        pass
-
-    @abc.abstractmethod
-    def feature_generate(self, *args, **kwargs) -> DataPreparedEntity:
-        """
-        Feature transformation
-        """
-        pass
-
-    @abc.abstractmethod
-    def feature_report(self, *args, **kwargs) -> Dict[str, MetricFucEntity]:
-        """
-        Feature report
-        """
-        pass

+ 0 - 513
feature/strategy_iv.py

@@ -1,513 +0,0 @@
-# -*- coding:utf-8 -*-
-"""
-@author: yq
-@time: 2024/1/2
-@desc: IV and monotonicity based filtering class
-"""
-import json
-from itertools import combinations_with_replacement
-from typing import List, Dict, Tuple
-
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import scorecardpy as sc
-import seaborn as sns
-from pandas.core.dtypes.common import is_numeric_dtype
-from tqdm import tqdm
-
-from commom import f_display_images_by_side, NumpyEncoder
-from entitys import DataSplitEntity, CandidateFeatureEntity, DataPreparedEntity, DataFeatureEntity, MetricFucEntity
-from .feature_utils import f_judge_monto, f_get_corr, f_get_ivf, f_format_bin, f_monto_contrast
-from .filter_strategy_base import FilterStrategyBase
-
-
-class StrategyIv(FilterStrategyBase):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def _f_get_iv_by_bins(self, bins) -> pd.DataFrame:
-        iv = {key_: [round(value_['total_iv'].max(), 4)] for key_, value_ in bins.items()}
-        iv = pd.DataFrame.from_dict(iv, orient='index', columns=['IV']).reset_index()
-        iv = iv.sort_values('IV', ascending=False).reset_index(drop=True)
-        iv.columns = ['变量', 'IV']
-        return iv
-
-    def _f_get_var_corr_image(self, train_woe):
-        if len(train_woe.columns.to_list()) <= 1:
-            return None
-        train_corr = f_get_corr(train_woe)
-        plt.figure(figsize=(12, 12))
-        sns.heatmap(train_corr, vmax=1, square=True, cmap='RdBu', annot=True)
-        plt.title('Variables Correlation', fontsize=15)
-        plt.yticks(rotation=0)
-        plt.xticks(rotation=90)
-        path = self.data_process_config.f_get_save_path(f"var_corr.png")
-        plt.savefig(path)
-        return path
-
-    def _f_save_var_trend(self, bins, x_columns_candidate, prefix):
-        image_path_list = []
-        for k in x_columns_candidate:
-            bin_df = bins[k]
-            # bin_df["bin"] = bin_df["bin"].apply(lambda x: re.sub(r"(\d+\.\d+)",
-            #                                                      lambda m: "{:.2f}".format(float(m.group(0))), x))
-            sc.woebin_plot(bin_df)
-            path = self.data_process_config.f_get_save_path(f"{prefix}_{k}.png")
-            plt.savefig(path)
-            image_path_list.append(path)
-        return image_path_list
-
-    def _f_get_bins_by_breaks(self, data: pd.DataFrame, candidate_dict: Dict[str, CandidateFeatureEntity],
-                              y_column=None):
-        y_column = self.data_process_config.y_column if y_column is None else y_column
-        special_values = self.data_process_config.special_values
-        x_columns_candidate = list(candidate_dict.keys())
-        breaks_list = {}
-        for column, candidate in candidate_dict.items():
-            breaks_list[column] = candidate.breaks_list
-        bins = sc.woebin(data[x_columns_candidate + [y_column]], y=y_column, breaks_list=breaks_list,
-                         special_values=special_values, print_info=False)
-        return bins
-
-    def _f_corr_filter(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity]) -> List[str]:
-        # drop variables by correlation
-        corr_threshold = self.data_process_config.corr_threshold
-        breaks_list = self.data_process_config.breaks_list
-        train_data = data.train_data
-        x_columns_candidate = list(candidate_dict.keys())
-
-        bins = self._f_get_bins_by_breaks(train_data, candidate_dict)
-        train_woe = sc.woebin_ply(train_data[x_columns_candidate], bins, print_info=False)
-        corr_df = f_get_corr(train_woe)
-        corr_dict = corr_df.to_dict()
-        for column, corr in corr_dict.items():
-            column = column.replace("_woe", "")
-            if column not in x_columns_candidate:
-                continue
-            for challenger_column, challenger_corr in corr.items():
-                challenger_column = challenger_column.replace("_woe", "")
-                if challenger_corr < corr_threshold or column == challenger_column \
-                        or challenger_column not in x_columns_candidate:
-                    continue
-                iv_max = candidate_dict[column].iv_max
-                challenger_iv_max = candidate_dict[challenger_column].iv_max
-                if iv_max > challenger_iv_max:
-                    if challenger_column not in breaks_list.keys():
-                        x_columns_candidate.remove(challenger_column)
-                else:
-                    if column not in breaks_list.keys():
-                        x_columns_candidate.remove(column)
-                    break
-        return x_columns_candidate
-
-    def _f_wide_filter(self, data: DataSplitEntity) -> Dict:
-        # coarse variable filtering
-        train_data = data.train_data
-        test_data = data.test_data
-        special_values = self.data_process_config.special_values
-        breaks_list = self.data_process_config.breaks_list.copy()
-        y_column = self.data_process_config.y_column
-        iv_threshold_wide = self.data_process_config.iv_threshold_wide
-        x_columns_candidate = self.data_process_config.x_columns_candidate
-        if x_columns_candidate is None or len(x_columns_candidate) == 0:
-            x_columns_candidate = train_data.columns.tolist()
-        if y_column in x_columns_candidate:
-            x_columns_candidate.remove(y_column)
-
-        bins_train = sc.woebin(train_data[x_columns_candidate + [y_column]], y=y_column, bin_num_limit=5,
-                               special_values=special_values, breaks_list=breaks_list, print_info=False)
-
-        for column, bin in bins_train.items():
-            breaks_list[column] = list(bin['breaks'])
-        bins_test = None
-        if test_data is not None and len(test_data) != 0:
-            bins_test = sc.woebin(test_data[x_columns_candidate + [y_column]], y=y_column,
-                                  special_values=special_values, breaks_list=breaks_list, print_info=False)
-        bins_iv_dict = {}
-        for column, bin_train in bins_train.items():
-            train_iv = bin_train['total_iv'][0]
-            test_iv = 0
-            if bins_test is not None:
-                bin_test = bins_test[column]
-                test_iv = bin_test['total_iv'][0]
-            iv_max = train_iv + test_iv
-            if train_iv < iv_threshold_wide:
-                continue
-            bins_iv_dict[column] = {"iv_max": iv_max, "breaks_list": breaks_list[column]}
-        return bins_iv_dict
-
-    def _f_get_best_bins_numeric(self, data: DataSplitEntity, x_column: str):
-        # greedy-search the monotonic binning with the highest combined train + test IV
-        interval = self.data_process_config.bin_search_interval
-        iv_threshold = self.data_process_config.iv_threshold
-        special_values = self.data_process_config.get_special_values(x_column)
-        breaks_list = self.data_process_config.get_breaks_list(x_column)
-        y_column = self.data_process_config.y_column
-        sample_rate = self.data_process_config.sample_rate
-        format_bin = self.data_process_config.format_bin
-        pos_neg_cnt = self.data_process_config.pos_neg_cnt
-        monto_contrast_change_cnt = self.data_process_config.monto_contrast_change_cnt
-
-        def _n0(x):
-            return sum(x == 0)
-
-        def _n1(x):
-            return sum(x == 1)
-
-        def _f_distribute_balls(balls, boxes):
-            # number of ways to place boxes - 1 dividers into balls - 1 gaps
-            total_ways = combinations_with_replacement(range(balls + boxes - 1), boxes - 1)
-            distribute_list = []
-            # iterate over all possible divider positions
-            for combo in total_ways:
-                # allocate balls according to the divider positions
-                distribution = [0] * boxes
-                start = 0
-                for i, divider in enumerate(combo):
-                    distribution[i] = divider - start + 1
-                    start = divider + 1
-                distribution[-1] = balls - start  # balls in the last box
-                # ensure every box gets at least one ball
-                if all(x > 0 for x in distribution):
-                    distribute_list.append(distribution)
-            return distribute_list
-
-        def _get_sv_bins(df, x_column, y_column, special_values):
-            # special_values_bins
-            sv_bin_list = []
-            for special in special_values:
-                dtm = df[df[x_column] == special]
-                if len(dtm) != 0:
-                    dtm['bin'] = [str(special)] * len(dtm)
-                    binning = dtm.groupby(['bin'], group_keys=False)[y_column].agg(
-                        [_n0, _n1]).reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
-                    binning['is_special_values'] = [True] * len(binning)
-                    sv_bin_list.append(binning)
-            return sv_bin_list
-
-        def _get_bin_left_value(bin: str):
-            if "," not in bin:
-                return float(bin)
-            left = bin.split(",")[0]
-            return float(left[1:])
-
-        def _get_bins(df, x_column, y_column, breaks_list):
-            dtm = pd.DataFrame({'y': df[y_column], 'value': df[x_column]})
-            bstbrks = [-np.inf] + breaks_list + [np.inf]
-            labels = ['[{},{})'.format(bstbrks[i], bstbrks[i + 1]) for i in range(len(bstbrks) - 1)]
-            dtm.loc[:, 'bin'] = pd.cut(dtm['value'], bstbrks, right=False, labels=labels)
-            dtm['bin'] = dtm['bin'].astype(str)
-            bins = dtm.groupby(['bin'], group_keys=False)['y'].agg([_n0, _n1]) \
-                .reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
-            bins['is_special_values'] = [False] * len(bins)
-            bins["ordered"] = bins['bin'].apply(_get_bin_left_value)
-            bins = bins.sort_values(by=["ordered"], ascending=[True])
-            return bins
-
-        def _get_badprob(bins):
-            bins['count'] = bins['good'] + bins['bad']
-            bins['badprob'] = bins['bad'] / bins['count']
-            bad_prob = bins[bins['is_special_values'] == False]['badprob'].values.tolist()
-            return bad_prob
-
-        def _calculation_iv(bins, judge_monto=True, pos_neg_cnt=1):
-            # 单调性判断
-            bad_prob = _get_badprob(bins)
-            if judge_monto and not f_judge_monto(bad_prob, pos_neg_cnt):
-                return -1
-            # 计算iv
-            infovalue = pd.DataFrame({'good': bins['good'], 'bad': bins['bad']}) \
-                .replace(0, 0.9) \
-                .assign(
-                DistrBad=lambda x: x.bad / sum(x.bad),
-                DistrGood=lambda x: x.good / sum(x.good)
-            ) \
-                .assign(iv=lambda x: (x.DistrBad - x.DistrGood) * np.log(x.DistrBad / x.DistrGood)) \
-                .iv
-            bins['bin_iv'] = infovalue
-            bins['total_iv'] = bins['bin_iv'].sum()
-            iv = bins['total_iv'].values[0]
-            return iv
-
-        def _f_sampling(distribute_list: list, sample_rate: float):
-            # subsample; a full greedy search takes too long
-            sampled_list = distribute_list[::int(1 / sample_rate)]
-            return sampled_list
-
-        train_data = data.train_data
-        train_data_filter = train_data[~train_data[x_column].isin(special_values)]
-        train_data_filter = train_data_filter.sort_values(by=x_column, ascending=True)
-        train_data_x = train_data_filter[x_column]
-        train_data_x_describe = train_data_x.describe(percentiles=[0.1, 0.9])
-        train_data_x_max = train_data_x.max()
-
-        test_data = data.test_data
-        test_data_filter = None
-        if test_data is not None and len(test_data) != 0:
-            test_data_filter = test_data[~test_data[x_column].isin(special_values)]
-            test_data_filter = test_data_filter.sort_values(by=x_column, ascending=True)
-
-        # construct candidate split points
-        # evaluate the 2 - 5 bin cases
-        distribute_list = []
-        points_list = []
-        for bin_num in list(range(2, 6)):
-            distribute_list_cache = _f_distribute_balls(int(1 / interval), bin_num)
-            # must subsample for 4+ bins, otherwise it takes too long
-            sample_num = 1000 * sample_rate
-            if sample_rate <= 0.15:
-                sample_num *= 2
-            if bin_num == 4 and len(distribute_list_cache) >= sample_num:
-                distribute_list_cache = _f_sampling(distribute_list_cache, sample_num / len(distribute_list_cache))
-            sample_num = 4000 * sample_rate
-            if bin_num == 5 and len(distribute_list_cache) >= sample_num:
-                distribute_list_cache = _f_sampling(distribute_list_cache, sample_num / len(distribute_list_cache))
-            distribute_list.extend(distribute_list_cache)
-        for distribute in distribute_list:
-            point_list_cache = []
-            point_percentile_list = [sum(distribute[0:idx + 1]) * interval for idx, _ in enumerate(distribute[0:-1])]
-            for point_percentile in point_percentile_list:
-                point = train_data_x.iloc[int(len(train_data_x) * point_percentile)]
-                point = float(point)
-                if format_bin:
-                    point = f_format_bin(train_data_x_describe, point)
-                point = round(point, 2)
-                if point == 0:
-                    continue
-                if point not in point_list_cache and point < train_data_x_max:
-                    point_list_cache.append(point)
-            if point_list_cache not in points_list and len(point_list_cache) != 0:
-                points_list.append(point_list_cache)
-        # filter by IV and monotonicity
-        # get the best binning among the 2 - 5 bin cases
-        bins_enum = {}
-        iv_max = 0
-        breaks_list_target = None
-        judge_monto = True
-        if len(breaks_list) != 0:
-            points_list = [breaks_list]
-            judge_monto = False
-        train_sv_bin_list = _get_sv_bins(train_data, x_column, y_column, special_values)
-        test_sv_bin_list = None
-        if test_data_filter is not None:
-            test_sv_bin_list = _get_sv_bins(test_data, x_column, y_column, special_values)
-        for point_list in points_list:
-            is_discard = 0
-            discard_reason = ""
-            is_monto = 1
-            is_monto_contrast = 1
-            train_bins = _get_bins(train_data_filter, x_column, y_column, point_list)
-            # merge with the special_values bins before computing IV
-            for sv_bin in train_sv_bin_list:
-                train_bins = pd.concat((train_bins, sv_bin))
-            # _calculation_iv includes the monotonicity check and excludes special values
-            train_iv = _calculation_iv(train_bins, judge_monto, pos_neg_cnt)
-            # only constrain monotonicity and IV on the training set
-            if train_iv < iv_threshold:
-                discard_reason = f"训练集iv小于阈值{iv_threshold}"
-                is_discard = 1
-                is_monto = 0
-
-            test_iv = 0
-            if test_data_filter is not None:
-                test_bins = _get_bins(test_data_filter, x_column, y_column, point_list)
-                for sv_bin in test_sv_bin_list:
-                    test_bins = pd.concat((test_bins, sv_bin))
-                test_iv = _calculation_iv(test_bins, judge_monto, pos_neg_cnt)
-                # trend consistency check
-                train_bad_prob = _get_badprob(train_bins)
-                test_bad_prob = _get_badprob(test_bins)
-                if not f_monto_contrast(train_bad_prob, test_bad_prob, monto_contrast_change_cnt) \
-                        and len(breaks_list) == 0:
-                    discard_reason = f"变量趋势一致性不够"
-                    is_discard = 1
-                    is_monto_contrast = 0
-
-            iv = train_iv + test_iv
-
-            if len(breaks_list) == 0:
-                bin_num = len(point_list) + 1
-                if bin_num not in bins_enum.keys():
-                    bins_enum[bin_num] = []
-                bins_enum[bin_num].append({
-                    "is_discard": is_discard,
-                    "is_monto": is_monto,
-                    "is_monto_contrast": is_monto_contrast,
-                    "discard_reason": discard_reason,
-                    "point_list": point_list,
-                    "iv": iv,
-                })
-
-            if iv > iv_max and not is_discard:
-                iv_max = iv
-                breaks_list_target = point_list
-
-        # best split points for each bin count
-        bins_enum_best_point = []
-        for k, v in bins_enum.items():
-            df_bin_enum = pd.DataFrame(data=v)
-            df_bin_enum.sort_values(by=["is_discard", "is_monto", "is_monto_contrast", "iv"],
-                                    ascending=[True, False, False, False], inplace=True)
-            bins_enum_best_point.append(df_bin_enum.iloc[0]["point_list"])
-
-        return iv_max, breaks_list_target, bins_enum_best_point
-
-    def filter(self, data: DataSplitEntity, *args, **kwargs) -> Tuple[
-        Dict[str, CandidateFeatureEntity], Dict[str, List[CandidateFeatureEntity]]]:
-        # coarse filtering
-        bins_iv_dict = self._f_wide_filter(data)
-        x_columns_candidate = list(bins_iv_dict.keys())
-        candidate_num = self.data_process_config.candidate_num
-        candidate_dict: Dict[str, CandidateFeatureEntity] = {}
-        numeric_candidate_dict_all: Dict[str, List[CandidateFeatureEntity]] = {}
-        for x_column in tqdm(x_columns_candidate):
-            if is_numeric_dtype(data.train_data[x_column]):
-                iv_max, breaks_list, bins_enum_best_point = self._f_get_best_bins_numeric(data, x_column)
-                if len(bins_enum_best_point) != 0:
-                    numeric_candidate_dict_all[x_column] = []
-                    for point in bins_enum_best_point:
-                        numeric_candidate_dict_all[x_column].append(CandidateFeatureEntity(x_column, point, 0))
-                if breaks_list is None:
-                    continue
-                candidate_dict[x_column] = CandidateFeatureEntity(x_column, breaks_list, iv_max)
-            else:
-                # categorical columns are handled by scorecardpy for now
-                candidate_dict[x_column] = CandidateFeatureEntity(x_column, bins_iv_dict[x_column]["breaks_list"],
-                                                                  bins_iv_dict[x_column]["iv_max"])
-
-        # further drop variables by correlation
-        x_columns_candidate = self._f_corr_filter(data, candidate_dict)
-        candidate_list: List[CandidateFeatureEntity] = []
-        for x_column, v in candidate_dict.items():
-            if x_column in x_columns_candidate:
-                candidate_list.append(v)
-
-        candidate_list.sort(key=lambda x: x.iv_max, reverse=True)
-        candidate_list = candidate_list[0:candidate_num]
-        candidate_dict = {}
-        for candidate in candidate_list:
-            candidate_dict[candidate.x_column] = candidate
-        return candidate_dict, numeric_candidate_dict_all
-
-    def feature_generate(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity], *args,
-                         **kwargs) -> DataPreparedEntity:
-        train_data = data.train_data
-        val_data = data.val_data
-        test_data = data.test_data
-        y_column = self.data_process_config.y_column
-        x_columns_candidate = list(candidate_dict.keys())
-        bins = self._f_get_bins_by_breaks(train_data, candidate_dict)
-
-        train_woe = sc.woebin_ply(train_data[x_columns_candidate], bins, print_info=False)
-        train_data_feature = DataFeatureEntity(pd.concat((train_woe, train_data[y_column]), axis=1),
-                                               train_woe.columns.tolist(), y_column)
-
-        val_data_feature = None
-        if val_data is not None and len(val_data) != 0:
-            val_woe = sc.woebin_ply(val_data[x_columns_candidate], bins, print_info=False)
-            val_data_feature = DataFeatureEntity(pd.concat((val_woe, val_data[y_column]), axis=1),
-                                                 train_woe.columns.tolist(), y_column)
-
-        test_data_feature = None
-        if test_data is not None and len(test_data) != 0:
-            test_woe = sc.woebin_ply(test_data[x_columns_candidate], bins, print_info=False)
-            test_data_feature = DataFeatureEntity(pd.concat((test_woe, test_data[y_column]), axis=1),
-                                                  train_woe.columns.tolist(), y_column)
-        return DataPreparedEntity(train_data_feature, val_data_feature, test_data_feature, bins=bins,
-                                  data_split_original=data)
-
-    def feature_report(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity],
-                       numeric_candidate_dict_all: Dict[str, List[CandidateFeatureEntity]],
-                       *args, **kwargs) -> Dict[str, MetricFucEntity]:
-        y_column = self.data_process_config.y_column
-        jupyter = self.data_process_config.jupyter
-        x_columns_candidate = list(candidate_dict.keys())
-        train_data = data.train_data
-        test_data = data.test_data
-
-        metric_value_dict = {}
-        # sample distribution
-        metric_value_dict["样本分布"] = MetricFucEntity(table=data.get_distribution(y_column), table_font_size=10,
-                                                    table_cell_width=3)
-        # variable IV and PSI
-        train_bins = self._f_get_bins_by_breaks(train_data, candidate_dict)
-        train_iv = self._f_get_iv_by_bins(train_bins)
-
-        if test_data is not None and len(test_data) != 0:
-            # to compute PSI, simply relabel y to distinguish train from test
-            psi_df = pd.concat((train_data, test_data))
-            psi_df["#target#"] = [1] * len(train_data) + [0] * len(test_data)
-            psi = self._f_get_bins_by_breaks(psi_df, candidate_dict, y_column="#target#")
-            psi = self._f_get_iv_by_bins(psi)
-            psi.columns = ['变量', 'psi']
-            train_iv = pd.merge(train_iv, psi, on="变量", how="left")
-
-            # variable trend - test set
-            test_bins = self._f_get_bins_by_breaks(test_data, candidate_dict)
-            image_path_list = self._f_save_var_trend(test_bins, x_columns_candidate, "test")
-            metric_value_dict["变量趋势-测试集"] = MetricFucEntity(image_path=image_path_list, image_size=4)
-
-        metric_value_dict["变量iv"] = MetricFucEntity(table=train_iv, table_font_size=10, table_cell_width=3)
-        # variable trend - train set
-        image_path_list = self._f_save_var_trend(train_bins, x_columns_candidate, "train")
-        metric_value_dict["变量趋势-训练集"] = MetricFucEntity(image_path=image_path_list, image_size=4)
-        # variable validity
-        train_woe = sc.woebin_ply(train_data[x_columns_candidate], train_bins, print_info=False)
-        var_corr_image_path = self._f_get_var_corr_image(train_woe)
-        # vif
-        vif_df = f_get_ivf(train_woe)
-        metric_value_dict["变量有效性"] = MetricFucEntity(image_path=var_corr_image_path, table=vif_df)
-
-        if jupyter:
-            from IPython import display
-
-            display.display(metric_value_dict["样本分布"].table)
-            # Print variable IV
-            display.display(metric_value_dict["变量iv"].table)
-            # Print VIF
-            display.display(metric_value_dict["变量有效性"].table)
-            # Print variable correlations
-            f_display_images_by_side(metric_value_dict["变量有效性"].image_path, display, width=800)
-
-            # Print variable trends
-            var_trend_train = metric_value_dict["变量趋势-训练集"].image_path
-            var_trend_test = None
-            metric_test = metric_value_dict.get("变量趋势-测试集")
-            if metric_test is not None:
-                var_trend_test = metric_test.image_path
-            f_display_images_by_side(var_trend_train, display, title="变量趋势训练集", image_path_list2=var_trend_test,
-                                     title2="变量趋势测试集")
-            # Print breaks_list
-            breaks_list = {}
-            for x_column, feature in candidate_dict.items():
-                breaks_list[x_column] = feature.breaks_list
-            print("变量切分点:")
-            print(json.dumps(breaks_list, ensure_ascii=False, indent=2, cls=NumpyEncoder))
-
-            # Print recommended split points for all variables
-            print("-----不同分箱数下变量的推荐切分点-----")
-            for x_column, features in numeric_candidate_dict_all.items():
-                print(f"-----【{x_column}】-----")
-                print(f"切分点:")
-                var_trend_images_train = []
-                var_trend_images_test = []
-                for feature in features:
-                    print(json.dumps(feature.breaks_list, ensure_ascii=False, cls=NumpyEncoder))
-                    var_breaks_list = [str(i) for i in feature.breaks_list]
-                    var_trend_bins_train = self._f_get_bins_by_breaks(train_data, {x_column: feature})
-                    image_path = self._f_save_var_trend(var_trend_bins_train, [x_column],
-                                                        f"train_{x_column}_{'_'.join(var_breaks_list)}")
-                    var_trend_images_train.append(image_path[0])
-                    if metric_test is not None:
-                        var_trend_bins_test = self._f_get_bins_by_breaks(test_data, {x_column: feature})
-                        image_path = self._f_save_var_trend(var_trend_bins_test, [x_column],
-                                                            f"test_{x_column}_{'_'.join(var_breaks_list)}")
-                        var_trend_images_test.append(image_path[0])
-
-                f_display_images_by_side(var_trend_images_train, display, title=f"训练集",
-                                         image_path_list2=var_trend_images_test,
-                                         title2="测试集")
-
-        return metric_value_dict

+ 9 - 0
feature/woe/__init__.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2025/2/14
+@desc: 
+"""
+
+if __name__ == "__main__":
+    pass

+ 166 - 0
feature/woe/entity.py

@@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2025/2/14
+@desc: 
+"""
+from typing import Union, List
+
+import pandas as pd
+
+from enums import ContextEnum
+from init import context
+
+
+class BinInfo():
+    def __init__(self,
+                 x_column: str = None,
+                 bin_num: int = None,
+                 points: list = None,
+                 is_auto_bins: int = None,
+                 train_iv: float = None,
+                 test_iv: float = None,
+                 iv: float = None,
+                 is_qualified_iv_train: int = None,
+                 monto_shift_nsv: int = None,
+                 is_qualified_monto_train_nsv: int = None,
+                 trend_shift_nsv: int = None,
+                 is_qualified_trend_nsv: int = None,
+                 psi: float = None,
+                 is_qualified_psi: int = None,
+                 vif: float = None,
+                 ):
+        self.x_column = x_column
+        self.bin_num = bin_num
+        self.points = points
+        self.is_auto_bins = is_auto_bins
+        self.train_iv = train_iv
+        self.test_iv = test_iv
+        self.iv = iv
+        self.is_qualified_iv_train = is_qualified_iv_train
+        self.monto_shift_nsv = monto_shift_nsv
+        self.is_qualified_monto_train_nsv = is_qualified_monto_train_nsv
+        self.trend_shift_nsv = trend_shift_nsv
+        self.is_qualified_trend_nsv = is_qualified_trend_nsv
+        self.psi = psi
+        self.is_qualified_psi = is_qualified_psi
+        self.vif = vif
+
+    def to_dict(self):
+        return self.__dict__
+
+    @staticmethod
+    def ivTopN(data: dict, top_n: int):
+        candidate = list(data.values())
+        candidate.sort(key=lambda x: x.iv, reverse=True)
+        filter_ivtop_overview = ""
+        filter_ivtop_detail = []
+        if top_n < len(candidate):
+            for bin_info in candidate[top_n:]:
+                filter_ivtop_overview = f"{filter_ivtop_overview}{bin_info.x_column} 因为ivtop【{bin_info.iv}】被剔除\n"
+                filter_ivtop_detail.append(bin_info.x_column)
+        candidate = candidate[0:top_n]
+        context.set_filter_info(ContextEnum.FILTER_IVTOP, filter_ivtop_overview, filter_ivtop_detail)
+        return {bin_info.x_column: bin_info for bin_info in candidate}
+
+    @staticmethod
+    def ofConvertByDict(data: dict):
+        bin_info = BinInfo()
+        for k, v in data.items():
+            bin_info.__setattr__(k, v)
+        return bin_info
+
+
+class HomologousBinInfo():
+    """
+     Bin information for the same variable under different binning schemes
+     """
+
+    def __init__(self, x_column: str, is_auto_bins: int = None):
+        self.x_column = x_column
+        self.is_auto_bins = is_auto_bins
+        self.bins_info: List[BinInfo] = []
+
+    def add(self, bin_info: BinInfo):
+        self.bins_info.append(bin_info)
+
+    def convert_to_df(self) -> pd.DataFrame:
+        data = []
+        for bin_info in self.bins_info:
+            data.append(bin_info.to_dict())
+        df_bins_info = pd.DataFrame(data=data)
+        return df_bins_info
+
+    def drop_reason(self, ) -> str:
+        df_bins_info = self.convert_to_df()
+
+        df_bins_info_filter1 = df_bins_info[df_bins_info["is_qualified_iv_train"] == 1]
+        if len(df_bins_info_filter1) == 0:
+            return f"因为train_iv最大值【{df_bins_info['train_iv'].max()}】小于阈值被剔除"
+
+        df_bins_info_filter2 = df_bins_info[
+            (df_bins_info["is_qualified_iv_train"] == 1)
+            & (df_bins_info["is_qualified_monto_train_nsv"] == 1)
+            ]
+        if len(df_bins_info_filter2) == 0:
+            return f"因为monto单调变化最小次数【{df_bins_info_filter1['monto_shift_nsv'].min()}】大于阈值被剔除"
+
+        df_bins_info_filter3 = df_bins_info[
+            (df_bins_info["is_qualified_iv_train"] == 1)
+            & (df_bins_info["is_qualified_monto_train_nsv"] == 1)
+            & (df_bins_info["is_qualified_trend_nsv"] == 1)
+            ]
+        if len(df_bins_info_filter3) == 0:
+            return f"因为trend变量趋势一致性变化最小次数【{df_bins_info_filter2['trend_shift_nsv'].min()}】大于阈值被剔除"
+
+        df_bins_info_filter4 = df_bins_info[
+            (df_bins_info["is_qualified_iv_train"] == 1)
+            & (df_bins_info["is_qualified_monto_train_nsv"] == 1)
+            & (df_bins_info["is_qualified_trend_nsv"] == 1)
+            & (df_bins_info["is_qualified_psi"] == 1)
+            ]
+        if len(df_bins_info_filter4) == 0:
+            return f"因为psi【{df_bins_info_filter3['psi'].min()}】大于阈值被剔除"
+
+        print(df_bins_info_filter4)
+        return f"因为【未知原因】被剔除"
+
+    def filter(self) -> Union[BinInfo, None]:
+        # Filter on IV, PSI, monotonicity, and train/test trend consistency
+        df_bins_info = self.convert_to_df()
+        # Manually specified split points are returned as-is
+        if not self.is_auto_bins:
+            return BinInfo.ofConvertByDict(df_bins_info.iloc[0].to_dict())
+        df_bins_info_filter = df_bins_info[
+            (df_bins_info["is_qualified_iv_train"] == 1)
+            & (df_bins_info["is_qualified_monto_train_nsv"] == 1)
+            & (df_bins_info["is_qualified_trend_nsv"] == 1)
+            & (df_bins_info["is_qualified_psi"] == 1)
+            ]
+        # Pick the binning with the fewest monotonicity changes, then the highest IV and the lowest PSI
+        df_bins_info_filter.sort_values(by=["monto_shift_nsv", "trend_shift_nsv", "iv", "psi"],
+                                        ascending=[True, True, False, True], inplace=True)
+        if len(df_bins_info_filter) != 0:
+            return BinInfo.ofConvertByDict(df_bins_info_filter.iloc[0].to_dict())
+        return None
+
+    def get_best_bins(self) -> List[BinInfo]:
+        df_bins_info = self.convert_to_df()
+        bin_num_list = df_bins_info["bin_num"].unique().tolist()
+        bin_num_list.sort()
+        bins_info = []
+        for bin_num in bin_num_list:
+            df_bins_info_filter = df_bins_info[df_bins_info["bin_num"] == bin_num]
+            df_bins_info_filter.sort_values(by=["monto_shift_nsv", "trend_shift_nsv", "iv", "psi"],
+                                            ascending=[True, True, False, True], inplace=True)
+            bin_info_dict1 = df_bins_info_filter.iloc[0].to_dict()
+            bins_info.append(BinInfo.ofConvertByDict(bin_info_dict1))
+
+            # Also keep the best candidate ranked without the monotonicity key, since variables like age may legitimately have a turning point
+            df_bins_info_filter.sort_values(by=["trend_shift_nsv", "iv", "psi"],
+                                            ascending=[True, False, True], inplace=True)
+            bin_info_dict2 = df_bins_info_filter.iloc[0].to_dict()
+            if bin_info_dict1["monto_shift_nsv"] != bin_info_dict2["monto_shift_nsv"]:
+                bins_info.append(BinInfo.ofConvertByDict(bin_info_dict2))
+
+        return bins_info
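
The selection logic in filter() and get_best_bins() boils down to a single multi-key sort: fewest monotonicity changes first, then fewest train/test trend disagreements, then highest IV, then lowest PSI. A minimal standalone sketch with hypothetical numbers (not code from this commit):

    import pandas as pd

    # three hypothetical candidate binnings for one variable
    candidates = pd.DataFrame({
        "monto_shift_nsv": [0, 0, 1],  # monotonicity direction changes (fewer is better)
        "trend_shift_nsv": [0, 1, 0],  # train/test trend disagreements (fewer is better)
        "iv": [0.25, 0.31, 0.40],      # information value (higher is better)
        "psi": [0.02, 0.01, 0.03],     # population stability index (lower is better)
    })
    best = candidates.sort_values(by=["monto_shift_nsv", "trend_shift_nsv", "iv", "psi"],
                                  ascending=[True, True, False, True]).iloc[0]
    print(best.to_dict())  # the first row wins: monotone and trend-consistent despite its lower iv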

+ 611 - 0
feature/woe/strategy_woe.py

@@ -0,0 +1,611 @@
+# -*- coding:utf-8 -*-
+"""
+@author: yq
+@time: 2024/1/2
+@desc: IV and monotonicity based feature filtering strategy
+"""
+import json
+import os.path
+from itertools import combinations_with_replacement
+from typing import Dict, Optional, Union
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import scorecardpy as sc
+import seaborn as sns
+from pandas.core.dtypes.common import is_numeric_dtype
+from tqdm import tqdm
+
+from commom import f_display_images_by_side, NumpyEncoder, GeneralException, f_df_to_image, f_display_title
+from entitys import DataSplitEntity, MetricFucResultEntity
+from enums import ContextEnum, ResultCodesEnum
+from feature.feature_strategy_base import FeatureStrategyBase
+from init import context
+from .entity import BinInfo, HomologousBinInfo
+from .utils import f_monto_shift, f_get_corr, f_get_vif, f_format_bin, f_trend_shift, f_get_psi
+
+
+class StrategyWoe(FeatureStrategyBase):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Binning info required for WOE encoding, reusing the scorecardpy format
+        self.sc_woebin = None
+
+    def _f_get_img_corr(self, train_woe) -> Union[str, None]:
+        if len(train_woe.columns.to_list()) <= 1:
+            return None
+        train_corr = f_get_corr(train_woe)
+        plt.figure(figsize=(12, 12))
+        sns.heatmap(train_corr, vmax=1, square=True, cmap='RdBu', annot=True)
+        plt.title('Variables Correlation', fontsize=15)
+        plt.yticks(rotation=0)
+        plt.xticks(rotation=90)
+        img_path = self.ml_config.f_get_save_path(f"corr.png")
+        plt.savefig(img_path)
+        return img_path
+
+    def _f_get_img_trend(self, sc_woebin, x_columns, prefix):
+        imgs_path = []
+        for k in x_columns:
+            df_bin = sc_woebin[k]
+            # df_bin["bin"] = df_bin["bin"].apply(lambda x: re.sub(r"(\d+\.\d+)",
+            #                                                      lambda m: "{:.2f}".format(float(m.group(0))), x))
+            sc.woebin_plot(df_bin)
+            path = self.ml_config.f_get_save_path(f"{prefix}_{k}.png")
+            plt.savefig(path)
+            imgs_path.append(path)
+        return imgs_path
+
+    def _f_get_sc_woebin(self, data: pd.DataFrame, bin_info_dict: Dict[str, BinInfo]) -> Dict[str, pd.DataFrame]:
+        y_column = self.ml_config.y_column
+        special_values = self.ml_config.special_values
+        x_columns = list(bin_info_dict.keys())
+        breaks_list = {column: bin_info.points for column, bin_info in bin_info_dict.items()}
+        sc_woebin = sc.woebin(data[x_columns + [y_column]], y=y_column, breaks_list=breaks_list,
+                              special_values=special_values, print_info=False)
+        return sc_woebin
+
+    def _handle_numeric(self, data: DataSplitEntity, x_column: str) -> HomologousBinInfo:
+        # Greedily search for the monotonic binning with the highest combined train + test IV
+        def _n0(x):
+            return sum(x == 0)
+
+        def _n1(x):
+            return sum(x == 1)
+
+        def _get_bins_sv(df, x_column):
+            y_column = self.ml_config.y_column
+            special_values = self.ml_config.get_special_values(x_column)
+            # special_values_bins
+            bins_sv = pd.DataFrame()
+            for special in special_values:
+                dtm = df[df[x_column] == special]
+                if len(dtm) != 0:
+                    dtm['bin'] = [str(special)] * len(dtm)
+                    bin = dtm.groupby(['bin'], group_keys=False)[y_column].agg([_n0, _n1]) \
+                        .reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
+                    bin['is_special_values'] = [True] * len(bin)
+                    bins_sv = pd.concat((bins_sv, bin))
+            return bins_sv
+
+        def _get_bins_nsv(df, x_column, breaks_list):
+            # no_special_values_bins
+            def _left_value(bin: str):
+                if "," not in bin:
+                    return float(bin)
+                left = bin.split(",")[0]
+                return float(left[1:])
+
+            y_column = self.ml_config.y_column
+
+            dtm = pd.DataFrame({'y': df[y_column], 'value': df[x_column]})
+            bstbrks = [-np.inf] + breaks_list + [np.inf]
+            labels = ['[{},{})'.format(bstbrks[i], bstbrks[i + 1]) for i in range(len(bstbrks) - 1)]
+            dtm.loc[:, 'bin'] = pd.cut(dtm['value'], bstbrks, right=False, labels=labels)
+            dtm['bin'] = dtm['bin'].astype(str)
+            bins = dtm.groupby(['bin'], group_keys=False)['y'].agg([_n0, _n1]) \
+                .reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
+            bins['is_special_values'] = [False] * len(bins)
+            bins["ordered"] = bins['bin'].apply(_left_value)
+            # Sort so monotonicity is computed on correctly ordered bins
+            bins = bins.sort_values(by=["ordered"], ascending=[True])
+            return bins
+
+        def _get_badprobs(bins):
+            bins['count'] = bins['good'] + bins['bad']
+            bins['badprob'] = bins['bad'] / bins['count']
+            return bins['badprob'].values.tolist()
+
+        def _get_iv(bins):
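+            # IV = Σ (DistrBad - DistrGood) * ln(DistrBad / DistrGood); zero counts are replaced with 0.9 below to avoid log(0)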
+            infovalue = pd.DataFrame({'good': bins['good'], 'bad': bins['bad']}) \
+                .replace(0, 0.9) \
+                .assign(DistrBad=lambda x: x.bad / sum(x.bad), DistrGood=lambda x: x.good / sum(x.good)) \
+                .assign(iv=lambda x: (x.DistrBad - x.DistrGood) * np.log(x.DistrBad / x.DistrGood)) \
+                .iv
+            bins['bin_iv'] = infovalue
+            bins['total_iv'] = bins['bin_iv'].sum()
+            iv = bins['total_iv'].values[0]
+            return iv.round(3)
+
+        def _get_points(data_ascending, column):
+            def _sampling(raw_list: list, num: int):
+                # Sample by a fixed stride
+                return raw_list[::int(len(raw_list) / num)]
+
+            def _distribute(interval, bin_num):
+                parts = int(1 / interval)
+                # Enumerate all distributions (stars and bars)
+                total_ways = combinations_with_replacement(range(parts + bin_num - 1), bin_num - 1)
+                distributions = []
+                # Iterate over every possible divider placement
+                for combo in total_ways:
+                    # Allocate balls according to the divider positions
+                    distribution = [0] * bin_num
+                    start = 0
+                    for i, divider in enumerate(combo):
+                        distribution[i] = divider - start + 1
+                        start = divider + 1
+                    distribution[-1] = parts - start  # balls left for the last bin
+                    # Keep only allocations where every bin gets at least one ball
+                    if all(x > 0 for x in distribution):
+                        distributions.append(distribution)
+                return distributions
+
+            interval = self.ml_config.bin_search_interval
+            bin_sample_rate = self.ml_config.bin_sample_rate
+            format_bin = self.ml_config.format_bin
+
+            data_x = data_ascending[column]
+            data_x_describe = data_x.describe(percentiles=[0.1, 0.9])
+            data_x_max = data_x.max()
+
+            # Evaluate binnings with 2 to 5 bins
+            distributions_list = []
+            for bin_num in list(range(2, 6)):
+                distributions = _distribute(interval, bin_num)
+                # For 4 or more bins the candidates must be sampled, otherwise it takes too long
+                sample_num = 1000 * bin_sample_rate
+                if bin_sample_rate <= 0.15:
+                    sample_num *= 2
+                if bin_num == 5:
+                    sample_num = 4000 * bin_sample_rate
+                if bin_num in (4, 5) and len(distributions) >= sample_num:
+                    distributions = _sampling(distributions, sample_num)
+                distributions_list.extend(distributions)
+
+            points_list = []
+            for distributions in distributions_list:
+                points = []
+                point_percentile = [sum(distributions[0:idx + 1]) * interval for idx, _ in
+                                    enumerate(distributions[0:-1])]
+                for percentile in point_percentile:
+                    point = data_x.iloc[int(len(data_x) * percentile)]
+                    point = float(point)
+                    if format_bin:
+                        point = f_format_bin(data_x_describe, point)
+                    point = round(point, 2)
+                    if point == 0:
+                        continue
+                    # Skip points that fall out of range after coarse formatting
+                    if point not in points and point < data_x_max:
+                        points.append(point)
+                if points not in points_list and len(points) != 0:
+                    points_list.append(points)
+            return points_list
+
+        special_values = self.ml_config.get_special_values(x_column)
+        breaks_list = self.ml_config.get_breaks_list(x_column)
+        iv_threshold = self.ml_config.iv_threshold
+        psi_threshold = self.ml_config.psi_threshold
+        monto_shift_threshold = self.ml_config.monto_shift_threshold
+        trend_shift_threshold = self.ml_config.trend_shift_threshold
+
+        train_data = data.train_data
+        test_data = data.test_data
+
+        train_data_ascending_nsv = train_data[~train_data[x_column].isin(special_values)] \
+            .sort_values(by=x_column, ascending=True)
+        test_data_ascending_nsv = test_data[~test_data[x_column].isin(special_values)] \
+            .sort_values(by=x_column, ascending=True)
+
+        train_bins_sv = _get_bins_sv(train_data, x_column)
+        test_bins_sv = _get_bins_sv(test_data, x_column)
+
+        # Collect information for each candidate binning
+        # Build the candidate split points
+        is_auto_bins = 1
+        if len(breaks_list) != 0:
+            points_list_nsv = [breaks_list]
+            is_auto_bins = 0
+        else:
+            points_list_nsv = _get_points(train_data_ascending_nsv, x_column)
+        homo_bin_info = HomologousBinInfo(x_column, is_auto_bins)
+        # Compute IV, PSI, monotonicity shifts, etc. for each candidate
+        for points in points_list_nsv:
+            bin_info = BinInfo()
+            bin_info.x_column = x_column
+            bin_info.bin_num = len(points) + 1
+            bin_info.points = points
+            bin_info.is_auto_bins = is_auto_bins
+
+            # Variable IV, computed with the special_values bins merged in
+            train_bins_nsv = _get_bins_nsv(train_data_ascending_nsv, x_column, points)
+            train_bins = pd.concat((train_bins_nsv, train_bins_sv))
+            train_iv = _get_iv(train_bins)
+
+            test_bins_nsv = _get_bins_nsv(test_data_ascending_nsv, x_column, points)
+            test_bins = pd.concat((test_bins_nsv, test_bins_sv))
+            test_iv = _get_iv(test_bins)
+
+            bin_info.train_iv = train_iv
+            bin_info.test_iv = test_iv
+            bin_info.iv = train_iv + test_iv
+            bin_info.is_qualified_iv_train = 1 if train_iv > iv_threshold else 0
+
+            # Number of monotonicity direction changes
+            train_badprobs_nsv = _get_badprobs(train_bins_nsv)
+            monto_shift_train_nsv = f_monto_shift(train_badprobs_nsv)
+            bin_info.monto_shift_nsv = monto_shift_train_nsv
+            bin_info.is_qualified_monto_train_nsv = 0 if monto_shift_train_nsv > monto_shift_threshold else 1
+
+            # Train/test trend consistency
+            test_badprobs_nsv = _get_badprobs(test_bins_nsv)
+            trend_shift_nsv = f_trend_shift(train_badprobs_nsv, test_badprobs_nsv)
+            bin_info.trend_shift_nsv = trend_shift_nsv
+            bin_info.is_qualified_trend_nsv = 0 if trend_shift_nsv > trend_shift_threshold else 1
+
+            # Variable PSI
+            psi = f_get_psi(train_bins, test_bins)
+            bin_info.psi = psi
+            bin_info.is_qualified_psi = 1 if psi < psi_threshold else 0
+            homo_bin_info.add(bin_info)
+        return homo_bin_info
+
+    def _f_fast_filter(self, data: DataSplitEntity) -> Dict[str, BinInfo]:
+        # Coarse-filter variables by IV
+        train_data = data.train_data
+        test_data = data.test_data
+        y_column = self.ml_config.y_column
+        x_columns = self.ml_config.x_columns
+        columns_exclude = self.ml_config.columns_exclude
+        special_values = self.ml_config.special_values
+        breaks_list = self.ml_config.breaks_list.copy()
+        iv_threshold = self.ml_config.iv_threshold
+        psi_threshold = self.ml_config.psi_threshold
+
+        if len(x_columns) == 0:
+            x_columns = train_data.columns.tolist()
+        if y_column in x_columns:
+            x_columns.remove(y_column)
+        for column in columns_exclude:
+            if column in x_columns:
+                x_columns.remove(column)
+
+        bins_train = sc.woebin(train_data[x_columns + [y_column]], y=y_column, bin_num_limit=5,
+                               special_values=special_values, breaks_list=breaks_list, print_info=False)
+
+        for column, bin in bins_train.items():
+            breaks_list[column] = list(bin['breaks'])
+
+        bins_test = sc.woebin(test_data[x_columns + [y_column]], y=y_column,
+                              special_values=special_values, breaks_list=breaks_list, print_info=False)
+        bin_info_fast: Dict[str, BinInfo] = {}
+        filter_fast_overview = ""
+        for column, bin_train in bins_train.items():
+            train_iv = bin_train['total_iv'][0].round(3)
+            if train_iv <= iv_threshold and not self.ml_config.is_include(column):
+                filter_fast_overview = f"{filter_fast_overview}{column} 因为train_iv【{train_iv}】小于阈值被剔除\n"
+                continue
+            bin_test = bins_test[column]
+            test_iv = bin_test['total_iv'][0].round(3)
+            iv = train_iv + test_iv
+            psi = f_get_psi(bin_train, bin_test)
+            if psi >= psi_threshold and not self.ml_config.is_include(column):
+                filter_fast_overview = f"{filter_fast_overview}{column} 因为psi【{psi}】大于阈值被剔除\n"
+                continue
+            bin_info_fast[column] = BinInfo.ofConvertByDict(
+                {"x_column": column, "iv": iv, "psi": psi, "points": breaks_list[column]}
+            )
+
+        context.set_filter_info(ContextEnum.FILTER_FAST,
+                                f"筛选前变量数量:{len(x_columns)}\n{x_columns}\n"
+                                f"快速筛选剔除变量数量:{len(x_columns) - len(bin_info_fast)}\n{filter_fast_overview}")
+        return bin_info_fast
+
+    def _f_corr_filter(self, data: DataSplitEntity, bin_info_dict: Dict[str, BinInfo]) -> Dict[str, BinInfo]:
+        # Drop variables by pairwise correlation
+        corr_threshold = self.ml_config.corr_threshold
+        train_data = data.train_data
+
+        x_columns = list(bin_info_dict.keys())
+        sc_woebin = self._f_get_sc_woebin(train_data, bin_info_dict)
+        train_woe = sc.woebin_ply(train_data[x_columns], sc_woebin, print_info=False)
+        corr_df = f_get_corr(train_woe)
+        corr_dict = corr_df.to_dict()
+        filter_corr_overview = "corr_filter\n"
+        filter_corr_detail = {}
+        # Check each variable's correlation against every other variable in turn
+        for column, corr in corr_dict.items():
+            column = column.replace("_woe", "")
+            column_remove = []
+            overview = f"{column}: "
+            if column not in x_columns:
+                continue
+            for challenger_column, challenger_corr in corr.items():
+                challenger_corr = round(challenger_corr, 3)
+                challenger_column = challenger_column.replace("_woe", "")
+                if challenger_corr < corr_threshold or column == challenger_column \
+                        or challenger_column not in x_columns:
+                    continue
+                # When correlation exceeds the threshold, keep the variable with the larger IV
+                iv = bin_info_dict[column].iv
+                challenger_iv = bin_info_dict[challenger_column].iv
+                if iv > challenger_iv:
+                    if not self.ml_config.is_include(challenger_column):
+                        column_remove.append(challenger_column)
+                        overview = f"{overview}【{challenger_column}_iv{challenger_iv}_corr{challenger_corr}】 "
+                else:
+                    # This variable itself loses, so stop recording removals
+                    column_remove = []
+                    overview = ""
+                    break
+            # Remove the variables correlated with this one
+            for c in column_remove:
+                if c in x_columns:
+                    x_columns.remove(c)
+            if overview != "":
+                filter_corr_overview = f"{filter_corr_overview}{overview}\n"
+                filter_corr_detail[column] = column_remove
+        for column in list(bin_info_dict.keys()):
+            if column not in x_columns:
+                bin_info_dict.pop(column)
+        context.set_filter_info(ContextEnum.FILTER_CORR, filter_corr_overview, filter_corr_detail)
+        return bin_info_dict
+
+    def _f_vif_filter(self, data: DataSplitEntity, bin_info_dict: Dict[str, BinInfo]) -> Dict[str, BinInfo]:
+        vif_threshold = self.ml_config.vif_threshold
+        train_data = data.train_data
+
+        x_columns = list(bin_info_dict.keys())
+        sc_woebin = self._f_get_sc_woebin(train_data, bin_info_dict)
+        train_woe = sc.woebin_ply(train_data[x_columns], sc_woebin, print_info=False)
+        vif_df = f_get_vif(train_woe)
+        if vif_df is None:
+            return bin_info_dict
+
+        filter_vif_overview = ""
+        filter_vif_detail = []
+        for _, row in vif_df.iterrows():
+            column = row["变量"]
+            vif = row["vif"]
+            bin_info = bin_info_dict[column]
+            bin_info.vif = vif
+            bin_info_dict[column] = bin_info
+            if vif < vif_threshold:
+                continue
+            filter_vif_overview = f"{filter_vif_overview}{column} 因为vif【{vif}】大于阈值被剔除\n"
+            filter_vif_detail.append(column)
+            bin_info_dict.pop(column)
+
+        context.set_filter_info(ContextEnum.FILTER_VIF, filter_vif_overview, filter_vif_detail)
+        return bin_info_dict
+
+    def post_filter(self, data: DataSplitEntity, bin_info_dict: Dict[str, BinInfo]):
+        # Filters that compare variables against one another
+        max_feature_num = self.ml_config.max_feature_num
+        bin_info_filtered = self._f_corr_filter(data, bin_info_dict)
+        bin_info_filtered = self._f_vif_filter(data, bin_info_filtered)
+        bin_info_filtered = BinInfo.ivTopN(bin_info_filtered, max_feature_num)
+        self.sc_woebin = self._f_get_sc_woebin(data.train_data, bin_info_filtered)
+        context.set(ContextEnum.BIN_INFO_FILTERED, bin_info_filtered)
+        context.set(ContextEnum.WOEBIN, self.sc_woebin)
+
+    def feature_search(self, data: DataSplitEntity, *args, **kwargs):
+        # Coarse filtering
+        bin_info_fast = self._f_fast_filter(data)
+        x_columns = list(bin_info_fast.keys())
+
+        bin_info_filtered: Dict[str, BinInfo] = {}
+        # Intermediate results for numeric variables under multiple binning schemes
+        homo_bin_info_numeric_set: Dict[str, HomologousBinInfo] = {}
+        filter_numeric_overview = "filter_numeric\n"
+        filter_numeric_detail = []
+        for x_column in tqdm(x_columns):
+            if is_numeric_dtype(data.train_data[x_column]):
+                # Numeric variable filtering
+                homo_bin_info_numeric: HomologousBinInfo = self._handle_numeric(data, x_column)
+                if homo_bin_info_numeric.is_auto_bins:
+                    homo_bin_info_numeric_set[x_column] = homo_bin_info_numeric
+                # Filter on IV, PSI, monotonicity, and trend consistency
+                bin_info: Optional[BinInfo] = homo_bin_info_numeric.filter()
+                if bin_info is not None:
+                    bin_info_filtered[x_column] = bin_info
+                else:
+                    # Dropped for failing the requirements
+                    filter_numeric_overview = f"{filter_numeric_overview}{x_column} {homo_bin_info_numeric.drop_reason()}\n"
+                    filter_numeric_detail.append(x_column)
+            else:
+                # Categorical variables are handled by scorecardpy for now
+                bin_info_filtered[x_column] = bin_info_fast[x_column]
+
+        self.post_filter(data, bin_info_filtered)
+
+        context.set(ContextEnum.HOMO_BIN_INFO_NUMERIC_SET, homo_bin_info_numeric_set)
+        context.set_filter_info(ContextEnum.FILTER_NUMERIC, filter_numeric_overview, filter_numeric_detail)
+
+    def feature_save(self, *args, **kwargs):
+        if self.sc_woebin is None:
+            GeneralException(ResultCodesEnum.NOT_FOUND, message=f"feature不存在")
+        df_woebin = pd.concat(self.sc_woebin.values())
+        path = self.ml_config.f_get_save_path(f"feature.csv")
+        df_woebin.to_csv(path)
+        print(f"feature save to【{path}】success. ")
+
+    def feature_load(self, path: str, *args, **kwargs):
+        if os.path.isdir(path):
+            path = os.path.join(path, "feature.csv")
+        if not os.path.isfile(path) or "feature.csv" not in path:
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"特征信息【feature.csv】不存在")
+
+        df_woebin = pd.read_csv(path)
+        variables = df_woebin["variable"].unique().tolist()
+        self.sc_woebin = {}
+        for variable in variables:
+            self.sc_woebin[variable] = df_woebin[df_woebin["variable"] == variable]
+        print(f"feature load from【{path}】success.")
+
+    def feature_generate(self, data: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
+        x_columns = list(self.sc_woebin.keys())
+        # Sort the columns to avoid potential ordering-related bugs
+        x_columns.sort()
+        data_woe = sc.woebin_ply(data[x_columns], self.sc_woebin, print_info=False)
+        return data_woe
+
+    def feature_report(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
+        y_column = self.ml_config.y_column
+        columns_anns = self.ml_config.columns_anns
+        x_columns = list(self.sc_woebin.keys())
+
+        train_data = data.train_data
+        test_data = data.test_data
+        # Intermediate results are shared across modules, so fetch them from the context
+        bin_info_filtered: Dict[str, BinInfo] = context.get(ContextEnum.BIN_INFO_FILTERED)
+
+        metric_value_dict = {}
+        # Sample distribution
+        metric_value_dict["样本分布"] = MetricFucResultEntity(table=data.get_distribution(y_column), table_font_size=10,
+                                                          table_cell_width=3)
+        # Variable IV, PSI and VIF
+        df_iv_psi_vif = pd.DataFrame()
+        train_iv = [bin_info_filtered[column].train_iv for column in x_columns]
+        psi = [bin_info_filtered[column].psi for column in x_columns]
+        vif = [bin_info_filtered[column].vif for column in x_columns]
+        anns = [columns_anns.get(column, "-") for column in x_columns]
+        df_iv_psi_vif["变量"] = x_columns
+        df_iv_psi_vif["iv"] = train_iv
+        df_iv_psi_vif["psi"] = psi
+        df_iv_psi_vif["vif"] = vif
+        df_iv_psi_vif["释义"] = anns
+        df_iv_psi_vif.sort_values(by=["iv"], ascending=[False], inplace=True)
+        img_path_iv = self.ml_config.f_get_save_path(f"iv.png")
+        f_df_to_image(df_iv_psi_vif, img_path_iv)
+        metric_value_dict["变量iv"] = MetricFucResultEntity(table=df_iv_psi_vif, image_path=img_path_iv)
+
+        # Variable correlations
+        sc_woebin_train = self._f_get_sc_woebin(train_data, bin_info_filtered)
+        train_woe = sc.woebin_ply(train_data[x_columns], sc_woebin_train, print_info=False)
+        img_path_corr = self._f_get_img_corr(train_woe)
+        metric_value_dict["变量相关性"] = MetricFucResultEntity(image_path=img_path_corr)
+
+        # Variable trends - train set
+        imgs_path_trend_train = self._f_get_img_trend(sc_woebin_train, x_columns, "train")
+        metric_value_dict["变量趋势-训练集"] = MetricFucResultEntity(image_path=imgs_path_trend_train, image_size=4)
+
+        # Variable trends - test set
+        sc_woebin_test = self._f_get_sc_woebin(test_data, bin_info_filtered)
+        imgs_path_trend_test = self._f_get_img_trend(sc_woebin_test, x_columns, "test")
+        metric_value_dict["变量趋势-测试集"] = MetricFucResultEntity(image_path=imgs_path_trend_test, image_size=4)
+
+        # context.set(ContextEnum.METRIC_FEATURE.value, metric_value_dict)
+
+        if self.ml_config.jupyter_print:
+            self.jupyter_print(data, metric_value_dict)
+
+        return metric_value_dict
+
+    def jupyter_print(self, data: DataSplitEntity, metric_value_dict: Dict[str, MetricFucResultEntity]):
+        from IPython import display
+
+        def detail_print(detail):
+            if isinstance(detail, list):
+                for column in detail:
+                    homo_bin_info_numeric = homo_bin_info_numeric_set.get(column)
+                    if homo_bin_info_numeric is None:
+                        continue
+                    bins_info = homo_bin_info_numeric.get_best_bins()
+                    print(f"-----【{column}】不同分箱数下变量的推荐切分点-----")
+                    imgs_path_trend_train = []
+                    imgs_path_trend_test = []
+                    for bin_info in bins_info:
+                        print(json.dumps(bin_info.points, ensure_ascii=False, cls=NumpyEncoder))
+                        breaks_list = [str(i) for i in bin_info.points]
+                        sc_woebin_train = self._f_get_sc_woebin(train_data, {column: bin_info})
+                        image_path = self._f_get_img_trend(sc_woebin_train, [column],
+                                                           f"train_{column}_{'_'.join(breaks_list)}")
+                        imgs_path_trend_train.append(image_path[0])
+                        sc_woebin_test = self._f_get_sc_woebin(test_data, {column: bin_info})
+                        image_path = self._f_get_img_trend(sc_woebin_test, [column],
+                                                           f"test_{column}_{'_'.join(breaks_list)}")
+                        imgs_path_trend_test.append(image_path[0])
+                    f_display_images_by_side(display, imgs_path_trend_train, title=f"训练集",
+                                             image_path_list2=imgs_path_trend_test, title2="测试集")
+            if isinstance(detail, dict):
+                for column, challenger_columns in detail.items():
+                    print(f"-----相关性筛选保留的【{column}】-----")
+                    detail_print(column)
+                    for challenger_column in challenger_columns:
+                        detail_print(challenger_column)
+
+        train_data = data.train_data
+        test_data = data.test_data
+
+        bin_info_filtered: Dict[str, BinInfo] = context.get(ContextEnum.BIN_INFO_FILTERED)
+        homo_bin_info_numeric_set: Dict[str, HomologousBinInfo] = context.get(
+            ContextEnum.HOMO_BIN_INFO_NUMERIC_SET)
+        filter_fast = context.get(ContextEnum.FILTER_FAST)
+        filter_numeric = context.get(ContextEnum.FILTER_NUMERIC)
+        filter_corr = context.get(ContextEnum.FILTER_CORR)
+        filter_vif = context.get(ContextEnum.FILTER_VIF)
+        filter_ivtop = context.get(ContextEnum.FILTER_IVTOP)
+
+        f_display_title(display, "样本分布")
+        display.display(metric_value_dict["样本分布"].table)
+
+        # Print variable IV
+        f_display_title(display, "变量iv")
+        display.display(metric_value_dict["变量iv"].table)
+        # Print variable correlations
+        f_display_images_by_side(display, metric_value_dict["变量相关性"].image_path, width=800)
+
+        # Print variable trends
+        f_display_title(display, "变量趋势")
+        imgs_path_trend_train = metric_value_dict["变量趋势-训练集"].image_path
+        imgs_path_trend_test = metric_value_dict.get("变量趋势-测试集").image_path
+        f_display_images_by_side(display, imgs_path_trend_train, title="训练集", image_path_list2=imgs_path_trend_test,
+                                 title2="测试集")
+
+        # Print breaks_list
+        breaks_list = {column: bin_info.points for column, bin_info in bin_info_filtered.items()}
+        print("变量切分点:")
+        print(json.dumps(breaks_list, ensure_ascii=False, indent=2, cls=NumpyEncoder))
+        print("选中变量不同分箱数下变量的推荐切分点:")
+
+        # Print the fast-filter results
+        f_display_title(display, "快速筛选过程")
+        print(filter_fast.get("overview"))
+
+        # Print the numeric-filter results
+        f_display_title(display, "数值变量筛选过程")
+        print(filter_numeric.get("overview"))
+        detail = filter_numeric.get("detail")
+        detail_print(detail)
+
+        # Print the correlation-filter results
+        f_display_title(display, "相关性筛选过程")
+        print(filter_corr.get("overview"))
+        detail = filter_corr.get("detail")
+        detail_print(detail)
+
+        # Print the VIF-filter results
+        f_display_title(display, "vif筛选过程")
+        print(filter_vif.get("overview"))
+        detail = filter_vif.get("detail")
+        detail_print(detail)
+
+        # Print the IV top-N filter results
+        f_display_title(display, "ivtop筛选过程")
+        print(filter_ivtop.get("overview"))
+        detail = filter_ivtop.get("detail")
+        detail_print(detail)
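
The _distribute helper inside _get_points enumerates every way of splitting the percentile axis into bin_num positive chunks of width interval (the stars-and-bars construction), then keeps only allocations where every bin is non-empty. A standalone sketch of the same idea, using strictly increasing divider positions instead of combinations_with_replacement plus the all(x > 0) filter:

    from itertools import combinations

    def distribute(interval: float, bin_num: int):
        parts = int(1 / interval)  # number of equal-width chunks on the percentile axis
        distributions = []
        # choose bin_num - 1 strictly increasing divider positions between the chunks
        for dividers in combinations(range(1, parts), bin_num - 1):
            cuts = (0,) + dividers + (parts,)
            distributions.append([cuts[i + 1] - cuts[i] for i in range(bin_num)])
        return distributions

    print(distribute(0.25, 2))  # [[1, 3], [2, 2], [3, 1]]

Each allocation is then mapped to candidate split points by reading the variable's value at the cumulative percentiles, exactly as the point_percentile loop in _get_points does.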

+ 40 - 38
feature/feature_utils.py → feature/woe/utils.py

@@ -4,9 +4,10 @@
 @time: 2023/12/28
 @desc:  feature engineering utilities
 """
+from typing import Union
+
 import numpy as np
 import pandas as pd
-
 from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
 
 FORMAT_DICT = {
@@ -20,7 +21,7 @@ FORMAT_DICT = {
     # count-style bins 3: 0 - 50
     "bin_cnt3": [0, 2, 4, 6, 8, 10, 15, 20, 25, 30, 35, 40, 45, 50],
     # count-style bins 4: 0 - 100
-    "bin_cnt4": [0, 3, 6, 10, 15, 20, 30, 40, 50, 100],
+    "bin_cnt4": [0, 3, 6, 10, 15, 20, 30, 40, 50, 80, 100],
 
     # amount-style bins 1: 0 - 10k
     "bin_amt1": np.arange(0, 1.1e4, 1e3),
@@ -75,59 +76,60 @@ def f_format_bin(data_describe: pd.Series, raw_v):
     return format_v
 
 
-# 此函数判断list的单调性,允许至多N次符号变化
-def f_judge_monto(bd_list: list, pos_neg_cnt: int = 1) -> bool:
-    if len(bd_list) < 2:
-        return True
-    start_tr = bd_list[1] - bd_list[0]
-    tmp_len = len(bd_list)
-    pos_neg_flag = 0
-    for i in range(2, tmp_len):
-        tmp_tr = bd_list[i] - bd_list[i - 1]
+# Number of monotonicity direction changes
+def f_monto_shift(badprobs: list) -> int:
+    if len(badprobs) <= 2:
+        return 0
+    before = badprobs[1] - badprobs[0]
+    change_cnt = 0
+    for i in range(2, len(badprobs)):
+        next = badprobs[i] - badprobs[i - 1]
         # Subtract the previous bad_rate from the next one to track the direction of bad_rate
-        # Record sign changes; allow at most one, i.e. a U-shaped distribution
-        if (tmp_tr >= 0 and start_tr >= 0) or (tmp_tr <= 0 and start_tr <= 0):
+        if (next >= 0 and before >= 0) or (next <= 0 and before <= 0):
             # Trend holds; move on to the next value
             continue
         else:
             # Record one sign change
-            start_tr = tmp_tr
-            pos_neg_flag += 1
-            if pos_neg_flag > pos_neg_cnt:
-                return False
-    # Record variables that satisfy the trend requirement
-    if pos_neg_flag <= pos_neg_cnt:
-        return True
-    return False
-
-
-# Train/test trend consistency check
-def f_monto_contrast(train_bd_list: list, test_bd_list: list, monto_contrast_change_cnt: int = 0):
-    if len(train_bd_list) != len(test_bd_list) or len(train_bd_list) < 2 or len(test_bd_list) < 2:
-        return False
-
-    train_monto = np.array(train_bd_list[1:]) - np.array(train_bd_list[0:-1])
-    train_monto = np.where(train_monto >= 0, 1, -1)
+            before = next
+            change_cnt += 1
+    return change_cnt
 
-    test_monto = np.array(test_bd_list[1:]) - np.array(test_bd_list[0:-1])
-    test_monto = np.where(test_monto >= 0, 1, -1)
 
+# Number of train/test trend direction disagreements
+def f_trend_shift(train_badprobs: list, test_badprobs: list) -> int:
+    if len(train_badprobs) != len(test_badprobs) or len(train_badprobs) < 2 or len(test_badprobs) < 2:
+        return 0
+    train_monto = np.array(train_badprobs[1:]) - np.array(train_badprobs[0:-1])
+    train_monto = np.where(train_monto >= 0, 1, -1)
+    test_monto = np.array(test_badprobs[1:]) - np.array(test_badprobs[0:-1])
+    test_monto = np.where(test_monto >= 0, 1, -1)
     contrast = train_monto - test_monto
-    if len(contrast[contrast != 0]) > monto_contrast_change_cnt:
-        return False
+    return len(contrast[contrast != 0])
+
+
+def f_get_psi(train_bins, test_bins):
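+    # PSI = Σ (p_train - p_test) * ln(p_train / p_test), where p is each bin's share of total rows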
+    train_bins['count'] = train_bins['good'] + train_bins['bad']
+    train_bins['proportion'] = train_bins['count'] / train_bins['count'].sum()
+    test_bins['count'] = test_bins['good'] + test_bins['bad']
+    test_bins['proportion'] = test_bins['count'] / test_bins['count'].sum()
+
+    psi = (train_bins['proportion'] - test_bins['proportion']) * np.log(
+        train_bins['proportion'] / test_bins['proportion'])
+    psi = psi.reset_index()
+    psi = psi.rename(columns={"proportion": "psi"})
 
-    return True
+    return psi["psi"].sum().round(3)
 
 
 def f_get_corr(data: pd.DataFrame, meth: str = 'spearman') -> pd.DataFrame:
     return data.corr(method=meth)
 
 
-def f_get_ivf(data: pd.DataFrame) -> pd.DataFrame:
+def f_get_vif(data: pd.DataFrame) -> Union[pd.DataFrame, None]:
     if len(data.columns.to_list()) <= 1:
         return None
-    vif_v = [vif(data.values, data.columns.get_loc(i)) for i in data.columns]
+    vif_v = [round(vif(data.values, data.columns.get_loc(i)), 3) for i in data.columns]
     vif_df = pd.DataFrame()
-    vif_df["变量"] = data.columns
+    vif_df["变量"] = [column.replace("_woe", "") for column in data.columns]
     vif_df['vif'] = vif_v
     return vif_df
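
A quick check of the two shift counters above, assuming the package layout introduced by this commit (the bad-rate series are made up):

    from feature.woe.utils import f_monto_shift, f_trend_shift

    train_badprobs = [0.02, 0.05, 0.04]  # rises, then dips: one direction change
    test_badprobs = [0.03, 0.06, 0.07]   # strictly increasing

    print(f_monto_shift(train_badprobs))                 # 1
    print(f_trend_shift(train_badprobs, test_badprobs))  # 1: the last segments move in opposite directions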

+ 42 - 1
init/__init__.py

@@ -5,14 +5,55 @@
 @desc: resource initialization
 """
 import sys
+import threading
 
 import matplotlib
+from contextvars import ContextVar
 
 matplotlib.use('Agg')
 
 import matplotlib.pyplot as plt
 
-__all__ = ['init', 'warning_ignore']
+__all__ = ['init', 'warning_ignore', "context"]
+
+class Context:
+    def __init__(self):
+        # Process-wide context shared via a plain dict, guarded by a lock
+        self._instance_lock = threading.Lock()
+        self.context = {}
+
+    def set(self, k: str, data: object):
+        with self._instance_lock:
+            self.context.update({k: data})
+
+    def get(self, k: str):
+        return self.context.get(k, None)
+
+    def set_filter_info(self, key, overview, detail=None):
+        data = {"overview": overview, "detail": detail}
+        self.set(key, data)
+
+class ContextThreading:
+    def __init__(self):
+        # Context only visible within the current thread; breaks under notebooks
+        self.context = ContextVar('context')
+        self.context.set({})
+
+    def set(self, k: str, data: object):
+        context_map: dict = self.context.get()
+        context_map.update({k: data})
+        self.context.set(context_map)
+
+    def get(self, k: str):
+        context_map: dict = self.context.get()
+        return context_map.get(k, None)
+
+    def set_filter_info(self, key, overview, detail=None):
+        data = {"overview": overview, "detail": detail}
+        self.set(key, data)
+
+
+context = Context()
 
 
 def init():
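
Typical usage of the shared context (a sketch; ContextEnum.FILTER_VIF is one of the keys this commit adds in enums/context_enum.py):

    from enums import ContextEnum
    from init import context

    context.set_filter_info(ContextEnum.FILTER_VIF, "col_a dropped: vif 11.2", ["col_a"])
    info = context.get(ContextEnum.FILTER_VIF)
    print(info["overview"])  # col_a dropped: vif 11.2
    print(info["detail"])    # ['col_a']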

+ 6 - 7
metric_test.py

@@ -6,10 +6,9 @@
 """
 import pandas as pd
 
-from data import DataLoaderMysql, DataLoaderBase, DataLoaderHive
-from entitys import DbConfigEntity, MetricFucEntity
+from data import DataLoaderBase, DataLoaderHive
+from entitys import DbConfigEntity, MetricFucResultEntity
 from metrics import MetricBase
-from monitor import MonitorMetric
 
 
 class A(MetricBase):
@@ -23,9 +22,9 @@ class A(MetricBase):
         data = data_loader.get_data(self._file_path, self._sheet_name)
         return data
 
-    def calculate(self, *args, **kwargs) -> MetricFucEntity:
+    def calculate(self, *args, **kwargs) -> MetricFucResultEntity:
         data = self._load_data()
-        return MetricFucEntity(table=data, value='1', image_path='cache/image/t1.png')
+        return MetricFucResultEntity(table=data, value='1', image_path='cache/image/t1.png')
 
 
 if __name__ == "__main__":
@@ -36,6 +35,6 @@ if __name__ == "__main__":
     print(df.head())
 
     # data_loader = DataLoaderMysql(DbConfigEntity.from_config("./config/mysql_config.json"))
-    # monitor_metric = MonitorMetric("./config/model_monitor_config_template.json")
+    # monitor_metric = MonitorMetric("./config/model_feature_strategy_template.json")
     # monitor_metric.calculate_metric(data_loader=data_loader)
-    # monitor_metric.generate_report()
+    # monitor_metric.report()

+ 8 - 8
metric_test2.py

@@ -6,8 +6,8 @@
 """
 import pandas as pd
 
-from data import DataLoaderMysql, DataLoaderBase, DataLoaderExcel
-from entitys import DbConfigEntity, MetricFucEntity
+from data import DataLoaderBase, DataLoaderExcel
+from entitys import MetricFucResultEntity
 from metrics import MetricBase, f_register_metric_func
 from monitor import MonitorMetric
 
@@ -23,9 +23,9 @@ class AMetric(MetricBase):
         data = data_loader.get_data(self._file_path, self._sheet_name)
         return data
 
-    def calculate(self, *args, **kwargs) -> MetricFucEntity:
+    def calculate(self, *args, **kwargs) -> MetricFucResultEntity:
         data = self._load_data(*args, **kwargs)
-        return MetricFucEntity(table=data, value='1', image_path='cache/image/t1.png')
+        return MetricFucResultEntity(table=data, value='1', image_path='cache/image/t1.png')
 
 class BMetric(MetricBase):
 
@@ -33,11 +33,11 @@ class BMetric(MetricBase):
         super().__init__(*args, **kwargs)
         self._v = v
 
-    def calculate(self, *args, **kwargs) -> MetricFucEntity:
+    def calculate(self, *args, **kwargs) -> MetricFucResultEntity:
         if ".png" in self._v:
-            return MetricFucEntity(image_path=self._v)
+            return MetricFucResultEntity(image_path=self._v)
         else:
-            return MetricFucEntity(value=self._v)
+            return MetricFucResultEntity(value=self._v)
 
 
 if __name__ == "__main__":
@@ -49,6 +49,6 @@ if __name__ == "__main__":
     a.writr("cache/a.xlsx")
 
 
-    monitor_metric = MonitorMetric("./cache/model_monitor_config1.json")
+    monitor_metric = MonitorMetric("./cache/model_feature_strategy1.json")
     monitor_metric.calculate_metric(data_loader=data_loader)
     monitor_metric.generate_report()

+ 2 - 2
metrics/metric_base.py

@@ -8,7 +8,7 @@ import abc
 import os
 
 from config import BaseConfig
-from entitys import MetricFucEntity
+from entitys import MetricFucResultEntity
 
 
 class MetricBase(metaclass=abc.ABCMeta):
@@ -22,5 +22,5 @@ class MetricBase(metaclass=abc.ABCMeta):
         return image_path
 
     @abc.abstractmethod
-    def calculate(self, *args, **kwargs) -> MetricFucEntity:
+    def calculate(self, *args, **kwargs) -> MetricFucResultEntity:
         pass

+ 3 - 3
metrics/metric_by_sql_general.py

@@ -8,7 +8,7 @@ import pandas as pd
 
 from commom import get_logger, f_fill_placeholder
 from data import DataLoaderBase
-from entitys import MetricFucEntity
+from entitys import MetricFucResultEntity
 from .metric_base import MetricBase
 
 logger = get_logger()
@@ -30,5 +30,5 @@ class MetricBySqlGeneral(MetricBase):
         logger.info(f"sql execute result: {data.head(5)}")
         return data
 
-    def calculate(self, *args, **kwargs) -> MetricFucEntity:
-        return MetricFucEntity(table=self._load_data(*args, **kwargs))
+    def calculate(self, *args, **kwargs) -> MetricFucResultEntity:
+        return MetricFucResultEntity(table=self._load_data(*args, **kwargs))

+ 33 - 11
model/model_base.py

@@ -7,33 +7,55 @@
 import abc
 from typing import Dict
 
+import numpy as np
 import pandas as pd
 
-from entitys import TrainConfigEntity, DataPreparedEntity, MetricFucEntity, DataProcessConfigEntity
+from entitys import MetricFucResultEntity, MlConfigEntity, DataFeatureEntity
 
 
 class ModelBase(metaclass=abc.ABCMeta):
 
-    def __init__(self, train_config: TrainConfigEntity = None, *args, **kwargs):
-        if train_config is not None:
-            self._train_config = train_config
-        else:
-            self._train_config = TrainConfigEntity(*args, **kwargs)
+    def __init__(self, ml_config: MlConfigEntity = None, *args, **kwargs):
+        self._ml_config = ml_config
 
-        self._data_process_config: DataProcessConfigEntity = None
+    @property
+    def ml_config(self):
+        return self._ml_config
 
     @abc.abstractmethod
-    def get_template_path(self, ) -> str:
+    def get_report_template_path(self, ) -> str:
         pass
 
     @abc.abstractmethod
-    def train(self, data: DataPreparedEntity, *args, **kwargs) -> Dict[str, MetricFucEntity]:
+    def train(self, train_data: DataFeatureEntity, test_data: DataFeatureEntity, *args, **kwargs):
         pass
 
     @abc.abstractmethod
-    def predict_prob(self, x: pd.DataFrame, *args, **kwargs):
+    def prob(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
         pass
 
     @abc.abstractmethod
-    def export_model_file(self, ):
+    def score(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
+        pass
+
+    @abc.abstractmethod
+    def model_save(self, *args, **kwargs):
+        pass
+
+    @abc.abstractmethod
+    def model_load(self, path: str, *args, **kwargs):
+        pass
+
+    @abc.abstractmethod
+    def train_report(self, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
+        """
+        Training report
+        """
+        pass
+
+    @abc.abstractmethod
+    def jupyter_print(self, *args, **kwargs):
+        """
+        Notebook output
+        """
         pass
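
A minimal sketch of the contract a concrete model now has to satisfy (illustrative only; ModelLr below is the real implementation, and ModelDummy with its score mapping is hypothetical):

    import numpy as np
    import pandas as pd

    class ModelDummy(ModelBase):
        def get_report_template_path(self) -> str:
            return "template/dummy.docx"  # hypothetical template

        def train(self, train_data, test_data, *args, **kwargs):
            # a "model" that just memorizes the training bad rate
            self._mean = float(train_data.data_y.mean())

        def prob(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
            return np.full(len(x), self._mean)

        def score(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
            return 600 - 50 * self.prob(x)  # hypothetical probability-to-score mapping

        def model_save(self, *args, **kwargs):
            pass

        def model_load(self, path: str, *args, **kwargs):
            pass

        def train_report(self, *args, **kwargs):
            return {}

        def jupyter_print(self, *args, **kwargs):
            pass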

+ 1 - 1
model/model_factory.py

@@ -8,7 +8,7 @@ from typing import Type
 
 from commom import GeneralException
 from enums import ModelEnum, ResultCodesEnum
-from model import ModelBase
+from .model_base import ModelBase
 from .model_lr import ModelLr
 
 model_map = {

+ 166 - 116
model/model_lr.py

@@ -5,17 +5,21 @@
 @desc: 
 """
 import os.path
+import pickle
 from os.path import dirname, realpath
 from typing import Dict
 
+import numpy as np
 import pandas as pd
 import scorecardpy as sc
-from sklearn.linear_model import LogisticRegression
+import statsmodels.api as sm
 
-from commom import f_df_to_image, f_display_images_by_side
-from entitys import DataPreparedEntity, MetricFucEntity, DataSplitEntity
+from commom import f_df_to_image, f_display_images_by_side, GeneralException, f_display_title
+from entitys import MetricFucResultEntity, DataSplitEntity, DataFeatureEntity
+from enums import ContextEnum, ResultCodesEnum
+from init import context
 from .model_base import ModelBase
-from .model_utils import f_strees_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
+from .model_utils import f_stress_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
 
 
 class ModelLr(ModelBase):
@@ -23,138 +27,184 @@ class ModelLr(ModelBase):
         super().__init__(*args, **kwargs)
         # Report template
         self._template_path = os.path.join(dirname(dirname(realpath(__file__))), "./template/模型开发报告模板_lr.docx")
-        self.lr = LogisticRegression(C=1e12, fit_intercept=False)
+        self.lr = None
+        self.card = None
 
-    def get_template_path(self):
+    def get_report_template_path(self):
         return self._template_path
 
-    def train(self, data: DataPreparedEntity, *args, **kwargs) -> Dict[str, MetricFucEntity]:
-        bins = kwargs["bins"]
-        data_split_original: DataSplitEntity = kwargs["data_split_original"]
-        jupyter = self._data_process_config.jupyter
-        strees = self._data_process_config.strees
-        strees_sample_times = self._data_process_config.strees_sample_times
-        strees_bad_rate_list = self._data_process_config.strees_bad_rate_list
-
-        # woe编码之前的数据
-        train_data_original = data_split_original.train_data
-        test_data_original = data_split_original.test_data
-
-        train_data = data.train_data
-        train_y = train_data.get_Ydata()
-        y_column = train_data.y_column
-
-        test_data = data.test_data
-
-        self.lr.fit(train_data.get_Xdata(), train_y)
+    def train(self, train_data: DataFeatureEntity, test_data: DataFeatureEntity, *args, **kwargs):
+        woebin = context.get(ContextEnum.WOEBIN)
+        data_x = train_data.data_x.copy()
+        # Newer scorecardpy versions distinguish sklearn from statsmodels models; the statsmodels fit is needed later for the scorecard, but no intercept is wanted, so the constant column is fixed at 0
+        if sc.__version__ > '0.1.9.2':
+            data_x["const"] = [0] * len(data_x)
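+        # A GLM with a Binomial family uses the logit link by default, i.e. this is ordinary logistic regression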
+        family = sm.families.Binomial()
+        logit = sm.GLM(train_data.data_y, data_x, family=family)
+        self.lr = logit.fit()
+        # Older scorecardpy versions
+        if sc.__version__ <= '0.1.9.2':
+            self.lr.coef_ = [list(self.lr.summary2().tables[1].loc[:, 'Coef.'])]
+            self.lr.intercept_ = [0]
+            if len(self.lr.coef_[0]) != len(data_x.columns):
+                raise GeneralException(ResultCodesEnum.SYSTEM_ERROR, message=f"lr模型coef系数长度与x_columns长度不一致。")
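+        # points0=600 anchors the score at the training odds (odds0), and pdo=50 is the number of points that doubles the odds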
+        self.card = sc.scorecard(woebin, self.lr, data_x.columns, points0=600, pdo=50, odds0=train_data.get_odds0())
+
+    def prob(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
+        # Newer scorecardpy versions
+        if sc.__version__ > '0.1.9.2':
+            x = x.copy()
+            x["const"] = [0] * len(x)
+        return np.array(self.lr.predict(x))
+
+    def score(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
+        return np.array(sc.scorecard_ply(x, self.card, print_step=0)["score"])
+
+    def model_save(self):
+        if self.lr is None:
+            GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
+        if self.card is None:
+            GeneralException(ResultCodesEnum.NOT_FOUND, message=f"card不存在")
+        path = self.ml_config.f_get_save_path(f"model.pkl")
+        self.lr.save(path)
+        df_card = pd.concat(self.card.values())
+        path = self.ml_config.f_get_save_path(f"card.csv")
+        df_card.to_csv(path)
+        print(f"model save to【{path}】success. ")
+
+    def model_load(self, path: str, *args, **kwargs):
+        if not os.path.isdir(path):
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")
+        path_model = os.path.join(path, "model.pkl")
+        if not os.path.isfile(path_model):
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_model}】不存在")
+        path_card = os.path.join(path, "card.csv")
+        if not os.path.isfile(path_card):
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_card}】不存在")
+
+        with open(path_model, 'rb') as f:
+            self.lr = pickle.load(f)
+
+        df_card = pd.read_csv(path_card)
+        variables = df_card["variable"].unique().tolist()
+        self.card = {}
+        for variable in variables:
+            self.card[variable] = df_card[df_card["variable"] == variable]
+
+        print(f"model load from【{path}】success.")
+
+    def train_report(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
+
+        y_column = self._ml_config.y_column
+        stress_test = self.ml_config.stress_test
+        stress_sample_times = self.ml_config.stress_sample_times
+        stress_bad_rate_list = self.ml_config.stress_bad_rate_list
+
+        train_data = data.train_data.copy()
+        test_data = data.test_data.copy()
 
         metric_value_dict = {}
         # Scorecard
-        card: Dict = sc.scorecard(bins, self.lr, train_data.x_columns, points0=600, odds0=train_data.get_odds0(),
-                                  pdo=50)
-        card_df = pd.DataFrame(columns=card['basepoints'].keys())
-        for k, v in card.items():
-            card_df = pd.concat((card_df, v))
-        card_df_path = self._train_config.f_get_save_path(f"card_df.png")
-        f_df_to_image(card_df, card_df_path)
-        metric_value_dict["评分卡"] = MetricFucEntity(table=card_df, image_path=card_df_path)
+        df_card = pd.concat(self.card.values())
+        img_path_card = self.ml_config.f_get_save_path(f"card.png")
+        f_df_to_image(df_card, img_path_card)
+        metric_value_dict["评分卡"] = MetricFucResultEntity(table=df_card, image_path=img_path_card)
 
         # model coefficients
-        coef = dict(zip(train_data.x_columns, self.lr.coef_.reshape(-1)))
-        coef_df = pd.DataFrame()
-        coef_df['变量'] = coef.keys()
-        coef_df['变量系数'] = coef.values()
-        metric_value_dict["变量系数"] = MetricFucEntity(table=coef_df, table_font_size=10)
+        coef_table = self.lr.summary().tables[1]
+        var_name = coef_table.data[0]
+        var_name[0] = "var"
+        df_coef = pd.DataFrame(columns=var_name, data=coef_table.data[1:])
+        img_path_coef = self.ml_config.f_get_save_path(f"coef.png")
+        f_df_to_image(df_coef, img_path_coef)
+        metric_value_dict["变量系数"] = MetricFucResultEntity(table=df_coef, image_path=img_path_coef)
 
         # model KS / AUC
-        train_prob = self.lr.predict_proba(train_data.get_Xdata())[:, 1]
-        image_path_list = []
-        train_perf = sc.perf_eva(train_y, train_prob, title="train", show_plot=True)
-        path = self._train_config.f_get_save_path(f"train_perf.png")
+        img_path_perf = []
+        train_score = self.score(train_data)
+        train_perf = sc.perf_eva(train_data[y_column], train_score, title="train", show_plot=True)
+        path = self.ml_config.f_get_save_path(f"train_perf.png")
         train_perf["pic"].savefig(path)
-        image_path_list.append(path)
-
+        img_path_perf.append(path)
         train_auc = train_perf["AUC"]
         train_ks = train_perf["KS"]
 
-        test_auc = "-"
-        test_ks = "-"
-        if test_data is not None:
-            test_prob = self.lr.predict_proba(test_data.get_Xdata())[:, 1]
-            test_y = test_data.get_Ydata()
-            test_perf = sc.perf_eva(test_y, test_prob, title="test", show_plot=True)
-            path = self._train_config.f_get_save_path(f"test_perf.png")
-            test_perf["pic"].savefig(path)
-            image_path_list.append(path)
-            test_auc = test_perf["AUC"]
-            test_ks = test_perf["KS"]
-
-        df_auc = pd.DataFrame()
-        df_auc["样本集"] = ["训练集", "测试集"]
-        df_auc["AUC"] = [train_auc, test_auc]
-        df_auc["KS"] = [train_ks, test_ks]
-        metric_value_dict["模型结果"] = MetricFucEntity(table=df_auc, image_path=image_path_list, image_size=5,
-                                                    table_font_size=10)
+        test_score = self.score(test_data)
+        test_perf = sc.perf_eva(test_data[y_column], test_score, title="test", show_plot=True)
+        path = self.ml_config.f_get_save_path(f"test_perf.png")
+        test_perf["pic"].savefig(path)
+        img_path_perf.append(path)
+        test_auc = test_perf["AUC"]
+        test_ks = test_perf["KS"]
+
+        df_auc_ks = pd.DataFrame()
+        df_auc_ks["样本集"] = ["训练集", "测试集"]
+        df_auc_ks["AUC"] = [train_auc, test_auc]
+        df_auc_ks["KS"] = [train_ks, test_ks]
+        metric_value_dict["模型结果"] = MetricFucResultEntity(table=df_auc_ks, image_path=img_path_perf, image_size=5,
+                                                          table_font_size=10)
 
         # scorecard score binning
-        train_data_original, score_bins = f_get_model_score_bin(train_data_original, card)
-        train_data_gain = f_calcu_model_ks(train_data_original, y_column, sort_ascending=True)
-        train_data_gain_path = self._train_config.f_get_save_path(f"train_data_gain.png")
-        f_df_to_image(train_data_gain, train_data_gain_path)
-        metric_value_dict["训练集分数分箱"] = MetricFucEntity(table=train_data_gain, image_path=train_data_gain_path)
-        if test_data is not None:
-            test_data_original, bins = f_get_model_score_bin(test_data_original, card, score_bins)
-            test_data_gain = f_calcu_model_ks(test_data_original, y_column, sort_ascending=True)
-            test_data_gain_path = self._train_config.f_get_save_path(f"test_data_gain.png")
-            f_df_to_image(test_data_gain, test_data_gain_path)
-            metric_value_dict["测试集分数分箱"] = MetricFucEntity(table=test_data_gain, image_path=test_data_gain_path)
-
-            # 模型分psi
-            model_psi = f_calcu_model_psi(train_data_original, test_data_original)
-            model_psi_path = self._train_config.f_get_save_path(f"model_psi.png")
-            f_df_to_image(model_psi, model_psi_path)
-            metric_value_dict["模型稳定性"] = MetricFucEntity(table=model_psi, value=model_psi["psi"].sum().round(4),
-                                                         image_path=model_psi_path)
-
-            # 压力测试
-            if strees:
-                df_strees = f_strees_test(test_data_original, sample_times=strees_sample_times,
-                                          bad_rate_list=strees_bad_rate_list, target_column=y_column,
-                                          score_column="score")
-
-                df_strees_path = self._train_config.f_get_save_path(f"strees.png")
-                f_df_to_image(df_strees, df_strees_path)
-                metric_value_dict["压力测试"] = MetricFucEntity(table=df_strees, image_path=df_strees_path)
-
-        if jupyter:
-            from IPython import display
-            print("-----模型结果-----")
-            display.display(metric_value_dict["模型结果"].table)
-            f_display_images_by_side(metric_value_dict["模型结果"].image_path, display)
-            # model PSI
-            if test_data is not None:
-                display.display(metric_value_dict["模型稳定性"].table)
-                print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
-            display.display(metric_value_dict["变量系数"].table)
-            print("-----训练集-分数分箱-----")
-            display.display(metric_value_dict["训练集分数分箱"].table)
-            if test_data is not None:
-                print("-----测试集-分数分箱-----")
-                display.display(metric_value_dict["测试集分数分箱"].table)
-            # scorecard
-            display.display(metric_value_dict["评分卡"].table)
-
-            if test_data is not None and strees:
-                print("-----压力测试-----")
-                display.display(metric_value_dict["压力测试"].table)
+        train_data, score_bins = f_get_model_score_bin(train_data, train_score)
+        train_data_gain = f_calcu_model_ks(train_data, y_column, sort_ascending=True)
+        img_path_train_gain = self.ml_config.f_get_save_path(f"train_gain.png")
+        f_df_to_image(train_data_gain, img_path_train_gain)
+        metric_value_dict["训练集分数分箱"] = MetricFucResultEntity(table=train_data_gain, image_path=img_path_train_gain)
+
+        test_data, _ = f_get_model_score_bin(test_data, test_score, score_bins)
+        test_data_gain = f_calcu_model_ks(test_data, y_column, sort_ascending=True)
+        img_path_test_gain = self.ml_config.f_get_save_path(f"test_gain.png")
+        f_df_to_image(test_data_gain, img_path_test_gain)
+        metric_value_dict["测试集分数分箱"] = MetricFucResultEntity(table=test_data_gain, image_path=img_path_test_gain)
+
+        # model score PSI
+        model_psi = f_calcu_model_psi(train_data, test_data)
+        img_path_psi = self.ml_config.f_get_save_path(f"model_psi.png")
+        f_df_to_image(model_psi, img_path_psi)
+        metric_value_dict["模型稳定性"] = MetricFucResultEntity(table=model_psi, value=model_psi["psi"].sum().round(3),
+                                                           image_path=img_path_psi)
+
+        # stress test
+        if stress_test:
+            df_stress = f_stress_test(test_data, sample_times=stress_sample_times, bad_rate_list=stress_bad_rate_list,
+                                      target_column=y_column, score_column="score")
+
+            img_path_stress = self.ml_config.f_get_save_path(f"stress_test.png")
+            f_df_to_image(df_stress, img_path_stress)
+            metric_value_dict["压力测试"] = MetricFucResultEntity(table=df_stress, image_path=img_path_stress)
+
+        if self.ml_config.jupyter_print:
+            self.jupyter_print(metric_value_dict)
 
         return metric_value_dict
 
-    def predict_prob(self, x: pd.DataFrame, *args, **kwargs):
-        return self.lr.predict_proba(x)[:, 1]
+    def jupyter_print(self, metric_value_dict: Dict[str, MetricFucResultEntity], *args, **kwargs):
+        from IPython import display
+
+        f_display_title(display, "模型结果")
+        display.display(metric_value_dict["模型结果"].table)
+        f_display_images_by_side(display, metric_value_dict["模型结果"].image_path)
+
+        # model PSI
+        f_display_title(display, "模型psi")
+        display.display(metric_value_dict["模型稳定性"].table)
+        print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
+
+        f_display_title(display, "模型变量系数")
+        print(self.lr.summary().tables[0])
+        display.display(metric_value_dict["变量系数"].table)
+
+        f_display_title(display, "训练集-分数分箱")
+        display.display(metric_value_dict["训练集分数分箱"].table)
+        f_display_title(display, "测试集-分数分箱")
+        display.display(metric_value_dict["测试集分数分箱"].table)
+        # scorecard
+        f_display_title(display, "评分卡")
+        display.display(metric_value_dict["评分卡"].table)
 
-    def export_model_file(self):
-        pass
+        if "压力测试" in metric_value_dict.keys():
+            f_display_title(display, "压力测试")
+            display.display(metric_value_dict["压力测试"].table)
 
 
 if __name__ == "__main__":
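For reference, the points0/pdo/odds0 arguments handed to sc.scorecard above implement the standard log-odds score scaling. A minimal sketch, not part of the commit, assuming odds means bad/good (the 1/19 default merely stands in for train_data.get_odds0()):

import numpy as np

def prob_to_score(p_bad, points0=600, pdo=50, odds0=1 / 19):
    factor = pdo / np.log(2)                   # pdo points per doubling of the odds
    offset = points0 + factor * np.log(odds0)  # pins score == points0 at odds0
    odds = p_bad / (1 - p_bad)                 # bad-to-good odds
    return offset - factor * np.log(odds)      # lower risk -> higher score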

+ 16 - 18
model/model_utils.py

@@ -6,7 +6,6 @@
 """
 import numpy as np
 import pandas as pd
-import scorecardpy as sc
 from sklearn.metrics import roc_auc_score
 
 
@@ -14,40 +13,39 @@ def f_calcu_model_ks(data, y_column, sort_ascending):
     var_ks = data.groupby('MODEL_SCORE_BIN')[y_column].agg([len, np.sum]).sort_index(ascending=sort_ascending)
     var_ks.columns = ['样本数', '坏样本数']
     var_ks['好样本数'] = var_ks['样本数'] - var_ks['坏样本数']
-    var_ks['坏样本比例'] = (var_ks['坏样本数'] / var_ks['样本数']).round(4)
-    var_ks['样本数比例'] = (var_ks['样本数'] / var_ks['样本数'].sum()).round(4)
+    var_ks['坏样本比例'] = (var_ks['坏样本数'] / var_ks['样本数']).round(3)
+    var_ks['样本数比例'] = (var_ks['样本数'] / var_ks['样本数'].sum()).round(3)
     var_ks['总坏样本数'] = var_ks['坏样本数'].sum()
     var_ks['总好样本数'] = var_ks['好样本数'].sum()
-    var_ks['平均坏样本率'] = (var_ks['总坏样本数'] / var_ks['样本数'].sum()).round(4)
+    var_ks['平均坏样本率'] = (var_ks['总坏样本数'] / var_ks['样本数'].sum()).round(3)
     var_ks['累计坏样本数'] = var_ks['坏样本数'].cumsum()
     var_ks['累计好样本数'] = var_ks['好样本数'].cumsum()
     var_ks['累计样本数'] = var_ks['样本数'].cumsum()
-    var_ks['累计坏样本比例'] = (var_ks['累计坏样本数'] / var_ks['总坏样本数']).round(4)
-    var_ks['累计好样本比例'] = (var_ks['累计好样本数'] / var_ks['总好样本数']).round(4)
-    var_ks['KS'] = (var_ks['累计坏样本比例'] - var_ks['累计好样本比例']).round(4)
-    var_ks['LIFT'] = ((var_ks['累计坏样本数'] / var_ks['累计样本数']) / var_ks['平均坏样本率']).round(4)
+    var_ks['累计坏样本比例'] = (var_ks['累计坏样本数'] / var_ks['总坏样本数']).round(3)
+    var_ks['累计好样本比例'] = (var_ks['累计好样本数'] / var_ks['总好样本数']).round(3)
+    var_ks['KS'] = (var_ks['累计坏样本比例'] - var_ks['累计好样本比例']).round(3)
+    var_ks['LIFT'] = ((var_ks['累计坏样本数'] / var_ks['累计样本数']) / var_ks['平均坏样本率']).round(3)
     return var_ks.reset_index()
 
 
-def f_get_model_score_bin(df, card, bins=None):
-    train_score = sc.scorecard_ply(df, card, print_step=0)
-    df['score'] = train_score
+def f_get_model_score_bin(df, score, bins=None):
     if bins is None:
-        _, bins = pd.qcut(df['score'], q=10, retbins=True, duplicates="drop")
+        _, bins = pd.qcut(score, q=10, retbins=True, duplicates="drop")
         bins = list(bins)
         bins[0] = -np.inf
         bins[-1] = np.inf
-    score_bins = pd.cut(df['score'], bins=bins)
-    df['MODEL_SCORE_BIN'] = score_bins.astype(str).values
+    score_bins = pd.cut(score, bins=bins)
+    df['score'] = score
+    df['MODEL_SCORE_BIN'] = score_bins
     return df, bins
 
 
 def f_calcu_model_psi(df_train, df_test):
     tmp1 = df_train.groupby('MODEL_SCORE_BIN')['MODEL_SCORE_BIN'].agg(['count']).sort_index(ascending=True)
-    tmp1['样本数比例'] = (tmp1['count'] / tmp1['count'].sum()).round(4)
+    tmp1['样本数比例'] = (tmp1['count'] / tmp1['count'].sum()).round(3)
     tmp2 = df_test.groupby('MODEL_SCORE_BIN')['MODEL_SCORE_BIN'].agg(['count']).sort_index(ascending=True)
-    tmp2['样本数比例'] = (tmp2['count'] / tmp2['count'].sum()).round(4)
-    psi = ((tmp1['样本数比例'] - tmp2['样本数比例']) * np.log(tmp1['样本数比例'] / tmp2['样本数比例'])).round(4)
+    tmp2['样本数比例'] = (tmp2['count'] / tmp2['count'].sum()).round(3)
+    psi = ((tmp1['样本数比例'] - tmp2['样本数比例']) * np.log(tmp1['样本数比例'] / tmp2['样本数比例'])).round(3)
     psi = psi.reset_index()
     psi = psi.rename(columns={"样本数比例": "psi"})
     psi['训练样本数'] = list(tmp1['count'])
@@ -57,7 +55,7 @@ def f_calcu_model_psi(df_train, df_test):
     return psi
 
 
-def f_strees_test(df: pd.DataFrame, sample_times: int, bad_rate_list: list, target_column: str, score_column: str,
+def f_stress_test(df: pd.DataFrame, sample_times: int, bad_rate_list: list, target_column: str, score_column: str,
                   sort_ascending=True):
     # stress test
     rows = []
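As a quick sanity check of the binning and PSI arithmetic above, a toy run with fabricated scores (the column "y" and every number here are made up):

import numpy as np
import pandas as pd

from model.model_utils import f_calcu_model_psi, f_get_model_score_bin

rng = np.random.default_rng(0)
train = pd.DataFrame({"y": rng.integers(0, 2, 500)})
test = pd.DataFrame({"y": rng.integers(0, 2, 300)})
# decile bins are fitted on the train scores, then reused for the test scores
train, bins = f_get_model_score_bin(train, rng.normal(600, 50, 500))
test, _ = f_get_model_score_bin(test, rng.normal(590, 55, 300), bins)
print(f_calcu_model_psi(train, test)["psi"].sum())  # rule of thumb: < 0.1 is stable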

+ 6 - 6
monitor/monitor_metric.py

@@ -7,16 +7,16 @@
 import threading
 from typing import Dict
 
-from entitys import MonitorMetricConfigEntity, MetricFucEntity
+from entitys import MonitorConfigEntity, MetricFucResultEntity
 from .report_generate import Report
 
 
 class MonitorMetric():
 
-    def __init__(self, monitor_metric_config_path: str):
-        self._monitor_metric_config = MonitorMetricConfigEntity.from_config(monitor_metric_config_path)
+    def __init__(self, monitor_config_path: str):
+        self._monitor_config = MonitorConfigEntity.from_config(monitor_config_path)
         self.lock = threading.Lock()
-        self._metric_value_dict: Dict[str, MetricFucEntity] = {}
+        self._metric_value_dict: Dict[str, MetricFucResultEntity] = {}
 
     @property
     def metric_value_dict(self):
@@ -28,13 +28,13 @@ class MonitorMetric():
 
     #  TODO 多线程计算指标
     def calculate_metric(self, *args, **kwargs):
-        metric_dict = self._monitor_metric_config.metric_dict
+        metric_dict = self._monitor_config.metric_dict
         for metric_code, metric_clazz in metric_dict.items():
             metric_value = metric_clazz.calculate(*args, **kwargs)
             self._update_metric_value_dict(metric_code, metric_value)
 
     def generate_report(self):
-        Report.generate_report(self._metric_value_dict, self._monitor_metric_config.template_path)
+        Report.generate_report(self._metric_value_dict, self._monitor_config.template_path)
 
 
 if __name__ == "__main__":
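A hedged usage sketch of the renamed class (df is a placeholder; what each metric's calculate() actually expects is defined per metric in the config):

from monitor import MonitorMetric

monitor = MonitorMetric("./config/monitor_config_template.json")
monitor.calculate_metric(df)  # args/kwargs are forwarded to every metric's calculate()
monitor.generate_report()     # fills the docx template with the collected results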

+ 5 - 5
monitor/report_generate.py

@@ -17,7 +17,7 @@ from docx.shared import Inches, Cm, Pt
 
 from commom import GeneralException, f_get_datetime
 from config import BaseConfig
-from entitys import MetricFucEntity
+from entitys import MetricFucResultEntity
 from enums import ResultCodesEnum, PlaceholderPrefixEnum
 
 
@@ -127,7 +127,7 @@ class Report():
         return "{{" + f"{placeholder_prefix_enum.value}{metric_code}" + "}}"
 
     @staticmethod
-    def _fill_value_placeholder(doc: Document, metric_value_dict: Dict[str, MetricFucEntity]):
+    def _fill_value_placeholder(doc: Document, metric_value_dict: Dict[str, MetricFucResultEntity]):
         # replace value placeholders
         for paragraph in doc.paragraphs:
             text = paragraph.text
@@ -148,7 +148,7 @@ class Report():
         return sum(2 if '\u4e00' <= char <= '\u9fff' else 1 for char in text)
 
     @staticmethod
-    def _fill_table_placeholder(doc: Document, metric_value_dict: Dict[str, MetricFucEntity]):
+    def _fill_table_placeholder(doc: Document, metric_value_dict: Dict[str, MetricFucResultEntity]):
         # replace table placeholders
         for paragraph in doc.paragraphs:
             for metric_code, metric_fuc_entity in metric_value_dict.items():
@@ -198,7 +198,7 @@ class Report():
                     table.autofit = False
 
     @staticmethod
-    def _fill_image_placeholder(doc: Document, metric_value_dict: Dict[str, MetricFucEntity]):
+    def _fill_image_placeholder(doc: Document, metric_value_dict: Dict[str, MetricFucResultEntity]):
         # replace image placeholders
         for paragraph in doc.paragraphs:
             for metric_code, metric_fuc_entity in metric_value_dict.items():
@@ -223,7 +223,7 @@ class Report():
                         run.add_picture(path, width=Inches(image_size))
 
     @staticmethod
-    def generate_report(metric_value_dict: Dict[str, MetricFucEntity], template_path: str, save_path=None):
+    def generate_report(metric_value_dict: Dict[str, MetricFucResultEntity], template_path: str, save_path=None):
         if os.path.exists(template_path):
             doc = Document(template_path)
         else:
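All three _fill_*_placeholder passes match tokens built by _get_placeholder, i.e. "{{" + prefix + metric_code + "}}". Illustration only: the "table_" prefix below is an assumption, the real values live in PlaceholderPrefixEnum:

# a template paragraph containing this token would be replaced by
# metric_value_dict["模型结果"].table when generate_report() runs
placeholder = "{{" + "table_" + "模型结果" + "}}"  # assumed prefix value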

+ 2 - 2
trainer/__init__.py → pipeline/__init__.py

@@ -5,9 +5,9 @@
 @desc: 
 """
 
-from .train import TrainPipeline
+from .pipeline import Pipeline
 
-__all__ = ['TrainPipeline']
+__all__ = ['Pipeline']
 
 if __name__ == "__main__":
     pass

+ 70 - 0
pipeline/pipeline.py

@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/1
+@desc: model training pipeline
+"""
+import pandas as pd
+
+from entitys import DataSplitEntity, MlConfigEntity, DataFeatureEntity
+from feature import FeatureStrategyFactory
+from feature.feature_strategy_base import FeatureStrategyBase
+from init import init
+from model import ModelBase
+from model import ModelFactory
+from monitor.report_generate import Report
+
+init()
+
+
+class Pipeline():
+    def __init__(self, ml_config: MlConfigEntity = None, data: DataSplitEntity = None, *args, **kwargs):
+        if ml_config is not None:
+            self._ml_config = ml_config
+        else:
+            self._ml_config = MlConfigEntity(*args, **kwargs)
+        feature_strategy_clazz = FeatureStrategyFactory.get_strategy(self._ml_config.feature_strategy)
+        self._feature_strategy: FeatureStrategyBase = feature_strategy_clazz(self._ml_config)
+        model_clazz = ModelFactory.get_model(self._ml_config.model_type)
+        self._model: ModelBase = model_clazz(self._ml_config)
+        self._data = data
+
+    def train(self):
+        # feature selection
+        self._feature_strategy.feature_search(self._data)
+        metric_feature = self._feature_strategy.feature_report(self._data)
+
+        # build the training data
+        train_data = self._feature_strategy.feature_generate(self._data.train_data)
+        train_data = DataFeatureEntity(data_x=train_data, data_y=self._data.train_data[self._ml_config.y_column])
+        test_data = self._feature_strategy.feature_generate(self._data.test_data)
+        test_data = DataFeatureEntity(data_x=test_data, data_y=self._data.test_data[self._ml_config.y_column])
+        self._model.train(train_data, test_data)
+        metric_model = self._model.train_report(self._data)
+
+        self.metric_value_dict = {**metric_feature, **metric_model}
+
+        return self.metric_value_dict
+
+    def prob(self, data: pd.DataFrame):
+        feature = self._feature_strategy.feature_generate(data)
+        prob = self._model.prob(feature)
+        return prob
+
+    def score(self, data: pd.DataFrame):
+        return self._model.score(data)
+
+    def report(self):
+        save_path = self._ml_config.f_get_save_path("模型报告.docx")
+        Report.generate_report(self.metric_value_dict, self._model.get_report_template_path(), save_path=save_path)
+        print(f"模型报告文件储存路径:{save_path}")
+
+    def save(self):
+        self._feature_strategy.feature_save()
+        self._model.model_save()
+
+    def load(self, path: str):
+        self._feature_strategy.feature_load(path)
+        self._model.model_load(path)
+
+
+if __name__ == "__main__":
+    pass
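Beyond train(), the new Pipeline also covers persistence and inference. A sketch continuing the train_test.py setup shown below (df_new is a placeholder for fresh application data):

pipeline = Pipeline(MlConfigEntity.from_config('./config/ml_config_template.json'), data)
pipeline.train()
pipeline.save()                  # persists the feature strategy and the model
p_bad = pipeline.prob(df_new)    # LR probability on the generated features
points = pipeline.score(df_new)  # scorecard points via sc.scorecard_ply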

BIN
template/模型开发报告模板_lr.docx


+ 5 - 17
train_test.py

@@ -6,10 +6,8 @@
 """
 import time
 
-from entitys import DataSplitEntity, DataProcessConfigEntity
-from feature import FilterStrategyFactory
-from model import ModelFactory
-from trainer import TrainPipeline
+from entitys import DataSplitEntity, MlConfigEntity
+from pipeline import Pipeline
 
 if __name__ == "__main__":
     time_now = time.time()
@@ -22,21 +20,11 @@ if __name__ == "__main__":
     dat.columns = dat_columns
 
     dat["creditability"] = dat["creditability"].apply(lambda x: 1 if x == "bad" else 0)
-    data = DataSplitEntity(train_data=dat[:709], val_data=None, test_data=dat[709:])
-
-    # feature processing
-    ## get the feature selection strategy
-    filter_strategy_clazz = FilterStrategyFactory.get_strategy("iv")
-    ## can also be loaded from a config file
-    filter_strategy = filter_strategy_clazz(DataProcessConfigEntity.from_config('./config/data_process_config_template.json'))
-
-    # select the model
-    model_clazz = ModelFactory.get_model("lr")
-    model = model_clazz()
+    data = DataSplitEntity(train_data=dat[:709], test_data=dat[709:])
 
     # train and generate the report
-    train_pipeline = TrainPipeline(filter_strategy, model, data)
+    train_pipeline = Pipeline(MlConfigEntity.from_config('./config/ml_config_template.json'), data)
     train_pipeline.train()
-    train_pipeline.generate_report()
+    train_pipeline.report()
 
     print(time.time() - time_now)

+ 0 - 45
trainer/train.py

@@ -1,45 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-@author: yq
-@time: 2024/11/1
-@desc: model training pipeline
-"""
-from typing import Dict
-
-from entitys import DataSplitEntity, MetricFucEntity
-from feature.filter_strategy_base import FilterStrategyBase
-from init import init
-from model import ModelBase
-from monitor.report_generate import Report
-
-init()
-
-
-class TrainPipeline():
-    def __init__(self, filter_strategy: FilterStrategyBase, model: ModelBase, data: DataSplitEntity):
-        self._filter_strategy = filter_strategy
-        self._model = model
-        self._data = data
-        self._model._train_config.set_save_path_func(self._filter_strategy.data_process_config.f_get_save_path)
-        self._model._data_process_config = self._filter_strategy.data_process_config
-
-    def train(self, ) -> Dict[str, MetricFucEntity]:
-        # process the data and get candidate features
-        candidate_feature, numeric_candidate_dict_all = self._filter_strategy.filter(self._data)
-        # build the training data
-        data_prepared = self._filter_strategy.feature_generate(self._data, candidate_feature)
-        # feature info
-        metric_value_dict_feature = self._filter_strategy.feature_report(self._data, candidate_feature, numeric_candidate_dict_all)
-
-        metric_value_dict_train = self._model.train(data_prepared, *data_prepared.args, **data_prepared.kwargs)
-        self.metric_value_dict = {**metric_value_dict_feature, **metric_value_dict_train}
-        return self.metric_value_dict
-
-    def generate_report(self, ):
-        save_path = self._filter_strategy.data_process_config.f_get_save_path("模型报告.docx")
-        Report.generate_report(self.metric_value_dict, self._model.get_template_path(), save_path=save_path)
-        print(f"模型报告文件储存路径:{save_path}")
-
-
-if __name__ == "__main__":
-    pass

+ 7 - 7
webui/utils.py

@@ -15,9 +15,9 @@ from sklearn.model_selection import train_test_split
 from config import BaseConfig
 from data import DataLoaderExcel, DataExplore
 from entitys import DataSplitEntity
-from feature import FilterStrategyFactory
+from feature import FeatureStrategyFactory
 from model import ModelFactory
-from trainer import TrainPipeline
+from pipeline import Pipeline
 from .manager import engine
 
 DATA_SUB_DIR = "data"
@@ -105,7 +105,7 @@ def f_data_upload(data):
         engine.get_elem_by_id("data_upload"): gr.update(value=df, visible=True),
         engine.get_elem_by_id("data_insight"): gr.update(value=distribution, visible=True),
         engine.get_elem_by_id("y_column"): gr.update(choices=columns),
-        engine.get_elem_by_id("x_columns_candidate"): gr.update(choices=columns)
+        engine.get_elem_by_id("x_columns"): gr.update(choices=columns)
     }
 
 
@@ -151,18 +151,18 @@ def f_train(data, progress=gr.Progress(track_tqdm=True)):
 
     # feature processing
     ## get the feature selection strategy
-    filter_strategy_clazz = FilterStrategyFactory.get_strategy(feature_search_strategy)
-    filter_strategy = filter_strategy_clazz(**all_param)
+    feature_strategy_clazz = FeatureStrategyFactory.get_strategy(feature_search_strategy)
+    feature_strategy = feature_strategy_clazz(**all_param)
 
     # select the model
     model_clazz = ModelFactory.get_model(model_type)
     model = model_clazz(**all_param)
 
     # train and generate the report
-    train_pipeline = TrainPipeline(filter_strategy, model, data_split)
+    train_pipeline = Pipeline(data=data_split, **all_param)
     metric_value_dict = train_pipeline.train()
     progress(0.95)
-    train_pipeline.generate_report()
+    train_pipeline.report()
 
     auc_df = metric_value_dict["模型结果"].table
 

Too many files were changed in this changeset, so some files are not shown.