
add: add score adjustment (plus/minus) rule feature

yq, 1 month ago
parent commit 40e9680df3
6 changed files with 144 additions and 61 deletions
  1. entitys/ml_config_entity.py (+12 -3)
  2. enums/__init__.py (+3 -1)
  3. enums/constant_enum.py (+12 -0)
  4. model/model_base.py (+4 -0)
  5. model/model_lr.py (+94 -51)
  6. model/model_utils.py (+19 -6)

+ 12 - 3
entitys/ml_config_entity.py

@@ -39,13 +39,14 @@ class MlConfigEntity():
                  stress_test=False,
                  stress_sample_times=100,
                  stress_bad_rate_list: List[float] = [],
-                 model_type = "lr",
-                 feature_strategy = "woe",
+                 model_type="lr",
+                 feature_strategy="woe",
+                 rules=[],
                  fill_method: str = None,
                  fill_value=None,
                  *args, **kwargs):
 
-        self._model_type= model_type
+        self._model_type = model_type
 
         self._feature_strategy = feature_strategy
 
@@ -120,6 +121,10 @@ class MlConfigEntity():
         # greedy-search sampling ratio, only effective for 4-bin and 5-bin cases
         self._bin_sample_rate = bin_sample_rate
 
+        # score adjustment (plus/minus) rules
+        self._rules = rules
+
+
         if self._project_name is None or len(self._project_name) == 0:
             self._base_dir = os.path.join(BaseConfig.train_path, f"{f_get_datetime()}")
         else:
@@ -193,6 +198,10 @@ class MlConfigEntity():
     def bin_sample_rate(self):
         return self._bin_sample_rate
 
+    @property
+    def rules(self):
+        return self._rules
+
     @property
     def corr_threshold(self):
         return self._corr_threshold

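A minimal usage sketch of the new `rules` option (the rule string and the other argument values below are hypothetical, for illustration only):

    # Each rule is a Python statement later executed against the scored
    # DataFrame `df`; the "SCORE" column name matches ConstantEnum.SCORE.value.
    config = MlConfigEntity(
        model_type="lr",
        feature_strategy="woe",
        rules=["df.loc[df['overdue_cnt'] >= 3, 'SCORE'] -= 50"],
    )
    print(config.rules)
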
+ 3 - 1
enums/__init__.py

@@ -4,10 +4,12 @@
 @time: 2024/10/30
 @desc: enum values
 """
+from .constant_enum import ConstantEnum
 from .context_enum import ContextEnum
 from .feature_strategy_enum import FeatureStrategyEnum
 from .model_enum import ModelEnum
 from .placeholder_prefix_enum import PlaceholderPrefixEnum
 from .result_codes_enum import ResultCodesEnum
 
-__all__ = ['ResultCodesEnum', 'PlaceholderPrefixEnum', 'FeatureStrategyEnum', 'ModelEnum', 'ContextEnum']
+__all__ = ['ResultCodesEnum', 'PlaceholderPrefixEnum', 'FeatureStrategyEnum', 'ModelEnum', 'ContextEnum',
+           'ConstantEnum']

+ 12 - 0
enums/constant_enum.py

@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/14
+@desc: constant enum values
+"""
+from enum import Enum
+
+
+class ConstantEnum(Enum):
+    SCORE = "SCORE"
+    SCORE_BIN = "MODEL_SCORE_BIN"

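The two members replace the score-column string literals ('score' and 'MODEL_SCORE_BIN') previously hard-coded in model_utils.py; a quick check of the values this commit standardizes on:

    from enums import ConstantEnum

    print(ConstantEnum.SCORE.value)      # SCORE
    print(ConstantEnum.SCORE_BIN.value)  # MODEL_SCORE_BIN
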
+ 4 - 0
model/model_base.py

@@ -38,6 +38,10 @@ class ModelBase(metaclass=abc.ABCMeta):
     def score(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
         pass
 
+    @abc.abstractmethod
+    def score_rule(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
+        pass
+
     @abc.abstractmethod
     def model_save(self, *args, **kwargs):
         pass

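Because `score_rule` is declared abstract, a concrete subclass of ModelBase can no longer be instantiated without it. A minimal sketch of the effect (the subclass is hypothetical, and the real base class may declare further abstract methods not visible in this hunk):

    class IncompleteModel(ModelBase):
        def score(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
            return np.zeros(len(x))
        # score_rule (and any other abstract methods) left unimplemented

    IncompleteModel()  # TypeError: Can't instantiate abstract class ...
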
+ 94 - 51
model/model_lr.py

@@ -16,10 +16,10 @@ import statsmodels.api as sm
 
 from commom import f_df_to_image, f_display_images_by_side, GeneralException, f_display_title
 from entitys import MetricFucResultEntity, DataSplitEntity, DataFeatureEntity
-from enums import ContextEnum, ResultCodesEnum
+from enums import ContextEnum, ResultCodesEnum, ConstantEnum
 from init import context
 from .model_base import ModelBase
-from .model_utils import f_stress_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
+from .model_utils import f_stress_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi, f_add_rules
 
 
 class ModelLr(ModelBase):
@@ -60,6 +60,11 @@ class ModelLr(ModelBase):
     def score(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
         return np.array(sc.scorecard_ply(x, self.card, print_step=0)["score"])
 
+    def score_rule(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
+        x[ConstantEnum.SCORE.value] = self.score(x)
+        x = f_add_rules(x, self.ml_config.rules)
+        return np.array(x[ConstantEnum.SCORE.value])
+
     def model_save(self):
         if self.lr is None:
             GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
@@ -96,13 +101,69 @@ class ModelLr(ModelBase):
 
     def train_report(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
 
+        def _get_auc_ks(data_y, score, title):
+            perf = sc.perf_eva(data_y, score, title=title, show_plot=True)
+            path = self.ml_config.f_get_save_path(f"perf_{title}.png")
+            perf["pic"].savefig(path)
+            auc = perf["AUC"]
+            ks = perf["KS"]
+            return auc, ks, path
+
+        def _get_perf(perf_rule=False):
+            # model KS / AUC
+            img_path_auc_ks = []
+            suffix = ""
+            if perf_rule:
+                suffix = "-规则"
+                train_score = self.score_rule(train_data)
+                test_score = self.score_rule(test_data)
+            else:
+                train_score = self.score(train_data)
+                test_score = self.score(test_data)
+
+            train_auc, train_ks, path = _get_auc_ks(train_data[y_column], train_score, f"train{suffix}")
+            img_path_auc_ks.append(path)
+            test_auc, test_ks, path = _get_auc_ks(test_data[y_column], test_score, f"test{suffix}")
+            img_path_auc_ks.append(path)
+
+            df_auc_ks = pd.DataFrame()
+            df_auc_ks["样本集"] = ["训练集", "测试集"]
+            df_auc_ks["AUC"] = [train_auc, test_auc]
+            df_auc_ks["KS"] = [train_ks, test_ks]
+            metric_value_dict[f"模型结果{suffix}"] = MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks,
+                                                                       image_size=5, table_font_size=10)
+
+            # scorecard score binning
+            train_score_bin, score_bins = f_get_model_score_bin(train_data, train_score)
+            train_data_gain = f_calcu_model_ks(train_score_bin, y_column, sort_ascending=True)
+            img_path_train_gain = self.ml_config.f_get_save_path(f"train_gain{suffix}.png")
+            f_df_to_image(train_data_gain, img_path_train_gain)
+            metric_value_dict[f"训练集分数分箱{suffix}"] = MetricFucResultEntity(table=train_data_gain,
+                                                                          image_path=img_path_train_gain)
+
+            test_score_bin, _ = f_get_model_score_bin(test_data, test_score, score_bins)
+            test_data_gain = f_calcu_model_ks(test_score_bin, y_column, sort_ascending=True)
+            img_path_test_gain = self.ml_config.f_get_save_path(f"test_gain{suffix}.png")
+            f_df_to_image(test_data_gain, img_path_test_gain)
+            metric_value_dict[f"测试集分数分箱{suffix}"] = MetricFucResultEntity(table=test_data_gain,
+                                                                          image_path=img_path_test_gain)
+
+            # model score PSI
+            model_psi = f_calcu_model_psi(train_score_bin, test_score_bin)
+            img_path_psi = self.ml_config.f_get_save_path(f"model_psi{suffix}.png")
+            f_df_to_image(model_psi, img_path_psi)
+            metric_value_dict[f"模型稳定性{suffix}"] = MetricFucResultEntity(table=model_psi,
+                                                                        value=model_psi["psi"].sum().round(3),
+                                                                        image_path=img_path_psi)
+            return train_score_bin, test_score_bin
+
         y_column = self._ml_config.y_column
         stress_test = self.ml_config.stress_test
         stress_sample_times = self.ml_config.stress_sample_times
         stress_bad_rate_list = self.ml_config.stress_bad_rate_list
 
-        train_data = data.train_data.copy()
-        test_data = data.test_data.copy()
+        train_data = data.train_data
+        test_data = data.test_data
 
         metric_value_dict = {}
         # scorecard
@@ -120,55 +181,15 @@ class ModelLr(ModelBase):
         f_df_to_image(df_coef, img_path_coef)
         metric_value_dict["变量系数"] = MetricFucResultEntity(table=df_coef, image_path=img_path_coef)
 
-        # model KS / AUC
-        img_path_perf = []
-        train_score = self.score(train_data)
-        train_perf = sc.perf_eva(train_data[y_column], train_score, title="train", show_plot=True)
-        path = self.ml_config.f_get_save_path(f"train_perf.png")
-        train_perf["pic"].savefig(path)
-        img_path_perf.append(path)
-        train_auc = train_perf["AUC"]
-        train_ks = train_perf["KS"]
-
-        test_score = self.score(test_data)
-        test_perf = sc.perf_eva(test_data[y_column], test_score, title="test", show_plot=True)
-        path = self.ml_config.f_get_save_path(f"test_perf.png")
-        test_perf["pic"].savefig(path)
-        img_path_perf.append(path)
-        test_auc = test_perf["AUC"]
-        test_ks = test_perf["KS"]
-
-        df_auc_ks = pd.DataFrame()
-        df_auc_ks["样本集"] = ["训练集", "测试集"]
-        df_auc_ks["AUC"] = [train_auc, test_auc]
-        df_auc_ks["KS"] = [train_ks, test_ks]
-        metric_value_dict["模型结果"] = MetricFucResultEntity(table=df_auc_ks, image_path=img_path_perf, image_size=5,
-                                                          table_font_size=10)
-
-        # scorecard score binning
-        train_data, score_bins = f_get_model_score_bin(train_data, train_score)
-        train_data_gain = f_calcu_model_ks(train_data, y_column, sort_ascending=True)
-        img_path_train_gain = self.ml_config.f_get_save_path(f"train_gain.png")
-        f_df_to_image(train_data_gain, img_path_train_gain)
-        metric_value_dict["训练集分数分箱"] = MetricFucResultEntity(table=train_data_gain, image_path=img_path_train_gain)
-
-        test_data, _ = f_get_model_score_bin(test_data, test_score, score_bins)
-        test_data_gain = f_calcu_model_ks(test_data, y_column, sort_ascending=True)
-        img_path_test_gain = self.ml_config.f_get_save_path(f"tes_gain.png")
-        f_df_to_image(test_data_gain, img_path_test_gain)
-        metric_value_dict["测试集分数分箱"] = MetricFucResultEntity(table=test_data_gain, image_path=img_path_test_gain)
-
-        # model score PSI
-        model_psi = f_calcu_model_psi(train_data, test_data)
-        img_path_psi = self.ml_config.f_get_save_path(f"model_psi.png")
-        f_df_to_image(model_psi, img_path_psi)
-        metric_value_dict["模型稳定性"] = MetricFucResultEntity(table=model_psi, value=model_psi["psi"].sum().round(3),
-                                                           image_path=img_path_psi)
+        _, test_score_bin = _get_perf()
+        if len(self.ml_config.rules) != 0:
+            _, test_score_bin = _get_perf(perf_rule=True)
 
         # stress test
         if stress_test:
-            df_stress = f_stress_test(test_data, sample_times=stress_sample_times, bad_rate_list=stress_bad_rate_list,
-                                      target_column=y_column, score_column="score")
+            df_stress = f_stress_test(test_score_bin, sample_times=stress_sample_times,
+                                      bad_rate_list=stress_bad_rate_list,
+                                      target_column=y_column, score_column=ConstantEnum.SCORE.value)
 
             img_path_stress = self.ml_config.f_get_save_path(f"stress_test.png")
             f_df_to_image(df_stress, img_path_stress)
@@ -181,11 +202,16 @@ class ModelLr(ModelBase):
 
     def jupyter_print(self, metric_value_dict=Dict[str, MetricFucResultEntity], *args, **kwargs):
         from IPython import display
-
+        suffix = "-规则"
         f_display_title(display, "模型结果")
         display.display(metric_value_dict["模型结果"].table)
         f_display_images_by_side(display, metric_value_dict["模型结果"].image_path)
 
+        if len(self.ml_config.rules) != 0:
+            print("加入规则后:")
+            display.display(metric_value_dict[f"模型结果{suffix}"].table)
+            f_display_images_by_side(display, metric_value_dict[f"模型结果{suffix}"].image_path)
+
         f_display_title(display, "模型变量系数")
         print(self.lr.summary().tables[0])
         display.display(metric_value_dict["变量系数"].table)
@@ -195,13 +221,30 @@ class ModelLr(ModelBase):
         display.display(metric_value_dict["模型稳定性"].table)
         print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
 
+        if len(self.ml_config.rules) != 0:
+            print("加入规则后:")
+            display.display(metric_value_dict[f"模型稳定性{suffix}"].table)
+            print(f"模型psi: {metric_value_dict[f'模型稳定性{suffix}'].value}")
+
         f_display_title(display, "分数分箱")
         print("训练集-分数分箱")
         display.display(metric_value_dict["训练集分数分箱"].table)
+        if len(self.ml_config.rules) != 0:
+            print("加入规则后:")
+            print(f"训练集-分数分箱")
+            display.display(metric_value_dict[f"训练集分数分箱{suffix}"].table)
+
         print("测试集-分数分箱")
         display.display(metric_value_dict["测试集分数分箱"].table)
+        if len(self.ml_config.rules) != 0:
+            print("加入规则后:")
+            print(f"测试集-分数分箱")
+            display.display(metric_value_dict[f"测试集分数分箱{suffix}"].table)
+
         # scorecard
         f_display_title(display, "评分卡")
+        if len(self.ml_config.rules) != 0:
+            print(f"评分卡不包含规则")
         display.display(metric_value_dict["评分卡"].table)
 
         if "压力测试" in metric_value_dict.keys():

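Taken together: `score_rule` writes the base scorecard score into the frame's SCORE column via `self.score`, hands the frame to `f_add_rules`, and `train_report` then emits a second "-规则" (rule-adjusted) set of AUC/KS, gain and PSI reports whenever rules are configured. A hypothetical call sketch (model and data names are illustrative only):

    model = ModelLr(ml_config)               # ml_config carries the rules list
    raw_scores = model.score(test_df)        # pure scorecard scores
    adj_scores = model.score_rule(test_df)   # scores after rule adjustments
    # note: score_rule also adds a SCORE column to test_df in place
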
+ 19 - 6
model/model_utils.py

@@ -8,9 +8,12 @@ import numpy as np
 import pandas as pd
 from sklearn.metrics import roc_auc_score
 
+from enums import ConstantEnum
+
 
 def f_calcu_model_ks(data, y_column, sort_ascending):
-    var_ks = data.groupby('MODEL_SCORE_BIN')[y_column].agg([len, np.sum]).sort_index(ascending=sort_ascending)
+    var_ks = data.groupby(ConstantEnum.SCORE_BIN.value)[y_column].agg([len, np.sum]).sort_index(
+        ascending=sort_ascending)
     var_ks.columns = ['样本数', '坏样本数']
     var_ks['好样本数'] = var_ks['样本数'] - var_ks['坏样本数']
     var_ks['坏样本比例'] = (var_ks['坏样本数'] / var_ks['样本数']).round(3)
@@ -35,15 +38,18 @@ def f_get_model_score_bin(df, score, bins=None):
         bins[0] = -np.inf
         bins[-1] = np.inf
     score_bins = pd.cut(score, bins=bins)
-    df['score'] = score
-    df['MODEL_SCORE_BIN'] = score_bins
+    df = df.copy()
+    df[ConstantEnum.SCORE.value] = score
+    df[ConstantEnum.SCORE_BIN.value] = score_bins
     return df, bins
 
 
 def f_calcu_model_psi(df_train, df_test):
-    tmp1 = df_train.groupby('MODEL_SCORE_BIN')['MODEL_SCORE_BIN'].agg(['count']).sort_index(ascending=True)
+    tmp1 = df_train.groupby(ConstantEnum.SCORE_BIN.value)[ConstantEnum.SCORE_BIN.value].agg(['count']).sort_index(
+        ascending=True)
     tmp1['样本数比例'] = (tmp1['count'] / tmp1['count'].sum()).round(3)
-    tmp2 = df_test.groupby('MODEL_SCORE_BIN')['MODEL_SCORE_BIN'].agg(['count']).sort_index(ascending=True)
+    tmp2 = df_test.groupby(ConstantEnum.SCORE_BIN.value)[ConstantEnum.SCORE_BIN.value].agg(['count']).sort_index(
+        ascending=True)
     tmp2['样本数比例'] = (tmp2['count'] / tmp2['count'].sum()).round(3)
     psi = ((tmp1['样本数比例'] - tmp2['样本数比例']) * np.log(tmp1['样本数比例'] / tmp2['样本数比例'])).round(3)
     psi = psi.reset_index()
@@ -92,7 +98,7 @@ def f_stress_test(df: pd.DataFrame, sample_times: int, bad_rate_list: list, targ
                 if cut not in score_cut_point:
                     score_cut_point.append(cut)
             score_cut_point = [-np.inf] + score_cut_point + [np.inf]
-            df_tmp["socre_bin"] = pd.cut(df_tmp[score_column], score_cut_point).astype(str).values
+            df_tmp[ConstantEnum.SCORE_BIN.value] = pd.cut(df_tmp[score_column], score_cut_point).astype(str).values
             ks = f_calcu_model_ks(df_tmp, target_column, sort_ascending)["KS"].max()
             if sort_ascending:
                 auc = roc_auc_score(df_tmp[target_column], -df_tmp[score_column])
@@ -124,3 +130,10 @@ def f_stress_test(df: pd.DataFrame, sample_times: int, bad_rate_list: list, targ
         row["95%置信区间KS"] = f"{low: .4f} - {high: .4f}"
         rows.append(row)
     return pd.DataFrame(rows)
+
+
+def f_add_rules(df: pd.DataFrame, rules: list):
+    for code in rules:
+        # unsafe: executes arbitrary rule code, injection risk
+        exec(code, locals())
+    return df
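
A self-contained sketch of the f_add_rules mechanics (rule text hypothetical): each rule string is arbitrary Python executed with the frame bound as `df`, so only in-place mutations such as `df.loc[...] = ...` take effect; rebinding `df` inside a rule is silently lost, and, per the injection warning above, rule strings must only ever come from a trusted source.

    import pandas as pd

    df = pd.DataFrame({"SCORE": [620, 480, 705], "overdue_cnt": [0, 4, 1]})
    rules = ["df.loc[df['overdue_cnt'] >= 3, 'SCORE'] -= 50"]  # hypothetical
    df = f_add_rules(df, rules)
    print(df["SCORE"].tolist())  # [620, 430, 705]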