пре 4 месеци · 40e9680df3
--- a/entitys/ml_config_entity.py
+++ b/entitys/ml_config_entity.py
@@ -39,13 +39,14 @@ class MlConfigEntity():
 
				                  stress_test=False,
			
 
				                  stress_sample_times=100,
			
 
				                  stress_bad_rate_list: List[float] = [],
			
 
				-                 model_type = "lr",
			
 
				-                 feature_strategy = "woe",
			
 
				+                 model_type="lr",
			
 
				+                 feature_strategy="woe",
			
 
				+                 rules=[],
			
 
				                  fill_method: str = None,
			
 
				                  fill_value=None,
			
 
				                  *args, **kwargs):
			
 
				 
			
 
				-        self._model_type= model_type
			
 
				+        self._model_type = model_type
			
 
				 
			
 
				         self._feature_strategy = feature_strategy
			
 
				 
			
@@ -120,6 +121,10 @@ class MlConfigEntity():
 
				         # 贪婪搜索采样比例,只针对4箱5箱时有效
			
 
				         self._bin_sample_rate = bin_sample_rate
			
 
				 
			
 
				+        # 加减分规则
			
 
				+        self._rules = rules
			
 
				+
			
 
				+
			
 
				         if self._project_name is None or len(self._project_name) == 0:
			
 
				             self._base_dir = os.path.join(BaseConfig.train_path, f"{f_get_datetime()}")
			
 
				         else:
			
@@ -193,6 +198,10 @@ class MlConfigEntity():
 
				     def bin_sample_rate(self):
			
 
				         return self._bin_sample_rate
			
 
				 
			
 
				+    @property
			
 
				+    def rules(self):
			
 
				+        """Return the score-adjustment rules passed to the constructor (stored in ``self._rules``)."""
			
 
				+        return self._rules
			
 
				+
			
 
				     @property
			
 
				     def corr_threshold(self):
			
 
				         return self._corr_threshold
			
--- a/enums/__init__.py
+++ b/enums/__init__.py
@@ -4,10 +4,12 @@
 
				 @time: 2024/10/30
			
 
				 @desc: 枚举值
			
 
				 """
			
 
				+from .constant_enum import ConstantEnum
			
 
				 from .context_enum import ContextEnum
			
 
				 from .feature_strategy_enum import FeatureStrategyEnum
			
 
				 from .model_enum import ModelEnum
			
 
				 from .placeholder_prefix_enum import PlaceholderPrefixEnum
			
 
				 from .result_codes_enum import ResultCodesEnum
			
 
				 
			
 
				-__all__ = ['ResultCodesEnum', 'PlaceholderPrefixEnum', 'FeatureStrategyEnum', 'ModelEnum', 'ContextEnum']
			
 
				+__all__ = ['ResultCodesEnum', 'PlaceholderPrefixEnum', 'FeatureStrategyEnum', 'ModelEnum', 'ContextEnum',
			
 
				+           'ConstantEnum']
			
--- a/enums/constant_enum.py
+++ b/enums/constant_enum.py
@@ -0,0 +1,12 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+@author: yq
			
 
				+@time: 2024/11/14
			
 
				+@desc: 常数枚举值
			
 
				+"""
			
 
				+from enum import Enum
			
 
				+
			
 
				+
			
 
				+class ConstantEnum(Enum):
			
 
				+    """String constants shared by the model code; both members are used as DataFrame column names."""
			
 
				+    # Column that holds the model score.
			
 
				+    SCORE = "SCORE"
			
 
				+    # Column that holds the score bin produced by ``pd.cut`` in ``f_get_model_score_bin``.
			
 
				+    SCORE_BIN = "MODEL_SCORE_BIN"
			
--- a/model/model_base.py
+++ b/model/model_base.py
@@ -38,6 +38,10 @@ class ModelBase(metaclass=abc.ABCMeta):
 
				     def score(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
			
 
				         pass
			
 
				 
			
 
				+    @abc.abstractmethod
			
 
				+    def score_rule(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
			
 
				+        """Return the scores for ``x`` with the configured rules applied.
			
 
				+
			
 
				+        Abstract: every concrete model must implement it. ``ModelLr`` does so by
			
 
				+        computing the plain score and then running the configured rules over it.
			
 
				+        """
			
 
				+        pass
			
 
				+
			
 
				     @abc.abstractmethod
			
 
				     def model_save(self, *args, **kwargs):
			
 
				         pass
			
--- a/model/model_lr.py
+++ b/model/model_lr.py
@@ -16,10 +16,10 @@ import statsmodels.api as sm
 
				 
			
 
				 from commom import f_df_to_image, f_display_images_by_side, GeneralException, f_display_title
			
 
				 from entitys import MetricFucResultEntity, DataSplitEntity, DataFeatureEntity
			
 
				-from enums import ContextEnum, ResultCodesEnum
			
 
				+from enums import ContextEnum, ResultCodesEnum, ConstantEnum
			
 
				 from init import context
			
 
				 from .model_base import ModelBase
			
 
				-from .model_utils import f_stress_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
			
 
				+from .model_utils import f_stress_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi, f_add_rules
			
 
				 
			
 
				 
			
 
				 class ModelLr(ModelBase):
			
@@ -60,6 +60,11 @@ class ModelLr(ModelBase):
 
				     def score(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
			
 
				         return np.array(sc.scorecard_ply(x, self.card, print_step=0)["score"])
			
 
				 
			
 
				+    def score_rule(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
			
 
				+        """Return the scorecard score of ``x`` after applying the configured rules.
			
 
				+
			
 
				+        The base score from ``self.score`` is stored in the ``SCORE`` column of a
			
 
				+        copy of ``x``; the rules in ``self.ml_config.rules`` are then run over that
			
 
				+        copy by ``f_add_rules`` and the resulting ``SCORE`` column is returned.
			
 
				+
			
 
				+        :param x: feature frame to score; it is not modified.
			
 
				+        :return: numpy array of rule-adjusted scores, one per row of ``x``.
			
 
				+        """
			
 
				+        # Work on a copy: train_report passes the shared train/test frames without
			
 
				+        # copying them, so writing the SCORE column into ``x`` would leak into them.
			
 
				+        scored = x.copy()
			
 
				+        scored[ConstantEnum.SCORE.value] = self.score(x)
			
 
				+        scored = f_add_rules(scored, self.ml_config.rules)
			
 
				+        return np.array(scored[ConstantEnum.SCORE.value])
			
 
				+
			
 
				     def model_save(self):
			
 
				         if self.lr is None:
			
 
				             GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
			
@@ -96,13 +101,69 @@ class ModelLr(ModelBase):
 
				 
			
 
				     def train_report(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
			
 
				 
			
 
				+        def _get_auc_ks(data_y, score, title):
			
 
				+            """Evaluate ``score`` against ``data_y``, save the performance plot, and return the metrics.
			
 
				+
			
 
				+            :param data_y: true labels.
			
 
				+            :param score: scores to evaluate against ``data_y``.
			
 
				+            :param title: plot title; also embedded in the saved file name ``perf_{title}.png``.
			
 
				+            :return: tuple ``(auc, ks, path)`` where ``path`` is the saved image path.
			
 
				+            """
			
 
				+            # ``sc`` is presumably scorecardpy -- confirm against the file's imports.
			
 
				+            perf = sc.perf_eva(data_y, score, title=title, show_plot=True)
			
 
				+            path = self.ml_config.f_get_save_path(f"perf_{title}.png")
			
 
				+            # The figure returned under "pic" is written to disk for the report.
			
 
				+            perf["pic"].savefig(path)
			
 
				+            auc = perf["AUC"]
			
 
				+            ks = perf["KS"]
			
 
				+            return auc, ks, path
			
 
				+
			
 
				+        def _get_perf(perf_rule=False):
			
 
				+            """Compute AUC/KS, score-bin tables and PSI for train and test, and record them.
			
 
				+
			
 
				+            Results are stored in the enclosing ``metric_value_dict``; every key and
			
 
				+            every saved image name gets ``suffix`` appended, so the plain and the
			
 
				+            rule-adjusted runs do not overwrite each other.
			
 
				+
			
 
				+            :param perf_rule: when True, score with ``self.score_rule`` (rules applied)
			
 
				+                instead of ``self.score``.
			
 
				+            :return: tuple ``(train_score_bin, test_score_bin)`` as returned by
			
 
				+                ``f_get_model_score_bin`` for the train and test data.
			
 
				+            """
			
 
				+            # Model KS / AUC
			
 
				+            img_path_auc_ks = []
			
 
				+            suffix = ""
			
 
				+            if perf_rule:
			
 
				+                suffix = "-规则"
			
 
				+                train_score = self.score_rule(train_data)
			
 
				+                test_score = self.score_rule(test_data)
			
 
				+            else:
			
 
				+                train_score = self.score(train_data)
			
 
				+                test_score = self.score(test_data)
			
 
				+
			
 
				+            train_auc, train_ks, path = _get_auc_ks(train_data[y_column], train_score, f"train{suffix}")
			
 
				+            img_path_auc_ks.append(path)
			
 
				+            test_auc, test_ks, path = _get_auc_ks(test_data[y_column], test_score, f"test{suffix}")
			
 
				+            img_path_auc_ks.append(path)
			
 
				+
			
 
				+            # Two-row summary table: one row for the train set, one for the test set.
			
 
				+            df_auc_ks = pd.DataFrame()
			
 
				+            df_auc_ks["样本集"] = ["训练集", "测试集"]
			
 
				+            df_auc_ks["AUC"] = [train_auc, test_auc]
			
 
				+            df_auc_ks["KS"] = [train_ks, test_ks]
			
 
				+            metric_value_dict[f"模型结果{suffix}"] = MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks,
			
 
				+                                                                       image_size=5, table_font_size=10)
			
 
				+
			
 
				+            # Score binning: the bin edges come from the train scores ...
			
 
				+            train_score_bin, score_bins = f_get_model_score_bin(train_data, train_score)
			
 
				+            train_data_gain = f_calcu_model_ks(train_score_bin, y_column, sort_ascending=True)
			
 
				+            img_path_train_gain = self.ml_config.f_get_save_path(f"train_gain{suffix}.png")
			
 
				+            f_df_to_image(train_data_gain, img_path_train_gain)
			
 
				+            metric_value_dict[f"训练集分数分箱{suffix}"] = MetricFucResultEntity(table=train_data_gain,
			
 
				+                                                                          image_path=img_path_train_gain)
			
 
				+
			
 
				+            # ... and the same edges are reused for the test scores.
			
 
				+            test_score_bin, _ = f_get_model_score_bin(test_data, test_score, score_bins)
			
 
				+            test_data_gain = f_calcu_model_ks(test_score_bin, y_column, sort_ascending=True)
			
 
				+            img_path_test_gain = self.ml_config.f_get_save_path(f"test_gain{suffix}.png")
			
 
				+            f_df_to_image(test_data_gain, img_path_test_gain)
			
 
				+            metric_value_dict[f"测试集分数分箱{suffix}"] = MetricFucResultEntity(table=test_data_gain,
			
 
				+                                                                          image_path=img_path_test_gain)
			
 
				+
			
 
				+            # Model score PSI between the binned train and test frames
			
 
				+            model_psi = f_calcu_model_psi(train_score_bin, test_score_bin)
			
 
				+            img_path_psi = self.ml_config.f_get_save_path(f"model_psi{suffix}.png")
			
 
				+            f_df_to_image(model_psi, img_path_psi)
			
 
				+            metric_value_dict[f"模型稳定性{suffix}"] = MetricFucResultEntity(table=model_psi,
			
 
				+                                                                        value=model_psi["psi"].sum().round(3),
			
 
				+                                                                        image_path=img_path_psi)
			
 
				+            return train_score_bin, test_score_bin
			
 
				+
			
 
				         y_column = self._ml_config.y_column
			
 
				         stress_test = self.ml_config.stress_test
			
 
				         stress_sample_times = self.ml_config.stress_sample_times
			
 
				         stress_bad_rate_list = self.ml_config.stress_bad_rate_list
			
 
				 
			
 
				-        train_data = data.train_data.copy()
			
 
				-        test_data = data.test_data.copy()
			
 
				+        train_data = data.train_data
			
 
				+        test_data = data.test_data
			
 
				 
			
 
				         metric_value_dict = {}
			
 
				         # 评分卡
			
@@ -120,55 +181,15 @@ class ModelLr(ModelBase):
 
				         f_df_to_image(df_coef, img_path_coef)
			
 
				         metric_value_dict["变量系数"] = MetricFucResultEntity(table=df_coef, image_path=img_path_coef)
			
 
				 
			
 
				-        # 模型ks auc
			
 
				-        img_path_perf = []
			
 
				-        train_score = self.score(train_data)
			
 
				-        train_perf = sc.perf_eva(train_data[y_column], train_score, title="train", show_plot=True)
			
 
				-        path = self.ml_config.f_get_save_path(f"train_perf.png")
			
 
				-        train_perf["pic"].savefig(path)
			
 
				-        img_path_perf.append(path)
			
 
				-        train_auc = train_perf["AUC"]
			
 
				-        train_ks = train_perf["KS"]
			
 
				-
			
 
				-        test_score = self.score(test_data)
			
 
				-        test_perf = sc.perf_eva(test_data[y_column], test_score, title="test", show_plot=True)
			
 
				-        path = self.ml_config.f_get_save_path(f"test_perf.png")
			
 
				-        test_perf["pic"].savefig(path)
			
 
				-        img_path_perf.append(path)
			
 
				-        test_auc = test_perf["AUC"]
			
 
				-        test_ks = test_perf["KS"]
			
 
				-
			
 
				-        df_auc_ks = pd.DataFrame()
			
 
				-        df_auc_ks["样本集"] = ["训练集", "测试集"]
			
 
				-        df_auc_ks["AUC"] = [train_auc, test_auc]
			
 
				-        df_auc_ks["KS"] = [train_ks, test_ks]
			
 
				-        metric_value_dict["模型结果"] = MetricFucResultEntity(table=df_auc_ks, image_path=img_path_perf, image_size=5,
			
 
				-                                                          table_font_size=10)
			
 
				-
			
 
				-        # 评分卡分箱
			
 
				-        train_data, score_bins = f_get_model_score_bin(train_data, train_score)
			
 
				-        train_data_gain = f_calcu_model_ks(train_data, y_column, sort_ascending=True)
			
 
				-        img_path_train_gain = self.ml_config.f_get_save_path(f"train_gain.png")
			
 
				-        f_df_to_image(train_data_gain, img_path_train_gain)
			
 
				-        metric_value_dict["训练集分数分箱"] = MetricFucResultEntity(table=train_data_gain, image_path=img_path_train_gain)
			
 
				-
			
 
				-        test_data, _ = f_get_model_score_bin(test_data, test_score, score_bins)
			
 
				-        test_data_gain = f_calcu_model_ks(test_data, y_column, sort_ascending=True)
			
 
				-        img_path_test_gain = self.ml_config.f_get_save_path(f"tes_gain.png")
			
 
				-        f_df_to_image(test_data_gain, img_path_test_gain)
			
 
				-        metric_value_dict["测试集分数分箱"] = MetricFucResultEntity(table=test_data_gain, image_path=img_path_test_gain)
			
 
				-
			
 
				-        # 模型分psi
			
 
				-        model_psi = f_calcu_model_psi(train_data, test_data)
			
 
				-        img_path_psi = self.ml_config.f_get_save_path(f"model_psi.png")
			
 
				-        f_df_to_image(model_psi, img_path_psi)
			
 
				-        metric_value_dict["模型稳定性"] = MetricFucResultEntity(table=model_psi, value=model_psi["psi"].sum().round(3),
			
 
				-                                                           image_path=img_path_psi)
			
 
				+        _, test_score_bin = _get_perf()
			
 
				+        if len(self.ml_config.rules) != 0:
			
 
				+            _, test_score_bin = _get_perf(perf_rule=True)
			
 
				 
			
 
				         # 压力测试
			
 
				         if stress_test:
			
 
				-            df_stress = f_stress_test(test_data, sample_times=stress_sample_times, bad_rate_list=stress_bad_rate_list,
			
 
				-                                      target_column=y_column, score_column="score")
			
 
				+            df_stress = f_stress_test(test_score_bin, sample_times=stress_sample_times,
			
 
				+                                      bad_rate_list=stress_bad_rate_list,
			
 
				+                                      target_column=y_column, score_column=ConstantEnum.SCORE.value)
			
 
				 
			
 
				             img_path_stress = self.ml_config.f_get_save_path(f"stress_test.png")
			
 
				             f_df_to_image(df_stress, img_path_stress)
			
@@ -181,11 +202,16 @@ class ModelLr(ModelBase):
 
				 
			
 
				     def jupyter_print(self, metric_value_dict=Dict[str, MetricFucResultEntity], *args, **kwargs):
			
 
				         from IPython import display
			
 
				-
			
 
				+        suffix = "-规则"
			
 
				         f_display_title(display, "模型结果")
			
 
				         display.display(metric_value_dict["模型结果"].table)
			
 
				         f_display_images_by_side(display, metric_value_dict["模型结果"].image_path)
			
 
				 
			
 
				+        if len(self.ml_config.rules) != 0:
			
 
				+            print("加入规则后:")
			
 
				+            display.display(metric_value_dict[f"模型结果{suffix}"].table)
			
 
				+            f_display_images_by_side(display, metric_value_dict[f"模型结果{suffix}"].image_path)
			
 
				+
			
 
				         f_display_title(display, "模型变量系数")
			
 
				         print(self.lr.summary().tables[0])
			
 
				         display.display(metric_value_dict["变量系数"].table)
			
@@ -195,13 +221,30 @@ class ModelLr(ModelBase):
 
				         display.display(metric_value_dict["模型稳定性"].table)
			
 
				         print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
			
 
				 
			
 
				+        if len(self.ml_config.rules) != 0:
			
 
				+            print("加入规则后:")
			
 
				+            display.display(metric_value_dict[f"模型稳定性{suffix}"].table)
			
 
				+            print(f"模型psi: {metric_value_dict[f'模型稳定性{suffix}'].value}")
			
 
				+
			
 
				         f_display_title(display, "分数分箱")
			
 
				         print("训练集-分数分箱")
			
 
				         display.display(metric_value_dict["训练集分数分箱"].table)
			
 
				+        if len(self.ml_config.rules) != 0:
			
 
				+            print("加入规则后:")
			
 
				+            print(f"训练集-分数分箱")
			
 
				+            display.display(metric_value_dict[f"训练集分数分箱{suffix}"].table)
			
 
				+
			
 
				         print("测试集-分数分箱")
			
 
				         display.display(metric_value_dict["测试集分数分箱"].table)
			
 
				+        if len(self.ml_config.rules) != 0:
			
 
				+            print("加入规则后:")
			
 
				+            print(f"测试集-分数分箱")
			
 
				+            display.display(metric_value_dict[f"测试集分数分箱{suffix}"].table)
			
 
				+
			
 
				         # 评分卡
			
 
				         f_display_title(display, "评分卡")
			
 
				+        if len(self.ml_config.rules) != 0:
			
 
				+            print(f"评分卡不包含规则")
			
 
				         display.display(metric_value_dict["评分卡"].table)
			
 
				 
			
 
				         if "压力测试" in metric_value_dict.keys():
			
--- a/model/model_utils.py
+++ b/model/model_utils.py
@@ -8,9 +8,12 @@ import numpy as np
 
				 import pandas as pd
			
 
				 from sklearn.metrics import roc_auc_score
			
 
				 
			
 
				+from enums import ConstantEnum
			
 
				+
			
 
				 
			
 
				 def f_calcu_model_ks(data, y_column, sort_ascending):
			
 
				-    var_ks = data.groupby('MODEL_SCORE_BIN')[y_column].agg([len, np.sum]).sort_index(ascending=sort_ascending)
			
 
				+    var_ks = data.groupby(ConstantEnum.SCORE_BIN.value)[y_column].agg([len, np.sum]).sort_index(
			
 
				+        ascending=sort_ascending)
			
 
				     var_ks.columns = ['样本数', '坏样本数']
			
 
				     var_ks['好样本数'] = var_ks['样本数'] - var_ks['坏样本数']
			
 
				     var_ks['坏样本比例'] = (var_ks['坏样本数'] / var_ks['样本数']).round(3)
			
@@ -35,15 +38,18 @@ def f_get_model_score_bin(df, score, bins=None):
 
				         bins[0] = -np.inf
			
 
				         bins[-1] = np.inf
			
 
				     score_bins = pd.cut(score, bins=bins)
			
 
				-    df['score'] = score
			
 
				-    df['MODEL_SCORE_BIN'] = score_bins
			
 
				+    df = df.copy()
			
 
				+    df[ConstantEnum.SCORE.value] = score
			
 
				+    df[ConstantEnum.SCORE_BIN.value] = score_bins
			
 
				     return df, bins
			
 
				 
			
 
				 
			
 
				 def f_calcu_model_psi(df_train, df_test):
			
 
				-    tmp1 = df_train.groupby('MODEL_SCORE_BIN')['MODEL_SCORE_BIN'].agg(['count']).sort_index(ascending=True)
			
 
				+    tmp1 = df_train.groupby(ConstantEnum.SCORE_BIN.value)[ConstantEnum.SCORE_BIN.value].agg(['count']).sort_index(
			
 
				+        ascending=True)
			
 
				     tmp1['样本数比例'] = (tmp1['count'] / tmp1['count'].sum()).round(3)
			
 
				-    tmp2 = df_test.groupby('MODEL_SCORE_BIN')['MODEL_SCORE_BIN'].agg(['count']).sort_index(ascending=True)
			
 
				+    tmp2 = df_test.groupby(ConstantEnum.SCORE_BIN.value)[ConstantEnum.SCORE_BIN.value].agg(['count']).sort_index(
			
 
				+        ascending=True)
			
 
				     tmp2['样本数比例'] = (tmp2['count'] / tmp2['count'].sum()).round(3)
			
 
				     psi = ((tmp1['样本数比例'] - tmp2['样本数比例']) * np.log(tmp1['样本数比例'] / tmp2['样本数比例'])).round(3)
			
 
				     psi = psi.reset_index()
			
@@ -92,7 +98,7 @@ def f_stress_test(df: pd.DataFrame, sample_times: int, bad_rate_list: list, targ
 
				                 if cut not in score_cut_point:
			
 
				                     score_cut_point.append(cut)
			
 
				             score_cut_point = [-np.inf] + score_cut_point + [np.inf]
			
 
				-            df_tmp["socre_bin"] = pd.cut(df_tmp[score_column], score_cut_point).astype(str).values
			
 
				+            df_tmp[ConstantEnum.SCORE_BIN.value] = pd.cut(df_tmp[score_column], score_cut_point).astype(str).values
			
 
				             ks = f_calcu_model_ks(df_tmp, target_column, sort_ascending)["KS"].max()
			
 
				             if sort_ascending:
			
 
				                 auc = roc_auc_score(df_tmp[target_column], -df_tmp[score_column])
			
@@ -124,3 +130,10 @@ def f_stress_test(df: pd.DataFrame, sample_times: int, bad_rate_list: list, targ
 
				         row["95%置信区间KS"] = f"{low: .4f} - {high: .4f}"
			
 
				         rows.append(row)
			
 
				     return pd.DataFrame(rows)
			
 
				+
			
 
				+
			
 
				+def f_add_rules(df: pd.DataFrame, rules: list):
			
 
				+    """Apply score-adjustment rules to ``df`` and return the resulting frame.
			
 
				+
			
 
				+    Each rule is a string of Python source executed with the name ``df`` bound
			
 
				+    to the frame (``pd`` and ``np`` are available too). A rule may modify ``df``
			
 
				+    in place or rebind the name ``df``; in both cases the final ``df`` is what
			
 
				+    gets returned.
			
 
				+
			
 
				+    SECURITY: rules are run with ``exec`` and can execute arbitrary code. Only
			
 
				+    pass rules that come from trusted configuration, never from untrusted input.
			
 
				+
			
 
				+    :param df: frame the rules operate on.
			
 
				+    :param rules: list of Python source strings; ``None`` or empty means no-op.
			
 
				+    :return: the frame bound to ``df`` after all rules have run.
			
 
				+    """
			
 
				+    if not rules:
			
 
				+        return df
			
 
				+    # One explicit namespace shared by all rules: with ``locals()`` a rule that
			
 
				+    # rebinds ``df`` was silently lost, because the function returned its own local.
			
 
				+    namespace = {"df": df, "pd": pd, "np": np}
			
 
				+    for code in rules:
			
 
				+        exec(code, namespace)
			
 
				+    return namespace["df"]