yq před 3 měsíci
rodič
revize
81eabfe1d3

+ 24 - 2
entitys/data_process_config_entity.py

@@ -19,9 +19,18 @@ class DataProcessConfigEntity():
                  iv_threshold: float = 0.03, iv_threshold_wide: float = 0.05, corr_threshold: float = 0.4,
                  sample_rate: float = 0.1, x_candidate_num: int = 10, special_values: Union[dict, list, str] = None,
                  project_name: str = None, format_bin: str = False, breaks_list: dict = None, pos_neg_cnt=1,
-                 jupyter=False, *args, **kwargs):
+                 jupyter=False, strees=False, strees_sample_times=100, strees_bad_rate_list=[], *args, **kwargs):
 
-        # 单调性允许变化次数
+        # whether stress testing is enabled (NOTE(review): "strees" is a typo for "stress" in the API names)
+        self._strees = strees
+
+        # number of resampling iterations per stress-test bad rate
+        self._strees_sample_times = strees_sample_times
+
+        # target bad-rate list for stress testing; empty list means auto-generate around the sample bad rate
+        self._strees_bad_rate_list = strees_bad_rate_list
+
+        # jupyter下输出内容
         self._jupyter = jupyter
 
         # 单调性允许变化次数
@@ -80,6 +89,19 @@ class DataProcessConfigEntity():
 
         os.makedirs(self._base_dir, exist_ok=True)
 
+    @property
+    def strees(self):
+        return self._strees
+
+    @property
+    def strees_sample_times(self):
+
+        return self._strees_sample_times
+
+    @property
+    def strees_bad_rate_list(self):
+        return self._strees_bad_rate_list
+
     @property
     def jupyter(self):
         return self._jupyter

+ 1 - 2
feature/__init__.py

@@ -4,7 +4,6 @@
 @time: 2024/11/1
 @desc: 特征挖掘
 """
-from .feature_utils import f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
 from .filter_strategy_factory import FilterStrategyFactory
 
-__all__ = ['FilterStrategyFactory', 'f_calcu_model_ks', 'f_get_model_score_bin', 'f_calcu_model_psi']
+__all__ = ['FilterStrategyFactory']

+ 1 - 48
feature/feature_utils.py

@@ -6,7 +6,7 @@
 """
 import numpy as np
 import pandas as pd
-import scorecardpy as sc
+
 from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
 
 FORMAT_DICT = {
@@ -113,50 +113,3 @@ def f_get_ivf(data: pd.DataFrame) -> pd.DataFrame:
     vif_df["变量"] = data.columns
     vif_df['vif'] = vif_v
     return vif_df
-
-
-def f_calcu_model_ks(data, y_column, sort_ascending):
-    var_ks = data.groupby('MODEL_SCORE_BIN')[y_column].agg([len, np.sum]).sort_index(ascending=sort_ascending)
-    var_ks.columns = ['样本数', '坏样本数']
-    var_ks['好样本数'] = var_ks['样本数'] - var_ks['坏样本数']
-    var_ks['坏样本比例'] = (var_ks['坏样本数'] / var_ks['样本数']).round(4)
-    var_ks['样本数比例'] = (var_ks['样本数'] / var_ks['样本数'].sum()).round(4)
-    var_ks['总坏样本数'] = var_ks['坏样本数'].sum()
-    var_ks['总好样本数'] = var_ks['好样本数'].sum()
-    var_ks['平均坏样本率'] = (var_ks['总坏样本数'] / var_ks['样本数'].sum()).round(4)
-    var_ks['累计坏样本数'] = var_ks['坏样本数'].cumsum()
-    var_ks['累计好样本数'] = var_ks['好样本数'].cumsum()
-    var_ks['累计样本数'] = var_ks['样本数'].cumsum()
-    var_ks['累计坏样本比例'] = (var_ks['累计坏样本数'] / var_ks['总坏样本数']).round(4)
-    var_ks['累计好样本比例'] = (var_ks['累计好样本数'] / var_ks['总好样本数']).round(4)
-    var_ks['KS'] = (var_ks['累计坏样本比例'] - var_ks['累计好样本比例']).round(4)
-    var_ks['LIFT'] = ((var_ks['累计坏样本数'] / var_ks['累计样本数']) / var_ks['平均坏样本率']).round(4)
-    return var_ks.reset_index()
-
-
-def f_get_model_score_bin(df, card, bins=None):
-    train_score = sc.scorecard_ply(df, card, print_step=0)
-    df['score'] = train_score
-    if bins is None:
-        _, bins = pd.qcut(df['score'], q=10, retbins=True, duplicates="drop")
-        bins = list(bins)
-        bins[0] = -np.inf
-        bins[-1] = np.inf
-    score_bins = pd.cut(df['score'], bins=bins)
-    df['MODEL_SCORE_BIN'] = score_bins.astype(str).values
-    return df, bins
-
-
-def f_calcu_model_psi(df_train, df_test):
-    tmp1 = df_train.groupby('MODEL_SCORE_BIN')['MODEL_SCORE_BIN'].agg(['count']).sort_index(ascending=True)
-    tmp1['样本数比例'] = (tmp1['count'] / tmp1['count'].sum()).round(4)
-    tmp2 = df_test.groupby('MODEL_SCORE_BIN')['MODEL_SCORE_BIN'].agg(['count']).sort_index(ascending=True)
-    tmp2['样本数比例'] = (tmp2['count'] / tmp2['count'].sum()).round(4)
-    psi = ((tmp1['样本数比例'] - tmp2['样本数比例']) * np.log(tmp1['样本数比例'] / tmp2['样本数比例'])).round(4)
-    psi = psi.reset_index()
-    psi = psi.rename(columns={"样本数比例": "psi"})
-    psi['训练样本数'] = list(tmp1['count'])
-    psi['测试样本数'] = list(tmp2['count'])
-    psi['训练样本数比例'] = list(tmp1['样本数比例'])
-    psi['测试样本数比例'] = list(tmp2['样本数比例'])
-    return psi

+ 1 - 2
feature/strategy_iv.py

@@ -331,7 +331,6 @@ class StrategyIv(FilterStrategyBase):
         val_data = data.val_data
         test_data = data.test_data
         y_column = self.data_process_config.y_column
-        jupyter = self.data_process_config.jupyter
         x_columns_candidate = list(candidate_dict.keys())
         bins = self._f_get_bins_by_breaks(train_data, candidate_dict)
 
@@ -351,7 +350,7 @@ class StrategyIv(FilterStrategyBase):
             test_data_feature = DataFeatureEntity(pd.concat((test_woe, test_data[y_column]), axis=1),
                                                   train_woe.columns.tolist(), y_column)
         return DataPreparedEntity(train_data_feature, val_data_feature, test_data_feature, bins=bins,
-                                  data_split_original=data, jupyter=jupyter)
+                                  data_split_original=data)
 
     def feature_report(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity],
                        *args, **kwargs) -> Dict[str, MetricFucEntity]:

+ 3 - 1
model/model_base.py

@@ -9,7 +9,7 @@ from typing import Dict
 
 import pandas as pd
 
-from entitys import TrainConfigEntity, DataPreparedEntity, MetricFucEntity
+from entitys import TrainConfigEntity, DataPreparedEntity, MetricFucEntity, DataProcessConfigEntity
 
 
 class ModelBase(metaclass=abc.ABCMeta):
@@ -20,6 +20,8 @@ class ModelBase(metaclass=abc.ABCMeta):
         else:
             self._train_config = TrainConfigEntity(*args, **kwargs)
 
+        self._data_process_config: DataProcessConfigEntity = None
+
     @abc.abstractmethod
     def get_template_path(self, ) -> str:
         pass

+ 19 - 2
model/model_lr.py

@@ -14,8 +14,8 @@ from sklearn.linear_model import LogisticRegression
 
 from commom import f_df_to_image, f_display_images_by_side
 from entitys import DataPreparedEntity, MetricFucEntity, DataSplitEntity
-from feature import f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
 from .model_base import ModelBase
+from .model_utils import f_strees_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
 
 
 class ModelLr(ModelBase):
@@ -31,7 +31,10 @@ class ModelLr(ModelBase):
     def train(self, data: DataPreparedEntity, *args, **kwargs) -> Dict[str, MetricFucEntity]:
         bins = kwargs["bins"]
         data_split_original: DataSplitEntity = kwargs["data_split_original"]
-        jupyter = kwargs["jupyter"]
+        jupyter = self._data_process_config.jupyter
+        strees = self._data_process_config.strees
+        strees_sample_times = self._data_process_config.strees_sample_times
+        strees_bad_rate_list = self._data_process_config.strees_bad_rate_list
 
         # woe编码之前的数据
         train_data_original = data_split_original.train_data
@@ -113,6 +116,16 @@ class ModelLr(ModelBase):
             metric_value_dict["模型稳定性"] = MetricFucEntity(table=model_psi, value=model_psi["psi"].sum().round(4),
                                                          image_path=model_psi_path)
 
+            # 压力测试
+            if strees:
+                df_strees = f_strees_test(test_data_original, sample_times=strees_sample_times,
+                                          bad_rate_list=strees_bad_rate_list, target_column=y_column,
+                                          score_column="score")
+
+                df_strees_path = self._train_config.f_get_save_path(f"strees.png")
+                f_df_to_image(df_strees, df_strees_path)
+                metric_value_dict["压力测试"] = MetricFucEntity(table=df_strees, image_path=df_strees_path)
+
         if jupyter:
             from IPython import display
             print("-----模型结果-----")
@@ -131,6 +144,10 @@ class ModelLr(ModelBase):
             # 评分卡
             display.display(metric_value_dict["评分卡"].table)
 
+            if test_data is not None and strees:
+                print("-----压力测试-----")
+                display.display(metric_value_dict["压力测试"].table)
+
         return metric_value_dict
 
     def predict_prob(self, x: pd.DataFrame, *args, **kwargs):

+ 128 - 0
model/model_utils.py

@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2025/1/6
+@desc:  模型工具
+"""
+import numpy as np
+import pandas as pd
+import scorecardpy as sc
+from sklearn.metrics import roc_auc_score
+
+
+def f_calcu_model_ks(data, y_column, sort_ascending):
+    var_ks = data.groupby('MODEL_SCORE_BIN')[y_column].agg([len, np.sum]).sort_index(ascending=sort_ascending)
+    var_ks.columns = ['样本数', '坏样本数']
+    var_ks['好样本数'] = var_ks['样本数'] - var_ks['坏样本数']
+    var_ks['坏样本比例'] = (var_ks['坏样本数'] / var_ks['样本数']).round(4)
+    var_ks['样本数比例'] = (var_ks['样本数'] / var_ks['样本数'].sum()).round(4)
+    var_ks['总坏样本数'] = var_ks['坏样本数'].sum()
+    var_ks['总好样本数'] = var_ks['好样本数'].sum()
+    var_ks['平均坏样本率'] = (var_ks['总坏样本数'] / var_ks['样本数'].sum()).round(4)
+    var_ks['累计坏样本数'] = var_ks['坏样本数'].cumsum()
+    var_ks['累计好样本数'] = var_ks['好样本数'].cumsum()
+    var_ks['累计样本数'] = var_ks['样本数'].cumsum()
+    var_ks['累计坏样本比例'] = (var_ks['累计坏样本数'] / var_ks['总坏样本数']).round(4)
+    var_ks['累计好样本比例'] = (var_ks['累计好样本数'] / var_ks['总好样本数']).round(4)
+    var_ks['KS'] = (var_ks['累计坏样本比例'] - var_ks['累计好样本比例']).round(4)
+    var_ks['LIFT'] = ((var_ks['累计坏样本数'] / var_ks['累计样本数']) / var_ks['平均坏样本率']).round(4)
+    return var_ks.reset_index()
+
+
+def f_get_model_score_bin(df, card, bins=None):
+    train_score = sc.scorecard_ply(df, card, print_step=0)
+    df['score'] = train_score
+    if bins is None:
+        _, bins = pd.qcut(df['score'], q=10, retbins=True, duplicates="drop")
+        bins = list(bins)
+        bins[0] = -np.inf
+        bins[-1] = np.inf
+    score_bins = pd.cut(df['score'], bins=bins)
+    df['MODEL_SCORE_BIN'] = score_bins.astype(str).values
+    return df, bins
+
+
+def f_calcu_model_psi(df_train, df_test):
+    tmp1 = df_train.groupby('MODEL_SCORE_BIN')['MODEL_SCORE_BIN'].agg(['count']).sort_index(ascending=True)
+    tmp1['样本数比例'] = (tmp1['count'] / tmp1['count'].sum()).round(4)
+    tmp2 = df_test.groupby('MODEL_SCORE_BIN')['MODEL_SCORE_BIN'].agg(['count']).sort_index(ascending=True)
+    tmp2['样本数比例'] = (tmp2['count'] / tmp2['count'].sum()).round(4)
+    psi = ((tmp1['样本数比例'] - tmp2['样本数比例']) * np.log(tmp1['样本数比例'] / tmp2['样本数比例'])).round(4)
+    psi = psi.reset_index()
+    psi = psi.rename(columns={"样本数比例": "psi"})
+    psi['训练样本数'] = list(tmp1['count'])
+    psi['测试样本数'] = list(tmp2['count'])
+    psi['训练样本数比例'] = list(tmp1['样本数比例'])
+    psi['测试样本数比例'] = list(tmp2['样本数比例'])
+    return psi
+
+
+def f_strees_test(df: pd.DataFrame, sample_times: int, bad_rate_list: list, target_column: str, score_column: str,
+                  sort_ascending=True):
+    # 压力测试
+    rows = []
+    target_rate = df[target_column].mean()
+    target_counts = df[target_column].value_counts().to_dict()
+    if len(bad_rate_list) == 0:
+        bad_rate_list = np.arange(0.01, target_rate * 2, target_rate * 2 / 10)
+    for bad_rate in bad_rate_list:
+        bad_rate = round(bad_rate, 3)
+        row = {}
+        ks_list = []
+        auc_list = []
+        df_tmp = None
+        for random_state in range(sample_times):
+            # 目标坏率小于样本坏率,进行好样本采样,增加好样本数量
+            good_sample_counts = int(target_counts.get(1) / bad_rate) - target_counts.get(1)
+            if bad_rate < target_rate:
+                # 需要的好样本的数量
+                good_sample_times = good_sample_counts / target_counts.get(0)
+                good_sample_times_int = int(good_sample_times)
+                good_sample_times_decimal = good_sample_times - good_sample_times_int
+                good_df_tmp = []
+                for _ in range(good_sample_times_int):
+                    good_df_tmp.append(df[df[target_column] == 0])
+                good_df_tmp.append(
+                    df[df[target_column] == 0].sample(frac=good_sample_times_decimal, random_state=random_state))
+                good_df_tmp = pd.concat(good_df_tmp, ignore_index=True)
+            else:
+                good_df_tmp = df[df[target_column] == 0].sample(n=good_sample_counts, random_state=random_state)
+            df_tmp = pd.concat([df[df[target_column] == 1], good_df_tmp], ignore_index=True)
+            score_cut_point = []
+            for q in np.arange(0.1, 1, 0.1):
+                cut = round(df_tmp[score_column].quantile(q), 4)
+                if cut not in score_cut_point:
+                    score_cut_point.append(cut)
+            score_cut_point = [-np.inf] + score_cut_point + [np.inf]
+            df_tmp["socre_bin"] = pd.cut(df_tmp[score_column], score_cut_point).astype(str).values
+            ks = f_calcu_model_ks(df_tmp, target_column, sort_ascending)["KS"].max()
+            if sort_ascending:
+                auc = roc_auc_score(df_tmp[target_column], -df_tmp[score_column])
+            else:
+                auc = roc_auc_score(df_tmp[target_column], df_tmp[score_column])
+            ks_list.append(ks)
+            auc_list.append(auc)
+        # print(f"目标坏率: {bad_rate: .4f} | 抽样样本坏率: {df_tmp[target_column].mean(): .4f} | 抽样样本数: {len(df_tmp)} ")
+        row["违约率"] = bad_rate
+        row["抽样次数"] = sample_times
+        row["样本数"] = len(df_tmp)
+        row["好样本数"] = (df_tmp[target_column] == 0).sum()
+        row["坏样本数"] = (df_tmp[target_column] == 1).sum()
+
+        row["平均AUC"] = np.mean(auc_list)
+        row["最大AUC"] = max(auc_list)
+        row["最小AUC"] = min(auc_list)
+        row["AUC标准差"] = np.std(auc_list)
+        low = row["平均AUC"] - row["AUC标准差"] * 1.96
+        high = row["平均AUC"] + row["AUC标准差"] * 1.96
+        row["95%置信区间AUC"] = f"{low: .4f} - {high: .4f}"
+
+        row["平均KS"] = np.mean(ks_list)
+        row["最大KS"] = max(ks_list)
+        row["最小KS"] = min(ks_list)
+        row["KS标准差"] = np.std(ks_list)
+        low = row["平均KS"] - row["KS标准差"] * 1.96
+        high = row["平均KS"] + row["KS标准差"] * 1.96
+        row["95%置信区间KS"] = f"{low: .4f} - {high: .4f}"
+        rows.append(row)
+    return pd.DataFrame(rows)

binární
template/模型开发报告模板_lr.docx


+ 1 - 0
trainer/train.py

@@ -21,6 +21,7 @@ class TrainPipeline():
         self._model = model
         self._data = data
         self._model._train_config.set_save_path_func(self._filter_strategy.data_process_config.f_get_save_path)
+        self._model._data_process_config = self._filter_strategy.data_process_config
 
     def train(self, ) -> Dict[str, MetricFucEntity]:
         # 处理数据,获取候选特征