Bläddra i källkod

add: 模型结果报告

yq 4 månader sedan
förälder
incheckning
0c7645fcfd

+ 25 - 8
entitys/data_feaure_entity.py

@@ -61,16 +61,24 @@ class DataFeatureEntity():
     def get_Ydata(self):
         return self._data[self._y_column]
 
def get_odds0(self):
    """Bad/good odds of the training labels (y==1 count over y==0 count).

    Used as the `odds0` anchor when calibrating the scorecard.  Raises
    ZeroDivisionError when there are no good (y==0) samples, same as the
    plain division it replaces.
    """
    labels = self._data[self._y_column]
    good_count = int((labels == 0).sum())
    bad_count = int((labels == 1).sum())
    return bad_count / good_count
 
 class DataPreparedEntity():
     """
     训练集测试集特征准备完毕
     """
 
def __init__(self, train_data: DataFeatureEntity, val_data: DataFeatureEntity, test_data: DataFeatureEntity,
             *args, **kwargs):
    """Hold prepared train/val/test feature sets plus pipeline context.

    Extra *args/**kwargs are stashed as-is so later stages (e.g. model
    training) can receive context such as woe bins without widening the
    constructor signature.
    """
    self._train_data, self._val_data, self._test_data = train_data, val_data, test_data
    self.args = args
    self.kwargs = kwargs
 
     @property
     def train_data(self):
@@ -84,6 +92,7 @@ class DataPreparedEntity():
     def test_data(self):
         return self._test_data
 
+
 class DataSplitEntity():
     """
     初始数据训练集测试集划分
@@ -109,22 +118,30 @@ class DataSplitEntity():
     def get_distribution(self, y_column) -> pd.DataFrame:
         df = pd.DataFrame()
         train_data_len = len(self._train_data)
-        test_data_len = len(self._test_data)
-        total = train_data_len + test_data_len
         train_bad_len = len(self._train_data[self._train_data[y_column] == 1])
-        test_bad_len = len(self._test_data[self._test_data[y_column] == 1])
+        train_bad_rate = f"{f_format_float(train_bad_len / train_data_len * 100, 2)}%"
+
+        test_data_len = 0
+        test_bad_len = 0
+        test_bad_rate = "-"
+        if self._test_data is not None:
+            test_data_len = len(self._test_data)
+            test_bad_len = len(self._test_data[self._test_data[y_column] == 1])
+            test_bad_rate = f"{f_format_float(test_bad_len / test_data_len * 100, 2)}%"
+
+        total = train_data_len + test_data_len
         bad_total = train_bad_len + test_bad_len
+        bad_rate = f"{f_format_float(bad_total / total * 100, 2)}%"
 
         df["样本"] = ["训练集", "测试集", "合计"]
         df["样本数"] = [train_data_len, test_data_len, total]
         df["样本占比"] = [f"{f_format_float(train_data_len / total * 100, 2)}%",
                       f"{f_format_float(test_data_len / total * 100, 2)}%", "100%"]
         df["坏样本数"] = [train_bad_len, test_bad_len, bad_total]
-        df["坏样本比例"] = [f"{f_format_float(train_bad_len / train_data_len * 100, 2)}%",
-                       f"{f_format_float(test_bad_len / test_data_len * 100, 2)}%",
-                       f"{f_format_float(bad_total / total * 100, 2)}%"]
+        df["坏样本比例"] = [train_bad_rate, test_bad_rate, bad_rate]
 
         return df
-    
+
+
 if __name__ == "__main__":
     pass

+ 2 - 2
feature/__init__.py

@@ -4,7 +4,7 @@
 @time: 2024/11/1
 @desc: 特征挖掘
 """
-
+from .feature_utils import f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
 from .filter_strategy_factory import FilterStrategyFactory
 
-__all__ = ['FilterStrategyFactory']
+__all__ = ['FilterStrategyFactory', 'f_calcu_model_ks', 'f_get_model_score_bin', 'f_calcu_model_psi']

+ 51 - 6
feature/feature_utils.py

@@ -4,8 +4,9 @@
 @time: 2023/12/28
 @desc:  特征工具类
 """
-
+import numpy as np
 import pandas as pd
+import scorecardpy as sc
 import toad as td
 from sklearn.preprocessing import KBinsDiscretizer
 from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
@@ -107,11 +108,6 @@ def f_get_woe(data: DataSplitEntity, c: td.transform.Combiner, to_drop: list) ->
     return train_woe, test_woe, oot_woe
 
 
-def f_get_iv(data: DataSplitEntity) -> pd.DataFrame:
-    # 计算前,先排除掉不需要计算IV的cols
-    return td.quality(data, 'target', iv_only=True)
-
-
 def f_get_psi(train_data: DataSplitEntity, oot_data: DataSplitEntity) -> pd.DataFrame:
     # 计算前,先排除掉不需要的cols
     return td.metrics.PSI(train_data, oot_data)
@@ -127,3 +123,52 @@ def f_get_ivf(data: pd.DataFrame) -> pd.DataFrame:
     vif_df["变量"] = data.columns
     vif_df['vif'] = vif_v
     return vif_df
+
+
def f_calcu_model_ks(data, y_column, sort_ascending):
    """Build a per-score-bin KS/LIFT gains table.

    Groups `data` by its precomputed 'MODEL_SCORE_BIN' column and derives
    cumulative good/bad distributions, KS and LIFT per bin.

    Args:
        data: DataFrame carrying `y_column` (0/1 labels, assumed non-null)
            and a 'MODEL_SCORE_BIN' column.
        y_column: name of the binary label column (1 = bad sample).
        sort_ascending: bin-index sort order applied before cumulating.

    Returns:
        DataFrame with one row per bin, bins as a regular column.
    """
    # Use 'count'/'sum' string aggregations: passing the `len`/`np.sum`
    # callables to GroupBy.agg is deprecated in pandas 2.x (equivalent
    # result for non-null labels).
    var_ks = (data.groupby('MODEL_SCORE_BIN')[y_column]
              .agg(['count', 'sum'])
              .sort_index(ascending=sort_ascending))
    var_ks.columns = ['样本数', '坏样本数']
    var_ks['好样本数'] = var_ks['样本数'] - var_ks['坏样本数']
    var_ks['坏样本比例'] = (var_ks['坏样本数'] / var_ks['样本数']).round(4)
    var_ks['样本数比例'] = (var_ks['样本数'] / var_ks['样本数'].sum()).round(4)
    var_ks['总坏样本数'] = var_ks['坏样本数'].sum()
    var_ks['总好样本数'] = var_ks['好样本数'].sum()
    var_ks['平均坏样本率'] = (var_ks['总坏样本数'] / var_ks['样本数'].sum()).round(4)
    var_ks['累计坏样本数'] = var_ks['坏样本数'].cumsum()
    var_ks['累计好样本数'] = var_ks['好样本数'].cumsum()
    var_ks['累计样本数'] = var_ks['样本数'].cumsum()
    var_ks['累计坏样本比例'] = (var_ks['累计坏样本数'] / var_ks['总坏样本数']).round(4)
    var_ks['累计好样本比例'] = (var_ks['累计好样本数'] / var_ks['总好样本数']).round(4)
    # KS = max gap between cumulative bad and good distributions (per bin here).
    var_ks['KS'] = (var_ks['累计坏样本比例'] - var_ks['累计好样本比例']).round(4)
    # LIFT = cumulative bad rate relative to the overall bad rate.
    var_ks['LIFT'] = ((var_ks['累计坏样本数'] / var_ks['累计样本数']) / var_ks['平均坏样本率']).round(4)
    return var_ks.reset_index()
+
+
def f_get_model_score_bin(df, card, bins=None):
    """Score `df` with a scorecard and attach score bins.

    When `bins` is None, ten quantile cut points are derived from the
    scores (outer edges widened to +/-inf); otherwise the supplied edges
    are reused so train and test share identical bins.  Mutates `df` in
    place by adding 'score' and 'MODEL_SCORE_BIN' columns.

    Returns:
        (df, bins) — the scored frame and the cut points actually used.
    """
    df['score'] = sc.scorecard_ply(df, card, print_step=0)
    if bins is None:
        _, edges = pd.qcut(df['score'], q=10, retbins=True)
        bins = list(edges)
        bins[0], bins[-1] = -np.inf, np.inf
    df['MODEL_SCORE_BIN'] = pd.cut(df['score'], bins=bins).astype(str).values
    return df, bins
+
def f_calcu_model_psi(df_train, df_test):
    """Model-score PSI between train and test score-bin distributions.

    Both frames must carry a 'MODEL_SCORE_BIN' column with matching bin
    labels.  Returns the per-bin PSI terms together with raw counts and
    ratios; sum the 'psi' column for the overall PSI.
    """
    def _bin_ratio(frame):
        # Count rows per score bin and normalize to a distribution.
        counts = frame.groupby('MODEL_SCORE_BIN')['MODEL_SCORE_BIN'].agg(['count']).sort_index(ascending=True)
        counts['样本数比例'] = (counts['count'] / counts['count'].sum()).round(4)
        return counts

    tmp1 = _bin_ratio(df_train)
    tmp2 = _bin_ratio(df_test)
    psi = ((tmp1['样本数比例'] - tmp2['样本数比例']) * np.log(tmp1['样本数比例'] / tmp2['样本数比例'])).round(4)
    psi = psi.reset_index().rename(columns={"样本数比例": "psi"})
    psi['训练样本数'] = list(tmp1['count'])
    psi['测试样本数'] = list(tmp2['count'])
    psi['训练样本数比例'] = list(tmp1['样本数比例'])
    psi['测试样本数比例'] = list(tmp2['样本数比例'])
    return psi
+
+
+

+ 9 - 0
feature/filter_strategy_base.py

@@ -21,12 +21,21 @@ class FilterStrategyBase(metaclass=abc.ABCMeta):
 
     @abc.abstractmethod
     def filter(self, *args, **kwargs) -> Dict[str, CandidateFeatureEntity]:
+        """
+        特征筛选
+        """
         pass
 
     @abc.abstractmethod
     def feature_generate(self, *args, **kwargs) -> DataPreparedEntity:
+        """
+        特征转换
+        """
         pass
 
     @abc.abstractmethod
     def feature_report(self, *args, **kwargs) -> Dict[str, MetricFucEntity]:
+        """
+        特征报告
+        """
         pass

+ 24 - 17
feature/strategy_iv.py

@@ -14,7 +14,6 @@ import scorecardpy as sc
 import seaborn as sns
 from pandas.core.dtypes.common import is_numeric_dtype
 
-
 from entitys import DataSplitEntity, CandidateFeatureEntity, DataPreparedEntity, DataFeatureEntity, MetricFucEntity
 from init import f_get_save_path
 from .feature_utils import f_judge_monto, f_get_corr, f_get_ivf
@@ -28,6 +27,24 @@ class StrategyIv(FilterStrategyBase):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
def _f_get_iv_by_bins(self, bins) -> pd.DataFrame:
    """Collapse a woebin result dict into a per-variable IV table.

    Each value in `bins` is a per-variable binning DataFrame whose
    'total_iv' column repeats the variable's IV; take its max and round.
    Returns columns ['变量', 'IV'] sorted by IV descending.
    """
    rows = [(var, round(detail['total_iv'].max(), 4)) for var, detail in bins.items()]
    iv = pd.DataFrame(rows, columns=['变量', 'IV'])
    return iv.sort_values('IV', ascending=False).reset_index(drop=True)
+
def _f_get_var_corr_image(self, train_woe):
    """Render the WOE-variable correlation heatmap and save it to disk.

    Args:
        train_woe: DataFrame of WOE-encoded training features.

    Returns:
        Filesystem path of the saved PNG.
    """
    train_corr = f_get_corr(train_woe)
    plt.figure(figsize=(12, 12))
    sns.heatmap(train_corr, vmax=1, square=True, cmap='RdBu', annot=True)
    plt.title('Variables Correlation', fontsize=15)
    plt.yticks(rotation=0)
    plt.xticks(rotation=90)
    path = f_get_save_path("var_corr.png")
    plt.savefig(path)
    # BUG FIX: close the figure after saving — matplotlib keeps figures
    # alive otherwise, leaking memory across repeated report generations.
    plt.close()
    return path
+
     def _f_save_var_trend(self, bins, x_columns_candidate, prefix):
         image_path_list = []
         for k in x_columns_candidate:
@@ -318,7 +335,8 @@ class StrategyIv(FilterStrategyBase):
             test_woe = sc.woebin_ply(test_data[x_columns_candidate], bins)
             test_data_feature = DataFeatureEntity(pd.concat((test_woe, test_data[y_column]), axis=1),
                                                   train_woe.columns.tolist(), y_column)
-        return DataPreparedEntity(train_data_feature, val_data_feature, test_data_feature)
+        return DataPreparedEntity(train_data_feature, val_data_feature, test_data_feature, bins=bins,
+                                  data_split_original=data)
 
     def feature_report(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity], *args,
                        **kwargs) -> Dict[str, MetricFucEntity]:
@@ -333,18 +351,14 @@ class StrategyIv(FilterStrategyBase):
                                                     table_cell_width=3)
         # 变量iv及psi
         train_bins = self._f_get_bins_by_breaks(train_data, candidate_dict)
-        train_iv = {key_: [round(value_['total_iv'].max(), 4)] for key_, value_ in train_bins.items()}
-        train_iv = pd.DataFrame.from_dict(train_iv, orient='index', columns=['IV']).reset_index()
-        train_iv = train_iv.sort_values('IV', ascending=False).reset_index(drop=True)
-        train_iv.columns = ['变量', 'IV']
+        train_iv = self._f_get_iv_by_bins(train_bins)
 
         if test_data is not None and len(test_data) != 0:
             # 计算psi仅需把y改成识别各自训练集测试集即可
             psi_df = pd.concat((train_data, test_data))
             psi_df["#target#"] = [1] * len(train_data) + [0] * len(test_data)
             psi = self._f_get_bins_by_breaks(psi_df, candidate_dict, y_column="#target#")
-            psi = {key_: [round(value_['total_iv'].max(), 4)] for key_, value_ in psi.items()}
-            psi = pd.DataFrame.from_dict(psi, orient='index', columns=['psi']).reset_index()
+            psi = self._f_get_iv_by_bins(psi)
             psi.columns = ['变量', 'psi']
             train_iv = pd.merge(train_iv, psi, on="变量", how="left")
 
@@ -359,16 +373,9 @@ class StrategyIv(FilterStrategyBase):
         metric_value_dict["变量趋势-训练集"] = MetricFucEntity(image_path=image_path_list, image_size=4)
         # 变量有效性
         train_woe = sc.woebin_ply(train_data[x_columns_candidate], train_bins)
-        train_corr = f_get_corr(train_woe)
-        plt.figure(figsize=(12, 12))
-        sns.heatmap(train_corr, vmax=1, square=True, cmap='RdBu', annot=True)
-        plt.title('Variables Correlation', fontsize=15)
-        plt.yticks(rotation=0)
-        plt.xticks(rotation=90)
-        path = f_get_save_path(f"var_corr.png")
-        plt.savefig(path)
+        var_corr_image_path = self._f_get_var_corr_image(train_woe)
         # vif
         vif_df = f_get_ivf(train_woe)
-        metric_value_dict["变量有效性"] = MetricFucEntity(image_path=path, table=vif_df)
+        metric_value_dict["变量有效性"] = MetricFucEntity(image_path=var_corr_image_path, table=vif_df)
 
         return metric_value_dict

+ 0 - 4
model/model_base.py

@@ -25,10 +25,6 @@ class ModelBase(metaclass=abc.ABCMeta):
     def predict_prob(self, x: pd.DataFrame, *args, **kwargs):
         pass
 
-    @abc.abstractmethod
-    def predict(self, x: pd.DataFrame, *args, **kwargs):
-        pass
-
     @abc.abstractmethod
     def export_model_file(self, ):
         pass

+ 62 - 24
model/model_lr.py

@@ -8,10 +8,10 @@ from typing import Dict
 
 import pandas as pd
 import scorecardpy as sc
-from matplotlib import pyplot as plt
 from sklearn.linear_model import LogisticRegression
 
-from entitys import TrainConfigEntity, DataPreparedEntity, MetricFucEntity
+from entitys import TrainConfigEntity, DataPreparedEntity, MetricFucEntity, DataSplitEntity
+from feature import f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
 from init import f_get_save_path
 from .model_base import ModelBase
 
@@ -22,48 +22,86 @@ class ModelLr(ModelBase):
         self.lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
 
     def train(self, data: DataPreparedEntity, *args, **kwargs) -> Dict[str, MetricFucEntity]:
+        bins = kwargs["bins"]
+        data_split_original: DataSplitEntity = kwargs["data_split_original"]
+
+        # woe编码之前的数据
+        train_data_original = data_split_original.train_data
+        test_data_original = data_split_original.test_data
+
         train_data = data.train_data
         train_y = train_data.get_Ydata()
+        y_column = train_data.y_column
+
         test_data = data.test_data
-        test_y = test_data.get_Ydata()
+
         self.lr.fit(train_data.get_Xdata(), train_y)
 
+        metric_value_dict = {}
+        # 评分卡
+        card: Dict = sc.scorecard(bins, self.lr, train_data.x_columns, points0=600, odds0=train_data.get_odds0(),
+                                  pdo=50)
+        card_df = pd.DataFrame(columns=card['basepoints'].keys())
+        for k, v in card.items():
+            card_df = pd.concat((card_df, v))
+        metric_value_dict["评分卡"] = MetricFucEntity(table=card_df, table_font_size=12)
+
+        # 模型系数
+        coef = dict(zip(train_data.x_columns, self.lr.coef_.reshape(-1)))
+        coef_df = pd.DataFrame()
+        coef_df['变量'] = coef.keys()
+        coef_df['变量系数'] = coef.values()
+        metric_value_dict["变量系数"] = MetricFucEntity(table=coef_df, table_font_size=12)
+
+        # 模型ks auc
         train_prob = self.lr.predict_proba(train_data.get_Xdata())[:, 1]
-        test_prob = self.lr.predict_proba(test_data.get_Xdata())[:, 1]
-
         image_path_list = []
         train_perf = sc.perf_eva(train_y, train_prob, title="train", show_plot=True)
         path = f_get_save_path(f"train_perf.png")
         train_perf["pic"].savefig(path)
         image_path_list.append(path)
 
-        test_perf = sc.perf_eva(test_y, test_prob, title="test", show_plot=True)
-        path = f_get_save_path(f"test_perf.png")
-        test_perf["pic"].savefig(path)
-        image_path_list.append(path)
-
         train_auc = train_perf["KS"]
         train_ks = train_perf["AUC"]
 
-        test_auc = test_perf["KS"]
-        test_ks = test_perf["AUC"]
-
-        metric_value_dict = {}
-        df = pd.DataFrame()
-        df["样本集"] = ["训练集", "测试集"]
-        df["AUC"] = [train_auc, test_auc]
-        df["KS"] = [train_ks, test_ks]
-
-        metric_value_dict["模型结果"] = MetricFucEntity(table=df, image_path=image_path_list, image_size=5)
-
+        test_auc = "-"
+        test_ks = "-"
+        if test_data is not None:
+            test_prob = self.lr.predict_proba(test_data.get_Xdata())[:, 1]
+            test_y = test_data.get_Ydata()
+            test_perf = sc.perf_eva(test_y, test_prob, title="test", show_plot=True)
+            path = f_get_save_path(f"test_perf.png")
+            test_perf["pic"].savefig(path)
+            image_path_list.append(path)
+            test_auc = test_perf["KS"]
+            test_ks = test_perf["AUC"]
+
+        df_auc = pd.DataFrame()
+        df_auc["样本集"] = ["训练集", "测试集"]
+        df_auc["AUC"] = [train_auc, test_auc]
+        df_auc["KS"] = [train_ks, test_ks]
+        metric_value_dict["模型结果"] = MetricFucEntity(table=df_auc, image_path=image_path_list, image_size=5,
+                                                    table_font_size=12)
+
+        # 评分卡分箱
+        train_data_original, score_bins = f_get_model_score_bin(train_data_original, card)
+        train_data_gain = f_calcu_model_ks(train_data_original, y_column, sort_ascending=True)
+        metric_value_dict["训练集分数分箱"] = MetricFucEntity(table=train_data_gain, table_font_size=12)
+        if test_data is not None:
+            test_data_original, bins = f_get_model_score_bin(test_data_original, card, score_bins)
+            test_data_gain = f_calcu_model_ks(test_data_original, y_column, sort_ascending=True)
+            metric_value_dict["测试集分数分箱"] = MetricFucEntity(table=test_data_gain,
+                                                           table_font_size=12)
+
+        # 模型分psi
+        model_psi = f_calcu_model_psi(train_data_original, test_data_original)
+        metric_value_dict["模型稳定性"] = MetricFucEntity(value=model_psi["psi"].sum().round(4), table=model_psi,
+                                                     table_font_size=12)
         return metric_value_dict
 
     def predict_prob(self, x: pd.DataFrame, *args, **kwargs):
         return self.lr.predict_proba(x)[:, 1]
 
-    def predict(self, x: pd.DataFrame, *args, **kwargs):
-        return self.lr.predict(x)
-
     def export_model_file(self):
         pass
 

+ 1 - 1
monitor/report_generate.py

@@ -117,7 +117,7 @@ class Report():
                 metric_value = metric_fuc_entity.value
                 if metric_value is None:
                     continue
-                text = text.replace(placeholder, metric_value)
+                text = text.replace(placeholder, str(metric_value))
             # 段落中多个runs时执行,最后一个run改成替换好的文本,其他run置空
             if len(paragraph.runs[:-1]) > 0:
                 for run in paragraph.runs[:-1]:

BIN
template/模型开发报告模板_lr.docx


+ 1 - 1
train_test.py

@@ -16,7 +16,7 @@ if __name__ == "__main__":
 
     dat = sc.germancredit()
     dat["creditability"] = dat["creditability"].apply(lambda x: 1 if x == "bad" else 0)
-    data = DataSplitEntity(dat[:700], None, dat[700:])
+    data = DataSplitEntity(dat[:709], None, dat[709:])
 
     # 特征处理
     filter_strategy_factory = FilterStrategyFactory(

+ 1 - 1
trainer/train.py

@@ -19,7 +19,7 @@ class TrainPipeline():
         self.model = model_clazz(self._train_config)
 
     def train(self, data: DataPreparedEntity) -> Dict[str, MetricFucEntity]:
-        metric_value_dict = self.model.train(data)
+        metric_value_dict = self.model.train(data, *data.args, **data.kwargs)
         return metric_value_dict
 
     def generate_report(self, metric_value_dict: Dict[str, MetricFucEntity]):