Quellcode durchsuchen

bugfix: 部分训练报告

yq vor 4 Monaten
Ursprung
Commit
cfab0ca943

+ 1 - 0
.gitignore

@@ -61,3 +61,4 @@ target/
 /logs
 /cache
 */image
+*/~$*

+ 13 - 1
config/data_process_config_template.json

@@ -1,4 +1,16 @@
 {
+  "sample_rate": 0.01,
+  "bin_search_interval": 0.05,
+  "feature_search_strategy": "iv",
+  "x_candidate_num": 10,
+  "special_values": null,
   "y_column": "creditability",
-  "bin_search_interval": 0.05
+  "x_columns_candidate": [
+    "duration_in_month",
+    "credit_amount",
+    "age_in_years",
+    "purpose",
+    "credit_history",
+    "savings_account_and_bonds"
+  ]
 }

+ 25 - 2
entitys/data_feaure_entity.py

@@ -7,6 +7,8 @@
 
 import pandas as pd
 
+from commom import f_format_float
+
 
 class CandidateFeatureEntity():
     """
@@ -35,6 +37,7 @@ class DataFeatureEntity():
     """
     数据特征准备完毕
     """
+
     def __init__(self, data: pd.DataFrame, x_columns: list, y_column: str):
         self._data = data
         self._x_columns = x_columns
@@ -63,6 +66,7 @@ class DataPreparedEntity():
     """
     训练集测试集特征准备完毕
     """
+
     def __init__(self, train_data: DataFeatureEntity, val_data: DataFeatureEntity, test_data: DataFeatureEntity):
         self._train_data = train_data
         self._val_data = val_data
@@ -80,11 +84,11 @@ class DataPreparedEntity():
     def test_data(self):
         return self._test_data
 
-
 class DataSplitEntity():
     """
     初始数据训练集测试集划分
     """
+
     def __init__(self, train_data: pd.DataFrame, val_data: pd.DataFrame, test_data: pd.DataFrame):
         self._train_data = train_data
         self._val_data = val_data
@@ -102,6 +106,25 @@ class DataSplitEntity():
     def test_data(self):
         return self._test_data
 
-
+    def get_distribution(self, y_column) -> pd.DataFrame:
+        """
+        Summarise sample counts and bad-sample rates for the train and test splits.
+
+        A row counts as "bad" when its ``y_column`` value equals 1. The validation
+        split is not included. A split that is ``None`` or has no rows is reported
+        with zero counts and a zero rate instead of raising.
+
+        :param y_column: name of the target column present in both splits
+        :return: DataFrame with one row per split plus a total row
+        """
+
+        def _count(split):
+            # Returns (row count, bad-row count); a missing split counts as empty.
+            if split is None or len(split) == 0:
+                return 0, 0
+            return len(split), int((split[y_column] == 1).sum())
+
+        def _percent(part, whole):
+            # An empty denominator would raise ZeroDivisionError, so report 0 instead.
+            ratio = part / whole * 100 if whole else 0.0
+            return f"{f_format_float(ratio)}%"
+
+        train_data_len, train_bad_len = _count(self._train_data)
+        test_data_len, test_bad_len = _count(self._test_data)
+        total = train_data_len + test_data_len
+        bad_total = train_bad_len + test_bad_len
+
+        df = pd.DataFrame()
+        df["样本"] = ["训练集", "测试集", "合计"]
+        df["样本数"] = [train_data_len, test_data_len, total]
+        df["样本占比"] = [_percent(train_data_len, total), _percent(test_data_len, total), "100%"]
+        df["坏样本数"] = [train_bad_len, test_bad_len, bad_total]
+        df["坏样本比例"] = [_percent(train_bad_len, train_data_len),
+                       _percent(test_bad_len, test_data_len),
+                       _percent(bad_total, total)]
+
+        return df
+    
 if __name__ == "__main__":
     pass

+ 10 - 1
entitys/data_process_config_entity.py

@@ -8,7 +8,8 @@ import json
 import os
 from typing import List, Union
 
-from commom import GeneralException
+from commom import GeneralException, f_get_datetime
+from config import BaseConfig
 from enums import ResultCodesEnum
 
 
@@ -17,6 +18,10 @@ class DataProcessConfigEntity():
                  split_method: str = None, feature_search_strategy: str = 'iv', bin_search_interval: float = 0.05,
                  iv_threshold: float = 0.03, iv_threshold_wide: float = 0.05, corr_threshold: float = 0.4,
                  sample_rate: float = 0.1, x_candidate_num: int = 10, special_values: Union[dict, list] = None):
+
+        self.save_path = os.path.join(BaseConfig.train_path, f"{f_get_datetime()}")
+        os.makedirs(self.save_path, exist_ok=True)
+
         # 定义y变量
         self._y_column = y_column
 
@@ -120,6 +125,10 @@ class DataProcessConfigEntity():
 
         return DataProcessConfigEntity(**j)
 
+    def _get_save_path(self, file_name: str) -> str:
+        """
+        Return ``file_name`` joined onto this config's ``save_path`` directory.
+        """
+        return os.path.join(self.save_path, file_name)
+
 
 if __name__ == "__main__":
     pass

+ 8 - 1
entitys/metric_entity.py

@@ -4,6 +4,8 @@
 @time: 2024/11/1
 @desc:  常用指标实体集合
 """
+from typing import Union
+
 import pandas as pd
 
 from commom import f_format_float
@@ -49,10 +51,12 @@ class MetricFucEntity():
     指标计算函数结果类
     """
 
-    def __init__(self, table: pd.DataFrame = None, value: str = None, image_path: str = None):
+    def __init__(self, table: pd.DataFrame = None, value: str = None, image_path: Union[str, list] = None,
+                 image_size: int = 6):
         self._table = table
         self._value = value
         self._image_path = image_path
+        self._image_size = image_size
 
     @property
     def table(self) -> pd.DataFrame:
@@ -66,6 +70,9 @@ class MetricFucEntity():
     def image_path(self):
         return self._image_path
 
+    @property
+    def image_size(self):
+        """
+        Image size stored at construction time (defaults to 6).
+
+        The report generator passes this value to ``Inches(...)`` as the picture
+        width when it inserts the image(s).
+        """
+        return self._image_size
 
 if __name__ == "__main__":
     pass

+ 8 - 1
entitys/train_config_entity.py

@@ -8,7 +8,7 @@ import json
 import os
 
 from commom import GeneralException
-from enums import ResultCodesEnum
+from enums import ResultCodesEnum, ModelEnum
 
 
 class TrainConfigEntity():
@@ -17,6 +17,13 @@ class TrainConfigEntity():
         self._model_type = model_type
         # 学习率
         self._lr = lr
+        # 报告模板
+        if model_type == ModelEnum.LR.value:
+            self._template_path = "./template/模型开发报告模板_lr.docx"
+
+    @property
+    def template_path(self):
+        """
+        Path of the report template selected in ``__init__``.
+
+        NOTE(review): the visible part of ``__init__`` assigns ``_template_path`` only
+        when ``model_type == ModelEnum.LR.value``; for any other model type this
+        getter would presumably raise ``AttributeError`` - confirm and add a default.
+        """
+        return self._template_path
 
     @property
     def model_type(self):

+ 6 - 2
feature/filter_strategy_base.py

@@ -5,9 +5,9 @@
 @desc: 特征筛选基类
 """
 import abc
-from typing import Dict
+from typing import Dict, List
 
-from entitys import DataProcessConfigEntity, DataPreparedEntity, CandidateFeatureEntity
+from entitys import DataProcessConfigEntity, DataPreparedEntity, CandidateFeatureEntity, MetricFucEntity
 
 
 class FilterStrategyBase(metaclass=abc.ABCMeta):
@@ -26,3 +26,7 @@ class FilterStrategyBase(metaclass=abc.ABCMeta):
     @abc.abstractmethod
     def feature_generate(self, *args, **kwargs) -> DataPreparedEntity:
         pass
+
+    @abc.abstractmethod
+    def feature_report(self, *args, **kwargs) -> Dict[str, MetricFucEntity]:
+        """
+        Produce the report metrics for this filter strategy.
+
+        Abstract: concrete strategies must override it.
+
+        :return: mapping of metric name to the ``MetricFucEntity`` holding its result
+        """
+        pass

+ 70 - 3
feature/strategy_iv.py

@@ -7,23 +7,40 @@
 from itertools import combinations_with_replacement
 from typing import List, Dict
 
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import scorecardpy as sc
+import seaborn as sns
 from pandas.core.dtypes.common import is_numeric_dtype
 
-from entitys import DataSplitEntity, CandidateFeatureEntity, DataPreparedEntity, DataFeatureEntity
+from entitys import DataSplitEntity, CandidateFeatureEntity, DataPreparedEntity, DataFeatureEntity, MetricFucEntity
 from .feature_utils import f_judge_monto, f_get_corr
 from .filter_strategy_base import FilterStrategyBase
 
+plt.rcParams['figure.figsize'] = (8, 8)
+
 
 class StrategyIv(FilterStrategyBase):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-    def _f_get_bins_by_breaks(self, data: pd.DataFrame, candidate_dict: Dict[str, CandidateFeatureEntity]):
-        y_column = self.data_process_config.y_column
+    def _f_save_var_trend(self, bins, x_columns_candidate, prefix):
+        """
+        Plot the binning of each candidate variable and save it as a PNG.
+
+        :param bins: mapping of variable name to its binning DataFrame
+        :param x_columns_candidate: variable names to plot, in output order
+        :param prefix: file-name prefix; each image is saved as ``{prefix}_{variable}.png``
+        :return: list of saved image paths, in the order of ``x_columns_candidate``
+        """
+        image_path_list = []
+        for k in x_columns_candidate:
+            bin_df = bins[k]
+            # bin_df["bin"] = bin_df["bin"].apply(lambda x: re.sub(r"(\d+\.\d+)",
+            #                                                      lambda m: "{:.2f}".format(float(m.group(0))), x))
+            sc.woebin_plot(bin_df)
+            path = self.data_process_config._get_save_path(f"{prefix}_{k}.png")
+            plt.savefig(path)
+            # Close the figure that was just saved; otherwise one open figure
+            # accumulates per variable for the lifetime of the process.
+            plt.close()
+            image_path_list.append(path)
+        return image_path_list
+
+    def _f_get_bins_by_breaks(self, data: pd.DataFrame, candidate_dict: Dict[str, CandidateFeatureEntity],
+                              y_column=None):
+        y_column = self.data_process_config.y_column if y_column is None else y_column
         special_values = self.data_process_config.special_values
         x_columns_candidate = list(candidate_dict.keys())
         breaks_list = {}
@@ -300,3 +317,53 @@ class StrategyIv(FilterStrategyBase):
             test_data_feature = DataFeatureEntity(pd.concat((test_woe, test_data[y_column]), axis=1),
                                                   train_woe.columns.tolist(), y_column)
         return DataPreparedEntity(train_data_feature, val_data_feature, test_data_feature)
+
+    def feature_report(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity], *args,
+                       **kwargs) -> Dict[str, MetricFucEntity]:
+        """
+        Build the feature-related metrics of the training report.
+
+        The result always contains the sample distribution table, the per-variable
+        IV table, the train-set trend images and the correlation heatmap. When the
+        test split is present and non-empty, a ``psi`` column is merged into the IV
+        table and the test-set trend images are added as well.
+
+        :param data: train/val/test split of the raw data
+        :param candidate_dict: candidate features keyed by variable name
+        :return: mapping of report metric name to its ``MetricFucEntity``
+        """
+        y_column = self.data_process_config.y_column
+        x_columns_candidate = list(candidate_dict.keys())
+        train_data = data.train_data
+        test_data = data.test_data
+
+        metric_value_dict = {}
+        # Sample distribution
+        metric_value_dict["样本分布"] = MetricFucEntity(table=data.get_distribution(y_column))
+        # Variable IV and PSI
+        train_bins = self._f_get_bins_by_breaks(train_data, candidate_dict)
+        train_iv = {key_: [round(value_['total_iv'].max(), 4)] for key_, value_ in train_bins.items()}
+        train_iv = pd.DataFrame.from_dict(train_iv, orient='index', columns=['IV']).reset_index()
+        train_iv = train_iv.sort_values('IV', ascending=False).reset_index(drop=True)
+        train_iv.columns = ['变量', 'IV']
+
+        if test_data is not None and len(test_data) != 0:
+            # PSI reuses the IV computation: the target is replaced by a flag that
+            # marks each row as train (1) or test (0).
+            psi_df = pd.concat((train_data, test_data))
+            psi_df["#target#"] = [1] * len(train_data) + [0] * len(test_data)
+            psi = self._f_get_bins_by_breaks(psi_df, candidate_dict, y_column="#target#")
+            psi = {key_: [round(value_['total_iv'].max(), 4)] for key_, value_ in psi.items()}
+            psi = pd.DataFrame.from_dict(psi, orient='index', columns=['psi']).reset_index()
+            psi.columns = ['变量', 'psi']
+            train_iv = pd.merge(train_iv, psi, on="变量", how="left")
+
+            # Variable trend - test set
+            test_bins = self._f_get_bins_by_breaks(test_data, candidate_dict)
+            image_path_list = self._f_save_var_trend(test_bins, x_columns_candidate, "test")
+            metric_value_dict["变量趋势-测试集"] = MetricFucEntity(image_path=image_path_list, image_size=4)
+
+        metric_value_dict["变量iv"] = MetricFucEntity(table=train_iv)
+        # Variable trend - train set
+        image_path_list = self._f_save_var_trend(train_bins, x_columns_candidate, "train")
+        metric_value_dict["变量趋势-训练集"] = MetricFucEntity(image_path=image_path_list, image_size=4)
+        # Variable validity: correlation heatmap of the WOE-encoded train features
+        train_woe = sc.woebin_ply(train_data[x_columns_candidate], train_bins)
+        train_corr = f_get_corr(train_woe)
+        fig = plt.figure(figsize=(12, 12))
+        sns.heatmap(train_corr, vmax=1, square=True, cmap='RdBu', annot=True)
+        plt.title('Variables Correlation', fontsize=15)
+        plt.yticks(rotation=0)
+        plt.xticks(rotation=90)
+        path = self.data_process_config._get_save_path("var_corr.png")
+        plt.savefig(path)
+        # The image is on disk now; close the figure so it does not stay open.
+        plt.close(fig)
+        metric_value_dict["变量有效性"] = MetricFucEntity(image_path=path)
+
+        return metric_value_dict

+ 11 - 4
monitor/report_generate.py

@@ -160,21 +160,26 @@ class Report():
             for metric_code, metric_fuc_entity in metric_value_dict.items():
                 placeholder = Report._get_placeholder(PlaceholderPrefixEnum.IMAGE, metric_code)
                 image_path = metric_fuc_entity.image_path
+                image_size = metric_fuc_entity.image_size
                 if image_path is None:
                     continue
                 if not placeholder in paragraph.text:
                     continue
-                if not os.path.exists(image_path):
-                    raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"文件【{image_path}】不存在")
+                if isinstance(image_path, str):
+                    image_path = [image_path]
+                for path in image_path:
+                    if not os.path.exists(path):
+                        raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"文件【{image_path}】不存在")
                 # 清除占位符
                 for run in paragraph.runs:
                     if placeholder not in run.text:
                         continue
                     run.text = run.text.replace(placeholder, "")
-                    run.add_picture(image_path, width=Inches(6))
+                    for path in image_path:
+                        run.add_picture(path, width=Inches(image_size))
 
     @staticmethod
-    def generate_report(metric_value_dict: Dict[str, MetricFucEntity], template_path: str):
+    def generate_report(metric_value_dict: Dict[str, MetricFucEntity], template_path: str, path=None):
         if os.path.exists(template_path):
             doc = Document(template_path)
         else:
@@ -184,6 +189,8 @@ class Report():
         Report._fill_table_placeholder(doc, metric_value_dict)
         Report._fill_image_placeholder(doc, metric_value_dict)
         new_path = template_path.replace(".docx", f"{f_get_datetime()}.docx")
+        if path is not None:
+            new_path = path
         doc.save(f"./{new_path}")
 
 

+ 1 - 1
requirements.txt

@@ -2,4 +2,4 @@ pymysql==1.0.2
 python-docx==0.8.11
 xlrd==1.2.0
 scorecardpy==0.1.9.7
-toad==0.0.64
+toad==0.0.64

BIN
template/模型开发报告模板_lr.docx


+ 5 - 1
train_test.py

@@ -18,13 +18,17 @@ if __name__ == "__main__":
     dat["creditability"] = dat["creditability"].apply(lambda x: 1 if x == "bad" else 0)
     data = DataSplitEntity(dat[:700], None, dat[700:])
 
+    # 特征处理
     filter_strategy_factory = FilterStrategyFactory(
         DataProcessConfigEntity.from_config('./config/data_process_config_template.json'))
     strategy = filter_strategy_factory.get_strategy()
     candidate_feature = strategy.filter(data)
     data_prepared = strategy.feature_generate(data, candidate_feature)
-
+    # 训练
     train_pipeline = TrainPipeline(TrainConfigEntity.from_config('./config/train_config_template.json'))
     train_pipeline.train(data_prepared)
+    # 报告生成
+    metric_value_dict = strategy.feature_report(data, candidate_feature)
+    train_pipeline.generate_report(metric_value_dict)
 
     print(time.time() - time_now)

+ 6 - 2
trainer/train.py

@@ -4,8 +4,11 @@
 @time: 2024/11/1
 @desc: 模型训练管道
 """
-from entitys import DataPreparedEntity, TrainConfigEntity
+from typing import Dict
+
+from entitys import DataPreparedEntity, TrainConfigEntity, MetricFucEntity
 from model import f_get_model
+from monitor.report_generate import Report
 
 
 class TrainPipeline():
@@ -18,7 +21,8 @@ class TrainPipeline():
         metric_train = self.model.train(data)
         print(metric_train)
 
-    def generate_report(self, data: DataPreparedEntity):
+    def generate_report(self, metric_value_dict: Dict[str, MetricFucEntity]):
+        """
+        Render the model report from the training config's template.
+
+        Delegates to ``Report.generate_report`` with the fixed output path "模型报告.docx".
+
+        :param metric_value_dict: mapping of report metric name to its ``MetricFucEntity``
+        """
+        Report.generate_report(metric_value_dict, self._train_config.template_path, path="模型报告.docx")
         pass