
add: OnlineLearning

yq committed 1 month ago
commit 01649d9201

+ 3 - 5
__init__.py

@@ -9,14 +9,12 @@ from os.path import dirname, realpath
 
 sys.path.append(dirname(realpath(__file__)))
 
-from feature import FeatureStrategyFactory
-from model import ModelFactory
+from online_learning import OnlineLearningTrainer
 from pipeline import Pipeline
-
 from data import DataLoaderMysql
 from entitys import DbConfigEntity, DataSplitEntity
 from monitor import MonitorMetric
 from metrics import MetricBase
 
-__all__ = ['MonitorMetric', 'DataLoaderMysql', 'DbConfigEntity', 'MetricBase', 'FeatureStrategyFactory', 'ModelFactory',
-           'Pipeline', 'DataSplitEntity']
+__all__ = ['MonitorMetric', 'MetricBase', 'DataLoaderMysql', 'DbConfigEntity',
+           'DataSplitEntity', 'Pipeline', 'OnlineLearningTrainer']

File diff suppressed because it is too large
+ 30 - 26
easy_ml_demo.ipynb


+ 2 - 1
entitys/__init__.py

@@ -9,9 +9,10 @@ from .db_config_entity import DbConfigEntity
 from .metric_entity import MetricFucResultEntity, MetricConfigEntity
 from .ml_config_entity import MlConfigEntity
 from .monitor_entity import MonitorConfigEntity
+from .ol_config_entity import OnlineLearningConfigEntity
 
 __all__ = ['DataFeatureEntity', 'DbConfigEntity', 'MonitorConfigEntity', 'MetricConfigEntity', 'MetricFucResultEntity',
-           'DataSplitEntity', 'MlConfigEntity']
+           'DataSplitEntity', 'MlConfigEntity', 'OnlineLearningConfigEntity']
 
 if __name__ == "__main__":
     pass

+ 139 - 0
entitys/ol_config_entity.py

@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/1
+@desc: OnlineLearning parameter configuration entity
+"""
+import json
+import os
+from typing import List
+
+from commom import GeneralException, f_get_datetime
+from config import BaseConfig
+from enums import ResultCodesEnum
+from init import warning_ignore
+
+
+class OnlineLearningConfigEntity():
+    def __init__(self,
+                 path_resources: str,
+                 y_column: str,
+                 project_name: str = None,
+                 lr: float = 0.01,
+                 batch_size: int = 64,
+                 epochs: int = 50,
+                 columns_anns: dict = {},
+                 jupyter_print=False,
+                 stress_test=False,
+                 stress_sample_times=100,
+                 stress_bad_rate_list: List[float] = [],
+                 *args, **kwargs):
+
+        self._path_resources = path_resources
+        # target (y) column name
+        self._y_column = y_column
+        # project name; also determines the cache path
+        self._project_name = project_name
+        # learning rate
+        self._lr = lr
+        # number of samples used per model update
+        self._batch_size = batch_size
+        # maximum number of training epochs
+        self._epochs = epochs
+
+        # variable annotations
+        self._columns_anns = columns_anns
+
+        # print outputs when running in Jupyter
+        self._jupyter_print = jupyter_print
+
+        # whether to run the stress test
+        self._stress_test = stress_test
+
+        # number of sampling rounds for the stress test
+        self._stress_sample_times = stress_sample_times
+
+        # bad-rate list for the stress test
+        self._stress_bad_rate_list = stress_bad_rate_list
+
+        if self._project_name is None or len(self._project_name) == 0:
+            self._base_dir = os.path.join(BaseConfig.train_path, f"{f_get_datetime()}")
+        else:
+            self._base_dir = os.path.join(BaseConfig.train_path, self._project_name)
+        os.makedirs(self._base_dir, exist_ok=True)
+        print(f"项目路径:【{self._base_dir}】")
+
+        if self._jupyter_print:
+            warning_ignore()
+
+    @property
+    def path_resources(self):
+        return self._path_resources
+
+    @property
+    def y_column(self):
+        return self._y_column
+
+    @property
+    def lr(self):
+        return self._lr
+
+    @property
+    def batch_size(self):
+        return self._batch_size
+
+    @property
+    def epochs(self):
+        return self._epochs
+
+    @property
+    def columns_anns(self):
+        return self._columns_anns
+
+    @property
+    def jupyter_print(self):
+        return self._jupyter_print
+
+    @property
+    def stress_test(self):
+        return self._stress_test
+
+    @property
+    def stress_sample_times(self):
+        return self._stress_sample_times
+
+    @property
+    def stress_bad_rate_list(self):
+        return self._stress_bad_rate_list
+
+    @staticmethod
+    def from_config(config_path: str):
+        """
+        Build the entity from a configuration file
+        """
+        if os.path.isdir(config_path):
+            config_path = os.path.join(config_path, "olcfg.json")
+
+        if os.path.exists(config_path):
+            with open(config_path, mode="r", encoding="utf-8") as f:
+                j = json.loads(f.read())
+        else:
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"指定配置文件【{config_path}】不存在")
+        print(f"olcfg load from【{config_path}】success. ")
+        return OnlineLearningConfigEntity(**j)
+
+    def config_save(self):
+        path = self.f_get_save_path("olcfg.json")
+        with open(path, mode="w", encoding="utf-8") as f:
+            j = {k.lstrip("_"): v for k, v in self.__dict__.items()}
+            j = json.dumps(j, ensure_ascii=False)
+            f.write(j)
+        print(f"olcfg save to【{path}】success. ")
+
+    def f_get_save_path(self, file_name: str) -> str:
+        path = os.path.join(self._base_dir, file_name)
+        return path
+
+
+if __name__ == "__main__":
+    pass
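For orientation, a minimal usage sketch of this entity, assuming an olcfg.json whose keys mirror the constructor arguments above (the directory path is a hypothetical placeholder, not part of this commit):

    from entitys import OnlineLearningConfigEntity

    # from_config accepts either a directory containing olcfg.json or the json file itself
    cfg = OnlineLearningConfigEntity.from_config("/path/to/saved_project")  # hypothetical path
    print(cfg.lr, cfg.batch_size, cfg.epochs)

    # config_save writes olcfg.json under BaseConfig.train_path/<project_name>
    cfg.config_save()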

+ 0 - 43
entitys/train_config_entity.py

@@ -1,43 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-@author: yq
-@time: 2024/11/1
-@desc: 模型训练超参数配置类
-"""
-import json
-import os
-
-from commom import GeneralException
-from enums import ResultCodesEnum
-
-
-class TrainConfigEntity():
-    def __init__(self, lr: float = None, *args, **kwargs):
-        # 学习率
-        self._lr = lr
-        # 该函数需要去继承
-        self.f_get_save_path = None
-
-    @property
-    def lr(self):
-        return self._lr
-
-    def set_save_path_func(self, f):
-        self.f_get_save_path = f
-
-    @staticmethod
-    def from_config(config_path: str):
-        """
-        从配置文件生成实体类
-        """
-        if os.path.exists(config_path):
-            with open(config_path, mode="r", encoding="utf-8") as f:
-                j = json.loads(f.read())
-        else:
-            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"指配置文件【{config_path}】不存在")
-
-        return TrainConfigEntity(**j)
-
-
-if __name__ == "__main__":
-    pass

+ 2 - 0
enums/constant_enum.py

@@ -10,3 +10,5 @@ from enum import Enum
 class ConstantEnum(Enum):
     SCORE = "SCORE"
     SCORE_BIN = "MODEL_SCORE_BIN"
+    # constant (intercept) term of the LR model
+    INTERCEPT = "const"

+ 1 - 0
enums/context_enum.py

@@ -8,6 +8,7 @@ from enum import Enum
 
 
 class ContextEnum(Enum):
+    PARAM_OPTIMIZED = "param_optimized"
     BIN_INFO_FILTERED = "bin_info_filtered"
     HOMO_BIN_INFO_NUMERIC_SET = "homo_bin_info_numeric_set"
     WOEBIN = "woebin"

+ 2 - 1
feature/__init__.py

@@ -6,5 +6,6 @@
 """
 from .feature_strategy_base import FeatureStrategyBase
 from .feature_strategy_factory import FeatureStrategyFactory
+from .woe.utils import f_woebin_load
 
-__all__ = ['FeatureStrategyFactory', 'FeatureStrategyBase']
+__all__ = ['FeatureStrategyFactory', 'FeatureStrategyBase', 'f_woebin_load']

+ 2 - 13
feature/woe/strategy_woe.py

@@ -5,7 +5,6 @@
 @desc: IV and monotonicity based feature-filtering class
 """
 import json
-import os.path
 from itertools import combinations_with_replacement
 from typing import Dict, Optional, Union
 
@@ -25,7 +24,7 @@ from enums import ContextEnum, ResultCodesEnum
 from feature.feature_strategy_base import FeatureStrategyBase
 from init import context
 from .entity import BinInfo, HomologousBinInfo
-from .utils import f_monto_shift, f_get_corr, f_get_vif, f_format_bin, f_trend_shift, f_get_psi
+from .utils import f_monto_shift, f_get_corr, f_get_vif, f_format_bin, f_trend_shift, f_get_psi, f_woebin_load
 
 
 class StrategyWoe(FeatureStrategyBase):
@@ -481,17 +480,7 @@ class StrategyWoe(FeatureStrategyBase):
         print(f"feature save to【{path}】success. ")
         print(f"feature save to【{path}】success. ")
 
 
     def feature_load(self, path: str, *args, **kwargs):
     def feature_load(self, path: str, *args, **kwargs):
-        if os.path.isdir(path):
-            path = os.path.join(path, "feature.csv")
-        if not os.path.isfile(path) or "feature.csv" not in path:
-            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"特征信息【feature.csv】不存在")
-
-        df_woebin = pd.read_csv(path)
-        variables = df_woebin["variable"].unique().tolist()
-        self.sc_woebin = {}
-        for variable in variables:
-            self.sc_woebin[variable] = df_woebin[df_woebin["variable"] == variable]
-        print(f"feature load from【{path}】success.")
+        self.sc_woebin = f_woebin_load(path)
 
 
     def feature_generate(self, data: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
     def feature_generate(self, data: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
         x_columns = list(self.sc_woebin.keys())
         x_columns = list(self.sc_woebin.keys())

+ 19 - 0
feature/woe/utils.py

@@ -4,12 +4,16 @@
 @time: 2023/12/28
 @desc: feature utility functions
 """
+import os
 from typing import Union
 
 import numpy as np
 import pandas as pd
 from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
 
+from commom import GeneralException
+from enums import ResultCodesEnum
+
 FORMAT_DICT = {
     # ratio type, -1 to 1
     "bin_rate1": np.arange(-1, 1 + 0.1, 0.1),
@@ -133,3 +137,18 @@ def f_get_vif(data: pd.DataFrame) -> Union[pd.DataFrame, None]:
     df_vif["变量"] = [column.replace("_woe", "") for column in data.columns]
     df_vif["变量"] = [column.replace("_woe", "") for column in data.columns]
     df_vif['vif'] = vif_v
     df_vif['vif'] = vif_v
     return df_vif
     return df_vif
+
+
+def f_woebin_load(path: str):
+    if os.path.isdir(path):
+        path = os.path.join(path, "feature.csv")
+    if not os.path.isfile(path) or "feature.csv" not in path:
+        raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"特征信息【feature.csv】不存在")
+
+    df_woebin = pd.read_csv(path)
+    variables = df_woebin["variable"].unique().tolist()
+    sc_woebin = {}
+    for variable in variables:
+        sc_woebin[variable] = df_woebin[df_woebin["variable"] == variable]
+    print(f"feature load from【{path}】success.")
+    return sc_woebin
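A short usage sketch for the extracted helper, assuming a training output directory that contains the feature.csv written by feature_save (paths are hypothetical placeholders):

    import pandas as pd
    import scorecardpy as sc

    from feature import f_woebin_load

    sc_woebin = f_woebin_load("/path/to/train_dir")    # dict: variable name -> its binning rows
    df_raw = pd.read_csv("/path/to/raw_data.csv")      # hypothetical raw data with the same variables
    df_woe = sc.woebin_ply(df_raw[list(sc_woebin.keys())], sc_woebin, print_info=False)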

+ 3 - 2
model/__init__.py

@@ -6,6 +6,7 @@
 """
 """
 from .model_base import ModelBase
 from .model_base import ModelBase
 from .model_factory import ModelFactory
 from .model_factory import ModelFactory
-from .model_utils import f_add_rules
+from .model_utils import f_add_rules, f_get_model_score_bin, f_calcu_model_ks, f_calcu_model_psi, f_stress_test
 
 
-__all__ = ['ModelBase', 'ModelFactory', 'f_add_rules']
+__all__ = ['ModelBase', 'ModelFactory', 'f_add_rules', 'f_get_model_score_bin', 'f_calcu_model_ks', 'f_calcu_model_psi',
+           'f_stress_test']

+ 1 - 1
model/model_lr.py

@@ -79,7 +79,7 @@ class ModelLr(ModelBase):
         self.lr.save(path)
         print(f"model save to【{path}】success. ")
 
-        path = self.ml_config.f_get_save_path(f"coef.dict")
+        path = self.ml_config.f_get_save_path("coef.dict")
         with open(path, mode="w", encoding="utf-8") as f:
             j = json.dumps(self.coef, ensure_ascii=False)
             f.write(j)

+ 3 - 3
model/model_utils.py

@@ -44,12 +44,12 @@ def f_get_model_score_bin(df, score, bins=None):
     return df, bins
 
 
-def f_calcu_model_psi(df_train, df_test):
+def f_calcu_model_psi(df_train, df_test, sort_ascending=True):
     tmp1 = df_train.groupby(ConstantEnum.SCORE_BIN.value)[ConstantEnum.SCORE_BIN.value].agg(['count']).sort_index(
-        ascending=True)
+        ascending=sort_ascending)
     tmp1['样本数比例'] = (tmp1['count'] / tmp1['count'].sum()).round(3)
     tmp2 = df_test.groupby(ConstantEnum.SCORE_BIN.value)[ConstantEnum.SCORE_BIN.value].agg(['count']).sort_index(
-        ascending=True)
+        ascending=sort_ascending)
     tmp2['样本数比例'] = (tmp2['count'] / tmp2['count'].sum()).round(3)
     psi = ((tmp1['样本数比例'] - tmp2['样本数比例']) * np.log(tmp1['样本数比例'] / tmp2['样本数比例'])).round(3)
     psi = psi.reset_index()
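For reference, each score bin contributes (p_train - p_test) * ln(p_train / p_test) to the PSI computed above; a toy check with made-up bin proportions (values are illustrative only):

    import numpy as np

    p_train = np.array([0.30, 0.40, 0.30])  # per-bin sample share in df_train
    p_test = np.array([0.25, 0.45, 0.30])   # per-bin sample share in df_test
    psi_per_bin = (p_train - p_test) * np.log(p_train / p_test)
    print(psi_per_bin.round(3), psi_per_bin.sum().round(3))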

+ 54 - 0
ol_test.py

@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/27
+@desc:
+"""
+import time
+
+from entitys import DataSplitEntity
+from online_learning import OnlineLearningTrainer
+
+
+if __name__ == "__main__":
+    time_now = time.time()
+    import scorecardpy as sc
+
+    # load the demo data
+    dat = sc.germancredit()
+    dat_columns = dat.columns.tolist()
+    dat_columns = [c.replace(".", "_") for c in dat_columns]
+    dat.columns = dat_columns
+
+    dat["creditability"] = dat["creditability"].apply(lambda x: 1 if x == "bad" else 0)
+
+    data = DataSplitEntity(train_data=dat[:709], test_data=dat[709:])
+
+    # training configuration
+    cfg = {
+        # model coefficients, binning information, etc.; see the files under the ol_resources_demo directory
+        # coefficient file coef.dict (if there is a constant/intercept term, use "const" as its key)
+        # binning file feature.csv (numeric bins must be listed in ascending order)
+        "path_resources": "/root/notebook/ol_resources_demo",
+        # project name; determines where outputs are stored
+        "project_name": "OnlineLearningDemo",
+        "y_column": "creditability",
+        # learning rate
+        "lr": 0.01,
+        # batch size per update
+        "batch_size": 64,
+        # number of training epochs
+        "epochs": 20,
+        "jupyter_print": True,
+        # stress test
+        "stress_test": True,
+        # number of sampling rounds for the stress test
+        "stress_sample_times": 10,
+    }
+
+    # train and generate the report
+    trainer = OnlineLearningTrainer(data=data, **cfg)
+    trainer.train()
+    trainer.report()
+
+    print(time.time() - time_now)
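As a companion to the coef.dict / feature.csv comments in the cfg block above, a minimal sketch of the two resource files; the variable names and coefficient values are made up, only the structure follows what OnlineLearningTrainer._init and f_woebin_load expect:

    import json

    # coef.dict: a JSON dict of WOE-feature coefficients; a trailing "_woe" suffix is
    # stripped on load, and the intercept (if any) must be keyed by "const"
    coef = {"const": -2.91, "credit_amount_woe": 0.74, "age_in_years_woe": 0.55}  # illustrative values
    with open("coef.dict", mode="w", encoding="utf-8") as f:
        f.write(json.dumps(coef, ensure_ascii=False))

    # feature.csv is the scorecardpy-style binning table saved by feature_save /
    # OnlineLearningTrainer.save: one row per bin, keyed by the "variable" column,
    # with "breaks" and "is_special_values" read again at load time.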

+ 10 - 0
online_learning/__init__.py

@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2025/2/27
+@desc: 
+"""
+
+from .trainer import OnlineLearningTrainer
+
+__all__ = ['OnlineLearningTrainer']

+ 364 - 0
online_learning/trainer.py

@@ -0,0 +1,364 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2025/2/27
+@desc: 
+"""
+import json
+import math
+import os
+import re
+from os.path import dirname, realpath
+from typing import Dict, List
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import scorecardpy as sc
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from tqdm import tqdm
+
+from commom import GeneralException, f_image_crop_white_borders, f_df_to_image, f_display_title, \
+    f_display_images_by_side
+from entitys import DataSplitEntity, OnlineLearningConfigEntity, MetricFucResultEntity
+from enums import ResultCodesEnum, ConstantEnum, ContextEnum
+from feature import f_woebin_load
+from init import init, context
+from model import f_get_model_score_bin, f_calcu_model_ks, f_stress_test, f_calcu_model_psi
+from monitor import ReportWord
+from .utils import LR
+
+init()
+
+
+class OnlineLearningTrainer:
+    def __init__(self, data: DataSplitEntity = None, ol_config: OnlineLearningConfigEntity = None, *args, **kwargs):
+        if ol_config is not None:
+            self._ol_config = ol_config
+        else:
+            self._ol_config = OnlineLearningConfigEntity(*args, **kwargs)
+        self._data = data
+        self._columns = None
+        self._model_original: LR
+        self._model_optimized: LR
+        self.sc_woebin = None
+        # report template
+        self._template_path = os.path.join(dirname(dirname(realpath(__file__))),
+                                           "./template/OnlineLearning报告模板_lr.docx")
+        self._init(self._ol_config.path_resources)
+
+    def _init(self, path: str):
+        if not os.path.isdir(path):
+            raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"【{path}】不是文件夹")
+
+        path_coef = os.path.join(path, "coef.dict")
+        if not os.path.isfile(path_coef):
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型系数文件【{path_coef}】不存在")
+        with open(path_coef, mode="r", encoding="utf-8") as f:
+            coef = json.loads(f.read())
+            print(f"coef load from【{path_coef}】success.")
+
+        self._columns = list(coef.keys())
+        # sort the columns to avoid potential ordering-related bugs
+        self._columns.sort()
+        weight = [coef[k] for k in self._columns]
+        self._model_original = LR(nn.Parameter(torch.tensor(np.array(weight))))
+        self._model_optimized = LR(nn.Parameter(torch.tensor(np.array(weight))))
+
+        self._columns = [re.sub('_woe$', '', i) for i in self._columns]
+        # drop the constant term, since WOE encoding has no intercept column
+        self._columns_intercept_remove = self._columns.copy()
+        if ConstantEnum.INTERCEPT.value in self._columns_intercept_remove:
+            self._columns_intercept_remove.remove(ConstantEnum.INTERCEPT.value)
+        # WOE-encoded columns carry a _woe suffix
+        self._columns_woe = [f"{i}_woe" for i in self._columns]
+
+        self.sc_woebin = f_woebin_load(path)
+        for k in self._columns_intercept_remove:
+            if k not in self.sc_woebin.keys():
+                raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"模型变量【{k}】在woe特征里不存在")
+
+    def _feature_generate(self, data: pd.DataFrame) -> np.ndarray:
+        data_woe = sc.woebin_ply(data[self._columns_intercept_remove], self.sc_woebin, print_info=False)
+        data_woe[f"{ConstantEnum.INTERCEPT.value}_woe"] = [1] * len(data_woe)
+        return data_woe[self._columns_woe].to_numpy()
+
+    def _f_get_best_model(self, df_param: pd.DataFrame) -> LR:
+        df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
+        print(f"最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
+        weight = list(df_param_sort.iloc[0])
+        weight = nn.Parameter(torch.tensor(np.array(weight[0:-5])))
+        return LR(weight)
+
+    def _f_get_metric_auc_ks(self, model_type: str):
+        def _get_auc_ks(data, title):
+            y = data[self._ol_config.y_column]
+            y_prob = self.prob(data, model)
+            perf = sc.perf_eva(y, y_prob, title=f"{title}", show_plot=True)
+            path = self._ol_config.f_get_save_path(f"perf_{title}.png")
+            perf["pic"].savefig(path)
+            auc = perf["AUC"]
+            ks = perf["KS"]
+            f_image_crop_white_borders(path, path)
+            return auc, ks, path
+
+        train_data = self._data.train_data
+        test_data = self._data.test_data
+        data = pd.concat((train_data, test_data))
+
+        model = self._model_optimized
+        if model_type != "新模型":
+            model = self._model_original
+
+        img_path_auc_ks = []
+        auc, ks, path = _get_auc_ks(data, f"{model_type}-建模数据")
+        img_path_auc_ks.append(path)
+        train_auc, train_ks, path = _get_auc_ks(train_data, f"{model_type}-训练集")
+        img_path_auc_ks.append(path)
+        test_auc, test_ks, path = _get_auc_ks(test_data, f"{model_type}-测试集")
+        img_path_auc_ks.append(path)
+
+        df_auc_ks = pd.DataFrame()
+        df_auc_ks["样本集"] = ["建模数据", "训练集", "测试集"]
+        df_auc_ks["AUC"] = [auc, train_auc, test_auc]
+        df_auc_ks["KS"] = [ks, train_ks, test_ks]
+
+        return MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks, image_size=5, table_font_size=10)
+
+    def _f_get_metric_trend(self, ):
+        train_data = self._data.train_data
+        test_data = self._data.test_data
+        y_column = self._ol_config.y_column
+        data = pd.concat((train_data, test_data))
+
+        # variable trends on the modeling sample
+        breaks_list = {}
+        special_values = {}
+        for column, bin in self.sc_woebin.items():
+            breaks_list[column] = list(bin[bin["is_special_values"] == False]['breaks'])
+            sv = list(bin[bin["is_special_values"] == True]['breaks'])
+            if len(sv) > 0:
+                special_values[column] = sv
+        woebin = sc.woebin(data[self._columns_intercept_remove + [y_column]], y=y_column, breaks_list=breaks_list,
+                           special_values=special_values, print_info=False)
+
+        imgs_path = []
+        for k, df_bin in woebin.items():
+            sc.woebin_plot(df_bin)
+            path = self._ol_config.f_get_save_path(f"trend_{k}.png")
+            plt.savefig(path)
+            imgs_path.append(path)
+        return MetricFucResultEntity(image_path=imgs_path, image_size=4)
+
+    def _f_get_metric_coef(self, ):
+        columns_anns = self._ol_config.columns_anns
+        df = pd.DataFrame()
+        df["变量"] = self._columns
+        df["原变量WOE拟合系数"] = [round(i, 4) for i in self._model_original.linear.weight.tolist()]
+        df["新变量WOE拟合系数"] = [round(i, 4) for i in self._model_optimized.linear.weight.tolist()]
+        anns = [columns_anns.get(column, "-") for column in self._columns]
+        df["释义"] = anns
+        img_path_coef = self._ol_config.f_get_save_path(f"coef.png")
+        f_df_to_image(df, img_path_coef)
+        return MetricFucResultEntity(table=df, image_path=img_path_coef)
+
+    def _f_get_metric_gain(self, model_type: str):
+        train_data = self._data.train_data
+        test_data = self._data.test_data
+        y_column = self._ol_config.y_column
+        data = pd.concat((train_data, test_data))
+
+        model = self._model_optimized
+        if model_type != "新模型":
+            model = self._model_original
+
+        score = self.prob(data, model)
+        score_bin, _ = f_get_model_score_bin(data, score)
+        gain = f_calcu_model_ks(score_bin, y_column, sort_ascending=False)
+        img_path_gain = self._ol_config.f_get_save_path(f"{model_type}-gain.png")
+        f_df_to_image(gain, img_path_gain)
+
+        return MetricFucResultEntity(table=gain, image_path=img_path_gain)
+
+    def _f_get_stress_test(self, ):
+        stress_sample_times = self._ol_config.stress_sample_times
+        stress_bad_rate_list = self._ol_config.stress_bad_rate_list
+        train_data = self._data.train_data
+        test_data = self._data.test_data
+        y_column = self._ol_config.y_column
+        data = pd.concat((train_data, test_data))
+        score = self.prob(data, self._model_optimized)
+        score_bin, _ = f_get_model_score_bin(data, score)
+        df_stress = f_stress_test(score_bin, sample_times=stress_sample_times, bad_rate_list=stress_bad_rate_list,
+                                  target_column=y_column, score_column=ConstantEnum.SCORE.value, sort_ascending=False)
+
+        img_path_stress = self._ol_config.f_get_save_path(f"stress.png")
+        f_df_to_image(df_stress, img_path_stress)
+        return MetricFucResultEntity(table=df_stress, image_path=img_path_stress)
+
+    def prob(self, x: pd.DataFrame, model=None):
+        if model is None:
+            model = self._model_optimized
+        model.eval()
+        with torch.no_grad():
+            x = torch.tensor(self._feature_generate(x), dtype=torch.float64)
+            y_prob = model(x)
+            y_prob = y_prob.detach().numpy()
+            return y_prob
+
+    def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None) -> pd.DataFrame:
+        y1 = self.prob(x1)
+        y2 = self.prob(x2)
+        x1_score_bin, score_bins = f_get_model_score_bin(x1, y1, points)
+        x2_score_bin, _ = f_get_model_score_bin(x2, y2, score_bins)
+        model_psi = f_calcu_model_psi(x1_score_bin, x2_score_bin, sort_ascending=False)
+        print(f"模型psi: {model_psi['psi'].sum()}")
+        return model_psi
+
+    def train(self, ):
+        epochs = self._ol_config.epochs
+        batch_size = self._ol_config.batch_size
+        train_data = self._data.train_data
+        test_data = self._data.test_data
+        train_x = self._feature_generate(train_data)
+        train_y = train_data[self._ol_config.y_column].to_numpy()
+        test_x = torch.tensor(self._feature_generate(test_data), dtype=torch.float64)
+        test_y = test_data[self._ol_config.y_column]
+
+        criterion = nn.BCELoss()
+        optimizer = optim.Adam(self._model_optimized.parameters(), lr=self._ol_config.lr)
+
+        df_param_columns = self._columns + ["auc_test", "ks_test", "epoch", "loss_train", "loss_test"]
+        df_param = pd.DataFrame(columns=df_param_columns)
+
+        for epoch in tqdm(range(epochs)):
+            data_len = len(train_x)
+            loss_train = 0
+            for i in range(math.ceil(data_len / batch_size)):
+                train_x_batch = torch.tensor(train_x[i * batch_size:(i + 1) * batch_size], dtype=torch.float64)
+                train_y_batch = torch.tensor(train_y[i * batch_size:(i + 1) * batch_size], dtype=torch.float64)
+                self._model_optimized.train()
+                optimizer.zero_grad()
+                y_prob = self._model_optimized(train_x_batch)
+                loss = criterion(y_prob, train_y_batch)
+                loss.backward()
+                optimizer.step()
+                loss_train = loss.detach().item()
+            # evaluate on the test set
+            self._model_optimized.eval()
+            with torch.no_grad():
+                y_prob = self._model_optimized(test_x)
+                loss = criterion(y_prob, torch.tensor(test_y.to_numpy(), dtype=torch.float64))
+                loss_test = loss.detach().item()
+                y_prob = y_prob.detach().numpy()
+                perf = sc.perf_eva(test_y, y_prob, show_plot=False)
+                auc = perf["AUC"]
+                ks = perf["KS"]
+                row = self._model_optimized.linear.weight.tolist() + [auc, ks, epoch + 1, loss_train, loss_test]
+                df_param.loc[len(df_param)] = dict(zip(df_param_columns, row))
+                # print(f"epoch:{epoch + 1} auc:{auc} ks:{ks}")
+
+        self._model_optimized = self._f_get_best_model(df_param)
+
+        context.set(ContextEnum.PARAM_OPTIMIZED, df_param)
+
+    def save(self):
+
+        self._ol_config.config_save()
+
+        if self.sc_woebin is None:
+            GeneralException(ResultCodesEnum.NOT_FOUND, message=f"feature不存在")
+        df_woebin = pd.concat(self.sc_woebin.values())
+        path = self._ol_config.f_get_save_path(f"feature.csv")
+        df_woebin.to_csv(path)
+        print(f"feature save to【{path}】success. ")
+
+        if self._model_optimized is None:
+            GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
+        path = self._ol_config.f_get_save_path("coef.dict")
+        with open(path, mode="w", encoding="utf-8") as f:
+            coef = dict(zip(self._columns, self._model_optimized.linear.weight.tolist()))
+            j = json.dumps(coef, ensure_ascii=False)
+            f.write(j)
+        print(f"model save to【{path}】success. ")
+
+    @staticmethod
+    def load(path: str):
+        ol_config = OnlineLearningConfigEntity.from_config(path)
+        ol_config._path_resources = path
+        return OnlineLearningTrainer(ol_config=ol_config)
+
+    def report(self, ):
+
+        metric_value_dict = {}
+        # sample distribution
+        metric_value_dict["样本分布"] = MetricFucResultEntity(table=self._data.get_distribution(self._ol_config.y_column),
+                                                          table_font_size=10, table_cell_width=3)
+
+        # model performance: new vs. original model
+        metric_value_dict[f"模型结果-新模型"] = self._f_get_metric_auc_ks("新模型")
+        metric_value_dict[f"模型结果-原模型"] = self._f_get_metric_auc_ks("原模型")
+
+        # variable trends
+        metric_value_dict["变量趋势-建模数据"] = self._f_get_metric_trend()
+
+        # coefficient comparison
+        metric_value_dict["模型系数"] = self._f_get_metric_coef()
+
+        # score binning
+        metric_value_dict["分数分箱-建模数据-新模型"] = self._f_get_metric_gain("新模型")
+        metric_value_dict["分数分箱-建模数据-原模型"] = self._f_get_metric_gain("原模型")
+
+        # stress test
+        if self._ol_config.stress_test:
+            metric_value_dict["压力测试"] = self._f_get_stress_test()
+
+        if self._ol_config.jupyter_print:
+            self.jupyter_print(metric_value_dict)
+
+        save_path = self._ol_config.f_get_save_path("OnlineLearning报告.docx")
+        ReportWord.generate_report(metric_value_dict, self._template_path, save_path=save_path)
+        print(f"模型报告文件储存路径:{save_path}")
+
+    def jupyter_print(self, metric_value_dict: Dict[str, MetricFucResultEntity]):
+        from IPython import display
+
+        df_param = context.get(ContextEnum.PARAM_OPTIMIZED)
+
+        f_display_title(display, "样本分布")
+        display.display(metric_value_dict["样本分布"].table)
+
+        f_display_title(display, "模型结果")
+        print(f"原模型")
+        display.display(metric_value_dict["模型结果-原模型"].table)
+        f_display_images_by_side(display, metric_value_dict["模型结果-原模型"].image_path)
+        print(f"新模型")
+        display.display(metric_value_dict["模型结果-新模型"].table)
+        f_display_images_by_side(display, metric_value_dict["模型结果-新模型"].image_path)
+
+        f_display_title(display, "模型系数")
+        display.display(metric_value_dict["模型系数"].table)
+
+        f_display_title(display, "分数分箱")
+        print(f"建模数据上分数分箱")
+        print(f"原模型")
+        display.display(metric_value_dict["分数分箱-建模数据-原模型"].table)
+        print(f"新模型")
+        display.display(metric_value_dict["分数分箱-建模数据-新模型"].table)
+
+        f_display_title(display, "变量趋势")
+        print(f"建模数据上变量趋势")
+        f_display_images_by_side(display, metric_value_dict["变量趋势-建模数据"].image_path)
+
+        if "压力测试" in metric_value_dict.keys():
+            f_display_title(display, "压力测试")
+            display.display(metric_value_dict["压力测试"].table)
+
+        f_display_title(display, "系数优化过程")
+        display.display(df_param)
+
+
+if __name__ == "__main__":
+    pass
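A brief sketch of reloading a saved run and checking score drift, assuming save() has already written olcfg.json, coef.dict and feature.csv into the project directory (paths and data files are hypothetical placeholders):

    import pandas as pd

    from online_learning import OnlineLearningTrainer

    # hypothetical saved project directory (BaseConfig.train_path/<project_name>)
    trainer = OnlineLearningTrainer.load("/path/to/train_data/OnlineLearningDemo")

    df_base = pd.read_csv("/path/to/baseline.csv")   # samples with the raw model variables
    df_recent = pd.read_csv("/path/to/recent.csv")

    y_prob = trainer.prob(df_recent)                 # probabilities from the optimized model
    df_psi = trainer.psi(df_base, df_recent)         # PSI between the two score distributions
    print(df_psi["psi"].sum())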

+ 22 - 0
online_learning/utils.py

@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2025/2/27
+@desc: 
+"""
+import torch.nn as nn
+
+
+class LR(nn.Module):
+    def __init__(self, weight: nn.Parameter):
+        super(LR, self).__init__()
+        self.linear = nn.Linear(weight.shape[0], 1, bias=False)
+        self.linear.weight = weight
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        return self.sigmoid(self.linear(x))
+
+
+if __name__ == "__main__":
+    pass
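A minimal sketch of how this wrapper gets instantiated, mirroring the construction in OnlineLearningTrainer._init (coefficient values are illustrative):

    import numpy as np
    import torch
    import torch.nn as nn

    from online_learning.utils import LR

    # one weight per WOE feature, with the intercept carried as an ordinary column
    weights = nn.Parameter(torch.tensor(np.array([-2.91, 0.74, 0.55])))
    model = LR(weights)

    # matching feature row: intercept column fixed to 1.0, then the WOE values
    x = torch.tensor([[1.0, 0.2, -0.4]], dtype=torch.float64)
    print(model(x))  # sigmoid of the weighted sum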

+ 16 - 1
pipeline/pipeline.py

@@ -4,12 +4,14 @@
 @time: 2024/11/1
 @desc: model training pipeline
 """
+from typing import List
+
 import pandas as pd
 
 from entitys import DataSplitEntity, MlConfigEntity, DataFeatureEntity
 from feature import FeatureStrategyFactory, FeatureStrategyBase
 from init import init
-from model import ModelBase, ModelFactory, f_add_rules
+from model import ModelBase, ModelFactory, f_add_rules, f_get_model_score_bin, f_calcu_model_psi
 from monitor import ReportWord
 
 init()
@@ -53,6 +55,19 @@ class Pipeline():
     def score_rule(self, data: pd.DataFrame):
         return self._model.score_rule(data)
 
+    def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None) -> pd.DataFrame:
+        if len(self._ml_config.rules) != 0:
+            y1 = self.score_rule(x1)
+            y2 = self.score_rule(x2)
+        else:
+            y1 = self.score(x1)
+            y2 = self.score(x2)
+        x1_score_bin, score_bins = f_get_model_score_bin(x1, y1, points)
+        x2_score_bin, _ = f_get_model_score_bin(x2, y2, score_bins)
+        model_psi = f_calcu_model_psi(x1_score_bin, x2_score_bin)
+        print(f"模型psi: {model_psi['psi'].sum()}")
+        return model_psi
+
     def report(self, ):
         save_path = self._ml_config.f_get_save_path("模型报告.docx")
         ReportWord.generate_report(self.metric_value_dict, self._model.get_report_template_path(), save_path=save_path)

+ 1 - 0
requirements-analysis.txt

@@ -18,3 +18,4 @@ kaleido==0.2.1
 statsmodels==0.12.2
 beautifulsoup4==4.11.1
 openpyxl==3.0.9
+torch==1.1.0

BIN
template/OnlineLearning报告模板_lr.docx


Some files were not shown because too many files changed in this diff