Browse Source

add: OnlineLearning

yq cách đây 1 tháng
mục cha
commit
01649d9201

+ 3 - 5
__init__.py

@@ -9,14 +9,12 @@ from os.path import dirname, realpath
 
 sys.path.append(dirname(realpath(__file__)))
 
-from feature import FeatureStrategyFactory
-from model import ModelFactory
+from online_learning import OnlineLearningTrainer
 from pipeline import Pipeline
-
 from data import DataLoaderMysql
 from entitys import DbConfigEntity, DataSplitEntity
 from monitor import MonitorMetric
 from metrics import MetricBase
 
-__all__ = ['MonitorMetric', 'DataLoaderMysql', 'DbConfigEntity', 'MetricBase', 'FeatureStrategyFactory', 'ModelFactory',
-           'Pipeline', 'DataSplitEntity']
+__all__ = ['MonitorMetric', 'MetricBase', 'DataLoaderMysql', 'DbConfigEntity',
+           'DataSplitEntity', 'Pipeline', 'OnlineLearningTrainer']

Những thay đổi đã bị hủy bỏ vì nó quá lớn
+ 30 - 26
easy_ml_demo.ipynb


+ 2 - 1
entitys/__init__.py

@@ -9,9 +9,10 @@ from .db_config_entity import DbConfigEntity
 from .metric_entity import MetricFucResultEntity, MetricConfigEntity
 from .ml_config_entity import MlConfigEntity
 from .monitor_entity import MonitorConfigEntity
+from .ol_config_entity import OnlineLearningConfigEntity
 
 __all__ = ['DataFeatureEntity', 'DbConfigEntity', 'MonitorConfigEntity', 'MetricConfigEntity', 'MetricFucResultEntity',
-           'DataSplitEntity', 'MlConfigEntity']
+           'DataSplitEntity', 'MlConfigEntity', 'OnlineLearningConfigEntity']
 
 if __name__ == "__main__":
     pass

+ 139 - 0
entitys/ol_config_entity.py

@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/1
+@desc: OnlineLearning parameter configuration entity.
+"""
+import json
+import os
+from typing import List
+
+from commom import GeneralException, f_get_datetime
+from config import BaseConfig
+from enums import ResultCodesEnum
+from init import warning_ignore
+
+
+class OnlineLearningConfigEntity():
+    """Holds the hyper-parameters and paths used by the online-learning trainer."""
+
+    def __init__(self,
+                 path_resources: str,
+                 y_column: str,
+                 project_name: str = None,
+                 lr: float = 0.01,
+                 batch_size: int = 64,
+                 epochs: int = 50,
+                 columns_anns: dict = None,
+                 jupyter_print=False,
+                 stress_test=False,
+                 stress_sample_times=100,
+                 stress_bad_rate_list: List[float] = None,
+                 *args, **kwargs):
+
+        # Directory holding the model resources (coef.dict / feature.csv).
+        self._path_resources = path_resources
+        # Name of the target (label) column.
+        self._y_column = y_column
+        # Project name; determines the cache/output directory.
+        self._project_name = project_name
+        # Learning rate.
+        self._lr = lr
+        # Number of samples per model update step.
+        self._batch_size = batch_size
+        # Maximum number of training epochs.
+        self._epochs = epochs
+
+        # Variable annotations (column name -> human-readable description).
+        # FIX: was a shared mutable default ({}); use a None sentinel instead.
+        self._columns_anns = {} if columns_anns is None else columns_anns
+
+        # Whether to print report content when running inside Jupyter.
+        self._jupyter_print = jupyter_print
+
+        # Whether to run the stress test.
+        self._stress_test = stress_test
+
+        # Number of sampling rounds for the stress test.
+        self._stress_sample_times = stress_sample_times
+
+        # Bad-rate levels to simulate during the stress test.
+        # FIX: was a shared mutable default ([]); use a None sentinel instead.
+        self._stress_bad_rate_list = [] if stress_bad_rate_list is None else stress_bad_rate_list
+
+        if self._project_name is None or len(self._project_name) == 0:
+            self._base_dir = os.path.join(BaseConfig.train_path, f"{f_get_datetime()}")
+        else:
+            self._base_dir = os.path.join(BaseConfig.train_path, self._project_name)
+        os.makedirs(self._base_dir, exist_ok=True)
+        print(f"项目路径:【{self._base_dir}】")
+
+        if self._jupyter_print:
+            warning_ignore()
+
+    @property
+    def path_resources(self):
+        return self._path_resources
+
+    @property
+    def y_column(self):
+        return self._y_column
+
+    @property
+    def lr(self):
+        return self._lr
+
+    @property
+    def batch_size(self):
+        return self._batch_size
+
+    @property
+    def epochs(self):
+        return self._epochs
+
+    @property
+    def columns_anns(self):
+        return self._columns_anns
+
+    @property
+    def jupyter_print(self):
+        return self._jupyter_print
+
+    @property
+    def stress_test(self):
+        return self._stress_test
+
+    @property
+    def stress_sample_times(self):
+        return self._stress_sample_times
+
+    @property
+    def stress_bad_rate_list(self):
+        return self._stress_bad_rate_list
+
+    @staticmethod
+    def from_config(config_path: str):
+        """
+        Build a config entity from a json config file (or a directory
+        containing ``olcfg.json``). Raises GeneralException if missing.
+        """
+        if os.path.isdir(config_path):
+            config_path = os.path.join(config_path, "olcfg.json")
+
+        if os.path.exists(config_path):
+            with open(config_path, mode="r", encoding="utf-8") as f:
+                j = json.loads(f.read())
+        else:
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"指配置文件【{config_path}】不存在")
+        print(f"olcfg load from【{config_path}】success. ")
+        return OnlineLearningConfigEntity(**j)
+
+    def config_save(self):
+        """Serialize all attributes (private-prefix stripped) to olcfg.json."""
+        path = self.f_get_save_path("olcfg.json")
+        with open(path, mode="w", encoding="utf-8") as f:
+            # All attributes use a single leading underscore, so lstrip is safe here.
+            j = {k.lstrip("_"): v for k, v in self.__dict__.items()}
+            j = json.dumps(j, ensure_ascii=False)
+            f.write(j)
+        print(f"olcfg save to【{path}】success. ")
+
+    def f_get_save_path(self, file_name: str) -> str:
+        """Return an output path inside this project's base directory."""
+        path = os.path.join(self._base_dir, file_name)
+        return path
+
+
+if __name__ == "__main__":
+    pass

+ 0 - 43
entitys/train_config_entity.py

@@ -1,43 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-@author: yq
-@time: 2024/11/1
-@desc: 模型训练超参数配置类
-"""
-import json
-import os
-
-from commom import GeneralException
-from enums import ResultCodesEnum
-
-
-class TrainConfigEntity():
-    def __init__(self, lr: float = None, *args, **kwargs):
-        # 学习率
-        self._lr = lr
-        # 该函数需要去继承
-        self.f_get_save_path = None
-
-    @property
-    def lr(self):
-        return self._lr
-
-    def set_save_path_func(self, f):
-        self.f_get_save_path = f
-
-    @staticmethod
-    def from_config(config_path: str):
-        """
-        从配置文件生成实体类
-        """
-        if os.path.exists(config_path):
-            with open(config_path, mode="r", encoding="utf-8") as f:
-                j = json.loads(f.read())
-        else:
-            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"指配置文件【{config_path}】不存在")
-
-        return TrainConfigEntity(**j)
-
-
-if __name__ == "__main__":
-    pass

+ 2 - 0
enums/constant_enum.py

@@ -10,3 +10,5 @@ from enum import Enum
 class ConstantEnum(Enum):
     SCORE = "SCORE"
     SCORE_BIN = "MODEL_SCORE_BIN"
+    # lr模型常数项
+    INTERCEPT = "const"

+ 1 - 0
enums/context_enum.py

@@ -8,6 +8,7 @@ from enum import Enum
 
 
 class ContextEnum(Enum):
+    PARAM_OPTIMIZED = "param_optimized"
     BIN_INFO_FILTERED = "bin_info_filtered"
     HOMO_BIN_INFO_NUMERIC_SET = "homo_bin_info_numeric_set"
     WOEBIN = "woebin"

+ 2 - 1
feature/__init__.py

@@ -6,5 +6,6 @@
 """
 from .feature_strategy_base import FeatureStrategyBase
 from .feature_strategy_factory import FeatureStrategyFactory
+from .woe.utils import f_woebin_load
 
-__all__ = ['FeatureStrategyFactory', 'FeatureStrategyBase']
+__all__ = ['FeatureStrategyFactory', 'FeatureStrategyBase', 'f_woebin_load']

+ 2 - 13
feature/woe/strategy_woe.py

@@ -5,7 +5,6 @@
 @desc: iv值及单调性筛选类
 """
 import json
-import os.path
 from itertools import combinations_with_replacement
 from typing import Dict, Optional, Union
 
@@ -25,7 +24,7 @@ from enums import ContextEnum, ResultCodesEnum
 from feature.feature_strategy_base import FeatureStrategyBase
 from init import context
 from .entity import BinInfo, HomologousBinInfo
-from .utils import f_monto_shift, f_get_corr, f_get_vif, f_format_bin, f_trend_shift, f_get_psi
+from .utils import f_monto_shift, f_get_corr, f_get_vif, f_format_bin, f_trend_shift, f_get_psi, f_woebin_load
 
 
 class StrategyWoe(FeatureStrategyBase):
@@ -481,17 +480,7 @@ class StrategyWoe(FeatureStrategyBase):
         print(f"feature save to【{path}】success. ")
 
     def feature_load(self, path: str, *args, **kwargs):
-        if os.path.isdir(path):
-            path = os.path.join(path, "feature.csv")
-        if not os.path.isfile(path) or "feature.csv" not in path:
-            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"特征信息【feature.csv】不存在")
-
-        df_woebin = pd.read_csv(path)
-        variables = df_woebin["variable"].unique().tolist()
-        self.sc_woebin = {}
-        for variable in variables:
-            self.sc_woebin[variable] = df_woebin[df_woebin["variable"] == variable]
-        print(f"feature load from【{path}】success.")
+        self.sc_woebin = f_woebin_load(path)
 
     def feature_generate(self, data: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
         x_columns = list(self.sc_woebin.keys())

+ 19 - 0
feature/woe/utils.py

@@ -4,12 +4,16 @@
 @time: 2023/12/28
 @desc:  特征工具类
 """
+import os
 from typing import Union
 
 import numpy as np
 import pandas as pd
 from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
 
+from commom import GeneralException
+from enums import ResultCodesEnum
+
 FORMAT_DICT = {
     # 比例类 -1 - 1
     "bin_rate1": np.arange(-1, 1 + 0.1, 0.1),
@@ -133,3 +137,18 @@ def f_get_vif(data: pd.DataFrame) -> Union[pd.DataFrame, None]:
     df_vif["变量"] = [column.replace("_woe", "") for column in data.columns]
     df_vif['vif'] = vif_v
     return df_vif
+
+
+def f_woebin_load(path: str):
+    """Load per-variable WOE bin definitions from ``feature.csv``.
+
+    ``path`` may be the csv file itself or a directory containing it.
+    Returns a dict mapping variable name -> bin-definition DataFrame.
+    Raises GeneralException when the file cannot be found.
+    """
+    if os.path.isdir(path):
+        path = os.path.join(path, "feature.csv")
+    if not os.path.isfile(path) or "feature.csv" not in path:
+        raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"特征信息【feature.csv】不存在")
+
+    df_woebin = pd.read_csv(path)
+    variables = df_woebin["variable"].unique().tolist()
+    # Split the flat csv into one DataFrame per variable, keyed by variable name.
+    sc_woebin = {}
+    for variable in variables:
+        sc_woebin[variable] = df_woebin[df_woebin["variable"] == variable]
+    print(f"feature load from【{path}】success.")
+    return sc_woebin

+ 3 - 2
model/__init__.py

@@ -6,6 +6,7 @@
 """
 from .model_base import ModelBase
 from .model_factory import ModelFactory
-from .model_utils import f_add_rules
+from .model_utils import f_add_rules, f_get_model_score_bin, f_calcu_model_ks, f_calcu_model_psi, f_stress_test
 
-__all__ = ['ModelBase', 'ModelFactory', 'f_add_rules']
+__all__ = ['ModelBase', 'ModelFactory', 'f_add_rules', 'f_get_model_score_bin', 'f_calcu_model_ks', 'f_calcu_model_psi',
+           'f_stress_test']

+ 1 - 1
model/model_lr.py

@@ -79,7 +79,7 @@ class ModelLr(ModelBase):
         self.lr.save(path)
         print(f"model save to【{path}】success. ")
 
-        path = self.ml_config.f_get_save_path(f"coef.dict")
+        path = self.ml_config.f_get_save_path("coef.dict")
         with open(path, mode="w", encoding="utf-8") as f:
             j = json.dumps(self.coef, ensure_ascii=False)
             f.write(j)

+ 3 - 3
model/model_utils.py

@@ -44,12 +44,12 @@ def f_get_model_score_bin(df, score, bins=None):
     return df, bins
 
 
-def f_calcu_model_psi(df_train, df_test):
+def f_calcu_model_psi(df_train, df_test, sort_ascending=True):
     tmp1 = df_train.groupby(ConstantEnum.SCORE_BIN.value)[ConstantEnum.SCORE_BIN.value].agg(['count']).sort_index(
-        ascending=True)
+        ascending=sort_ascending)
     tmp1['样本数比例'] = (tmp1['count'] / tmp1['count'].sum()).round(3)
     tmp2 = df_test.groupby(ConstantEnum.SCORE_BIN.value)[ConstantEnum.SCORE_BIN.value].agg(['count']).sort_index(
-        ascending=True)
+        ascending=sort_ascending)
     tmp2['样本数比例'] = (tmp2['count'] / tmp2['count'].sum()).round(3)
     psi = ((tmp1['样本数比例'] - tmp2['样本数比例']) * np.log(tmp1['样本数比例'] / tmp2['样本数比例'])).round(3)
     psi = psi.reset_index()

+ 54 - 0
ol_test.py

@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/27
+@desc: Demo script for the online-learning trainer (german credit dataset).
+"""
+import time
+
+from entitys import DataSplitEntity
+from online_learning import OnlineLearningTrainer
+
+
+if __name__ == "__main__":
+    time_now = time.time()
+    import scorecardpy as sc
+
+    # Load the sample dataset; "." in column names would break downstream code,
+    # so replace it with "_".
+    dat = sc.germancredit()
+    dat_columns = dat.columns.tolist()
+    dat_columns = [c.replace(".","_") for c in dat_columns]
+    dat.columns = dat_columns
+
+    # Binary target: 1 = bad, 0 = good.
+    dat["creditability"] = dat["creditability"].apply(lambda x: 1 if x == "bad" else 0)
+
+    data = DataSplitEntity(train_data=dat[:709], test_data=dat[709:])
+
+    # Online-learning configuration
+    cfg = {
+        # Model coefficients, bin definitions etc. — see the ol_resources_demo directory.
+        # Coefficient file coef.dict (use "const" as the key for the intercept, if any).
+        # Bin-definition file feature.csv (numeric bins must be listed in ascending order).
+        "path_resources": "/root/notebook/ol_resources_demo",
+        # Project name; determines where outputs are stored.
+        "project_name": "OnlineLearningDemo",
+        "y_column": "creditability",
+        # Learning rate
+        "lr": 0.01,
+        # Batch size per update step
+        "batch_size": 64,
+        # Number of training epochs
+        "epochs": 20,
+        "jupyter_print": True,
+        # Enable the stress test
+        "stress_test": True,
+        # Number of stress-test sampling rounds
+        "stress_sample_times": 10,
+    }
+
+    # Train and generate the report
+    trainer = OnlineLearningTrainer(data=data, **cfg)
+    trainer.train()
+    trainer.report()
+
+    print(time.time() - time_now)

+ 10 - 0
online_learning/__init__.py

@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2025/2/27
+@desc: Public entry point of the online_learning package.
+"""
+
+from .trainer import OnlineLearningTrainer
+
+__all__ = ['OnlineLearningTrainer']

+ 364 - 0
online_learning/trainer.py

@@ -0,0 +1,364 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2025/2/27
+@desc: Online-learning trainer: fine-tunes an existing LR scorecard model
+       on new data and produces a comparison report (original vs. new model).
+"""
+import json
+import math
+import os
+import re
+from os.path import dirname, realpath
+from typing import Dict, List
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import scorecardpy as sc
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from tqdm import tqdm
+
+from commom import GeneralException, f_image_crop_white_borders, f_df_to_image, f_display_title, \
+    f_display_images_by_side
+from entitys import DataSplitEntity, OnlineLearningConfigEntity, MetricFucResultEntity
+from enums import ResultCodesEnum, ConstantEnum, ContextEnum
+from feature import f_woebin_load
+from init import init, context
+from model import f_get_model_score_bin, f_calcu_model_ks, f_stress_test, f_calcu_model_psi
+from monitor import ReportWord
+from .utils import LR
+
+init()
+
+
+class OnlineLearningTrainer:
+    def __init__(self, data: DataSplitEntity = None, ol_config: OnlineLearningConfigEntity = None, *args, **kwargs):
+        """Build the trainer from an explicit config entity, or from raw kwargs."""
+        if ol_config is not None:
+            self._ol_config = ol_config
+        else:
+            self._ol_config = OnlineLearningConfigEntity(*args, **kwargs)
+        self._data = data
+        self._columns = None
+        self._model_original: LR
+        self._model_optimized: LR
+        self.sc_woebin = None
+        # Report template
+        self._template_path = os.path.join(dirname(dirname(realpath(__file__))),
+                                           "./template/OnlineLearning报告模板_lr.docx")
+        self._init(self._ol_config.path_resources)
+
+    def _init(self, path: str):
+        """Load model coefficients and WOE bin definitions from the resources directory."""
+        if not os.path.isdir(path):
+            raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"【{path}】不是文件夹")
+
+        path_coef = os.path.join(path, "coef.dict")
+        if not os.path.isfile(path_coef):
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型系数文件【{path_coef}】不存在")
+        with open(path_coef, mode="r", encoding="utf-8") as f:
+            coef = json.loads(f.read())
+            print(f"coef load from【{path_coef}】success.")
+
+        self._columns = list(coef.keys())
+        # Sort to avoid subtle ordering-dependent bugs.
+        self._columns.sort()
+        weight = [coef[k] for k in self._columns]
+        # Two copies: the frozen original model and the one to be optimized.
+        self._model_original = LR(nn.Parameter(torch.tensor(np.array(weight))))
+        self._model_optimized = LR(nn.Parameter(torch.tensor(np.array(weight))))
+
+        self._columns = [re.sub('_woe$', '', i) for i in self._columns]
+        # Drop the intercept term: it has no WOE encoding.
+        self._columns_intercept_remove = self._columns.copy()
+        if ConstantEnum.INTERCEPT.value in self._columns_intercept_remove:
+            self._columns_intercept_remove.remove(ConstantEnum.INTERCEPT.value)
+        # WOE-encoded columns carry the _woe suffix.
+        self._columns_woe = [f"{i}_woe" for i in self._columns]
+
+        self.sc_woebin = f_woebin_load(path)
+        for k in self._columns_intercept_remove:
+            if k not in self.sc_woebin.keys():
+                # FIX: the exception was constructed but never raised.
+                raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"模型变量【{k}】在woe特征里不存在")
+
+    def _feature_generate(self, data: pd.DataFrame) -> np.ndarray:
+        # WOE-encode the model variables and append the constant-1 intercept column.
+        # FIX: return annotation corrected — this returns an ndarray, not a DataFrame.
+        data_woe = sc.woebin_ply(data[self._columns_intercept_remove], self.sc_woebin, print_info=False)
+        data_woe[f"{ConstantEnum.INTERCEPT.value}_woe"] = [1] * len(data_woe)
+        return data_woe[self._columns_woe].to_numpy()
+
+    def _f_get_best_model(self, df_param: pd.DataFrame) -> LR:
+        # Pick the epoch with the best test KS (tie-break on test AUC).
+        df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
+        print(f"最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
+        weight = list(df_param_sort.iloc[0])
+        # The last 5 entries of a row are metrics (auc/ks/epoch/losses), not weights.
+        weight = nn.Parameter(torch.tensor(np.array(weight[0:-5])))
+        return LR(weight)
+
+    def _f_get_metric_auc_ks(self, model_type: str):
+        """AUC/KS on full, train and test samples for the given model variant."""
+        def _get_auc_ks(data, title):
+            y = data[self._ol_config.y_column]
+            y_prob = self.prob(data, model)
+            perf = sc.perf_eva(y, y_prob, title=f"{title}", show_plot=True)
+            path = self._ol_config.f_get_save_path(f"perf_{title}.png")
+            perf["pic"].savefig(path)
+            auc = perf["AUC"]
+            ks = perf["KS"]
+            f_image_crop_white_borders(path, path)
+            return auc, ks, path
+
+        train_data = self._data.train_data
+        test_data = self._data.test_data
+        data = pd.concat((train_data, test_data))
+
+        model = self._model_optimized
+        if model_type != "新模型":
+            model = self._model_original
+
+        img_path_auc_ks = []
+        auc, ks, path = _get_auc_ks(data, f"{model_type}-建模数据")
+        img_path_auc_ks.append(path)
+        train_auc, train_ks, path = _get_auc_ks(train_data, f"{model_type}-训练集")
+        img_path_auc_ks.append(path)
+        test_auc, test_ks, path = _get_auc_ks(test_data, f"{model_type}-测试集")
+        img_path_auc_ks.append(path)
+
+        df_auc_ks = pd.DataFrame()
+        df_auc_ks["样本集"] = ["建模数据", "训练集", "测试集"]
+        df_auc_ks["AUC"] = [auc, train_auc, test_auc]
+        df_auc_ks["KS"] = [ks, train_ks, test_ks]
+
+        return MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks, image_size=5, table_font_size=10)
+
+    def _f_get_metric_trend(self, ):
+        """Re-bin the modelling sample with the loaded cut points and plot each variable's trend."""
+        train_data = self._data.train_data
+        test_data = self._data.test_data
+        y_column = self._ol_config.y_column
+        data = pd.concat((train_data, test_data))
+
+        # Variable trends on the modelling sample, using the existing bin breaks.
+        breaks_list = {}
+        special_values = {}
+        for column, bin in self.sc_woebin.items():
+            breaks_list[column] = list(bin[bin["is_special_values"] == False]['breaks'])
+            sv = list(bin[bin["is_special_values"] == True]['breaks'])
+            if len(sv) > 0:
+                special_values[column] = sv
+        woebin = sc.woebin(data[self._columns_intercept_remove + [y_column]], y=y_column, breaks_list=breaks_list,
+                           special_values=special_values, print_info=False)
+
+        imgs_path = []
+        for k, df_bin in woebin.items():
+            sc.woebin_plot(df_bin)
+            path = self._ol_config.f_get_save_path(f"trend_{k}.png")
+            plt.savefig(path)
+            imgs_path.append(path)
+        return MetricFucResultEntity(image_path=imgs_path, image_size=4)
+
+    def _f_get_metric_coef(self, ):
+        """Tabulate original vs. optimized coefficients with annotations."""
+        columns_anns = self._ol_config.columns_anns
+        df = pd.DataFrame()
+        df["变量"] = self._columns
+        df["原变量WOE拟合系数"] = [round(i, 4) for i in self._model_original.linear.weight.tolist()]
+        df["新变量WOE拟合系数"] = [round(i, 4) for i in self._model_optimized.linear.weight.tolist()]
+        anns = [columns_anns.get(column, "-") for column in self._columns]
+        df["释义"] = anns
+        img_path_coef = self._ol_config.f_get_save_path(f"coef.png")
+        f_df_to_image(df, img_path_coef)
+        return MetricFucResultEntity(table=df, image_path=img_path_coef)
+
+    def _f_get_metric_gain(self, model_type: str):
+        """Gain (KS by score bin) on the modelling sample for the given model variant."""
+        train_data = self._data.train_data
+        test_data = self._data.test_data
+        y_column = self._ol_config.y_column
+        data = pd.concat((train_data, test_data))
+
+        model = self._model_optimized
+        if model_type != "新模型":
+            model = self._model_original
+
+        score = self.prob(data, model)
+        score_bin, _ = f_get_model_score_bin(data, score)
+        gain = f_calcu_model_ks(score_bin, y_column, sort_ascending=False)
+        img_path_gain = self._ol_config.f_get_save_path(f"{model_type}-gain.png")
+        f_df_to_image(gain, img_path_gain)
+
+        return MetricFucResultEntity(table=gain, image_path=img_path_gain)
+
+    def _f_get_stress_test(self, ):
+        """Run the bad-rate stress test on the optimized model's scores."""
+        stress_sample_times = self._ol_config.stress_sample_times
+        stress_bad_rate_list = self._ol_config.stress_bad_rate_list
+        train_data = self._data.train_data
+        test_data = self._data.test_data
+        y_column = self._ol_config.y_column
+        data = pd.concat((train_data, test_data))
+        score = self.prob(data, self._model_optimized)
+        score_bin, _ = f_get_model_score_bin(data, score)
+        df_stress = f_stress_test(score_bin, sample_times=stress_sample_times, bad_rate_list=stress_bad_rate_list,
+                                  target_column=y_column, score_column=ConstantEnum.SCORE.value, sort_ascending=False)
+
+        img_path_stress = self._ol_config.f_get_save_path(f"stress.png")
+        f_df_to_image(df_stress, img_path_stress)
+        return MetricFucResultEntity(table=df_stress, image_path=img_path_stress)
+
+    def prob(self, x: pd.DataFrame, model=None):
+        """Predict positive-class probabilities; defaults to the optimized model."""
+        if model is None:
+            model = self._model_optimized
+        model.eval()
+        with torch.no_grad():
+            x = torch.tensor(self._feature_generate(x), dtype=torch.float64)
+            y_prob = model(x)
+            y_prob = y_prob.detach().numpy()
+            return y_prob
+
+    def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None) -> pd.DataFrame:
+        """Score-distribution PSI between two samples; x2 reuses x1's score bins."""
+        y1 = self.prob(x1)
+        y2 = self.prob(x2)
+        x1_score_bin, score_bins = f_get_model_score_bin(x1, y1, points)
+        x2_score_bin, _ = f_get_model_score_bin(x2, y2, score_bins)
+        model_psi = f_calcu_model_psi(x1_score_bin, x2_score_bin, sort_ascending=False)
+        print(f"模型psi: {model_psi['psi'].sum()}")
+        return model_psi
+
+    def train(self, ):
+        """Fine-tune the LR weights with mini-batch Adam; keep the epoch with the best test KS."""
+        epochs = self._ol_config.epochs
+        batch_size = self._ol_config.batch_size
+        train_data = self._data.train_data
+        test_data = self._data.test_data
+        train_x = self._feature_generate(train_data)
+        train_y = train_data[self._ol_config.y_column].to_numpy()
+        test_x = torch.tensor(self._feature_generate(test_data), dtype=torch.float64)
+        test_y = test_data[self._ol_config.y_column]
+
+        criterion = nn.BCELoss()
+        optimizer = optim.Adam(self._model_optimized.parameters(), lr=self._ol_config.lr)
+
+        df_param_columns = self._columns + ["auc_test", "ks_test", "epoch", "loss_train", "loss_test"]
+        df_param = pd.DataFrame(columns=df_param_columns)
+
+        for epoch in tqdm(range(epochs)):
+            data_len = len(train_x)
+            loss_train = 0
+            for i in range(math.ceil(data_len / batch_size)):
+                train_x_batch = torch.tensor(train_x[i * batch_size:(i + 1) * batch_size], dtype=torch.float64)
+                train_y_batch = torch.tensor(train_y[i * batch_size:(i + 1) * batch_size], dtype=torch.float64)
+                self._model_optimized.train()
+                optimizer.zero_grad()
+                y_prob = self._model_optimized(train_x_batch)
+                loss = criterion(y_prob, train_y_batch)
+                loss.backward()
+                optimizer.step()
+                # NOTE(review): keeps only the LAST batch's loss per epoch — confirm
+                # this is intended rather than an epoch average.
+                loss_train = loss.detach().item()
+            # Evaluate on the test set after each epoch and record the weights.
+            self._model_optimized.eval()
+            with torch.no_grad():
+                y_prob = self._model_optimized(test_x)
+                loss = criterion(y_prob, torch.tensor(test_y.to_numpy(), dtype=torch.float64))
+                loss_test = loss.detach().item()
+                y_prob = y_prob.detach().numpy()
+                perf = sc.perf_eva(test_y, y_prob, show_plot=False)
+                auc = perf["AUC"]
+                ks = perf["KS"]
+                row = self._model_optimized.linear.weight.tolist() + [auc, ks, epoch + 1, loss_train, loss_test]
+                df_param.loc[len(df_param)] = dict(zip(df_param_columns, row))
+
+        self._model_optimized = self._f_get_best_model(df_param)
+
+        context.set(ContextEnum.PARAM_OPTIMIZED, df_param)
+
+    def save(self):
+        """Persist the config, WOE bins and optimized coefficients to the project directory."""
+
+        self._ol_config.config_save()
+
+        if self.sc_woebin is None:
+            # FIX: the exception was constructed but never raised.
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"feature不存在")
+        df_woebin = pd.concat(self.sc_woebin.values())
+        path = self._ol_config.f_get_save_path(f"feature.csv")
+        df_woebin.to_csv(path)
+        print(f"feature save to【{path}】success. ")
+
+        if self._model_optimized is None:
+            # FIX: the exception was constructed but never raised.
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
+        path = self._ol_config.f_get_save_path("coef.dict")
+        with open(path, mode="w", encoding="utf-8") as f:
+            coef = dict(zip(self._columns, self._model_optimized.linear.weight.tolist()))
+            j = json.dumps(coef, ensure_ascii=False)
+            f.write(j)
+        print(f"model save to【{path}】success. ")
+
+    @staticmethod
+    def load(path: str):
+        """Rebuild a trainer from a previously saved project directory."""
+        ol_config = OnlineLearningConfigEntity.from_config(path)
+        ol_config._path_resources = path
+        return OnlineLearningTrainer(ol_config=ol_config)
+
+    def report(self, ):
+        """Collect all metrics and render the docx report (and Jupyter output if enabled)."""
+
+        metric_value_dict = {}
+        # Sample distribution
+        metric_value_dict["样本分布"] = MetricFucResultEntity(table=self._data.get_distribution(self._ol_config.y_column),
+                                                          table_font_size=10, table_cell_width=3)
+
+        # Model performance: new vs. original
+        metric_value_dict[f"模型结果-新模型"] = self._f_get_metric_auc_ks("新模型")
+        metric_value_dict[f"模型结果-原模型"] = self._f_get_metric_auc_ks("原模型")
+
+        # Variable trends
+        metric_value_dict["变量趋势-建模数据"] = self._f_get_metric_trend()
+
+        # Coefficient comparison
+        metric_value_dict["模型系数"] = self._f_get_metric_coef()
+
+        # Score binning (gain tables)
+        metric_value_dict["分数分箱-建模数据-新模型"] = self._f_get_metric_gain("新模型")
+        metric_value_dict["分数分箱-建模数据-原模型"] = self._f_get_metric_gain("原模型")
+
+        # Stress test (optional)
+        if self._ol_config.stress_test:
+            metric_value_dict["压力测试"] = self._f_get_stress_test()
+
+        if self._ol_config.jupyter_print:
+            self.jupyter_print(metric_value_dict)
+
+        save_path = self._ol_config.f_get_save_path("OnlineLearning报告.docx")
+        ReportWord.generate_report(metric_value_dict, self._template_path, save_path=save_path)
+        print(f"模型报告文件储存路径:{save_path}")
+
+    def jupyter_print(self, metric_value_dict: Dict[str, MetricFucResultEntity]):
+        # FIX: the parameter used "=" (making the typing object the default value)
+        # instead of ":" (a type annotation).
+        from IPython import display
+
+        df_param = context.get(ContextEnum.PARAM_OPTIMIZED)
+
+        f_display_title(display, "样本分布")
+        display.display(metric_value_dict["样本分布"].table)
+
+        f_display_title(display, "模型结果")
+        print(f"原模型")
+        display.display(metric_value_dict["模型结果-原模型"].table)
+        f_display_images_by_side(display, metric_value_dict["模型结果-原模型"].image_path)
+        print(f"新模型")
+        display.display(metric_value_dict["模型结果-新模型"].table)
+        f_display_images_by_side(display, metric_value_dict["模型结果-新模型"].image_path)
+
+        f_display_title(display, "模型系数")
+        display.display(metric_value_dict["模型系数"].table)
+
+        f_display_title(display, "分数分箱")
+        print(f"建模数据上分数分箱")
+        print(f"原模型")
+        display.display(metric_value_dict["分数分箱-建模数据-原模型"].table)
+        print(f"新模型")
+        display.display(metric_value_dict["分数分箱-建模数据-新模型"].table)
+
+        f_display_title(display, "变量趋势")
+        print(f"建模数据上变量趋势")
+        f_display_images_by_side(display, metric_value_dict["变量趋势-建模数据"].image_path)
+
+        if "压力测试" in metric_value_dict.keys():
+            f_display_title(display, "压力测试")
+            display.display(metric_value_dict["压力测试"].table)
+
+        f_display_title(display, "系数优化过程")
+        display.display(df_param)
+
+
+if __name__ == "__main__":
+    pass

+ 22 - 0
online_learning/utils.py

@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2025/2/27
+@desc: Minimal logistic-regression module for online learning.
+"""
+import torch.nn as nn
+
+
+class LR(nn.Module):
+    """Logistic regression: one linear layer (no bias) followed by sigmoid.
+
+    The intercept is expected to be carried inside ``weight`` itself — the
+    caller appends a constant-1 feature column — hence ``bias=False``.
+    """
+
+    def __init__(self, weight: nn.Parameter):
+        # NOTE(review): ``weight`` is assigned as a 1-D parameter of length
+        # n_features, while nn.Linear normally stores (out_features, in_features);
+        # downstream code relies on the 1-D shape (weight.tolist()) — confirm
+        # F.linear's matmul semantics with a 1-D weight are intended here.
+        super(LR, self).__init__()
+        self.linear = nn.Linear(weight.shape[0], 1, bias=False)
+        self.linear.weight = weight
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        # Positive-class probability for each row of x.
+        return self.sigmoid(self.linear(x))
+
+
+if __name__ == "__main__":
+    pass

+ 16 - 1
pipeline/pipeline.py

@@ -4,12 +4,14 @@
 @time: 2024/11/1
 @desc: 模型训练管道
 """
+from typing import List
+
 import pandas as pd
 
 from entitys import DataSplitEntity, MlConfigEntity, DataFeatureEntity
 from feature import FeatureStrategyFactory, FeatureStrategyBase
 from init import init
-from model import ModelBase, ModelFactory, f_add_rules
+from model import ModelBase, ModelFactory, f_add_rules, f_get_model_score_bin, f_calcu_model_psi
 from monitor import ReportWord
 
 init()
@@ -53,6 +55,19 @@ class Pipeline():
     def score_rule(self, data: pd.DataFrame):
         return self._model.score_rule(data)
 
+    def psi(self, x1: pd.DataFrame, x2: pd.DataFrame, points: List[float] = None) -> pd.DataFrame:
+        """Compute the model-score PSI between two samples.
+
+        x1 is binned first (optionally on ``points``); x2 reuses x1's bins.
+        Rule-adjusted scores are used when rules are configured.
+        """
+        if len(self._ml_config.rules) != 0:
+            y1 = self.score_rule(x1)
+            y2 = self.score_rule(x2)
+        else:
+            y1 = self.score(x1)
+            y2 = self.score(x2)
+        x1_score_bin, score_bins = f_get_model_score_bin(x1, y1, points)
+        x2_score_bin, _ = f_get_model_score_bin(x2, y2, score_bins)
+        model_psi = f_calcu_model_psi(x1_score_bin, x2_score_bin)
+        print(f"模型psi: {model_psi['psi'].sum()}")
+        return model_psi
+
     def report(self, ):
         save_path = self._ml_config.f_get_save_path("模型报告.docx")
         ReportWord.generate_report(self.metric_value_dict, self._model.get_report_template_path(), save_path=save_path)

+ 1 - 0
requirements-analysis.txt

@@ -18,3 +18,4 @@ kaleido==0.2.1
 statsmodels==0.12.2
 beautifulsoup4==4.11.1
 openpyxl==3.0.9
+torch==1.1.0

BIN
template/OnlineLearning报告模板_lr.docx


Một số tệp đã không được hiển thị vì có quá nhiều tập tin thay đổi trong diff này