Explorar el Código

add: f_get_var_mapping

yq hace 6 horas
padre
commit
e8bf9d8622
Se han modificado 7 ficheros con 114 adiciones y 11 borrados
  1. 3 1
      __init__.py
  2. 1 0
      enums/file_enum.py
  3. 2 2
      feature/__init__.py
  4. 92 2
      feature/woe/utils.py
  5. 8 0
      model/model_lr.py
  6. 1 0
      online_learning/trainer_lr.py
  7. 7 6
      train_test_lr.py

+ 3 - 1
__init__.py

@@ -10,6 +10,7 @@ from os.path import dirname, realpath
 
 sys.path.append(dirname(realpath(__file__)))
 
+from feature import f_get_var_mapping
 from online_learning import OnlineLearningTrainerLr, OnlineLearningTrainerXgb
 from pipeline import Pipeline
 from data import DataLoaderMysql
@@ -18,4 +19,5 @@ from monitor import MonitorMetric
 from metrics import MetricBase
 
 __all__ = ['MonitorMetric', 'MetricBase', 'DataLoaderMysql', 'DbConfigEntity',
-           'DataSplitEntity', 'Pipeline', 'OnlineLearningTrainerLr', 'OnlineLearningTrainerXgb']
+           'DataSplitEntity', 'Pipeline', 'OnlineLearningTrainerLr', 'OnlineLearningTrainerXgb',
+           'f_get_var_mapping']

+ 1 - 0
enums/file_enum.py

@@ -12,6 +12,7 @@ class FileEnum(Enum):
     OLCFG = "olcfg.json"
     FEATURE = "feature.csv"
     FEATURE_PKL = "feature.pkl"
+    VAR_MAPPING = "var_mapping.csv"
     CARD = "card.csv"
     CARD_CFG = "card.cfg"
     COEF = "coef.json"

+ 2 - 2
feature/__init__.py

@@ -6,6 +6,6 @@
 """
 from .feature_strategy_base import FeatureStrategyBase
 from .feature_strategy_factory import FeatureStrategyFactory
-from .woe.utils import f_woebin_load
+from .woe.utils import f_woebin_load, f_get_var_mapping
 
-__all__ = ['FeatureStrategyFactory', 'FeatureStrategyBase', 'f_woebin_load']
+__all__ = ['FeatureStrategyFactory', 'FeatureStrategyBase', 'f_woebin_load', 'f_get_var_mapping']

+ 92 - 2
feature/woe/utils.py

@@ -4,6 +4,7 @@
 @time: 2023/12/28
 @desc:  特征工具类
 """
+import json
 import os
 from typing import Union
 
@@ -11,7 +12,7 @@ import numpy as np
 import pandas as pd
 from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
 
-from commom import GeneralException
+from commom import GeneralException, f_is_number
 from enums import ResultCodesEnum, FileEnum
 
 FORMAT_DICT = {
@@ -23,7 +24,8 @@ FORMAT_DICT = {
     # 次数类2 0 - 20
     "bin_cnt2": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 12.0, 15.0, 17.0, 20.0],
     # 次数类3 0 - 50
-    "bin_cnt3": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0],
+    "bin_cnt3": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
+                 50.0],
     # 次数类4 0 - 100
     "bin_cnt4": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 80.0, 100.0],
 
@@ -152,3 +154,91 @@ def f_woebin_load(path: str):
         sc_woebin[variable] = df_woebin[df_woebin["variable"] == variable]
     print(f"feature load from【{path}】success.")
     return sc_woebin
+
+
def f_get_var_mapping(df_bins, df_card, model_name="", model_desc="", columns_anns=None) -> pd.DataFrame:
    """Build a variable-mapping table from WOE binning results and a scorecard.

    Each row of the returned DataFrame describes one bin of one variable
    (its interval operators/values, score points, IV, population share),
    plus one final synthetic row for the model intercept.

    :param df_bins: binning result; must contain columns
        ``variable``, ``bin``, ``total_iv``, ``count_distr``.
    :param df_card: scorecard; must contain columns ``variable``, ``bin``, ``points``.
    :param model_name: value written into the MODEL_NAME column.
    :param model_desc: value written into the MODEL_DESC column.
    :param columns_anns: optional ``{variable: annotation}`` mapping used to
        fill VAR_DESC (default: no annotations). ``None`` avoids the
        mutable-default-argument pitfall.
    :return: DataFrame with the fixed column layout below.
    """
    if columns_anns is None:
        columns_anns = {}

    def _get_bin_opt(bin_label: str):
        """Parse one bin label into interval operators/values.

        Returns ``(rst, is_num)`` where ``rst`` holds LEFT_OP/LEFT_VALUE/
        RIGHT_OP/RIGHT_VALUE and ``is_num`` is 1 for numeric bins.
        """
        is_num = 0
        bin_label = str(bin_label)
        rst = {
            "LEFT_OP": "",
            "LEFT_VALUE": "",
            "RIGHT_OP": "",
            "RIGHT_VALUE": "",
        }
        # 数值型 -> numeric interval such as "[-inf,30.0)" or "(20.0,30.0]"
        if "," in bin_label and ("[" in bin_label or "]" in bin_label or "(" in bin_label or ")" in bin_label):
            is_num = 1
            left = bin_label.split(",")[0]
            if "-inf" not in left:
                rst["LEFT_VALUE"] = left[1:]
                rst["LEFT_OP"] = ">"
                if "[" in left:
                    rst["LEFT_OP"] = ">="

            right = bin_label.split(",")[1]
            if "inf" not in right:
                rst["RIGHT_VALUE"] = right[:-1]
                rst["RIGHT_OP"] = "<"
                # bugfix: a closed right edge must set RIGHT_OP, not LEFT_OP
                if "]" in right:
                    rst["RIGHT_OP"] = "<="
        else:
            # 字符型 -> categorical bin; multiple values are joined by "%,%"
            e = bin_label.split("%,%")
            if len(e) == 1:
                rst["LEFT_VALUE"] = e[0]
                if f_is_number(e[0]):
                    is_num = 1
            else:
                # keep the full category list as a JSON array (non-ASCII preserved)
                rst["LEFT_VALUE"] = json.dumps(e, ensure_ascii=False)

        return rst, is_num

    rows = []
    # per-variable running counter so bins of the same variable get ids 1, 2, ...
    binning_id_dict = {}
    for _, row_bin in df_bins.iterrows():
        variable = row_bin["variable"]
        binning_id = binning_id_dict.get(variable, 1)
        bin_opt, is_num = _get_bin_opt(row_bin["bin"])
        var_info = {
            "MODEL_NAME": model_name,
            "MODEL_DESC": model_desc,
            "VERSION": 1,
            "VAR_NAME": variable,
            "VAR_DESC": columns_anns.get(variable, ""),
            "BINNING_ID": binning_id,
            "IS_NUM": is_num,
            # score points for this (variable, bin) pair from the card
            "VAR_WOE": df_card[(df_card["variable"] == variable) & (df_card["bin"] == row_bin["bin"])][
                'points'].values[0],
            "VAR_WEIGHT": 1,
            "VAR_IV": round(row_bin["total_iv"], 3),
            "BINNING_PARTION": round(row_bin["count_distr"], 3),
        }
        var_info.update(bin_opt)
        rows.append(var_info)
        binning_id_dict[variable] = binning_id + 1
    # synthetic trailing row for the model intercept (截距), BINNING_ID 0
    rows.append({
        "MODEL_NAME": model_name,
        "MODEL_DESC": model_desc,
        "VERSION": 1,
        "VAR_NAME": "INTERCEPT",
        "VAR_DESC": "截距",
        "BINNING_ID": 0,
        "IS_NUM": 1,
        "LEFT_OP": "",
        "LEFT_VALUE": "",
        "RIGHT_OP": "",
        "RIGHT_VALUE": "",
        "VAR_WOE": "",
        "VAR_WEIGHT": 0,
        "VAR_IV": "",
        "BINNING_PARTION": "",
    })
    df_var_mapping = pd.DataFrame(
        columns=["MODEL_NAME", "MODEL_DESC", "VERSION", "VAR_NAME", "VAR_DESC", "BINNING_ID", "IS_NUM",
                 "LEFT_OP", "LEFT_VALUE", "RIGHT_OP", "RIGHT_VALUE", "VAR_WOE", "VAR_WEIGHT", "VAR_IV",
                 "BINNING_PARTION"],
        data=rows
    )

    return df_var_mapping

+ 8 - 0
model/model_lr.py

@@ -19,6 +19,7 @@ from commom import f_df_to_image, f_display_images_by_side, GeneralException, f_
     f_image_crop_white_borders
 from entitys import MetricFucResultEntity, DataSplitEntity, DataFeatureEntity
 from enums import ContextEnum, ResultCodesEnum, ConstantEnum, FileEnum
+from feature import f_get_var_mapping
 from init import context
 from .model_base import ModelBase
 from .model_utils import f_stress_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi, f_add_rules
@@ -98,6 +99,13 @@ class ModelLr(ModelBase):
             f.write(j)
         print(f"model save to【{path}】success. ")
 
+        woebin = context.get(ContextEnum.WOEBIN)
+        df_woebin = pd.concat(woebin.values())
+        df_var_mapping = f_get_var_mapping(df_woebin, df_card, columns_anns=self.ml_config.columns_anns)
+        path = self.ml_config.f_get_save_path(FileEnum.VAR_MAPPING.value)
+        df_var_mapping.to_csv(path, encoding="utf-8")
+        print(f"model save to【{path}】success. ")
+
     def model_load(self, path: str, *args, **kwargs):
         if not os.path.isdir(path):
             raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")

+ 1 - 0
online_learning/trainer_lr.py

@@ -73,6 +73,7 @@ class OnlineLearningTrainerLr:
         # 排个序,防止因为顺序原因导致的可能的bug
         self._columns.sort()
         weight = [coef[k] for k in self._columns]
+        # TODO 常数项(截距)不应该被训练
         self._model_original = LR(nn.Parameter(torch.tensor(np.array(weight))))
         self._model_optimized = LR(nn.Parameter(torch.tensor(np.array(weight))))
 

+ 7 - 6
train_test_lr.py

@@ -46,7 +46,7 @@ if __name__ == "__main__":
         "psi_threshold": 0.001,
         "vif_threshold": 1.06,
         # 压力测试
-        "stress_test": True,
+        "stress_test": False,
         "stress_sample_times": 10,
         # 特殊值
         "special_values": {"age_in_years": [36]},
@@ -65,11 +65,11 @@ if __name__ == "__main__":
             "duration_in_month",
             "credit_amount",
             "age_in_years",
-            "purpose",
-            "credit_history",
-
-            "credit_amount_corr1",
-            "credit_amount_corr2",
+            # "purpose",
+            # "credit_history",
+            #
+            # "credit_amount_corr1",
+            # "credit_amount_corr2",
         ],
         "columns_anns": {
             "age_in_years": "年龄",
@@ -83,5 +83,6 @@ if __name__ == "__main__":
     train_pipeline = Pipeline(data=data, **cfg)
     train_pipeline.train()
     train_pipeline.report()
+    train_pipeline.save()
 
     print(time.time() - time_now)