Explorar el Código

add: f_get_var_mapping

yq hace 6 horas
padre
commit
e8bf9d8622
Se han modificado 7 ficheros con 114 adiciones y 11 borrados
  1. 3 1
      __init__.py
  2. 1 0
      enums/file_enum.py
  3. 2 2
      feature/__init__.py
  4. 92 2
      feature/woe/utils.py
  5. 8 0
      model/model_lr.py
  6. 1 0
      online_learning/trainer_lr.py
  7. 7 6
      train_test_lr.py

+ 3 - 1
__init__.py

@@ -10,6 +10,7 @@ from os.path import dirname, realpath
 
 sys.path.append(dirname(realpath(__file__)))
 
+from feature import f_get_var_mapping
 from online_learning import OnlineLearningTrainerLr, OnlineLearningTrainerXgb
 from pipeline import Pipeline
 from data import DataLoaderMysql
@@ -18,4 +19,5 @@ from monitor import MonitorMetric
 from metrics import MetricBase
 
 __all__ = ['MonitorMetric', 'MetricBase', 'DataLoaderMysql', 'DbConfigEntity',
-           'DataSplitEntity', 'Pipeline', 'OnlineLearningTrainerLr', 'OnlineLearningTrainerXgb']
+           'DataSplitEntity', 'Pipeline', 'OnlineLearningTrainerLr', 'OnlineLearningTrainerXgb',
+           'f_get_var_mapping']

+ 1 - 0
enums/file_enum.py

@@ -12,6 +12,7 @@ class FileEnum(Enum):
     OLCFG = "olcfg.json"
     FEATURE = "feature.csv"
     FEATURE_PKL = "feature.pkl"
+    VAR_MAPPING = "var_mapping.csv"
     CARD = "card.csv"
     CARD_CFG = "card.cfg"
     COEF = "coef.json"

+ 2 - 2
feature/__init__.py

@@ -6,6 +6,6 @@
 """
 from .feature_strategy_base import FeatureStrategyBase
 from .feature_strategy_factory import FeatureStrategyFactory
-from .woe.utils import f_woebin_load
+from .woe.utils import f_woebin_load, f_get_var_mapping
 
-__all__ = ['FeatureStrategyFactory', 'FeatureStrategyBase', 'f_woebin_load']
+__all__ = ['FeatureStrategyFactory', 'FeatureStrategyBase', 'f_woebin_load', 'f_get_var_mapping']

+ 92 - 2
feature/woe/utils.py

@@ -4,6 +4,7 @@
 @time: 2023/12/28
 @desc:  特征工具类
 """
+import json
 import os
 from typing import Union
 
@@ -11,7 +12,7 @@ import numpy as np
 import pandas as pd
 from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
 
-from commom import GeneralException
+from commom import GeneralException, f_is_number
 from enums import ResultCodesEnum, FileEnum
 
 FORMAT_DICT = {
@@ -23,7 +24,8 @@ FORMAT_DICT = {
     # 次数类2 0 - 20
     "bin_cnt2": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 12.0, 15.0, 17.0, 20.0],
     # 次数类3 0 - 50
-    "bin_cnt3": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0],
+    "bin_cnt3": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
+                 50.0],
     # 次数类4 0 - 100
     "bin_cnt4": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 80.0, 100.0],
 
@@ -152,3 +154,91 @@ def f_woebin_load(path: str):
         sc_woebin[variable] = df_woebin[df_woebin["variable"] == variable]
     print(f"feature load from【{path}】success.")
     return sc_woebin
+
+
def f_get_var_mapping(df_bins, df_card, model_name="", model_desc="", columns_anns=None) -> pd.DataFrame:
    """Build a variable-mapping table from WOE binning results and a scorecard.

    Each row of the returned DataFrame describes one bin of one variable
    (its interval operators/values, score points, IV, population share),
    plus one final synthetic row for the model intercept.

    :param df_bins: binning result; must contain columns
        ``variable``, ``bin``, ``total_iv``, ``count_distr``.
    :param df_card: scorecard; must contain columns ``variable``, ``bin``, ``points``.
    :param model_name: value written into the MODEL_NAME column.
    :param model_desc: value written into the MODEL_DESC column.
    :param columns_anns: optional ``{variable: annotation}`` mapping used to
        fill VAR_DESC (default: no annotations). ``None`` avoids the
        mutable-default-argument pitfall.
    :return: DataFrame with the fixed column layout below.
    """
    if columns_anns is None:
        columns_anns = {}

    def _get_bin_opt(bin_label: str):
        """Parse one bin label into interval operators/values.

        Returns ``(rst, is_num)`` where ``rst`` holds LEFT_OP/LEFT_VALUE/
        RIGHT_OP/RIGHT_VALUE and ``is_num`` is 1 for numeric bins.
        """
        is_num = 0
        bin_label = str(bin_label)
        rst = {
            "LEFT_OP": "",
            "LEFT_VALUE": "",
            "RIGHT_OP": "",
            "RIGHT_VALUE": "",
        }
        # 数值型 -> numeric interval such as "[-inf,30.0)" or "(20.0,30.0]"
        if "," in bin_label and ("[" in bin_label or "]" in bin_label or "(" in bin_label or ")" in bin_label):
            is_num = 1
            left = bin_label.split(",")[0]
            if "-inf" not in left:
                rst["LEFT_VALUE"] = left[1:]
                rst["LEFT_OP"] = ">"
                if "[" in left:
                    rst["LEFT_OP"] = ">="

            right = bin_label.split(",")[1]
            if "inf" not in right:
                rst["RIGHT_VALUE"] = right[:-1]
                rst["RIGHT_OP"] = "<"
                # bugfix: a closed right edge must set RIGHT_OP, not LEFT_OP
                if "]" in right:
                    rst["RIGHT_OP"] = "<="
        else:
            # 字符型 -> categorical bin; multiple values are joined by "%,%"
            e = bin_label.split("%,%")
            if len(e) == 1:
                rst["LEFT_VALUE"] = e[0]
                if f_is_number(e[0]):
                    is_num = 1
            else:
                # keep the full category list as a JSON array (non-ASCII preserved)
                rst["LEFT_VALUE"] = json.dumps(e, ensure_ascii=False)

        return rst, is_num

    rows = []
    # per-variable running counter so bins of the same variable get ids 1, 2, ...
    binning_id_dict = {}
    for _, row_bin in df_bins.iterrows():
        variable = row_bin["variable"]
        binning_id = binning_id_dict.get(variable, 1)
        bin_opt, is_num = _get_bin_opt(row_bin["bin"])
        var_info = {
            "MODEL_NAME": model_name,
            "MODEL_DESC": model_desc,
            "VERSION": 1,
            "VAR_NAME": variable,
            "VAR_DESC": columns_anns.get(variable, ""),
            "BINNING_ID": binning_id,
            "IS_NUM": is_num,
            # score points for this (variable, bin) pair from the card
            "VAR_WOE": df_card[(df_card["variable"] == variable) & (df_card["bin"] == row_bin["bin"])][
                'points'].values[0],
            "VAR_WEIGHT": 1,
            "VAR_IV": round(row_bin["total_iv"], 3),
            "BINNING_PARTION": round(row_bin["count_distr"], 3),
        }
        var_info.update(bin_opt)
        rows.append(var_info)
        binning_id_dict[variable] = binning_id + 1
    # synthetic trailing row for the model intercept (截距), BINNING_ID 0
    rows.append({
        "MODEL_NAME": model_name,
        "MODEL_DESC": model_desc,
        "VERSION": 1,
        "VAR_NAME": "INTERCEPT",
        "VAR_DESC": "截距",
        "BINNING_ID": 0,
        "IS_NUM": 1,
        "LEFT_OP": "",
        "LEFT_VALUE": "",
        "RIGHT_OP": "",
        "RIGHT_VALUE": "",
        "VAR_WOE": "",
        "VAR_WEIGHT": 0,
        "VAR_IV": "",
        "BINNING_PARTION": "",
    })
    df_var_mapping = pd.DataFrame(
        columns=["MODEL_NAME", "MODEL_DESC", "VERSION", "VAR_NAME", "VAR_DESC", "BINNING_ID", "IS_NUM",
                 "LEFT_OP", "LEFT_VALUE", "RIGHT_OP", "RIGHT_VALUE", "VAR_WOE", "VAR_WEIGHT", "VAR_IV",
                 "BINNING_PARTION"],
        data=rows
    )

    return df_var_mapping

+ 8 - 0
model/model_lr.py

@@ -19,6 +19,7 @@ from commom import f_df_to_image, f_display_images_by_side, GeneralException, f_
     f_image_crop_white_borders
 from entitys import MetricFucResultEntity, DataSplitEntity, DataFeatureEntity
 from enums import ContextEnum, ResultCodesEnum, ConstantEnum, FileEnum
+from feature import f_get_var_mapping
 from init import context
 from .model_base import ModelBase
 from .model_utils import f_stress_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi, f_add_rules
@@ -98,6 +99,13 @@ class ModelLr(ModelBase):
             f.write(j)
         print(f"model save to【{path}】success. ")
 
+        woebin = context.get(ContextEnum.WOEBIN)
+        df_woebin = pd.concat(woebin.values())
+        df_var_mapping = f_get_var_mapping(df_woebin, df_card, columns_anns=self.ml_config.columns_anns)
+        path = self.ml_config.f_get_save_path(FileEnum.VAR_MAPPING.value)
+        df_var_mapping.to_csv(path, encoding="utf-8")
+        print(f"model save to【{path}】success. ")
+
     def model_load(self, path: str, *args, **kwargs):
         if not os.path.isdir(path):
             raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")

+ 1 - 0
online_learning/trainer_lr.py

@@ -73,6 +73,7 @@ class OnlineLearningTrainerLr:
         # 排个序,防止因为顺序原因导致的可能的bug
         self._columns.sort()
         weight = [coef[k] for k in self._columns]
+        # TODO 常数项(截距)不应该被训练
         self._model_original = LR(nn.Parameter(torch.tensor(np.array(weight))))
         self._model_optimized = LR(nn.Parameter(torch.tensor(np.array(weight))))
 

+ 7 - 6
train_test_lr.py

@@ -46,7 +46,7 @@ if __name__ == "__main__":
         "psi_threshold": 0.001,
         "vif_threshold": 1.06,
         # 压力测试
-        "stress_test": True,
+        "stress_test": False,
         "stress_sample_times": 10,
         # 特殊值
         "special_values": {"age_in_years": [36]},
@@ -65,11 +65,11 @@ if __name__ == "__main__":
             "duration_in_month",
             "credit_amount",
             "age_in_years",
-            "purpose",
-            "credit_history",
-
-            "credit_amount_corr1",
-            "credit_amount_corr2",
+            # "purpose",
+            # "credit_history",
+            #
+            # "credit_amount_corr1",
+            # "credit_amount_corr2",
         ],
         "columns_anns": {
             "age_in_years": "年龄",
@@ -83,5 +83,6 @@ if __name__ == "__main__":
     train_pipeline = Pipeline(data=data, **cfg)
     train_pipeline.train()
     train_pipeline.report()
+    train_pipeline.save()
 
     print(time.time() - time_now)