|
@@ -4,6 +4,7 @@
|
|
|
@time: 2023/12/28
|
|
|
@desc: 特征工具类
|
|
|
"""
|
|
|
+import json
|
|
|
import os
|
|
|
from typing import Union
|
|
|
|
|
@@ -11,7 +12,7 @@ import numpy as np
|
|
|
import pandas as pd
|
|
|
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
|
|
|
|
|
|
-from commom import GeneralException
|
|
|
+from commom import GeneralException, f_is_number
|
|
|
from enums import ResultCodesEnum, FileEnum
|
|
|
|
|
|
FORMAT_DICT = {
|
|
@@ -23,7 +24,8 @@ FORMAT_DICT = {
|
|
|
# 次数类2 0 - 20
|
|
|
"bin_cnt2": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 12.0, 15.0, 17.0, 20.0],
|
|
|
# 次数类3 0 - 50
|
|
|
- "bin_cnt3": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0],
|
|
|
+ "bin_cnt3": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
|
|
|
+ 50.0],
|
|
|
# 次数类4 0 - 100
|
|
|
"bin_cnt4": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 80.0, 100.0],
|
|
|
|
|
@@ -152,3 +154,91 @@ def f_woebin_load(path: str):
|
|
|
sc_woebin[variable] = df_woebin[df_woebin["variable"] == variable]
|
|
|
print(f"feature load from【{path}】success.")
|
|
|
return sc_woebin
|
|
|
+
|
|
|
+
|
|
|
def f_get_var_mapping(df_bins, df_card, model_name="", model_desc="",
                      columns_anns: Union[dict, None] = None) -> pd.DataFrame:
    """Build the variable/bin mapping table for a scorecard model.

    One output row is produced per row of ``df_bins`` (in iteration order),
    followed by a final ``INTERCEPT`` row.

    Args:
        df_bins: Binning table. The columns ``variable``, ``bin``,
            ``total_iv`` and ``count_distr`` are read from every row.
        df_card: Scorecard table. The ``points`` value of the first row whose
            ``variable`` and ``bin`` equal those of the current bin row is
            written to ``VAR_WOE``.
        model_name: Value copied into ``MODEL_NAME`` of every row.
        model_desc: Value copied into ``MODEL_DESC`` of every row.
        columns_anns: Optional mapping ``variable -> description`` used for
            ``VAR_DESC``; variables missing from it get an empty string.

    Returns:
        A DataFrame with the columns MODEL_NAME, MODEL_DESC, VERSION,
        VAR_NAME, VAR_DESC, BINNING_ID, IS_NUM, LEFT_OP, LEFT_VALUE, RIGHT_OP,
        RIGHT_VALUE, VAR_WOE, VAR_WEIGHT, VAR_IV and BINNING_PARTION.

    Raises:
        IndexError: If ``df_card`` has no row matching a (variable, bin) pair
            found in ``df_bins``.
    """
    # Avoid a shared mutable default; None means "no annotations supplied".
    if columns_anns is None:
        columns_anns = {}

    def _get_bin_opt(bin_label):
        """Translate one bin label into comparison operators/values.

        Returns a tuple ``(ops, is_num)`` where ``ops`` holds LEFT_OP,
        LEFT_VALUE, RIGHT_OP and RIGHT_VALUE and ``is_num`` is 1 or 0.
        """
        bin_label = str(bin_label)
        rst = {
            "LEFT_OP": "",
            "LEFT_VALUE": "",
            "RIGHT_OP": "",
            "RIGHT_VALUE": "",
        }

        # Numeric interval, e.g. "[-inf,30.0)" or "(30.0,60.0]".
        if "," in bin_label and any(ch in bin_label for ch in "[]()"):
            parts = bin_label.split(",")
            left, right = parts[0], parts[1]
            # An infinite bound is left empty: no constraint on that side.
            if "-inf" not in left:
                rst["LEFT_VALUE"] = left[1:]  # drop the opening bracket
                rst["LEFT_OP"] = ">=" if "[" in left else ">"
            if "inf" not in right:
                rst["RIGHT_VALUE"] = right[:-1]  # drop the closing bracket
                # A closed right bound belongs to RIGHT_OP (was mistakenly
                # written to LEFT_OP before).
                rst["RIGHT_OP"] = "<=" if "]" in right else "<"
            return rst, 1

        # Categorical bin: members are joined with "%,%".
        members = bin_label.split("%,%")
        if len(members) == 1:
            rst["LEFT_VALUE"] = members[0]
            return rst, 1 if f_is_number(members[0]) else 0

        # Several members are stored as a JSON array string.
        rst["LEFT_VALUE"] = json.dumps(members, ensure_ascii=False)
        return rst, 0

    rows = []
    # Next BINNING_ID to hand out per variable (ids start at 1).
    binning_id_dict = {}
    for _, row_bin in df_bins.iterrows():
        variable = row_bin["variable"]
        bin_label = row_bin["bin"]
        binning_id = binning_id_dict.get(variable, 1)
        bin_opt, is_num = _get_bin_opt(bin_label)

        card_mask = (df_card["variable"] == variable) & (df_card["bin"] == bin_label)
        points = df_card[card_mask]["points"].values[0]

        var_info = {
            "MODEL_NAME": model_name,
            "MODEL_DESC": model_desc,
            "VERSION": 1,
            "VAR_NAME": variable,
            "VAR_DESC": columns_anns.get(variable, ""),
            "BINNING_ID": binning_id,
            "IS_NUM": is_num,
            "VAR_WOE": points,
            "VAR_WEIGHT": 1,
            "VAR_IV": round(row_bin["total_iv"], 3),
            "BINNING_PARTION": round(row_bin["count_distr"], 3),
        }
        var_info.update(bin_opt)
        rows.append(var_info)
        binning_id_dict[variable] = binning_id + 1

    # Trailing intercept row with empty bin fields.
    rows.append({
        "MODEL_NAME": model_name,
        "MODEL_DESC": model_desc,
        "VERSION": 1,
        "VAR_NAME": "INTERCEPT",
        "VAR_DESC": "截距",
        "BINNING_ID": 0,
        "IS_NUM": 1,
        "LEFT_OP": "",
        "LEFT_VALUE": "",
        "RIGHT_OP": "",
        "RIGHT_VALUE": "",
        "VAR_WOE": "",
        "VAR_WEIGHT": 0,
        "VAR_IV": "",
        "BINNING_PARTION": "",
    })

    df_var_mapping = pd.DataFrame(
        columns=["MODEL_NAME", "MODEL_DESC", "VERSION", "VAR_NAME", "VAR_DESC", "BINNING_ID", "IS_NUM",
                 "LEFT_OP", "LEFT_VALUE", "RIGHT_OP", "RIGHT_VALUE", "VAR_WOE", "VAR_WEIGHT", "VAR_IV",
                 "BINNING_PARTION"],
        data=rows,
    )

    return df_var_mapping
|