yq 18 hours ago
parent
commit
8e7a1ebb37

+ 3 - 0
config/base_config.py

@@ -19,5 +19,8 @@ class BaseConfig:
     # runtime environment; currently affects where context is stored
     run_env = "jupyter"
 
+    # java_home, used by sklearn2pmml when exporting PMML
+    java_home = "/usr/local/jdk1.8"
+
     # merge table columns that share the same column name
     merge_table_column = True

+ 3 - 3
entitys/data_feaure_entity.py

@@ -49,15 +49,15 @@ class DataSplitEntity():
         self._data = pd.concat((train_data, test_data))
 
     @property
-    def data(self):
+    def data(self) -> pd.DataFrame:
         return self._data
 
     @property
-    def train_data(self):
+    def train_data(self) -> pd.DataFrame:
         return self._train_data
 
     @property
-    def test_data(self):
+    def test_data(self) -> pd.DataFrame:
         return self._test_data
 
     def get_distribution(self, y_column) -> pd.DataFrame:

+ 24 - 16
entitys/ml_config_entity.py

@@ -41,15 +41,16 @@ class MlConfigEntity():
                  stress_bad_rate_list: List[float] = [],
                  model_type="lr",
                  feature_strategy="woe",
+                 params_xgb={},
                  rules=[],
-                 fill_method: str = None,
-                 fill_value=None,
                  *args, **kwargs):
 
         self._model_type = model_type
 
         self._feature_strategy = feature_strategy
 
+        self._params_xgb = params_xgb
+
         self._psi_threshold = psi_threshold
 
         self._vif_threshold = vif_threshold
@@ -96,12 +97,6 @@ class MlConfigEntity():
         # candidate x variables
         self._x_columns = x_columns
 
-        # missing-value fill method
-        self._fill_method = fill_method
-
-        # missing-value fill value
-        self._fill_value = fill_value
-
         # threshold used when filtering variables by IV
         self._iv_threshold = iv_threshold
 
@@ -145,6 +140,27 @@ class MlConfigEntity():
     def feature_strategy(self):
         return self._feature_strategy
 
+    @property
+    def params_xgb(self):
+        params = {
+            'objective': 'binary:logistic',
+            'eval_metric': 'auc',
+            'learning_rate': 0.1,
+            'max_depth': 3,
+            'subsample': None,
+            'colsample_bytree': None,
+            'alpha': None,
+            'num_boost_round': 500,
+            'early_stopping_rounds': 20,
+            'verbose_eval': 10,
+            'random_state': 2025,
+            'save_pmml': True,
+            'trees_print': False,
+        }
+        params.update(self._params_xgb)
+
+        return params
+
     @property
     def psi_threshold(self):
         return self._psi_threshold
@@ -230,14 +246,6 @@ class MlConfigEntity():
     def columns_anns(self):
         return self._columns_anns
 
-    @property
-    def fill_value(self):
-        return self._fill_value
-
-    @property
-    def fill_method(self):
-        return self._fill_method
-
     @property
     def iv_threshold(self):
         return self._iv_threshold
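
The params_xgb property above merges user-supplied overrides into the built-in defaults. A minimal usage sketch (assuming the remaining constructor arguments keep their defaults; values are illustrative):

    # override two defaults; everything else falls back to the built-in dict
    cfg = MlConfigEntity(params_xgb={"max_depth": 4, "learning_rate": 0.05})
    print(cfg.params_xgb["max_depth"])        # 4   (override wins)
    print(cfg.params_xgb["num_boost_round"])  # 500 (default retained)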

+ 3 - 0
enums/feature_strategy_enum.py

@@ -8,4 +8,7 @@ from enum import Enum
 
 
 class FeatureStrategyEnum(Enum):
+    # WOE encoding
     WOE = "woe"
+    # coarse-bin normalization encoding
+    NORM = "norm"

+ 1 - 0
enums/file_enum.py

@@ -15,6 +15,7 @@ class FileEnum(Enum):
     CARD_CFG = "card.cfg"
     COEF = "coef.json"
     MODEL = "model.pkl"
+    PMML = "model.pmml"
 
 
 

+ 1 - 0
enums/model_enum.py

@@ -9,3 +9,4 @@ from enum import Enum
 
 class ModelEnum(Enum):
     LR = "lr"
+    XGB = "xgb"

+ 9 - 0
feature/bin/__init__.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2025/4/3
+@desc: 
+"""
+
+if __name__ == "__main__":
+    pass

+ 179 - 0
feature/bin/strategy_norm.py

@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2025/4/3
+@desc: value normalization, similar to binning
+"""
+
+from typing import Dict, List
+
+import pandas as pd
+import xgboost as xgb
+from pandas.core.dtypes.common import is_numeric_dtype
+
+from commom import GeneralException, f_display_title
+from data import DataExplore
+from entitys import DataSplitEntity, MetricFucResultEntity
+from enums import ResultCodesEnum, ContextEnum
+from feature.feature_strategy_base import FeatureStrategyBase
+from init import context
+from .utils import f_format_value, OneHot, f_format_bin
+
+
+class StrategyNorm(FeatureStrategyBase):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.x_columns = None
+        self.one_hot_encoder_dict: Dict[str, OneHot] = {}
+        self.points_dict: Dict[str, List[float]] = {}
+
+    def _f_fast_filter(self, data: DataSplitEntity) -> List[str]:
+        y_column = self.ml_config.y_column
+        x_columns = self.ml_config.x_columns
+        columns_exclude = self.ml_config.columns_exclude
+        format_bin = self.ml_config.format_bin
+        params_xgb = self.ml_config.params_xgb
+        max_feature_num = self.ml_config.max_feature_num
+
+        train_data = data.train_data.copy()
+        test_data = data.test_data.copy()
+
+        # feature column configuration
+        if len(x_columns) == 0:
+            x_columns = train_data.columns.tolist()
+        if y_column in x_columns:
+            x_columns.remove(y_column)
+        for column in columns_exclude:
+            if column in x_columns:
+                x_columns.remove(column)
+
+        # basic check that each variable's data type is consistent
+        check_msg = DataExplore.check_type(data.data[x_columns])
+        if check_msg != "":
+            print(f"数据类型分析:\n{check_msg}\n同一变量请保持数据类型一致")
+            raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"数据类型错误.")
+
+        # data preprocessing
+        model_columns = []
+        num_columns = []
+        str_columns = []
+        for x_column in x_columns:
+            if is_numeric_dtype(train_data[x_column]):
+                num_columns.append(x_column)
+                # coarse binning
+                if format_bin:
+                    data_x_describe = train_data[x_column].describe(percentiles=[0.1, 0.9])
+                    points = f_format_bin(data_x_describe)
+                    # f_format_bin returns None when no preset grid covers the 10%-90% range
+                    if points is not None:
+                        self.points_dict[x_column] = points
+                        train_data[x_column] = train_data[x_column].apply(lambda x: f_format_value(points, x))
+                        test_data[x_column] = test_data[x_column].apply(lambda x: f_format_value(points, x))
+            else:
+                str_columns.append(x_column)
+                one_hot_encoder = OneHot(data.data, x_column)
+                one_hot_encoder.encoder(train_data)
+                one_hot_encoder.encoder(test_data)
+                model_columns.extend(one_hot_encoder.columns_onehot)
+                self.one_hot_encoder_dict[x_column] = one_hot_encoder
+
+        model_columns.extend(num_columns)
+
+        # drop weak variables by feature importance
+        model = xgb.XGBClassifier(objective=params_xgb.get("objective"),
+                                  n_estimators=params_xgb.get("num_boost_round"),
+                                  max_depth=params_xgb.get("max_depth"),
+                                  learning_rate=params_xgb.get("learning_rate"),
+                                  random_state=params_xgb.get("random_state"),
+                                  reg_alpha=params_xgb.get("alpha"),
+                                  subsample=params_xgb.get("subsample"),
+                                  colsample_bytree=params_xgb.get("colsample_bytree"),
+                                  importance_type='weight'
+                                  )
+
+        model.fit(X=train_data[model_columns], y=train_data[y_column],
+                  eval_set=[(train_data[model_columns], train_data[y_column]),
+                            (test_data[model_columns], test_data[y_column])],
+                  eval_metric=params_xgb.get("eval_metric"),
+                  early_stopping_rounds=params_xgb.get("early_stopping_rounds"),
+                  verbose=False,
+                  )
+
+        # merge importance: a string variable's importance is the sum over its one-hot sub-variables
+        importance = model.feature_importances_
+        feature = []
+        importance_weight = []
+        for x_column in num_columns:
+            for i, j in zip(model_columns, importance):
+                if i == x_column:
+                    feature.append(x_column)
+                    importance_weight.append(j)
+                    break
+        for x_column in str_columns:
+            feature_cache = 0
+            for i, j in zip(model_columns, importance):
+                if i.startswith(f"{x_column}("):
+                    feature_cache += j
+            feature.append(x_column)
+            importance_weight.append(feature_cache)
+
+        df_importance = pd.DataFrame({'feature': feature, 'importance_weight': importance_weight})
+        df_importance.sort_values(by=["importance_weight"], ascending=[False], inplace=True)
+        df_importance.reset_index(drop=True, inplace=True)
+        df_importance_rank = df_importance[df_importance["importance_weight"] > 0]
+        df_importance_rank = df_importance_rank.reset_index(drop=True)
+
+        x_columns_filter = list(df_importance_rank["feature"])[0:max_feature_num]
+
+        context.set_filter_info(ContextEnum.FILTER_FAST,
+                                f"筛选前变量数量:{len(x_columns)}\n{x_columns}\n"
+                                f"快速筛选剔除变量数量:{len(x_columns) - len(x_columns_filter)}", detail=df_importance)
+
+        return x_columns_filter
+
+    def feature_search(self, data: DataSplitEntity, *args, **kwargs):
+        x_columns = self._f_fast_filter(data)
+        # sort to guard against possible ordering-related bugs
+        x_columns.sort()
+        self.x_columns = x_columns
+
+    def variable_analyse(self, *args, **kwargs):
+        pass
+
+    def feature_generate(self, data: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
+        df = data.copy()
+        model_columns = []
+        for x_column in self.x_columns:
+            if x_column in self.points_dict.keys():
+                points = self.points_dict[x_column]
+                df[x_column] = df[x_column].apply(lambda x: f_format_value(points, x))
+                model_columns.append(x_column)
+            elif x_column in self.one_hot_encoder_dict.keys():
+                one_hot_encoder = self.one_hot_encoder_dict[x_column]
+                one_hot_encoder.encoder(df)
+                model_columns.extend(one_hot_encoder.columns_onehot)
+            else:
+                model_columns.append(x_column)
+
+        return df[model_columns]
+
+    def feature_save(self, *args, **kwargs):
+        pass
+
+    def feature_load(self, path: str, *args, **kwargs):
+        pass
+
+    def feature_report(self, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
+        self.jupyter_print()
+        return {}
+
+    def jupyter_print(self, *args, **kwargs):
+
+        max_feature_num = self.ml_config.max_feature_num
+
+        from IPython import display
+
+        filter_fast = context.get(ContextEnum.FILTER_FAST)
+        f_display_title(display, "快速筛选过程")
+        print(f"剔除变量重要性排名{max_feature_num}以后的变量")
+        print(filter_fast.get("overview"))
+        display.display(filter_fast["detail"])
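
A minimal sketch of the importance merge in _f_fast_filter above (values are hypothetical; the "purpose(A40)"-style names follow the OneHot naming convention from feature/bin/utils.py):

    model_columns = ["age", "purpose(A40)", "purpose(A41)"]
    importance = [0.5, 0.2, 0.3]
    # numeric column "age" keeps its own weight: 0.5
    # string column "purpose" sums its one-hot sub-columns: 0.2 + 0.3 = 0.5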

+ 94 - 0
feature/bin/utils.py

@@ -0,0 +1,94 @@
+# -*- coding:utf-8 -*-
+"""
+@author: yq
+@time: 2023/12/28
+@desc: feature engineering utilities
+"""
+import re
+from typing import List
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import OneHotEncoder
+
+FORMAT_DICT = {
+    # ratio type: -1 to 1
+    "bin_rate1": np.arange(-1, 1 + 0.1, 0.1),
+
+    # count type 1: 0 to 10
+    "bin_cnt1": np.arange(0, 11, 1),
+    # count type 2: 0 to 20
+    "bin_cnt2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 17, 20],
+    # count type 3: 0 to 50
+    "bin_cnt3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50],
+    # count type 4: 0 to 100
+    "bin_cnt4": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 80, 100],
+
+    # amount type 1: 0 to 10k
+    "bin_amt1": np.arange(0, 1.1e4, 1e3),
+    # amount type 2: 0 to 50k
+    "bin_amt2": np.arange(0, 5.5e4, 5e3),
+    # amount type 3: 0 to 100k
+    "bin_amt3": np.arange(0, 11e4, 1e4),
+    # amount type 4: 0 to 200k
+    "bin_amt4": [0, 1e4, 2e4, 3e4, 4e4, 5e4, 8e4, 10e4, 15e4, 20e4],
+    # amount type 5: 0 to 1M
+    "bin_amt5": [0, 5e4, 10e4, 15e4, 20e4, 25e4, 30e4, 40e4, 50e4, 100e4],
+
+    # age type
+    "bin_age": [20, 25, 30, 35, 40, 45, 50, 55, 60, 65],
+}
+
+
+# coarse binning
+def f_format_bin(data_describe: pd.Series):
+    # pick the most suitable preset grid of normalized bin cut points
+    percent10 = data_describe["10%"]
+    percent90 = data_describe["90%"]
+    cache = None
+    for k, v_list in FORMAT_DICT.items():
+        bin_min = min(v_list)
+        bin_max = max(v_list)
+        if bin_min <= percent10 and percent90 <= bin_max:
+            if cache is None:
+                cache = (k, bin_max)
+            elif cache[1] > bin_max:
+                cache = (k, bin_max)
+    if cache is None:
+        return None
+    return FORMAT_DICT[cache[0]]
+
+
+def f_format_value(points, raw_v):
+    format_v = raw_v
+    # choose the left-hand cut point within the bin
+    for idx in range(1, len(points)):
+        v_left = points[idx - 1]
+        v_right = points[idx]
+        # left-edge rule
+        if v_left <= raw_v < v_right:
+            format_v = v_left
+        if raw_v > v_right:
+            format_v = v_right
+
+    return format_v
+
+
+class OneHot():
+
+    def __init__(self, data: pd.DataFrame, x_column: str):
+        self._x_column = x_column
+        self._one_hot_encoder = OneHotEncoder()
+        self._one_hot_encoder.fit(data[x_column].to_numpy().reshape(-1, 1))
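+        # strip "[", "]" and "<" from the generated column names: XGBoost rejects
+        # feature names containing these characters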
+        self._columns_onehot = [re.sub(r"[\[\]<]", "", f"{x_column}({i})") for i in
+                                self._one_hot_encoder.categories_[0]]
+
+    def encoder(self, data: pd.DataFrame):
+        one_hot_x = self._one_hot_encoder.transform(data[self._x_column].to_numpy().reshape(-1, 1))
+        one_hot_x = one_hot_x.toarray()
+        for idx, column_name in enumerate(self._columns_onehot):
+            data[column_name] = one_hot_x[:, idx]
+
+    @property
+    def columns_onehot(self) -> List[str]:
+        return self._columns_onehot
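
A worked example of f_format_bin / f_format_value above (percentile values assumed for illustration): for a feature whose 10% / 90% percentiles are 2 and 45, the tightest preset grid covering [2, 45] is "bin_cnt3" (0 to 50):

    points = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    f_format_value(points, 13)   # -> 10, snapped to the left edge of bin [10, 15)
    f_format_value(points, 999)  # -> 50, clamped to the last cut point
    f_format_value(points, -5)   # -> -5, values below the grid pass through unchanged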

+ 3 - 1
feature/feature_strategy_factory.py

@@ -8,11 +8,13 @@ from typing import Type
 
 from commom import GeneralException
 from enums import FeatureStrategyEnum, ResultCodesEnum
+from .bin.strategy_norm import StrategyNorm
 from .feature_strategy_base import FeatureStrategyBase
 from .woe.strategy_woe import StrategyWoe
 
 strategy_map = {
-    FeatureStrategyEnum.WOE.value: StrategyWoe
+    FeatureStrategyEnum.WOE.value: StrategyWoe,
+    FeatureStrategyEnum.NORM.value: StrategyNorm
 }
 
 

+ 1 - 0
feature/woe/strategy_woe.py

@@ -525,6 +525,7 @@ class StrategyWoe(FeatureStrategyBase):
 
         df_iv_psi_vif["释义"] = anns
         df_iv_psi_vif.sort_values(by=["iv"], ascending=[False], inplace=True)
+        df_iv_psi_vif = df_iv_psi_vif.reset_index(drop=True)
         img_path_iv = self.ml_config.f_get_save_path(f"iv.png")
         f_df_to_image(df_iv_psi_vif, img_path_iv)
         metric_value_dict["变量iv"] = MetricFucResultEntity(table=df_iv_psi_vif, image_path=img_path_iv)

+ 3 - 1
model/model_factory.py

@@ -10,9 +10,11 @@ from commom import GeneralException
 from enums import ModelEnum, ResultCodesEnum
 from .model_base import ModelBase
 from .model_lr import ModelLr
+from .model_xgb import ModelXgb
 
 model_map = {
-    ModelEnum.LR.value: ModelLr
+    ModelEnum.LR.value: ModelLr,
+    ModelEnum.XGB.value: ModelXgb
 }
 
 

+ 218 - 0
model/model_xgb.py

@@ -0,0 +1,218 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/1
+@desc: 
+"""
+import os.path
+from os.path import dirname, realpath
+from typing import Dict
+
+import numpy as np
+import pandas as pd
+import scorecardpy as sc
+import xgboost as xgb
+from sklearn2pmml import sklearn2pmml, make_pmml_pipeline
+
+from commom import f_df_to_image, f_display_images_by_side, GeneralException, f_display_title, \
+    f_image_crop_white_borders
+from config import BaseConfig
+from entitys import MetricFucResultEntity, DataSplitEntity, DataFeatureEntity
+from enums import ResultCodesEnum, ConstantEnum, FileEnum
+from .model_base import ModelBase
+from .model_utils import f_stress_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
+
+
+class ModelXgb(ModelBase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # report template (reuses the LR model-report template)
+        self._template_path = os.path.join(dirname(dirname(realpath(__file__))), "./template/模型开发报告模板_lr.docx")
+        self.model = None
+
+    def get_report_template_path(self):
+        return self._template_path
+
+    def train(self, train_data: DataFeatureEntity, test_data: DataFeatureEntity, *args, **kwargs):
+        print(f"{'-' * 50}开始训练{'-' * 50}")
+        params_xgb = self.ml_config.params_xgb
+
+        # dtrain = xgb.DMatrix(data=train_data.data_x, label=train_data.data_y)
+        # dtest = xgb.DMatrix(data=test_data.data_x, label=test_data.data_y)
+        # self.model = xgb.train(
+        #     params_xgb,
+        #     dtrain=dtrain,
+        #     evals=[(dtrain, 'train'), (dtest, 'test')],
+        #     num_boost_round=params_xgb.get("num_boost_round"),
+        #     early_stopping_rounds=params_xgb.get("early_stopping_rounds"),
+        #     verbose_eval=params_xgb.get("verbose_eval")
+        # )
+
+        self.model = xgb.XGBClassifier(objective=params_xgb.get("objective"),
+                                       n_estimators=params_xgb.get("num_boost_round"),
+                                       max_depth=params_xgb.get("max_depth"),
+                                       learning_rate=params_xgb.get("learning_rate"),
+                                       random_state=params_xgb.get("random_state"),
+                                       reg_alpha=params_xgb.get("alpha"),
+                                       subsample=params_xgb.get("subsample"),
+                                       colsample_bytree=params_xgb.get("colsample_bytree"),
+                                       importance_type='weight'
+                                       )
+
+        self.model.fit(X=train_data.data_x, y=train_data.data_y,
+                       eval_set=[(train_data.data_x, train_data.data_y), (test_data.data_x, test_data.data_y)],
+                       eval_metric=params_xgb.get("eval_metric"),
+                       early_stopping_rounds=params_xgb.get("early_stopping_rounds"),
+                       verbose=params_xgb.get("verbose_eval"),
+                       )
+
+        if params_xgb.get("trees_print"):
+            trees = self.model.get_booster().get_dump()
+            for i, tree in enumerate(trees):
+                if i < self.model.best_ntree_limit:
+                    print(f"Tree {i}:")
+                    print(tree)
+
+        self._train_score = self.prob(train_data.data_x)
+        self._test_score = self.prob(test_data.data_x)
+
+    def prob(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
+        prob = self.model.predict_proba(x)[:, 1]
+        return prob
+
+    def score(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
+        pass
+
+    def score_rule(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
+        pass
+
+    def model_save(self):
+        if self.model is None:
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message="模型不存在")
+
+        path_model = self.ml_config.f_get_save_path(FileEnum.MODEL.value)
+        self.model.save_model(path_model)
+        print(f"model save to【{path_model}】success. ")
+
+        path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
+        pipeline = make_pmml_pipeline(self.model)
+        sklearn2pmml(pipeline, path_pmml, with_repr=True, java_home=BaseConfig.java_home)
+
+    def model_load(self, path: str, *args, **kwargs):
+        if not os.path.isdir(path):
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")
+        path_model = os.path.join(path, FileEnum.MODEL.value)
+        if not os.path.isfile(path_model):
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_model}】不存在")
+
+        self.model = xgb.XGBClassifier()
+        self.model.load_model(path_model)
+
+        print(f"model load from【{path_model}】success.")
+
+    def train_report(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
+
+        def _get_auc_ks(data_y, score, title):
+            perf = sc.perf_eva(data_y, score, title=title, show_plot=True)
+            path = self.ml_config.f_get_save_path(f"perf_{title}.png")
+            perf["pic"].savefig(path)
+            auc = perf["AUC"]
+            ks = perf["KS"]
+            f_image_crop_white_borders(path, path)
+            return auc, ks, path
+
+        def _get_perf():
+            # model KS / AUC
+            img_path_auc_ks = []
+
+            train_score = self._train_score
+            test_score = self._test_score
+
+            train_auc, train_ks, path = _get_auc_ks(train_data[y_column], train_score, "train")
+            img_path_auc_ks.append(path)
+            test_auc, test_ks, path = _get_auc_ks(test_data[y_column], test_score, "test")
+            img_path_auc_ks.append(path)
+
+            df_auc_ks = pd.DataFrame()
+            df_auc_ks["样本集"] = ["训练集", "测试集"]
+            df_auc_ks["AUC"] = [train_auc, test_auc]
+            df_auc_ks["KS"] = [train_ks, test_ks]
+            metric_value_dict[f"模型结果"] = MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks,
+                                                               image_size=5, table_font_size=10)
+
+            # scorecard-style score binning
+            train_score_bin, score_bins = f_get_model_score_bin(train_data, train_score)
+            train_data_gain = f_calcu_model_ks(train_score_bin, y_column, sort_ascending=False)
+            img_path_train_gain = self.ml_config.f_get_save_path("train_gain.png")
+            f_df_to_image(train_data_gain, img_path_train_gain)
+            metric_value_dict["训练集分数分箱"] = MetricFucResultEntity(table=train_data_gain,
+                                                                  image_path=img_path_train_gain)
+
+            test_score_bin, _ = f_get_model_score_bin(test_data, test_score, score_bins)
+            test_data_gain = f_calcu_model_ks(test_score_bin, y_column, sort_ascending=False)
+            img_path_test_gain = self.ml_config.f_get_save_path("test_gain.png")
+            f_df_to_image(test_data_gain, img_path_test_gain)
+            metric_value_dict["测试集分数分箱"] = MetricFucResultEntity(table=test_data_gain,
+                                                                  image_path=img_path_test_gain)
+
+            # model score PSI
+            model_psi = f_calcu_model_psi(train_score_bin, test_score_bin, sort_ascending=False)
+            img_path_psi = self.ml_config.f_get_save_path("model_psi.png")
+            f_df_to_image(model_psi, img_path_psi)
+            metric_value_dict["模型稳定性"] = MetricFucResultEntity(table=model_psi,
+                                                                value=model_psi["psi"].sum().round(3),
+                                                                image_path=img_path_psi)
+            return train_score_bin, test_score_bin
+
+        y_column = self._ml_config.y_column
+        stress_test = self.ml_config.stress_test
+        stress_sample_times = self.ml_config.stress_sample_times
+        stress_bad_rate_list = self.ml_config.stress_bad_rate_list
+
+        train_data = data.train_data
+        test_data = data.test_data
+
+        metric_value_dict = {}
+
+        _, test_score_bin = _get_perf()
+
+        # stress test
+        if stress_test:
+            df_stress = f_stress_test(test_score_bin, sample_times=stress_sample_times,
+                                      bad_rate_list=stress_bad_rate_list,
+                                      target_column=y_column, score_column=ConstantEnum.SCORE.value,
+                                      sort_ascending=False)
+
+            img_path_stress = self.ml_config.f_get_save_path("stress_test.png")
+            f_df_to_image(df_stress, img_path_stress)
+            metric_value_dict["压力测试"] = MetricFucResultEntity(table=df_stress, image_path=img_path_stress)
+
+        if self.ml_config.jupyter_print:
+            self.jupyter_print(metric_value_dict)
+
+        return metric_value_dict
+
+    def jupyter_print(self, metric_value_dict: Dict[str, MetricFucResultEntity], *args, **kwargs):
+        from IPython import display
+        f_display_title(display, "模型结果")
+        display.display(metric_value_dict["模型结果"].table)
+        f_display_images_by_side(display, metric_value_dict["模型结果"].image_path)
+
+        # model PSI
+        f_display_title(display, "模型psi")
+        display.display(metric_value_dict["模型稳定性"].table)
+        print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
+
+        f_display_title(display, "分数分箱")
+        print("训练集-分数分箱")
+        display.display(metric_value_dict["训练集分数分箱"].table)
+        print("测试集-分数分箱")
+        display.display(metric_value_dict["测试集分数分箱"].table)
+
+        if "压力测试" in metric_value_dict.keys():
+            f_display_title(display, "压力测试")
+            display.display(metric_value_dict["压力测试"].table)
+
+
+if __name__ == "__main__":
+    pass
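
model_save above exports both model.pkl and model.pmml (FileEnum.PMML). A minimal scoring sketch using pypmml, which is pinned in requirements-analysis.txt; the path and feature values are illustrative assumptions:

    from pypmml import Model

    # load the PMML exported by model_save(); the path is an assumption
    pmml = Model.fromFile("model.pmml")
    # predict accepts a dict (or pandas DataFrame) of feature values
    print(pmml.predict({"duration_in_month": 24, "credit_amount": 3000}))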

+ 10 - 4
requirements-analysis.txt

@@ -1,15 +1,11 @@
-#pymysql==1.0.2
 python-docx==0.8.11
 xlrd==2.0.1
 scorecardpy==0.1.9.2
-#dataframe_image==0.1.14
 matplotlib==3.3.4
 numpy==1.18.2
 pandas==1.1.5
 scikit-learn==0.24.2
-#pyhive==0.7.0
 thrift==0.16.0
-#thrift-sasl==0.4.3
 seaborn==0.11.2
 contextvars==2.4
 tqdm==4.64.0
@@ -19,3 +15,13 @@ statsmodels==0.12.2
 beautifulsoup4==4.11.1
 openpyxl==3.0.9
 torch==1.1.0
+xgboost==1.0.2
+pypmml==0.9.0
+#pymysql==1.0.2
+#dataframe_image==0.1.14
+#thrift-sasl==0.4.3
+#pyhive==0.7.0
+#sklearn2pmml==0.103.3
+#sklearn-pandas==2.2.0
+#dill==0.3.4
+

+ 0 - 0
train_test.py → train_test_lr.py


+ 75 - 0
train_test_xgb.py

@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/27
+@desc: 
+"""
+import time
+
+from entitys import DataSplitEntity, MlConfigEntity
+from pipeline import Pipeline
+
+if __name__ == "__main__":
+    time_now = time.time()
+    import scorecardpy as sc
+
+    # load data
+    dat = sc.germancredit()
+    dat_columns = dat.columns.tolist()
+    dat_columns = [c.replace(".", "_") for c in dat_columns]
+    dat.columns = dat_columns
+
+    dat["creditability"] = dat["creditability"].apply(lambda x: 1 if x == "bad" else 0)
+
+    # dat["credit_amount_corr1"] = dat["credit_amount"] * 2
+    # dat["credit_amount_corr2"] = dat["credit_amount"] * 3
+
+    data = DataSplitEntity(train_data=dat[:709], test_data=dat[709:])
+
+    # train and generate a report
+    # train_pipeline = Pipeline(MlConfigEntity.from_config('config/demo/ml_config_template.json'), data)
+    # feature processing
+    cfg = {
+        # project name; affects where artifacts are stored
+        "project_name": "demo",
+        # print output in Jupyter
+        "jupyter_print": True,
+        # whether to enable coarse binning
+        "format_bin": True,
+        "max_feature_num": 20,
+        # stress test
+        "stress_test": True,
+        # number of sampling rounds for the stress test
+        "stress_sample_times": 10,
+        # y
+        "y_column": "creditability",
+        # candidate variables for modeling
+        # "x_columns": [
+        # "duration_in_month",
+        # "credit_amount",
+        # "age_in_years",
+        # "purpose",
+        # "credit_history",
+        # "random",
+
+        # "credit_amount_corr1",
+        # "credit_amount_corr2",
+        #   ],
+        # variable annotations
+        "columns_anns": {
+            "age_in_years": "年龄",
+            "credit_history": "借贷历史"
+        },
+        # excluded variables
+        "columns_exclude": [],
+        # variables forced into the model
+        # "columns_include": ["credit_amount"],
+        "model_type": "xgb",
+        "feature_strategy": "norm",
+    }
+
+    train_pipeline = Pipeline(data=data, **cfg)
+    train_pipeline.train()
+    train_pipeline.report()
+
+    print(time.time() - time_now)