yq 18 hours ago
parent
commit
8e7a1ebb37

+ 3 - 0
config/base_config.py

@@ -19,5 +19,8 @@ class BaseConfig:
     # runtime environment; currently affects where context is stored
     run_env = "jupyter"
 
+    # java_home, used by sklearn2pmml when exporting PMML
+    java_home = "/usr/local/jdk1.8"
+
     # merge table columns that share the same column name
     merge_table_column = True

+ 3 - 3
entitys/data_feaure_entity.py

@@ -49,15 +49,15 @@ class DataSplitEntity():
         self._data = pd.concat((train_data, test_data))
 
     @property
-    def data(self):
+    def data(self) -> pd.DataFrame:
         return self._data
 
     @property
-    def train_data(self):
+    def train_data(self) -> pd.DataFrame:
         return self._train_data
 
     @property
-    def test_data(self):
+    def test_data(self) -> pd.DataFrame:
         return self._test_data
 
     def get_distribution(self, y_column) -> pd.DataFrame:

+ 24 - 16
entitys/ml_config_entity.py

@@ -41,15 +41,16 @@ class MlConfigEntity():
                  stress_bad_rate_list: List[float] = [],
                  model_type="lr",
                  feature_strategy="woe",
+                 params_xgb={},
                  rules=[],
-                 fill_method: str = None,
-                 fill_value=None,
                  *args, **kwargs):
 
         self._model_type = model_type
 
         self._feature_strategy = feature_strategy
 
+        self._params_xgb = params_xgb
+
         self._psi_threshold = psi_threshold
 
         self._vif_threshold = vif_threshold
@@ -96,12 +97,6 @@ class MlConfigEntity():
         # candidate x variables
         self._x_columns = x_columns
 
-        # missing-value fill method
-        self._fill_method = fill_method
-
-        # missing-value fill value
-        self._fill_value = fill_value
-
         # threshold used when filtering variables by IV
         self._iv_threshold = iv_threshold
 
@@ -145,6 +140,27 @@ class MlConfigEntity():
     def feature_strategy(self):
         return self._feature_strategy
 
+    @property
+    def params_xgb(self):
+        params = {
+            'objective': 'binary:logistic',
+            'eval_metric': 'auc',
+            'learning_rate': 0.1,
+            'max_depth': 3,
+            'subsample': None,
+            'colsample_bytree': None,
+            'alpha': None,
+            'num_boost_round': 500,
+            'early_stopping_rounds': 20,
+            'verbose_eval': 10,
+            'random_state': 2025,
+            'save_pmml': True,
+            'trees_print': False,
+        }
+        params.update(self._params_xgb)
+
+        return params
+
     @property
     def psi_threshold(self):
         return self._psi_threshold
@@ -230,14 +246,6 @@ class MlConfigEntity():
     def columns_anns(self):
         return self._columns_anns
 
-    @property
-    def fill_value(self):
-        return self._fill_value
-
-    @property
-    def fill_method(self):
-        return self._fill_method
-
     @property
     def iv_threshold(self):
         return self._iv_threshold
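
The params_xgb property above merges user-supplied overrides into the built-in defaults. A minimal usage sketch (assuming the remaining constructor arguments keep their defaults; values are illustrative):

    # override two defaults; everything else falls back to the built-in dict
    cfg = MlConfigEntity(params_xgb={"max_depth": 4, "learning_rate": 0.05})
    print(cfg.params_xgb["max_depth"])        # 4   (override wins)
    print(cfg.params_xgb["num_boost_round"])  # 500 (default retained)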

+ 3 - 0
enums/feature_strategy_enum.py

@@ -8,4 +8,7 @@ from enum import Enum
 
 
 class FeatureStrategyEnum(Enum):
+    # WOE encoding
     WOE = "woe"
+    # coarse-bin normalization encoding
+    NORM = "norm"

+ 1 - 0
enums/file_enum.py

@@ -15,6 +15,7 @@ class FileEnum(Enum):
     CARD_CFG = "card.cfg"
     COEF = "coef.json"
     MODEL = "model.pkl"
+    PMML = "model.pmml"
 
 
 

+ 1 - 0
enums/model_enum.py

@@ -9,3 +9,4 @@ from enum import Enum
 
 class ModelEnum(Enum):
     LR = "lr"
+    XGB = "xgb"

+ 9 - 0
feature/bin/__init__.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2025/4/3
+@desc: 
+"""
+
+if __name__ == "__main__":
+    pass

+ 179 - 0
feature/bin/strategy_norm.py

@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2025/4/3
+@desc: value normalization, similar to binning
+"""
+
+from typing import Dict, List
+
+import pandas as pd
+import xgboost as xgb
+from pandas.core.dtypes.common import is_numeric_dtype
+
+from commom import GeneralException, f_display_title
+from data import DataExplore
+from entitys import DataSplitEntity, MetricFucResultEntity
+from enums import ResultCodesEnum, ContextEnum
+from feature.feature_strategy_base import FeatureStrategyBase
+from init import context
+from .utils import f_format_value, OneHot, f_format_bin
+
+
+class StrategyNorm(FeatureStrategyBase):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.x_columns = None
+        self.one_hot_encoder_dict: Dict[str, OneHot] = {}
+        self.points_dict: Dict[str, List[float]] = {}
+
+    def _f_fast_filter(self, data: DataSplitEntity) -> List[str]:
+        y_column = self.ml_config.y_column
+        x_columns = self.ml_config.x_columns
+        columns_exclude = self.ml_config.columns_exclude
+        format_bin = self.ml_config.format_bin
+        params_xgb = self.ml_config.params_xgb
+        max_feature_num = self.ml_config.max_feature_num
+
+        train_data = data.train_data.copy()
+        test_data = data.test_data.copy()
+
+        # feature column configuration
+        if len(x_columns) == 0:
+            x_columns = train_data.columns.tolist()
+        if y_column in x_columns:
+            x_columns.remove(y_column)
+        for column in columns_exclude:
+            if column in x_columns:
+                x_columns.remove(column)
+
+        # basic check that each variable's data type is consistent
+        check_msg = DataExplore.check_type(data.data[x_columns])
+        if check_msg != "":
+            print(f"数据类型分析:\n{check_msg}\n同一变量请保持数据类型一致")
+            raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"数据类型错误.")
+
+        # data preprocessing
+        model_columns = []
+        num_columns = []
+        str_columns = []
+        for x_column in x_columns:
+            if is_numeric_dtype(train_data[x_column]):
+                num_columns.append(x_column)
+                # coarse binning
+                if format_bin:
+                    data_x_describe = train_data[x_column].describe(percentiles=[0.1, 0.9])
+                    points = f_format_bin(data_x_describe)
+                    # f_format_bin returns None when no preset grid covers the 10%-90% range
+                    if points is not None:
+                        self.points_dict[x_column] = points
+                        train_data[x_column] = train_data[x_column].apply(lambda x: f_format_value(points, x))
+                        test_data[x_column] = test_data[x_column].apply(lambda x: f_format_value(points, x))
+            else:
+                str_columns.append(x_column)
+                one_hot_encoder = OneHot(data.data, x_column)
+                one_hot_encoder.encoder(train_data)
+                one_hot_encoder.encoder(test_data)
+                model_columns.extend(one_hot_encoder.columns_onehot)
+                self.one_hot_encoder_dict[x_column] = one_hot_encoder
+
+        model_columns.extend(num_columns)
+
+        # drop weak variables by feature importance
+        model = xgb.XGBClassifier(objective=params_xgb.get("objective"),
+                                  n_estimators=params_xgb.get("num_boost_round"),
+                                  max_depth=params_xgb.get("max_depth"),
+                                  learning_rate=params_xgb.get("learning_rate"),
+                                  random_state=params_xgb.get("random_state"),
+                                  reg_alpha=params_xgb.get("alpha"),
+                                  subsample=params_xgb.get("subsample"),
+                                  colsample_bytree=params_xgb.get("colsample_bytree"),
+                                  importance_type='weight'
+                                  )
+
+        model.fit(X=train_data[model_columns], y=train_data[y_column],
+                  eval_set=[(train_data[model_columns], train_data[y_column]),
+                            (test_data[model_columns], test_data[y_column])],
+                  eval_metric=params_xgb.get("eval_metric"),
+                  early_stopping_rounds=params_xgb.get("early_stopping_rounds"),
+                  verbose=False,
+                  )
+
+        # merge importance: a string variable's importance is the sum over its one-hot sub-variables
+        importance = model.feature_importances_
+        feature = []
+        importance_weight = []
+        for x_column in num_columns:
+            for i, j in zip(model_columns, importance):
+                if i == x_column:
+                    feature.append(x_column)
+                    importance_weight.append(j)
+                    break
+        for x_column in str_columns:
+            feature_cache = 0
+            for i, j in zip(model_columns, importance):
+                if i.startswith(f"{x_column}("):
+                    feature_cache += j
+            feature.append(x_column)
+            importance_weight.append(feature_cache)
+
+        df_importance = pd.DataFrame({'feature': feature, 'importance_weight': importance_weight})
+        df_importance.sort_values(by=["importance_weight"], ascending=[False], inplace=True)
+        df_importance.reset_index(drop=True, inplace=True)
+        df_importance_rank = df_importance[df_importance["importance_weight"] > 0]
+        df_importance_rank = df_importance_rank.reset_index(drop=True)
+
+        x_columns_filter = list(df_importance_rank["feature"])[0:max_feature_num]
+
+        context.set_filter_info(ContextEnum.FILTER_FAST,
+                                f"筛选前变量数量:{len(x_columns)}\n{x_columns}\n"
+                                f"快速筛选剔除变量数量:{len(x_columns) - len(x_columns_filter)}", detail=df_importance)
+
+        return x_columns_filter
+
+    def feature_search(self, data: DataSplitEntity, *args, **kwargs):
+        x_columns = self._f_fast_filter(data)
+        # sort to guard against possible ordering-related bugs
+        x_columns.sort()
+        self.x_columns = x_columns
+
+    def variable_analyse(self, *args, **kwargs):
+        pass
+
+    def feature_generate(self, data: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
+        df = data.copy()
+        model_columns = []
+        for x_column in self.x_columns:
+            if x_column in self.points_dict.keys():
+                points = self.points_dict[x_column]
+                df[x_column] = df[x_column].apply(lambda x: f_format_value(points, x))
+                model_columns.append(x_column)
+            elif x_column in self.one_hot_encoder_dict.keys():
+                one_hot_encoder = self.one_hot_encoder_dict[x_column]
+                one_hot_encoder.encoder(df)
+                model_columns.extend(one_hot_encoder.columns_onehot)
+            else:
+                model_columns.append(x_column)
+
+        return df[model_columns]
+
+    def feature_save(self, *args, **kwargs):
+        pass
+
+    def feature_load(self, path: str, *args, **kwargs):
+        pass
+
+    def feature_report(self, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
+        self.jupyter_print()
+        return {}
+
+    def jupyter_print(self, *args, **kwargs):
+
+        max_feature_num = self.ml_config.max_feature_num
+
+        from IPython import display
+
+        filter_fast = context.get(ContextEnum.FILTER_FAST)
+        f_display_title(display, "快速筛选过程")
+        print(f"剔除变量重要性排名{max_feature_num}以后的变量")
+        print(filter_fast.get("overview"))
+        display.display(filter_fast["detail"])
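
A minimal sketch of the importance merge in _f_fast_filter above (values are hypothetical; the "purpose(A40)"-style names follow the OneHot naming convention from feature/bin/utils.py):

    model_columns = ["age", "purpose(A40)", "purpose(A41)"]
    importance = [0.5, 0.2, 0.3]
    # numeric column "age" keeps its own weight: 0.5
    # string column "purpose" sums its one-hot sub-columns: 0.2 + 0.3 = 0.5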

+ 94 - 0
feature/bin/utils.py

@@ -0,0 +1,94 @@
+# -*- coding:utf-8 -*-
+"""
+@author: yq
+@time: 2023/12/28
+@desc: feature engineering utilities
+"""
+import re
+from typing import List
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import OneHotEncoder
+
+FORMAT_DICT = {
+    # ratio type: -1 to 1
+    "bin_rate1": np.arange(-1, 1 + 0.1, 0.1),
+
+    # count type 1: 0 to 10
+    "bin_cnt1": np.arange(0, 11, 1),
+    # count type 2: 0 to 20
+    "bin_cnt2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 17, 20],
+    # count type 3: 0 to 50
+    "bin_cnt3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50],
+    # count type 4: 0 to 100
+    "bin_cnt4": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 80, 100],
+
+    # amount type 1: 0 to 10k
+    "bin_amt1": np.arange(0, 1.1e4, 1e3),
+    # amount type 2: 0 to 50k
+    "bin_amt2": np.arange(0, 5.5e4, 5e3),
+    # amount type 3: 0 to 100k
+    "bin_amt3": np.arange(0, 11e4, 1e4),
+    # amount type 4: 0 to 200k
+    "bin_amt4": [0, 1e4, 2e4, 3e4, 4e4, 5e4, 8e4, 10e4, 15e4, 20e4],
+    # amount type 5: 0 to 1M
+    "bin_amt5": [0, 5e4, 10e4, 15e4, 20e4, 25e4, 30e4, 40e4, 50e4, 100e4],
+
+    # age type
+    "bin_age": [20, 25, 30, 35, 40, 45, 50, 55, 60, 65],
+}
+
+
+# coarse binning
+def f_format_bin(data_describe: pd.Series):
+    # pick the most suitable preset grid of normalized bin cut points
+    percent10 = data_describe["10%"]
+    percent90 = data_describe["90%"]
+    cache = None
+    for k, v_list in FORMAT_DICT.items():
+        bin_min = min(v_list)
+        bin_max = max(v_list)
+        if bin_min <= percent10 and percent90 <= bin_max:
+            if cache is None:
+                cache = (k, bin_max)
+            elif cache[1] > bin_max:
+                cache = (k, bin_max)
+    if cache is None:
+        return None
+    return FORMAT_DICT[cache[0]]
+
+
+def f_format_value(points, raw_v):
+    format_v = raw_v
+    # choose the left-hand cut point within the bin
+    for idx in range(1, len(points)):
+        v_left = points[idx - 1]
+        v_right = points[idx]
+        # left-edge rule
+        if v_left <= raw_v < v_right:
+            format_v = v_left
+        if raw_v > v_right:
+            format_v = v_right
+
+    return format_v
+
+
+class OneHot():
+
+    def __init__(self, data: pd.DataFrame, x_column: str):
+        self._x_column = x_column
+        self._one_hot_encoder = OneHotEncoder()
+        self._one_hot_encoder.fit(data[x_column].to_numpy().reshape(-1, 1))
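+        # strip "[", "]" and "<" from the generated column names: XGBoost rejects
+        # feature names containing these characters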
+        self._columns_onehot = [re.sub(r"[\[\]<]", "", f"{x_column}({i})") for i in
+                                self._one_hot_encoder.categories_[0]]
+
+    def encoder(self, data: pd.DataFrame):
+        one_hot_x = self._one_hot_encoder.transform(data[self._x_column].to_numpy().reshape(-1, 1))
+        one_hot_x = one_hot_x.toarray()
+        for idx, column_name in enumerate(self._columns_onehot):
+            data[column_name] = one_hot_x[:, idx]
+
+    @property
+    def columns_onehot(self) -> List[str]:
+        return self._columns_onehot
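
A worked example of f_format_bin / f_format_value above (percentile values assumed for illustration): for a feature whose 10% / 90% percentiles are 2 and 45, the tightest preset grid covering [2, 45] is "bin_cnt3" (0 to 50):

    points = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    f_format_value(points, 13)   # -> 10, snapped to the left edge of bin [10, 15)
    f_format_value(points, 999)  # -> 50, clamped to the last cut point
    f_format_value(points, -5)   # -> -5, values below the grid pass through unchanged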

+ 3 - 1
feature/feature_strategy_factory.py

@@ -8,11 +8,13 @@ from typing import Type
 
 from commom import GeneralException
 from enums import FeatureStrategyEnum, ResultCodesEnum
+from .bin.strategy_norm import StrategyNorm
 from .feature_strategy_base import FeatureStrategyBase
 from .woe.strategy_woe import StrategyWoe
 
 strategy_map = {
-    FeatureStrategyEnum.WOE.value: StrategyWoe
+    FeatureStrategyEnum.WOE.value: StrategyWoe,
+    FeatureStrategyEnum.NORM.value: StrategyNorm
 }
 
 

+ 1 - 0
feature/woe/strategy_woe.py

@@ -525,6 +525,7 @@ class StrategyWoe(FeatureStrategyBase):
 
         df_iv_psi_vif["释义"] = anns
         df_iv_psi_vif.sort_values(by=["iv"], ascending=[False], inplace=True)
+        df_iv_psi_vif = df_iv_psi_vif.reset_index(drop=True)
         img_path_iv = self.ml_config.f_get_save_path(f"iv.png")
         f_df_to_image(df_iv_psi_vif, img_path_iv)
         metric_value_dict["变量iv"] = MetricFucResultEntity(table=df_iv_psi_vif, image_path=img_path_iv)

+ 3 - 1
model/model_factory.py

@@ -10,9 +10,11 @@ from commom import GeneralException
 from enums import ModelEnum, ResultCodesEnum
 from .model_base import ModelBase
 from .model_lr import ModelLr
+from .model_xgb import ModelXgb
 
 model_map = {
-    ModelEnum.LR.value: ModelLr
+    ModelEnum.LR.value: ModelLr,
+    ModelEnum.XGB.value: ModelXgb
 }
 
 

+ 218 - 0
model/model_xgb.py

@@ -0,0 +1,218 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/1
+@desc: 
+"""
+import os.path
+from os.path import dirname, realpath
+from typing import Dict
+
+import numpy as np
+import pandas as pd
+import scorecardpy as sc
+import xgboost as xgb
+from sklearn2pmml import sklearn2pmml, make_pmml_pipeline
+
+from commom import f_df_to_image, f_display_images_by_side, GeneralException, f_display_title, \
+    f_image_crop_white_borders
+from config import BaseConfig
+from entitys import MetricFucResultEntity, DataSplitEntity, DataFeatureEntity
+from enums import ResultCodesEnum, ConstantEnum, FileEnum
+from .model_base import ModelBase
+from .model_utils import f_stress_test, f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
+
+
+class ModelXgb(ModelBase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # report template (reuses the LR model-report template)
+        self._template_path = os.path.join(dirname(dirname(realpath(__file__))), "./template/模型开发报告模板_lr.docx")
+        self.model = None
+
+    def get_report_template_path(self):
+        return self._template_path
+
+    def train(self, train_data: DataFeatureEntity, test_data: DataFeatureEntity, *args, **kwargs):
+        print(f"{'-' * 50}开始训练{'-' * 50}")
+        params_xgb = self.ml_config.params_xgb
+
+        # dtrain = xgb.DMatrix(data=train_data.data_x, label=train_data.data_y)
+        # dtest = xgb.DMatrix(data=test_data.data_x, label=test_data.data_y)
+        # self.model = xgb.train(
+        #     params_xgb,
+        #     dtrain=dtrain,
+        #     evals=[(dtrain, 'train'), (dtest, 'test')],
+        #     num_boost_round=params_xgb.get("num_boost_round"),
+        #     early_stopping_rounds=params_xgb.get("early_stopping_rounds"),
+        #     verbose_eval=params_xgb.get("verbose_eval")
+        # )
+
+        self.model = xgb.XGBClassifier(objective=params_xgb.get("objective"),
+                                       n_estimators=params_xgb.get("num_boost_round"),
+                                       max_depth=params_xgb.get("max_depth"),
+                                       learning_rate=params_xgb.get("learning_rate"),
+                                       random_state=params_xgb.get("random_state"),
+                                       reg_alpha=params_xgb.get("alpha"),
+                                       subsample=params_xgb.get("subsample"),
+                                       colsample_bytree=params_xgb.get("colsample_bytree"),
+                                       importance_type='weight'
+                                       )
+
+        self.model.fit(X=train_data.data_x, y=train_data.data_y,
+                       eval_set=[(train_data.data_x, train_data.data_y), (test_data.data_x, test_data.data_y)],
+                       eval_metric=params_xgb.get("eval_metric"),
+                       early_stopping_rounds=params_xgb.get("early_stopping_rounds"),
+                       verbose=params_xgb.get("verbose_eval"),
+                       )
+
+        if params_xgb.get("trees_print"):
+            trees = self.model.get_booster().get_dump()
+            for i, tree in enumerate(trees):
+                if i < self.model.best_ntree_limit:
+                    print(f"Tree {i}:")
+                    print(tree)
+
+        self._train_score = self.prob(train_data.data_x)
+        self._test_score = self.prob(test_data.data_x)
+
+    def prob(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
+        prob = self.model.predict_proba(x)[:, 1]
+        return prob
+
+    def score(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
+        pass
+
+    def score_rule(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
+        pass
+
+    def model_save(self):
+        if self.model is None:
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message="模型不存在")
+
+        path_model = self.ml_config.f_get_save_path(FileEnum.MODEL.value)
+        self.model.save_model(path_model)
+        print(f"model save to【{path_model}】success. ")
+
+        path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
+        pipeline = make_pmml_pipeline(self.model)
+        sklearn2pmml(pipeline, path_pmml, with_repr=True, java_home=BaseConfig.java_home)
+
+    def model_load(self, path: str, *args, **kwargs):
+        if not os.path.isdir(path):
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")
+        path_model = os.path.join(path, FileEnum.MODEL.value)
+        if not os.path.isfile(path_model):
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型文件【{path_model}】不存在")
+
+        self.model = xgb.XGBClassifier()
+        self.model.load_model(path_model)
+
+        print(f"model load from【{path_model}】success.")
+
+    def train_report(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, MetricFucResultEntity]:
+
+        def _get_auc_ks(data_y, score, title):
+            perf = sc.perf_eva(data_y, score, title=title, show_plot=True)
+            path = self.ml_config.f_get_save_path(f"perf_{title}.png")
+            perf["pic"].savefig(path)
+            auc = perf["AUC"]
+            ks = perf["KS"]
+            f_image_crop_white_borders(path, path)
+            return auc, ks, path
+
+        def _get_perf():
+            # model KS / AUC
+            img_path_auc_ks = []
+
+            train_score = self._train_score
+            test_score = self._test_score
+
+            train_auc, train_ks, path = _get_auc_ks(train_data[y_column], train_score, "train")
+            img_path_auc_ks.append(path)
+            test_auc, test_ks, path = _get_auc_ks(test_data[y_column], test_score, "test")
+            img_path_auc_ks.append(path)
+
+            df_auc_ks = pd.DataFrame()
+            df_auc_ks["样本集"] = ["训练集", "测试集"]
+            df_auc_ks["AUC"] = [train_auc, test_auc]
+            df_auc_ks["KS"] = [train_ks, test_ks]
+            metric_value_dict[f"模型结果"] = MetricFucResultEntity(table=df_auc_ks, image_path=img_path_auc_ks,
+                                                               image_size=5, table_font_size=10)
+
+            # scorecard-style score binning
+            train_score_bin, score_bins = f_get_model_score_bin(train_data, train_score)
+            train_data_gain = f_calcu_model_ks(train_score_bin, y_column, sort_ascending=False)
+            img_path_train_gain = self.ml_config.f_get_save_path("train_gain.png")
+            f_df_to_image(train_data_gain, img_path_train_gain)
+            metric_value_dict["训练集分数分箱"] = MetricFucResultEntity(table=train_data_gain,
+                                                                  image_path=img_path_train_gain)
+
+            test_score_bin, _ = f_get_model_score_bin(test_data, test_score, score_bins)
+            test_data_gain = f_calcu_model_ks(test_score_bin, y_column, sort_ascending=False)
+            img_path_test_gain = self.ml_config.f_get_save_path("test_gain.png")
+            f_df_to_image(test_data_gain, img_path_test_gain)
+            metric_value_dict["测试集分数分箱"] = MetricFucResultEntity(table=test_data_gain,
+                                                                  image_path=img_path_test_gain)
+
+            # model score PSI
+            model_psi = f_calcu_model_psi(train_score_bin, test_score_bin, sort_ascending=False)
+            img_path_psi = self.ml_config.f_get_save_path("model_psi.png")
+            f_df_to_image(model_psi, img_path_psi)
+            metric_value_dict["模型稳定性"] = MetricFucResultEntity(table=model_psi,
+                                                                value=model_psi["psi"].sum().round(3),
+                                                                image_path=img_path_psi)
+            return train_score_bin, test_score_bin
+
+        y_column = self._ml_config.y_column
+        stress_test = self.ml_config.stress_test
+        stress_sample_times = self.ml_config.stress_sample_times
+        stress_bad_rate_list = self.ml_config.stress_bad_rate_list
+
+        train_data = data.train_data
+        test_data = data.test_data
+
+        metric_value_dict = {}
+
+        _, test_score_bin = _get_perf()
+
+        # stress test
+        if stress_test:
+            df_stress = f_stress_test(test_score_bin, sample_times=stress_sample_times,
+                                      bad_rate_list=stress_bad_rate_list,
+                                      target_column=y_column, score_column=ConstantEnum.SCORE.value,
+                                      sort_ascending=False)
+
+            img_path_stress = self.ml_config.f_get_save_path("stress_test.png")
+            f_df_to_image(df_stress, img_path_stress)
+            metric_value_dict["压力测试"] = MetricFucResultEntity(table=df_stress, image_path=img_path_stress)
+
+        if self.ml_config.jupyter_print:
+            self.jupyter_print(metric_value_dict)
+
+        return metric_value_dict
+
+    def jupyter_print(self, metric_value_dict: Dict[str, MetricFucResultEntity], *args, **kwargs):
+        from IPython import display
+        f_display_title(display, "模型结果")
+        display.display(metric_value_dict["模型结果"].table)
+        f_display_images_by_side(display, metric_value_dict["模型结果"].image_path)
+
+        # model PSI
+        f_display_title(display, "模型psi")
+        display.display(metric_value_dict["模型稳定性"].table)
+        print(f"模型psi: {metric_value_dict['模型稳定性'].value}")
+
+        f_display_title(display, "分数分箱")
+        print("训练集-分数分箱")
+        display.display(metric_value_dict["训练集分数分箱"].table)
+        print("测试集-分数分箱")
+        display.display(metric_value_dict["测试集分数分箱"].table)
+
+        if "压力测试" in metric_value_dict.keys():
+            f_display_title(display, "压力测试")
+            display.display(metric_value_dict["压力测试"].table)
+
+
+if __name__ == "__main__":
+    pass
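
model_save above exports both model.pkl and model.pmml (FileEnum.PMML). A minimal scoring sketch using pypmml, which is pinned in requirements-analysis.txt; the path and feature values are illustrative assumptions:

    from pypmml import Model

    # load the PMML exported by model_save(); the path is an assumption
    pmml = Model.fromFile("model.pmml")
    # predict accepts a dict (or pandas DataFrame) of feature values
    print(pmml.predict({"duration_in_month": 24, "credit_amount": 3000}))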

+ 10 - 4
requirements-analysis.txt

@@ -1,15 +1,11 @@
-#pymysql==1.0.2
 python-docx==0.8.11
 xlrd==2.0.1
 scorecardpy==0.1.9.2
-#dataframe_image==0.1.14
 matplotlib==3.3.4
 numpy==1.18.2
 pandas==1.1.5
 scikit-learn==0.24.2
-#pyhive==0.7.0
 thrift==0.16.0
-#thrift-sasl==0.4.3
 seaborn==0.11.2
 contextvars==2.4
 tqdm==4.64.0
@@ -19,3 +15,13 @@ statsmodels==0.12.2
 beautifulsoup4==4.11.1
 openpyxl==3.0.9
 torch==1.1.0
+xgboost==1.0.2
+pypmml==0.9.0
+#pymysql==1.0.2
+#dataframe_image==0.1.14
+#thrift-sasl==0.4.3
+#pyhive==0.7.0
+#sklearn2pmml==0.103.3
+#sklearn-pandas==2.2.0
+#dill==0.3.4
+

+ 0 - 0
train_test.py → train_test_lr.py


+ 75 - 0
train_test_xgb.py

@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/27
+@desc: 
+"""
+import time
+
+from entitys import DataSplitEntity, MlConfigEntity
+from pipeline import Pipeline
+
+if __name__ == "__main__":
+    time_now = time.time()
+    import scorecardpy as sc
+
+    # load data
+    dat = sc.germancredit()
+    dat_columns = dat.columns.tolist()
+    dat_columns = [c.replace(".", "_") for c in dat_columns]
+    dat.columns = dat_columns
+
+    dat["creditability"] = dat["creditability"].apply(lambda x: 1 if x == "bad" else 0)
+
+    # dat["credit_amount_corr1"] = dat["credit_amount"] * 2
+    # dat["credit_amount_corr2"] = dat["credit_amount"] * 3
+
+    data = DataSplitEntity(train_data=dat[:709], test_data=dat[709:])
+
+    # train and generate a report
+    # train_pipeline = Pipeline(MlConfigEntity.from_config('config/demo/ml_config_template.json'), data)
+    # feature processing
+    cfg = {
+        # project name; affects where artifacts are stored
+        "project_name": "demo",
+        # print output in Jupyter
+        "jupyter_print": True,
+        # whether to enable coarse binning
+        "format_bin": True,
+        "max_feature_num": 20,
+        # stress test
+        "stress_test": True,
+        # number of sampling rounds for the stress test
+        "stress_sample_times": 10,
+        # y
+        "y_column": "creditability",
+        # candidate variables for modeling
+        # "x_columns": [
+        # "duration_in_month",
+        # "credit_amount",
+        # "age_in_years",
+        # "purpose",
+        # "credit_history",
+        # "random",
+
+        # "credit_amount_corr1",
+        # "credit_amount_corr2",
+        #   ],
+        # variable annotations
+        "columns_anns": {
+            "age_in_years": "年龄",
+            "credit_history": "借贷历史"
+        },
+        # excluded variables
+        "columns_exclude": [],
+        # variables forced into the model
+        # "columns_include": ["credit_amount"],
+        "model_type": "xgb",
+        "feature_strategy": "norm",
+    }
+
+    train_pipeline = Pipeline(data=data, **cfg)
+    train_pipeline.train()
+    train_pipeline.report()
+
+    print(time.time() - time_now)