Bläddra i källkod

modify: xgb代码优化

yq 3 dagar sedan
förälder
incheckning
6839e43b22
6 ändrade filer med 64 tillägg och 45 borttagningar
  1. 3 0
      enums/constant_enum.py
  2. 2 1
      enums/context_enum.py
  3. 6 4
      feature/bin/strategy_norm.py
  4. 11 11
      feature/bin/utils.py
  5. 11 11
      feature/woe/utils.py
  6. 31 18
      model/model_xgb.py

+ 3 - 0
enums/constant_enum.py

@@ -12,3 +12,6 @@ class ConstantEnum(Enum):
     SCORE_BIN = "MODEL_SCORE_BIN"
     # lr模型常数项
     INTERCEPT = "const"
+
+    # xgb特征粗分箱时标准之外的箱
+    XGB_BIN_LOWEST = -99999.0

+ 2 - 1
enums/context_enum.py

@@ -17,5 +17,6 @@ class ContextEnum(Enum):
     FILTER_VIF = "filter_vif"
     FILTER_IVTOP = "filter_ivtop"
 
-    XGB_COLUMNS_STR = "xgb_columns_str"
+    XGB_COLUMNS_SELECTED = "xgb_columns_selected"
     XGB_COLUMNS_NUM = "xgb_columns_num"
+    XGB_POINTS = "xgb_points"

+ 6 - 4
feature/bin/strategy_norm.py

@@ -67,9 +67,10 @@ class StrategyNorm(FeatureStrategyBase):
                 if format_bin:
                     data_x_describe = train_data[x_column].describe(percentiles=[0.1, 0.9])
                     points = f_format_bin(data_x_describe)
-                    self.points_dict[x_column] = points
-                    train_data[x_column] = train_data[x_column].apply(lambda x: f_format_value(points, x))
-                    test_data[x_column] = test_data[x_column].apply(lambda x: f_format_value(points, x))
+                    if points is not None:
+                        self.points_dict[x_column] = points
+                        train_data[x_column] = train_data[x_column].apply(lambda x: f_format_value(points, x))
+                        test_data[x_column] = test_data[x_column].apply(lambda x: f_format_value(points, x))
             else:
                 str_columns.append(x_column)
                 one_hot_encoder = OneHot()
@@ -132,8 +133,8 @@ class StrategyNorm(FeatureStrategyBase):
                                 f"筛选前变量数量:{len(x_columns)}\n{x_columns}\n"
                                 f"快速筛选剔除变量数量:{len(x_columns) - len(x_columns_filter)}", detail=df_importance)
 
-        context.set(ContextEnum.XGB_COLUMNS_STR, str_columns)
         context.set(ContextEnum.XGB_COLUMNS_NUM, num_columns)
+        context.set(ContextEnum.XGB_POINTS, self.points_dict)
 
         return x_columns_filter
 
@@ -142,6 +143,7 @@ class StrategyNorm(FeatureStrategyBase):
         # 排个序,防止因为顺序原因导致的可能的bug
         x_columns.sort()
         self.x_columns = x_columns
+        context.set(ContextEnum.XGB_COLUMNS_SELECTED, x_columns)
 
     def variable_analyse(self, *args, **kwargs):
         pass

+ 11 - 11
feature/bin/utils.py

@@ -13,30 +13,30 @@ from sklearn.preprocessing import OneHotEncoder
 
 FORMAT_DICT = {
     # 比例类 -1 - 1
-    "bin_rate1": np.arange(-1, 1 + 0.1, 0.1),
+    "bin_rate1": np.arange(-1, 1 + 0.1, 0.1).tolist(),
 
     # 次数类1 0 -10
-    "bin_cnt1": np.arange(0, 11, 1),
+    "bin_cnt1": np.arange(0.0, 11.0, 1.0).tolist(),
     # 次数类2 0 - 20
-    "bin_cnt2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 17, 20],
+    "bin_cnt2": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 12.0, 15.0, 17.0, 20.0],
     # 次数类3 0 - 50
-    "bin_cnt3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50],
+    "bin_cnt3": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0],
     # 次数类4 0 - 100
-    "bin_cnt4": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 80, 100],
+    "bin_cnt4": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 80.0, 100.0],
 
     # 金额类1 0 - 1w
-    "bin_amt1": np.arange(0, 1.1e4, 1e3),
+    "bin_amt1": np.arange(0, 1.1e4, 1e3).tolist(),
     # 金额类2 0 - 5w
-    "bin_amt2": np.arange(0, 5.5e4, 5e3),
+    "bin_amt2": np.arange(0, 5.5e4, 5e3).tolist(),
     # 金额类3 0 - 10w
-    "bin_amt3": np.arange(0, 11e4, 1e4),
+    "bin_amt3": np.arange(0, 11e4, 1e4).tolist(),
     # 金额类4 0 - 20w
-    "bin_amt4": [0, 1e4, 2e4, 3e4, 4e4, 5e4, 8e4, 10e4, 15e4, 20e4],
+    "bin_amt4": [0.0, 1e4, 2e4, 3e4, 4e4, 5e4, 8e4, 10e4, 15e4, 20e4],
     # 金额类5 0 - 100w
-    "bin_amt5": [0, 5e4, 10e4, 15e4, 20e4, 25e4, 30e4, 40e4, 50e4, 100e4],
+    "bin_amt5": [0.0, 5e4, 10e4, 15e4, 20e4, 25e4, 30e4, 40e4, 50e4, 100e4],
 
     # 年龄类
-    "bin_age": [20, 25, 30, 35, 40, 45, 50, 55, 60, 65],
+    "bin_age": [20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0, 55.0, 60.0, 65.0],
 }
 
 

+ 11 - 11
feature/woe/utils.py

@@ -16,30 +16,30 @@ from enums import ResultCodesEnum, FileEnum
 
 FORMAT_DICT = {
     # 比例类 -1 - 1
-    "bin_rate1": np.arange(-1, 1 + 0.1, 0.1),
+    "bin_rate1": np.arange(-1, 1 + 0.1, 0.1).tolist(),
 
     # 次数类1 0 -10
-    "bin_cnt1": np.arange(0, 11, 1),
+    "bin_cnt1": np.arange(0.0, 11.0, 1.0).tolist(),
     # 次数类2 0 - 20
-    "bin_cnt2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 17, 20],
+    "bin_cnt2": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 12.0, 15.0, 17.0, 20.0],
     # 次数类3 0 - 50
-    "bin_cnt3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50],
+    "bin_cnt3": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0],
     # 次数类4 0 - 100
-    "bin_cnt4": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 80, 100],
+    "bin_cnt4": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 80.0, 100.0],
 
     # 金额类1 0 - 1w
-    "bin_amt1": np.arange(0, 1.1e4, 1e3),
+    "bin_amt1": np.arange(0, 1.1e4, 1e3).tolist(),
     # 金额类2 0 - 5w
-    "bin_amt2": np.arange(0, 5.5e4, 5e3),
+    "bin_amt2": np.arange(0, 5.5e4, 5e3).tolist(),
     # 金额类3 0 - 10w
-    "bin_amt3": np.arange(0, 11e4, 1e4),
+    "bin_amt3": np.arange(0, 11e4, 1e4).tolist(),
     # 金额类4 0 - 20w
-    "bin_amt4": [0, 1e4, 2e4, 3e4, 4e4, 5e4, 8e4, 10e4, 15e4, 20e4],
+    "bin_amt4": [0.0, 1e4, 2e4, 3e4, 4e4, 5e4, 8e4, 10e4, 15e4, 20e4],
     # 金额类5 0 - 100w
-    "bin_amt5": [0, 5e4, 10e4, 15e4, 20e4, 25e4, 30e4, 40e4, 50e4, 100e4],
+    "bin_amt5": [0.0, 5e4, 10e4, 15e4, 20e4, 25e4, 30e4, 40e4, 50e4, 100e4],
 
     # 年龄类
-    "bin_age": [20, 25, 30, 35, 40, 45, 50, 55, 60, 65],
+    "bin_age": [20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0, 55.0, 60.0, 65.0],
 }
 
 

+ 31 - 18
model/model_xgb.py

@@ -49,8 +49,11 @@ class ModelXgb(ModelBase):
         print(f"{'-' * 50}开始训练{'-' * 50}")
         params_xgb = self.ml_config.params_xgb
         y_column = self._ml_config.y_column
-        str_columns = context.get(ContextEnum.XGB_COLUMNS_STR)
+        # 选定的变量
+        x_columns_selected = context.get(ContextEnum.XGB_COLUMNS_SELECTED)
+        # 包含了未选定的变量
         num_columns = context.get(ContextEnum.XGB_COLUMNS_NUM)
+        points_dict: dict = context.get(ContextEnum.XGB_POINTS)
 
         data: DataSplitEntity = kwargs["data"]
         train_data_raw = data.train_data
@@ -87,19 +90,19 @@ class ModelXgb(ModelBase):
         #                verbose=params_xgb.get("verbose_eval"),
         #                )
 
-        # if params_xgb.get("trees_print"):
-        #     trees = self.model.get_booster().get_dump()
-        #     for i, tree in enumerate(trees):
-        #         if i < self.model.best_ntree_limit:
-        #             print(f"Tree {i}:")
-        #             print(tree)
-
-        mapper = [(str_columns, OneHotEncoder())]
-        # for column in str_columns:
-        #     mapper.append((column, OneHotEncoder()))
-        for column in num_columns:
-            mapper.append(
-                (column, CutTransformer([-np.inf, 10, 20, 30, +np.inf], labels=[1, 2, 3, 4])))
+        str_columns_selected = [i for i in x_columns_selected if i not in num_columns]
+        mapper = [(str_columns_selected, OneHotEncoder())]
+        for column in x_columns_selected:
+            if column in str_columns_selected:
+                continue
+            # 粗分箱
+            if column in points_dict.keys():
+                points = [-np.inf] + points_dict[column] + [np.inf]
+                labels = [ConstantEnum.XGB_BIN_LOWEST.value] + points_dict[column]
+                mapper.append((column, CutTransformer(points, right=False, labels=labels)))
+            else:
+                mapper.append((column, None))
+
         mapper = DataFrameMapper(mapper)
 
         self.pipeline = PMMLPipeline([("mapper", mapper), ("classifier", self.model)])
@@ -114,6 +117,13 @@ class ModelXgb(ModelBase):
                           classifier__verbose=params_xgb.get("verbose_eval"),
                           )
 
+        if params_xgb.get("trees_print"):
+            trees = self.model.get_booster().get_dump()
+            for i, tree in enumerate(trees):
+                if i < self.model.best_ntree_limit:
+                    print(f"Tree {i}:")
+                    print(tree)
+
     def prob(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
         # prob = self.model.predict_proba(x)[:, 1]
         prob = self.pipeline.predict_proba(x)[:, 1]
@@ -126,6 +136,8 @@ class ModelXgb(ModelBase):
         pass
 
     def model_save(self):
+        params_xgb = self.ml_config.params_xgb
+
         if self.pipeline is None:
             GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
 
@@ -134,10 +146,11 @@ class ModelXgb(ModelBase):
         joblib.dump(self.pipeline, path_model)
         print(f"model save to【{path_model}】success. ")
 
-        path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
-        # pipeline = make_pmml_pipeline(self.model)
-        sklearn2pmml(self.pipeline, path_pmml, with_repr=True, )
-        print(f"model save to【{path_pmml}】success. ")
+        if params_xgb.get("save_pmml"):
+            path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
+            # pipeline = make_pmml_pipeline(self.model)
+            sklearn2pmml(self.pipeline, path_pmml, with_repr=True, )
+            print(f"model save to【{path_pmml}】success. ")
 
     def model_load(self, path: str, *args, **kwargs):
         if not os.path.isdir(path):