2 månader sedan · 6839e43b22
--- a/enums/constant_enum.py
+++ b/enums/constant_enum.py
@@ -12,3 +12,6 @@ class ConstantEnum(Enum):
 
				     SCORE_BIN = "MODEL_SCORE_BIN"
			
 
				     # lr模型常数项
			
 
				     INTERCEPT = "const"
			
 
				+
			
 
				+    # xgb特征粗分箱时标准之外的箱
			
 
				+    XGB_BIN_LOWEST = -99999.0
			
--- a/enums/context_enum.py
+++ b/enums/context_enum.py
@@ -17,5 +17,6 @@ class ContextEnum(Enum):
 
				     FILTER_VIF = "filter_vif"
			
 
				     FILTER_IVTOP = "filter_ivtop"
			
 
				 
			
 
				-    XGB_COLUMNS_STR = "xgb_columns_str"
			
 
				+    XGB_COLUMNS_SELECTED = "xgb_columns_selected"
			
 
				     XGB_COLUMNS_NUM = "xgb_columns_num"
			
 
				+    XGB_POINTS = "xgb_points"
			
--- a/feature/bin/strategy_norm.py
+++ b/feature/bin/strategy_norm.py
@@ -67,9 +67,10 @@ class StrategyNorm(FeatureStrategyBase):
 
				                 if format_bin:
			
 
				                     data_x_describe = train_data[x_column].describe(percentiles=[0.1, 0.9])
			
 
				                     points = f_format_bin(data_x_describe)
			
 
				-                    self.points_dict[x_column] = points
			
 
				-                    train_data[x_column] = train_data[x_column].apply(lambda x: f_format_value(points, x))
			
 
				-                    test_data[x_column] = test_data[x_column].apply(lambda x: f_format_value(points, x))
			
 
				+                    if points is not None:
			
 
				+                        self.points_dict[x_column] = points
			
 
				+                        train_data[x_column] = train_data[x_column].apply(lambda x: f_format_value(points, x))
			
 
				+                        test_data[x_column] = test_data[x_column].apply(lambda x: f_format_value(points, x))
			
 
				             else:
			
 
				                 str_columns.append(x_column)
			
 
				                 one_hot_encoder = OneHot()
			
@@ -132,8 +133,8 @@ class StrategyNorm(FeatureStrategyBase):
 
				                                 f"筛选前变量数量:{len(x_columns)}\n{x_columns}\n"
			
 
				                                 f"快速筛选剔除变量数量:{len(x_columns) - len(x_columns_filter)}", detail=df_importance)
			
 
				 
			
 
				-        context.set(ContextEnum.XGB_COLUMNS_STR, str_columns)
			
 
				         context.set(ContextEnum.XGB_COLUMNS_NUM, num_columns)
			
 
				+        context.set(ContextEnum.XGB_POINTS, self.points_dict)
			
 
				 
			
 
				         return x_columns_filter
			
 
				 
			
@@ -142,6 +143,7 @@ class StrategyNorm(FeatureStrategyBase):
 
				         # 排个序，防止因为顺序原因导致的可能的bug
			
 
				         x_columns.sort()
			
 
				         self.x_columns = x_columns
			
 
				+        context.set(ContextEnum.XGB_COLUMNS_SELECTED, x_columns)
			
 
				 
			
 
				     def variable_analyse(self, *args, **kwargs):
			
 
				         pass
			
--- a/feature/bin/utils.py
+++ b/feature/bin/utils.py
@@ -13,30 +13,30 @@ from sklearn.preprocessing import OneHotEncoder
 
				 
			
 
				 FORMAT_DICT = {
			
 
				     # 比例类 -1 - 1
			
 
				-    "bin_rate1": np.arange(-1, 1 + 0.1, 0.1),
			
 
				+    "bin_rate1": np.arange(-1, 1 + 0.1, 0.1).tolist(),
			
 
				 
			
 
				     # 次数类1 0 -10
			
 
				-    "bin_cnt1": np.arange(0, 11, 1),
			
 
				+    "bin_cnt1": np.arange(0.0, 11.0, 1.0).tolist(),
			
 
				     # 次数类2 0 - 20
			
 
				-    "bin_cnt2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 17, 20],
			
 
				+    "bin_cnt2": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 12.0, 15.0, 17.0, 20.0],
			
 
				     # 次数类3 0 - 50
			
 
				-    "bin_cnt3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50],
			
 
				+    "bin_cnt3": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0],
			
 
				     # 次数类4 0 - 100
			
 
				-    "bin_cnt4": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 80, 100],
			
 
				+    "bin_cnt4": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 80.0, 100.0],
			
 
				 
			
 
				     # 金额类1 0 - 1w
			
 
				-    "bin_amt1": np.arange(0, 1.1e4, 1e3),
			
 
				+    "bin_amt1": np.arange(0, 1.1e4, 1e3).tolist(),
			
 
				     # 金额类2 0 - 5w
			
 
				-    "bin_amt2": np.arange(0, 5.5e4, 5e3),
			
 
				+    "bin_amt2": np.arange(0, 5.5e4, 5e3).tolist(),
			
 
				     # 金额类3 0 - 10w
			
 
				-    "bin_amt3": np.arange(0, 11e4, 1e4),
			
 
				+    "bin_amt3": np.arange(0, 11e4, 1e4).tolist(),
			
 
				     # 金额类4 0 - 20w
			
 
				-    "bin_amt4": [0, 1e4, 2e4, 3e4, 4e4, 5e4, 8e4, 10e4, 15e4, 20e4],
			
 
				+    "bin_amt4": [0.0, 1e4, 2e4, 3e4, 4e4, 5e4, 8e4, 10e4, 15e4, 20e4],
			
 
				     # 金额类5 0 - 100w
			
 
				-    "bin_amt5": [0, 5e4, 10e4, 15e4, 20e4, 25e4, 30e4, 40e4, 50e4, 100e4],
			
 
				+    "bin_amt5": [0.0, 5e4, 10e4, 15e4, 20e4, 25e4, 30e4, 40e4, 50e4, 100e4],
			
 
				 
			
 
				     # 年龄类
			
 
				-    "bin_age": [20, 25, 30, 35, 40, 45, 50, 55, 60, 65],
			
 
				+    "bin_age": [20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0, 55.0, 60.0, 65.0],
			
 
				 }
			
 
				 
			
 
				 
			
--- a/feature/woe/utils.py
+++ b/feature/woe/utils.py
@@ -16,30 +16,30 @@ from enums import ResultCodesEnum, FileEnum
 
				 
			
 
				 FORMAT_DICT = {
			
 
				     # 比例类 -1 - 1
			
 
				-    "bin_rate1": np.arange(-1, 1 + 0.1, 0.1),
			
 
				+    "bin_rate1": np.arange(-1, 1 + 0.1, 0.1).tolist(),
			
 
				 
			
 
				     # 次数类1 0 -10
			
 
				-    "bin_cnt1": np.arange(0, 11, 1),
			
 
				+    "bin_cnt1": np.arange(0.0, 11.0, 1.0).tolist(),
			
 
				     # 次数类2 0 - 20
			
 
				-    "bin_cnt2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 17, 20],
			
 
				+    "bin_cnt2": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 12.0, 15.0, 17.0, 20.0],
			
 
				     # 次数类3 0 - 50
			
 
				-    "bin_cnt3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50],
			
 
				+    "bin_cnt3": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0],
			
 
				     # 次数类4 0 - 100
			
 
				-    "bin_cnt4": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 80, 100],
			
 
				+    "bin_cnt4": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 80.0, 100.0],
			
 
				 
			
 
				     # 金额类1 0 - 1w
			
 
				-    "bin_amt1": np.arange(0, 1.1e4, 1e3),
			
 
				+    "bin_amt1": np.arange(0, 1.1e4, 1e3).tolist(),
			
 
				     # 金额类2 0 - 5w
			
 
				-    "bin_amt2": np.arange(0, 5.5e4, 5e3),
			
 
				+    "bin_amt2": np.arange(0, 5.5e4, 5e3).tolist(),
			
 
				     # 金额类3 0 - 10w
			
 
				-    "bin_amt3": np.arange(0, 11e4, 1e4),
			
 
				+    "bin_amt3": np.arange(0, 11e4, 1e4).tolist(),
			
 
				     # 金额类4 0 - 20w
			
 
				-    "bin_amt4": [0, 1e4, 2e4, 3e4, 4e4, 5e4, 8e4, 10e4, 15e4, 20e4],
			
 
				+    "bin_amt4": [0.0, 1e4, 2e4, 3e4, 4e4, 5e4, 8e4, 10e4, 15e4, 20e4],
			
 
				     # 金额类5 0 - 100w
			
 
				-    "bin_amt5": [0, 5e4, 10e4, 15e4, 20e4, 25e4, 30e4, 40e4, 50e4, 100e4],
			
 
				+    "bin_amt5": [0.0, 5e4, 10e4, 15e4, 20e4, 25e4, 30e4, 40e4, 50e4, 100e4],
			
 
				 
			
 
				     # 年龄类
			
 
				-    "bin_age": [20, 25, 30, 35, 40, 45, 50, 55, 60, 65],
			
 
				+    "bin_age": [20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0, 55.0, 60.0, 65.0],
			
 
				 }
			
 
				 
			
 
				 
			
--- a/model/model_xgb.py
+++ b/model/model_xgb.py
@@ -49,8 +49,11 @@ class ModelXgb(ModelBase):
 
				         print(f"{'-' * 50}开始训练{'-' * 50}")
			
 
				         params_xgb = self.ml_config.params_xgb
			
 
				         y_column = self._ml_config.y_column
			
 
				-        str_columns = context.get(ContextEnum.XGB_COLUMNS_STR)
			
 
				+        # 选定的变量
			
 
				+        x_columns_selected = context.get(ContextEnum.XGB_COLUMNS_SELECTED)
			
 
				+        # 包含了未选定的变量
			
 
				         num_columns = context.get(ContextEnum.XGB_COLUMNS_NUM)
			
 
				+        points_dict: dict = context.get(ContextEnum.XGB_POINTS)
			
 
				 
			
 
				         data: DataSplitEntity = kwargs["data"]
			
 
				         train_data_raw = data.train_data
			
@@ -87,19 +90,19 @@ class ModelXgb(ModelBase):
 
				         #                verbose=params_xgb.get("verbose_eval"),
			
 
				         #                )
			
 
				 
			
 
				-        # if params_xgb.get("trees_print"):
			
 
				-        #     trees = self.model.get_booster().get_dump()
			
 
				-        #     for i, tree in enumerate(trees):
			
 
				-        #         if i < self.model.best_ntree_limit:
			
 
				-        #             print(f"Tree {i}:")
			
 
				-        #             print(tree)
			
 
				-
			
 
				-        mapper = [(str_columns, OneHotEncoder())]
			
 
				-        # for column in str_columns:
			
 
				-        #     mapper.append((column, OneHotEncoder()))
			
 
				-        for column in num_columns:
			
 
				-            mapper.append(
			
 
				-                (column, CutTransformer([-np.inf, 10, 20, 30, +np.inf], labels=[1, 2, 3, 4])))
			
 
				+        str_columns_selected = [i for i in x_columns_selected if i not in num_columns]
			
 
				+        mapper = [(str_columns_selected, OneHotEncoder())]
			
 
				+        for column in x_columns_selected:
			
 
				+            if column in str_columns_selected:
			
 
				+                continue
			
 
				+            # 粗分箱
			
 
				+            if column in points_dict.keys():
			
 
				+                points = [-np.inf] + points_dict[column] + [np.inf]
			
 
				+                labels = [ConstantEnum.XGB_BIN_LOWEST.value] + points_dict[column]
			
 
				+                mapper.append((column, CutTransformer(points, right=False, labels=labels)))
			
 
				+            else:
			
 
				+                mapper.append((column, None))
			
 
				+
			
 
				         mapper = DataFrameMapper(mapper)
			
 
				 
			
 
				         self.pipeline = PMMLPipeline([("mapper", mapper), ("classifier", self.model)])
			
@@ -114,6 +117,13 @@ class ModelXgb(ModelBase):
 
				                           classifier__verbose=params_xgb.get("verbose_eval"),
			
 
				                           )
			
 
				 
			
 
				+        if params_xgb.get("trees_print"):
			
 
				+            trees = self.model.get_booster().get_dump()
			
 
				+            for i, tree in enumerate(trees):
			
 
				+                if i < self.model.best_ntree_limit:
			
 
				+                    print(f"Tree {i}:")
			
 
				+                    print(tree)
			
 
				+
			
 
				     def prob(self, x: pd.DataFrame, *args, **kwargs) -> np.array:
			
 
				         # prob = self.model.predict_proba(x)[:, 1]
			
 
				         prob = self.pipeline.predict_proba(x)[:, 1]
			
@@ -126,6 +136,8 @@ class ModelXgb(ModelBase):
 
				         pass
			
 
				 
			
 
				     def model_save(self):
			
 
				+        params_xgb = self.ml_config.params_xgb
			
 
				+
			
 
				         if self.pipeline is None:
			
 
				             GeneralException(ResultCodesEnum.NOT_FOUND, message=f"模型不存在")
			
 
				 
			
@@ -134,10 +146,11 @@ class ModelXgb(ModelBase):
 
				         joblib.dump(self.pipeline, path_model)
			
 
				         print(f"model save to【{path_model}】success. ")
			
 
				 
			
 
				-        path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
			
 
				-        # pipeline = make_pmml_pipeline(self.model)
			
 
				-        sklearn2pmml(self.pipeline, path_pmml, with_repr=True, )
			
 
				-        print(f"model save to【{path_pmml}】success. ")
			
 
				+        if params_xgb.get("save_pmml"):
			
 
				+            path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
			
 
				+            # pipeline = make_pmml_pipeline(self.model)
			
 
				+            sklearn2pmml(self.pipeline, path_pmml, with_repr=True, )
			
 
				+            print(f"model save to【{path_pmml}】success. ")
			
 
				 
			
 
				     def model_load(self, path: str, *args, **kwargs):
			
 
				         if not os.path.isdir(path):