2 months ago · 31ad006ce0
--- a/model/model_xgb.py
+++ b/model/model_xgb.py
@@ -14,6 +14,7 @@ import numpy as np
 
															 import pandas as pd
														
 
															 import scorecardpy as sc
														
 
															 import xgboost as xgb
														
 
															+from pypmml import Model
														
 
															 from sklearn.preprocessing import OneHotEncoder
														
 
															 from sklearn2pmml import sklearn2pmml, PMMLPipeline
														
 
															 from sklearn2pmml.preprocessing import CutTransformer
														
@@ -42,6 +43,14 @@ class ModelXgb(ModelBase):
 
															         self.pipeline: PMMLPipeline
														
 
															         self.model = xgb.XGBClassifier
														
 
															+    def _f_rewrite_pmml(self, path_pmml: str) -> None:
															+        """Patch an exported PMML file in place.
															+
															+        Every occurrence of ``optype="categorical" dataType="double"`` in
															+        the file is replaced with ``optype="categorical" dataType="string"``.
															+        The file is read and written as UTF-8. When the file contains no
															+        such declaration it is left untouched.
															+
															+        :param path_pmml: path of the PMML file to rewrite.
															+        """
															+        old_decl = 'optype="categorical" dataType="double"'
															+        new_decl = 'optype="categorical" dataType="string"'
															+        with open(path_pmml, mode="r", encoding="utf-8") as f:
															+            pmml = f.read()
															+        if old_decl not in pmml:
															+            # Nothing to patch: avoid truncating and rewriting the file.
															+            return
															+        # Leaving the ``with`` block flushes and closes the file, so no
															+        # explicit flush is required.
															+        with open(path_pmml, mode="w", encoding="utf-8") as f:
															+            f.write(pmml.replace(old_decl, new_decl))
														
 
															+
														
 
															     def get_report_template_path(self):
														
 
															         return self._template_path
														
@@ -117,6 +126,23 @@ class ModelXgb(ModelBase):
 
															                           classifier__verbose=params_xgb.get("verbose_eval"),
														
 
															                           )
														
 
															+        if params_xgb.get("save_pmml"):
															+            path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
															+            sklearn2pmml(self.pipeline, path_pmml, with_repr=True, )
															+            print(f"model save to【{path_pmml}】success. ")
															+            self._f_rewrite_pmml(path_pmml)
															+            # Consistency check: score the same data with the exported PMML
															+            # model and with the in-memory pipeline, then compare the
															+            # class-1 probabilities row by row.
															+            model_pmml = Model.fromFile(path_pmml)
															+            prob_pmml = model_pmml.predict(data.data)["probability(1)"]
															+            prob_pipeline = self.pipeline.predict_proba(data.data)[:, 1]
															+            diff = pd.DataFrame()
															+            diff["prob_pmml"] = prob_pmml
															+            diff["prob_pipeline"] = prob_pipeline
															+            diff["diff"] = diff["prob_pmml"] - diff["prob_pipeline"]
															+            # 1 when the two probabilities agree within 0.001, otherwise 0.
															+            diff["diff_format"] = (diff["diff"].abs() < 0.001).astype(int)
															+            # Consistency rate = matching rows / total rows, i.e. the mean of
															+            # the 0/1 flag (NaN when there are no rows). The previous
															+            # expression divided the row count by the match count, which
															+            # inverted the ratio.
															+            consistent_rate = diff["diff_format"].mean()
															+            print(f"pmml模型结果一致率(误差小于0.001)：{round(consistent_rate * 100, 1)}%")
														
 
															+
														
 
															         if params_xgb.get("trees_print"):
														
 
															             trees = self.model.get_booster().get_dump()
														
 
															             for i, tree in enumerate(trees):
														
@@ -146,12 +172,6 @@ class ModelXgb(ModelBase):
 
															         joblib.dump(self.pipeline, path_model)
														
 
															         print(f"model save to【{path_model}】success. ")
														
 
															-        if params_xgb.get("save_pmml"):
														
 
															-            path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
														
 
															-            # pipeline = make_pmml_pipeline(self.model)
														
 
															-            sklearn2pmml(self.pipeline, path_pmml, with_repr=True, )
														
 
															-            print(f"model save to【{path_pmml}】success. ")
														
 
															-
														
 
															     def model_load(self, path: str, *args, **kwargs):
														
 
															         if not os.path.isdir(path):
														
 
															             raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")
														
--- a/requirements-analysis.txt
+++ b/requirements-analysis.txt
@@ -23,3 +23,4 @@ pypmml==0.9.0
 
															 #pyhive==0.7.0
														
 
															 #sklearn2pmml==0.65.0
														
 
															 #sklearn-pandas==2.2.0
														
 
															+#JPype1==1.3.0
														
--- a/train_test_xgb.py
+++ b/train_test_xgb.py
@@ -36,9 +36,9 @@ if __name__ == "__main__":
 
															         "jupyter_print": True,
														
 
															         # 是否开启粗分箱
														
 
															         "format_bin": True,
														
 
															-        "max_feature_num": 20,
														
 
															+        "max_feature_num": 5,
														
 
															         # 压力测试
														
 
															-        "stress_test": True,
														
 
															+        "stress_test": False,
														
 
															         # 压力测试抽样次数
														
 
															         "stress_sample_times": 10,
														
 
															         # y