yq 3 өдөр өмнө
parent
commit
31ad006ce0

+ 26 - 6
model/model_xgb.py

@@ -14,6 +14,7 @@ import numpy as np
 import pandas as pd
 import scorecardpy as sc
 import xgboost as xgb
+from pypmml import Model
 from sklearn.preprocessing import OneHotEncoder
 from sklearn2pmml import sklearn2pmml, PMMLPipeline
 from sklearn2pmml.preprocessing import CutTransformer
@@ -42,6 +43,14 @@ class ModelXgb(ModelBase):
         self.pipeline: PMMLPipeline
         self.model = xgb.XGBClassifier
 
+    def _f_rewrite_pmml(self, path_pmml: str):
+        """Rewrite the PMML file at path_pmml in place, retyping categorical double fields as string."""
+        with open(path_pmml, mode="r", encoding="utf-8") as src:
+            content = src.read()
+        patched = content.replace('optype="categorical" dataType="double"', 'optype="categorical" dataType="string"')
+        with open(path_pmml, mode="w", encoding="utf-8") as dst:
+            dst.write(patched)
+
     def get_report_template_path(self):
         return self._template_path
 
@@ -117,6 +126,23 @@ class ModelXgb(ModelBase):
                           classifier__verbose=params_xgb.get("verbose_eval"),
                           )
 
+        if params_xgb.get("save_pmml"):
+            path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
+            # Export the fitted pipeline to PMML, then patch categorical field types in the saved file.
+            sklearn2pmml(self.pipeline, path_pmml, with_repr=True, )
+            print(f"model save to【{path_pmml}】success. ")
+            self._f_rewrite_pmml(path_pmml)
+            # Consistency check: share of rows (matched / total) where PMML and pipeline probabilities differ by < 0.001.
+            model_pmml = Model.fromFile(path_pmml)
+            prob_pmml = model_pmml.predict(data.data)["probability(1)"]
+            prob_pipeline = self.pipeline.predict_proba(data.data)[:, 1]
+            diff = pd.DataFrame()
+            diff["prob_pmml"] = prob_pmml
+            diff["prob_pipeline"] = prob_pipeline
+            diff["diff"] = diff["prob_pmml"] - diff["prob_pipeline"]
+            diff["diff_format"] = diff["diff"].apply(lambda x: 1 if abs(x) < 0.001 else 0)
+            print(f"pmml模型结果一致率(误差小于0.001):{round(diff['diff_format'].mean() * 100, 3)}%")
+
         if params_xgb.get("trees_print"):
             trees = self.model.get_booster().get_dump()
             for i, tree in enumerate(trees):
@@ -146,12 +172,6 @@ class ModelXgb(ModelBase):
         joblib.dump(self.pipeline, path_model)
         print(f"model save to【{path_model}】success. ")
 
-        if params_xgb.get("save_pmml"):
-            path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
-            # pipeline = make_pmml_pipeline(self.model)
-            sklearn2pmml(self.pipeline, path_pmml, with_repr=True, )
-            print(f"model save to【{path_pmml}】success. ")
-
     def model_load(self, path: str, *args, **kwargs):
         if not os.path.isdir(path):
             raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")

+ 1 - 0
requirements-analysis.txt

@@ -23,3 +23,4 @@ pypmml==0.9.0
 #pyhive==0.7.0
 #sklearn2pmml==0.65.0
 #sklearn-pandas==2.2.0
+#JPype1==1.3.0

+ 2 - 2
train_test_xgb.py

@@ -36,9 +36,9 @@ if __name__ == "__main__":
         "jupyter_print": True,
         # 是否开启粗分箱
         "format_bin": True,
-        "max_feature_num": 20,
+        "max_feature_num": 5,
         # 压力测试
-        "stress_test": True,
+        "stress_test": False,
         # 压力测试抽样次数
         "stress_sample_times": 10,
         # y