2 сар өмнө · 31ad006ce0
--- a/model/model_xgb.py
+++ b/model/model_xgb.py
@@ -14,6 +14,7 @@ import numpy as np
 
				 import pandas as pd
			
 
				 import scorecardpy as sc
			
 
				 import xgboost as xgb
			
 
				+from pypmml import Model
			
 
				 from sklearn.preprocessing import OneHotEncoder
			
 
				 from sklearn2pmml import sklearn2pmml, PMMLPipeline
			
 
				 from sklearn2pmml.preprocessing import CutTransformer
			
@@ -42,6 +43,14 @@ class ModelXgb(ModelBase):
 
				         self.pipeline: PMMLPipeline
			
 
				         self.model = xgb.XGBClassifier
			
 
				 
			
 
				+    def _f_rewrite_pmml(self, path_pmml: str):
				+        """Patch the exported PMML file in place.
				+
				+        Every occurrence of ``optype="categorical" dataType="double"`` is
				+        replaced with ``optype="categorical" dataType="string"``.
				+
				+        Args:
				+            path_pmml: Path of the PMML file to patch; it is read and
				+                written as UTF-8 text.
				+        """
				+        old_decl = 'optype="categorical" dataType="double"'
				+        new_decl = 'optype="categorical" dataType="string"'
				+        with open(path_pmml, mode="r", encoding="utf-8") as f:
				+            pmml = f.read()
				+        patched = pmml.replace(old_decl, new_decl)
				+        if patched == pmml:
				+            # Nothing to patch, so leave the file untouched.
				+            return
				+        # Leaving the ``with`` block closes (and thereby flushes) the file,
				+        # so no explicit flush() call is needed.
				+        with open(path_pmml, mode="w", encoding="utf-8") as f:
				+            f.write(patched)
			
 
				+
			
 
				     def get_report_template_path(self):
			
 
				         return self._template_path
			
 
				 
			
@@ -117,6 +126,23 @@ class ModelXgb(ModelBase):
 
				                           classifier__verbose=params_xgb.get("verbose_eval"),
			
 
				                           )
			
 
				 
			
 
				+        if params_xgb.get("save_pmml"):
				+            path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
				+            # pipeline = make_pmml_pipeline(self.model)
				+            sklearn2pmml(self.pipeline, path_pmml, with_repr=True, )
				+            print(f"model save to【{path_pmml}】success. ")
				+            self._f_rewrite_pmml(path_pmml)
				+            # Consistency check: score the same data with the exported PMML
				+            # model and with the in-memory pipeline, then compare the
				+            # positive-class probabilities row by row.
				+            model_pmml = Model.fromFile(path_pmml)
				+            prob_pmml = model_pmml.predict(data.data)["probability(1)"]
				+            prob_pipeline = self.pipeline.predict_proba(data.data)[:, 1]
				+            diff = pd.DataFrame()
				+            diff["prob_pmml"] = prob_pmml
				+            diff["prob_pipeline"] = prob_pipeline
				+            diff["diff"] = diff["prob_pmml"] - diff["prob_pipeline"]
				+            # 1 when the two probabilities agree within 0.001, otherwise 0.
				+            diff["diff_format"] = diff["diff"].apply(lambda x: 1 if abs(x) < 0.001 else 0)
				+            # Consistency rate = matching rows / total rows. The mean of the
				+            # 0/1 flags is exactly that ratio (NaN when there are no rows),
				+            # and rounding is applied to the final percentage.
				+            consistency_rate = round(diff["diff_format"].mean() * 100, 3)
				+            print(f"pmml模型结果一致率(误差小于0.001)：{consistency_rate}%")
			
 
				+
			
 
				         if params_xgb.get("trees_print"):
			
 
				             trees = self.model.get_booster().get_dump()
			
 
				             for i, tree in enumerate(trees):
			
@@ -146,12 +172,6 @@ class ModelXgb(ModelBase):
 
				         joblib.dump(self.pipeline, path_model)
			
 
				         print(f"model save to【{path_model}】success. ")
			
 
				 
			
 
				-        if params_xgb.get("save_pmml"):
			
 
				-            path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
			
 
				-            # pipeline = make_pmml_pipeline(self.model)
			
 
				-            sklearn2pmml(self.pipeline, path_pmml, with_repr=True, )
			
 
				-            print(f"model save to【{path_pmml}】success. ")
			
 
				-
			
 
				     def model_load(self, path: str, *args, **kwargs):
			
 
				         if not os.path.isdir(path):
			
 
				             raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")
			
--- a/requirements-analysis.txt
+++ b/requirements-analysis.txt
@@ -23,3 +23,4 @@ pypmml==0.9.0
 
				 #pyhive==0.7.0
			
 
				 #sklearn2pmml==0.65.0
			
 
				 #sklearn-pandas==2.2.0
			
 
				+#JPype1==1.3.0
			
--- a/train_test_xgb.py
+++ b/train_test_xgb.py
@@ -36,9 +36,9 @@ if __name__ == "__main__":
 
				         "jupyter_print": True,
			
 
				         # 是否开启粗分箱
			
 
				         "format_bin": True,
			
 
				-        "max_feature_num": 20,
			
 
				+        "max_feature_num": 5,
			
 
				         # 压力测试
			
 
				-        "stress_test": True,
			
 
				+        "stress_test": False,
			
 
				         # 压力测试抽样次数
			
 
				         "stress_sample_times": 10,
			
 
				         # y