|
@@ -14,6 +14,7 @@ import numpy as np
|
|
import pandas as pd
|
|
import pandas as pd
|
|
import scorecardpy as sc
|
|
import scorecardpy as sc
|
|
import xgboost as xgb
|
|
import xgboost as xgb
|
|
|
|
+from pypmml import Model
|
|
from sklearn.preprocessing import OneHotEncoder
|
|
from sklearn.preprocessing import OneHotEncoder
|
|
from sklearn2pmml import sklearn2pmml, PMMLPipeline
|
|
from sklearn2pmml import sklearn2pmml, PMMLPipeline
|
|
from sklearn2pmml.preprocessing import CutTransformer
|
|
from sklearn2pmml.preprocessing import CutTransformer
|
|
@@ -42,6 +43,14 @@ class ModelXgb(ModelBase):
|
|
self.pipeline: PMMLPipeline
|
|
self.pipeline: PMMLPipeline
|
|
self.model = xgb.XGBClassifier
|
|
self.model = xgb.XGBClassifier
|
|
|
|
|
|
|
|
def _f_rewrite_pmml(self, path_pmml: str):
    """Patch an exported PMML file in place.

    Every occurrence of ``optype="categorical" dataType="double"`` in the
    file is replaced with ``optype="categorical" dataType="string"``.
    NOTE(review): presumably this makes the PMML evaluator treat
    categorical fields as strings -- confirm against the exporter output.

    Args:
        path_pmml: Path of the UTF-8 encoded PMML file to patch.

    Raises:
        OSError: If the file cannot be read or written.
    """
    old_decl = 'optype="categorical" dataType="double"'
    new_decl = 'optype="categorical" dataType="string"'

    with open(path_pmml, mode="r", encoding="utf-8") as f:
        pmml = f.read()

    patched = pmml.replace(old_decl, new_decl)
    if patched == pmml:
        # Nothing matched: leave the file untouched instead of truncating
        # and rewriting identical content.
        return

    # Leaving the ``with`` block closes (and therefore flushes) the file,
    # so no explicit flush is needed.
    with open(path_pmml, mode="w", encoding="utf-8") as f:
        f.write(patched)
|
|
|
|
+
|
|
def get_report_template_path(self):
|
|
def get_report_template_path(self):
|
|
return self._template_path
|
|
return self._template_path
|
|
|
|
|
|
@@ -117,6 +126,23 @@ class ModelXgb(ModelBase):
|
|
classifier__verbose=params_xgb.get("verbose_eval"),
|
|
classifier__verbose=params_xgb.get("verbose_eval"),
|
|
)
|
|
)
|
|
|
|
|
|
|
|
if params_xgb.get("save_pmml"):
    # Export the fitted pipeline to PMML, then patch the categorical field
    # declarations in the written file.
    path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
    sklearn2pmml(self.pipeline, path_pmml, with_repr=True, )
    print(f"model save to【{path_pmml}】success. ")
    self._f_rewrite_pmml(path_pmml)

    # Consistency check: score the same data with the PMML model and with
    # the native pipeline, then compare the positive-class probabilities.
    model_pmml = Model.fromFile(path_pmml)
    prob_pmml = model_pmml.predict(data.data)["probability(1)"]
    prob_pipeline = self.pipeline.predict_proba(data.data)[:, 1]

    tolerance = 0.001
    diff = pd.DataFrame()
    diff["prob_pmml"] = prob_pmml
    diff["prob_pipeline"] = prob_pipeline
    diff["diff"] = diff["prob_pmml"] - diff["prob_pipeline"]
    # 1 where both models agree within the tolerance, else 0 (NaN -> 0).
    diff["diff_format"] = (diff["diff"].abs() < tolerance).astype(int)

    # Rate = matching rows / total rows. The previous expression divided
    # the row count by the match count, which inverted the ratio and
    # divided by zero when no row matched.
    total_rows = len(diff)
    if total_rows:
        consistency_rate = round(diff["diff_format"].sum() / total_rows * 100, 3)
    else:
        consistency_rate = 0.0
    print(f"pmml模型结果一致率(误差小于0.001):{consistency_rate}%")
|
|
|
|
+
|
|
if params_xgb.get("trees_print"):
|
|
if params_xgb.get("trees_print"):
|
|
trees = self.model.get_booster().get_dump()
|
|
trees = self.model.get_booster().get_dump()
|
|
for i, tree in enumerate(trees):
|
|
for i, tree in enumerate(trees):
|
|
@@ -146,12 +172,6 @@ class ModelXgb(ModelBase):
|
|
joblib.dump(self.pipeline, path_model)
|
|
joblib.dump(self.pipeline, path_model)
|
|
print(f"model save to【{path_model}】success. ")
|
|
print(f"model save to【{path_model}】success. ")
|
|
|
|
|
|
- if params_xgb.get("save_pmml"):
|
|
|
|
- path_pmml = self.ml_config.f_get_save_path(FileEnum.PMML.value)
|
|
|
|
- # pipeline = make_pmml_pipeline(self.model)
|
|
|
|
- sklearn2pmml(self.pipeline, path_pmml, with_repr=True, )
|
|
|
|
- print(f"model save to【{path_pmml}】success. ")
|
|
|
|
-
|
|
|
|
def model_load(self, path: str, *args, **kwargs):
|
|
def model_load(self, path: str, *args, **kwargs):
|
|
if not os.path.isdir(path):
|
|
if not os.path.isdir(path):
|
|
raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")
|
|
raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"【{path}】不是文件夹")
|