|
@@ -12,7 +12,8 @@ import joblib
|
|
|
import pandas as pd
|
|
|
import scorecardpy as sc
|
|
|
import xgboost as xgb
|
|
|
-from sklearn2pmml import PMMLPipeline
|
|
|
+from pypmml import Model
|
|
|
+from sklearn2pmml import PMMLPipeline, sklearn2pmml
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
from commom import GeneralException, f_image_crop_white_borders, f_df_to_image, f_display_title, \
|
|
@@ -61,15 +62,41 @@ class OnlineLearningTrainerXgb:
|
|
|
self._pipeline_optimized = joblib.load(path_model)
|
|
|
print(f"model load from【{path_model}】success.")
|
|
|
|
|
|
+ def _f_rewrite_pmml(self, path_pmml: str):
|
|
|
+ with open(path_pmml, mode="r", encoding="utf-8") as f:
|
|
|
+ pmml = f.read()
|
|
|
+ pmml = pmml.replace('optype="categorical" dataType="double"', 'optype="categorical" dataType="string"')
|
|
|
+ with open(path_pmml, mode="w", encoding="utf-8") as f:
|
|
|
+ f.write(pmml)
|
|
|
+ f.flush()
|
|
|
+
|
|
|
def _f_get_best_model(self, df_param: pd.DataFrame, ntree: int = None):
|
|
|
if ntree is None:
|
|
|
df_param_sort = df_param.sort_values(by=["ks_test", "auc_test"], ascending=[False, False])
|
|
|
print(f"选择最佳参数:\n{df_param_sort.iloc[0].to_dict()}")
|
|
|
- self._train(df_param_sort.iloc[0][2])
|
|
|
+ self._train(int(df_param_sort.iloc[0][2]))
|
|
|
else:
|
|
|
print(f"选择ntree:【{ntree}】的参数:\n{df_param[df_param['ntree'] == ntree].iloc[0].to_dict()}")
|
|
|
self._train(ntree)
|
|
|
|
|
|
+ if self._ol_config.save_pmml:
|
|
|
+ data = self._data.data
|
|
|
+ path_pmml = self._ol_config.f_get_save_path(FileEnum.PMML.value)
|
|
|
+ # pipeline = make_pmml_pipeline(self.model)
|
|
|
+ sklearn2pmml(self._pipeline_optimized, path_pmml, with_repr=True, )
|
|
|
+ self._f_rewrite_pmml(path_pmml)
|
|
|
+ print(f"model save to【{path_pmml}】success. ")
|
|
|
+ # pmml与原生模型结果一致性校验
|
|
|
+ model_pmml = Model.fromFile(path_pmml)
|
|
|
+ prob_pmml = model_pmml.predict(data)["probability(1)"]
|
|
|
+ prob_pipeline = self._pipeline_optimized.predict_proba(data)[:, 1]
|
|
|
+ diff = pd.DataFrame()
|
|
|
+ diff["prob_pmml"] = prob_pmml
|
|
|
+ diff["prob_pipeline"] = prob_pipeline
|
|
|
+ diff["diff"] = diff["prob_pmml"] - diff["prob_pipeline"]
|
|
|
+ diff["diff_format"] = diff["diff"].apply(lambda x: 1 if abs(x) < 0.001 else 0)
|
|
|
+ print(f"pmml模型结果一致率(误差小于0.001):{len(diff) / diff['diff_format'].sum().round(3) * 100}%")
|
|
|
+
|
|
|
def _f_get_metric_auc_ks(self, model_type: str):
|
|
|
def _get_auc_ks(data, title):
|
|
|
y = data[self._ol_config.y_column]
|