123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124 |
- # -*- coding: utf-8 -*-
- """
- @author: yq
- @time: 2024/11/1
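- @desc: Logistic-regression scorecard model (ModelLr) and the metrics it collects for the model development report.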
- """
- import os.path
- from os.path import dirname, realpath
- from typing import Dict
- import pandas as pd
- import scorecardpy as sc
- from sklearn.linear_model import LogisticRegression
- from commom import f_df_to_image
- from entitys import DataPreparedEntity, MetricFucEntity, DataSplitEntity
- from feature import f_calcu_model_ks, f_get_model_score_bin, f_calcu_model_psi
- from .model_base import ModelBase
class ModelLr(ModelBase):
    """Logistic-regression scorecard model.

    Fits an L1-penalised ``LogisticRegression`` on the prepared training data,
    builds a scorecard from it with ``scorecardpy`` and collects the tables and
    images produced along the way as ``MetricFucEntity`` objects.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``ModelBase`` and create the estimator."""
        super().__init__(*args, **kwargs)
        # Report template, resolved relative to the parent of this file's directory.
        self._template_path = os.path.join(dirname(dirname(realpath(__file__))), "./template/模型开发报告模板_lr.docx")
        self.lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)

    def get_template_path(self):
        """Return the path of the report template used by this model."""
        return self._template_path

    def _save_df_image(self, df: pd.DataFrame, file_name: str) -> str:
        """Render ``df`` as an image named ``file_name`` in the save directory and return its path."""
        path = self._train_config.f_get_save_path(file_name)
        f_df_to_image(df, path)
        return path

    def _eval_and_save_perf(self, y, prob, title: str):
        """Run ``sc.perf_eva`` for one sample set, save its figure as ``<title>_perf.png``.

        Returns:
            A ``(perf, path)`` tuple: the dict returned by ``sc.perf_eva`` and
            the path the figure was saved to.
        """
        # show_plot=True is kept from the original call; the figure is read from perf["pic"].
        perf = sc.perf_eva(y, prob, title=title, show_plot=True)
        path = self._train_config.f_get_save_path(f"{title}_perf.png")
        perf["pic"].savefig(path)
        return perf, path

    def train(self, data: DataPreparedEntity, *args, **kwargs) -> Dict[str, MetricFucEntity]:
        """Fit the model and collect the report metrics.

        Args:
            data: Prepared data; ``data.train_data`` is required and
                ``data.test_data`` may be ``None``.
            **kwargs: Must contain
                ``bins`` - the binning result passed to ``sc.scorecard``;
                ``data_split_original`` - a ``DataSplitEntity`` holding the
                train/test data from before WOE encoding.

        Returns:
            Dict keyed by metric name ("评分卡", "变量系数", "模型结果",
            "训练集分数分箱", and - only when test data is present -
            "测试集分数分箱" and "模型稳定性").

        Raises:
            KeyError: If ``bins`` or ``data_split_original`` is missing from ``kwargs``.
        """
        bins = kwargs["bins"]
        data_split_original: DataSplitEntity = kwargs["data_split_original"]
        # Data from before WOE encoding.
        train_data_original = data_split_original.train_data
        test_data_original = data_split_original.test_data

        train_data = data.train_data
        test_data = data.test_data
        has_test = test_data is not None

        train_x = train_data.get_Xdata()
        train_y = train_data.get_Ydata()
        y_column = train_data.y_column
        x_columns = train_data.x_columns

        self.lr.fit(train_x, train_y)

        metric_value_dict = {}

        # Scorecard
        card: Dict = sc.scorecard(bins, self.lr, x_columns, points0=600, odds0=train_data.get_odds0(),
                                  pdo=50)
        # Seed with an empty frame carrying the basepoints columns (as the original
        # did) and concatenate every card table in one call instead of in a loop.
        card_seed = pd.DataFrame(columns=card['basepoints'].keys())
        card_df = pd.concat([card_seed, *card.values()])
        card_df_path = self._save_df_image(card_df, "card_df.png")
        metric_value_dict["评分卡"] = MetricFucEntity(image_path=card_df_path)

        # Model coefficients
        coef = dict(zip(x_columns, self.lr.coef_.reshape(-1)))
        coef_df = pd.DataFrame({'变量': list(coef.keys()), '变量系数': list(coef.values())})
        metric_value_dict["变量系数"] = MetricFucEntity(table=coef_df, table_font_size=10)

        # Model KS / AUC
        train_prob = self.lr.predict_proba(train_x)[:, 1]
        train_perf, train_perf_path = self._eval_and_save_perf(train_y, train_prob, "train")
        image_path_list = [train_perf_path]
        train_auc = train_perf["AUC"]
        train_ks = train_perf["KS"]

        # "-" is shown in the result table when there is no test set.
        test_auc = "-"
        test_ks = "-"
        if has_test:
            test_prob = self.lr.predict_proba(test_data.get_Xdata())[:, 1]
            test_perf, test_perf_path = self._eval_and_save_perf(test_data.get_Ydata(), test_prob, "test")
            image_path_list.append(test_perf_path)
            test_auc = test_perf["AUC"]
            test_ks = test_perf["KS"]

        df_auc = pd.DataFrame({
            "样本集": ["训练集", "测试集"],
            "AUC": [train_auc, test_auc],
            "KS": [train_ks, test_ks],
        })
        metric_value_dict["模型结果"] = MetricFucEntity(table=df_auc, image_path=image_path_list, image_size=5,
                                                    table_font_size=10)

        # Scorecard score binning
        train_data_original, score_bins = f_get_model_score_bin(train_data_original, card)
        train_data_gain = f_calcu_model_ks(train_data_original, y_column, sort_ascending=True)
        train_data_gain_path = self._save_df_image(train_data_gain, "train_data_gain.png")
        metric_value_dict["训练集分数分箱"] = MetricFucEntity(image_path=train_data_gain_path)

        if has_test:
            # Reuse the train score bins; the second return value is not needed
            # (the original rebound it to ``bins``, shadowing the binning input).
            test_data_original, _ = f_get_model_score_bin(test_data_original, card, score_bins)
            test_data_gain = f_calcu_model_ks(test_data_original, y_column, sort_ascending=True)
            test_data_gain_path = self._save_df_image(test_data_gain, "test_data_gain.png")
            metric_value_dict["测试集分数分箱"] = MetricFucEntity(image_path=test_data_gain_path)

            # Model score PSI. f_calcu_model_psi takes both binned frames, and the
            # test frame is only binned inside this guard, so PSI is computed here.
            model_psi = f_calcu_model_psi(train_data_original, test_data_original)
            model_psi_path = self._save_df_image(model_psi, "model_psi.png")
            metric_value_dict["模型稳定性"] = MetricFucEntity(value=model_psi["psi"].sum().round(4),
                                                        image_path=model_psi_path)

        return metric_value_dict

    def predict_prob(self, x: pd.DataFrame, *args, **kwargs):
        """Return the second column of ``self.lr.predict_proba(x)`` for each row of ``x``."""
        return self.lr.predict_proba(x)[:, 1]

    def export_model_file(self):
        """Not implemented: currently does nothing and returns ``None``."""
        pass
# Script entry guard: running this module directly does nothing.
if __name__ == "__main__":
    pass
|