|
@@ -7,23 +7,40 @@
|
|
|
from itertools import combinations_with_replacement
|
|
|
from typing import List, Dict
|
|
|
|
|
|
+import matplotlib.pyplot as plt
|
|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
import scorecardpy as sc
|
|
|
+import seaborn as sns
|
|
|
from pandas.core.dtypes.common import is_numeric_dtype
|
|
|
|
|
|
-from entitys import DataSplitEntity, CandidateFeatureEntity, DataPreparedEntity, DataFeatureEntity
|
|
|
+from entitys import DataSplitEntity, CandidateFeatureEntity, DataPreparedEntity, DataFeatureEntity, MetricFucEntity
|
|
|
from .feature_utils import f_judge_monto, f_get_corr
|
|
|
from .filter_strategy_base import FilterStrategyBase
|
|
|
|
|
|
# Set matplotlib's default figure size to 8x8 inches; figures created later
# without an explicit ``figsize`` pick this up.
plt.rcParams.update({'figure.figsize': (8, 8)})
|
|
|
+
|
|
|
|
|
|
class StrategyIv(FilterStrategyBase):
|
|
|
|
|
|
def __init__(self, *args, **kwargs):
    """Build the strategy by forwarding all arguments, unchanged, to the parent initializer."""
    super().__init__(*args, **kwargs)
|
|
|
|
|
|
- def _f_get_bins_by_breaks(self, data: pd.DataFrame, candidate_dict: Dict[str, CandidateFeatureEntity]):
|
|
|
- y_column = self.data_process_config.y_column
|
|
|
def _f_save_var_trend(self, bins, x_columns_candidate, prefix):
    """Plot and save one binning trend chart per candidate variable.

    Args:
        bins: Mapping from variable name to the binning table handed to
            ``sc.woebin_plot``.
        x_columns_candidate: Names of the variables to plot; each one is
            looked up in ``bins``.
        prefix: File-name prefix; every chart is saved as
            ``{prefix}_{variable}.png`` at the location returned by
            ``self.data_process_config._get_save_path``.

    Returns:
        list: Paths of the saved images, in the order of
        ``x_columns_candidate``.
    """
    image_path_list = []
    for k in x_columns_candidate:
        # plt.savefig below writes pyplot's *current* figure, i.e. whatever
        # sc.woebin_plot has just drawn for this variable.
        sc.woebin_plot(bins[k])
        path = self.data_process_config._get_save_path(f"{prefix}_{k}.png")
        try:
            plt.savefig(path)
        finally:
            # Release the figure even if saving fails; otherwise pyplot keeps
            # one open figure per variable and memory grows with every report.
            plt.close()
        image_path_list.append(path)
    return image_path_list
|
|
|
+
|
|
|
+ def _f_get_bins_by_breaks(self, data: pd.DataFrame, candidate_dict: Dict[str, CandidateFeatureEntity],
|
|
|
+ y_column=None):
|
|
|
+ y_column = self.data_process_config.y_column if y_column is None else y_column
|
|
|
special_values = self.data_process_config.special_values
|
|
|
x_columns_candidate = list(candidate_dict.keys())
|
|
|
breaks_list = {}
|
|
@@ -300,3 +317,53 @@ class StrategyIv(FilterStrategyBase):
|
|
|
test_data_feature = DataFeatureEntity(pd.concat((test_woe, test_data[y_column]), axis=1),
|
|
|
train_woe.columns.tolist(), y_column)
|
|
|
return DataPreparedEntity(train_data_feature, val_data_feature, test_data_feature)
|
|
|
+
|
|
|
def feature_report(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity], *args,
                   **kwargs) -> Dict[str, MetricFucEntity]:
    """Assemble the feature report: sample distribution, IV/PSI table, trend charts and correlation map.

    Args:
        data: Split data set; ``train_data`` and ``test_data`` are read from it, and
            ``get_distribution`` is called on it with the configured target column.
        candidate_dict: Candidate features keyed by variable name; the keys define
            which variables are reported.
        *args: Unused.
        **kwargs: Unused.

    Returns:
        Dict[str, MetricFucEntity]: Report items keyed by section title:
        "样本分布", "变量iv", "变量趋势-训练集", "变量有效性" and, only when
        test data is present and non-empty, "变量趋势-测试集".
    """
    y_column = self.data_process_config.y_column
    x_columns_candidate = list(candidate_dict.keys())
    train_data = data.train_data
    test_data = data.test_data

    metric_value_dict = {}
    # Sample distribution.
    metric_value_dict["样本分布"] = MetricFucEntity(table=data.get_distribution(y_column))

    # Per-variable IV on the training set, sorted from highest to lowest.
    train_bins = self._f_get_bins_by_breaks(train_data, candidate_dict)
    train_iv = {key_: [round(value_['total_iv'].max(), 4)] for key_, value_ in train_bins.items()}
    train_iv = pd.DataFrame.from_dict(train_iv, orient='index', columns=['IV']).reset_index()
    train_iv = train_iv.sort_values('IV', ascending=False).reset_index(drop=True)
    train_iv.columns = ['变量', 'IV']

    if test_data is not None and len(test_data) != 0:
        # PSI is obtained by re-running the binning with a synthetic target that
        # only marks whether a row comes from the training set (1) or test set (0).
        psi_df = pd.concat((train_data, test_data))
        psi_df["#target#"] = [1] * len(train_data) + [0] * len(test_data)
        psi = self._f_get_bins_by_breaks(psi_df, candidate_dict, y_column="#target#")
        psi = {key_: [round(value_['total_iv'].max(), 4)] for key_, value_ in psi.items()}
        psi = pd.DataFrame.from_dict(psi, orient='index', columns=['psi']).reset_index()
        psi.columns = ['变量', 'psi']
        train_iv = pd.merge(train_iv, psi, on="变量", how="left")

        # Variable trend charts on the test set.
        test_bins = self._f_get_bins_by_breaks(test_data, candidate_dict)
        image_path_list = self._f_save_var_trend(test_bins, x_columns_candidate, "test")
        metric_value_dict["变量趋势-测试集"] = MetricFucEntity(image_path=image_path_list, image_size=4)

    metric_value_dict["变量iv"] = MetricFucEntity(table=train_iv)

    # Variable trend charts on the training set.
    image_path_list = self._f_save_var_trend(train_bins, x_columns_candidate, "train")
    metric_value_dict["变量趋势-训练集"] = MetricFucEntity(image_path=image_path_list, image_size=4)

    # Variable validity: correlation heat map of the WOE-encoded training features.
    train_woe = sc.woebin_ply(train_data[x_columns_candidate], train_bins)
    train_corr = f_get_corr(train_woe)
    fig = plt.figure(figsize=(12, 12))
    try:
        sns.heatmap(train_corr, vmax=1, square=True, cmap='RdBu', annot=True)
        plt.title('Variables Correlation', fontsize=15)
        plt.yticks(rotation=0)
        plt.xticks(rotation=90)
        path = self.data_process_config._get_save_path("var_corr.png")
        plt.savefig(path)
    finally:
        # Close the heat-map figure so repeated reports do not accumulate
        # open figures inside pyplot.
        plt.close(fig)
    metric_value_dict["变量有效性"] = MetricFucEntity(image_path=path)

    return metric_value_dict
|