|
@@ -14,7 +14,6 @@ import scorecardpy as sc
|
|
import seaborn as sns
|
|
import seaborn as sns
|
|
from pandas.core.dtypes.common import is_numeric_dtype
|
|
from pandas.core.dtypes.common import is_numeric_dtype
|
|
|
|
|
|
-
|
|
|
|
from entitys import DataSplitEntity, CandidateFeatureEntity, DataPreparedEntity, DataFeatureEntity, MetricFucEntity
|
|
from entitys import DataSplitEntity, CandidateFeatureEntity, DataPreparedEntity, DataFeatureEntity, MetricFucEntity
|
|
from init import f_get_save_path
|
|
from init import f_get_save_path
|
|
from .feature_utils import f_judge_monto, f_get_corr, f_get_ivf
|
|
from .feature_utils import f_judge_monto, f_get_corr, f_get_ivf
|
|
@@ -28,6 +27,24 @@ class StrategyIv(FilterStrategyBase):
|
|
def __init__(self, *args, **kwargs):
|
|
def __init__(self, *args, **kwargs):
|
|
super().__init__(*args, **kwargs)
|
|
super().__init__(*args, **kwargs)
|
|
|
|
|
|
|
|
+ def _f_get_iv_by_bins(self, bins) -> pd.DataFrame:
|
|
|
|
+ iv = {key_: [round(value_['total_iv'].max(), 4)] for key_, value_ in bins.items()}
|
|
|
|
+ iv = pd.DataFrame.from_dict(iv, orient='index', columns=['IV']).reset_index()
|
|
|
|
+ iv = iv.sort_values('IV', ascending=False).reset_index(drop=True)
|
|
|
|
+ iv.columns = ['变量', 'IV']
|
|
|
|
+ return iv
|
|
|
|
+
|
|
|
|
+ def _f_get_var_corr_image(self, train_woe):
|
|
|
|
+ train_corr = f_get_corr(train_woe)
|
|
|
|
+ plt.figure(figsize=(12, 12))
|
|
|
|
+ sns.heatmap(train_corr, vmax=1, square=True, cmap='RdBu', annot=True)
|
|
|
|
+ plt.title('Variables Correlation', fontsize=15)
|
|
|
|
+ plt.yticks(rotation=0)
|
|
|
|
+ plt.xticks(rotation=90)
|
|
|
|
+ path = f_get_save_path(f"var_corr.png")
|
|
|
|
+ plt.savefig(path)
|
|
|
|
+ return path
|
|
|
|
+
|
|
def _f_save_var_trend(self, bins, x_columns_candidate, prefix):
|
|
def _f_save_var_trend(self, bins, x_columns_candidate, prefix):
|
|
image_path_list = []
|
|
image_path_list = []
|
|
for k in x_columns_candidate:
|
|
for k in x_columns_candidate:
|
|
@@ -318,7 +335,8 @@ class StrategyIv(FilterStrategyBase):
|
|
test_woe = sc.woebin_ply(test_data[x_columns_candidate], bins)
|
|
test_woe = sc.woebin_ply(test_data[x_columns_candidate], bins)
|
|
test_data_feature = DataFeatureEntity(pd.concat((test_woe, test_data[y_column]), axis=1),
|
|
test_data_feature = DataFeatureEntity(pd.concat((test_woe, test_data[y_column]), axis=1),
|
|
train_woe.columns.tolist(), y_column)
|
|
train_woe.columns.tolist(), y_column)
|
|
- return DataPreparedEntity(train_data_feature, val_data_feature, test_data_feature)
|
|
|
|
|
|
+ return DataPreparedEntity(train_data_feature, val_data_feature, test_data_feature, bins=bins,
|
|
|
|
+ data_split_original=data)
|
|
|
|
|
|
def feature_report(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity], *args,
|
|
def feature_report(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity], *args,
|
|
**kwargs) -> Dict[str, MetricFucEntity]:
|
|
**kwargs) -> Dict[str, MetricFucEntity]:
|
|
@@ -333,18 +351,14 @@ class StrategyIv(FilterStrategyBase):
|
|
table_cell_width=3)
|
|
table_cell_width=3)
|
|
# 变量iv及psi
|
|
# 变量iv及psi
|
|
train_bins = self._f_get_bins_by_breaks(train_data, candidate_dict)
|
|
train_bins = self._f_get_bins_by_breaks(train_data, candidate_dict)
|
|
- train_iv = {key_: [round(value_['total_iv'].max(), 4)] for key_, value_ in train_bins.items()}
|
|
|
|
- train_iv = pd.DataFrame.from_dict(train_iv, orient='index', columns=['IV']).reset_index()
|
|
|
|
- train_iv = train_iv.sort_values('IV', ascending=False).reset_index(drop=True)
|
|
|
|
- train_iv.columns = ['变量', 'IV']
|
|
|
|
|
|
+ train_iv = self._f_get_iv_by_bins(train_bins)
|
|
|
|
|
|
if test_data is not None and len(test_data) != 0:
|
|
if test_data is not None and len(test_data) != 0:
|
|
# 计算psi仅需把y改成识别各自训练集测试集即可
|
|
# 计算psi仅需把y改成识别各自训练集测试集即可
|
|
psi_df = pd.concat((train_data, test_data))
|
|
psi_df = pd.concat((train_data, test_data))
|
|
psi_df["#target#"] = [1] * len(train_data) + [0] * len(test_data)
|
|
psi_df["#target#"] = [1] * len(train_data) + [0] * len(test_data)
|
|
psi = self._f_get_bins_by_breaks(psi_df, candidate_dict, y_column="#target#")
|
|
psi = self._f_get_bins_by_breaks(psi_df, candidate_dict, y_column="#target#")
|
|
- psi = {key_: [round(value_['total_iv'].max(), 4)] for key_, value_ in psi.items()}
|
|
|
|
- psi = pd.DataFrame.from_dict(psi, orient='index', columns=['psi']).reset_index()
|
|
|
|
|
|
+ psi = self._f_get_iv_by_bins(psi)
|
|
psi.columns = ['变量', 'psi']
|
|
psi.columns = ['变量', 'psi']
|
|
train_iv = pd.merge(train_iv, psi, on="变量", how="left")
|
|
train_iv = pd.merge(train_iv, psi, on="变量", how="left")
|
|
|
|
|
|
@@ -359,16 +373,9 @@ class StrategyIv(FilterStrategyBase):
|
|
metric_value_dict["变量趋势-训练集"] = MetricFucEntity(image_path=image_path_list, image_size=4)
|
|
metric_value_dict["变量趋势-训练集"] = MetricFucEntity(image_path=image_path_list, image_size=4)
|
|
# 变量有效性
|
|
# 变量有效性
|
|
train_woe = sc.woebin_ply(train_data[x_columns_candidate], train_bins)
|
|
train_woe = sc.woebin_ply(train_data[x_columns_candidate], train_bins)
|
|
- train_corr = f_get_corr(train_woe)
|
|
|
|
- plt.figure(figsize=(12, 12))
|
|
|
|
- sns.heatmap(train_corr, vmax=1, square=True, cmap='RdBu', annot=True)
|
|
|
|
- plt.title('Variables Correlation', fontsize=15)
|
|
|
|
- plt.yticks(rotation=0)
|
|
|
|
- plt.xticks(rotation=90)
|
|
|
|
- path = f_get_save_path(f"var_corr.png")
|
|
|
|
- plt.savefig(path)
|
|
|
|
|
|
+ var_corr_image_path = self._f_get_var_corr_image(train_woe)
|
|
# vif
|
|
# vif
|
|
vif_df = f_get_ivf(train_woe)
|
|
vif_df = f_get_ivf(train_woe)
|
|
- metric_value_dict["变量有效性"] = MetricFucEntity(image_path=path, table=vif_df)
|
|
|
|
|
|
+ metric_value_dict["变量有效性"] = MetricFucEntity(image_path=var_corr_image_path, table=vif_df)
|
|
|
|
|
|
return metric_value_dict
|
|
return metric_value_dict
|