|
@@ -8,6 +8,7 @@ import pandas as pd
|
|
|
from sklearn.preprocessing import KBinsDiscretizer
|
|
|
from entitys import DataSplitEntity
|
|
|
from enums import BinsStrategyEnum
|
|
|
+import scorecardpy as sc
|
|
|
import toad as td
|
|
|
|
|
|
|
|
@@ -48,6 +49,33 @@ def f_get_bins_display(bins_info: pd.DataFrame) -> pd.DataFrame:
|
|
|
return result_df.style.format(subset=['count','good','bad'], precision=0).format(subset=['count_distr','bad','lift',
|
|
|
'badprob','woe','bin_iv'],precision=4).bar(subset=['badprob','bin_iv','lift'],color=['#d65f58','#5fbb7a'])
|
|
|
|
|
|
def f_bins_filter(bins: pd.DataFrame, cols: list) -> list:
    """Keep only the variables whose bad rate across bins is monotonic or U-shaped.

    For every name in ``cols`` the ``'bad_prob'`` values of ``bins[name]`` are
    read in bin order and the direction of each step between neighbouring
    bins is examined. A variable is kept when that direction reverses at most
    once: zero reversals is a monotonic trend, one reversal is a U (or
    inverted-U) shape. Variables with two or fewer bins are always kept.

    Steps with no direction (equal neighbouring bad rates, or a NaN
    difference) are ignored, so a plateau never counts as a reversal and a
    flat first step does not disable the check.

    Args:
        bins: Object indexed by variable name; each entry must expose a
            ``'bad_prob'`` column whose ``.values`` can be turned into a list.
        cols: Variable names to check.

    Returns:
        The names from ``cols`` that pass the check, in their original order.
    """
    result_cols = []
    for col in cols:
        # NOTE(review): elsewhere this file formats a column called 'badprob';
        # confirm 'bad_prob' is the right key for the `bins` passed in here.
        bad_rates = bins[col]['bad_prob'].values.tolist()

        # With at most two bins there is at most one step, hence no reversal.
        if len(bad_rates) <= 2:
            result_cols.append(col)
            continue

        reversals = 0
        prev_sign = 0  # direction of the last non-flat step; 0 = none seen yet
        for earlier, later in zip(bad_rates, bad_rates[1:]):
            step = later - earlier
            sign = (step > 0) - (step < 0)
            if sign == 0:
                # Flat (or NaN) step: carries no direction information.
                continue
            # Compare against the previous direction, not the first one, so a
            # U-shape is counted as exactly one reversal however long its
            # second leg is, and a zig-zag is counted once per turn.
            if prev_sign != 0 and sign != prev_sign:
                reversals += 1
            prev_sign = sign

        if reversals <= 1:
            result_cols.append(col)
    return result_cols
|
|
|
+
|
|
|
def f_get_woe(data: DataSplitEntity, c: td.transform.Combiner, to_drop:list) -> pd.DataFrame:
|
|
|
transer = td.transform.WOETransformer()
|
|
|
# 根据训练数据来训练woe转换器,并选择目标变量和排除变量
|