Преглед изворни кода

add: 增加分箱趋势记录代码

wangzhaoyang пре 5 месеци
родитељ
комит
27541b31a7
1 измењених фајлова са 28 додато и 0 уклоњено
  1. 28 0
      feature/feature_utils.py

+ 28 - 0
feature/feature_utils.py

@@ -8,6 +8,7 @@ import pandas as pd
 from sklearn.preprocessing import KBinsDiscretizer
 from entitys import DataSplitEntity
 from enums import BinsStrategyEnum
+import scorecardpy as sc
 import toad as td
 
 
@@ -48,6 +49,33 @@ def f_get_bins_display(bins_info: pd.DataFrame) -> pd.DataFrame:
     return result_df.style.format(subset=['count','good','bad'], precision=0).format(subset=['count_distr','bad','lift',
                                     'badprob','woe','bin_iv'],precision=4).bar(subset=['badprob','bin_iv','lift'],color=['#d65f58','#5fbb7a'])
 
def _count_trend_reversals(bad_rates: list) -> int:
    """Return how many times the bin-to-bin bad-rate direction flips."""
    reversals = 0
    last_sign = 0  # 0 means no rising/falling step has been seen yet
    for prev_rate, cur_rate in zip(bad_rates, bad_rates[1:]):
        step = cur_rate - prev_rate
        if step > 0:
            sign = 1
        elif step < 0:
            sign = -1
        else:
            # A flat step (or a non-comparable NaN difference) has no
            # direction, so it can neither start nor break a trend.
            continue
        if last_sign and sign != last_sign:
            reversals += 1
        last_sign = sign
    return reversals


def f_bins_filter(bins: pd.DataFrame, cols: list) -> list:
    """Keep the variables whose binned bad rate is monotonic or U-shaped.

    For every variable the bad rates of consecutive bins are compared. A
    variable is kept when the direction of change (rising / falling) flips
    at most once: zero flips is a monotonic trend, one flip is a U or
    inverted-U shape. Variables with two bins or fewer are always kept.

    The direction is compared against the previous non-flat step rather than
    against the very first step; comparing against the first step would
    reject a U shape that keeps rising for several bins after its turning
    point, and would accept any zig-zag whose first step is flat.

    Args:
        bins: Mapping-like object indexed by variable name; ``bins[col]``
            must expose a ``'bad_prob'`` column holding the per-bin bad rate
            in bin order.
        cols: Names of the variables to check.

    Returns:
        The names from ``cols`` that satisfy the trend requirement, in their
        original order.
    """
    result_cols = []
    for tmp_col in cols:
        # NOTE(review): f_get_bins_display in this file formats a 'badprob'
        # column; confirm 'bad_prob' is the column name of the `bins` passed
        # to this function.
        bad_rates = bins[tmp_col]['bad_prob'].values.tolist()
        if _count_trend_reversals(bad_rates) <= 1:
            result_cols.append(tmp_col)
    return result_cols
+
 def f_get_woe(data: DataSplitEntity, c: td.transform.Combiner, to_drop:list) -> pd.DataFrame:
     transer = td.transform.WOETransformer()
     # 根据训练数据来训练woe转换器,并选择目标变量和排除变量