ソースを参照

bugfix: 分箱排序不对导致的bug

yq 3 ヶ月 前
コミット
c5e734e12a
1 ファイル変更、9 行追加、1 行削除
  1. 9 1
      feature/strategy_iv.py

+ 9 - 1
feature/strategy_iv.py

@@ -187,6 +187,12 @@ class StrategyIv(FilterStrategyBase):
                     sv_bin_list.append(binning)
             return sv_bin_list
 
+        def _get_bin_left_value(bin: str):  # Return the numeric left edge encoded in a bin label string, as a float.
+            if "," not in bin:  # No comma in the label: treat the entire label as a single number.
+                return float(bin)  # float() raises ValueError if the label text is not numeric.
+            left = bin.split(",")[0]  # Text before the first comma; assumes labels look like "[0.5,1.0)" - TODO confirm.
+            return float(left[1:])  # Skip the first character (presumably the opening bracket) and parse the rest.
+
         def _get_bins(df, x_column, y_column, breaks_list):
             dtm = pd.DataFrame({'y': df[y_column], 'value': df[x_column]})
             bstbrks = [-np.inf] + breaks_list + [np.inf]
@@ -196,6 +202,8 @@ class StrategyIv(FilterStrategyBase):
             bins = dtm.groupby(['bin'], group_keys=False)['y'].agg([_n0, _n1]) \
                 .reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
             bins['is_special_values'] = [False] * len(bins)
+            bins["ordered"] = bins['bin'].apply(_get_bin_left_value)
+            bins = bins.sort_values(by=["ordered"], ascending=[True])
             return bins
 
         def _get_badprob(bins):
@@ -356,7 +364,7 @@ class StrategyIv(FilterStrategyBase):
         for x_column in tqdm(x_columns_candidate):
             if is_numeric_dtype(data.train_data[x_column]):
                 iv_max, breaks_list, bins_enum_best_point = self._f_get_best_bins_numeric(data, x_column)
-                if len(bins_enum_best_point) != 0 :
+                if len(bins_enum_best_point) != 0:
                     numeric_candidate_dict_all[x_column] = []
                     for point in bins_enum_best_point:
                         numeric_candidate_dict_all[x_column].append(CandidateFeatureEntity(x_column, point, 0))