strategy_iv.py

# -*- coding:utf-8 -*-
"""
@author: yq
@time: 2024/1/2
@desc: IV-value and monotonicity feature filtering strategy
"""
from itertools import combinations_with_replacement
from typing import List

import numpy as np
import pandas as pd
from tqdm import tqdm

from entitys import DataSplitEntity, CandidateFeatureEntity, DataProcessConfigEntity
from .feature_utils import f_judge_monto
from .filter_strategy_base import FilterStrategyBase


class StrategyIv(FilterStrategyBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _f_get_best_bins(self, data: DataSplitEntity, x_column: str):
        # Greedily search for the monotonic binning with the highest
        # combined IV over the train and test sets
        interval = self.data_process_config.bin_search_interval
        iv_threshold = self.data_process_config.iv_threshold
        special_values = self.data_process_config.get_special_values(x_column)
        y_column = self.data_process_config.y_column

        def _n0(x):
            # Count of label 0 (good)
            return sum(x == 0)

        def _n1(x):
            # Count of label 1 (bad)
            return sum(x == 1)

        def _f_distribute_balls(balls, boxes):
            # Stars and bars: count the ways to place boxes - 1 dividers
            # into the balls - 1 gaps between balls, i.e. all splits of
            # `balls` into `boxes` ordered, non-empty parts
            total_ways = combinations_with_replacement(range(balls + boxes - 1), boxes - 1)
            distribute_list = []
            # Iterate over all candidate divider positions
            for combo in total_ways:
                # Allocate balls according to the divider positions
                distribution = [0] * boxes
                start = 0
                for i, divider in enumerate(combo):
                    distribution[i] = divider - start + 1
                    start = divider + 1
                distribution[-1] = balls - start  # balls in the last box
                # Keep only distributions where every box gets at least one ball
                if all(x > 0 for x in distribution):
                    distribute_list.append(distribution)
            return distribute_list
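
        # Worked example (illustrative note, not in the original source):
        # _f_distribute_balls(4, 2) yields [[1, 3], [2, 2], [3, 1]] -- every
        # way to split 4 quantile slots across 2 bins with no empty bin.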

        def _get_sv_bins(df, x_column, y_column, special_values):
            # special_values_bins: one bin per special value present in df
            sv_bin_list = []
            for special in special_values:
                # .copy() avoids SettingWithCopyWarning when 'bin' is assigned below
                dtm = df[df[x_column] == special].copy()
                if len(dtm) != 0:
                    dtm['bin'] = [str(special)] * len(dtm)
                    binning = dtm.groupby(['bin'], group_keys=False)[y_column].agg(
                        [_n0, _n1]).reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
                    binning['is_special_values'] = [True] * len(binning)
                    sv_bin_list.append(binning)
            return sv_bin_list
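
        # E.g. (hypothetical value) with special_values=[-999] and -999 present
        # in df, this returns one single-row frame: bin='-999', good=<n0>,
        # bad=<n1>, is_special_values=True.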

        def _get_bins(df, x_column, y_column, breaks_list):
            dtm = pd.DataFrame({'y': df[y_column], 'value': df[x_column]})
            bstbrks = [-np.inf] + breaks_list + [np.inf]
            labels = ['[{},{})'.format(bstbrks[i], bstbrks[i + 1]) for i in range(len(bstbrks) - 1)]
            dtm.loc[:, 'bin'] = pd.cut(dtm['value'], bstbrks, right=False, labels=labels)
            dtm['bin'] = dtm['bin'].astype(str)
            bins = dtm.groupby(['bin'], group_keys=False)['y'].agg([_n0, _n1]) \
                .reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
            bins['is_special_values'] = [False] * len(bins)
            return bins
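
        # Example (hypothetical breaks): breaks_list=[2.5, 7.0] labels the
        # bins '[-inf,2.5)', '[2.5,7.0)' and '[7.0,inf)' and returns their
        # good/bad counts with is_special_values=False.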

        def _calculation_iv(bins):
            bins['count'] = bins['good'] + bins['bad']
            bins['badprob'] = bins['bad'] / bins['count']
            # Monotonicity check on the non-special bins
            bad_prob = bins[bins['is_special_values'] == False]['badprob'].values.tolist()
            if not f_judge_monto(bad_prob):
                return -1
            # Compute IV; zero counts are replaced with 0.9 so the division
            # and the log below stay finite
            infovalue = pd.DataFrame({'good': bins['good'], 'bad': bins['bad']}) \
                .replace(0, 0.9) \
                .assign(
                    DistrBad=lambda x: x.bad / sum(x.bad),
                    DistrGood=lambda x: x.good / sum(x.good)
                ) \
                .assign(iv=lambda x: (x.DistrBad - x.DistrGood) * np.log(x.DistrBad / x.DistrGood)) \
                .iv
            bins['bin_iv'] = infovalue
            bins['total_iv'] = bins['bin_iv'].sum()
            iv = bins['total_iv'].values[0]
            return iv
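
        # In formula form (standard IV; matches the computation above):
        #   IV = sum_i (DistrBad_i - DistrGood_i) * ln(DistrBad_i / DistrGood_i)
        # where DistrBad_i / DistrGood_i are the shares of bads / goods in bin i.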

        train_data = data.train_data
        train_data_filter = train_data[~train_data[x_column].isin(special_values)]
        train_data_filter = train_data_filter.sort_values(by=x_column, ascending=True)
        train_data_x = train_data_filter[x_column]

        test_data = data.test_data
        test_data_filter = None
        if test_data is not None and len(test_data) != 0:
            test_data_filter = test_data[~test_data[x_column].isin(special_values)]
            test_data_filter = test_data_filter.sort_values(by=x_column, ascending=True)

        # Construct the candidate split points,
        # covering the 2- to 5-bin cases
        distribute_list = []
        points_list = []
        for bin_num in range(2, 6):
            distribute_list.extend(_f_distribute_balls(int(1 / interval), bin_num))
        for distribute in distribute_list:
            point_list_cache = []
            point_percentile_list = [sum(distribute[0:idx + 1]) * interval for idx, _ in enumerate(distribute[0:-1])]
            for point_percentile in point_percentile_list:
                point = train_data_x.iloc[int(len(train_data_x) * point_percentile)]
                if point not in point_list_cache:
                    point_list_cache.append(point)
            if point_list_cache not in points_list:
                points_list.append(point_list_cache)
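
        # E.g. with interval = 0.05 there are 20 "balls", so a distribution
        # such as [1, 19] maps to one split at the 5th-percentile value of the
        # sorted train column; duplicate cut points collapse via the caches above.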

        # Filter on IV and monotonicity
        iv_max = 0
        breaks_list = []
        train_sv_bin_list = _get_sv_bins(train_data, x_column, y_column, special_values)
        test_sv_bin_list = None
        if test_data_filter is not None:
            test_sv_bin_list = _get_sv_bins(test_data, x_column, y_column, special_values)
        for point_list in tqdm(points_list):
            train_bins = _get_bins(train_data_filter, x_column, y_column, point_list)
            # Merge in the special_values bins before computing IV
            for sv_bin in train_sv_bin_list:
                train_bins = pd.concat((train_bins, sv_bin))
            train_iv = _calculation_iv(train_bins)
            # Only the train set is held to the monotonicity and IV-threshold
            # constraints (a non-monotonic binning returns -1 and is skipped here)
            if train_iv < iv_threshold:
                continue

            test_iv = 0
            if test_data_filter is not None:
                test_bins = _get_bins(test_data_filter, x_column, y_column, point_list)
                for sv_bin in test_sv_bin_list:
                    test_bins = pd.concat((test_bins, sv_bin))
                test_iv = _calculation_iv(test_bins)
            iv = train_iv + test_iv
            if iv > iv_max:
                iv_max = iv
                breaks_list = point_list

        return iv_max, breaks_list

    def filter(self, data: DataSplitEntity, *args, **kwargs):
        x_columns_candidate = self.data_process_config.x_columns_candidate
        candidate_num = self.data_process_config.candidate_num
        candidate_list: List[CandidateFeatureEntity] = []
        for x_column in x_columns_candidate:
            iv_max, breaks_list = self._f_get_best_bins(data, x_column)
            candidate_list.append(CandidateFeatureEntity(x_column, breaks_list, iv_max))
        candidate_list.sort(key=lambda x: x.iv_max, reverse=True)
        return candidate_list[0:candidate_num]
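

# Minimal, self-contained sketch of the IV arithmetic used above (illustrative
# only; it mirrors _calculation_iv on a toy table and does not touch the
# project's entity classes -- the counts below are made up).
if __name__ == "__main__":
    toy = pd.DataFrame({
        'good': [80, 60, 40],  # non-event count per bin
        'bad': [20, 40, 60],   # event count per bin
    })
    distr_bad = toy['bad'] / toy['bad'].sum()
    distr_good = toy['good'] / toy['good'].sum()
    bin_iv = (distr_bad - distr_good) * np.log(distr_bad / distr_good)
    # badprob rises 0.2 -> 0.4 -> 0.6 across the bins, so this binning would
    # pass a monotonicity check like f_judge_monto's
    print('total IV:', round(bin_iv.sum(), 4))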