# -*- coding:utf-8 -*-
"""
@author: yq
@time: 2024/1/2
@desc: Feature filtering strategy based on IV value and monotonicity
"""
from itertools import combinations_with_replacement
from typing import List, Dict

import numpy as np
import pandas as pd
import scorecardpy as sc
from pandas.core.dtypes.common import is_numeric_dtype
from tqdm import tqdm

from entitys import DataSplitEntity, CandidateFeatureEntity, DataPreparedEntity, DataFeatureEntity

from .feature_utils import f_judge_monto, f_get_corr
from .filter_strategy_base import FilterStrategyBase


class StrategyIv(FilterStrategyBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _f_get_bins_by_breaks(self, data: pd.DataFrame, candidate_dict: Dict[str, CandidateFeatureEntity]):
        y_column = self.data_process_config.y_column
        special_values = self.data_process_config.special_values
        x_columns_candidate = list(candidate_dict.keys())
        breaks_list = {}
        for column, candidate in candidate_dict.items():
            breaks_list[column] = candidate.breaks_list
        bins = sc.woebin(data[x_columns_candidate + [y_column]], y=y_column, breaks_list=breaks_list,
                         special_values=special_values)
        return bins
    def _f_corr_filter(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity]) -> List[str]:
        # Drop correlated variables: when two candidates are correlated above the
        # threshold, keep the one with the larger IV.
        corr_threshold = self.data_process_config.corr_threshold
        train_data = data.train_data
        x_columns_candidate = list(candidate_dict.keys())

        bins = self._f_get_bins_by_breaks(train_data, candidate_dict)
        train_woe = sc.woebin_ply(train_data[x_columns_candidate], bins)
        corr_df = f_get_corr(train_woe)
        corr_dict = corr_df.to_dict()
        for column, corr in corr_dict.items():
            column = column.replace("_woe", "")
            if column not in x_columns_candidate:
                continue
            for challenger_column, challenger_corr in corr.items():
                challenger_column = challenger_column.replace("_woe", "")
                if challenger_corr < corr_threshold or column == challenger_column \
                        or challenger_column not in x_columns_candidate:
                    continue
                iv_max = candidate_dict[column].iv_max
                challenger_iv_max = candidate_dict[challenger_column].iv_max
                if iv_max > challenger_iv_max:
                    x_columns_candidate.remove(challenger_column)
                else:
                    x_columns_candidate.remove(column)
                    break
        return x_columns_candidate
    def _f_wide_filter(self, data: DataSplitEntity) -> Dict:
        # Coarse screening: drop variables whose training-set IV is below the wide threshold.
        train_data = data.train_data
        test_data = data.test_data
        special_values = self.data_process_config.special_values
        y_column = self.data_process_config.y_column
        iv_threshold_wide = self.data_process_config.iv_threshold_wide
        x_columns_candidate = self.data_process_config.x_columns_candidate
        if x_columns_candidate is None or len(x_columns_candidate) == 0:
            x_columns_candidate = train_data.columns.tolist()
            x_columns_candidate.remove(y_column)

        bins_train = sc.woebin(train_data[x_columns_candidate + [y_column]], y=y_column, special_values=special_values,
                               bin_num_limit=5)

        breaks_list = {}
        for column, bin in bins_train.items():
            breaks_list[column] = list(bin['breaks'])

        bins_test = None
        if test_data is not None and len(test_data) != 0:
            bins_test = sc.woebin(test_data[x_columns_candidate + [y_column]], y=y_column, breaks_list=breaks_list,
                                  special_values=special_values)

        bins_iv_dict = {}
        for column, bin_train in bins_train.items():
            train_iv = bin_train['total_iv'][0]
            test_iv = 0
            if bins_test is not None:
                bin_test = bins_test[column]
                test_iv = bin_test['total_iv'][0]
            iv_max = train_iv + test_iv
            if train_iv < iv_threshold_wide:
                continue
            bins_iv_dict[column] = {"iv_max": iv_max, "breaks_list": breaks_list[column]}
        return bins_iv_dict
    def _f_get_best_bins_numeric(self, data: DataSplitEntity, x_column: str):
        # Greedy search for the monotonic binning with the highest combined IV over
        # the training and test sets.
        interval = self.data_process_config.bin_search_interval
        iv_threshold = self.data_process_config.iv_threshold
        special_values = self.data_process_config.get_special_values(x_column)
        y_column = self.data_process_config.y_column
        sample_rate = self.data_process_config.sample_rate

        def _n0(x):
            return sum(x == 0)

        def _n1(x):
            return sum(x == 1)

        def _f_distribute_balls(balls, boxes):
            # Enumerate the ways of placing boxes - 1 dividers among balls - 1 gaps
            # (stars and bars), i.e. all splits of `balls` into `boxes` ordered parts.
            total_ways = combinations_with_replacement(range(balls + boxes - 1), boxes - 1)
            distribute_list = []
            # Iterate over every possible divider placement.
            for combo in total_ways:
                # Allocate balls according to the divider positions.
                distribution = [0] * boxes
                start = 0
                for i, divider in enumerate(combo):
                    distribution[i] = divider - start + 1
                    start = divider + 1
                distribution[-1] = balls - start  # balls in the last box
                # Keep only distributions where every box gets at least one ball.
                if all(x > 0 for x in distribution):
                    distribute_list.append(distribution)
            return distribute_list
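
        # Note (illustrative example): _f_distribute_balls(4, 2) yields
        # [[1, 3], [2, 2], [3, 1]] -- every way to split 4 balls into 2 non-empty
        # boxes. With balls = int(1 / interval), each part later becomes a bin width
        # in units of `interval`.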

        def _get_sv_bins(df, x_column, y_column, special_values):
            # Build one bin per special value.
            sv_bin_list = []
            for special in special_values:
                dtm = df[df[x_column] == special]
                if len(dtm) != 0:
                    dtm['bin'] = [str(special)] * len(dtm)
                    binning = dtm.groupby(['bin'], group_keys=False)[y_column].agg(
                        [_n0, _n1]).reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
                    binning['is_special_values'] = [True] * len(binning)
                    sv_bin_list.append(binning)
            return sv_bin_list

        def _get_bins(df, x_column, y_column, breaks_list):
            dtm = pd.DataFrame({'y': df[y_column], 'value': df[x_column]})
            bstbrks = [-np.inf] + breaks_list + [np.inf]
            labels = ['[{},{})'.format(bstbrks[i], bstbrks[i + 1]) for i in range(len(bstbrks) - 1)]
            dtm.loc[:, 'bin'] = pd.cut(dtm['value'], bstbrks, right=False, labels=labels)
            dtm['bin'] = dtm['bin'].astype(str)
            bins = dtm.groupby(['bin'], group_keys=False)['y'].agg([_n0, _n1]) \
                .reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
            bins['is_special_values'] = [False] * len(bins)
            return bins

        def _calculation_iv(bins):
            bins['count'] = bins['good'] + bins['bad']
            bins['badprob'] = bins['bad'] / bins['count']
            # Monotonicity check: reject the binning if the bad rate is not monotonic
            # across the non-special bins.
            bad_prob = bins[bins['is_special_values'] == False]['badprob'].values.tolist()
            if not f_judge_monto(bad_prob):
                return -1
            # Compute the IV; zero counts are replaced with 0.9 to avoid division by
            # zero and log(0).
            infovalue = pd.DataFrame({'good': bins['good'], 'bad': bins['bad']}) \
                .replace(0, 0.9) \
                .assign(DistrBad=lambda x: x.bad / sum(x.bad),
                        DistrGood=lambda x: x.good / sum(x.good)) \
                .assign(iv=lambda x: (x.DistrBad - x.DistrGood) * np.log(x.DistrBad / x.DistrGood)) \
                .iv
            bins['bin_iv'] = infovalue
            bins['total_iv'] = bins['bin_iv'].sum()
            iv = bins['total_iv'].values[0]
            return iv
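
        # For reference, the formula applied above is the standard Information Value:
        #     IV = sum_i (DistrBad_i - DistrGood_i) * ln(DistrBad_i / DistrGood_i)
        # where DistrBad_i / DistrGood_i are each bin's share of all bads / goods.
        # A binning that violates monotonicity is scored -1 so it can never win.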

        def _f_sampling(distribute_list: list, sample_rate: float):
            # Sub-sample the candidate distributions; a full greedy search takes too long.
            sampled_list = distribute_list[::int(1 / sample_rate)]
            return sampled_list

        train_data = data.train_data
        train_data_filter = train_data[~train_data[x_column].isin(special_values)]
        train_data_filter = train_data_filter.sort_values(by=x_column, ascending=True)
        train_data_x = train_data_filter[x_column]

        test_data = data.test_data
        test_data_filter = None
        if test_data is not None and len(test_data) != 0:
            test_data_filter = test_data[~test_data[x_column].isin(special_values)]
            test_data_filter = test_data_filter.sort_values(by=x_column, ascending=True)

        # Construct the candidate split points for the 2- to 5-bin cases.
        distribute_list = []
        points_list = []
        for bin_num in list(range(2, 6)):
            distribute_list_cache = _f_distribute_balls(int(1 / interval), bin_num)
            # Sample when using 4 bins or more, otherwise the search takes too long.
            sample_num = 1000 * sample_rate
            if sample_rate <= 0.15:
                sample_num *= 2
            if bin_num == 4 and len(distribute_list_cache) >= sample_num:
                distribute_list_cache = _f_sampling(distribute_list_cache, sample_num / len(distribute_list_cache))
            sample_num = 4000 * sample_rate
            if bin_num == 5 and len(distribute_list_cache) >= sample_num:
                distribute_list_cache = _f_sampling(distribute_list_cache, sample_num / len(distribute_list_cache))
            distribute_list.extend(distribute_list_cache)
        for distribute in distribute_list:
            point_list_cache = []
            point_percentile_list = [sum(distribute[0:idx + 1]) * interval for idx, _ in enumerate(distribute[0:-1])]
            for point_percentile in point_percentile_list:
                point = train_data_x.iloc[int(len(train_data_x) * point_percentile)]
                if point not in point_list_cache:
                    point_list_cache.append(point)
            if point_list_cache not in points_list:
                points_list.append(point_list_cache)
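
        # Note (illustrative example): with interval = 0.1 a distribution such as
        # [3, 3, 4] maps to cumulative percentiles 0.3 and 0.6, so the candidate
        # breaks are the values at the 30th and 60th percentiles of the sorted
        # training column.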

        # Filter candidate binnings by IV and monotonicity.
        iv_max = 0
        breaks_list = []
        train_sv_bin_list = _get_sv_bins(train_data, x_column, y_column, special_values)
        test_sv_bin_list = None
        if test_data_filter is not None:
            test_sv_bin_list = _get_sv_bins(test_data, x_column, y_column, special_values)
        for point_list in tqdm(points_list):
            train_bins = _get_bins(train_data_filter, x_column, y_column, point_list)
            # Merge the special-value bins in before computing the IV.
            for sv_bin in train_sv_bin_list:
                train_bins = pd.concat((train_bins, sv_bin))
            train_iv = _calculation_iv(train_bins)
            # Only the training set is constrained on monotonicity and the IV threshold.
            if train_iv < iv_threshold:
                continue

            test_iv = 0
            if test_data_filter is not None:
                test_bins = _get_bins(test_data_filter, x_column, y_column, point_list)
                for sv_bin in test_sv_bin_list:
                    test_bins = pd.concat((test_bins, sv_bin))
                test_iv = _calculation_iv(test_bins)
            iv = train_iv + test_iv
            if iv > iv_max:
                iv_max = iv
                breaks_list = point_list
        return iv_max, breaks_list

    def filter(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, CandidateFeatureEntity]:
        # Coarse screening by IV.
        bins_iv_dict = self._f_wide_filter(data)
        x_columns_candidate = list(bins_iv_dict.keys())
        candidate_num = self.data_process_config.candidate_num
        candidate_dict: Dict[str, CandidateFeatureEntity] = {}
        for x_column in x_columns_candidate:
            if is_numeric_dtype(data.train_data[x_column]):
                iv_max, breaks_list = self._f_get_best_bins_numeric(data, x_column)
                candidate_dict[x_column] = CandidateFeatureEntity(x_column, breaks_list, iv_max)
            else:
                # Categorical (string) columns are handled by scorecardpy for now.
                candidate_dict[x_column] = CandidateFeatureEntity(x_column, bins_iv_dict[x_column]["breaks_list"],
                                                                  bins_iv_dict[x_column]["iv_max"])

        # Further remove variables by correlation.
        x_columns_candidate = self._f_corr_filter(data, candidate_dict)
        candidate_list: List[CandidateFeatureEntity] = []
        for x_column, v in candidate_dict.items():
            if x_column in x_columns_candidate:
                candidate_list.append(v)

        # Keep the top candidate_num features ranked by IV.
        candidate_list.sort(key=lambda x: x.iv_max, reverse=True)
        candidate_list = candidate_list[0:candidate_num]
        candidate_dict = {}
        for candidate in candidate_list:
            candidate_dict[candidate.x_column] = candidate
        return candidate_dict

    def feature_generate(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity], *args,
                         **kwargs) -> DataPreparedEntity:
        train_data = data.train_data
        val_data = data.val_data
        test_data = data.test_data
        y_column = self.data_process_config.y_column
        x_columns_candidate = list(candidate_dict.keys())

        bins = self._f_get_bins_by_breaks(train_data, candidate_dict)

        train_woe = sc.woebin_ply(train_data[x_columns_candidate], bins)
        train_data_feature = DataFeatureEntity(pd.concat((train_woe, train_data[y_column]), axis=1),
                                               train_woe.columns.tolist(), y_column)

        val_data_feature = None
        if val_data is not None and len(val_data) != 0:
            val_woe = sc.woebin_ply(val_data[x_columns_candidate], bins)
            val_data_feature = DataFeatureEntity(pd.concat((val_woe, val_data[y_column]), axis=1),
                                                 train_woe.columns.tolist(), y_column)

        test_data_feature = None
        if test_data is not None and len(test_data) != 0:
            test_woe = sc.woebin_ply(test_data[x_columns_candidate], bins)
            test_data_feature = DataFeatureEntity(pd.concat((test_woe, test_data[y_column]), axis=1),
                                                  train_woe.columns.tolist(), y_column)
        return DataPreparedEntity(train_data_feature, val_data_feature, test_data_feature)
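

if __name__ == "__main__":
    # Minimal, self-contained sketch of the IV formula used by _calculation_iv,
    # on a hypothetical two-bin split of synthetic data. It does not exercise
    # StrategyIv itself (which requires a project data_process_config); it only
    # illustrates the metric that the greedy binning search maximises.
    demo = pd.DataFrame({
        "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "y": [0, 0, 0, 1, 0, 1, 1, 0, 1, 1],
    })
    # Split at 5.5: left bin has 4 goods / 1 bad, right bin has 1 good / 4 bads.
    demo["bin"] = pd.cut(demo["x"], [-np.inf, 5.5, np.inf], right=False)
    stats = demo.groupby("bin", observed=False)["y"].agg(["count", "sum"])
    bad = stats["sum"]                    # y == 1 per bin
    good = stats["count"] - stats["sum"]  # y == 0 per bin
    distr_bad = bad / bad.sum()
    distr_good = good / good.sum()
    iv = ((distr_bad - distr_good) * np.log(distr_bad / distr_good)).sum()
    print("demo IV: {:.4f}".format(iv))  # roughly 1.66 for this toy split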