strategy_iv.py

# -*- coding:utf-8 -*-
"""
@author: yq
@time: 2024/1/2
@desc: filtering class based on IV values and monotonicity
"""
from itertools import combinations_with_replacement
from typing import List, Dict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scorecardpy as sc
import seaborn as sns
from pandas.core.dtypes.common import is_numeric_dtype
from tqdm import tqdm

from entitys import DataSplitEntity, CandidateFeatureEntity, DataPreparedEntity, DataFeatureEntity, MetricFucEntity

from .feature_utils import f_judge_monto, f_get_corr, f_get_ivf, f_format_bin
from .filter_strategy_base import FilterStrategyBase
class StrategyIv(FilterStrategyBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _f_get_iv_by_bins(self, bins) -> pd.DataFrame:
        iv = {key_: [round(value_['total_iv'].max(), 4)] for key_, value_ in bins.items()}
        iv = pd.DataFrame.from_dict(iv, orient='index', columns=['IV']).reset_index()
        iv = iv.sort_values('IV', ascending=False).reset_index(drop=True)
        iv.columns = ['变量', 'IV']
        return iv

    def _f_get_var_corr_image(self, train_woe):
        if len(train_woe.columns.to_list()) <= 1:
            return None
        train_corr = f_get_corr(train_woe)
        plt.figure(figsize=(12, 12))
        sns.heatmap(train_corr, vmax=1, square=True, cmap='RdBu', annot=True)
        plt.title('Variables Correlation', fontsize=15)
        plt.yticks(rotation=0)
        plt.xticks(rotation=90)
        path = self.data_process_config.f_get_save_path("var_corr.png")
        plt.savefig(path)
        return path
    def _f_save_var_trend(self, bins, x_columns_candidate, prefix):
        image_path_list = []
        for k in x_columns_candidate:
            bin_df = bins[k]
            # bin_df["bin"] = bin_df["bin"].apply(lambda x: re.sub(r"(\d+\.\d+)",
            #                                     lambda m: "{:.2f}".format(float(m.group(0))), x))
            sc.woebin_plot(bin_df)
            path = self.data_process_config.f_get_save_path(f"{prefix}_{k}.png")
            plt.savefig(path)
            image_path_list.append(path)
        return image_path_list
    def _f_get_bins_by_breaks(self, data: pd.DataFrame, candidate_dict: Dict[str, CandidateFeatureEntity],
                              y_column=None):
        y_column = self.data_process_config.y_column if y_column is None else y_column
        special_values = self.data_process_config.special_values
        x_columns_candidate = list(candidate_dict.keys())
        breaks_list = {}
        for column, candidate in candidate_dict.items():
            breaks_list[column] = candidate.breaks_list
        bins = sc.woebin(data[x_columns_candidate + [y_column]], y=y_column, breaks_list=breaks_list,
                         special_values=special_values)
        return bins
    def _f_corr_filter(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity]) -> List[str]:
        # Remove variables based on correlation, keeping the one with the higher IV
        corr_threshold = self.data_process_config.corr_threshold
        train_data = data.train_data
        x_columns_candidate = list(candidate_dict.keys())

        bins = self._f_get_bins_by_breaks(train_data, candidate_dict)
        train_woe = sc.woebin_ply(train_data[x_columns_candidate], bins)
        corr_df = f_get_corr(train_woe)
        corr_dict = corr_df.to_dict()
        for column, corr in corr_dict.items():
            column = column.replace("_woe", "")
            if column not in x_columns_candidate:
                continue
            for challenger_column, challenger_corr in corr.items():
                challenger_column = challenger_column.replace("_woe", "")
                if challenger_corr < corr_threshold or column == challenger_column \
                        or challenger_column not in x_columns_candidate:
                    continue
                iv_max = candidate_dict[column].iv_max
                challenger_iv_max = candidate_dict[challenger_column].iv_max
                if iv_max > challenger_iv_max:
                    x_columns_candidate.remove(challenger_column)
                else:
                    x_columns_candidate.remove(column)
                    break
        return x_columns_candidate
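    # Illustration of the rule in _f_corr_filter above (the numbers are hypothetical, not
    # from the original code): with corr_threshold = 0.8, if corr(A_woe, B_woe) = 0.85 and
    # iv_max(A) = 0.3 > iv_max(B) = 0.2, then B is dropped from the candidate list and A is kept.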
    def _f_wide_filter(self, data: DataSplitEntity) -> Dict:
        # Coarse screening of variables
        train_data = data.train_data
        test_data = data.test_data
        special_values = self.data_process_config.special_values
        breaks_list = self.data_process_config.breaks_list.copy()
        y_column = self.data_process_config.y_column
        iv_threshold_wide = self.data_process_config.iv_threshold_wide
        x_columns_candidate = self.data_process_config.x_columns_candidate
        if x_columns_candidate is None or len(x_columns_candidate) == 0:
            x_columns_candidate = train_data.columns.tolist()
            if y_column in x_columns_candidate:
                x_columns_candidate.remove(y_column)

        bins_train = sc.woebin(train_data[x_columns_candidate + [y_column]], y=y_column, bin_num_limit=5,
                               special_values=special_values, breaks_list=breaks_list)
        for column, bin_df in bins_train.items():
            breaks_list[column] = list(bin_df['breaks'])
        bins_test = None
        if test_data is not None and len(test_data) != 0:
            bins_test = sc.woebin(test_data[x_columns_candidate + [y_column]], y=y_column,
                                  special_values=special_values, breaks_list=breaks_list)
        bins_iv_dict = {}
        for column, bin_train in bins_train.items():
            train_iv = bin_train['total_iv'][0]
            test_iv = 0
            if bins_test is not None:
                bin_test = bins_test[column]
                test_iv = bin_test['total_iv'][0]
            iv_max = train_iv + test_iv
            if train_iv < iv_threshold_wide:
                continue
            bins_iv_dict[column] = {"iv_max": iv_max, "breaks_list": breaks_list[column]}
        return bins_iv_dict
    def _f_get_best_bins_numeric(self, data: DataSplitEntity, x_column: str):
        # Greedy search for the monotonic binning with the highest combined IV on the
        # training set and the test set
        interval = self.data_process_config.bin_search_interval
        iv_threshold = self.data_process_config.iv_threshold
        special_values = self.data_process_config.get_special_values(x_column)
        breaks_list = self.data_process_config.get_breaks_list(x_column)
        y_column = self.data_process_config.y_column
        sample_rate = self.data_process_config.sample_rate
        format_bin = self.data_process_config.format_bin
        pos_neg_cnt = self.data_process_config.pos_neg_cnt

        def _n0(x):
            return sum(x == 0)

        def _n1(x):
            return sum(x == 1)

        def _f_distribute_balls(balls, boxes):
            # Count the ways of placing boxes - 1 dividers into balls - 1 gaps
            total_ways = combinations_with_replacement(range(balls + boxes - 1), boxes - 1)
            distribute_list = []
            # Iterate over all possible divider positions
            for combo in total_ways:
                # Assign balls according to the divider positions
                distribution = [0] * boxes
                start = 0
                for i, divider in enumerate(combo):
                    distribution[i] = divider - start + 1
                    start = divider + 1
                distribution[-1] = balls - start  # number of balls in the last box
                # Keep only distributions where every box has at least one ball
                if all(x > 0 for x in distribution):
                    distribute_list.append(distribution)
            return distribute_list
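        # Illustrative example of the helper above (added note, not part of the original
        # code): _f_distribute_balls(4, 2) enumerates every split of 4 balls into 2
        # non-empty boxes and returns [[1, 3], [2, 2], [3, 1]].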
        def _get_sv_bins(df, x_column, y_column, special_values):
            # special_values_bins: one bin per special value
            sv_bin_list = []
            for special in special_values:
                dtm = df[df[x_column] == special].copy()
                if len(dtm) != 0:
                    dtm['bin'] = [str(special)] * len(dtm)
                    binning = dtm.groupby(['bin'], group_keys=False)[y_column].agg(
                        [_n0, _n1]).reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
                    binning['is_special_values'] = [True] * len(binning)
                    sv_bin_list.append(binning)
            return sv_bin_list

        def _get_bins(df, x_column, y_column, breaks_list):
            dtm = pd.DataFrame({'y': df[y_column], 'value': df[x_column]})
            bstbrks = [-np.inf] + breaks_list + [np.inf]
            labels = ['[{},{})'.format(bstbrks[i], bstbrks[i + 1]) for i in range(len(bstbrks) - 1)]
            dtm.loc[:, 'bin'] = pd.cut(dtm['value'], bstbrks, right=False, labels=labels)
            dtm['bin'] = dtm['bin'].astype(str)
            bins = dtm.groupby(['bin'], group_keys=False)['y'].agg([_n0, _n1]) \
                .reset_index().rename(columns={'_n0': 'good', '_n1': 'bad'})
            bins['is_special_values'] = [False] * len(bins)
            return bins

        def _calculation_iv(bins, judge_monto=True, pos_neg_cnt=1):
            bins['count'] = bins['good'] + bins['bad']
            bins['badprob'] = bins['bad'] / bins['count']
            # Monotonicity check (special-value bins are excluded)
            bad_prob = bins[bins['is_special_values'] == False]['badprob'].values.tolist()
            if judge_monto and not f_judge_monto(bad_prob, pos_neg_cnt):
                return -1
            # Compute IV
            infovalue = pd.DataFrame({'good': bins['good'], 'bad': bins['bad']}) \
                .replace(0, 0.9) \
                .assign(
                    DistrBad=lambda x: x.bad / sum(x.bad),
                    DistrGood=lambda x: x.good / sum(x.good)
                ) \
                .assign(iv=lambda x: (x.DistrBad - x.DistrGood) * np.log(x.DistrBad / x.DistrGood)) \
                .iv
            bins['bin_iv'] = infovalue
            bins['total_iv'] = bins['bin_iv'].sum()
            iv = bins['total_iv'].values[0]
            return iv
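        # For reference (added illustrative note, not from the original source): the IV
        # computed in _calculation_iv above is the standard
        #     IV = sum_i (DistrBad_i - DistrGood_i) * ln(DistrBad_i / DistrGood_i)
        # e.g. with hypothetical counts good = [90, 10] and bad = [5, 15] the distributions
        # are DistrGood = [0.9, 0.1], DistrBad = [0.25, 0.75], so IV ≈ 0.83 + 1.31 ≈ 2.14.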
        def _f_sampling(distribute_list: list, sample_rate: float):
            # Subsample the candidates; an exhaustive greedy search takes too long
            sampled_list = distribute_list[::int(1 / sample_rate)]
            return sampled_list

        train_data = data.train_data
        train_data_filter = train_data[~train_data[x_column].isin(special_values)]
        train_data_filter = train_data_filter.sort_values(by=x_column, ascending=True)
        train_data_x = train_data_filter[x_column]
        train_data_x_describe = train_data_x.describe(percentiles=[0.1, 0.9])

        test_data = data.test_data
        test_data_filter = None
        if test_data is not None and len(test_data) != 0:
            test_data_filter = test_data[~test_data[x_column].isin(special_values)]
            test_data_filter = test_data_filter.sort_values(by=x_column, ascending=True)

        # Build the candidate cut points
        # Consider binnings with 2 to 5 bins
        distribute_list = []
        points_list = []
        for bin_num in list(range(2, 6)):
            distribute_list_cache = _f_distribute_balls(int(1 / interval), bin_num)
            # From 4 bins upwards the candidates must be subsampled, otherwise the search takes too long
            sample_num = 1000 * sample_rate
            if sample_rate <= 0.15:
                sample_num *= 2
            if bin_num == 4 and len(distribute_list_cache) >= sample_num:
                distribute_list_cache = _f_sampling(distribute_list_cache, sample_num / len(distribute_list_cache))
            sample_num = 4000 * sample_rate
            if bin_num == 5 and len(distribute_list_cache) >= sample_num:
                distribute_list_cache = _f_sampling(distribute_list_cache, sample_num / len(distribute_list_cache))
            distribute_list.extend(distribute_list_cache)
        for distribute in distribute_list:
            point_list_cache = []
            point_percentile_list = [sum(distribute[0:idx + 1]) * interval for idx, _ in enumerate(distribute[0:-1])]
            for point_percentile in point_percentile_list:
                point = train_data_x.iloc[int(len(train_data_x) * point_percentile)]
                if format_bin:
                    point = f_format_bin(train_data_x_describe, point)
                if point not in point_list_cache:
                    point_list_cache.append(point)
            if point_list_cache not in points_list:
                points_list.append(point_list_cache)
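        # Illustrative note (added, not from the original source), assuming interval == 0.05:
        # distribute == [4, 6, 10] maps to cumulative percentiles [0.2, 0.5], so the candidate
        # breakpoints are the x values found at the 20th and 50th percentile positions of the
        # sorted training sample.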
        # Filter by IV and monotonicity
        iv_max = 0
        breaks_list_target = None
        judge_monto = True
        if len(breaks_list) != 0:
            points_list = [breaks_list]
            judge_monto = False

        train_sv_bin_list = _get_sv_bins(train_data, x_column, y_column, special_values)
        test_sv_bin_list = None
        if test_data_filter is not None:
            test_sv_bin_list = _get_sv_bins(test_data, x_column, y_column, special_values)
        for point_list in points_list:
            train_bins = _get_bins(train_data_filter, x_column, y_column, point_list)
            # Merge in the special_values bins before computing IV
            for sv_bin in train_sv_bin_list:
                train_bins = pd.concat((train_bins, sv_bin))
            # _calculation_iv includes the monotonicity check and excludes special values from it
            train_iv = _calculation_iv(train_bins, judge_monto, pos_neg_cnt)
            # Monotonicity and the IV threshold are only enforced on the training set
            if train_iv < iv_threshold:
                continue
            test_iv = 0
            if test_data_filter is not None:
                test_bins = _get_bins(test_data_filter, x_column, y_column, point_list)
                for sv_bin in test_sv_bin_list:
                    test_bins = pd.concat((test_bins, sv_bin))
                test_iv = _calculation_iv(test_bins, judge_monto, pos_neg_cnt)
            iv = train_iv + test_iv
            if iv > iv_max:
                iv_max = iv
                breaks_list_target = point_list
        return iv_max, breaks_list_target
    def filter(self, data: DataSplitEntity, *args, **kwargs) -> Dict[str, CandidateFeatureEntity]:
        # Coarse screening
        bins_iv_dict = self._f_wide_filter(data)
        x_columns_candidate = list(bins_iv_dict.keys())
        candidate_num = self.data_process_config.candidate_num
        candidate_dict: Dict[str, CandidateFeatureEntity] = {}
        for x_column in tqdm(x_columns_candidate):
            if is_numeric_dtype(data.train_data[x_column]):
                iv_max, breaks_list = self._f_get_best_bins_numeric(data, x_column)
                if breaks_list is None:
                    continue
                candidate_dict[x_column] = CandidateFeatureEntity(x_column, breaks_list, iv_max)
            else:
                # Categorical (string) variables are handled by scorecardpy for now
                candidate_dict[x_column] = CandidateFeatureEntity(x_column, bins_iv_dict[x_column]["breaks_list"],
                                                                  bins_iv_dict[x_column]["iv_max"])

        # Further remove variables based on correlation
        x_columns_candidate = self._f_corr_filter(data, candidate_dict)
        candidate_list: List[CandidateFeatureEntity] = []
        for x_column, v in candidate_dict.items():
            if x_column in x_columns_candidate:
                candidate_list.append(v)

        candidate_list.sort(key=lambda x: x.iv_max, reverse=True)
        candidate_list = candidate_list[0:candidate_num]

        candidate_dict = {}
        for candidate in candidate_list:
            candidate_dict[candidate.x_column] = candidate
        return candidate_dict
    def feature_generate(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity], *args,
                         **kwargs) -> DataPreparedEntity:
        train_data = data.train_data
        val_data = data.val_data
        test_data = data.test_data
        y_column = self.data_process_config.y_column
        x_columns_candidate = list(candidate_dict.keys())

        bins = self._f_get_bins_by_breaks(train_data, candidate_dict)

        train_woe = sc.woebin_ply(train_data[x_columns_candidate], bins)
        train_data_feature = DataFeatureEntity(pd.concat((train_woe, train_data[y_column]), axis=1),
                                               train_woe.columns.tolist(), y_column)
        val_data_feature = None
        if val_data is not None and len(val_data) != 0:
            val_woe = sc.woebin_ply(val_data[x_columns_candidate], bins)
            val_data_feature = DataFeatureEntity(pd.concat((val_woe, val_data[y_column]), axis=1),
                                                 train_woe.columns.tolist(), y_column)
        test_data_feature = None
        if test_data is not None and len(test_data) != 0:
            test_woe = sc.woebin_ply(test_data[x_columns_candidate], bins)
            test_data_feature = DataFeatureEntity(pd.concat((test_woe, test_data[y_column]), axis=1),
                                                  train_woe.columns.tolist(), y_column)
        return DataPreparedEntity(train_data_feature, val_data_feature, test_data_feature, bins=bins,
                                  data_split_original=data)
    def feature_report(self, data: DataSplitEntity, candidate_dict: Dict[str, CandidateFeatureEntity], *args,
                       **kwargs) -> Dict[str, MetricFucEntity]:
        y_column = self.data_process_config.y_column
        x_columns_candidate = list(candidate_dict.keys())
        train_data = data.train_data
        test_data = data.test_data

        metric_value_dict = {}
        # Sample distribution
        metric_value_dict["样本分布"] = MetricFucEntity(table=data.get_distribution(y_column), table_font_size=10,
                                                     table_cell_width=3)
        # Variable IV and PSI
        train_bins = self._f_get_bins_by_breaks(train_data, candidate_dict)
        train_iv = self._f_get_iv_by_bins(train_bins)
        if test_data is not None and len(test_data) != 0:
            # To compute PSI, simply replace y with an indicator of train vs. test membership
            psi_df = pd.concat((train_data, test_data))
            psi_df["#target#"] = [1] * len(train_data) + [0] * len(test_data)
            psi = self._f_get_bins_by_breaks(psi_df, candidate_dict, y_column="#target#")
            psi = self._f_get_iv_by_bins(psi)
            psi.columns = ['变量', 'psi']
            train_iv = pd.merge(train_iv, psi, on="变量", how="left")

            # Variable trend - test set
            test_bins = self._f_get_bins_by_breaks(test_data, candidate_dict)
            image_path_list = self._f_save_var_trend(test_bins, x_columns_candidate, "test")
            metric_value_dict["变量趋势-测试集"] = MetricFucEntity(image_path=image_path_list, image_size=4)

        metric_value_dict["变量iv"] = MetricFucEntity(table=train_iv, table_font_size=10, table_cell_width=3)
        # Variable trend - training set
        image_path_list = self._f_save_var_trend(train_bins, x_columns_candidate, "train")
        metric_value_dict["变量趋势-训练集"] = MetricFucEntity(image_path=image_path_list, image_size=4)
        # Variable validity
        train_woe = sc.woebin_ply(train_data[x_columns_candidate], train_bins)
        var_corr_image_path = self._f_get_var_corr_image(train_woe)
        # VIF
        vif_df = f_get_ivf(train_woe)
        metric_value_dict["变量有效性"] = MetricFucEntity(image_path=var_corr_image_path, table=vif_df)
        return metric_value_dict
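

# Hypothetical usage sketch (illustrative only; the exact construction of the strategy and
# of DataSplitEntity is defined elsewhere in this project, so the names below are assumptions
# rather than the documented API):
#
#     strategy = StrategyIv(data_process_config=config)   # config: project data-process settings
#     candidate_dict = strategy.filter(data_split)        # data_split: a DataSplitEntity
#     prepared = strategy.feature_generate(data_split, candidate_dict)
#     report = strategy.feature_report(data_split, candidate_dict)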