feature_utils.py

# -*- coding:utf-8 -*-
"""
@author: yq
@time: 2023/12/28
@desc: feature engineering utilities
"""
import numpy as np
import pandas as pd
import scorecardpy as sc
import toad as td
from sklearn.preprocessing import KBinsDiscretizer
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

from entitys import DataSplitEntity
from enums import BinsStrategyEnum

def f_get_bins(data: DataSplitEntity, feat: str, strategy: str = 'quantile', nbins: int = 10) -> pd.DataFrame:
    # Equal-frequency binning
    if strategy == BinsStrategyEnum.QUANTILE.value:
        kbin_encoder = KBinsDiscretizer(n_bins=nbins, encode='ordinal', strategy='quantile')
        # KBinsDiscretizer expects a 2-D array, hence the double brackets
        feature_binned = kbin_encoder.fit_transform(data.train_data()[[feat]])
        return feature_binned.astype(int).astype(str)
    # Equal-width binning
    if strategy == BinsStrategyEnum.WIDTH.value:
        return pd.cut(data.train_data()[feat], bins=nbins, labels=[f'Bin_{i}' for i in range(1, nbins + 1)])
    # Binning with toad
    '''
    c = td.transform.Combiner()
    # method must be one of the binning methods supported by toad
    c.fit(data, y='target', method=strategy, min_samples=None, n_bins=nbins, empty_separate=False)
    # Returns the fitted toad Combiner, used to bin both the training and test sets
    # Use c.export()[feature] to inspect the cut points of a given feature
    return c
    '''
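
# A minimal sketch of the commented-out toad branch above, assuming a plain
# DataFrame `df` with a binary 'target' column; the names and the 'chi' method
# choice here are illustrative, not part of this module's contract.
def f_demo_toad_bins(df: pd.DataFrame, nbins: int = 5) -> td.transform.Combiner:
    c = td.transform.Combiner()
    # toad supports methods such as 'chi', 'dt', 'quantile', 'step' and 'kmeans'
    c.fit(df, y='target', method='chi', n_bins=nbins, empty_separate=False)
    # c.transform(df) maps raw values to bin indices; c.export() lists cut points
    return c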

# Expects the dict returned by scorecardpy's woebin, i.e. {variable: bin DataFrame};
# returns a pandas Styler for notebook display
def f_get_bins_display(bins_info: dict):
    df_list = []
    for col, bin_data in bins_info.items():
        tmp_df = pd.DataFrame(bin_data)
        df_list.append(tmp_df)
    result_df = pd.concat(df_list, ignore_index=True)
    total_bad = result_df['bad'].sum()
    total_cnt = result_df['count'].sum()
    # Overall bad-sample rate
    br_overall = total_bad / total_cnt
    result_df['lift'] = result_df['badprob'] / br_overall
    result_df = \
        result_df.sort_values(['total_iv', 'variable'], ascending=False).set_index(['variable', 'total_iv', 'bin']) \
            [['count_distr', 'count', 'good', 'bad', 'badprob', 'lift', 'bin_iv', 'woe']]
    return result_df.style \
        .format(subset=['count', 'good', 'bad'], precision=0) \
        .format(subset=['count_distr', 'badprob', 'lift', 'woe', 'bin_iv'], precision=4) \
        .bar(subset=['badprob', 'bin_iv', 'lift'], color=['#d65f58', '#5fbb7a'])
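
# Hypothetical usage with scorecardpy's bundled germancredit sample data;
# sc.woebin returns {variable: bin DataFrame}, the shape f_get_bins_display expects:
# dat = sc.germancredit()
# bins = sc.woebin(dat, y='creditability')
# f_get_bins_display(bins)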

# Filters out variables whose bin bad rate is neither monotonic nor U-shaped
def f_bins_filter(bins: dict, cols: list) -> list:
    result_cols = []
    # Iterate over the original variable list
    for tmp_col in cols:
        tmp_br = bins[tmp_col]['badprob'].values.tolist()
        tmp_len = len(tmp_br)
        if tmp_len <= 2:
            result_cols.append(tmp_col)
        else:
            tmp_judge = f_judge_monto(tmp_br)
            # f_judge_monto returns True when the trend is acceptable, False otherwise
            if tmp_judge:
                result_cols.append(tmp_col)
    return result_cols

# Judges the monotonicity of a list, allowing at most N sign changes in the trend
def f_judge_monto(bd_list: list, pos_neg_cnt: int = 1) -> bool:
    start_tr = bd_list[1] - bd_list[0]
    tmp_len = len(bd_list)
    pos_neg_flag = 0
    for i in range(2, tmp_len):
        tmp_tr = bd_list[i] - bd_list[i - 1]
        # Difference between consecutive bad rates; its sign tracks the current trend.
        # Allowing at most one sign change admits U-shaped distributions.
        if (tmp_tr >= 0 and start_tr >= 0) or (tmp_tr <= 0 and start_tr <= 0):
            # Trend unchanged, move on to the next element
            continue
        else:
            # Record one sign change
            start_tr = tmp_tr
            pos_neg_flag += 1
            if pos_neg_flag > pos_neg_cnt:
                return False
    # The list satisfies the trend requirement
    return pos_neg_flag <= pos_neg_cnt
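
# A quick sanity check of f_judge_monto with illustrative bad-rate lists:
# monotonic and single-U-turn lists pass, a zig-zag with two sign changes fails.
def f_demo_judge_monto() -> None:
    assert f_judge_monto([0.01, 0.03, 0.08, 0.15])      # strictly increasing
    assert f_judge_monto([0.15, 0.06, 0.03, 0.09])      # U-shaped, one sign change
    assert not f_judge_monto([0.05, 0.12, 0.04, 0.10])  # two sign changes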

def f_get_woe(data: DataSplitEntity, c: td.transform.Combiner, to_drop: list) -> tuple:
    transer = td.transform.WOETransformer()
    # Fit the WOE transformer on the training data, specifying the target and the columns to exclude
    train_woe = transer.fit_transform(c.transform(data.train_data()), data.train_data()['target'],
                                      exclude=to_drop + ['target'])
    test_woe = transer.transform(c.transform(data.test_data()))
    oot_woe = transer.transform(c.transform(data.val_data()))
    return train_woe, test_woe, oot_woe

def f_get_psi(train_data: pd.DataFrame, oot_data: pd.DataFrame) -> pd.DataFrame:
    # Drop any unneeded columns before calling this
    return td.metrics.PSI(train_data, oot_data)
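
# Hypothetical usage: compare the WOE-transformed train and OOT frames;
# toad's td.metrics.PSI returns per-feature PSI values (names illustrative):
# psi_series = f_get_psi(train_woe.drop(columns=to_drop), oot_woe.drop(columns=to_drop))
# unstable_cols = psi_series[psi_series > 0.25].index.tolist()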

def f_get_corr(data: pd.DataFrame, meth: str = 'spearman') -> pd.DataFrame:
    return data.corr(method=meth)

def f_get_ivf(data: pd.DataFrame) -> pd.DataFrame:
    if len(data.columns.to_list()) <= 1:
        return None
    vif_v = [vif(data.values, data.columns.get_loc(i)) for i in data.columns]
    vif_df = pd.DataFrame()
    vif_df["变量"] = data.columns
    vif_df['vif'] = vif_v
    return vif_df
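
# A small illustration of f_get_ivf on synthetic data (illustrative only):
# x2 is built to be nearly collinear with x1, so both should show a high VIF.
def f_demo_vif() -> pd.DataFrame:
    rng = np.random.default_rng(0)
    x1 = rng.normal(size=500)
    x2 = x1 * 0.9 + rng.normal(scale=0.1, size=500)
    x3 = rng.normal(size=500)
    return f_get_ivf(pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3}))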

def f_calcu_model_ks(data, y_column, sort_ascending):
    var_ks = data.groupby('MODEL_SCORE_BIN')[y_column].agg([len, np.sum]).sort_index(ascending=sort_ascending)
    var_ks.columns = ['样本数', '坏样本数']
    var_ks['好样本数'] = var_ks['样本数'] - var_ks['坏样本数']
    var_ks['坏样本比例'] = (var_ks['坏样本数'] / var_ks['样本数']).round(4)
    var_ks['样本数比例'] = (var_ks['样本数'] / var_ks['样本数'].sum()).round(4)
    var_ks['总坏样本数'] = var_ks['坏样本数'].sum()
    var_ks['总好样本数'] = var_ks['好样本数'].sum()
    var_ks['平均坏样本率'] = (var_ks['总坏样本数'] / var_ks['样本数'].sum()).round(4)
    var_ks['累计坏样本数'] = var_ks['坏样本数'].cumsum()
    var_ks['累计好样本数'] = var_ks['好样本数'].cumsum()
    var_ks['累计样本数'] = var_ks['样本数'].cumsum()
    var_ks['累计坏样本比例'] = (var_ks['累计坏样本数'] / var_ks['总坏样本数']).round(4)
    var_ks['累计好样本比例'] = (var_ks['累计好样本数'] / var_ks['总好样本数']).round(4)
    var_ks['KS'] = (var_ks['累计坏样本比例'] - var_ks['累计好样本比例']).round(4)
    var_ks['LIFT'] = ((var_ks['累计坏样本数'] / var_ks['累计样本数']) / var_ks['平均坏样本率']).round(4)
    return var_ks.reset_index()
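
# A minimal sketch of the KS workflow on synthetic data (illustrative only);
# the model-level KS statistic is the maximum of the per-bin 'KS' column.
def f_demo_model_ks() -> float:
    rng = np.random.default_rng(0)
    df = pd.DataFrame({'target': rng.integers(0, 2, 1000)})
    # Give bad samples (target=1) systematically lower scores
    df['score'] = rng.normal(600, 50, 1000) - df['target'] * 30
    df['MODEL_SCORE_BIN'] = pd.qcut(df['score'], q=10).astype(str)
    ks_df = f_calcu_model_ks(df, 'target', sort_ascending=True)
    return ks_df['KS'].max()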

def f_get_model_score_bin(df, card, bins=None):
    train_score = sc.scorecard_ply(df, card, print_step=0)
    # scorecard_ply returns a DataFrame with a single 'score' column
    df['score'] = train_score['score']
    if bins is None:
        _, bins = pd.qcut(df['score'], q=10, retbins=True, duplicates='drop')
        bins = list(bins)
        # Open up the outer edges so unseen scores still fall into a bin
        bins[0] = -np.inf
        bins[-1] = np.inf
    score_bins = pd.cut(df['score'], bins=bins)
    df['MODEL_SCORE_BIN'] = score_bins.astype(str).values
    return df, bins
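
# Hypothetical end-to-end usage (names are illustrative): build a card with
# scorecardpy, bin the train scores, then reuse those cut points on the test
# set so the PSI below compares like with like:
# card = sc.scorecard(bins, lr_model, x_columns)
# df_train, score_bins = f_get_model_score_bin(df_train, card)
# df_test, _ = f_get_model_score_bin(df_test, card, bins=score_bins)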

def f_calcu_model_psi(df_train, df_test):
    tmp1 = df_train.groupby('MODEL_SCORE_BIN')['MODEL_SCORE_BIN'].agg(['count']).sort_index(ascending=True)
    tmp1['样本数比例'] = (tmp1['count'] / tmp1['count'].sum()).round(4)
    tmp2 = df_test.groupby('MODEL_SCORE_BIN')['MODEL_SCORE_BIN'].agg(['count']).sort_index(ascending=True)
    tmp2['样本数比例'] = (tmp2['count'] / tmp2['count'].sum()).round(4)
    # Per-bin PSI contribution: (p_train - p_test) * ln(p_train / p_test)
    psi = ((tmp1['样本数比例'] - tmp2['样本数比例']) * np.log(tmp1['样本数比例'] / tmp2['样本数比例'])).round(4)
    psi = psi.reset_index()
    psi = psi.rename(columns={"样本数比例": "psi"})
    psi['训练样本数'] = list(tmp1['count'])
    psi['测试样本数'] = list(tmp2['count'])
    psi['训练样本数比例'] = list(tmp1['样本数比例'])
    psi['测试样本数比例'] = list(tmp2['样本数比例'])
    return psi
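
# A minimal sketch on toy frames (illustrative only); the headline PSI is the
# sum of the per-bin contributions returned by f_calcu_model_psi.
def f_demo_model_psi() -> float:
    df_train = pd.DataFrame({'MODEL_SCORE_BIN': ['A'] * 60 + ['B'] * 40})
    df_test = pd.DataFrame({'MODEL_SCORE_BIN': ['A'] * 50 + ['B'] * 50})
    psi_df = f_calcu_model_psi(df_train, df_test)
    return psi_df['psi'].sum()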