utils.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. # -*- coding:utf-8 -*-
  2. """
  3. @author: yq
  4. @time: 2023/12/28
  5. @desc: 特征工具类
  6. """
  7. import re
  8. from typing import List
  9. import numpy as np
  10. import pandas as pd
  11. from sklearn.preprocessing import OneHotEncoder
  12. FORMAT_DICT = {
  13. # 比例类 -1 - 1
  14. "bin_rate1": np.arange(-1, 1 + 0.1, 0.1).tolist(),
  15. # 次数类1 0 -10
  16. "bin_cnt1": np.arange(0.0, 11.0, 1.0).tolist(),
  17. # 次数类2 0 - 20
  18. "bin_cnt2": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 12.0, 15.0, 17.0, 20.0],
  19. # 次数类3 0 - 50
  20. "bin_cnt3": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0],
  21. # 次数类4 0 - 100
  22. "bin_cnt4": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 80.0, 100.0],
  23. # 金额类1 0 - 1w
  24. "bin_amt1": np.arange(0, 1.1e4, 1e3).tolist(),
  25. # 金额类2 0 - 5w
  26. "bin_amt2": np.arange(0, 5.5e4, 5e3).tolist(),
  27. # 金额类3 0 - 10w
  28. "bin_amt3": np.arange(0, 11e4, 1e4).tolist(),
  29. # 金额类4 0 - 20w
  30. "bin_amt4": [0.0, 1e4, 2e4, 3e4, 4e4, 5e4, 8e4, 10e4, 15e4, 20e4],
  31. # 金额类5 0 - 100w
  32. "bin_amt5": [0.0, 5e4, 10e4, 15e4, 20e4, 25e4, 30e4, 40e4, 50e4, 100e4],
  33. # 年龄类
  34. "bin_age": [20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0, 55.0, 60.0, 65.0],
  35. }
  36. # 粗分箱
  37. def f_format_bin(data_describe: pd.Series):
  38. # 筛选最合适的标准化分箱节点
  39. percent10 = data_describe["10%"]
  40. percent90 = data_describe["90%"]
  41. cache = None
  42. for k, v_list in FORMAT_DICT.items():
  43. bin_min = min(v_list)
  44. bin_max = max(v_list)
  45. if bin_min <= percent10 and percent90 <= bin_max:
  46. if cache is None:
  47. cache = (k, bin_max)
  48. elif cache[1] > bin_max:
  49. cache = (k, bin_max)
  50. if cache is None:
  51. return None
  52. return FORMAT_DICT[cache[0]]
  53. def f_format_value(points, raw_v):
  54. format_v = raw_v
  55. # 选择分箱内靠左的切分点
  56. for idx in range(1, len(points)):
  57. v_left = points[idx - 1]
  58. v_right = points[idx]
  59. # 靠左原则
  60. if v_left <= raw_v < v_right:
  61. format_v = v_left
  62. if raw_v > v_right:
  63. format_v = v_right
  64. return format_v
  65. class OneHot():
  66. def __init__(self, ):
  67. self._one_hot_encoder = OneHotEncoder()
  68. def fit(self, data: pd.DataFrame, x_column: str):
  69. self._x_column = x_column
  70. self._one_hot_encoder.fit(data[x_column].to_numpy().reshape(-1, 1))
  71. self._columns_onehot = [re.sub(r"[\[\]<]", "", f"{x_column}({i})") for i in
  72. self._one_hot_encoder.categories_[0]]
  73. def encoder(self, data: pd.DataFrame):
  74. one_hot_x = self._one_hot_encoder.transform(data[self._x_column].to_numpy().reshape(-1, 1))
  75. one_hot_x = one_hot_x.toarray()
  76. for idx, column_name in enumerate(self._columns_onehot):
  77. data[column_name] = one_hot_x[:, idx]
  78. @property
  79. def columns_onehot(self) -> List[str]:
  80. return self._columns_onehot