data_feaure_entity.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author: yq
  4. @time: 2024/11/1
  5. @desc:
  6. """
  7. import pandas as pd
  8. from commom import f_format_float
  9. class CandidateFeatureEntity():
  10. """
  11. 经过特征筛选后的特征信息
  12. """
  13. def __init__(self, x_column: str, breaks_list: list = None, iv_max: float = None):
  14. self._x_column = x_column
  15. self._breaks_list = breaks_list
  16. self._iv_max = iv_max
  17. @property
  18. def x_column(self):
  19. return self._x_column
  20. @property
  21. def breaks_list(self) -> list:
  22. return self._breaks_list
  23. @property
  24. def iv_max(self):
  25. return self._iv_max
  26. class DataFeatureEntity():
  27. """
  28. 数据特征准备完毕
  29. """
  30. def __init__(self, data: pd.DataFrame, x_columns: list, y_column: str):
  31. self._data = data
  32. self._x_columns = x_columns
  33. self._y_column = y_column
  34. @property
  35. def data(self):
  36. return self._data
  37. @property
  38. def x_columns(self):
  39. return self._x_columns
  40. @property
  41. def y_column(self):
  42. return self._y_column
  43. def get_Xdata(self):
  44. return self._data[self._x_columns]
  45. def get_Ydata(self):
  46. return self._data[self._y_column]
  47. def get_odds0(self):
  48. train_good_len = len(self._data[self._data[self._y_column] == 0])
  49. train_bad_len = len(self._data[self._data[self._y_column] == 1])
  50. odds0 = train_bad_len / train_good_len
  51. return odds0
  52. class DataPreparedEntity():
  53. """
  54. 训练集测试集特征准备完毕
  55. """
  56. def __init__(self, train_data: DataFeatureEntity, val_data: DataFeatureEntity, test_data: DataFeatureEntity,
  57. *args, **kwargs):
  58. self._train_data = train_data
  59. self._val_data = val_data
  60. self._test_data = test_data
  61. self.args = args
  62. self.kwargs = kwargs
  63. @property
  64. def train_data(self):
  65. return self._train_data
  66. @property
  67. def val_data(self):
  68. return self._val_data
  69. @property
  70. def test_data(self):
  71. return self._test_data
  72. class DataSplitEntity():
  73. """
  74. 初始数据训练集测试集划分
  75. """
  76. def __init__(self, train_data: pd.DataFrame, val_data: pd.DataFrame = None, test_data: pd.DataFrame = None):
  77. self._train_data = train_data
  78. self._val_data = val_data
  79. self._test_data = test_data
  80. @property
  81. def train_data(self):
  82. return self._train_data
  83. @property
  84. def val_data(self):
  85. return self._val_data
  86. @property
  87. def test_data(self):
  88. return self._test_data
  89. def get_distribution(self, y_column) -> pd.DataFrame:
  90. df = pd.DataFrame()
  91. train_data_len = len(self._train_data)
  92. train_bad_len = len(self._train_data[self._train_data[y_column] == 1])
  93. train_bad_rate = f"{f_format_float(train_bad_len / train_data_len * 100, 2)}%"
  94. test_data_len = 0
  95. test_bad_len = 0
  96. test_bad_rate = "-"
  97. if self._test_data is not None:
  98. test_data_len = len(self._test_data)
  99. test_bad_len = len(self._test_data[self._test_data[y_column] == 1])
  100. test_bad_rate = f"{f_format_float(test_bad_len / test_data_len * 100, 2)}%"
  101. total = train_data_len + test_data_len
  102. bad_total = train_bad_len + test_bad_len
  103. bad_rate = f"{f_format_float(bad_total / total * 100, 2)}%"
  104. df["样本"] = ["训练集", "测试集", "合计"]
  105. df["样本数"] = [train_data_len, test_data_len, total]
  106. df["样本占比"] = [f"{f_format_float(train_data_len / total * 100, 2)}%",
  107. f"{f_format_float(test_data_len / total * 100, 2)}%", "100%"]
  108. df["坏样本数"] = [train_bad_len, test_bad_len, bad_total]
  109. df["坏样本比例"] = [train_bad_rate, test_bad_rate, bad_rate]
  110. return df
  111. if __name__ == "__main__":
  112. pass