data_feaure_entity.py 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
# -*- coding: utf-8 -*-
"""
@author: yq
@time: 2024/11/1
@desc: Entity classes for prepared feature data and for the train/test split.
"""
  7. import pandas as pd
  8. from commom import f_format_float
  9. class DataFeatureEntity():
  10. """
  11. 数据特征准备完毕
  12. """
  13. def __init__(self, data_x: pd.DataFrame, data_y: pd.Series):
  14. self._data_x = data_x
  15. self._data_y = data_y
  16. @property
  17. def x_columns(self):
  18. return self._data_x.columns.tolist()
  19. @property
  20. def data_x(self):
  21. return self._data_x
  22. @property
  23. def data_y(self):
  24. return self._data_y
  25. def get_odds0(self):
  26. train_good_len = len(self._data_y[self._data_y == 0])
  27. train_bad_len = len(self._data_y[self._data_y == 1])
  28. odds0 = train_bad_len / train_good_len
  29. return odds0
  30. class DataSplitEntity():
  31. """
  32. 初始数据训练集测试集划分
  33. """
  34. def __init__(self, train_data: pd.DataFrame, test_data: pd.DataFrame):
  35. self._train_data = train_data
  36. self._test_data = test_data
  37. self._data = pd.concat((train_data, test_data))
  38. @property
  39. def data(self):
  40. return self._data
  41. @property
  42. def train_data(self):
  43. return self._train_data
  44. @property
  45. def test_data(self):
  46. return self._test_data
  47. def get_distribution(self, y_column) -> pd.DataFrame:
  48. df = pd.DataFrame()
  49. train_data_len = len(self._train_data)
  50. train_bad_len = len(self._train_data[self._train_data[y_column] == 1])
  51. train_bad_rate = f"{f_format_float(train_bad_len / train_data_len * 100, 2)}%"
  52. test_data_len = len(self._test_data)
  53. test_bad_len = len(self._test_data[self._test_data[y_column] == 1])
  54. test_bad_rate = f"{f_format_float(test_bad_len / test_data_len * 100, 2)}%"
  55. total = train_data_len + test_data_len
  56. bad_total = train_bad_len + test_bad_len
  57. bad_rate = f"{f_format_float(bad_total / total * 100, 2)}%"
  58. df["样本"] = ["训练集", "测试集", "合计"]
  59. df["样本数"] = [train_data_len, test_data_len, total]
  60. df["样本占比"] = [f"{f_format_float(train_data_len / total * 100, 2)}%",
  61. f"{f_format_float(test_data_len / total * 100, 2)}%", "100%"]
  62. df["坏样本数"] = [train_bad_len, test_bad_len, bad_total]
  63. df["坏样本比例"] = [train_bad_rate, test_bad_rate, bad_rate]
  64. return df
  65. if __name__ == "__main__":
  66. pass