123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148 |
- # -*- coding: utf-8 -*-
- """
- @author: yq
- @time: 2024/11/1
- @desc:
- """
- import pandas as pd
- from commom import f_format_float
class CandidateFeatureEntity:
    """Information about a feature that survived feature selection.

    A read-only holder: the values given to the constructor are exposed
    unchanged through properties.
    """

    def __init__(self, x_column: str, breaks_list: list = None, iv_max: float = None):
        """Store the feature's column name, its break points and its max IV.

        Args:
            x_column: Name of the feature column.
            breaks_list: Break points associated with the feature, or None.
            iv_max: Maximum IV recorded for the feature, or None.
        """
        self._iv_max = iv_max
        self._breaks_list = breaks_list
        self._x_column = x_column

    @property
    def x_column(self) -> str:
        """Name of the feature column."""
        return self._x_column

    @property
    def breaks_list(self) -> list:
        """Break points passed at construction (may be None)."""
        return self._breaks_list

    @property
    def iv_max(self) -> float:
        """Maximum IV passed at construction (may be None)."""
        return self._iv_max
class DataFeatureEntity:
    """A prepared data set: one DataFrame plus its feature and label column names."""

    def __init__(self, data: pd.DataFrame, x_columns: list, y_column: str):
        """Store the frame and the names of its feature / label columns.

        Args:
            data: Frame holding both the feature columns and the label column.
            x_columns: Names of the feature columns in ``data``.
            y_column: Name of the label column in ``data``.
        """
        self._data = data
        self._x_columns = x_columns
        self._y_column = y_column

    @property
    def data(self) -> pd.DataFrame:
        """The underlying DataFrame."""
        return self._data

    @property
    def x_columns(self) -> list:
        """Names of the feature columns."""
        return self._x_columns

    @property
    def y_column(self) -> str:
        """Name of the label column."""
        return self._y_column

    def get_Xdata(self):
        """Return the feature columns of the data."""
        return self._data[self._x_columns]

    def get_Ydata(self):
        """Return the label column of the data."""
        return self._data[self._y_column]

    def get_odds0(self) -> float:
        """Return the ratio of bad samples (label 1) to good samples (label 0).

        Raises:
            ZeroDivisionError: If the data contains no sample labelled 0.
        """
        labels = self._data[self._y_column]
        # Sum the boolean masks instead of materialising two filtered copies
        # of the whole frame just to take their length.
        good_count = int((labels == 0).sum())
        bad_count = int((labels == 1).sum())
        if good_count == 0:
            # Same exception type the bare division raised, but with a
            # message that points at the actual cause.
            raise ZeroDivisionError(
                f"cannot compute odds: column {self._y_column!r} has no samples labelled 0"
            )
        return bad_count / good_count
class DataPreparedEntity:
    """Prepared feature data for the train, validation and test sets.

    Extra positional and keyword arguments are kept as-is on the public
    ``args`` and ``kwargs`` attributes.
    """

    def __init__(self, train_data: DataFeatureEntity, val_data: DataFeatureEntity, test_data: DataFeatureEntity,
                 *args, **kwargs):
        """Store the three data sets and any extra arguments.

        Args:
            train_data: Prepared training set.
            val_data: Prepared validation set.
            test_data: Prepared test set.
            *args: Extra positional arguments, stored on ``self.args``.
            **kwargs: Extra keyword arguments, stored on ``self.kwargs``.
        """
        self.args = args
        self.kwargs = kwargs
        self._train_data = train_data
        self._val_data = val_data
        self._test_data = test_data

    @property
    def train_data(self):
        """The prepared training set."""
        return self._train_data

    @property
    def val_data(self):
        """The prepared validation set."""
        return self._val_data

    @property
    def test_data(self):
        """The prepared test set."""
        return self._test_data
class DataSplitEntity:
    """Initial split of the raw data into train, validation and test frames."""

    def __init__(self, train_data: pd.DataFrame, val_data: pd.DataFrame = None, test_data: pd.DataFrame = None):
        """Store the split frames.

        Args:
            train_data: Training frame.
            val_data: Validation frame, or None.
            test_data: Test frame, or None.
        """
        self._train_data = train_data
        self._val_data = val_data
        self._test_data = test_data

    @property
    def train_data(self):
        """The training frame."""
        return self._train_data

    @property
    def val_data(self):
        """The validation frame (may be None)."""
        return self._val_data

    @property
    def test_data(self):
        """The test frame (may be None)."""
        return self._test_data

    @staticmethod
    def _count_bad(frame: pd.DataFrame, y_column) -> int:
        """Count the rows of ``frame`` whose ``y_column`` equals 1."""
        # Summing the boolean mask avoids copying the matching rows.
        return int((frame[y_column] == 1).sum())

    @staticmethod
    def _format_rate(part, whole) -> str:
        """Format ``part / whole`` as a percentage string, or "-" if ``whole`` is 0."""
        if whole == 0:
            # An empty sample has no meaningful rate; use the same "-"
            # placeholder that is shown when there is no test set.
            return "-"
        return f"{f_format_float(part / whole * 100, 2)}%"

    def get_distribution(self, y_column) -> pd.DataFrame:
        """Summarise sample counts and bad-sample rates for train, test and total.

        Only the train and test frames are counted; the validation frame is
        not read. A missing (None) test frame counts as zero samples. Any
        rate whose denominator is zero is rendered as "-" instead of raising
        ZeroDivisionError.

        Args:
            y_column: Name of the label column; rows equal to 1 count as bad.

        Returns:
            A three-row DataFrame (train, test, total) with columns for sample
            count, sample share, bad-sample count and bad-sample rate.
        """
        train_len = len(self._train_data)
        train_bad = self._count_bad(self._train_data, y_column)

        test_len = 0
        test_bad = 0
        if self._test_data is not None:
            test_len = len(self._test_data)
            test_bad = self._count_bad(self._test_data, y_column)

        total = train_len + test_len
        bad_total = train_bad + test_bad

        return pd.DataFrame({
            "样本": ["训练集", "测试集", "合计"],
            "样本数": [train_len, test_len, total],
            "样本占比": [
                self._format_rate(train_len, total),
                self._format_rate(test_len, total),
                "100%",
            ],
            "坏样本数": [train_bad, test_bad, bad_total],
            "坏样本比例": [
                self._format_rate(train_bad, train_len),
                self._format_rate(test_bad, test_len),
                self._format_rate(bad_total, total),
            ],
        })
if __name__ == "__main__":
    # Nothing is executed when this module is run as a script.
    pass
|