# -*- coding: utf-8 -*-
"""
@author: yq
@time: 2024/11/1
@desc: Entity classes carrying selected-feature metadata and train/val/test data.
"""
import pandas as pd

from commom import f_format_float


def _count_label(data: pd.DataFrame, y_column: str, label: int) -> int:
    """Return the number of rows of ``data`` whose ``y_column`` equals ``label``."""
    return len(data[data[y_column] == label])


def _format_percent(numerator: int, denominator: int) -> str:
    """Format ``numerator / denominator`` as a percentage string.

    Returns the placeholder ``"-"`` when ``denominator`` is 0, so that an
    empty data set yields a readable cell instead of a ZeroDivisionError.
    Otherwise the ratio is multiplied by 100, passed to
    ``f_format_float(value, 2)`` and suffixed with ``%``.
    """
    # NOTE(review): f_format_float is defined in `commom`; presumably it
    # rounds/formats to the given number of decimals -- confirm there.
    if denominator == 0:
        return "-"
    return f"{f_format_float(numerator / denominator * 100, 2)}%"


class CandidateFeatureEntity:
    """Information about a feature that remains after feature selection."""

    def __init__(self, x_column: str, breaks_list: list = None, iv_max: float = None):
        """Store the feature's column name, its break points and its ``iv_max``.

        Args:
            x_column: Name of the feature column.
            breaks_list: Break points associated with the feature; may be None.
            iv_max: Value stored as the feature's ``iv_max``; may be None.
        """
        self._x_column = x_column
        self._breaks_list = breaks_list
        self._iv_max = iv_max

    @property
    def x_column(self) -> str:
        """Name of the feature column."""
        return self._x_column

    @property
    def breaks_list(self) -> list:
        """Break points given at construction (None if not supplied)."""
        return self._breaks_list

    @property
    def iv_max(self) -> float:
        """``iv_max`` given at construction (None if not supplied)."""
        return self._iv_max


class DataFeatureEntity:
    """A data set whose feature columns and label column are prepared."""

    def __init__(self, data: pd.DataFrame, x_columns: list, y_column: str):
        """Store the frame together with its feature and label column names.

        Args:
            data: Frame holding both the feature columns and the label column.
            x_columns: Names of the feature columns in ``data``.
            y_column: Name of the label column in ``data``.
        """
        self._data = data
        self._x_columns = x_columns
        self._y_column = y_column

    @property
    def data(self) -> pd.DataFrame:
        """The underlying frame."""
        return self._data

    @property
    def x_columns(self) -> list:
        """Names of the feature columns."""
        return self._x_columns

    @property
    def y_column(self) -> str:
        """Name of the label column."""
        return self._y_column

    def get_Xdata(self):
        """Return ``data`` restricted to the feature columns."""
        return self._data[self._x_columns]

    def get_Ydata(self):
        """Return the label column of ``data``."""
        return self._data[self._y_column]

    def get_odds0(self) -> float:
        """Return the ratio of label-1 rows to label-0 rows in ``data``.

        Raises:
            ZeroDivisionError: If ``data`` contains no row whose label is 0.
        """
        good_count = _count_label(self._data, self._y_column, 0)
        bad_count = _count_label(self._data, self._y_column, 1)
        return bad_count / good_count


class DataPreparedEntity:
    """Train / validation / test data sets with their features prepared."""

    def __init__(self, train_data: DataFeatureEntity, val_data: DataFeatureEntity,
                 test_data: DataFeatureEntity, *args, **kwargs):
        """Store the three prepared data sets.

        Any extra positional and keyword arguments are kept unchanged on
        ``self.args`` and ``self.kwargs``.
        """
        self._train_data = train_data
        self._val_data = val_data
        self._test_data = test_data
        self.args = args
        self.kwargs = kwargs

    @property
    def train_data(self) -> DataFeatureEntity:
        """Prepared training data."""
        return self._train_data

    @property
    def val_data(self) -> DataFeatureEntity:
        """Prepared validation data."""
        return self._val_data

    @property
    def test_data(self) -> DataFeatureEntity:
        """Prepared test data."""
        return self._test_data


class DataSplitEntity:
    """Initial data split into training, validation and test frames."""

    def __init__(self, train_data: pd.DataFrame, val_data: pd.DataFrame,
                 test_data: pd.DataFrame):
        """Store the three frames.

        ``test_data`` may be None; ``get_distribution`` then reports the test
        row with zero counts and ``"-"`` as its bad rate.
        """
        self._train_data = train_data
        self._val_data = val_data
        self._test_data = test_data

    @property
    def train_data(self) -> pd.DataFrame:
        """Training frame."""
        return self._train_data

    @property
    def val_data(self) -> pd.DataFrame:
        """Validation frame."""
        return self._val_data

    @property
    def test_data(self) -> pd.DataFrame:
        """Test frame (may be None)."""
        return self._test_data

    def get_distribution(self, y_column) -> pd.DataFrame:
        """Build a summary table of sample counts and bad-sample rates.

        The table has three rows (training set, test set, total) and five
        columns: sample name, sample count, sample share, bad-sample count
        and bad-sample rate. A row counts as "bad" when ``y_column`` equals 1.
        The validation frame is not included in the table.

        Args:
            y_column: Name of the label column present in the frames.

        Returns:
            A DataFrame with the summary. Any percentage whose denominator is
            zero (missing or empty test set, empty training set) is rendered
            as ``"-"`` instead of raising ZeroDivisionError.
        """
        train_data_len = len(self._train_data)
        train_bad_len = _count_label(self._train_data, y_column, 1)

        # A missing test set contributes zero rows to every count.
        test_data_len = 0
        test_bad_len = 0
        if self._test_data is not None:
            test_data_len = len(self._test_data)
            test_bad_len = _count_label(self._test_data, y_column, 1)

        total = train_data_len + test_data_len
        bad_total = train_bad_len + test_bad_len

        df = pd.DataFrame()
        df["样本"] = ["训练集", "测试集", "合计"]
        df["样本数"] = [train_data_len, test_data_len, total]
        df["样本占比"] = [
            _format_percent(train_data_len, total),
            _format_percent(test_data_len, total),
            "100%",
        ]
        df["坏样本数"] = [train_bad_len, test_bad_len, bad_total]
        df["坏样本比例"] = [
            _format_percent(train_bad_len, train_data_len),
            _format_percent(test_bad_len, test_data_len),
            _format_percent(bad_total, total),
        ]
        return df


if __name__ == "__main__":
    pass