# -*- coding: utf-8 -*-
"""
@author: yq
@time: 2024/11/13
@desc: Data exploration
"""
import pandas as pd

from commom import f_save_train_df


class DataExplore:
    """Build per-column distribution summaries of a DataFrame and save them."""

    def __init__(self):
        pass

    def distribution(self, df: pd.DataFrame) -> pd.DataFrame:
        """Summarise every column of *df* in one row of the returned frame.

        Each row carries the column label, its missing rate and a ``type``
        of either ``'numerical'`` or ``'categorical'``.

        * Numerical columns (numeric dtype, booleans excluded) get ``min``,
          ``25%``, ``median``, ``75%``, ``max`` and ``skewness``.
        * All other columns get ``mode`` (``None`` when the column has no
          non-missing value), plus ``top_10_value`` / ``top_10_ratio``: the
          ten most frequent values and their share of the non-missing rows.

        Args:
            df: The data to explore.

        Returns:
            A DataFrame with one row per column of *df*. Statistics that do
            not apply to a column's type are left missing.
        """
        summary = []
        # ``items()`` yields each column as a Series even when labels are
        # duplicated, whereas ``df[label]`` would return a DataFrame then.
        for column, series in df.items():
            column_summary = {
                'column': column,
                'missing_rate': series.isnull().mean(),
            }
            if self._is_numerical(series):
                column_summary['type'] = 'numerical'
                column_summary.update(self._numerical_summary(series))
            else:
                column_summary['type'] = 'categorical'
                column_summary.update(self._categorical_summary(series))
            summary.append(column_summary)
        return pd.DataFrame(summary)

    @staticmethod
    def _is_numerical(series: pd.Series) -> bool:
        """Return True when quantile/skewness statistics apply to *series*."""
        # Comparing the dtype against 'object' alone would send boolean,
        # datetime, categorical and string-dtype columns down the numerical
        # path, where quantile()/skew() are not meaningful and may raise.
        return (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )

    @staticmethod
    def _numerical_summary(series: pd.Series) -> dict:
        """Return range, quartile and skewness statistics of *series*."""
        return {
            'min': series.min(),
            '25%': series.quantile(0.25),
            'median': series.median(),
            '75%': series.quantile(0.75),
            'max': series.max(),
            'skewness': series.skew(),
        }

    @staticmethod
    def _categorical_summary(series: pd.Series) -> dict:
        """Return the mode and the ten most frequent values of *series*."""
        modes = series.mode()
        # value_counts() drops missing values by default, so the ratios are
        # relative to the non-missing rows only.
        freq = series.value_counts(normalize=True).head(10)
        return {
            'mode': None if modes.empty else modes.iloc[0],
            'top_10_value': freq.index.tolist(),
            'top_10_ratio': freq.tolist(),
        }

    def save(self, df: pd.DataFrame) -> None:
        """Persist the exploration result.

        Passes *df* to ``f_save_train_df`` under the label ``"distribution"``;
        where and how it is stored is decided by that helper.
        """
        f_save_train_df("distribution", df)


if __name__ == "__main__":
    pass