# -*- coding: utf-8 -*-
"""
@author: yq
@time: 2024/11/13
@desc: Data exploration helpers.
"""
import numbers

import pandas as pd
from pandas.core.dtypes.common import is_numeric_dtype

from enums import ConstantEnum


class DataExplore():
    """Namespace for static helpers that inspect a ``pandas.DataFrame``."""

    def __init__(self):
        pass

    @staticmethod
    def check_type(df: pd.DataFrame):
        """Report column-name clashes and non-numeric columns that are not pure strings.

        Two checks are performed:

        1. If a column is named ``ConstantEnum.SCORE.value``, a message asking
           for that column to be renamed is emitted (presumably the name is
           reserved elsewhere in the project -- confirm against callers).
        2. For every column whose dtype is not numeric, the values are counted
           by Python type (number / str / other).  A message with the three
           counts is emitted unless every value is a ``str``.

        Args:
            df: The frame to inspect.

        Returns:
            str: All messages concatenated, each terminated by a newline; an
            empty string when nothing was found.
        """
        messages = []

        if ConstantEnum.SCORE.value in df.columns:
            messages.append(f"请修改列【{ConstantEnum.SCORE.value}】的名称\n")

        for column in df.columns:
            series = df[column]
            if is_numeric_dtype(series):
                continue

            cnt_number = 0
            cnt_str = 0
            cnt_other = 0
            for value in series:
                # Missing values stored as float NaN are counted as numbers.
                if isinstance(value, numbers.Number):
                    cnt_number += 1
                elif isinstance(value, str):
                    cnt_str += 1
                else:
                    cnt_other += 1

            # Only columns that are not made up entirely of strings are reported.
            if len(series) != cnt_str:
                messages.append(
                    f"【{column}】数值型数量{cnt_number} 字符型数量{cnt_str} 其它类型数量{cnt_other}\n"
                )

        return "".join(messages)

    @staticmethod
    def distribution(df: pd.DataFrame) -> pd.DataFrame:
        """Summarise the distribution of every column.

        Each column yields one row containing ``column``, ``missing_rate`` and
        ``type``.  Numeric columns additionally get ``min``, ``25%``,
        ``median``, ``75%``, ``max`` and ``skewness``; all other columns get
        ``mode`` (``None`` when no mode exists) plus ``top_10_value`` /
        ``top_10_ratio``, the ten most frequent values and their relative
        frequencies.

        Args:
            df: The frame to summarise.

        Returns:
            pd.DataFrame: One row per input column; statistics that do not
            apply to a column's type are left as missing values.
        """
        summary = []
        for column in df.columns:
            series = df[column]
            column_summary = {
                'column': column,
                'missing_rate': series.isnull().mean(),
            }

            # Classify with is_numeric_dtype (the same test check_type uses)
            # rather than `dtype == 'object'`, so that category, string and
            # datetime columns are not sent down the numeric path, where
            # quantile()/skew() would raise.
            if is_numeric_dtype(series):
                column_summary['type'] = 'numerical'
                column_summary['min'] = series.min()
                column_summary['25%'] = series.quantile(0.25)
                column_summary['median'] = series.median()
                column_summary['75%'] = series.quantile(0.75)
                column_summary['max'] = series.max()
                column_summary['skewness'] = series.skew()
            else:
                column_summary['type'] = 'categorical'
                # mode() is empty when the column holds no non-missing values.
                modes = series.mode()
                column_summary['mode'] = modes.iloc[0] if not modes.empty else None
                freq = series.value_counts(normalize=True).head(10)
                column_summary['top_10_value'] = freq.index.tolist()
                column_summary['top_10_ratio'] = freq.tolist()

            summary.append(column_summary)

        return pd.DataFrame(summary)


if __name__ == "__main__":
    pass