12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- # -*- coding: utf-8 -*-
- """
- @author: yq
- @time: 2024/11/13
- @desc: 数据探索
- """
- import numbers
- import pandas as pd
- from pandas.core.dtypes.common import is_numeric_dtype
- from enums import ConstantEnum
class DataExplore:
    """Stateless helpers for a first-pass exploration of a pandas DataFrame.

    Both helpers are static methods, so the class can be used without
    creating an instance.
    """

    @staticmethod
    def check_type(df: pd.DataFrame) -> str:
        """Report naming conflicts and mixed-type columns in ``df``.

        Two checks are performed:

        * a column named like ``ConstantEnum.SCORE.value`` produces a message
          asking for the column to be renamed;
        * every column whose dtype is not numeric is scanned value by value,
          and a message with the per-kind counts is produced unless every
          value is a ``str``.

        Values are bucketed with ``isinstance``: ``numbers.Number`` first
        (this includes ``bool`` and float ``NaN``), then ``str``, and
        everything else (e.g. ``None``) as "other".

        Args:
            df: The DataFrame to inspect.

        Returns:
            The newline-terminated messages concatenated in column order, or
            an empty string when nothing was flagged.
        """
        messages = []
        if ConstantEnum.SCORE.value in df.columns:
            messages.append(f"请修改列【{ConstantEnum.SCORE.value}】的名称\n")

        for column in df.columns:
            if is_numeric_dtype(df[column]):
                continue

            cnt_str = 0
            cnt_number = 0
            cnt_other = 0
            for value in list(df[column]):
                if isinstance(value, numbers.Number):
                    cnt_number += 1
                elif isinstance(value, str):
                    cnt_str += 1
                else:
                    cnt_other += 1

            # Each value lands in exactly one bucket, so "not every value is a
            # str" is the same as "some value is a number or something else".
            if cnt_number or cnt_other:
                messages.append(
                    f"【{column}】数值型数量{cnt_number} 字符型数量{cnt_str} 其它类型数量{cnt_other}\n"
                )
        return "".join(messages)

    @staticmethod
    def distribution(df: pd.DataFrame) -> pd.DataFrame:
        """Summarise the distribution of every column of ``df``.

        Each column yields one row holding ``column``, ``missing_rate`` and
        ``type``. Numeric columns (as decided by ``is_numeric_dtype``) are
        tagged ``'numerical'`` and additionally get ``min``, ``25%``,
        ``median``, ``75%``, ``max`` and ``skewness``. All other columns are
        tagged ``'categorical'`` and get ``mode`` (``None`` when there is no
        mode), plus ``top_10_value`` / ``top_10_ratio``: the ten most frequent
        values and their relative frequencies.

        Args:
            df: The DataFrame to summarise.

        Returns:
            A DataFrame with one row per input column; statistics that do not
            apply to a column's type are left missing.
        """
        summary = []
        for column in df.columns:
            series = df[column]
            column_summary = {
                'column': column,
                'missing_rate': series.isnull().mean(),
            }

            # Use the same predicate as check_type. Comparing the dtype with
            # 'object' alone routed category / datetime / string-dtype columns
            # to the numeric statistics, which raise for those dtypes.
            if is_numeric_dtype(series):
                column_summary['type'] = 'numerical'
                column_summary['min'] = series.min()
                column_summary['25%'] = series.quantile(0.25)
                column_summary['median'] = series.median()
                column_summary['75%'] = series.quantile(0.75)
                column_summary['max'] = series.max()
                column_summary['skewness'] = series.skew()
            else:
                column_summary['type'] = 'categorical'
                # mode() returns a Series (possibly empty, possibly with
                # ties); compute it once and keep the first entry.
                modes = series.mode()
                column_summary['mode'] = modes.iloc[0] if not modes.empty else None
                freq = series.value_counts(normalize=True).head(10)
                column_summary['top_10_value'] = freq.index.tolist()
                column_summary['top_10_ratio'] = freq.tolist()

            summary.append(column_summary)
        return pd.DataFrame(summary)
# Script entry point: deliberately a no-op, so running this file directly
# does nothing.
if __name__ == "__main__":
    pass
|