data_explore.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455
# -*- coding: utf-8 -*-
"""
@author: yq
@time: 2024/11/13
@desc: Data exploration
"""
  7. import pandas as pd
  8. class DataExplore():
  9. def __init__(self):
  10. pass
  11. @staticmethod
  12. def distribution(df: pd.DataFrame) -> pd.DataFrame:
  13. """
  14. 数据分布,缺失率,中位数,众数,偏离度等
  15. """
  16. summary = []
  17. for column in df.columns:
  18. # 创建一个新的字典
  19. column_summary = {
  20. 'column': column,
  21. 'missing_rate': df[column].isnull().mean(),
  22. }
  23. # 判断数据类型
  24. if df[column].dtype == 'object':
  25. column_summary['type'] = 'categorical'
  26. else:
  27. column_summary['type'] = 'numerical'
  28. if column_summary['type'] == 'numerical':
  29. column_summary['min'] = df[column].min()
  30. column_summary['25%'] = df[column].quantile(0.25)
  31. column_summary['median'] = df[column].median()
  32. column_summary['75%'] = df[column].quantile(0.75)
  33. column_summary['max'] = df[column].max()
  34. column_summary['skewness'] = df[column].skew()
  35. else:
  36. # 类别型变量
  37. column_summary['mode'] = df[column].mode()[0] if not df[column].mode().empty else None
  38. freq = df[column].value_counts(normalize=True).head(10)
  39. column_summary['top_10_value'] = freq.index.tolist()
  40. column_summary['top_10_ratio'] = freq.tolist()
  41. summary.append(column_summary)
  42. return pd.DataFrame(summary)
# Script entry point: running this file directly is intentionally a no-op.
if __name__ == "__main__":
    pass