data_explore.py 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author: yq
  4. @time: 2024/11/13
  5. @desc: 数据探索
  6. """
  7. import numbers
  8. import pandas as pd
  9. from pandas.core.dtypes.common import is_numeric_dtype
  10. class DataExplore():
  11. def __init__(self):
  12. pass
  13. @staticmethod
  14. def check_type(df: pd.DataFrame):
  15. check_msg = ""
  16. for column in df.columns:
  17. if not is_numeric_dtype(df[column]):
  18. values = list(df[column])
  19. cnt_str = 0
  20. cnt_number = 0
  21. cnt_other = 0
  22. for value in values:
  23. if isinstance(value, numbers.Number):
  24. cnt_number += 1
  25. elif isinstance(value, str):
  26. cnt_str += 1
  27. else:
  28. cnt_other += 1
  29. if len(values) != cnt_str:
  30. check_msg = f"{check_msg}【{column}】数值型数量{cnt_number} 字符型数量{cnt_str} 其它类型数量{cnt_other}\n"
  31. return check_msg
  32. @staticmethod
  33. def distribution(df: pd.DataFrame) -> pd.DataFrame:
  34. """
  35. 数据分布,缺失率,中位数,众数,偏离度等
  36. """
  37. summary = []
  38. for column in df.columns:
  39. # 创建一个新的字典
  40. column_summary = {
  41. 'column': column,
  42. 'missing_rate': df[column].isnull().mean(),
  43. }
  44. # 判断数据类型
  45. if df[column].dtype == 'object':
  46. column_summary['type'] = 'categorical'
  47. else:
  48. column_summary['type'] = 'numerical'
  49. if column_summary['type'] == 'numerical':
  50. column_summary['min'] = df[column].min()
  51. column_summary['25%'] = df[column].quantile(0.25)
  52. column_summary['median'] = df[column].median()
  53. column_summary['75%'] = df[column].quantile(0.75)
  54. column_summary['max'] = df[column].max()
  55. column_summary['skewness'] = df[column].skew()
  56. else:
  57. # 类别型变量
  58. column_summary['mode'] = df[column].mode()[0] if not df[column].mode().empty else None
  59. freq = df[column].value_counts(normalize=True).head(10)
  60. column_summary['top_10_value'] = freq.index.tolist()
  61. column_summary['top_10_ratio'] = freq.tolist()
  62. summary.append(column_summary)
  63. return pd.DataFrame(summary)
if __name__ == "__main__":
    # Placeholder entry point: running this module directly does nothing.
    pass