# -*- coding: utf-8 -*-
# File: data_explore.py (2.7 KB)
"""
@author: yq
@time: 2024/11/13
@desc: Data exploration
"""
  7. import numbers
  8. import pandas as pd
  9. from pandas.core.dtypes.common import is_numeric_dtype
  10. from enums import ConstantEnum
  11. class DataExplore():
  12. def __init__(self):
  13. pass
  14. @staticmethod
  15. def check_type(df: pd.DataFrame):
  16. check_msg = ""
  17. if ConstantEnum.SCORE.value in df.columns:
  18. check_msg = f"{check_msg}请修改列【{ConstantEnum.SCORE.value}】的名称\n"
  19. for column in df.columns:
  20. if not is_numeric_dtype(df[column]):
  21. values = list(df[column])
  22. cnt_str = 0
  23. cnt_number = 0
  24. cnt_other = 0
  25. for value in values:
  26. if isinstance(value, numbers.Number):
  27. cnt_number += 1
  28. elif isinstance(value, str):
  29. cnt_str += 1
  30. else:
  31. cnt_other += 1
  32. if len(values) != cnt_str:
  33. check_msg = f"{check_msg}【{column}】数值型数量{cnt_number} 字符型数量{cnt_str} 其它类型数量{cnt_other}\n"
  34. return check_msg
  35. @staticmethod
  36. def distribution(df: pd.DataFrame) -> pd.DataFrame:
  37. """
  38. 数据分布,缺失率,中位数,众数,偏离度等
  39. """
  40. summary = []
  41. for column in df.columns:
  42. # 创建一个新的字典
  43. column_summary = {
  44. 'column': column,
  45. 'missing_rate': df[column].isnull().mean(),
  46. }
  47. # 判断数据类型
  48. if df[column].dtype == 'object':
  49. column_summary['type'] = 'categorical'
  50. else:
  51. column_summary['type'] = 'numerical'
  52. if column_summary['type'] == 'numerical':
  53. column_summary['min'] = df[column].min()
  54. column_summary['25%'] = df[column].quantile(0.25)
  55. column_summary['median'] = df[column].median()
  56. column_summary['75%'] = df[column].quantile(0.75)
  57. column_summary['max'] = df[column].max()
  58. column_summary['skewness'] = df[column].skew()
  59. else:
  60. # 类别型变量
  61. column_summary['mode'] = df[column].mode()[0] if not df[column].mode().empty else None
  62. freq = df[column].value_counts(normalize=True).head(10)
  63. column_summary['top_10_value'] = freq.index.tolist()
  64. column_summary['top_10_ratio'] = freq.tolist()
  65. summary.append(column_summary)
  66. return pd.DataFrame(summary)
  67. if __name__ == "__main__":
  68. pass