# File: data_explore.py (2.9 KB)
# -*- coding: utf-8 -*-
"""
@author: yq
@time: 2024/11/13
@desc: Data exploration
"""
  7. import numbers
  8. import pandas as pd
  9. from pandas.core.dtypes.common import is_numeric_dtype
  10. from commom import f_is_number
  11. from enums import ConstantEnum
  12. class DataExplore():
  13. def __init__(self):
  14. pass
  15. @staticmethod
  16. def check_type(df: pd.DataFrame):
  17. check_msg = ""
  18. if ConstantEnum.SCORE.value in df.columns:
  19. check_msg = f"{check_msg}请修改列【{ConstantEnum.SCORE.value}】的名称\n"
  20. for column in df.columns:
  21. if not is_numeric_dtype(df[column]):
  22. values = list(df[column])
  23. cnt_str = 0
  24. cnt_number = 0
  25. cnt_other = 0
  26. cnt_strnum = 0
  27. for value in values:
  28. if isinstance(value, numbers.Number):
  29. cnt_number += 1
  30. elif isinstance(value, str):
  31. cnt_str += 1
  32. if f_is_number(value):
  33. cnt_strnum += 1
  34. else:
  35. cnt_other += 1
  36. if len(values) != cnt_str:
  37. check_msg = f"{check_msg}【{column}】数值型数量{cnt_number} 字符型数量{cnt_str} 字符型数值数量{cnt_strnum} " \
  38. f"其它类型数量{cnt_other}\n"
  39. return check_msg
  40. @staticmethod
  41. def distribution(df: pd.DataFrame) -> pd.DataFrame:
  42. """
  43. 数据分布,缺失率,中位数,众数,偏离度等
  44. """
  45. summary = []
  46. for column in df.columns:
  47. # 创建一个新的字典
  48. column_summary = {
  49. 'column': column,
  50. 'missing_rate': df[column].isnull().mean(),
  51. }
  52. # 判断数据类型
  53. if df[column].dtype == 'object':
  54. column_summary['type'] = 'categorical'
  55. else:
  56. column_summary['type'] = 'numerical'
  57. if column_summary['type'] == 'numerical':
  58. column_summary['min'] = df[column].min()
  59. column_summary['25%'] = df[column].quantile(0.25)
  60. column_summary['median'] = df[column].median()
  61. column_summary['75%'] = df[column].quantile(0.75)
  62. column_summary['max'] = df[column].max()
  63. column_summary['skewness'] = df[column].skew()
  64. else:
  65. # 类别型变量
  66. column_summary['mode'] = df[column].mode()[0] if not df[column].mode().empty else None
  67. freq = df[column].value_counts(normalize=True).head(10)
  68. column_summary['top_10_value'] = freq.index.tolist()
  69. column_summary['top_10_ratio'] = freq.tolist()
  70. summary.append(column_summary)
  71. return pd.DataFrame(summary)
  72. if __name__ == "__main__":
  73. pass