12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061 |
- # -*- coding: utf-8 -*-
- """
- @author: yq
- @time: 2024/11/13
- @desc: 数据探索
- """
- import pandas as pd
- from commom import f_save_train_df
class DataExplore:
    """Compute and persist per-column exploratory statistics of a DataFrame."""

    def __init__(self):
        """Create an explorer; the instance holds no state."""
        pass

    def distribution(self, df: pd.DataFrame) -> pd.DataFrame:
        """Summarise the distribution of every column in *df*.

        Each column yields one output row containing ``column``,
        ``missing_rate`` and ``type``. Numeric columns additionally get
        ``min``, ``25%``, ``median``, ``75%``, ``max`` and ``skewness``;
        all other columns get ``mode``, ``top_10_value`` and
        ``top_10_ratio`` (the ten most frequent values and their relative
        frequencies among non-null entries).

        Args:
            df: The data to profile.

        Returns:
            A DataFrame with one row per column of *df*. Statistics that do
            not apply to a column's type are left as missing values.
        """
        summary = []
        for column in df.columns:
            series = df[column]
            column_summary = {
                'column': column,
                'missing_rate': series.isnull().mean(),
            }

            # Classify by "is numeric" rather than "is object": category,
            # string and datetime-like dtypes are not object dtype, yet
            # quantile()/skew() are not defined for them and would raise.
            if pd.api.types.is_numeric_dtype(series):
                column_summary['type'] = 'numerical'
                column_summary['min'] = series.min()
                column_summary['25%'] = series.quantile(0.25)
                column_summary['median'] = series.median()
                column_summary['75%'] = series.quantile(0.75)
                column_summary['max'] = series.max()
                column_summary['skewness'] = series.skew()
            else:
                column_summary['type'] = 'categorical'
                # mode() is empty when the column has no non-null values.
                modes = series.mode()
                column_summary['mode'] = modes.iloc[0] if not modes.empty else None
                freq = series.value_counts(normalize=True).head(10)
                column_summary['top_10_value'] = freq.index.tolist()
                column_summary['top_10_ratio'] = freq.tolist()

            summary.append(column_summary)
        return pd.DataFrame(summary)

    def save(self, df: pd.DataFrame):
        """Persist an exploration result.

        Delegates to ``f_save_train_df`` with the label ``"distribution"``.

        Args:
            df: The summary DataFrame to store.
        """
        f_save_train_df("distribution", df)
# Script entry point: intentionally empty, so running this module directly does nothing.
if __name__ == "__main__":
    pass
|