|
@@ -5,22 +5,52 @@
|
|
@desc: 数据探索
|
|
@desc: 数据探索
|
|
"""
|
|
"""
|
|
import pandas as pd
|
|
import pandas as pd
|
|
-
|
|
|
|
from commom import f_save_train_df
|
|
from commom import f_save_train_df
|
|
|
|
|
|
|
|
|
|
class DataExplore():
|
|
class DataExplore():
|
|
|
|
|
|
- def __init__(self, ):
|
|
|
|
|
|
+ def __init__(self):
|
|
pass
|
|
pass
|
|
|
|
|
|
def distribution(self, df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the distribution of every column in ``df``.

    Each column yields one row in the result with its name, missing rate
    and a ``type`` of either ``'numerical'`` or ``'categorical'``.

    * Numerical columns (numeric dtype, excluding booleans) also get
      ``min``, ``25%``, ``median``, ``75%``, ``max`` and ``skewness``.
    * All other columns (object, category, bool, datetime, ...) get
      ``mode`` plus the ten most frequent values (``top_10_value``) and
      their relative frequencies among non-null entries
      (``top_10_ratio``).

    Args:
        df: The data to explore.

    Returns:
        A DataFrame with one row per column of ``df``. Statistics that do
        not apply to a column's type are left missing in that row.
    """
    summary = []

    # ``items()`` yields one Series per column even when labels repeat,
    # whereas ``df[label]`` would return a DataFrame for duplicated labels.
    for column, series in df.items():
        # Only true numeric dtypes support quantile/skew. Booleans are
        # reported as numeric by pandas but are better summarised by
        # their value frequencies, so they are treated as categorical.
        is_numerical = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )

        column_summary = {
            'column': column,
            'missing_rate': series.isnull().mean(),
            'type': 'numerical' if is_numerical else 'categorical',
        }

        if is_numerical:
            column_summary['min'] = series.min()
            column_summary['25%'] = series.quantile(0.25)
            column_summary['median'] = series.median()
            column_summary['75%'] = series.quantile(0.75)
            column_summary['max'] = series.max()
            column_summary['skewness'] = series.skew()
        else:
            # Compute the mode once; it is empty when every value is null.
            modes = series.mode()
            column_summary['mode'] = modes.iloc[0] if not modes.empty else None
            freq = series.value_counts(normalize=True).head(10)
            column_summary['top_10_value'] = freq.index.tolist()
            column_summary['top_10_ratio'] = freq.tolist()

        summary.append(column_summary)

    return pd.DataFrame(summary)
|
|
|
|
+
|
|
|
|
+ def save(self, df: pd.DataFrame):
|
|
"""
|
|
"""
|
|
数据探索结果固化
|
|
数据探索结果固化
|
|
"""
|
|
"""
|