Explorar o código

add: DataExplore

qiuya hai 5 meses
pai
achega
6346069e53
Modificouse 1 ficheiro con 35 adicións e 5 borrados
  1. 35 5
      data/insight/data_explore.py

+ 35 - 5
data/insight/data_explore.py

@@ -5,22 +5,52 @@
 @desc: 数据探索
 """
 import pandas as pd
-
 from commom import f_save_train_df
 
 
 class DataExplore():
 
-    def __init__(self, ):
+    def __init__(self):
         pass
 
     def distribution(self, df: pd.DataFrame) -> pd.DataFrame:
         """
         数据分布,缺失率,中位数,众数,偏离度等
         """
-        pass
-
-    def save(self, df):
+        summary = []
+
+        for column in df.columns:
+            # 创建一个新的字典
+            column_summary = {
+                'column': column,
+                'missing_rate': df[column].isnull().mean(),
+            }
+
+            # 判断数据类型
+            if df[column].dtype == 'object':
+                column_summary['type'] = 'categorical'
+            else:
+                column_summary['type'] = 'numerical'
+
+            if column_summary['type'] == 'numerical':
+                column_summary['min'] = df[column].min()
+                column_summary['25%'] = df[column].quantile(0.25)
+                column_summary['median'] = df[column].median()
+                column_summary['75%'] = df[column].quantile(0.75)
+                column_summary['max'] = df[column].max()
+                column_summary['skewness'] = df[column].skew()
+            else:
+                # 类别型变量
+                column_summary['mode'] = df[column].mode()[0] if not df[column].mode().empty else None
+                freq = df[column].value_counts(normalize=True).head(10)
+                column_summary['top_10_value'] = freq.index.tolist()
+                column_summary['top_10_ratio'] = freq.tolist()
+
+            summary.append(column_summary)
+
+        return pd.DataFrame(summary)
+
+    def save(self, df: pd.DataFrame):
         """
         数据探索结果固化
         """