Forráskód Böngészése

add: web 数据页面

yq 4 hónapja
szülő
commit
5d4bc990b3
5 módosított fájl, 24 hozzáadás és 18 törlés
  1. 10 5
      app.py
  2. 3 2
      data/__init__.py
  3. 2 8
      data/insight/data_explore.py
  4. 1 1
      data/loader/data_loader_excel.py
  5. 8 2
      webui/utils.py

+ 10 - 5
app.py

@@ -12,7 +12,7 @@ from webui import f_project_is_exist, f_data_upload, engine
 input_elems = set()
 elem_dict = {}
 
-with gr.Blocks("Easy-ML") as demo:
+with gr.Blocks() as demo:
     gr.HTML('<h1 ><center><font size="5">Easy-ML</font></center></h1>')
     gr.HTML('<h2 ><center><font size="2">快速建模工具</font></center></h2>')
     with gr.Tabs():
@@ -22,30 +22,35 @@ with gr.Blocks("Easy-ML") as demo:
                                           info="项目名称将会被作为缓存目录名称,如果重复会导致结果被覆盖")
             with gr.Row():
                 file_data = gr.File(label="建模数据")
+            with gr.Row():
+                data_upload = gr.Dataframe(visible=False, label="当前上传数据", max_height=300)
+            with gr.Row():
+                data_insight = gr.Dataframe(visible=False, label="数据探查", max_height=600, wrap=True)
 
         with gr.TabItem("训练"):
             with gr.Row():
                 with gr.Column():
                     model_type = gr.Dropdown(["lr"], value="lr", label="模型")
                     search_strategy = gr.Dropdown(["iv"], value="iv", label="特征搜索策略")
-                    gr.Textbox(label="Y标签")
-                    gr.Textbox(label="X特征")
+                    y_column = gr.Textbox(label="Y标签")
+                    x_columns = gr.Textbox(label="X特征")
                     gr.Slider(0.05, 1, value=0.1, label="分箱组合采样率", step=0.01),
                     train_button = gr.Button("开始训练", variant="primary")
                 with gr.Column():
                     gr.Textbox(value="输出")
 
-        input_elems.update({project_name, file_data, model_type, search_strategy})
+        input_elems.update({project_name, file_data, data_upload, model_type, search_strategy})
         elem_dict.update(dict(
             project_name=project_name,
             file_data=file_data,
+            data_upload=data_upload,
             model_type=model_type,
             search_strategy=search_strategy
         ))
         engine.add_elems(elem_dict)
 
         project_name.change(fn=f_project_is_exist, inputs=input_elems)
-        file_data.upload(fn=f_data_upload, inputs=input_elems, outputs=[])
+        file_data.upload(fn=f_data_upload, inputs=input_elems, outputs=[data_upload, data_insight])
 
     demo.launch(share=True)
 

+ 3 - 2
data/__init__.py

@@ -4,11 +4,12 @@
 @time: 2024/10/30
 @desc: 数据加载、加工相关
 """
-from .loader.data_loader_excel import DataLoaderExcel
+from .insight.data_explore import DataExplore
 from .loader.data_loader_base import DataLoaderBase
+from .loader.data_loader_excel import DataLoaderExcel
 from .loader.data_loader_mysql import DataLoaderMysql
 
-__all__ = ['DataLoaderBase', 'DataLoaderMysql', 'DataLoaderExcel']
+__all__ = ['DataLoaderBase', 'DataLoaderMysql', 'DataLoaderExcel', 'DataExplore']
 
 if __name__ == "__main__":
     pass

+ 2 - 8
data/insight/data_explore.py

@@ -5,7 +5,6 @@
 @desc: 数据探索
 """
 import pandas as pd
-from commom import f_save_train_df
 
 
 class DataExplore():
@@ -13,7 +12,8 @@ class DataExplore():
     def __init__(self):
         pass
 
-    def distribution(self, df: pd.DataFrame) -> pd.DataFrame:
+    @staticmethod
+    def distribution(df: pd.DataFrame) -> pd.DataFrame:
         """
         数据分布,缺失率,中位数,众数,偏离度等
         """
@@ -50,12 +50,6 @@ class DataExplore():
 
         return pd.DataFrame(summary)
 
-    def save(self, df: pd.DataFrame):
-        """
-        数据探索结果固化
-        """
-        f_save_train_df("distribution", df)
-
 
 if __name__ == "__main__":
     pass

+ 1 - 1
data/loader/data_loader_excel.py

@@ -25,7 +25,7 @@ class DataLoaderExcel(DataLoaderBase):
     def get_data(self, file_path: str, sheet_name: str = 0) -> pd.DataFrame:
         df: pd.DataFrame = pd.DataFrame()
         if ".xlsx" in file_path:
-            df = pd.read_excel(file_path, sheet_name=sheet_name, index_col=False, dtype=str)
+            df = pd.read_excel(file_path, sheet_name=sheet_name, index_col=False)
         elif ".csv" in file_path:
             df = pd.read_csv(file_path)
         columns = df.columns.to_list()

+ 8 - 2
webui/utils.py

@@ -12,7 +12,7 @@ import gradio as gr
 import pandas as pd
 
 from config import BaseConfig
-from data import DataLoaderExcel
+from data import DataLoaderExcel, DataExplore
 from .manager import engine
 
 DATA_DIR = "data"
@@ -46,6 +46,7 @@ def _get_upload_data(data) -> pd.DataFrame:
     save_path = os.path.join(base_dir, DATA_DIR)
     file_path = _get_prefix_file(save_path, UPLOAD_DATA_PREFIX)
     df = data_loader.get_data(file_path)
+    return df
 
 
 def f_project_is_exist(data):
@@ -63,7 +64,8 @@ def f_get_save_path(data, file_name: str, sub_dir="", name_prefix=""):
     # 有前缀标示的先删除
     if name_prefix:
         file = _get_prefix_file(save_path, name_prefix)
-        os.remove(file)
+        if file:
+            os.remove(file)
     save_path = os.path.join(save_path, name_prefix + os.path.basename(file_name))
     return save_path
 
@@ -74,3 +76,7 @@ def f_data_upload(data):
     file_data = engine.get(data, "file_data")
     data_path = f_get_save_path(data, file_data.name, DATA_DIR, UPLOAD_DATA_PREFIX)
     shutil.copy(file_data.name, data_path)
+    df = _get_upload_data(data)
+    distribution = DataExplore.distribution(df)
+
+    return gr.update(value=df, visible=True), gr.update(value=distribution, visible=True),