utils.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author: yq
  4. @time: 2024/12/5
  5. @desc:
  6. """
  7. import os
  8. import shutil
  9. from typing import List
  10. import gradio as gr
  11. import pandas as pd
  12. from sklearn.model_selection import train_test_split
  13. from config import BaseConfig
  14. from data import DataLoaderExcel, DataExplore
  15. from entitys import DataSplitEntity
  16. from feature import FilterStrategyFactory
  17. from model import ModelFactory
  18. from trainer import TrainPipeline
  19. from .manager import engine
# Name of the sub-directory (inside a project's base directory) that holds the
# uploaded data; it is skipped by _clean_base_dir.
DATA_SUB_DIR = "data"
# Marker prepended to the stored file name of an uploaded dataset so the file
# can be located again later.
UPLOAD_DATA_PREFIX = "prefix_upload_data_"
# Module-level loader instance used by _get_upload_data to read the uploaded file.
data_loader = DataLoaderExcel()
  23. def _clean_base_dir(data):
  24. base_dir = _get_base_dir(data)
  25. file_name_list: List[str] = os.listdir(base_dir)
  26. for file_name in file_name_list:
  27. if file_name in [DATA_SUB_DIR]:
  28. continue
  29. file_path = os.path.join(base_dir, file_name)
  30. if os.path.isdir(file_path):
  31. shutil.rmtree(file_path)
  32. else:
  33. os.remove(file_path)
  34. def _check_save_dir(data):
  35. project_name = engine.get(data, "project_name")
  36. if project_name is None or len(project_name) == 0:
  37. raise gr.Error(message='项目名称不能为空', duration=5)
  38. return True
  39. def _get_prefix_file(save_path, prefix):
  40. file_name_list: List[str] = os.listdir(save_path)
  41. for file_name in file_name_list:
  42. if prefix in file_name:
  43. return os.path.join(save_path, file_name)
  44. def _get_base_dir(data):
  45. project_name = engine.get(data, "project_name")
  46. base_dir = os.path.join(BaseConfig.train_path, project_name)
  47. return base_dir
  48. def _get_upload_data(data) -> pd.DataFrame:
  49. base_dir = _get_base_dir(data)
  50. save_path = os.path.join(base_dir, DATA_SUB_DIR)
  51. file_path = _get_prefix_file(save_path, UPLOAD_DATA_PREFIX)
  52. df = data_loader.get_data(file_path)
  53. return df
  54. def _get_auc_ks_images(data):
  55. base_dir = _get_base_dir(data)
  56. return [os.path.join(base_dir, "train_perf.png"), os.path.join(base_dir, "test_perf.png")]
  57. def f_project_is_exist(data):
  58. project_name = engine.get(data, "project_name")
  59. if project_name is None or len(project_name) == 0:
  60. gr.Warning(message='项目名称不能为空', duration=5)
  61. elif os.path.exists(_get_base_dir(data)):
  62. gr.Warning(message='项目名称已被使用', duration=5)
  63. def _get_save_path(data, file_name: str, sub_dir="", name_prefix=""):
  64. base_dir = _get_base_dir(data)
  65. save_path = os.path.join(base_dir, sub_dir)
  66. os.makedirs(save_path, exist_ok=True)
  67. # 有前缀标示的先删除
  68. if name_prefix:
  69. file = _get_prefix_file(save_path, name_prefix)
  70. if file:
  71. os.remove(file)
  72. save_path = os.path.join(save_path, name_prefix + os.path.basename(file_name))
  73. return save_path
  74. def f_data_upload(data):
  75. if not _check_save_dir(data):
  76. return
  77. file_data = engine.get(data, "file_data")
  78. data_path = _get_save_path(data, file_data.name, DATA_SUB_DIR, UPLOAD_DATA_PREFIX)
  79. shutil.copy(file_data.name, data_path)
  80. df = _get_upload_data(data)
  81. distribution = DataExplore.distribution(df)
  82. columns = df.columns.to_list()
  83. return {
  84. engine.get_elem_by_id("data_upload"): gr.update(value=df, visible=True),
  85. engine.get_elem_by_id("data_insight"): gr.update(value=distribution, visible=True),
  86. engine.get_elem_by_id("y_column"): gr.update(choices=columns),
  87. engine.get_elem_by_id("x_columns_candidate"): gr.update(choices=columns)
  88. }
  89. def f_download_report(data):
  90. file_path = _get_save_path(data, "模型报告.docx")
  91. if os.path.exists(file_path):
  92. return {engine.get_elem_by_id("download_report"): gr.update(value=file_path)}
  93. else:
  94. raise FileNotFoundError(f"{file_path} not found.")
  95. def f_verify_param(data):
  96. y_column = engine.get(data, "y_column")
  97. if y_column is None:
  98. raise gr.Error(message=f'Y标签列不能为空', duration=5)
  99. return True
  100. def f_train(data, progress=gr.Progress(track_tqdm=True)):
  101. def _reset_component_state():
  102. return {engine.get_elem_by_id("download_report"): gr.update(visible=False),
  103. engine.get_elem_by_id("auc_df"): gr.update(visible=False),
  104. engine.get_elem_by_id("gallery_auc"): gr.update(visible=False)}
  105. progress(0, desc="Starting")
  106. feature_search_strategy = engine.get(data, "feature_search_strategy")
  107. model_type = engine.get(data, "model_type")
  108. test_split_rate = engine.get(data, "test_split_rate")
  109. data_upload = engine.get(data, "data_upload")
  110. all_param = engine.get_all(data)
  111. # 清空储存目录
  112. _clean_base_dir(data)
  113. # 校验参数
  114. if not f_verify_param(data):
  115. yield _reset_component_state()
  116. yield _reset_component_state()
  117. # 数据集划分
  118. train_data, test_data = train_test_split(data_upload, test_size=test_split_rate, shuffle=True, random_state=2025)
  119. data_split = DataSplitEntity(train_data=train_data, val_data=None, test_data=test_data)
  120. progress(0.01)
  121. # 特征处理
  122. ## 获取特征筛选策略
  123. filter_strategy_clazz = FilterStrategyFactory.get_strategy(feature_search_strategy)
  124. filter_strategy = filter_strategy_clazz(**all_param)
  125. # 选择模型
  126. model_clazz = ModelFactory.get_model(model_type)
  127. model = model_clazz(**all_param)
  128. # 训练并生成报告
  129. train_pipeline = TrainPipeline(filter_strategy, model, data_split)
  130. metric_value_dict = train_pipeline.train()
  131. progress(0.95)
  132. train_pipeline.generate_report()
  133. auc_df = metric_value_dict["模型结果"].table
  134. report_file_path = _get_save_path(data, "模型报告.docx")
  135. yield {engine.get_elem_by_id("train_progress"): gr.update(value="训练完成"),
  136. engine.get_elem_by_id("auc_df"): gr.update(value=auc_df, visible=True),
  137. engine.get_elem_by_id("gallery_auc"): gr.update(value=_get_auc_ks_images(data), visible=True),
  138. engine.get_elem_by_id("download_report"): gr.update(value=report_file_path, visible=True)}