فهرست منبع

add: 处理excel模板

yq 1 ماه پیش
والد
کامیت
3f64a41d83

+ 25 - 0
config/monitor_config_template_excel.json

@@ -0,0 +1,25 @@
+{
+  "template_path": "./cache/后评估报表模板.xlsx",
+  "metric_config_list": [
+    {
+      "metric_code": "t1",
+      "metric_func": "MetricBySqlGeneral",
+      "sql": "select * from test.t1"
+    },
+        {
+      "metric_code": "apply_num",
+      "metric_func": "BMetric",
+      "v": "1"
+    },
+        {
+      "metric_code": "auto_pass_num",
+      "metric_func": "BMetric",
+      "v": "2"
+    },
+        {
+      "metric_code": "auto_pass_per",
+      "metric_func": "BMetric",
+      "v": "3"
+    }
+  ]
+}

+ 0 - 0
config/monitor_config_template.json → config/monitor_config_template_word.json


تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 88 - 108
easy_ml_demo.ipynb


+ 3 - 3
metric_test2.py

@@ -45,10 +45,10 @@ if __name__ == "__main__":
     f_register_metric_func(BMetric)
     data_loader = DataLoaderExcel()
 
-    a = data_loader.get_data("cache/报表自动化需求-2411.xlsx")
-    a.writr("cache/a.xlsx")
+    a = data_loader.get_data("cache/t1.xlsx")
+    # a.writr("cache/a.xlsx")
 
 
-    monitor_metric = MonitorMetric("./cache/model_feature_strategy1.json")
+    monitor_metric = MonitorMetric("./cache/model_monitor_config1.json")
     monitor_metric.calculate_metric(data_loader=data_loader)
     monitor_metric.generate_report()

+ 32 - 0
metric_test3.py

@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/1
+@desc: 
+"""
+
+from data import DataLoaderMysql
+from entitys import MetricFucResultEntity,DbConfigEntity
+from metrics import MetricBase, f_register_metric_func
+from monitor import MonitorMetric
+
+
+class BMetric(MetricBase):
+    """Demo metric that returns its configured ``v`` unchanged as the result."""
+
+    def __init__(self, v: str, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._v = v
+
+    def calculate(self, *args, **kwargs) -> MetricFucResultEntity:
+        """Wrap ``v`` as an image path when it contains ".png", else as a value."""
+        looks_like_image = ".png" in self._v
+        if not looks_like_image:
+            return MetricFucResultEntity(value=self._v)
+        return MetricFucResultEntity(image_path=self._v)
+
+if __name__ == "__main__":
+    # Register the demo metric class before the monitor config is loaded.
+    f_register_metric_func(BMetric)
+    # Build the MySQL loader first, then the monitor (same order as before).
+    db_config = DbConfigEntity.from_config("./config/mysql_config.json")
+    mysql_loader = DataLoaderMysql(db_config)
+    monitor = MonitorMetric("./config/monitor_config_template_excel.json")
+    monitor.calculate_metric(data_loader=mysql_loader)
+    monitor.generate_report()

+ 2 - 2
monitor/__init__.py

@@ -4,10 +4,10 @@
 @time: 2022/10/24
 @desc: 指标监控
 """
-
+# Package-relative import, matching the sibling import below; both report
+# generators are exported so callers can use ``from monitor import ...``.
+from .report_generate import ReportExcel, ReportWord
 from .monitor_metric import MonitorMetric
 
-__all__ = ['MonitorMetric']
+__all__ = ['MonitorMetric', 'ReportExcel', 'ReportWord']
 
 if __name__ == "__main__":
     pass

+ 10 - 2
monitor/monitor_metric.py

@@ -7,8 +7,10 @@
 import threading
 from typing import Dict
 
+from commom import GeneralException
 from entitys import MonitorConfigEntity, MetricFucResultEntity
-from .report_generate import Report
+from enums import ResultCodesEnum
+from .report_generate import ReportWord, ReportExcel
 
 
 class MonitorMetric():
@@ -34,7 +36,13 @@ class MonitorMetric():
             self._update_metric_value_dict(metric_code, metric_value)
 
     def generate_report(self):
-        Report.generate_report(self._metric_value_dict, self._monitor_config.template_path)
+        """Render the collected metric values into the configured template.
+
+        The report generator is chosen by the template's file extension.
+
+        Raises:
+            GeneralException: if the template is neither ``.docx`` nor ``.xlsx``.
+        """
+        template_path = self._monitor_config.template_path
+        # Match the extension as a suffix: a substring test would send a path
+        # such as "reports.docx/tpl.xlsx" to the Word generator.
+        if template_path.endswith(".docx"):
+            ReportWord.generate_report(self._metric_value_dict, template_path)
+        elif template_path.endswith(".xlsx"):
+            ReportExcel.generate_report(self._metric_value_dict, template_path)
+        else:
+            raise GeneralException(ResultCodesEnum.NOT_FOUND,
+                                   message=f"模板【{template_path}】不能处理,请使用【.docx】或【.xlsx】模板")
 
 
 if __name__ == "__main__":

+ 81 - 16
monitor/report_generate.py

@@ -7,6 +7,7 @@
 import os
 from typing import Dict
 
+import openpyxl
 import pandas as pd
 from docx import Document
 from docx.enum.table import WD_TABLE_ALIGNMENT, WD_CELL_VERTICAL_ALIGNMENT
@@ -14,6 +15,7 @@ from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
 from docx.oxml import OxmlElement
 from docx.oxml.ns import qn
 from docx.shared import Inches, Cm, Pt
+from openpyxl.worksheet.worksheet import Worksheet
 
 from commom import GeneralException, f_get_datetime
 from config import BaseConfig
@@ -21,7 +23,7 @@ from entitys import MetricFucResultEntity
 from enums import ResultCodesEnum, PlaceholderPrefixEnum
 
 
-class Report():
+class ReportWord():
 
     @staticmethod
     def _set_cell_width(table, table_cell_width):
@@ -36,7 +38,7 @@ class Report():
         for column in table.columns:
             max_text_len = 0
             for cell in column.cells:
-                cell_text_len = Report._get_text_length(cell.text)
+                cell_text_len = ReportWord._get_text_length(cell.text)
                 max_text_len = cell_text_len if cell_text_len > max_text_len else max_text_len
             max_text_len_list.append(max_text_len)
 
@@ -83,7 +85,7 @@ class Report():
             pre_cell.text = column_name
             for run in pre_cell.paragraphs[0].runs:
                 run.bold = True
-            Report._set_cell_format(pre_cell, table_font_size)
+            ReportWord._set_cell_format(pre_cell, table_font_size)
 
     @staticmethod
     def _set_table_singleBoard(table):
@@ -132,7 +134,7 @@ class Report():
         for paragraph in doc.paragraphs:
             text = paragraph.text
             for metric_code, metric_fuc_entity in metric_value_dict.items():
-                placeholder = Report._get_placeholder(PlaceholderPrefixEnum.VALUE, metric_code)
+                placeholder = ReportWord._get_placeholder(PlaceholderPrefixEnum.VALUE, metric_code)
                 metric_value = metric_fuc_entity.value
                 if metric_value is None:
                     continue
@@ -152,7 +154,7 @@ class Report():
         # 替换表格
         for paragraph in doc.paragraphs:
             for metric_code, metric_fuc_entity in metric_value_dict.items():
-                placeholder = Report._get_placeholder(PlaceholderPrefixEnum.TABLE, metric_code)
+                placeholder = ReportWord._get_placeholder(PlaceholderPrefixEnum.TABLE, metric_code)
                 metric_table = metric_fuc_entity.table
                 table_font_size = metric_fuc_entity.table_font_size
                 table_autofit = metric_fuc_entity.table_autofit
@@ -174,25 +176,25 @@ class Report():
                     cell.text = str(column_name)
                     for run in cell.paragraphs[0].runs:
                         run.bold = True
-                    Report._set_cell_format(cell, table_font_size)
+                    ReportWord._set_cell_format(cell, table_font_size)
                     # 合并相同的列名
                     if column_idx != 0 and BaseConfig.merge_table_column:
                         pre_cell = table.cell(0, column_idx - 1)
-                        Report._merge_cell_column(pre_cell, cell, table_font_size, table_cell_width)
+                        ReportWord._merge_cell_column(pre_cell, cell, table_font_size, table_cell_width)
                 # 值
                 for row_idx, row in metric_table.iterrows():
                     for column_idx, value in enumerate(row):
                         cell = table.cell(row_idx + 1, column_idx)
                         value = str(value) if pd.notna(value) else '/'
                         cell.text = str(value)
-                        Report._set_cell_format(cell, table_font_size)
+                        ReportWord._set_cell_format(cell, table_font_size)
                         # 合并第一行数据也为列的情况
                         if row_idx == 0:
-                            Report._merge_cell_column(table.cell(0, column_idx), cell, table_font_size,
-                                                      table_cell_width)
+                            ReportWord._merge_cell_column(table.cell(0, column_idx), cell, table_font_size,
+                                                          table_cell_width)
 
-                Report._set_table_singleBoard(table)
-                Report._set_cell_width(table, table_cell_width)
+                ReportWord._set_table_singleBoard(table)
+                ReportWord._set_cell_width(table, table_cell_width)
                 # 禁止自动调整表格
                 if len(metric_table.columns) <= 20 and not table_autofit:
                     table.autofit = False
@@ -202,7 +204,7 @@ class Report():
         # 替换图片
         for paragraph in doc.paragraphs:
             for metric_code, metric_fuc_entity in metric_value_dict.items():
-                placeholder = Report._get_placeholder(PlaceholderPrefixEnum.IMAGE, metric_code)
+                placeholder = ReportWord._get_placeholder(PlaceholderPrefixEnum.IMAGE, metric_code)
                 image_path = metric_fuc_entity.image_path
                 image_size = metric_fuc_entity.image_size
                 if image_path is None:
@@ -229,14 +231,77 @@ class Report():
         else:
             raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"监控模板文件【{template_path}】不存在")
 
-        Report._fill_value_placeholder(doc, metric_value_dict)
-        Report._fill_table_placeholder(doc, metric_value_dict)
-        Report._fill_image_placeholder(doc, metric_value_dict)
+        ReportWord._fill_value_placeholder(doc, metric_value_dict)
+        ReportWord._fill_table_placeholder(doc, metric_value_dict)
+        ReportWord._fill_image_placeholder(doc, metric_value_dict)
         new_path = template_path.replace(".docx", f"{f_get_datetime()}.docx")
         if save_path is not None:
             new_path = save_path
         doc.save(f"./{new_path}")
 
 
+class ReportExcel():
+    """Fill an Excel (.xlsx) report template with computed metric results.
+
+    Only the first worksheet of the template workbook is processed.
+    Placeholder strings are built by ``ReportWord._get_placeholder``, the same
+    helper the Word report uses.
+    """
+
+    @staticmethod
+    def _fill_value_placeholder(worksheet: Worksheet, metric_value_dict: Dict[str, MetricFucResultEntity]):
+        """Replace value placeholders in every cell of ``worksheet`` in place.
+
+        Metrics whose ``value`` is ``None`` are skipped. A cell that contains a
+        placeholder is rewritten as a string with the placeholder substituted.
+        """
+        for metric_code, metric_fuc_entity in metric_value_dict.items():
+            metric_value = metric_fuc_entity.value
+            if metric_value is None:
+                continue
+            placeholder = ReportWord._get_placeholder(PlaceholderPrefixEnum.VALUE, metric_code)
+            for row in worksheet.rows:
+                for cell in row:
+                    cell_text = str(cell.value)
+                    if placeholder in cell_text:
+                        cell.value = cell_text.replace(placeholder, str(metric_value))
+
+    @staticmethod
+    def _find_placeholder_cell(worksheet: Worksheet, placeholder: str):
+        """Return the 1-based ``(row, column)`` of the first cell containing ``placeholder``.
+
+        Cells are scanned row by row; ``None`` is returned when nothing matches.
+        """
+        for row_no, row in enumerate(worksheet.rows, start=1):
+            for col_no, cell in enumerate(row, start=1):
+                if placeholder in str(cell.value):
+                    return row_no, col_no
+        return None
+
+    @staticmethod
+    def _fill_table_placeholder(worksheet: Worksheet, metric_value_dict: Dict[str, MetricFucResultEntity]):
+        """Write each metric table into ``worksheet`` starting at its placeholder cell.
+
+        Metrics without a table, or whose placeholder is not present in the
+        sheet, are skipped. Only the table's values are written (as strings);
+        column names are not written.
+        """
+        for metric_code, metric_fuc_entity in metric_value_dict.items():
+            metric_table = metric_fuc_entity.table
+            if metric_table is None:
+                continue
+            placeholder = ReportWord._get_placeholder(PlaceholderPrefixEnum.TABLE, metric_code)
+            anchor = ReportExcel._find_placeholder_cell(worksheet, placeholder)
+            # No placeholder in the sheet: nothing to fill for this metric.
+            if anchor is None:
+                continue
+            anchor_row, anchor_col = anchor
+
+            # Use the positional row number, not the DataFrame index label: the
+            # label is only a valid offset when the index happens to be 0..n-1.
+            for row_offset, (_, row) in enumerate(metric_table.iterrows()):
+                for column_offset, value in enumerate(row):
+                    worksheet.cell(row=anchor_row + row_offset, column=anchor_col + column_offset,
+                                   value=str(value))
+
+    @staticmethod
+    def generate_report(metric_value_dict: Dict[str, MetricFucResultEntity], template_path: str, save_path=None):
+        """Fill the template's first worksheet and save the result as a new workbook.
+
+        Args:
+            metric_value_dict: metric code -> computed metric result.
+            template_path: path of the ``.xlsx`` template.
+            save_path: output path; when ``None`` the output is written next to
+                the template with ``f_get_datetime()`` inserted before the
+                file extension.
+
+        Raises:
+            GeneralException: if ``template_path`` does not exist.
+        """
+        if not os.path.exists(template_path):
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"监控模板文件【{template_path}】不存在")
+
+        workbook = openpyxl.load_workbook(template_path)
+        worksheet = workbook[workbook.sheetnames[0]]
+
+        ReportExcel._fill_value_placeholder(worksheet, metric_value_dict)
+        ReportExcel._fill_table_placeholder(worksheet, metric_value_dict)
+
+        if save_path is not None:
+            new_path = save_path
+        else:
+            # Split off the real extension instead of str.replace(".xlsx", ...),
+            # which rewrites every occurrence in the path and leaves the name
+            # unchanged (overwriting the template) when the case differs.
+            root, ext = os.path.splitext(template_path)
+            new_path = f"{root}{f_get_datetime()}{ext}"
+        # Save to the path as given: prefixing "./" would turn an absolute
+        # path into a relative one.
+        workbook.save(new_path)
+
+
 if __name__ == "__main__":
     pass

+ 2 - 2
pipeline/pipeline.py

@@ -12,7 +12,7 @@ from feature.feature_strategy_base import FeatureStrategyBase
 from init import init
 from model import ModelBase
 from model import ModelFactory
-from monitor.report_generate import Report
+from monitor import ReportWord
 
 init()
 
@@ -57,7 +57,7 @@ class Pipeline():
 
     def report(self, ):
         save_path = self._ml_config.f_get_save_path("模型报告.docx")
-        Report.generate_report(self.metric_value_dict, self._model.get_report_template_path(), save_path=save_path)
+        ReportWord.generate_report(self.metric_value_dict, self._model.get_report_template_path(), save_path=save_path)
         print(f"模型报告文件储存路径:{save_path}")
 
     def save(self):

+ 7 - 6
train_test.py

@@ -32,9 +32,9 @@ if __name__ == "__main__":
     cfg = {
         "project_name": "demo",
         # jupyter下输出内容
-        "jupyter_print": True,
+        "jupyter_print": False,
         # 是否开启粗分箱
-        "format_bin": False,
+        "format_bin": True,
         # 变量切分点搜索采样率
         "bin_sample_rate": 0.01,
         # 最多保留候选变量数
@@ -43,8 +43,8 @@ if __name__ == "__main__":
         "monto_shift_threshold": 1,
         "iv_threshold": 0.01,
         "corr_threshold": 0.4,
-        "psi_threshold": 0.2,
-        "vif_threshold": 10,
+        "psi_threshold": 0.001,
+        "vif_threshold": 1.06,
         # 压力测试
         "stress_test": True,
         "stress_sample_times": 10,
@@ -53,7 +53,7 @@ if __name__ == "__main__":
         # 手动定义切分点,字符型的变量以'%,%'合并枚举值
         "breaks_list": {
             #                 'duration_in_month': [12, 18, 48],
-            'credit_amount': [2000, 3500, 4000, 7000],
+            #                 'credit_amount': [2000, 3500, 4000, 7000],
             'purpose': ['retraining%,%car (used)', 'radio/television', 'furniture/equipment%,%business%,%repairs',
                         'domestic appliances%,%education%,%car (new)%,%others'],
             #                 'age_in_years': [27, 34, 58]
@@ -76,7 +76,8 @@ if __name__ == "__main__":
             "credit_history": "借贷历史"
         },
         "columns_exclude": [],
-        # "columns_include": ["age_in_years"],
+        # "columns_include": ["credit_amount"],
+        "rules": ["df.loc[df['credit_amount']>=9000,'SCORE'] += -50"]
     }
 
     train_pipeline = Pipeline(data=data, **cfg)

برخی فایل ها در این مقایسه diff نمایش داده نمی شوند زیرا تعداد فایل ها بسیار زیاد است