Sfoglia il codice sorgente

add: 特征处理框架

yq 5 mesi fa
parent
commit
1a08021548

+ 2 - 2
commom/__init__.py

@@ -7,7 +7,7 @@
 from .logger import get_logger
 from .placeholder_func import f_fill_placeholder
 from .user_exceptions import GeneralException
-from .utils import f_get_clazz_in_module, f_clazz_to_json, f_get_date, f_get_datetime
+from .utils import f_get_clazz_in_module, f_clazz_to_json, f_get_date, f_get_datetime, f_save_train_df
 
 __all__ = ['f_get_clazz_in_module', 'f_clazz_to_json', 'GeneralException', 'get_logger', 'f_fill_placeholder',
-           'f_get_date', 'f_get_datetime']
+           'f_get_date', 'f_get_datetime', 'f_save_train_df']

+ 9 - 0
commom/utils.py

@@ -7,10 +7,14 @@
 
 import datetime
 import inspect
+import os
 from json import JSONEncoder
 
+import pandas as pd
 import pytz
 
+from config import BaseConfig
+
 
 def f_get_date(offset: int = 0, connect: str = "-") -> str:
     current_date = datetime.datetime.now(pytz.timezone("Asia/Shanghai")).date() + datetime.timedelta(days=offset)
@@ -33,6 +37,11 @@ def f_get_clazz_in_module(module):
     return classes
 
 
+def f_save_train_df(file_name: str, df: pd.DataFrame):
+    """
+    Write a DataFrame to an Excel workbook inside BaseConfig.train_path.
+
+    :param file_name: workbook name without extension; ".xlsx" is appended here.
+    :param df: frame to persist; its index is not written.
+    """
+    base_path = os.path.join(BaseConfig.train_path, file_name)
+    excel_path = base_path + ".xlsx"
+    df.to_excel(excel_path, index=False)
+
+
 class f_clazz_to_json(JSONEncoder):
     def default(self, o):
         return o.__dict__

+ 4 - 0
config/base_config.py

@@ -12,5 +12,9 @@ class BaseConfig:
     image_path = "./cache/image"
     os.makedirs(image_path, exist_ok=True)
 
+    # 模型训练中间结果
+    train_path = "./cache/train"
+    os.makedirs(train_path, exist_ok=True)
+
     # 表格合并相同列名的列
     merge_table_column = True

+ 1 - 1
config/model_monitor_config_template.json

@@ -9,7 +9,7 @@
     {
       "metric_code": "auc",
       "metric_func": "MetricBySqlGeneral",
-      "sql": "select * from test.t1 where date>'2024-11-07'"
+      "sql": "select * from test.t1 where date>{{date-1}}"
     }
   ]
 }

+ 31 - 0
data/insight/data_explore.py

@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/13
+@desc: 数据探索
+"""
+import pandas as pd
+
+from commom import f_save_train_df
+
+
+class DataExplore:
+    """
+    Data exploration step of the training pipeline (skeleton).
+    """
+
+    def __init__(self):
+        # Nothing to configure yet.
+        pass
+
+    def distribution(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Data distribution: missing rate, median, mode, deviation, etc.
+
+        Not implemented yet; the body is empty, so None is returned.
+        """
+
+    def save(self, df):
+        """
+        Persist the exploration result through f_save_train_df under the name "distribution".
+        """
+        f_save_train_df("distribution", df)
+
+
+if __name__ == "__main__":
+    pass

+ 39 - 0
data/process/data_process.py

@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/13
+@desc: 数据加工
+"""
+
+import pandas as pd
+
+from commom import f_save_train_df
+from entitys import DataProcessConfigEntity
+
+
+class DataProcess():
+    """
+    Data processing step of the training pipeline (skeleton).
+
+    Holds the processing configuration; the fill and filter steps are still stubs.
+    """
+
+    def __init__(self, data_process_config: DataProcessConfigEntity):
+        """
+        :param data_process_config: processing options, stored for later use by the steps.
+        """
+        self._data_process_config = data_process_config
+
+    def data_fill(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Fill missing values.
+
+        Not implemented yet; currently returns None.
+        """
+        pass
+
+    def data_filter(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Filter data: drop features or samples with a high missing rate.
+
+        Not implemented yet; currently returns None.
+        """
+        pass
+
+    def save(self, df):
+        """
+        Persist the processing result through f_save_train_df.
+
+        Uses its own file name: the previous name "distribution" is the one
+        DataExplore.save writes to, so the two outputs overwrote each other.
+        """
+        f_save_train_df("data_process", df)
+
+
+if __name__ == "__main__":
+    pass

+ 4 - 2
entitys/__init__.py

@@ -4,14 +4,16 @@
 @time: 2024/10/30
 @desc: 数据实体类
 """
-from .data_feaure_entity import DataFeatureEntity
+from .train_config_entity import TrainConfigEntity
+from .data_process_config_entity import DataProcessConfigEntity
+from .data_feaure_entity import DataFeatureEntity, DataSplitEntity, DataPreparedEntity
 from .db_config_entity import DbConfigEntity
 from .metric_config_entity import MetricConfigEntity
 from .metric_entity import MetricTrainEntity, MetricFucEntity
 from .monitor_metric_config_entity import MonitorMetricConfigEntity
 
 __all__ = ['DataFeatureEntity', 'DbConfigEntity', 'MetricTrainEntity', 'MonitorMetricConfigEntity', 'MetricConfigEntity',
-           'MetricFucEntity']
+           'MetricFucEntity', 'DataSplitEntity', 'DataProcessConfigEntity', 'TrainConfigEntity', 'DataPreparedEntity']
 
 if __name__ == "__main__":
     pass

+ 44 - 0
entitys/data_feaure_entity.py

@@ -31,5 +31,49 @@ class DataFeatureEntity():
     def get_Ydata(self):
         return self._data[self._y_column]
 
+
+class DataPreparedEntity:
+    """
+    Read-only holder for the train / validation / test DataFeatureEntity objects.
+    """
+
+    def __init__(self, train_data: DataFeatureEntity, val_data: DataFeatureEntity, test_data: DataFeatureEntity):
+        self._train_data, self._val_data, self._test_data = train_data, val_data, test_data
+
+    @property
+    def train_data(self):
+        """Training set passed to the constructor."""
+        return self._train_data
+
+    @property
+    def val_data(self):
+        """Validation set passed to the constructor."""
+        return self._val_data
+
+    @property
+    def test_data(self):
+        """Test set passed to the constructor."""
+        return self._test_data
+
+
+class DataSplitEntity:
+    """
+    Read-only holder for the train / validation / test DataFrames and the y column name.
+    """
+
+    def __init__(self, train_data: pd.DataFrame, val_data: pd.DataFrame, test_data: pd.DataFrame, y_column: str):
+        self._train_data, self._val_data, self._test_data = train_data, val_data, test_data
+        self._y_column = y_column
+
+    @property
+    def train_data(self):
+        """Training frame passed to the constructor."""
+        return self._train_data
+
+    @property
+    def val_data(self):
+        """Validation frame passed to the constructor."""
+        return self._val_data
+
+    @property
+    def test_data(self):
+        """Test frame passed to the constructor."""
+        return self._test_data
+
+    @property
+    def y_column(self):
+        """Name of the y column passed to the constructor."""
+        return self._y_column
+
+
 if __name__ == "__main__":
     pass

+ 50 - 0
entitys/data_process_config_entity.py

@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/1
+@desc: 数据处理配置类
+"""
+import json
+import os
+
+from commom import GeneralException
+from enums import ResultCodesEnum
+
+
+class DataProcessConfigEntity():
+    """
+    Data processing configuration: y column, fill method and split method.
+    """
+
+    def __init__(self, y_column: str, fill_method: str, split_method: str):
+        # Name of the y variable
+        self._y_column = y_column
+        # Missing-value fill method
+        self._fill_method = fill_method
+        # Data split method
+        self._split_method = split_method
+
+    @property
+    def y_column(self):
+        """Name of the y variable."""
+        return self._y_column
+
+    @property
+    def fill_method(self):
+        """Missing-value fill method."""
+        return self._fill_method
+
+    @property
+    def split_method(self):
+        """Data split method."""
+        return self._split_method
+
+    @classmethod
+    def from_config(cls, config_path: str):
+        """
+        Build an entity from a JSON configuration file.
+
+        The keys of the JSON object are passed as keyword arguments to the
+        constructor. Implemented as a classmethod so that subclasses get an
+        instance of their own type.
+
+        :param config_path: path of the JSON file.
+        :raises GeneralException: with ResultCodesEnum.NOT_FOUND when the path does not exist.
+        """
+        if not os.path.exists(config_path):
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"指配置文件【{config_path}】不存在")
+
+        with open(config_path, mode="r", encoding="utf-8") as f:
+            j = json.load(f)
+
+        return cls(**j)
+
+
+if __name__ == "__main__":
+    pass

+ 38 - 0
entitys/train_config_entity.py

@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/1
+@desc: 模型训练超参数配置类
+"""
+import json
+import os
+
+from commom import GeneralException
+from enums import ResultCodesEnum
+
+
+class TrainConfigEntity():
+    """
+    Training hyper-parameter configuration.
+    """
+
+    def __init__(self, lr: float):
+        # Learning rate
+        self._lr = lr
+
+    @property
+    def lr(self):
+        """Learning rate."""
+        return self._lr
+
+    @classmethod
+    def from_config(cls, config_path: str):
+        """
+        Build an entity from a JSON configuration file.
+
+        The keys of the JSON object are passed as keyword arguments to the
+        constructor. Implemented as a classmethod so that subclasses get an
+        instance of their own type.
+
+        :param config_path: path of the JSON file.
+        :raises GeneralException: with ResultCodesEnum.NOT_FOUND when the path does not exist.
+        """
+        if not os.path.exists(config_path):
+            raise GeneralException(ResultCodesEnum.NOT_FOUND, message=f"指配置文件【{config_path}】不存在")
+
+        with open(config_path, mode="r", encoding="utf-8") as f:
+            j = json.load(f)
+
+        return cls(**j)
+
+
+if __name__ == "__main__":
+    pass

+ 19 - 0
feature/feature_filter.py

@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/13
+@desc: 筛选特征
+"""
+from entitys import DataSplitEntity, DataPreparedEntity
+
+
+class FeatureFilter():
+    """
+    Feature selection step (skeleton); no logic is implemented yet.
+    """
+
+    def __init__(self, ):
+        pass
+
+    def feature_filter(self, data: DataSplitEntity) -> DataPreparedEntity:
+        """
+        Placeholder: annotated to turn a DataSplitEntity into a DataPreparedEntity.
+
+        Not implemented yet; currently returns None.
+        """
+        pass
+
+
+if __name__ == "__main__":
+    pass

+ 33 - 0
feature/feature_utils.py

@@ -0,0 +1,33 @@
+# -*- coding:utf-8 -*-
+"""
+@author: yq
+@time: 2023/12/28
+@desc:  特征工具类
+"""
+import pandas as pd
+
+from entitys import DataSplitEntity
+
+
+def f_get_bins(data: DataSplitEntity) -> pd.DataFrame:
+    """
+    Placeholder, presumably for feature binning (TODO confirm).
+
+    Not implemented yet; currently returns None despite the DataFrame annotation.
+    """
+    pass
+
+
+def f_get_woe(data: DataSplitEntity) -> pd.DataFrame:
+    """
+    Placeholder, presumably for weight-of-evidence values (TODO confirm).
+
+    Not implemented yet; currently returns None despite the DataFrame annotation.
+    """
+    pass
+
+
+def f_get_iv(data: DataSplitEntity) -> pd.DataFrame:
+    """
+    Placeholder, presumably for information value (TODO confirm).
+
+    Not implemented yet; currently returns None despite the DataFrame annotation.
+    """
+    pass
+
+
+def f_get_psi(data: DataSplitEntity) -> pd.DataFrame:
+    """
+    Placeholder, presumably for the population stability index (TODO confirm).
+
+    Not implemented yet; currently returns None despite the DataFrame annotation.
+    """
+    pass
+
+
+def f_get_corr(data: DataSplitEntity) -> pd.DataFrame:
+    """
+    Placeholder, presumably for feature correlation (TODO confirm).
+
+    Not implemented yet; currently returns None despite the DataFrame annotation.
+    """
+    pass
+
+
+def f_get_ivf(data: DataSplitEntity) -> pd.DataFrame:
+    """
+    Placeholder; NOTE(review): the name may be intended as "vif" - confirm.
+
+    Not implemented yet; currently returns None despite the DataFrame annotation.
+    """
+    pass

+ 4 - 1
model/model_base.py

@@ -8,11 +8,14 @@ import abc
 
 import pandas as pd
 
-from entitys import DataFeatureEntity, MetricTrainEntity
+from entitys import DataFeatureEntity, MetricTrainEntity, TrainConfigEntity
 
 
 class ModelBase(metaclass=abc.ABCMeta):
 
+    def __init__(self, train_config: TrainConfigEntity):
+        self._train_config = train_config
+
     @abc.abstractmethod
     def train(self, data: DataFeatureEntity, *args, **kwargs) -> MetricTrainEntity:
         pass

+ 3 - 2
model/model_lr.py

@@ -7,12 +7,13 @@
 import pandas as pd
 from sklearn.linear_model import LogisticRegression
 
-from entitys import DataFeatureEntity, MetricTrainEntity
+from entitys import DataFeatureEntity, MetricTrainEntity, TrainConfigEntity
 from .model_base import ModelBase
 
 
 class ModelLr(ModelBase):
-    def __init__(self, ):
+    def __init__(self, train_config: TrainConfigEntity):
+        super().__init__(train_config)
         self.lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
 
     def train(self, data: DataFeatureEntity, *args, **kwargs) -> MetricTrainEntity: