yq 5 місяців тому
батько
коміт
6192d24886

+ 2 - 2
commom/__init__.py

@@ -7,7 +7,7 @@
 from .logger import get_logger
 from .placeholder_func import f_fill_placeholder
 from .user_exceptions import GeneralException
-from .utils import f_get_clazz_in_module, f_clazz_to_json, f_get_date, f_get_datetime, f_save_train_df
+from .utils import f_get_clazz_in_module, f_clazz_to_json, f_get_date, f_get_datetime, f_save_train_df, f_format_float
 
 __all__ = ['f_get_clazz_in_module', 'f_clazz_to_json', 'GeneralException', 'get_logger', 'f_fill_placeholder',
-           'f_get_date', 'f_get_datetime', 'f_save_train_df']
+           'f_get_date', 'f_get_datetime', 'f_save_train_df', 'f_format_float']

+ 4 - 0
commom/utils.py

@@ -16,6 +16,10 @@ import pytz
 from config import BaseConfig
 
 
+def f_format_float(num: float):
+    return format(num, " .3f")  # 3 decimals; the " " sign flag puts a leading blank on non-negative values
+
+
 def f_get_date(offset: int = 0, connect: str = "-") -> str:
     current_date = datetime.datetime.now(pytz.timezone("Asia/Shanghai")).date() + datetime.timedelta(days=offset)
     return current_date.strftime(f"%Y{connect}%m{connect}%d")

+ 0 - 7
config/data_process_config_template.json

@@ -1,11 +1,4 @@
 {
   "y_column": "creditability",
-  "x_columns_candidate": [
-    "duration_in_month",
-    "credit_amount",
-    "installment_rate_in_percentage_of_disposable_income",
-    "present_residence_since",
-    "age_in_years"
-  ],
   "bin_search_interval": 0.05
 }

+ 3 - 0
config/train_config_template.json

@@ -0,0 +1,3 @@
+{
+  "model_type": "lr"
+}

+ 9 - 0
entitys/data_feaure_entity.py

@@ -32,6 +32,9 @@ class CandidateFeatureEntity():
 
 
 class DataFeatureEntity():
+    """
+    数据特征准备完毕
+    """
     def __init__(self, data: pd.DataFrame, x_columns: list, y_column: str):
         self._data = data
         self._x_columns = x_columns
@@ -57,6 +60,9 @@ class DataFeatureEntity():
 
 
 class DataPreparedEntity():
+    """
+    训练集测试集特征准备完毕
+    """
     def __init__(self, train_data: DataFeatureEntity, val_data: DataFeatureEntity, test_data: DataFeatureEntity):
         self._train_data = train_data
         self._val_data = val_data
@@ -76,6 +82,9 @@ class DataPreparedEntity():
 
 
 class DataSplitEntity():
+    """
+    初始数据训练集测试集划分
+    """
     def __init__(self, train_data: pd.DataFrame, val_data: pd.DataFrame, test_data: pd.DataFrame):
         self._train_data = train_data
         self._val_data = val_data

+ 4 - 4
entitys/data_process_config_entity.py

@@ -13,10 +13,10 @@ from enums import ResultCodesEnum
 
 
 class DataProcessConfigEntity():
-    def __init__(self, y_column: str, x_columns_candidate: List[str] = None, fill_method: str = None, split_method: str = None,
-                 feature_search_strategy: str = 'iv', bin_search_interval: float = 0.05, iv_threshold: float = 0.03,
-                 iv_threshold_wide: float = 0.05, corr_threshold: float = 0.4, sample_rate: float = 0.1,
-                 x_candidate_num: int = 10, special_values: Union[dict, list] = None):
+    def __init__(self, y_column: str, x_columns_candidate: List[str] = None, fill_method: str = None,
+                 split_method: str = None, feature_search_strategy: str = 'iv', bin_search_interval: float = 0.05,
+                 iv_threshold: float = 0.03, iv_threshold_wide: float = 0.05, corr_threshold: float = 0.4,
+                 sample_rate: float = 0.1, x_candidate_num: int = 10, special_values: Union[dict, list] = None):
         # 定义y变量
         self._y_column = y_column
 

+ 26 - 7
entitys/metric_entity.py

@@ -6,23 +6,42 @@
 """
 import pandas as pd
 
+from commom import f_format_float
+
 
 class MetricTrainEntity():
     """
     Model training result metrics (AUC and KS on the train and test sets).
     """
 
-    def __init__(self, auc: float, ks: float):
-        self._auc = auc
-        self._ks = ks
+    def __init__(self, train_auc: float, train_ks: float, test_auc: float, test_ks: float,
+                 train_perf_image_path: str = None, test_perf_image_path: str = None):
+        self._train_auc = f_format_float(train_auc)  # stored as the formatted string, not the raw float
+        self._train_ks = f_format_float(train_ks)
+        self._train_perf_image_path = train_perf_image_path  # NOTE(review): no accessor for this in view
+
+        self._test_auc = f_format_float(test_auc)  # same formatting as the train-side metrics
+        self._test_ks = f_format_float(test_ks)
+        self._test_perf_image_path = test_perf_image_path
+
+    def __str__(self):
+        return f"train_auc:{self._train_auc} train_ks:{self._train_ks}\ntest_auc:{self._test_auc} test_ks:{self._test_ks}"
+
+    @property
+    def train_auc(self) -> str:
+        return self._train_auc
+
+    @property
+    def train_ks(self) -> str:
+        return self._train_ks
 
     @property
-    def auc(self):
-        return self._auc
+    def test_auc(self) -> str:
+        return self._test_auc
 
     @property
-    def ks(self):
-        return self._ks
+    def test_ks(self) -> str:
+        return self._test_ks
 
 
 class MetricFucEntity():

+ 7 - 1
entitys/train_config_entity.py

@@ -12,10 +12,16 @@ from enums import ResultCodesEnum
 
 
 class TrainConfigEntity():
-    def __init__(self, lr: float):
+    def __init__(self, model_type: str = None, lr: float = None):
+        # Model type key (e.g. "lr"); annotated as str instead of defaulting to the builtin `str` class
+        self._model_type = model_type
         # Learning rate
         self._lr = lr
 
+    @property
+    def model_type(self):
+        return self._model_type
+
     @property
     def lr(self):
         return self._lr

+ 2 - 1
enums/__init__.py

@@ -6,7 +6,8 @@
 """
 from .bins_strategy_enum import BinsStrategyEnum
 from .filter_strategy_enum import FilterStrategyEnum
+from .model_enum import ModelEnum
 from .placeholder_prefix_enum import PlaceholderPrefixEnum
 from .result_codes_enum import ResultCodesEnum
 
-__all__ = ['ResultCodesEnum', 'PlaceholderPrefixEnum', 'BinsStrategyEnum', 'FilterStrategyEnum']
+__all__ = ['ResultCodesEnum', 'PlaceholderPrefixEnum', 'BinsStrategyEnum', 'FilterStrategyEnum', 'ModelEnum']

+ 11 - 0
enums/model_enum.py

@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/14
+@desc: Enumeration of supported model types
+"""
+from enum import Enum
+
+
+class ModelEnum(Enum):
+    LR = "lr"  # logistic regression; same value as "model_type" in the train config template

+ 14 - 2
model/__init__.py

@@ -4,11 +4,23 @@
 @time: 2023/12/28
 @desc: 模型相关
 """
-
+from commom import GeneralException
+from enums import ModelEnum, ResultCodesEnum
 from .model_base import ModelBase
 from .model_lr import ModelLr
 
-__all__ = ['ModelBase', 'ModelLr']
+__all__ = ['ModelBase', 'f_get_model']
+
+model_map = {
+    ModelEnum.LR.value: ModelLr
+}
+
+
+def f_get_model(model_type: str):
+    if model_type in model_map:  # registered key: hand back the model class (not an instance)
+        return model_map[model_type]
+    raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"模型【{model_type}】没有实现")
+
 
 if __name__ == "__main__":
     pass

+ 2 - 2
model/model_base.py

@@ -8,7 +8,7 @@ import abc
 
 import pandas as pd
 
-from entitys import DataFeatureEntity, MetricTrainEntity, TrainConfigEntity
+from entitys import MetricTrainEntity, TrainConfigEntity, DataPreparedEntity
 
 
 class ModelBase(metaclass=abc.ABCMeta):
@@ -17,7 +17,7 @@ class ModelBase(metaclass=abc.ABCMeta):
         self._train_config = train_config
 
     @abc.abstractmethod
-    def train(self, data: DataFeatureEntity, *args, **kwargs) -> MetricTrainEntity:
+    def train(self, data: DataPreparedEntity, *args, **kwargs) -> MetricTrainEntity:
         pass
 
     @abc.abstractmethod

+ 17 - 9
model/model_lr.py

@@ -4,26 +4,34 @@
 @time: 2024/11/1
 @desc: 
 """
+
 import pandas as pd
 from sklearn.linear_model import LogisticRegression
+from toad.metrics import KS, AUC
 
-from entitys import DataFeatureEntity, MetricTrainEntity, TrainConfigEntity
+from entitys import MetricTrainEntity, TrainConfigEntity, DataPreparedEntity
 from .model_base import ModelBase
 
-from toad.metrics import KS, AUC
-
 
 class ModelLr(ModelBase):
     def __init__(self, train_config: TrainConfigEntity):
         super().__init__(train_config)
         self.lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
 
-    def train(self, data: DataFeatureEntity, *args, **kwargs) -> MetricTrainEntity:
-        self.lr.fit(data.get_Xdata(), data.get_Ydata())
-        pred_y = self.predict(data.get_Xdata())
-        ks = KS(pred_y, data.get_Ydata())
-        auc = AUC(pred_y, data.get_Ydata())
-        return MetricTrainEntity(auc, ks)
+    def train(self, data: DataPreparedEntity, *args, **kwargs) -> MetricTrainEntity:
+        train_data = data.train_data
+        test_data = data.test_data
+        self.lr.fit(train_data.get_Xdata(), train_data.get_Ydata())  # fit on the training split only
+
+        train_prob = self.predict_prob(train_data.get_Xdata())  # positive-class scores, not hard 0/1 labels
+        train_auc = AUC(train_prob, train_data.get_Ydata())
+        train_ks = KS(train_prob, train_data.get_Ydata())
+
+        test_prob = self.predict_prob(test_data.get_Xdata())  # score the test split the same way
+        test_auc = AUC(test_prob, test_data.get_Ydata())
+        test_ks = KS(test_prob, test_data.get_Ydata())
+
+        return MetricTrainEntity(train_auc, train_ks, test_auc, test_ks)
 
     def predict_prob(self, x: pd.DataFrame, *args, **kwargs):
         return self.lr.predict_proba(x)[:, 1]

+ 30 - 0
train_test.py

@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+"""
+@author: yq
+@time: 2024/11/27
+@desc: 
+"""
+import time
+
+from entitys import DataSplitEntity, DataProcessConfigEntity, TrainConfigEntity
+from feature import FilterStrategyFactory
+from trainer import TrainPipeline
+
+if __name__ == "__main__":
+    time_now = time.time()  # start timestamp; elapsed seconds are printed on the last line
+    import scorecardpy as sc
+
+    dat = sc.germancredit()
+    dat["creditability"] = dat["creditability"].apply(lambda x: 1 if x == "bad" else 0)  # "bad" -> 1, else 0
+    data = DataSplitEntity(dat[:700], None, dat[700:])  # first 700 rows train, no validation set, rest test
+
+    filter_strategy_factory = FilterStrategyFactory(
+        DataProcessConfigEntity.from_config('./config/data_process_config_template.json'))
+    strategy = filter_strategy_factory.get_strategy()
+    candidate_feature = strategy.filter(data)
+    data_prepared = strategy.feature_generate(data, candidate_feature)
+
+    train_pipeline = TrainPipeline(TrainConfigEntity.from_config('./config/train_config_template.json'))
+    train_pipeline.train(data_prepared)  # prints the resulting metrics; see TrainPipeline.train
+
+    print(time.time() - time_now)

+ 10 - 8
trainer/train.py

@@ -4,19 +4,21 @@
 @time: 2024/11/1
 @desc: 模型训练管道
 """
-from entitys import DataFeatureEntity
-from model import ModelBase
+from entitys import DataPreparedEntity, TrainConfigEntity
+from model import f_get_model
 
 
 class TrainPipeline():
-    def __init__(self, model: ModelBase):
-        self.model = model
+    def __init__(self, train_config: TrainConfigEntity):
+        self._train_config = train_config
+        model_clazz = f_get_model(self._train_config.model_type)  # look up the model class by its type key
+        self.model = model_clazz(self._train_config)  # the model receives the same config object
 
-    def train(self, train_data: DataFeatureEntity, test_data: DataFeatureEntity):
-        metric_train = self.model.train(train_data)
-        self.model.predict_prob(test_data.get_Xdata())
+    def train(self, data: DataPreparedEntity):
+        metric_train = self.model.train(data)
+        print(metric_train)  # NOTE(review): metrics are only printed, not returned to the caller
 
-    def generate_report(self):
+    def generate_report(self, data: DataPreparedEntity):  # placeholder: no report logic yet
         pass