yq 1 місяць тому
батько
коміт
824a8b517b
3 змінених файлів з 44 додано та 0 видалено
  1. 7 0
      feature/feature_strategy_base.py
  2. 34 0
      feature/woe/strategy_woe.py
  3. 3 0
      pipeline/pipeline.py

+ 7 - 0
feature/feature_strategy_base.py

@@ -31,6 +31,13 @@ class FeatureStrategyBase(metaclass=abc.ABCMeta):
         """
         pass
 
+    @abc.abstractmethod
+    def variable_analyse(self, *args, **kwargs):
+        """
+        Analyse a single variable.
+        """
+        pass
+
     @abc.abstractmethod
     def feature_generate(self, *args, **kwargs) -> pd.DataFrame:
         """

+ 34 - 0
feature/woe/strategy_woe.py

@@ -432,6 +432,40 @@ class StrategyWoe(FeatureStrategyBase):
         context.set(ContextEnum.HOMO_BIN_INFO_NUMERIC_SET, homo_bin_info_numeric_set)
         context.set_filter_info(ContextEnum.FILTER_NUMERIC, filter_numeric_overview, filter_numeric_detail)
 
+    def variable_analyse(self, data: DataSplitEntity, column: str, format_bin=None, *args, **kwargs):
+        """
+        Print the recommended split points of one numeric column and display its train/test trend
+        images side by side. A non-None ``format_bin`` overrides ``ml_config._format_bin`` for this call only.
+        """
+        from IPython import display
+
+        if not is_numeric_dtype(data.train_data[column]):
+            print("只能针对数值型变量进行分析。")
+            return
+        train_data, test_data = data.train_data, data.test_data
+        format_bin_mlcfg = self.ml_config.format_bin
+        if format_bin is not None:
+            self.ml_config._format_bin = format_bin
+        try:
+            homo_bin_info_numeric: HomologousBinInfo = self._handle_numeric(data, column)
+            bins_info = homo_bin_info_numeric.get_best_bins()
+            print(f"-----【{column}】不同分箱数下变量的推荐切分点-----")
+            imgs_path_trend_train, imgs_path_trend_test = [], []
+            for bin_info in bins_info:
+                print(json.dumps(bin_info.points, ensure_ascii=False, cls=NumpyEncoder))
+                suffix = f"{column}_{'_'.join(str(i) for i in bin_info.points)}"
+                sc_woebin_train = self._f_get_sc_woebin(train_data, {column: bin_info})
+                imgs_path_trend_train.append(self._f_get_img_trend(sc_woebin_train, [column], f"train_{suffix}")[0])
+                sc_woebin_test = self._f_get_sc_woebin(test_data, {column: bin_info})
+                imgs_path_trend_test.append(self._f_get_img_trend(sc_woebin_test, [column], f"test_{suffix}")[0])
+            f_display_images_by_side(display, imgs_path_trend_train, title="训练集",
+                                     image_path_list2=imgs_path_trend_test, title2="测试集")
+        finally:
+            # Restore the configured value even if binning or plotting raised, so a failed
+            # analysis cannot leave the temporary format_bin override on self.ml_config.
+            self.ml_config._format_bin = format_bin_mlcfg
+
+
     def feature_save(self, *args, **kwargs):
         if self.sc_woebin is None:
             GeneralException(ResultCodesEnum.NOT_FOUND, message=f"feature不存在")

+ 3 - 0
pipeline/pipeline.py

@@ -65,6 +65,9 @@ class Pipeline():
         self._feature_strategy.feature_load(path)
         self._model.model_load(path)
 
+    def variable_analyse(self, column: str, format_bin=None):
+        self._feature_strategy.variable_analyse(self._data, column, format_bin)  # forwards the pipeline's own data
+
 
 if __name__ == "__main__":
     pass