Forráskód Böngészése

add: 数据类型一致性检测

yq 1 hónapja
szülő
commit
e44137394f
2 módosított fájl, 29 hozzáadás és 0 törlés
  1. 23 0
      data/insight/data_explore.py
  2. 6 0
      feature/woe/strategy_woe.py

+ 23 - 0
data/insight/data_explore.py

@@ -4,7 +4,10 @@
 @time: 2024/11/13
 @desc: 数据探索
 """
+import numbers
+
 import pandas as pd
+from pandas.core.dtypes.common import is_numeric_dtype
 
 
 class DataExplore():
@@ -12,6 +15,26 @@ class DataExplore():
     def __init__(self):
         pass
 
+    @staticmethod
+    def check_type(df: pd.DataFrame):
+        """
+        Build a report of non-numeric-dtype columns whose values are not all ``str``.
+
+        Columns for which ``is_numeric_dtype`` is true are skipped. For every
+        other column each value is classified as a number
+        (``numbers.Number``), a string (``str``) or "other", and the column is
+        reported when at least one value is not a string.
+
+        :param df: the frame whose columns are inspected
+        :return: one line per reported column, giving the column name followed
+                 by its numeric count, string count and other-type count; an
+                 empty string when no column is reported
+        """
+        check_msg = ""
+        for column in df.columns:
+            # Only columns that pandas does not already treat as numeric are examined.
+            if not is_numeric_dtype(df[column]):
+                values = list(df[column])
+                cnt_str = 0
+                cnt_number = 0
+                cnt_other = 0
+                for value in values:
+                    # bool and float NaN are both numbers.Number instances, so they
+                    # are counted here; None falls through to the "other" bucket.
+                    if isinstance(value, numbers.Number):
+                        cnt_number += 1
+                    elif isinstance(value, str):
+                        cnt_str += 1
+                    else:
+                        cnt_other += 1
+                # The column is reported unless every value is a str.
+                # NOTE(review): this also reports string columns containing missing
+                # values (NaN/None) and object columns made up only of numbers --
+                # confirm that is the intended behaviour.
+                if len(values) != cnt_str:
+                    check_msg = f"{check_msg}【{column}】数值型数量{cnt_number} 字符型数量{cnt_str} 其它类型数量{cnt_other}\n"
+        return check_msg
+
     @staticmethod
     def distribution(df: pd.DataFrame) -> pd.DataFrame:
         """

+ 6 - 0
feature/woe/strategy_woe.py

@@ -19,6 +19,7 @@ from tqdm import tqdm
 
 from commom import f_display_images_by_side, NumpyEncoder, GeneralException, f_df_to_image, f_display_title, \
     f_image_crop_white_borders
+from data import DataExplore
 from entitys import DataSplitEntity, MetricFucResultEntity
 from enums import ContextEnum, ResultCodesEnum
 from feature.feature_strategy_base import FeatureStrategyBase
@@ -285,6 +286,11 @@ class StrategyWoe(FeatureStrategyBase):
             if column in x_columns:
                 x_columns.remove(column)
 
+        check_msg = DataExplore.check_type(train_data[x_columns])
+        if check_msg != "":
+            print(f"数据类型分析:\n{check_msg}\n同一变量请保持数据类型一致")
+            raise GeneralException(ResultCodesEnum.ILLEGAL_PARAMS, message=f"数据类型错误.")
+
         bins_train = sc.woebin(train_data[x_columns + [y_column]], y=y_column, bin_num_limit=5,
                                special_values=special_values, breaks_list=breaks_list, print_info=False)