# -*- coding: utf-8 -*-
"""
@author: zsc
@time: 2024/11/18
@desc: Data processing
"""


# Data preprocessing module
class DataPreprocessor:
    """Clean a collection of dict-like event records.

    Each record is expected to support ``in`` and ``[]`` access for the keys
    ``'user'``, ``'action'``, ``'product'`` and ``'channel'``.
    """

    # Keys a record must contain to survive ``preprocess``; the same keys,
    # in this order, form the identity used for de-duplication.
    _REQUIRED_FIELDS = ('user', 'action', 'product', 'channel')

    # Placeholder written by ``fill_missing_values`` for each absent key.
    _DEFAULTS = {
        'user': 'UnknownUser',
        'action': 'UnknownAction',
        'product': 'UnknownProduct',
        'channel': 'UnknownChannel',
    }

    def __init__(self, data):
        """Store the raw records.

        Args:
            data: Iterable of dict-like records to be preprocessed.
        """
        self.data = data

    def preprocess(self):
        """Run the full cleaning pipeline and return the cleaned records.

        Steps:
            1. Keep only records that contain every required key, working on
               shallow copies so the objects in ``self.data`` are never
               modified.
            2. Fill missing keys (a no-op after step 1, kept so the pipeline
               stays complete if the filter is ever relaxed).
            3. Convert ``'user'`` to ``str``.
            4. Drop duplicates, keeping the first occurrence.

        Type conversion runs before de-duplication so that records differing
        only in the type of ``'user'`` (e.g. ``1`` and ``'1'``) are recognised
        as the same record instead of both surviving as identical output.

        Returns:
            A new list of plain ``dict`` records.
        """
        required = self._REQUIRED_FIELDS
        records = [
            dict(item)  # shallow copy: later steps mutate in place
            for item in self.data
            if all(field in item for field in required)
        ]
        records = self.fill_missing_values(records)
        records = self.convert_data_types(records)
        return self.remove_duplicates(records)

    def remove_duplicates(self, data):
        """Return the records of ``data`` with duplicates removed.

        Two records are duplicates when their user, action, product and
        channel values are all equal. The first occurrence wins and input
        order is preserved. Those four values must be hashable because they
        are stored in a set.

        Args:
            data: Iterable of records containing all four identity keys.

        Returns:
            A new list holding the first record seen for each identity.
        """
        unique_data = []
        seen = set()
        for item in data:
            # Tuple of the identity fields, used to recognise unique records.
            identifier = tuple(item[field] for field in self._REQUIRED_FIELDS)
            if identifier not in seen:
                seen.add(identifier)
                unique_data.append(item)
        return unique_data

    def fill_missing_values(self, data):
        """Insert placeholder values for any absent required key.

        Records are modified in place. Keys that are present are left
        untouched, whatever their value.

        Args:
            data: Iterable of dict records.

        Returns:
            The same ``data`` object that was passed in.
        """
        for item in data:
            for field, default in self._DEFAULTS.items():
                item.setdefault(field, default)
        return data

    def convert_data_types(self, data):
        """Normalise field types; currently converts ``'user'`` to ``str``.

        Records are modified in place.

        Args:
            data: Iterable of dict records that each contain ``'user'``.

        Returns:
            The same ``data`` object that was passed in.
        """
        for item in data:
            # User IDs are normalised to strings.
            item['user'] = str(item['user'])
            # Further conversions can be added here as needed.
        return data