# -*- coding: utf-8 -*-
"""
@author: zsc
@time: 2024/11/18
@desc: Data processing
"""
# Data preprocessing module
class DataPreprocessor:
    """Clean a collection of records keyed by user, action, product and channel.

    The pipeline in :meth:`preprocess` keeps only complete records, normalises
    their types and removes duplicates.  The individual steps are also exposed
    as public methods so they can be used on their own.
    """

    # Keys a record must carry to survive preprocessing; the same keys, in
    # this order, form the identity used for de-duplication.
    REQUIRED_FIELDS = ('user', 'action', 'product', 'channel')

    # Placeholder written by fill_missing_values() for each absent key.
    DEFAULT_VALUES = {
        'user': 'UnknownUser',
        'action': 'UnknownAction',
        'product': 'UnknownProduct',
        'channel': 'UnknownChannel',
    }

    def __init__(self, data):
        """Store the raw records.

        Args:
            data: Iterable of mapping records, or ``None`` for "no data".
        """
        self.data = data

    def preprocess(self):
        """Run the full cleaning pipeline and return the cleaned records.

        Records that are not mappings, or that lack any of
        ``REQUIRED_FIELDS``, are dropped.  The surviving records are shallow
        copied first, so the objects held in ``self.data`` are never modified.

        Returns:
            A new list of new dicts; empty when ``self.data`` is ``None`` or
            contains no complete record.

        Raises:
            TypeError: If one of the identifying field values is unhashable.
        """
        # Function-scope import keeps the class self-contained.
        from collections.abc import Mapping

        if self.data is None:
            return []

        # The Mapping check guards against None entries (``'user' in None``
        # raises) and against plain strings, where ``in`` is a substring test.
        records = [
            dict(item)
            for item in self.data
            if isinstance(item, Mapping)
            and all(field in item for field in self.REQUIRED_FIELDS)
        ]

        # NOTE(review): after the completeness filter above every record
        # already has all required keys, so this step changes nothing here.
        # It is kept so the pipeline still calls every public step.
        records = self.fill_missing_values(records)

        # Normalise types BEFORE de-duplicating; otherwise user 1 and user
        # '1' both survive and end up as identical records in the output.
        records = self.convert_data_types(records)
        return self.remove_duplicates(records)

    def remove_duplicates(self, data):
        """Return the records of *data* with duplicates removed.

        Two records are duplicates when their values for all of
        ``REQUIRED_FIELDS`` are equal; other keys are ignored.  The first
        occurrence is kept and the original order is preserved.

        Args:
            data: Iterable of records that contain every required field.

        Returns:
            A new list holding the same record objects, minus duplicates.

        Raises:
            KeyError: If a record lacks one of the required fields.
            TypeError: If an identifying value is unhashable.
        """
        unique_data = []
        seen = set()
        for item in data:
            identifier = tuple(item[field] for field in self.REQUIRED_FIELDS)
            if identifier not in seen:
                seen.add(identifier)
                unique_data.append(item)
        return unique_data

    def fill_missing_values(self, data):
        """Insert a placeholder for every required field a record lacks.

        The records are modified in place.  Keys that are present are left
        untouched, even when their value is ``None`` or empty.

        Args:
            data: Iterable of mutable mapping records.

        Returns:
            The same *data* object that was passed in.
        """
        for item in data:
            for field, placeholder in self.DEFAULT_VALUES.items():
                item.setdefault(field, placeholder)
        return data

    def convert_data_types(self, data):
        """Convert the ``user`` value of every record to ``str``.

        The records are modified in place.  Further conversions can be added
        here as needed.

        Args:
            data: Iterable of mutable mapping records containing ``user``.

        Returns:
            The same *data* object that was passed in.

        Raises:
            KeyError: If a record has no ``user`` key.
        """
        for item in data:
            item['user'] = str(item['user'])
        return data
|