DataProcessor.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
# -*- coding: utf-8 -*-
"""
@author: zsc
@time: 2024/11/18
@desc: Data processing
"""
# Data preprocessing module
  8. class DataPreprocessor:
  9. def __init__(self, data):
  10. self.data = data
  11. def preprocess(self):
  12. # 更多的数据预处理函数
  13. processed_data = [
  14. item for item in self.data
  15. if 'user' in item and 'action' in item and 'product' in item and 'channel' in item
  16. ]
  17. processed_data = self.remove_duplicates(processed_data)
  18. processed_data = self.fill_missing_values(processed_data)
  19. processed_data = self.convert_data_types(processed_data)
  20. return processed_data
  21. def remove_duplicates(self, data):
  22. # 去除重复数据
  23. unique_data = []
  24. seen = set()
  25. for item in data:
  26. # 创建一个元组,包含用户、行为、产品和渠道,用于识别唯一记录
  27. identifier = (item['user'], item['action'], item['product'], item['channel'])
  28. if identifier not in seen:
  29. seen.add(identifier)
  30. unique_data.append(item)
  31. return unique_data
  32. def fill_missing_values(self, data):
  33. # 填充缺失值
  34. for item in data:
  35. if 'user' not in item:
  36. item['user'] = 'UnknownUser'
  37. if 'action' not in item:
  38. item['action'] = 'UnknownAction'
  39. if 'product' not in item:
  40. item['product'] = 'UnknownProduct'
  41. if 'channel' not in item:
  42. item['channel'] = 'UnknownChannel'
  43. return data
  44. def convert_data_types(self, data):
  45. # 转换数据类型
  46. for item in data:
  47. # 假设我们需要将用户ID转换为字符串类型
  48. item['user'] = str(item['user'])
  49. # 其他数据类型转换可以根据需要添加
  50. return data