智慧水务管理系统 - 精河县供水工程综合管理平台

test_data_governance.py 8.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
  1. """
  2. 数据治理标准化单元测试
  3. 覆盖格式转换、数据质量验证和清洗功能
  4. """
  5. import unittest
  6. from unittest.mock import Mock, patch
  7. import json
  8. from datetime import datetime
  9. from src.governance.standardizer import DataStandardizer
  10. from src.governance.validator import DataValidator
  11. from src.governance.cleaner import DataCleaner
  12. class TestDataStandardizer(unittest.TestCase):
  13. """数据标准化测试"""
  14. def setUp(self):
  15. self.standardizer = DataStandardizer()
  16. def test_temperature_format_standardization(self):
  17. """测试温度格式标准化"""
  18. raw_temperatures = [
  19. "25.5°C",
  20. "25.5 C",
  21. "77.0 F",
  22. "298.15 K"
  23. ]
  24. standardized = self.standardizer.standardize_temperature(raw_temperatures)
  25. # 所有温度应该转换为摄氏度
  26. for temp in standardized:
  27. self.assertEqual(temp["unit"], "celsius")
  28. self.assertIsInstance(temp["value"], (int, float))
  29. def test_timestamp_format_standardization(self):
  30. """测试时间戳格式标准化"""
  31. raw_timestamps = [
  32. "2026-06-16 12:00:00",
  33. "2026/06/16 12:00",
  34. "June 16, 2026 12:00 PM",
  35. "16-Jun-2026 12:00"
  36. ]
  37. standardized = self.standardizer.standardize_timestamp(raw_timestamps)
  38. # 所有时间戳应该转换为ISO格式
  39. for ts in standardized:
  40. try:
  41. datetime.fromisoformat(ts)
  42. self.assertTrue(True) # 验证ISO格式
  43. except ValueError:
  44. self.fail(f"Invalid ISO timestamp: {ts}")
  45. def test_device_id_standardization(self):
  46. """测试设备ID标准化"""
  47. raw_device_ids = [
  48. "device-001",
  49. "Device_002",
  50. "DEVICE.003",
  51. "sensor@04"
  52. ]
  53. standardized = self.standardizer.standardize_device_id(raw_device_ids)
  54. # 所有设备ID应该标准化为小写加下划线
  55. for device_id in standardized:
  56. self.assertIn("_", device_id)
  57. self.assertTrue(device_id.islower())
  58. def test_data_format_conversion(self):
  59. """测试数据格式转换"""
  60. input_data = {
  61. "device_id": "DEVICE-001",
  62. "temperature": "25.5°C",
  63. "timestamp": "2026-06-16 12:00:00",
  64. "status": "active",
  65. "battery": "85%"
  66. }
  67. converted = self.standardizer.convert_data_format(input_data)
  68. # 验证转换后的格式
  69. self.assertEqual(converted["device_id"], "device_001")
  70. self.assertEqual(converted["temperature"], 25.5)
  71. self.assertEqual(converted["unit"], "celsius")
  72. self.assertTrue(datetime.fromisoformat(converted["timestamp"]))
  73. self.assertEqual(converted["status"], "active")
  74. self.assertEqual(converted["battery_level"], 85)
  75. class TestDataValidator(unittest.TestCase):
  76. """数据验证测试"""
  77. def setUp(self):
  78. self.validator = DataValidator()
  79. def test_data_range_validation(self):
  80. """ testData范围验证"""
  81. valid_data = {
  82. "temperature": 25.5, # 合理温度
  83. "humidity": 60.2, # 合理湿度
  84. "pressure": 1013.25 # 合理气压
  85. }
  86. result = self.validator.validate_ranges(valid_data)
  87. self.assertTrue(result["valid"])
  88. def test_invalid_range_validation(self):
  89. """测试无效数据范围验证"""
  90. invalid_data = {
  91. "temperature": 200.0, # 过高温度
  92. "humidity": 150.0, # 超过100%湿度
  93. "pressure": 0.0 # 过低气压
  94. }
  95. result = self.validator.validate_ranges(invalid_data)
  96. self.assertFalse(result["valid"])
  97. self.assertGreater(len(result["errors"]), 0)
  98. def test_data_completeness_validation(self):
  99. """测试数据完整性验证"""
  100. incomplete_data = {
  101. "device_id": "device_001",
  102. # 缺少必需的timestamp字段
  103. "temperature": 25.5
  104. }
  105. required_fields = ["device_id", "timestamp", "temperature"]
  106. result = self.validator.validate_completeness(incomplete_data, required_fields)
  107. self.assertFalse(result["valid"])
  108. self.assertIn("Missing required fields", result["errors"])
  109. def test_data_type_validation(self):
  110. """测试数据类型验证"""
  111. type_mismatch_data = {
  112. "device_id": "device_001",
  113. "timestamp": "2026-06-16T12:00:00Z",
  114. "temperature": "25.5", # 字符串而不是数字
  115. "status": 1 # 数字而不是字符串
  116. }
  117. expected_types = {
  118. "device_id": str,
  119. "timestamp": str,
  120. "temperature": float,
  121. "status": str
  122. }
  123. result = self.validator.validate_types(type_mismatch_data, expected_types)
  124. self.assertFalse(result["valid"])
  125. self.assertIn("Type mismatch", result["errors"])
  126. class TestDataCleaner(unittest.TestCase):
  127. """数据清洗测试"""
  128. def setUp(self):
  129. self.cleaner = DataCleaner()
  130. def test_outlier_removal(self):
  131. """测试异常值移除"""
  132. data_with_outliers = [
  133. {"temperature": 25.5, "device_id": "dev001"},
  134. {"temperature": 200.0, "device_id": "dev002"}, # 异常高温
  135. {"temperature": 25.6, "device_id": "dev003"},
  136. {"temperature": -50.0, "device_id": "dev004"}, # 异常低温
  137. {"temperature": 25.7, "device_id": "dev005"}
  138. ]
  139. cleaned_data = self.cleaner.remove_outliers(data_with_outliers, "temperature", std_dev_threshold=3)
  140. # 异常值应该被移除
  141. self.assertEqual(len(cleaned_data), 3)
  142. temperatures = [d["temperature"] for d in cleaned_data]
  143. self.assertNotIn(200.0, temperatures)
  144. self.assertNotIn(-50.0, temperatures)
  145. def test_duplicate_removal(self):
  146. """测试重复数据移除"""
  147. data_with_duplicates = [
  148. {"device_id": "dev001", "timestamp": "2026-06-16T12:00:00Z", "value": 100},
  149. {"device_id": "dev001", "timestamp": "2026-06-16T12:00:00Z", "value": 100}, # 重复
  150. {"device_id": "dev002", "timestamp": "2026-06-16T12:01:00Z", "value": 200},
  151. {"device_id": "dev001", "timestamp": "2026-06-16T12:00:00Z", "value": 100}, # 重复
  152. ]
  153. cleaned_data = self.cleaner.remove_duplicates(data_with_duplicates, ["device_id", "timestamp"])
  154. # 应该只保留唯一的记录
  155. self.assertEqual(len(cleaned_data), 2)
  156. def test_data_interpolation(self):
  157. """测试数据插值"""
  158. incomplete_data = [
  159. {"device_id": "dev001", "timestamp": "2026-06-16T12:00:00Z", "value": 100},
  160. {"device_id": "dev001", "timestamp": "2026-06-16T12:02:00Z", "value": 120},
  161. {"device_id": "dev001", "timestamp": "2026-06-16T12:04:00Z", "value": None}, # 缺失值
  162. {"device_id": "dev001", "timestamp": "2026-06-16T12:06:00Z", "value": 140}
  163. ]
  164. interpolated_data = self.cleaner.interpolate_missing_values(incomplete_data, "value")
  165. # 缺失值应该被插值
  166. interpolated_value = next(d["value"] for d in interpolated_data if d["value"] is not None and d["timestamp"] == "2026-06-16T12:04:00Z")
  167. self.assertEqual(interpolated_value, 130) # 线性插值
  168. def test_data_normalization(self):
  169. """测试数据归一化"""
  170. raw_data = [
  171. {"device_id": "dev001", "value": 100},
  172. {"device_id": "dev002", "value": 200},
  173. {"device_id": "dev003", "value": 300}
  174. ]
  175. normalized_data = self.cleaner.normalize_data(raw_data, "value")
  176. # 检查归一化后的值在0-1范围内
  177. for item in normalized_data:
  178. normalized_value = item["normalized_value"]
  179. self.assertGreaterEqual(normalized_value, 0)
  180. self.assertLessEqual(normalized_value, 1)
  181. if __name__ == '__main__':
  182. unittest.main()