"""
Unit tests for data-governance standardization.

Covers format conversion, data-quality validation, and cleaning.
"""
- import unittest
- from unittest.mock import Mock, patch
- import json
- from datetime import datetime
- from src.governance.standardizer import DataStandardizer
- from src.governance.validator import DataValidator
- from src.governance.cleaner import DataCleaner
-
-
- class TestDataStandardizer(unittest.TestCase):
- """数据标准化测试"""
-
- def setUp(self):
- self.standardizer = DataStandardizer()
-
- def test_temperature_format_standardization(self):
- """测试温度格式标准化"""
- raw_temperatures = [
- "25.5°C",
- "25.5 C",
- "77.0 F",
- "298.15 K"
- ]
-
- standardized = self.standardizer.standardize_temperature(raw_temperatures)
-
- # 所有温度应该转换为摄氏度
- for temp in standardized:
- self.assertEqual(temp["unit"], "celsius")
- self.assertIsInstance(temp["value"], (int, float))
-
- def test_timestamp_format_standardization(self):
- """测试时间戳格式标准化"""
- raw_timestamps = [
- "2026-06-16 12:00:00",
- "2026/06/16 12:00",
- "June 16, 2026 12:00 PM",
- "16-Jun-2026 12:00"
- ]
-
- standardized = self.standardizer.standardize_timestamp(raw_timestamps)
-
- # 所有时间戳应该转换为ISO格式
- for ts in standardized:
- try:
- datetime.fromisoformat(ts)
- self.assertTrue(True) # 验证ISO格式
- except ValueError:
- self.fail(f"Invalid ISO timestamp: {ts}")
-
- def test_device_id_standardization(self):
- """测试设备ID标准化"""
- raw_device_ids = [
- "device-001",
- "Device_002",
- "DEVICE.003",
- "sensor@04"
- ]
-
- standardized = self.standardizer.standardize_device_id(raw_device_ids)
-
- # 所有设备ID应该标准化为小写加下划线
- for device_id in standardized:
- self.assertIn("_", device_id)
- self.assertTrue(device_id.islower())
-
- def test_data_format_conversion(self):
- """测试数据格式转换"""
- input_data = {
- "device_id": "DEVICE-001",
- "temperature": "25.5°C",
- "timestamp": "2026-06-16 12:00:00",
- "status": "active",
- "battery": "85%"
- }
-
- converted = self.standardizer.convert_data_format(input_data)
-
- # 验证转换后的格式
- self.assertEqual(converted["device_id"], "device_001")
- self.assertEqual(converted["temperature"], 25.5)
- self.assertEqual(converted["unit"], "celsius")
- self.assertTrue(datetime.fromisoformat(converted["timestamp"]))
- self.assertEqual(converted["status"], "active")
- self.assertEqual(converted["battery_level"], 85)
-
-
class TestDataValidator(unittest.TestCase):
    """Tests for the range, completeness and type checks of ``DataValidator``."""

    def setUp(self):
        """Give each test its own validator instance."""
        self.validator = DataValidator()

    def test_data_range_validation(self):
        """Readings with plausible values should be reported as valid."""
        readings = dict(
            temperature=25.5,  # plausible temperature
            humidity=60.2,  # plausible humidity
            pressure=1013.25,  # plausible pressure
        )

        outcome = self.validator.validate_ranges(readings)

        self.assertTrue(outcome["valid"])

    def test_invalid_range_validation(self):
        """Implausible readings should be rejected with at least one error."""
        readings = dict(
            temperature=200.0,  # temperature too high
            humidity=150.0,  # humidity above 100%
            pressure=0.0,  # pressure too low
        )

        outcome = self.validator.validate_ranges(readings)

        self.assertFalse(outcome["valid"])
        self.assertGreater(len(outcome["errors"]), 0)

    def test_data_completeness_validation(self):
        """A record lacking a required field should fail the completeness check."""
        required_fields = ["device_id", "timestamp", "temperature"]
        # The required "timestamp" field is deliberately left out.
        record = dict(device_id="device_001", temperature=25.5)

        outcome = self.validator.validate_completeness(record, required_fields)

        self.assertFalse(outcome["valid"])
        self.assertIn("Missing required fields", outcome["errors"])

    def test_data_type_validation(self):
        """Fields holding the wrong Python type should fail the type check."""
        expected_types = {
            "device_id": str,
            "timestamp": str,
            "temperature": float,
            "status": str,
        }
        record = dict(
            device_id="device_001",
            timestamp="2026-06-16T12:00:00Z",
            temperature="25.5",  # a string instead of a number
            status=1,  # a number instead of a string
        )

        outcome = self.validator.validate_types(record, expected_types)

        self.assertFalse(outcome["valid"])
        self.assertIn("Type mismatch", outcome["errors"])
-
-
- class TestDataCleaner(unittest.TestCase):
- """数据清洗测试"""
-
- def setUp(self):
- self.cleaner = DataCleaner()
-
- def test_outlier_removal(self):
- """测试异常值移除"""
- data_with_outliers = [
- {"temperature": 25.5, "device_id": "dev001"},
- {"temperature": 200.0, "device_id": "dev002"}, # 异常高温
- {"temperature": 25.6, "device_id": "dev003"},
- {"temperature": -50.0, "device_id": "dev004"}, # 异常低温
- {"temperature": 25.7, "device_id": "dev005"}
- ]
-
- cleaned_data = self.cleaner.remove_outliers(data_with_outliers, "temperature", std_dev_threshold=3)
-
- # 异常值应该被移除
- self.assertEqual(len(cleaned_data), 3)
- temperatures = [d["temperature"] for d in cleaned_data]
- self.assertNotIn(200.0, temperatures)
- self.assertNotIn(-50.0, temperatures)
-
- def test_duplicate_removal(self):
- """测试重复数据移除"""
- data_with_duplicates = [
- {"device_id": "dev001", "timestamp": "2026-06-16T12:00:00Z", "value": 100},
- {"device_id": "dev001", "timestamp": "2026-06-16T12:00:00Z", "value": 100}, # 重复
- {"device_id": "dev002", "timestamp": "2026-06-16T12:01:00Z", "value": 200},
- {"device_id": "dev001", "timestamp": "2026-06-16T12:00:00Z", "value": 100}, # 重复
- ]
-
- cleaned_data = self.cleaner.remove_duplicates(data_with_duplicates, ["device_id", "timestamp"])
-
- # 应该只保留唯一的记录
- self.assertEqual(len(cleaned_data), 2)
-
- def test_data_interpolation(self):
- """测试数据插值"""
- incomplete_data = [
- {"device_id": "dev001", "timestamp": "2026-06-16T12:00:00Z", "value": 100},
- {"device_id": "dev001", "timestamp": "2026-06-16T12:02:00Z", "value": 120},
- {"device_id": "dev001", "timestamp": "2026-06-16T12:04:00Z", "value": None}, # 缺失值
- {"device_id": "dev001", "timestamp": "2026-06-16T12:06:00Z", "value": 140}
- ]
-
- interpolated_data = self.cleaner.interpolate_missing_values(incomplete_data, "value")
-
- # 缺失值应该被插值
- interpolated_value = next(d["value"] for d in interpolated_data if d["value"] is not None and d["timestamp"] == "2026-06-16T12:04:00Z")
- self.assertEqual(interpolated_value, 130) # 线性插值
-
- def test_data_normalization(self):
- """测试数据归一化"""
- raw_data = [
- {"device_id": "dev001", "value": 100},
- {"device_id": "dev002", "value": 200},
- {"device_id": "dev003", "value": 300}
- ]
-
- normalized_data = self.cleaner.normalize_data(raw_data, "value")
-
- # 检查归一化后的值在0-1范围内
- for item in normalized_data:
- normalized_value = item["normalized_value"]
- self.assertGreaterEqual(normalized_value, 0)
- self.assertLessEqual(normalized_value, 1)
-
-
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|