""" 数据治理标准化单元测试 覆盖格式转换、数据质量验证和清洗功能 """ import unittest from unittest.mock import Mock, patch import json from datetime import datetime from src.governance.standardizer import DataStandardizer from src.governance.validator import DataValidator from src.governance.cleaner import DataCleaner class TestDataStandardizer(unittest.TestCase): """数据标准化测试""" def setUp(self): self.standardizer = DataStandardizer() def test_temperature_format_standardization(self): """测试温度格式标准化""" raw_temperatures = [ "25.5°C", "25.5 C", "77.0 F", "298.15 K" ] standardized = self.standardizer.standardize_temperature(raw_temperatures) # 所有温度应该转换为摄氏度 for temp in standardized: self.assertEqual(temp["unit"], "celsius") self.assertIsInstance(temp["value"], (int, float)) def test_timestamp_format_standardization(self): """测试时间戳格式标准化""" raw_timestamps = [ "2026-06-16 12:00:00", "2026/06/16 12:00", "June 16, 2026 12:00 PM", "16-Jun-2026 12:00" ] standardized = self.standardizer.standardize_timestamp(raw_timestamps) # 所有时间戳应该转换为ISO格式 for ts in standardized: try: datetime.fromisoformat(ts) self.assertTrue(True) # 验证ISO格式 except ValueError: self.fail(f"Invalid ISO timestamp: {ts}") def test_device_id_standardization(self): """测试设备ID标准化""" raw_device_ids = [ "device-001", "Device_002", "DEVICE.003", "sensor@04" ] standardized = self.standardizer.standardize_device_id(raw_device_ids) # 所有设备ID应该标准化为小写加下划线 for device_id in standardized: self.assertIn("_", device_id) self.assertTrue(device_id.islower()) def test_data_format_conversion(self): """测试数据格式转换""" input_data = { "device_id": "DEVICE-001", "temperature": "25.5°C", "timestamp": "2026-06-16 12:00:00", "status": "active", "battery": "85%" } converted = self.standardizer.convert_data_format(input_data) # 验证转换后的格式 self.assertEqual(converted["device_id"], "device_001") self.assertEqual(converted["temperature"], 25.5) self.assertEqual(converted["unit"], "celsius") self.assertTrue(datetime.fromisoformat(converted["timestamp"])) self.assertEqual(converted["status"], "active") self.assertEqual(converted["battery_level"], 85) class TestDataValidator(unittest.TestCase): """数据验证测试""" def setUp(self): self.validator = DataValidator() def test_data_range_validation(self): """ testData范围验证""" valid_data = { "temperature": 25.5, # 合理温度 "humidity": 60.2, # 合理湿度 "pressure": 1013.25 # 合理气压 } result = self.validator.validate_ranges(valid_data) self.assertTrue(result["valid"]) def test_invalid_range_validation(self): """测试无效数据范围验证""" invalid_data = { "temperature": 200.0, # 过高温度 "humidity": 150.0, # 超过100%湿度 "pressure": 0.0 # 过低气压 } result = self.validator.validate_ranges(invalid_data) self.assertFalse(result["valid"]) self.assertGreater(len(result["errors"]), 0) def test_data_completeness_validation(self): """测试数据完整性验证""" incomplete_data = { "device_id": "device_001", # 缺少必需的timestamp字段 "temperature": 25.5 } required_fields = ["device_id", "timestamp", "temperature"] result = self.validator.validate_completeness(incomplete_data, required_fields) self.assertFalse(result["valid"]) self.assertIn("Missing required fields", result["errors"]) def test_data_type_validation(self): """测试数据类型验证""" type_mismatch_data = { "device_id": "device_001", "timestamp": "2026-06-16T12:00:00Z", "temperature": "25.5", # 字符串而不是数字 "status": 1 # 数字而不是字符串 } expected_types = { "device_id": str, "timestamp": str, "temperature": float, "status": str } result = self.validator.validate_types(type_mismatch_data, expected_types) self.assertFalse(result["valid"]) self.assertIn("Type mismatch", result["errors"]) class TestDataCleaner(unittest.TestCase): """数据清洗测试""" def setUp(self): self.cleaner = DataCleaner() def test_outlier_removal(self): """测试异常值移除""" data_with_outliers = [ {"temperature": 25.5, "device_id": "dev001"}, {"temperature": 200.0, "device_id": "dev002"}, # 异常高温 {"temperature": 25.6, "device_id": "dev003"}, {"temperature": -50.0, "device_id": "dev004"}, # 异常低温 {"temperature": 25.7, "device_id": "dev005"} ] cleaned_data = self.cleaner.remove_outliers(data_with_outliers, "temperature", std_dev_threshold=3) # 异常值应该被移除 self.assertEqual(len(cleaned_data), 3) temperatures = [d["temperature"] for d in cleaned_data] self.assertNotIn(200.0, temperatures) self.assertNotIn(-50.0, temperatures) def test_duplicate_removal(self): """测试重复数据移除""" data_with_duplicates = [ {"device_id": "dev001", "timestamp": "2026-06-16T12:00:00Z", "value": 100}, {"device_id": "dev001", "timestamp": "2026-06-16T12:00:00Z", "value": 100}, # 重复 {"device_id": "dev002", "timestamp": "2026-06-16T12:01:00Z", "value": 200}, {"device_id": "dev001", "timestamp": "2026-06-16T12:00:00Z", "value": 100}, # 重复 ] cleaned_data = self.cleaner.remove_duplicates(data_with_duplicates, ["device_id", "timestamp"]) # 应该只保留唯一的记录 self.assertEqual(len(cleaned_data), 2) def test_data_interpolation(self): """测试数据插值""" incomplete_data = [ {"device_id": "dev001", "timestamp": "2026-06-16T12:00:00Z", "value": 100}, {"device_id": "dev001", "timestamp": "2026-06-16T12:02:00Z", "value": 120}, {"device_id": "dev001", "timestamp": "2026-06-16T12:04:00Z", "value": None}, # 缺失值 {"device_id": "dev001", "timestamp": "2026-06-16T12:06:00Z", "value": 140} ] interpolated_data = self.cleaner.interpolate_missing_values(incomplete_data, "value") # 缺失值应该被插值 interpolated_value = next(d["value"] for d in interpolated_data if d["value"] is not None and d["timestamp"] == "2026-06-16T12:04:00Z") self.assertEqual(interpolated_value, 130) # 线性插值 def test_data_normalization(self): """测试数据归一化""" raw_data = [ {"device_id": "dev001", "value": 100}, {"device_id": "dev002", "value": 200}, {"device_id": "dev003", "value": 300} ] normalized_data = self.cleaner.normalize_data(raw_data, "value") # 检查归一化后的值在0-1范围内 for item in normalized_data: normalized_value = item["normalized_value"] self.assertGreaterEqual(normalized_value, 0) self.assertLessEqual(normalized_value, 1) if __name__ == '__main__': unittest.main()