# 全局负载均衡器示例
import hashlib
import time
from datetime import datetime

class GlobalLoadBalancer:
    def __init__(self, datacenters):
        self.datacenters = datacenters
        self.health_checker = HealthChecker(datacenters)
        self.dns_manager = DNSManager()
        self.session_manager = SessionManager()
        
    def route_request(self, request):
        # 根据用户位置和数据中心健康状态选择最佳数据中心
        client_location = self.get_client_location(request)
        healthy_dcs = self.health_checker.get_healthy_datacenters()
        
        if not healthy_dcs:
            raise Exception("No healthy datacenters available")
            
        # 选择最近的健康数据中心
        best_dc = self.select_best_datacenter(client_location, healthy_dcs)
        
        # 记录会话信息
        session_id = self.session_manager.create_session(
            request.client_ip, best_dc.id)
            
        return {
            "datacenter": best_dc,
            "session_id": session_id,
            "endpoint": best_dc.endpoint
        }
        
    def get_client_location(self, request):
        # 通过IP地理位置库获取客户端位置
        # 这里简化处理
        return "us-east"
        
    def select_best_datacenter(self, client_location, healthy_dcs):
        # 根据延迟、负载和地理位置选择最佳数据中心
        best_dc = None
        min_score = float('inf')
        
        for dc in healthy_dcs:
            score = self.calculate_datacenter_score(client_location, dc)
            if score < min_score:
                min_score = score
                best_dc = dc
                
        return best_dc
        
    def calculate_datacenter_score(self, client_location, dc):
        # 综合评分：地理位置距离 + 当前负载 + 历史延迟
        distance_score = self.calculate_distance_score(client_location, dc.location)
        load_score = dc.current_load / dc.max_capacity
        latency_score = dc.average_latency / 1000  # 假设最大延迟1000ms
        
        return distance_score * 0.5 + load_score * 0.3 + latency_score * 0.2
        
    def calculate_distance_score(self, client_location, dc_location):
        # 简化的距离计算
        distance_map = {
            ("us-east", "us-east"): 0,
            ("us-east", "us-west"): 4000,
            ("us-east", "eu-west"): 6000,
            ("us-west", "us-west"): 0,
            ("us-west", "eu-west"): 9000,
            ("eu-west", "eu-west"): 0
        }
        
        distance = distance_map.get((client_location, dc_location), 10000)
        return distance / 10000  # 归一化到0-1范围

class Datacenter:
    def __init__(self, dc_id, location, endpoint):
        self.id = dc_id
        self.location = location
        self.endpoint = endpoint
        self.current_load = 0
        self.max_capacity = 1000
        self.average_latency = 50
        self.status = "healthy"
        
    def update_metrics(self, load, latency):
        self.current_load = load
        self.average_latency = latency

class HealthChecker:
    def __init__(self, datacenters):
        self.datacenters = datacenters
        
    def get_healthy_datacenters(self):
        healthy_dcs = []
        for dc in self.datacenters:
            if self.is_datacenter_healthy(dc):
                healthy_dcs.append(dc)
        return healthy_dcs
        
    def is_datacenter_healthy(self, dc):
        # 检查数据中心健康状态
        try:
            # 发送健康检查请求
            response = self.send_health_check(dc.endpoint)
            if response.status_code == 200:
                # 更新数据中心指标
                dc.update_metrics(
                    response.json().get('load', 0),
                    response.json().get('latency', 50)
                )
                return True
            else:
                dc.status = "unhealthy"
                return False
        except Exception as e:
            print(f"Health check failed for {dc.id}: {e}")
            dc.status = "unhealthy"
            return False
            
    def send_health_check(self, endpoint):
        import requests
        return requests.get(f"{endpoint}/health", timeout=5)

2. 分布式会话管理

# 分布式会话管理示例
import json
import time
import hashlib
from datetime import datetime

class DistributedSessionManager:
    def __init__(self, redis_cluster):
        self.redis_cluster = redis_cluster
        self.session_ttl = 3600  # 1小时
        
    def create_session(self, client_ip, datacenter_id):
        session_id = self.generate_session_id(client_ip, datacenter_id)
        
        session_data = {
            "session_id": session_id,
            "client_ip": client_ip,
            "datacenter_id": datacenter_id,
            "created_time": datetime.now().isoformat(),
            "last_accessed": datetime.now().isoformat(),
            "data": {}
        }
        
        # 在所有数据中心存储会话数据
        self.store_session_in_all_dcs(session_id, session_data)
        
        return session_id
        
    def generate_session_id(self, client_ip, datacenter_id):
        # 生成全局唯一的会话ID
        timestamp = str(int(time.time() * 1000))
        unique_string = f"{client_ip}:{datacenter_id}:{timestamp}"
        return hashlib.sha256(unique_string.encode()).hexdigest()
        
    def store_session_in_all_dcs(self, session_id, session_data):
        # 在所有数据中心存储会话数据
        for dc_client in self.redis_cluster.get_all_clients():
            try:
                dc_client.setex(
                    f"session:{session_id}", 
                    self.session_ttl, 
                    json.dumps(session_data)
                )
            except Exception as e:
                print(f"Failed to store session in DC: {e}")
                
    def get_session(self, session_id):
        # 从最近的数据中心获取会话数据
        for dc_client in self.redis_cluster.get_clients_by_proximity():
            try:
                session_data = dc_client.get(f"session:{session_id}")
                if session_data:
                    session_obj = json.loads(session_data)
                    # 更新最后访问时间
                    session_obj["last_accessed"] = datetime.now().isoformat()
                    self.update_session(session_id, session_obj)
                    return session_obj
            except Exception as e:
                print(f"Failed to get session from DC: {e}")
                
        return None
        
    def update_session(self, session_id, session_data):
        # 更新会话数据
        self.store_session_in_all_dcs(session_id, session_data)
        
    def delete_session(self, session_id):
        # 删除会话数据
        for dc_client in self.redis_cluster.get_all_clients():
            try:
                dc_client.delete(f"session:{session_id}")
            except Exception as e:
                print(f"Failed to delete session from DC: {e}")

class RedisClusterManager:
    def __init__(self, datacenter_configs):
        self.datacenters = {}
        self.client_proximity = {}  # 客户端到数据中心的 proximity
        
        for dc_config in datacenter_configs:
            self.datacenters[dc_config['id']] = redis.Redis(
                host=dc_config['host'],
                port=dc_config['port'],
                db=dc_config['db']
            )
            
    def get_all_clients(self):
        return list(self.datacenters.values())
        
    def get_clients_by_proximity(self):
        # 根据客户端位置返回按距离排序的Redis客户端
        # 这里简化处理，实际应用中需要根据客户端IP确定最近的数据中心
        return list(self.datacenters.values())

3. 全局唯一标识符

# 全局唯一标识符生成示例
import time
import threading
from datetime import datetime

class GlobalIDGenerator:
    def __init__(self, datacenter_id, worker_id):
        self.datacenter_id = datacenter_id & 0x1F  # 5位数据中心ID
        self.worker_id = worker_id & 0x1F          # 5位工作节点ID
        self.sequence = 0                          # 12位序列号
        self.last_timestamp = -1
        self.lock = threading.Lock()
        
        # Twitter Snowflake算法
        # 1位符号位 + 41位时间戳 + 5位数据中心ID + 5位工作节点ID + 12位序列号 = 64位
        
    def generate_id(self):
        with self.lock:
            timestamp = self._get_timestamp()
            
            if timestamp < self.last_timestamp:
                raise Exception("Clock moved backwards")
                
            if timestamp == self.last_timestamp:
                # 同一毫秒内，序列号自增
                self.sequence = (self.sequence + 1) & 0xFFF
                if self.sequence == 0:
                    # 序列号用完，等待下一毫秒
                    timestamp = self._wait_next_millis(self.last_timestamp)
            else:
                # 新的毫秒，序列号重置
                self.sequence = 0
                
            self.last_timestamp = timestamp
            
            # 组合ID
            id_value = (
                (timestamp << 22) |
                (self.datacenter_id << 17) |
                (self.worker_id << 12) |
                self.sequence
            )
            
            return id_value
            
    def _get_timestamp(self):
        return int(time.time() * 1000)
        
    def _wait_next_millis(self, last_timestamp):
        timestamp = self._get_timestamp()
        while timestamp <= last_timestamp:
            timestamp = self._get_timestamp()
        return timestamp

# 使用示例
id_generator = GlobalIDGenerator(datacenter_id=1, worker_id=1)
order_id = id_generator.generate_id()
print(f"Generated ID: {order_id}")

Active-Passive架构模式

Active-Passive架构模式是指一个数据中心处于活跃状态处理所有请求，其他数据中心处于备用状态，只有在主数据中心失效时才接管服务。

架构特点

1. 主备模式

一个主数据中心处理所有流量
备用数据中心保持同步但不处理请求
故障时进行主备切换

2. 数据同步

主数据中心向备用数据中心同步数据
通常采用异步或半同步复制
数据丢失风险相对较高

3. 故障切换

需要手动或自动检测主数据中心故障
切换过程可能需要停机时间
切换后备用数据中心成为新的主数据中心

实现方案

1. 数据同步机制

# Active-Passive数据同步示例
import time
import threading
from datetime import datetime

class ActivePassiveReplicator:
    def __init__(self, active_db, passive_db, sync_interval=1):
        self.active_db = active_db
        self.passive_db = passive_db
        self.sync_interval = sync_interval
        self.is_running = False
        self.last_sync_time = None
        self.replication_lag = 0
        
    def start_replication(self):
        self.is_running = True
        replication_thread = threading.Thread(target=self._replication_loop)
        replication_thread.daemon = True
        replication_thread.start()
        
    def stop_replication(self):
        self.is_running = False
        
    def _replication_loop(self):
        while self.is_running:
            try:
                start_time = time.time()
                
                # 执行数据同步
                self._sync_data()
                
                end_time = time.time()
                self.last_sync_time = datetime.now()
                self.replication_lag = end_time - start_time
                
                # 等待下次同步
                time.sleep(self.sync_interval)
                
            except Exception as e:
                print(f"Replication error: {e}")
                time.sleep(10)  # 错误时等待更长时间
                
    def _sync_data(self):
        # 获取主数据库的变更日志
        changes = self.active_db.get_recent_changes(
            since=self.last_sync_time or datetime.min)
            
        if changes:
            # 应用变更到备用数据库
            for change in changes:
                self.passive_db.apply_change(change)
                
            print(f"Replicated {len(changes)} changes")
            
    def get_replication_status(self):
        return {
            "is_running": self.is_running,
            "last_sync_time": self.last_sync_time,
            "replication_lag": self.replication_lag,
            "active_db_status": self.active_db.get_status(),
            "passive_db_status": self.passive_db.get_status()
        }
        
    def promote_passive_to_active(self):
        # 将备用数据库提升为主数据库
        self.stop_replication()
        self.passive_db.promote_to_active()
        print("Passive database promoted to active")

class DatabaseAdapter:
    def __init__(self, connection_string):
        self.connection_string = connection_string
        self.connection = self._connect()
        
    def _connect(self):
        # 数据库连接逻辑
        pass
        
    def get_recent_changes(self, since):
        # 获取自指定时间以来的数据库变更
        query = """
        SELECT * FROM change_log 
        WHERE change_time > %s 
        ORDER BY change_time ASC
        """
        # 执行查询并返回变更记录
        pass
        
    def apply_change(self, change):
        # 应用变更到数据库
        pass
        
    def get_status(self):
        # 获取数据库状态
        return {
            "connected": True,
            "role": "active" if self.is_active() else "passive"
        }
        
    def promote_to_active(self):
        # 将数据库提升为主数据库
        pass
        
    def is_active(self):
        # 检查数据库是否为主数据库
        pass

2. 故障检测与切换

# Active-Passive故障检测与切换示例
import time
import threading
from datetime import datetime

class FailoverManager:
    def __init__(self, active_dc, passive_dc, health_check_interval=30):
        self.active_dc = active_dc
        self.passive_dc = passive_dc
        self.health_check_interval = health_check_interval
        self.is_monitoring = False
        self.failover_threshold = 3  # 连续3次健康检查失败触发切换
        self.failure_count = 0
        
    def start_monitoring(self):
        self.is_monitoring = True
        monitor_thread = threading.Thread(target=self._monitoring_loop)
        monitor_thread.daemon = True
        monitor_thread.start()
        
    def stop_monitoring(self):
        self.is_monitoring = False
        
    def _monitoring_loop(self):
        while self.is_monitoring:
            try:
                if not self._is_active_dc_healthy():
                    self.failure_count += 1
                    print(f"Active DC health check failed ({self.failure_count}/{self.failover_threshold})")
                    
                    if self.failure_count >= self.failover_threshold:
                        print("Failover threshold reached, initiating failover...")
                        self._initiate_failover()
                else:
                    # 健康检查通过，重置失败计数
                    if self.failure_count > 0:
                        print("Active DC health restored")
                        self.failure_count = 0
                        
            except Exception as e:
                print(f"Monitoring error: {e}")
                
            time.sleep(self.health_check_interval)
            
    def _is_active_dc_healthy(self):
        try:
            # 多重健康检查
            health_checks = [
                self._check_network_connectivity(),
                self._check_database_status(),
                self._check_application_status(),
                self._check_response_time()
            ]
            
            # 所有检查都通过才算健康
            return all(health_checks)
            
        except Exception as e:
            print(f"Health check error: {e}")
            return False
            
    def _check_network_connectivity(self):
        # 检查网络连通性
        import socket
        try:
            socket.create_connection((self.active_dc.host, self.active_dc.port), timeout=5)
            return True
        except:
            return False
            
    def _check_database_status(self):
        # 检查数据库状态
        try:
            return self.active_dc.database.is_healthy()
        except:
            return False
            
    def _check_application_status(self):
        # 检查应用状态
        try:
            response = self.active_dc.http_client.get("/health")
            return response.status_code == 200
        except:
            return False
            
    def _check_response_time(self):
        # 检查响应时间
        try:
            start_time = time.time()
            self.active_dc.http_client.get("/health")
            response_time = time.time() - start_time
            return response_time < 2.0  # 2秒以内算正常
        except:
            return False
            
    def _initiate_failover(self):
        print("Initiating failover process...")
        
        # 1. 停止主数据中心的服务
        self._stop_active_services()
        
        # 2. 确保数据同步完成
        self._ensure_data_sync()
        
        # 3. 将备用数据中心提升为主数据中心
        self._promote_passive_dc()
        
        # 4. 更新DNS记录或负载均衡配置
        self._update_routing()
        
        # 5. 启动新主数据中心的服务
        self._start_new_active_services()
        
        # 6. 发送告警通知
        self._send_failover_notification()
        
        print("Failover completed successfully")
        
    def _stop_active_services(self):
        # 停止主数据中心的服务
        try:
            self.active_dc.stop_services()
            print("Active DC services stopped")
        except Exception as e:
            print(f"Error stopping active DC services: {e}")
            
    def _ensure_data_sync(self):
        # 确保数据同步完成
        try:
            self.active_dc.replicator.wait_for_sync_completion(timeout=300)  # 5分钟超时
            print("Data synchronization completed")
        except Exception as e:
            print(f"Data sync timeout or error: {e}")
            
    def _promote_passive_dc(self):
        # 将备用数据中心提升为主数据中心
        try:
            self.passive_dc.promote_to_active()
            print("Passive DC promoted to active")
        except Exception as e:
            print(f"Error promoting passive DC: {e}")
            raise
            
    def _update_routing(self):
        # 更新DNS记录或负载均衡配置
        try:
            dns_manager = DNSManager()
            dns_manager.update_record(
                record_name="api.company.com",
                new_ip=self.passive_dc.public_ip
            )
            print("DNS record updated")
        except Exception as e:
            print(f"Error updating DNS: {e}")
            
    def _start_new_active_services(self):
        # 启动新主数据中心的服务
        try:
            self.passive_dc.start_services()
            print("New active DC services started")
        except Exception as e:
            print(f"Error starting new active DC services: {e}")
            
    def _send_failover_notification(self):
        # 发送故障切换通知
        notification = {
            "event": "datacenter_failover",
            "timestamp": datetime.now().isoformat(),
            "old_active_dc": self.active_dc.id,
            "new_active_dc": self.passive_dc.id,
            "reason": "health_check_failure"
        }
        
        try:
            NotificationService.send(notification)
            print("Failover notification sent")
        except Exception as e:
            print(f"Error sending notification: {e}")

混合架构模式

在实际应用中，很多企业采用混合架构模式，结合Active-Active和Active-Passive的优点，根据业务特点和成本考虑进行灵活设计。

区域性Active-Active + 全球Active-Passive

1. 区域内多活

在同一地理区域内部署多个数据中心，实现区域内多活：

低延迟的网络连接
实时数据同步
自动负载均衡

2. 跨区域主备

在不同地理区域间采用主备模式：

异步数据复制
定期灾难恢复演练
手动或自动故障切换

# 混合架构示例
class HybridArchitecture:
    def __init__(self):
        # 区域内Active-Active数据中心
        self.regional_dcs = [
            Datacenter("us-east-1", "us-east", "primary"),
            Datacenter("us-east-2", "us-east", "primary")
        ]
        
        # 跨区域Active-Passive数据中心
        self.global_dcs = [
            RegionalGroup("us-east", self.regional_dcs, "active"),
            RegionalGroup("eu-west", [
                Datacenter("eu-west-1", "eu-west", "standby")
            ], "passive")
        ]
        
        self.global_lb = GlobalLoadBalancer(self.global_dcs)
        self.regional_lb = RegionalLoadBalancer(self.regional_dcs)
        
    def handle_request(self, request):
        # 首先选择区域
        region = self.global_lb.select_region(request.client_location)
        
        # 在区域内进行负载均衡
        datacenter = self.regional_lb.route_request(region, request)
        
        return {
            "region": region,
            "datacenter": datacenter,
            "endpoint": datacenter.endpoint
        }
        
    def handle_regional_failure(self, failed_region):
        # 处理区域级故障
        if failed_region == "us-east":
            # 切换到备用区域
            self.global_lb.failover_to_region("eu-west")
            print("Failed over to EU region")
            
    def perform_disaster_recovery_drill(self):
        # 定期进行灾难恢复演练
        print("Starting disaster recovery drill...")
        
        # 1. 验证备用区域的健康状态
        eu_healthy = self.global_lb.is_region_healthy("eu-west")
        if not eu_healthy:
            raise Exception("EU region is not healthy for DR drill")
            
        # 2. 模拟主区域故障
        print("Simulating US-East region failure...")
        self.global_lb.simulate_region_failure("us-east")
        
        # 3. 验证故障切换
        print("Verifying failover to EU region...")
        active_region = self.global_lb.get_active_region()
        if active_region != "eu-west":
            raise Exception("Failover verification failed")
            
        # 4. 验证服务可用性
        print("Verifying service availability...")
        self._verify_service_availability()
        
        # 5. 恢复主区域
        print("Restoring US-East region...")
        self.global_lb.restore_region("us-east")
        
        # 6. 切换回主区域
        print("Failing back to US-East region...")
        self.global_lb.failover_to_region("us-east")
        
        print("Disaster recovery drill completed successfully")
        
    def _verify_service_availability(self):
        # 验证服务可用性
        test_requests = [
            {"endpoint": "/api/users", "method": "GET"},
            {"endpoint": "/api/orders", "method": "POST", "data": {"test": "data"}},
            {"endpoint": "/api/products", "method": "GET"}
        ]
        
        for request in test_requests:
            try:
                # 发送测试请求
                response = self._send_test_request(request)
                if response.status_code != 200:
                    raise Exception(f"Service test failed: {request['endpoint']}")
            except Exception as e:
                raise Exception(f"Service verification error: {e}")
                
    def _send_test_request(self, request):
        import requests
        if request['method'] == 'GET':
            return requests.get(f"https://api.company.com{request['endpoint']}")
        elif request['method'] == 'POST':
            return requests.post(f"https://api.company.com{request['endpoint']}", 
                               json=request.get('data', {}))

网络架构设计

跨数据中心容灾架构的网络设计是关键，需要考虑延迟、带宽、可靠性等多个因素。

专线连接

# 专线连接管理示例
class DedicatedConnectionManager:
    def __init__(self, connections):
        self.connections = connections
        self.connection_health = {}
        
    def monitor_connections(self):
        for conn in self.connections:
            health_status = self._check_connection_health(conn)
            self.connection_health[conn.id] = health_status
            
            if not health_status['healthy']:
                self._handle_connection_failure(conn)
                
    def _check_connection_health(self, connection):
        checks = {
            'latency': self._check_latency(connection),
            'bandwidth': self._check_bandwidth(connection),
            'packet_loss': self._check_packet_loss(connection),
            'jitter': self._check_jitter(connection)
        }
        
        # 综合健康评分
        health_score = self._calculate_health_score(checks)
        is_healthy = health_score > 0.7  # 70%以上算健康
        
        return {
            'healthy': is_healthy,
            'score': health_score,
            'checks': checks
        }
        
    def _calculate_health_score(self, checks):
        # 加权计算健康评分
        weights = {
            'latency': 0.3,
            'bandwidth': 0.3,
            'packet_loss': 0.2,
            'jitter': 0.2
        }
        
        score = 0
        for check_name, weight in weights.items():
            check_value = checks.get(check_name, 0)
            # 将检查值标准化到0-1范围
            normalized_value = self._normalize_check_value(check_name, check_value)
            score += normalized_value * weight
            
        return score
        
    def _normalize_check_value(self, check_name, value):
        # 根据检查类型将值标准化到0-1范围
        if check_name == 'latency':
            # 延迟越小越好，假设最大可接受延迟为100ms
            return max(0, 1 - (value / 100))
        elif check_name == 'bandwidth':
            # 带宽越高越好，假设最小可接受带宽为100Mbps
            return min(1, value / 1000)  # 假设最大带宽1000Mbps
        elif check_name == 'packet_loss':
            # 丢包率越低越好，假设最大可接受丢包率5%
            return max(0, 1 - (value / 5))
        elif check_name == 'jitter':
            # 抖动越小越好，假设最大可接受抖动50ms
            return max(0, 1 - (value / 50))
        return 0

DNS负载均衡

# DNS负载均衡示例
class DNSLoadBalancer:
    def __init__(self, domain_name, datacenters):
        self.domain_name = domain_name
        self.datacenters = datacenters
        self.dns_provider = self._initialize_dns_provider()
        
    def _initialize_dns_provider(self):
        # 初始化DNS提供商API客户端
        # 这里以CloudFlare为例
        import CloudFlare
        return CloudFlare.CloudFlare(
            email='admin@company.com',
            token='your_api_token'
        )
        
    def update_dns_records(self, healthy_dcs):
        # 更新DNS记录，只包含健康的数据中心
        try:
            # 获取当前DNS记录
            zones = self.dns_provider.zones.get(params={'name': self.domain_name})
            zone_id = zones[0]['id']
            
            dns_records = self.dns_provider.zones.dns_records.get(
                zone_id, params={'name': f"api.{self.domain_name}"})
                
            if dns_records:
                record_id = dns_records[0]['id']
                
                # 构建新的IP地址列表
                new_ips = [dc.public_ip for dc in healthy_dcs]
                
                # 更新DNS记录
                self.dns_provider.zones.dns_records.put(
                    zone_id, record_id,
                    data={
                        'type': 'A',
                        'name': f"api.{self.domain_name}",
                        'content': ','.join(new_ips),
                        'proxied': True
                    }
                )
                
                print(f"DNS records updated with IPs: {new_ips}")
                
        except Exception as e:
            print(f"Failed to update DNS records: {e}")
            
    def get_current_records(self):
        # 获取当前DNS记录
        try:
            zones = self.dns_provider.zones.get(params={'name': self.domain_name})
            zone_id = zones[0]['id']
            
            dns_records = self.dns_provider.zones.dns_records.get(
                zone_id, params={'name': f"api.{self.domain_name}"})
                
            return dns_records[0] if dns_records else None
        except Exception as e:
            print(f"Failed to get DNS records: {e}")
            return None

数据一致性保障

跨数据中心架构中的数据一致性是核心挑战，需要采用合适的策略来保障。

最终一致性模型

# 最终一致性实现示例
import time
import uuid
from datetime import datetime

class EventualConsistencyManager:
    def __init__(self, datacenters):
        self.datacenters = datacenters
        self.conflict_resolver = ConflictResolver()
        
    def write_data(self, data, origin_dc):
        # 1. 在源数据中心写入数据
        write_result = origin_dc.database.write(data)
        
        # 2. 异步复制到其他数据中心
        replication_tasks = []
        for dc in self.datacenters:
            if dc != origin_dc:
                task = self._replicate_async(dc, data, write_result.version)
                replication_tasks.append(task)
                
        # 3. 返回写入结果
        return {
            "success": True,
            "version": write_result.version,
            "replication_tasks": replication_tasks
        }
        
    def _replicate_async(self, target_dc, data, version):
        # 异步复制数据
        def replication_task():
            try:
                # 检查目标数据中心是否已有更新版本
                existing_data = target_dc.database.read(data.id)
                if existing_data and existing_data.version >= version:
                    # 已有更新版本，无需复制
                    return {"status": "skipped", "reason": "newer_version_exists"}
                    
                # 复制数据
                target_dc.database.write(data)
                return {"status": "success"}
                
            except Exception as e:
                return {"status": "failed", "error": str(e)}
                
        # 使用线程池执行异步复制
        import threading
        thread = threading.Thread(target=replication_task)
        thread.start()
        return thread
        
    def read_data(self, data_id, preferred_dc=None):
        # 1. 首先从首选数据中心读取
        if preferred_dc:
            data = preferred_dc.database.read(data_id)
            if data:
                return data
                
        # 2. 如果首选数据中心没有数据，从其他数据中心读取
        for dc in self.datacenters:
            if dc != preferred_dc:
                data = dc.database.read(data_id)
                if data:
                    return data
                    
        return None
        
    def resolve_conflicts(self, data_id):
        # 收集所有数据中心的数据版本
        versions = {}
        for dc in self.datacenters:
            data = dc.database.read(data_id)
            if data:
                versions[dc.id] = data
                
        if len(versions) <= 1:
            # 没有冲突
            return list(versions.values())[0] if versions else None
            
        # 解决冲突
        resolved_data = self.conflict_resolver.resolve(list(versions.values()))
        
        # 将解决后的数据同步到所有数据中心
        for dc in self.datacenters:
            dc.database.write(resolved_data)
            
        return resolved_data

class ConflictResolver:
    def resolve(self, conflicting_versions):
        # 基于时间戳的冲突解决
        latest_version = max(conflicting_versions, key=lambda x: x.timestamp)
        return latest_version
        
    def resolve_with_vector_clocks(self, conflicting_versions):
        # 使用向量时钟解决冲突
        # 这是一个简化的实现
        vector_clocks = [version.vector_clock for version in conflicting_versions]
        
        # 找到最大的向量时钟
        max_clock = self._find_max_vector_clock(vector_clocks)
        
        # 返回对应的版本
        for version in conflicting_versions:
            if version.vector_clock == max_clock:
                return version
                
        return conflicting_versions[0]
        
    def _find_max_vector_clock(self, vector_clocks):
        # 找到最大的向量时钟
        if not vector_clocks:
            return {}
            
        max_clock = {}
        for clock in vector_clocks:
            for dc_id, timestamp in clock.items():
                if dc_id not in max_clock or timestamp > max_clock[dc_id]:
                    max_clock[dc_id] = timestamp
                    
        return max_clock

监控与运维

跨数据中心架构需要完善的监控和运维体系来保障其稳定运行。

全局监控系统

# 全局监控系统示例
import time
import json
from datetime import datetime

class GlobalMonitoringSystem:
    def __init__(self, datacenters):
        self.datacenters = datacenters
        self.alert_manager = AlertManager()
        self.metrics_collector = MetricsCollector()
        
    def start_monitoring(self):
        while True:
            try:
                # 收集各数据中心指标
                metrics = self._collect_all_metrics()
                
                # 分析指标并检测异常
                anomalies = self._detect_anomalies(metrics)
                
                # 发送告警
                if anomalies:
                    self.alert_manager.send_alerts(anomalies)
                    
                # 存储指标数据
                self.metrics_collector.store_metrics(metrics)
                
                time.sleep(60)  # 每分钟收集一次
                
            except Exception as e:
                print(f"Monitoring error: {e}")
                time.sleep(60)
                
    def _collect_all_metrics(self):
        metrics = {}
        for dc in self.datacenters:
            try:
                dc_metrics = self._collect_dc_metrics(dc)
                metrics[dc.id] = dc_metrics
            except Exception as e:
                print(f"Failed to collect metrics from {dc.id}: {e}")
                metrics[dc.id] = {"error": str(e)}
                
        return metrics
        
    def _collect_dc_metrics(self, dc):
        return {
            "timestamp": datetime.now().isoformat(),
            "system": self._collect_system_metrics(dc),
            "application": self._collect_application_metrics(dc),
            "network": self._collect_network_metrics(dc),
            "database": self._collect_database_metrics(dc)
        }
        
    def _collect_system_metrics(self, dc):
        # 收集系统指标（CPU、内存、磁盘等）
        return {
            "cpu_usage": dc.system_monitor.get_cpu_usage(),
            "memory_usage": dc.system_monitor.get_memory_usage(),
            "disk_usage": dc.system_monitor.get_disk_usage(),
            "uptime": dc.system_monitor.get_uptime()
        }
        
    def _collect_application_metrics(self, dc):
        # 收集应用指标（请求数、错误率、响应时间等）
        return {
            "request_count": dc.app_monitor.get_request_count(),
            "error_rate": dc.app_monitor.get_error_rate(),
            "avg_response_time": dc.app_monitor.get_avg_response_time(),
            "active_sessions": dc.app_monitor.get_active_sessions()
        }
        
    def _collect_network_metrics(self, dc):
        # 收集网络指标（带宽、延迟、丢包率等）
        return {
            "bandwidth_usage": dc.network_monitor.get_bandwidth_usage(),
            "latency": dc.network_monitor.get_latency(),
            "packet_loss": dc.network_monitor.get_packet_loss(),
            "connections": dc.network_monitor.get_active_connections()
        }
        
    def _collect_database_metrics(self, dc):
        # 收集数据库指标（连接数、查询性能、复制延迟等）
        return {
            "connections": dc.db_monitor.get_connection_count(),
            "query_performance": dc.db_monitor.get_query_performance(),
            "replication_lag": dc.db_monitor.get_replication_lag(),
            "disk_io": dc.db_monitor.get_disk_io()
        }
        
    def _detect_anomalies(self, metrics):
        anomalies = []
        
        for dc_id, dc_metrics in metrics.items():
            if "error" in dc_metrics:
                anomalies.append({
                    "type": "datacenter_unreachable",
                    "datacenter": dc_id,
                    "error": dc_metrics["error"],
                    "timestamp": datetime.now().isoformat()
                })
                continue
                
            # 检查各项指标是否异常
            system_anomalies = self._detect_system_anomalies(dc_metrics["system"], dc_id)
            app_anomalies = self._detect_app_anomalies(dc_metrics["application"], dc_id)
            network_anomalies = self._detect_network_anomalies(dc_metrics["network"], dc_id)
            db_anomalies = self._detect_db_anomalies(dc_metrics["database"], dc_id)
            
            anomalies.extend(system_anomalies + app_anomalies + network_anomalies + db_anomalies)
            
        return anomalies
        
    def _detect_system_anomalies(self, system_metrics, dc_id):
        anomalies = []
        
        if system_metrics["cpu_usage"] > 90:
            anomalies.append({
                "type": "high_cpu_usage",
                "datacenter": dc_id,
                "value": system_metrics["cpu_usage"],
                "threshold": 90,
                "timestamp": datetime.now().isoformat()
            })
            
        if system_metrics["memory_usage"] > 95:
            anomalies.append({
                "type": "high_memory_usage",
                "datacenter": dc_id,
                "value": system_metrics["memory_usage"],
                "threshold": 95,
                "timestamp": datetime.now().isoformat()
            })
            
        return anomalies
        
    def _detect_app_anomalies(self, app_metrics, dc_id):
        anomalies = []
        
        if app_metrics["error_rate"] > 0.05:  # 5%错误率阈值
            anomalies.append({
                "type": "high_error_rate",
                "datacenter": dc_id,
                "value": app_metrics["error_rate"],
                "threshold": 0.05,
                "timestamp": datetime.now().isoformat()
            })
            
        if app_metrics["avg_response_time"] > 2000:  # 2秒响应时间阈值
            anomalies.append({
                "type": "high_response_time",
                "datacenter": dc_id,
                "value": app_metrics["avg_response_time"],
                "threshold": 2000,
                "timestamp": datetime.now().isoformat()
            })
            
        return anomalies
        
    def _detect_network_anomalies(self, network_metrics, dc_id):
        anomalies = []
        
        if network_metrics["latency"] > 100:  # 100ms延迟阈值
            anomalies.append({
                "type": "high_network_latency",
                "datacenter": dc_id,
                "value": network_metrics["latency"],
                "threshold": 100,
                "timestamp": datetime.now().isoformat()
            })
            
        if network_metrics["packet_loss"] > 1:  # 1%丢包率阈值
            anomalies.append({
                "type": "high_packet_loss",
                "datacenter": dc_id,
                "value": network_metrics["packet_loss"],
                "threshold": 1,
                "timestamp": datetime.now().isoformat()
            })
            
        return anomalies
        
    def _detect_db_anomalies(self, db_metrics, dc_id):
        anomalies = []
        
        if db_metrics["replication_lag"] > 30:  # 30秒复制延迟阈值
            anomalies.append({
                "type": "high_replication_lag",
                "datacenter": dc_id,
                "value": db_metrics["replication_lag"],
                "threshold": 30,
                "timestamp": datetime.now().isoformat()
            })
            
        return anomalies

class AlertManager:
    def __init__(self):
        self.notification_channels = [
            EmailNotifier(),
            SlackNotifier(),
            SMSNotifier()
        ]
        
    def send_alerts(self, anomalies):
        for anomaly in anomalies:
            self._send_alert(anomaly)
            
    def _send_alert(self, anomaly):
        alert_message = self._format_alert_message(anomaly)
        
        for channel in self.notification_channels:
            try:
                channel.send(alert_message)
            except Exception as e:
                print(f"Failed to send alert via {channel.__class__.__name__}: {e}")
                
    def _format_alert_message(self, anomaly):
        return f"""
        ALERT: {anomaly['type']}
        Datacenter: {anomaly['datacenter']}
        Value: {anomaly['value']}
        Threshold: {anomaly['threshold']}
        Time: {anomaly['timestamp']}
        """

class EmailNotifier:
    def send(self, message):
        # 发送邮件告警
        print(f"Sending email alert: {message}")
        
class SlackNotifier:
    def send(self, message):
        # 发送Slack告警
        print(f"Sending Slack alert: {message}")
        
class SMSNotifier:
    def send(self, message):
        # 发送短信告警
        print(f"Sending SMS alert: {message}")

最佳实践

1. 定期灾难恢复演练

# 灾难恢复演练管理示例
class DisasterRecoveryDrillManager:
    def __init__(self, architecture):
        self.architecture = architecture
        self.drill_history = []
        
    def schedule_drill(self, schedule_config):
        # 安排定期演练
        import schedule
        schedule.every(schedule_config['interval']).days.do(self.perform_drill)
        
    def perform_drill(self, drill_type="failover"):
        drill_record = {
            "drill_id": self._generate_drill_id(),
            "type": drill_type,
            "start_time": datetime.now(),
            "steps": [],
            "status": "running"
        }
        
        try:
            if drill_type == "failover":
                self._perform_failover_drill(drill_record)
            elif drill_type == "network_partition":
                self._perform_network_partition_drill(drill_record)
            elif drill_type == "datacenter_outage":
                self._perform_datacenter_outage_drill(drill_record)
                
            drill_record["status"] = "completed"
            drill_record["end_time"] = datetime.now()
            
        except Exception as e:
            drill_record["status"] = "failed"
            drill_record["error"] = str(e)
            drill_record["end_time"] = datetime.now()
            
        self.drill_history.append(drill_record)
        self._send_drill_report(drill_record)
        
        return drill_record
        
    def _perform_failover_drill(self, drill_record):
        # 执行故障切换演练
        drill_record["steps"].append({
            "step": 1,
            "action": "verify_standby_dc_health",
            "status": "started",
            "timestamp": datetime.now()
        })
        
        # 1. 验证备用数据中心健康状态
        standby_healthy = self.architecture.global_lb.is_region_healthy("eu-west")
        drill_record["steps"][-1]["status"] = "completed" if standby_healthy else "failed"
        drill_record["steps"][-1]["result"] = standby_healthy
        drill_record["steps"][-1]["end_time"] = datetime.now()
        
        if not standby_healthy:
            raise Exception("Standby datacenter is not healthy")
            
        drill_record["steps"].append({
            "step": 2,
            "action": "simulate_primary_dc_failure",
            "status": "started",
            "timestamp": datetime.now()
        })
        
        # 2. 模拟主数据中心故障
        self.architecture.global_lb.simulate_region_failure("us-east")
        drill_record["steps"][-1]["status"] = "completed"
        drill_record["steps"][-1]["end_time"] = datetime.now()
        
        drill_record["steps"].append({
            "step": 3,
            "action": "verify_failover",
            "status": "started",
            "timestamp": datetime.now()
        })
        
        # 3. 验证故障切换
        active_region = self.architecture.global_lb.get_active_region()
        failover_success = active_region == "eu-west"
        drill_record["steps"][-1]["status"] = "completed" if failover_success else "failed"
        drill_record["steps"][-1]["result"] = failover_success
        drill_record["steps"][-1]["end_time"] = datetime.now()
        
        if not failover_success:
            raise Exception("Failover verification failed")
            
        drill_record["steps"].append({
            "step": 4,
            "action": "verify_service_availability",
            "status": "started",
            "timestamp": datetime.now()
        })
        
        # 4. 验证服务可用性
        service_available = self._verify_service_availability()
        drill_record["steps"][-1]["status"] = "completed" if service_available else "failed"
        drill_record["steps"][-1]["result"] = service_available
        drill_record["steps"][-1]["end_time"] = datetime.now()
        
        if not service_available:
            raise Exception("Service availability verification failed")
            
        drill_record["steps"].append({
            "step": 5,
            "action": "restore_primary_dc",
            "status": "started",
            "timestamp": datetime.now()
        })
        
        # 5. 恢复主数据中心
        self.architecture.global_lb.restore_region("us-east")
        drill_record["steps"][-1]["status"] = "completed"
        drill_record["steps"][-1]["end_time"] = datetime.now()
        
    def _verify_service_availability(self):
        # 验证服务可用性
        test_endpoints = [
            "/health",
            "/api/users",
            "/api/orders"
        ]
        
        for endpoint in test_endpoints:
            try:
                import requests
                response = requests.get(f"https://api.company.com{endpoint}", timeout=10)
                if response.status_code != 200:
                    return False
            except Exception:
                return False
                
        return True
        
    def _generate_drill_id(self):
        return f"drill_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        
    def _send_drill_report(self, drill_record):
        # 发送演练报告
        report = self._generate_drill_report(drill_record)
        print(f"Disaster recovery drill report: {report}")
        
    def _generate_drill_report(self, drill_record):
        total_time = (drill_record["end_time"] - drill_record["start_time"]).total_seconds()
        success_steps = len([step for step in drill_record["steps"] if step["status"] == "completed"])
        total_steps = len(drill_record["steps"])
        
        return {
            "drill_id": drill_record["drill_id"],
            "type": drill_record["type"],
            "duration": total_time,
            "success_rate": success_steps / total_steps if total_steps > 0 else 0,
            "steps": drill_record["steps"]
        }

2. 成本优化策略

# 跨数据中心成本优化示例
class CostOptimizationManager:
    def __init__(self, datacenters):
        self.datacenters = datacenters
        self.cost_analyzer = CostAnalyzer()
        
    def optimize_costs(self):
        optimizations = []
        
        # 1. 网络成本优化
        network_optimizations = self._optimize_network_costs()
        optimizations.extend(network_optimizations)
        
        # 2. 计算资源优化
        compute_optimizations = self._optimize_compute_resources()
        optimizations.extend(compute_optimizations)
        
        # 3. 存储成本优化
        storage_optimizations = self._optimize_storage_costs()
        optimizations.extend(storage_optimizations)
        
        return optimizations
        
    def _optimize_network_costs(self):
        optimizations = []
        
        # 分析跨数据中心流量模式
        traffic_analysis = self._analyze_cross_dc_traffic()
        
        # 优化建议
        if traffic_analysis["high_latency_routes"]:
            optimizations.append({
                "type": "network_optimization",
                "action": "upgrade_network_connections",
                "routes": traffic_analysis["high_latency_routes"],
                "estimated_savings": "$50,000/month"
            })
            
        if traffic_analysis["redundant_traffic"]:
            optimizations.append({
                "type": "network_optimization",
                "action": "eliminate_redundant_traffic",
                "pattern": traffic_analysis["redundant_traffic"],
                "estimated_savings": "$20,000/month"
            })
            
        return optimizations
        
    def _optimize_compute_resources(self):
        optimizations = []
        
        # 分析各数据中心的资源利用率
        resource_utilization = self._analyze_resource_utilization()
        
        # 优化建议
        for dc_id, utilization in resource_utilization.items():
            if utilization["cpu_avg"] < 30:
                optimizations.append({
                    "type": "compute_optimization",
                    "action": "downsize_instances",
                    "datacenter": dc_id,
                    "current_utilization": utilization["cpu_avg"],
                    "estimated_savings": "$15,000/month"
                })
                
        return optimizations
        
    def _optimize_storage_costs(self):
        optimizations = []
        
        # 分析存储使用模式
        storage_analysis = self._analyze_storage_usage()
        
        # 优化建议
        if storage_analysis["cold_data_ratio"] > 0.6:
            optimizations.append({
                "type": "storage_optimization",
                "action": "migrate_cold_data_to_cheap_storage",
                "data_ratio": storage_analysis["cold_data_ratio"],
                "estimated_savings": "$30,000/month"
            })
            
        return optimizations
        
    def _analyze_cross_dc_traffic(self):
        # 分析跨数据中心流量
        return {
            "high_latency_routes": ["us-east <-> eu-west"],
            "redundant_traffic": ["duplicate_replication_streams"]
        }
        
    def _analyze_resource_utilization(self):
        # 分析资源利用率
        utilization = {}
        for dc in self.datacenters:
            utilization[dc.id] = {
                "cpu_avg": dc.system_monitor.get_avg_cpu_usage(),
                "memory_avg": dc.system_monitor.get_avg_memory_usage()
            }
        return utilization
        
    def _analyze_storage_usage(self):
        # 分析存储使用情况
        return {
            "cold_data_ratio": 0.7,  # 70%的冷数据
            "hot_data_ratio": 0.3     # 30%的热数据
        }

class CostAnalyzer:
    def analyze_monthly_costs(self, datacenters):
        total_cost = 0
        cost_breakdown = {}
        
        for dc in datacenters:
            dc_cost = self._calculate_dc_cost(dc)
            total_cost += dc_cost
            cost_breakdown[dc.id] = dc_cost
            
        return {
            "total_monthly_cost": total_cost,
            "cost_breakdown": cost_breakdown,
            "cost_per_request": total_cost / self._get_monthly_request_volume()
        }
        
    def _calculate_dc_cost(self, dc):
        # 计算单个数据中心的成本
        compute_cost = dc.resource_manager.get_compute_cost()
        storage_cost = dc.resource_manager.get_storage_cost()
        network_cost = dc.network_manager.get_network_cost()
        
        return compute_cost + storage_cost + network_cost
        
    def _get_monthly_request_volume(self):
        # 获取月度请求量
        return 10000000  # 1000万请求/月

总结

跨数据中心容灾架构是构建高可用、高可靠分布式系统的关键技术。通过合理选择Active-Active或Active-Passive架构模式，结合完善的网络设计、数据一致性保障和监控运维体系，可以显著提升系统的容灾能力和业务连续性。

关键要点包括：

根据业务需求和成本考虑选择合适的架构模式
设计可靠的网络连接和负载均衡机制
实现高效的数据同步和冲突解决策略
建立完善的监控和告警体系
定期进行灾难恢复演练
持续优化成本和性能

下一章我们将探讨混合云与多云场景下的容灾设计，了解如何利用云服务商的能力构建灵活高效的容灾架构。