平台分层架构: 接入层、元数据层、数据层、管理层

老马啸西风2025/9/7大约 15 分钟

分层架构是分布式文件存储平台设计的核心模式之一，通过将系统功能划分为不同的逻辑层次，可以实现关注点分离、降低系统复杂性、提高可维护性和可扩展性。本章将深入探讨分布式文件存储平台的四层架构设计：接入层、元数据层、数据层和管理层，详细分析每层的职责、设计要点、关键技术以及实现策略。

4.1.1 接入层设计

接入层是用户与分布式文件存储平台交互的第一层，负责处理各种协议请求、用户认证和负载均衡等功能。它是平台的"门面"，直接影响用户体验和系统安全性。

4.1.1.1 接入层核心职责

协议适配与转换：
- 支持多种存储协议（POSIX、NFS、S3、HDFS等）
- 实现协议间的转换和适配
- 提供统一的内部接口
请求路由与负载均衡：
- 分发用户请求到后端服务
- 实现请求的负载均衡
- 支持动态扩缩容
安全认证与授权：
- 实现用户身份认证
- 控制访问权限
- 加密数据传输
流量控制与限流：
- 控制请求流量
- 防止系统过载
- 实现服务质量保证

4.1.1.2 协议适配设计

# 协议适配器抽象类
from abc import ABC, abstractmethod

class ProtocolAdapter(ABC):
    @abstractmethod
    def parse_request(self, raw_request):
        """解析原始请求"""
        pass
    
    @abstractmethod
    def build_response(self, response_data):
        """构建响应"""
        pass
    
    @abstractmethod
    def get_protocol_name(self):
        """获取协议名称"""
        pass

# S3协议适配器示例
class S3ProtocolAdapter(ProtocolAdapter):
    def parse_request(self, raw_request):
        # 解析S3协议请求
        # 提取操作类型、路径、参数等信息
        parsed_request = {
            "operation": raw_request.get("method"),
            "path": raw_request.get("path"),
            "headers": raw_request.get("headers"),
            "body": raw_request.get("body")
        }
        return parsed_request
    
    def build_response(self, response_data):
        # 构建S3协议响应
        response = {
            "status_code": response_data.get("status", 200),
            "headers": {
                "Content-Type": "application/xml",
                "x-amz-request-id": response_data.get("request_id", "")
            },
            "body": response_data.get("data", "")
        }
        return response
    
    def get_protocol_name(self):
        return "S3"

# NFS协议适配器示例
class NFSProtocolAdapter(ProtocolAdapter):
    def parse_request(self, raw_request):
        # 解析NFS协议请求
        parsed_request = {
            "operation": raw_request.get("procedure"),
            "file_handle": raw_request.get("file_handle"),
            "arguments": raw_request.get("arguments")
        }
        return parsed_request
    
    def build_response(self, response_data):
        # 构建NFS协议响应
        response = {
            "status": response_data.get("status", 0),
            "data": response_data.get("data", ""),
            "attributes": response_data.get("attributes", {})
        }
        return response
    
    def get_protocol_name(self):
        return "NFS"

# 协议适配器工厂
class ProtocolAdapterFactory:
    _adapters = {
        "S3": S3ProtocolAdapter,
        "NFS": NFSProtocolAdapter
    }
    
    @classmethod
    def create_adapter(cls, protocol_name):
        if protocol_name in cls._adapters:
            return cls._adapters[protocol_name]()
        else:
            raise ValueError(f"Unsupported protocol: {protocol_name}")
    
    @classmethod
    def register_adapter(cls, protocol_name, adapter_class):
        cls._adapters[protocol_name] = adapter_class

# 使用示例
adapter = ProtocolAdapterFactory.create_adapter("S3")
request = {"method": "GET", "path": "/bucket/object", "headers": {}, "body": ""}
parsed = adapter.parse_request(request)
print(f"Parsed S3 request: {parsed}")

4.1.1.3 负载均衡策略

轮询算法（Round Robin）：
- 依次将请求分发给后端服务
- 实现简单，负载分布均匀
- 不考虑后端服务的实际负载
加权轮询算法（Weighted Round Robin）：
- 根据后端服务的处理能力分配权重
- 能力强的服务处理更多请求
- 更合理地分配负载
最少连接算法（Least Connections）：
- 将请求分发给当前连接数最少的服务
- 动态调整负载分配
- 适应服务处理能力的变化
一致性哈希算法（Consistent Hashing）：
- 根据请求特征进行哈希计算
- 相同特征的请求路由到同一服务
- 减少缓存失效

# 负载均衡器示例
import random
import time

class LoadBalancer:
    def __init__(self, strategy="round_robin"):
        self.strategy = strategy
        self.servers = []
        self.current_index = 0
        self.connections = {}
    
    def add_server(self, server_id, weight=1):
        """添加服务器"""
        self.servers.append({
            "id": server_id,
            "weight": weight,
            "last_heartbeat": time.time()
        })
        self.connections[server_id] = 0
    
    def remove_server(self, server_id):
        """移除服务器"""
        self.servers = [s for s in self.servers if s["id"] != server_id]
        if server_id in self.connections:
            del self.connections[server_id]
    
    def update_heartbeat(self, server_id):
        """更新服务器心跳"""
        for server in self.servers:
            if server["id"] == server_id:
                server["last_heartbeat"] = time.time()
                break
    
    def is_server_healthy(self, server_id):
        """检查服务器健康状态"""
        for server in self.servers:
            if server["id"] == server_id:
                # 假设30秒内没有心跳认为不健康
                return time.time() - server["last_heartbeat"] < 30
        return False
    
    def get_next_server(self):
        """获取下一个服务器"""
        healthy_servers = [s for s in self.servers if self.is_server_healthy(s["id"])]
        if not healthy_servers:
            return None
        
        if self.strategy == "round_robin":
            return self._round_robin(healthy_servers)
        elif self.strategy == "weighted_round_robin":
            return self._weighted_round_robin(healthy_servers)
        elif self.strategy == "least_connections":
            return self._least_connections(healthy_servers)
        else:
            return random.choice(healthy_servers)
    
    def _round_robin(self, servers):
        """轮询算法"""
        if not servers:
            return None
        server = servers[self.current_index % len(servers)]
        self.current_index += 1
        return server["id"]
    
    def _weighted_round_robin(self, servers):
        """加权轮询算法"""
        if not servers:
            return None
        
        # 简化的加权轮询实现
        total_weight = sum(s["weight"] for s in servers)
        rand_weight = random.randint(1, total_weight)
        
        current_weight = 0
        for server in servers:
            current_weight += server["weight"]
            if rand_weight <= current_weight:
                return server["id"]
        
        return servers[-1]["id"]
    
    def _least_connections(self, servers):
        """最少连接算法"""
        if not servers:
            return None
        
        min_connections = float('inf')
        selected_server = None
        
        for server in servers:
            connections = self.connections.get(server["id"], 0)
            if connections < min_connections:
                min_connections = connections
                selected_server = server["id"]
        
        return selected_server
    
    def increment_connection(self, server_id):
        """增加连接数"""
        if server_id in self.connections:
            self.connections[server_id] += 1
    
    def decrement_connection(self, server_id):
        """减少连接数"""
        if server_id in self.connections and self.connections[server_id] > 0:
            self.connections[server_id] -= 1

# 使用示例
lb = LoadBalancer(strategy="least_connections")
lb.add_server("server1", weight=2)
lb.add_server("server2", weight=1)
lb.add_server("server3", weight=3)

# 模拟服务器心跳
lb.update_heartbeat("server1")
lb.update_heartbeat("server2")
lb.update_heartbeat("server3")

# 获取服务器
for i in range(10):
    server = lb.get_next_server()
    print(f"Request {i+1} routed to: {server}")
    if server:
        lb.increment_connection(server)

4.1.1.4 安全认证机制

身份认证：
- 用户名/密码认证
- Token认证（JWT、OAuth等）
- 证书认证（TLS客户端证书）
- 多因子认证（MFA）
访问控制：
- 基于角色的访问控制（RBAC）
- 基于属性的访问控制（ABAC）
- 细粒度权限管理
- 访问审计日志
数据加密：
- 传输层加密（TLS/SSL）
- 应用层加密（字段级加密）
- 密钥管理（KMS集成）

4.1.2 元数据层设计

元数据层是分布式文件存储平台的核心组件之一，负责管理文件系统的元数据信息，包括文件名、目录结构、权限信息、文件位置等。元数据的高效管理和一致性保证直接影响系统的性能和可靠性。

4.1.2.1 元数据层核心职责

元数据存储与管理：
- 存储文件系统命名空间信息
- 管理文件和目录的属性信息
- 维护文件与数据块的映射关系
元数据缓存：
- 实现多级元数据缓存
- 提高元数据访问性能
- 保证缓存一致性
元数据一致性：
- 保证分布式环境下的元数据一致性
- 实现元数据的高可用
- 支持元数据的快速恢复
命名空间管理：
- 管理文件系统目录结构
- 实现文件和目录的增删改查
- 支持原子操作和事务

4.1.2.2 元数据存储设计

# 元数据存储接口
from abc import ABC, abstractmethod
import json
from typing import Dict, Any, Optional

class MetadataStorage(ABC):
    @abstractmethod
    def create_inode(self, inode_id: str, metadata: Dict[str, Any]) -> bool:
        """创建inode"""
        pass
    
    @abstractmethod
    def get_inode(self, inode_id: str) -> Optional[Dict[str, Any]]:
        """获取inode"""
        pass
    
    @abstractmethod
    def update_inode(self, inode_id: str, metadata: Dict[str, Any]) -> bool:
        """更新inode"""
        pass
    
    @abstractmethod
    def delete_inode(self, inode_id: str) -> bool:
        """删除inode"""
        pass
    
    @abstractmethod
    def create_directory_entry(self, parent_id: str, name: str, inode_id: str) -> bool:
        """创建目录项"""
        pass
    
    @abstractmethod
    def get_directory_entry(self, parent_id: str, name: str) -> Optional[str]:
        """获取目录项"""
        pass
    
    @abstractmethod
    def delete_directory_entry(self, parent_id: str, name: str) -> bool:
        """删除目录项"""
        pass

# 基于键值存储的元数据实现
class KVBasedMetadataStorage(MetadataStorage):
    def __init__(self, kv_store):
        self.kv_store = kv_store
        self.namespace_prefix = "namespace:"
        self.inode_prefix = "inode:"
    
    def _get_inode_key(self, inode_id: str) -> str:
        return f"{self.inode_prefix}{inode_id}"
    
    def _get_namespace_key(self, parent_id: str, name: str) -> str:
        return f"{self.namespace_prefix}{parent_id}:{name}"
    
    def create_inode(self, inode_id: str, metadata: Dict[str, Any]) -> bool:
        key = self._get_inode_key(inode_id)
        try:
            self.kv_store.set(key, json.dumps(metadata))
            return True
        except Exception as e:
            print(f"Failed to create inode {inode_id}: {e}")
            return False
    
    def get_inode(self, inode_id: str) -> Optional[Dict[str, Any]]:
        key = self._get_inode_key(inode_id)
        try:
            data = self.kv_store.get(key)
            if data:
                return json.loads(data)
            return None
        except Exception as e:
            print(f"Failed to get inode {inode_id}: {e}")
            return None
    
    def update_inode(self, inode_id: str, metadata: Dict[str, Any]) -> bool:
        key = self._get_inode_key(inode_id)
        try:
            existing_data = self.kv_store.get(key)
            if existing_data:
                existing_metadata = json.loads(existing_data)
                existing_metadata.update(metadata)
                self.kv_store.set(key, json.dumps(existing_metadata))
                return True
            return False
        except Exception as e:
            print(f"Failed to update inode {inode_id}: {e}")
            return False
    
    def delete_inode(self, inode_id: str) -> bool:
        key = self._get_inode_key(inode_id)
        try:
            self.kv_store.delete(key)
            return True
        except Exception as e:
            print(f"Failed to delete inode {inode_id}: {e}")
            return False
    
    def create_directory_entry(self, parent_id: str, name: str, inode_id: str) -> bool:
        key = self._get_namespace_key(parent_id, name)
        try:
            self.kv_store.set(key, inode_id)
            return True
        except Exception as e:
            print(f"Failed to create directory entry {parent_id}/{name}: {e}")
            return False
    
    def get_directory_entry(self, parent_id: str, name: str) -> Optional[str]:
        key = self._get_namespace_key(parent_id, name)
        try:
            return self.kv_store.get(key)
        except Exception as e:
            print(f"Failed to get directory entry {parent_id}/{name}: {e}")
            return None
    
    def delete_directory_entry(self, parent_id: str, name: str) -> bool:
        key = self._get_namespace_key(parent_id, name)
        try:
            self.kv_store.delete(key)
            return True
        except Exception as e:
            print(f"Failed to delete directory entry {parent_id}/{name}: {e}")
            return False

# 元数据缓存层
class MetadataCache:
    def __init__(self, max_size=10000):
        self.cache = {}
        self.access_order = []  # LRU实现
        self.max_size = max_size
    
    def get(self, key: str) -> Optional[Any]:
        if key in self.cache:
            # 更新访问顺序（LRU）
            self.access_order.remove(key)
            self.access_order.append(key)
            return self.cache[key]
        return None
    
    def put(self, key: str, value: Any):
        if key in self.cache:
            self.access_order.remove(key)
        else:
            # 检查缓存大小，必要时淘汰
            if len(self.cache) >= self.max_size:
                oldest_key = self.access_order.pop(0)
                del self.cache[oldest_key]
        
        self.cache[key] = value
        self.access_order.append(key)
    
    def invalidate(self, key: str):
        if key in self.cache:
            del self.cache[key]
            self.access_order.remove(key)
    
    def clear(self):
        self.cache.clear()
        self.access_order.clear()

# 元数据服务
class MetadataService:
    def __init__(self, storage: MetadataStorage, cache: MetadataCache):
        self.storage = storage
        self.cache = cache
    
    def get_file_metadata(self, file_path: str) -> Optional[Dict[str, Any]]:
        # 检查缓存
        cached_metadata = self.cache.get(file_path)
        if cached_metadata:
            return cached_metadata
        
        # 从存储中获取
        # 这里简化了路径解析逻辑
        inode_id = self._resolve_path_to_inode(file_path)
        if inode_id:
            metadata = self.storage.get_inode(inode_id)
            if metadata:
                # 放入缓存
                self.cache.put(file_path, metadata)
                return metadata
        
        return None
    
    def update_file_metadata(self, file_path: str, metadata: Dict[str, Any]) -> bool:
        inode_id = self._resolve_path_to_inode(file_path)
        if inode_id:
            result = self.storage.update_inode(inode_id, metadata)
            if result:
                # 更新缓存
                self.cache.put(file_path, metadata)
            return result
        return False
    
    def _resolve_path_to_inode(self, file_path: str) -> Optional[str]:
        # 简化的路径解析实现
        # 实际实现需要处理目录遍历、符号链接等复杂情况
        parts = file_path.strip("/").split("/")
        current_inode = "root"  # 根目录inode
        
        for part in parts:
            if not part:
                continue
            inode_id = self.storage.get_directory_entry(current_inode, part)
            if not inode_id:
                return None
            current_inode = inode_id
        
        return current_inode

4.1.2.3 元数据一致性保障

分布式一致性协议：
- Paxos算法
- Raft算法
- ZooKeeper协调服务
主从复制机制：
- 主节点处理写操作
- 从节点同步数据
- 故障自动切换
版本控制：
- MVCC（多版本并发控制）
- 向量时钟
- 版本向量

4.1.2.4 元数据缓存策略

多级缓存架构：
- 客户端缓存
- 接入层缓存
- 元数据服务缓存
缓存淘汰策略：
- LRU（最近最少使用）
- LFU（最不经常使用）
- TTL（生存时间）
缓存一致性：
- 写穿透策略
- 缓存失效通知
- 定期缓存刷新

4.1.3 数据层设计

数据层负责实际存储文件数据，是分布式文件存储平台的基础设施层。数据层的设计直接影响系统的存储效率、访问性能和可靠性。

4.1.3.1 数据层核心职责

数据存储：
- 实现数据的分布式存储
- 支持多种存储介质
- 实现数据冗余和保护
数据访问：
- 提供高效的数据读写接口
- 实现数据的并行访问
- 优化数据访问路径
数据管理：
- 实现数据的生命周期管理
- 支持数据迁移和均衡
- 提供数据清理和回收
数据保护：
- 实现数据冗余存储
- 支持数据备份和恢复
- 提供数据完整性校验

4.1.3.2 数据存储架构

# 数据块管理器
import hashlib
import os
from typing import List, Optional
from abc import ABC, abstractmethod

class DataBlock(ABC):
    def __init__(self, block_id: str, data: bytes):
        self.block_id = block_id
        self.data = data
        self.checksum = self._calculate_checksum(data)
        self.size = len(data)
    
    def _calculate_checksum(self, data: bytes) -> str:
        return hashlib.md5(data).hexdigest()
    
    def verify_checksum(self) -> bool:
        return self.checksum == self._calculate_checksum(self.data)

class DataStorage(ABC):
    @abstractmethod
    def write_block(self, block: DataBlock) -> bool:
        """写入数据块"""
        pass
    
    @abstractmethod
    def read_block(self, block_id: str) -> Optional[DataBlock]:
        """读取数据块"""
        pass
    
    @abstractmethod
    def delete_block(self, block_id: str) -> bool:
        """删除数据块"""
        pass
    
    @abstractmethod
    def get_block_location(self, block_id: str) -> List[str]:
        """获取数据块位置"""
        pass

# 本地文件系统存储实现
class LocalFileStorage(DataStorage):
    def __init__(self, storage_path: str):
        self.storage_path = storage_path
        os.makedirs(storage_path, exist_ok=True)
    
    def write_block(self, block: DataBlock) -> bool:
        try:
            file_path = os.path.join(self.storage_path, block.block_id)
            with open(file_path, 'wb') as f:
                f.write(block.data)
            return True
        except Exception as e:
            print(f"Failed to write block {block.block_id}: {e}")
            return False
    
    def read_block(self, block_id: str) -> Optional[DataBlock]:
        try:
            file_path = os.path.join(self.storage_path, block_id)
            if not os.path.exists(file_path):
                return None
            
            with open(file_path, 'rb') as f:
                data = f.read()
            
            return DataBlock(block_id, data)
        except Exception as e:
            print(f"Failed to read block {block_id}: {e}")
            return None
    
    def delete_block(self, block_id: str) -> bool:
        try:
            file_path = os.path.join(self.storage_path, block_id)
            if os.path.exists(file_path):
                os.remove(file_path)
            return True
        except Exception as e:
            print(f"Failed to delete block {block_id}: {e}")
            return False
    
    def get_block_location(self, block_id: str) -> List[str]:
        file_path = os.path.join(self.storage_path, block_id)
        return [file_path] if os.path.exists(file_path) else []

# 数据块分布管理器
class BlockPlacementManager:
    def __init__(self, replication_factor: int = 3):
        self.replication_factor = replication_factor
        self.block_locations = {}  # block_id -> [node_ids]
    
    def place_block(self, block_id: str, available_nodes: List[str]) -> List[str]:
        """为数据块选择存储节点"""
        if len(available_nodes) < self.replication_factor:
            raise ValueError("Not enough available nodes for replication")
        
        # 简单的随机选择策略
        import random
        selected_nodes = random.sample(available_nodes, self.replication_factor)
        self.block_locations[block_id] = selected_nodes
        return selected_nodes
    
    def get_block_locations(self, block_id: str) -> List[str]:
        """获取数据块的存储位置"""
        return self.block_locations.get(block_id, [])
    
    def remove_node_from_placement(self, node_id: str):
        """从数据块分布中移除节点"""
        for block_id, nodes in self.block_locations.items():
            if node_id in nodes:
                nodes.remove(node_id)
                # 可能需要触发数据重新分布
                if len(nodes) < self.replication_factor - 1:
                    print(f"Warning: Block {block_id} has insufficient replicas")

# 数据访问管理器
class DataAccessManager:
    def __init__(self, storage_nodes: dict, placement_manager: BlockPlacementManager):
        self.storage_nodes = storage_nodes  # node_id -> DataStorage
        self.placement_manager = placement_manager
    
    def write_data(self, block_id: str, data: bytes) -> bool:
        """写入数据"""
        block = DataBlock(block_id, data)
        
        # 获取存储节点
        node_ids = list(self.storage_nodes.keys())
        storage_nodes = self.placement_manager.place_block(block_id, node_ids)
        
        # 并行写入所有副本
        success_count = 0
        for node_id in storage_nodes:
            storage = self.storage_nodes.get(node_id)
            if storage and storage.write_block(block):
                success_count += 1
        
        # 至少需要写入一半以上的副本
        return success_count >= (self.placement_manager.replication_factor // 2 + 1)
    
    def read_data(self, block_id: str) -> Optional[bytes]:
        """读取数据"""
        # 获取存储位置
        node_ids = self.placement_manager.get_block_locations(block_id)
        
        # 尝试从各个节点读取，直到成功
        for node_id in node_ids:
            storage = self.storage_nodes.get(node_id)
            if storage:
                block = storage.read_block(block_id)
                if block and block.verify_checksum():
                    return block.data
        
        return None
    
    def delete_data(self, block_id: str) -> bool:
        """删除数据"""
        node_ids = self.placement_manager.get_block_locations(block_id)
        
        success_count = 0
        for node_id in node_ids:
            storage = self.storage_nodes.get(node_id)
            if storage and storage.delete_block(block_id):
                success_count += 1
        
        # 从分布管理器中移除记录
        if block_id in self.placement_manager.block_locations:
            del self.placement_manager.block_locations[block_id]
        
        return success_count > 0

4.1.3.3 数据冗余策略

副本机制：
- 多副本存储
- 副本放置策略
- 副本同步机制
纠删码机制：
- Reed-Solomon编码
- LDPC编码
- 本地纠删码
混合冗余：
- 热数据多副本
- 冷数据纠删码
- 动态冗余调整

4.1.3.4 数据访问优化

并行读写：
- 数据分片并行访问
- 多线程/多进程处理
- 异步I/O操作
预取机制：
- 基于访问模式的预取
- 批量数据预取
- 缓存预热
数据本地性：
- 计算靠近存储
- 数据分布优化
- 网络拓扑感知

4.1.4 管理层设计

管理层负责整个分布式文件存储平台的管理和监控，确保系统的稳定运行和高效运维。

4.1.4.1 管理层核心职责

集群管理：
- 管理集群节点状态
- 实现节点的动态加入和离开
- 协调集群配置更新
监控告警：
- 收集系统运行指标
- 实现异常检测和告警
- 提供监控数据可视化
运维工具：
- 提供系统运维工具
- 实现自动化运维
- 支持故障诊断和恢复
配置管理：
- 管理系统配置信息
- 支持配置的动态更新
- 实现配置版本管理

4.1.4.2 集群管理设计

# 集群节点状态
from enum import Enum
from typing import Dict, List
import time

class NodeStatus(Enum):
    ONLINE = "online"
    OFFLINE = "offline"
    MAINTENANCE = "maintenance"
    FAULTY = "faulty"

class ClusterNode:
    def __init__(self, node_id: str, host: str, port: int):
        self.node_id = node_id
        self.host = host
        self.port = port
        self.status = NodeStatus.OFFLINE
        self.last_heartbeat = 0
        self.capacity = 0  # 总容量(GB)
        self.used = 0      # 已用容量(GB)
        self.roles = []    # 节点角色列表
    
    def update_heartbeat(self):
        self.last_heartbeat = time.time()
        if self.status != NodeStatus.MAINTENANCE:
            self.status = NodeStatus.ONLINE
    
    def is_healthy(self, timeout: int = 30) -> bool:
        return (time.time() - self.last_heartbeat) < timeout
    
    def get_utilization(self) -> float:
        return self.used / self.capacity if self.capacity > 0 else 0

# 集群管理器
class ClusterManager:
    def __init__(self):
        self.nodes: Dict[str, ClusterNode] = {}
        self.heartbeat_timeout = 30  # 心跳超时时间(秒)
    
    def add_node(self, node: ClusterNode):
        """添加节点"""
        self.nodes[node.node_id] = node
        print(f"Node {node.node_id} added to cluster")
    
    def remove_node(self, node_id: str):
        """移除节点"""
        if node_id in self.nodes:
            del self.nodes[node_id]
            print(f"Node {node_id} removed from cluster")
    
    def update_node_heartbeat(self, node_id: str):
        """更新节点心跳"""
        node = self.nodes.get(node_id)
        if node:
            node.update_heartbeat()
    
    def get_node(self, node_id: str) -> Optional[ClusterNode]:
        """获取节点信息"""
        return self.nodes.get(node_id)
    
    def get_all_nodes(self) -> List[ClusterNode]:
        """获取所有节点"""
        return list(self.nodes.values())
    
    def get_healthy_nodes(self) -> List[ClusterNode]:
        """获取健康节点"""
        return [node for node in self.nodes.values() if node.is_healthy(self.heartbeat_timeout)]
    
    def get_node_by_status(self, status: NodeStatus) -> List[ClusterNode]:
        """根据状态获取节点"""
        return [node for node in self.nodes.values() if node.status == status]
    
    def check_node_health(self):
        """检查节点健康状态"""
        current_time = time.time()
        for node in self.nodes.values():
            if node.status != NodeStatus.MAINTENANCE:
                if (current_time - node.last_heartbeat) > self.heartbeat_timeout:
                    if node.status != NodeStatus.FAULTY:
                        node.status = NodeStatus.FAULTY
                        print(f"Node {node.node_id} marked as faulty")
                else:
                    if node.status == NodeStatus.FAULTY:
                        node.status = NodeStatus.ONLINE
                        print(f"Node {node.node_id} recovered")

# 使用示例
cluster_manager = ClusterManager()

# 添加节点
node1 = ClusterNode("node1", "192.168.1.10", 8080)
node1.capacity = 1000  # 1TB
node1.used = 300       # 300GB
node1.roles = ["metadata", "data"]

node2 = ClusterNode("node2", "192.168.1.11", 8080)
node2.capacity = 2000  # 2TB
node2.used = 800       # 800GB
node2.roles = ["data"]

cluster_manager.add_node(node1)
cluster_manager.add_node(node2)

# 更新心跳
cluster_manager.update_node_heartbeat("node1")
cluster_manager.update_node_heartbeat("node2")

# 检查健康状态
cluster_manager.check_node_health()

# 获取健康节点
healthy_nodes = cluster_manager.get_healthy_nodes()
print(f"Healthy nodes: {[node.node_id for node in healthy_nodes]}")

4.1.4.3 监控告警系统

指标收集：
- 系统性能指标
- 资源使用率
- 业务指标
异常检测：
- 阈值告警
- 趋势分析
- 机器学习检测
告警管理：
- 告警分级
- 告警收敛
- 通知渠道

4.1.4.4 运维工具集

部署工具：
- 自动化部署脚本
- 容器化部署
- 配置管理工具
诊断工具：
- 性能分析工具
- 故障诊断工具
- 日志分析工具
维护工具：
- 数据备份工具
- 系统升级工具
- 容量管理工具

总结

分布式文件存储平台的分层架构设计通过将系统功能划分为接入层、元数据层、数据层和管理层，实现了关注点分离和模块化设计。每一层都有明确的职责和设计要点：

接入层作为用户与系统的交互界面，负责协议适配、负载均衡和安全认证，直接影响用户体验和系统安全性。
元数据层管理文件系统的命名空间和元数据信息，是系统的"大脑"，其设计直接关系到系统的性能和一致性。
数据层负责实际的数据存储和访问，是系统的基础设施，需要考虑存储效率、访问性能和数据保护。
管理层提供集群管理、监控告警和运维工具，确保系统的稳定运行和高效运维。

通过合理设计每一层的架构和实现，可以构建一个高性能、高可用、易维护的分布式文件存储平台。在实际应用中，需要根据具体的业务需求和技术约束，灵活调整各层的设计方案，并通过持续的优化和演进，确保平台能够满足当前和未来的业务需求。