性能瓶颈识别与优化：基于分布式追踪的系统性能调优

老马啸西风2025/8/31大约 13 分钟

在复杂的微服务架构中，性能问题往往隐藏在深层的服务调用链中，传统的监控手段难以快速定位根本原因。分布式追踪技术通过记录请求在系统中的完整流转路径，为我们提供了深入分析性能瓶颈的强大工具。本文将详细介绍如何利用追踪数据识别性能瓶颈，并实施有效的优化措施。

性能瓶颈识别方法

1. 基于时间分布的分析

响应时间分解

通过分析追踪数据中各个Span的执行时间，可以将总响应时间分解到具体的服务和操作：

# 响应时间分解分析
class ResponseTimeAnalyzer:
    def __init__(self, traces):
        self.traces = traces
    
    def analyze_response_time(self):
        """分析响应时间分布"""
        results = {
            'total_time_distribution': [],
            'service_time_breakdown': defaultdict(list),
            'operation_time_breakdown': defaultdict(list)
        }
        
        for trace in self.traces:
            root_span = self.get_root_span(trace)
            if not root_span:
                continue
            
            total_time = root_span['duration']
            results['total_time_distribution'].append(total_time)
            
            # 分解到各个服务
            service_times = self.decompose_by_service(trace)
            for service, time in service_times.items():
                results['service_time_breakdown'][service].append(time)
            
            # 分解到各个操作
            operation_times = self.decompose_by_operation(trace)
            for operation, time in operation_times.items():
                results['operation_time_breakdown'][operation].append(time)
        
        return self.calculate_statistics(results)
    
    def get_root_span(self, trace):
        """获取根Span"""
        spans = trace.get('spans', [])
        for span in spans:
            if 'parentSpanId' not in span:
                return span
        return None
    
    def decompose_by_service(self, trace):
        """按服务分解时间"""
        service_times = defaultdict(int)
        spans = trace.get('spans', [])
        
        for span in spans:
            service = span.get('tags', {}).get('service', 'unknown')
            service_times[service] += span.get('duration', 0)
        
        return service_times
    
    def decompose_by_operation(self, trace):
        """按操作分解时间"""
        operation_times = defaultdict(int)
        spans = trace.get('spans', [])
        
        for span in spans:
            operation = span.get('operationName', 'unknown')
            operation_times[operation] += span.get('duration', 0)
        
        return operation_times
    
    def calculate_statistics(self, results):
        """计算统计信息"""
        stats = {}
        
        # 总时间统计
        total_times = results['total_time_distribution']
        if total_times:
            stats['total_time'] = {
                'avg': sum(total_times) / len(total_times),
                'p50': self.percentile(total_times, 50),
                'p95': self.percentile(total_times, 95),
                'p99': self.percentile(total_times, 99),
                'max': max(total_times)
            }
        
        # 服务时间统计
        service_stats = {}
        for service, times in results['service_time_breakdown'].items():
            service_stats[service] = {
                'total_time': sum(times),
                'avg_time': sum(times) / len(times),
                'count': len(times),
                'percentage': (sum(times) / sum(total_times)) * 100 if total_times else 0
            }
        stats['service_breakdown'] = service_stats
        
        return stats
    
    def percentile(self, data, percentile):
        """计算百分位数"""
        if not data:
            return 0
        sorted_data = sorted(data)
        index = int(len(sorted_data) * percentile / 100)
        return sorted_data[min(index, len(sorted_data) - 1)]

2. 基于调用频率的分析

高频调用的服务或操作往往是性能优化的重点：

# 调用频率分析
class CallFrequencyAnalyzer:
    def __init__(self, traces):
        self.traces = traces
    
    def analyze_frequency(self):
        """分析调用频率"""
        service_calls = defaultdict(int)
        operation_calls = defaultdict(int)
        
        for trace in self.traces:
            spans = trace.get('spans', [])
            for span in spans:
                service = span.get('tags', {}).get('service', 'unknown')
                operation = span.get('operationName', 'unknown')
                
                service_calls[service] += 1
                operation_calls[operation] += 1
        
        # 识别高频调用
        high_frequency_services = {
            service: count for service, count in service_calls.items() 
            if count > len(self.traces) * 0.1  # 调用次数超过总请求数的10%
        }
        
        high_frequency_operations = {
            operation: count for operation, count in operation_calls.items()
            if count > len(self.traces) * 0.05  # 操作次数超过总请求数的5%
        }
        
        return {
            'high_frequency_services': high_frequency_services,
            'high_frequency_operations': high_frequency_operations,
            'service_call_stats': dict(service_calls),
            'operation_call_stats': dict(operation_calls)
        }

3. 基于异常检测的分析

通过统计分析识别性能异常：

# 异常检测分析
class AnomalyDetector:
    def __init__(self, traces):
        self.traces = traces
    
    def detect_anomalies(self):
        """检测性能异常"""
        # 按服务分组数据
        service_data = defaultdict(list)
        
        for trace in self.traces:
            spans = trace.get('spans', [])
            for span in spans:
                service = span.get('tags', {}).get('service', 'unknown')
                duration = span.get('duration', 0)
                service_data[service].append(duration)
        
        anomalies = []
        
        # 对每个服务进行异常检测
        for service, durations in service_data.items():
            if len(durations) < 10:  # 数据量太小跳过
                continue
            
            # 计算基线统计
            mean = sum(durations) / len(durations)
            std = self.std_deviation(durations)
            
            # 识别3σ之外的异常点
            for i, duration in enumerate(durations):
                if abs(duration - mean) > 3 * std:
                    anomalies.append({
                        'service': service,
                        'duration': duration,
                        'mean': mean,
                        'std': std,
                        'deviation': abs(duration - mean) / std,
                        'severity': 'high' if abs(duration - mean) > 5 * std else 'medium'
                    })
        
        return sorted(anomalies, key=lambda x: x['deviation'], reverse=True)
    
    def std_deviation(self, data):
        """计算标准差"""
        if len(data) < 2:
            return 0
        mean = sum(data) / len(data)
        variance = sum((x - mean) ** 2 for x in data) / (len(data) - 1)
        return variance ** 0.5

常见性能瓶颈类型

1. 数据库性能瓶颈

数据库查询往往是微服务中最常见的性能瓶颈：

-- 慢查询识别
SELECT 
    operation_name,
    AVG(duration) as avg_duration,
    COUNT(*) as call_count,
    MAX(duration) as max_duration
FROM spans 
WHERE operation_name LIKE '%SELECT%' 
    OR operation_name LIKE '%INSERT%'
    OR operation_name LIKE '%UPDATE%'
GROUP BY operation_name
HAVING AVG(duration) > 1000000  -- 超过1秒的查询
ORDER BY avg_duration DESC
LIMIT 10;

# 数据库查询优化建议
class DatabaseOptimizer:
    def __init__(self, traces):
        self.traces = traces
    
    def analyze_database_queries(self):
        """分析数据库查询性能"""
        db_queries = []
        
        for trace in self.traces:
            spans = trace.get('spans', [])
            for span in spans:
                operation = span.get('operationName', '')
                if any(keyword in operation.upper() for keyword in ['SELECT', 'INSERT', 'UPDATE', 'DELETE']):
                    db_queries.append({
                        'trace_id': trace.get('traceId'),
                        'span_id': span.get('spanId'),
                        'operation': operation,
                        'duration': span.get('duration', 0),
                        'tags': span.get('tags', {}),
                        'logs': span.get('logs', [])
                    })
        
        # 识别慢查询
        slow_queries = [q for q in db_queries if q['duration'] > 1000000]  # 超过1秒
        
        return {
            'total_queries': len(db_queries),
            'slow_queries': slow_queries,
            'slow_query_percentage': len(slow_queries) / len(db_queries) * 100 if db_queries else 0,
            'recommendations': self.generate_db_recommendations(slow_queries)
        }
    
    def generate_db_recommendations(self, slow_queries):
        """生成数据库优化建议"""
        recommendations = []
        
        # 按操作类型分组
        query_types = defaultdict(list)
        for query in slow_queries:
            operation = query['operation'].upper()
            if 'SELECT' in operation:
                query_types['SELECT'].append(query)
            elif 'INSERT' in operation:
                query_types['INSERT'].append(query)
            elif 'UPDATE' in operation:
                query_types['UPDATE'].append(query)
            elif 'DELETE' in operation:
                query_types['DELETE'].append(query)
        
        # SELECT查询优化建议
        if query_types['SELECT']:
            recommendations.append({
                'type': 'select_optimization',
                'description': f'发现 {len(query_types["SELECT"])} 个慢SELECT查询',
                'suggestions': [
                    '检查查询是否使用了合适的索引',
                    '优化JOIN操作，避免笛卡尔积',
                    '考虑分页查询减少数据传输',
                    '分析查询执行计划'
                ]
            })
        
        # INSERT/UPDATE查询优化建议
        if query_types['INSERT'] or query_types['UPDATE']:
            recommendations.append({
                'type': 'write_optimization',
                'description': f'发现 {len(query_types["INSERT"]) + len(query_types["UPDATE"])} 个慢写入操作',
                'suggestions': [
                    '批量处理写入操作',
                    '优化事务范围，减少锁竞争',
                    '考虑异步写入',
                    '检查是否有不必要的触发器'
                ]
            })
        
        return recommendations

2. 外部API调用瓶颈

外部服务调用通常具有较高的延迟和不确定性：

# 外部API调用分析
class ExternalAPIStrategy:
    def __init__(self, traces):
        self.traces = traces
    
    def analyze_external_calls(self):
        """分析外部API调用"""
        external_calls = []
        
        for trace in self.traces:
            spans = trace.get('spans', [])
            for span in spans:
                tags = span.get('tags', {})
                # 识别外部调用（通过URL或服务名判断）
                if 'http.url' in tags or 'peer.service' in tags:
                    url = tags.get('http.url', '')
                    service = tags.get('peer.service', '')
                    
                    # 判断是否为外部调用
                    if self.is_external_call(url, service):
                        external_calls.append({
                            'trace_id': trace.get('traceId'),
                            'span_id': span.get('spanId'),
                            'url': url,
                            'service': service,
                            'duration': span.get('duration', 0),
                            'status_code': tags.get('http.status_code', ''),
                            'timeout': tags.get('timeout', '')
                        })
        
        # 统计分析
        total_calls = len(external_calls)
        slow_calls = [call for call in external_calls if call['duration'] > 2000000]  # 超过2秒
        failed_calls = [call for call in external_calls if str(call['status_code']).startswith('5')]
        
        return {
            'total_external_calls': total_calls,
            'slow_external_calls': slow_calls,
            'failed_external_calls': failed_calls,
            'avg_duration': sum(call['duration'] for call in external_calls) / total_calls if total_calls > 0 else 0,
            'recommendations': self.generate_api_recommendations(external_calls)
        }
    
    def is_external_call(self, url, service):
        """判断是否为外部调用"""
        # 简单的外部调用判断逻辑
        internal_domains = ['internal.company.com', 'localhost', '127.0.0.1']
        
        if url:
            return not any(domain in url for domain in internal_domains)
        if service:
            return 'internal' not in service.lower()
        
        return False
    
    def generate_api_recommendations(self, external_calls):
        """生成API优化建议"""
        recommendations = []
        
        # 超时设置建议
        calls_without_timeout = [call for call in external_calls if not call.get('timeout')]
        if calls_without_timeout:
            recommendations.append({
                'type': 'timeout_configuration',
                'description': f'发现 {len(calls_without_timeout)} 个外部调用未设置超时',
                'suggestions': [
                    '为所有外部调用设置合理的超时时间',
                    '实施超时重试机制',
                    '考虑使用断路器模式'
                ]
            })
        
        # 缓存建议
        # 这里可以基于调用频率和参数变化情况给出缓存建议
        frequent_calls = self.identify_frequent_calls(external_calls)
        if frequent_calls:
            recommendations.append({
                'type': 'caching_strategy',
                'description': f'发现 {len(frequent_calls)} 个频繁的外部调用',
                'suggestions': [
                    '考虑对频繁调用的结果进行缓存',
                    '实施缓存失效策略',
                    '使用本地缓存减少网络延迟'
                ]
            })
        
        return recommendations
    
    def identify_frequent_calls(self, external_calls):
        """识别频繁调用"""
        call_frequency = defaultdict(int)
        for call in external_calls:
            key = f"{call.get('url', '')}|{call.get('service', '')}"
            call_frequency[key] += 1
        
        # 识别调用次数超过阈值的调用
        return [call for call, count in call_frequency.items() if count > 100]

3. 序列化/反序列化瓶颈

在微服务间通信中，数据序列化/反序列化也可能成为性能瓶颈：

# 序列化性能分析
class SerializationAnalyzer:
    def __init__(self, traces):
        self.traces = traces
    
    def analyze_serialization(self):
        """分析序列化性能"""
        serialization_spans = []
        
        for trace in self.traces:
            spans = trace.get('spans', [])
            for span in spans:
                operation = span.get('operationName', '')
                if any(keyword in operation.lower() for keyword in ['serialize', 'deserialize', 'marshal', 'unmarshal']):
                    serialization_spans.append({
                        'trace_id': trace.get('traceId'),
                        'span_id': span.get('spanId'),
                        'operation': operation,
                        'duration': span.get('duration', 0),
                        'tags': span.get('tags', {})
                    })
        
        if not serialization_spans:
            return {'message': '未发现明显的序列化操作'}
        
        # 统计分析
        total_duration = sum(span['duration'] for span in serialization_spans)
        avg_duration = total_duration / len(serialization_spans)
        
        # 识别慢序列化操作
        slow_serializations = [span for span in serialization_spans if span['duration'] > avg_duration * 2]
        
        return {
            'total_serialization_time': total_duration,
            'avg_serialization_time': avg_duration,
            'slow_serializations': slow_serializations,
            'recommendations': self.generate_serialization_recommendations(serialization_spans)
        }
    
    def generate_serialization_recommendations(self, serialization_spans):
        """生成序列化优化建议"""
        recommendations = []
        
        # 数据大小分析
        large_payloads = self.analyze_payload_sizes(serialization_spans)
        if large_payloads:
            recommendations.append({
                'type': 'payload_optimization',
                'description': f'发现 {len(large_payloads)} 个大数据载荷的序列化操作',
                'suggestions': [
                    '优化数据结构，减少不必要的字段传输',
                    '考虑使用更高效的序列化协议（如Protocol Buffers）',
                    '实施数据压缩',
                    '分页处理大数据集'
                ]
            })
        
        # 协议选择建议
        protocol_usage = self.analyze_protocol_usage(serialization_spans)
        if 'json' in protocol_usage and protocol_usage['json'] > len(serialization_spans) * 0.7:
            recommendations.append({
                'type': 'protocol_optimization',
                'description': '大量使用JSON序列化，可能存在性能优化空间',
                'suggestions': [
                    '考虑使用二进制序列化协议（如Protocol Buffers、Avro）',
                    '评估不同序列化协议的性能差异',
                    '在合适场景下使用更轻量的格式'
                ]
            })
        
        return recommendations
    
    def analyze_payload_sizes(self, serialization_spans):
        """分析载荷大小"""
        large_payloads = []
        for span in serialization_spans:
            payload_size = span['tags'].get('payload_size', 0)
            if payload_size > 1024 * 1024:  # 超过1MB
                large_payloads.append(span)
        return large_payloads
    
    def analyze_protocol_usage(self, serialization_spans):
        """分析协议使用情况"""
        protocol_usage = defaultdict(int)
        for span in serialization_spans:
            operation = span['operation'].lower()
            if 'json' in operation:
                protocol_usage['json'] += 1
            elif 'proto' in operation or 'protobuf' in operation:
                protocol_usage['protobuf'] += 1
            elif 'avro' in operation:
                protocol_usage['avro'] += 1
            else:
                protocol_usage['other'] += 1
        return protocol_usage

性能优化策略

1. 缓存策略优化

// 缓存优化示例
@Service
public class OptimizedUserService {
    private final Cache<String, User> userCache;
    private final UserRepository userRepository;
    private final Tracer tracer;
    
    public OptimizedUserService(UserRepository userRepository, Tracer tracer) {
        this.userRepository = userRepository;
        this.tracer = tracer;
        // 配置合理的缓存策略
        this.userCache = Caffeine.newBuilder()
                .maximumSize(10_000)
                .expireAfterWrite(30, TimeUnit.MINUTES)
                .recordStats()
                .build();
    }
    
    public User getUser(String userId) {
        Span span = tracer.buildSpan("getUserWithCache").start();
        
        try (Scope scope = tracer.scopeManager().activate(span)) {
            // 首先检查缓存
            Span cacheSpan = tracer.buildSpan("cacheLookup").start();
            try (Scope cacheScope = tracer.scopeManager().activate(cacheSpan)) {
                User cachedUser = userCache.getIfPresent(userId);
                if (cachedUser != null) {
                    span.setTag("cache.hit", true);
                    span.setTag("source", "cache");
                    cacheSpan.setTag("result", "hit");
                    return cachedUser;
                }
                cacheSpan.setTag("result", "miss");
            } finally {
                cacheSpan.finish();
            }
            
            // 缓存未命中，查询数据库
            Span dbSpan = tracer.buildSpan("databaseQuery").start();
            try (Scope dbScope = tracer.scopeManager().activate(dbSpan)) {
                User user = userRepository.findById(userId);
                if (user != null) {
                    // 将结果放入缓存
                    userCache.put(userId, user);
                    span.setTag("cache.hit", false);
                    span.setTag("source", "database");
                }
                return user;
            } finally {
                dbSpan.finish();
            }
        } finally {
            span.finish();
        }
    }
}

2. 异步处理优化

// 异步处理优化示例
@Service
public class AsyncOrderService {
    private final OrderRepository orderRepository;
    private final PaymentService paymentService;
    private final NotificationService notificationService;
    private final ExecutorService executorService;
    private final Tracer tracer;
    
    public AsyncOrderService(OrderRepository orderRepository, 
                           PaymentService paymentService,
                           NotificationService notificationService,
                           Tracer tracer) {
        this.orderRepository = orderRepository;
        this.paymentService = paymentService;
        this.notificationService = notificationService;
        this.tracer = tracer;
        this.executorService = Executors.newFixedThreadPool(10);
    }
    
    public CompletableFuture<Order> processOrderAsync(Order order) {
        Span span = tracer.buildSpan("processOrderAsync").start();
        
        return CompletableFuture.supplyAsync(() -> {
            try (Scope scope = tracer.scopeManager().activate(span)) {
                // 保存订单
                Span saveSpan = tracer.buildSpan("saveOrder").start();
                try (Scope saveScope = tracer.scopeManager().activate(saveSpan)) {
                    Order savedOrder = orderRepository.save(order);
                    saveSpan.setTag("order.id", savedOrder.getId());
                    return savedOrder;
                } finally {
                    saveSpan.finish();
                }
            } finally {
                span.finish();
            }
        }, executorService)
        .thenCompose(savedOrder -> {
            // 异步处理支付
            return processPaymentAsync(savedOrder);
        })
        .thenCompose(orderWithPayment -> {
            // 异步发送通知
            return sendNotificationAsync(orderWithPayment);
        })
        .whenComplete((result, throwable) -> {
            if (throwable != null) {
                // 记录异常
                Span errorSpan = tracer.buildSpan("processOrderError").start();
                try (Scope errorScope = tracer.scopeManager().activate(errorSpan)) {
                    errorSpan.setTag("error", true);
                    errorSpan.log(ImmutableMap.of("event", "error", "error.object", throwable));
                } finally {
                    errorSpan.finish();
                }
            }
        });
    }
    
    private CompletableFuture<Order> processPaymentAsync(Order order) {
        return CompletableFuture.supplyAsync(() -> {
            Span span = tracer.buildSpan("processPaymentAsync").start();
            try (Scope scope = tracer.scopeManager().activate(span)) {
                PaymentResult result = paymentService.processPayment(order.getPaymentInfo());
                order.setPaymentResult(result);
                span.setTag("payment.status", result.getStatus().toString());
                return order;
            } finally {
                span.finish();
            }
        }, executorService);
    }
    
    private CompletableFuture<Order> sendNotificationAsync(Order order) {
        return CompletableFuture.supplyAsync(() -> {
            Span span = tracer.buildSpan("sendNotificationAsync").start();
            try (Scope scope = tracer.scopeManager().activate(span)) {
                notificationService.sendOrderConfirmation(order);
                span.setTag("notification.sent", true);
                return order;
            } finally {
                span.finish();
            }
        }, executorService);
    }
}

3. 数据库查询优化

// 数据库查询优化示例
@Repository
public class OptimizedUserRepository {
    private final JdbcTemplate jdbcTemplate;
    private final Tracer tracer;
    
    public OptimizedUserRepository(JdbcTemplate jdbcTemplate, Tracer tracer) {
        this.jdbcTemplate = jdbcTemplate;
        this.tracer = tracer;
    }
    
    public List<User> findUsersWithOrders(String userId) {
        Span span = tracer.buildSpan("findUsersWithOrders").start();
        
        try (Scope scope = tracer.scopeManager().activate(span)) {
            // 使用JOIN优化查询，避免N+1问题
            String sql = """
                SELECT u.*, o.id as order_id, o.total_amount, o.status
                FROM users u
                LEFT JOIN orders o ON u.id = o.user_id
                WHERE u.id = ?
                ORDER BY o.created_at DESC
                LIMIT 10
                """;
            
            Span querySpan = tracer.buildSpan("databaseQuery").start();
            try (Scope queryScope = tracer.scopeManager().activate(querySpan)) {
                List<UserWithOrders> results = jdbcTemplate.query(sql, 
                    (rs, rowNum) -> mapUserWithOrders(rs), userId);
                
                querySpan.setTag("rows.fetched", results.size());
                querySpan.setTag("query.type", "JOIN");
                
                // 转换为用户列表
                return results.stream()
                    .collect(Collectors.groupingBy(UserWithOrders::getUser))
                    .entrySet().stream()
                    .map(entry -> {
                        User user = entry.getKey();
                        List<Order> orders = entry.getValue().stream()
                            .map(UserWithOrders::getOrder)
                            .filter(Objects::nonNull)
                            .collect(Collectors.toList());
                        user.setOrders(orders);
                        return user;
                    })
                    .collect(Collectors.toList());
            } finally {
                querySpan.finish();
            }
        } finally {
            span.finish();
        }
    }
    
    private UserWithOrders mapUserWithOrders(ResultSet rs) throws SQLException {
        User user = new User();
        user.setId(rs.getString("id"));
        user.setName(rs.getString("name"));
        user.setEmail(rs.getString("email"));
        
        Order order = null;
        String orderId = rs.getString("order_id");
        if (orderId != null) {
            order = new Order();
            order.setId(orderId);
            order.setTotalAmount(rs.getBigDecimal("total_amount"));
            order.setStatus(OrderStatus.valueOf(rs.getString("status")));
        }
        
        return new UserWithOrders(user, order);
    }
}

性能监控与告警

关键性能指标

# 服务响应时间监控
histogram_quantile(0.95, sum(rate(span_duration_seconds_bucket[5m])) by (service, le))

# 数据库查询性能
histogram_quantile(0.95, sum(rate(span_duration_seconds_bucket{operation=~".*SELECT.*|.*INSERT.*"}[5m])) by (operation, le))

# 外部API调用延迟
histogram_quantile(0.95, sum(rate(span_duration_seconds_bucket{peer_service!=""}[5m])) by (peer_service, le))

# 缓存命中率
rate(cache_hit_total[5m]) / (rate(cache_hit_total[5m]) + rate(cache_miss_total[5m]))

告警规则配置

# Prometheus告警规则
groups:
- name: performance-alerts
  rules:
  - alert: HighServiceLatency
    expr: histogram_quantile(0.95, sum(rate(span_duration_seconds_bucket[5m])) by (service, le)) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Service {{ $labels.service }} has high latency"
      description: "95th percentile latency for {{ $labels.service }} is above 1 second"

  - alert: DatabaseQuerySlow
    expr: histogram_quantile(0.95, sum(rate(span_duration_seconds_bucket{operation=~".*SELECT.*"}[5m])) by (operation, le)) > 0.5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Database query is slow"
      description: "Database query {{ $labels.operation }} is taking more than 500ms"

  - alert: ExternalAPITimeout
    expr: rate(span_exception_total{error_type="timeout"}[5m]) > 0.1
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "External API timeouts detected"
      description: "High rate of external API timeouts, check connectivity and response times"

  - alert: LowCacheHitRate
    expr: rate(cache_hit_total[5m]) / (rate(cache_hit_total[5m]) + rate(cache_miss_total[5m])) < 0.8
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Low cache hit rate"
      description: "Cache hit rate is below 80%, consider cache optimization"

效果评估与持续优化

优化效果评估

# 优化效果评估
class OptimizationEvaluator:
    def __init__(self, before_traces, after_traces):
        self.before_traces = before_traces
        self.after_traces = after_traces
    
    def evaluate_optimization(self):
        """评估优化效果"""
        before_stats = self.calculate_performance_stats(self.before_traces)
        after_stats = self.calculate_performance_stats(self.after_traces)
        
        improvement = {}
        
        # 响应时间改善
        if before_stats['avg_duration'] > 0:
            time_improvement = (before_stats['avg_duration'] - after_stats['avg_duration']) / before_stats['avg_duration'] * 100
            improvement['response_time'] = {
                'before': before_stats['avg_duration'],
                'after': after_stats['avg_duration'],
                'improvement_percentage': time_improvement
            }
        
        # 错误率改善
        if before_stats['error_rate'] > 0:
            error_improvement = (before_stats['error_rate'] - after_stats['error_rate']) / before_stats['error_rate'] * 100
            improvement['error_rate'] = {
                'before': before_stats['error_rate'],
                'after': after_stats['error_rate'],
                'improvement_percentage': error_improvement
            }
        
        # 吞吐量改善
        throughput_improvement = (after_stats['throughput'] - before_stats['throughput']) / before_stats['throughput'] * 100
        improvement['throughput'] = {
            'before': before_stats['throughput'],
            'after': after_stats['throughput'],
            'improvement_percentage': throughput_improvement
        }
        
        return improvement
    
    def calculate_performance_stats(self, traces):
        """计算性能统计信息"""
        durations = []
        error_count = 0
        total_count = len(traces)
        
        for trace in traces:
            root_span = self.get_root_span(trace)
            if root_span:
                durations.append(root_span.get('duration', 0))
                if self.is_error_trace(trace):
                    error_count += 1
        
        avg_duration = sum(durations) / len(durations) if durations else 0
        error_rate = error_count / total_count if total_count > 0 else 0
        
        # 计算时间窗口内的吞吐量（假设追踪数据覆盖1小时）
        throughput = total_count / 3600  # 每秒请求数
        
        return {
            'avg_duration': avg_duration,
            'error_rate': error_rate,
            'throughput': throughput,
            'total_requests': total_count
        }
    
    def get_root_span(self, trace):
        """获取根Span"""
        spans = trace.get('spans', [])
        for span in spans:
            if 'parentSpanId' not in span:
                return span
        return None
    
    def is_error_trace(self, trace):
        """判断是否为错误追踪"""
        spans = trace.get('spans', [])
        for span in spans:
            if span.get('tags', {}).get('error', False):
                return True
        return False