# filebeat.yml - Edge Layer
filebeat.inputs:
- type: log
  enabled: true
  paths:
    - /var/log/application/*.log
  fields:
    service: user-service
    environment: production
  fields_under_root: true

processors:
- add_docker_metadata: ~
- add_kubernetes_metadata: ~

output.logstash:
  hosts: ["logstash-collector:5044"]

# logstash.conf - Collector Layer
input {
  beats {
    port => 5044
  }
}

filter {
  # 解析结构化日志
  json {
    source => "message"
    target => "log_data"
  }
  
  # 提取追踪上下文
  mutate {
    add_field => {
      "trace_id" => "%{[log_data][traceId]}"
      "span_id" => "%{[log_data][spanId]}"
      "parent_id" => "%{[log_data][parentId]}"
    }
  }
  
  # 标准化时间戳
  date {
    match => [ "[log_data][timestamp]", "ISO8601" ]
    target => "@timestamp"
  }
}

output {
  elasticsearch {
    hosts => ["elasticsearch:9200"]
    index => "application-logs-%{+YYYY.MM.dd}"
  }
}

使用ELK Stack进行日志关联

Elasticsearch索引设计

为了支持高效的日志关联查询，需要合理设计Elasticsearch索引：

{
  "mappings": {
    "properties": {
      "@timestamp": {
        "type": "date"
      },
      "level": {
        "type": "keyword"
      },
      "service": {
        "type": "keyword"
      },
      "trace_id": {
        "type": "keyword"
      },
      "span_id": {
        "type": "keyword"
      },
      "parent_id": {
        "type": "keyword"
      },
      "message": {
        "type": "text"
      },
      "log_data": {
        "type": "object",
        "enabled": false
      }
    }
  },
  "settings": {
    "number_of_shards": 3,
    "number_of_replicas": 1,
    "refresh_interval": "30s"
  }
}

Kibana可视化配置

使用Kibana创建日志关联的可视化视图：

创建追踪视图

{
  "title": "Request Trace View",
  "visualization": {
    "type": "timeline",
    "data": {
      "query": {
        "bool": {
          "must": [
            {
              "term": {
                "trace_id": "{{traceId}}"
              }
            }
          ]
        }
      },
      "sort": [
        {
          "@timestamp": {
            "order": "asc"
          }
        }
      ]
    },
    "columns": [
      "service",
      "level",
      "message",
      "@timestamp"
    ]
  }
}

创建服务依赖图

{
  "title": "Service Dependency Graph",
  "visualization": {
    "type": "graph",
    "data": {
      "aggs": {
        "services": {
          "terms": {
            "field": "service"
          },
          "aggs": {
            "dependencies": {
              "terms": {
                "field": "parent_service"
              }
            }
          }
        }
      }
    }
  }
}

查询优化技巧

Trace ID查询优化

{
  "query": {
    "term": {
      "trace_id": "abc123def456ghi789"
    }
  },
  "sort": [
    {
      "@timestamp": {
        "order": "asc"
      }
    }
  ],
  "size": 1000
}

时间范围查询

{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "trace_id": "abc123def456ghi789"
          }
        }
      ],
      "filter": [
        {
          "range": {
            "@timestamp": {
              "gte": "2025-08-31T09:00:00Z",
              "lte": "2025-08-31T10:00:00Z"
            }
          }
        }
      ]
    }
  }
}

使用Jaeger进行分布式追踪

Jaeger架构组件

Jaeger是专为分布式追踪设计的系统，包含以下核心组件：

Jaeger Agent

轻量级守护进程，运行在每个主机上：

接收客户端的追踪数据
批量发送数据到Jaeger Collector

Jaeger Collector

接收和处理追踪数据：

验证和存储追踪数据
实现数据采样和过滤

Jaeger Query

提供查询接口和UI：

查询和分析追踪数据
提供可视化界面

Jaeger UI

Web界面用于查看和分析追踪数据：

展示服务依赖关系
显示请求调用链路
提供性能分析功能

集成示例

Java应用集成

import io.jaegertracing.Configuration;
import io.jaegertracing.internal.JaegerTracer;
import io.opentracing.Tracer;
import io.opentracing.util.GlobalTracer;

@Configuration
public class JaegerConfig {
    
    @Bean
    public Tracer jaegerTracer() {
        return new Configuration("user-service")
            .withSampler(new Configuration.SamplerConfiguration()
                .withType("const")
                .withParam(1))
            .withReporter(new Configuration.ReporterConfiguration()
                .withLogSpans(true)
                .withSender(new Configuration.SenderConfiguration()
                    .withEndpoint("http://jaeger-collector:14268/api/traces")))
            .getTracer();
    }
}

日志与追踪关联

@RestController
public class UserController {
    
    private static final Logger logger = LoggerFactory.getLogger(UserController.class);
    
    @GetMapping("/users/{id}")
    public User getUser(@PathVariable String id) {
        Span span = GlobalTracer.get().activeSpan();
        
        if (span != null) {
            // 在日志中包含追踪信息
            logger.info("Fetching user information",
                keyValue("userId", id),
                keyValue("traceId", span.context().toTraceId()),
                keyValue("spanId", span.context().toSpanId()));
        }
        
        // 业务逻辑
        return userService.findById(id);
    }
}

使用OpenTelemetry Collector

Collector配置

OpenTelemetry Collector提供了强大的数据处理能力：

# otel-collector-config.yaml
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  batch:
    timeout: 1s
    send_batch_size: 1000
  attributes:
    actions:
      - key: service.name
        action: upsert
        value: "unknown-service"
  resource:
    attributes:
      - key: deployment.environment
        value: "production"
        action: insert

exporters:
  elasticsearch:
    endpoints: ["http://elasticsearch:9200"]
    logs_index: "otel-logs"
  jaeger:
    endpoint: jaeger-collector:14250
    tls:
      insecure: true

service:
  pipelines:
    logs:
      receivers: [otlp]
      processors: [batch, attributes, resource]
      exporters: [elasticsearch]
    traces:
      receivers: [otlp]
      processors: [batch, attributes, resource]
      exporters: [jaeger]

数据关联处理

在Collector中实现日志与追踪数据的关联：

processors:
  logstransform:
    logs:
      - statement: set(attributes["trace_id"], resource.attributes["telemetry.sdk.language"])
        condition: isMatch(attributes["message"], ".*error.*")
      - statement: set(attributes["correlation_id"], attributes["request.id"])
        condition: has(attributes["request.id"])

跨服务日志关联的最佳实践

1. 统一日志格式

确保所有服务使用统一的日志格式：

{
  "timestamp": "2025-08-31T10:00:00.123Z",
  "level": "INFO",
  "service": "user-service",
  "traceId": "abc123def456ghi789",
  "spanId": "jkl012mno345pqr678",
  "message": "User authentication successful",
  "userId": "user123",
  "ipAddress": "192.168.1.100"
}

2. 合理的索引策略

为常用查询字段建立索引：

{
  "settings": {
    "index": {
      "number_of_shards": 3,
      "number_of_replicas": 1
    }
  },
  "mappings": {
    "properties": {
      "traceId": {
        "type": "keyword",
        "index": true
      },
      "service": {
        "type": "keyword",
        "index": true
      },
      "level": {
        "type": "keyword",
        "index": true
      },
      "@timestamp": {
        "type": "date",
        "index": true
      }
    }
  }
}

3. 数据生命周期管理

实现合理的数据保留策略：

{
  "policy": {
    "phases": {
      "hot": {
        "actions": {
          "rollover": {
            "max_age": "7d",
            "max_size": "50gb"
          }
        }
      },
      "warm": {
        "min_age": "7d",
        "actions": {
          "forcemerge": {
            "max_num_segments": 1
          }
        }
      },
      "cold": {
        "min_age": "30d",
        "actions": {
          "freeze": {}
        }
      },
      "delete": {
        "min_age": "90d",
        "actions": {
          "delete": {}
        }
      }
    }
  }
}

4. 监控和告警

建立完善的监控和告警机制：

{
  "trigger": {
    "schedule": {
      "interval": "1m"
    }
  },
  "input": {
    "search": {
      "request": {
        "search_type": "query_then_fetch",
        "indices": ["application-logs-*"],
        "body": {
          "size": 0,
          "query": {
            "bool": {
              "must": [
                {
                  "term": {
                    "level": "ERROR"
                  }
                }
              ],
              "filter": [
                {
                  "range": {
                    "@timestamp": {
                      "gte": "now-5m"
                    }
                  }
                }
              ]
            }
          },
          "aggs": {
            "by_trace": {
              "terms": {
                "field": "traceId",
                "size": 10
              }
            }
          }
        }
      }
    }
  },
  "condition": {
    "compare": {
      "ctx.payload.hits.total": {
        "gt": 10
      }
    }
  },
  "actions": {
    "send_email": {
      "email": {
        "to": "admin@example.com",
        "subject": "High Error Rate Detected",
        "body": "Error count in the last 5 minutes: {{ctx.payload.hits.total}}",
        "attachments": {
          "error_traces.csv": {
            "data": {
              "format": "csv",
              "aggregation": "by_trace"
            }
          }
        }
      }
    }
  }
}

故障排查场景

请求链路分析

通过Trace ID分析完整的请求链路：

# 查询特定Trace的所有日志
curl -X GET "http://elasticsearch:9200/application-logs-*/_search" \
-H "Content-Type: application/json" \
-d '{
  "query": {
    "term": {
      "traceId": "abc123def456ghi789"
    }
  },
  "sort": [
    {
      "@timestamp": {
        "order": "asc"
      }
    }
  ]
}'

性能瓶颈识别

分析各服务的响应时间分布：

{
  "aggs": {
    "by_service": {
      "terms": {
        "field": "service"
      },
      "aggs": {
        "avg_duration": {
          "avg": {
            "script": {
              "source": "doc['end_time'].value - doc['start_time'].value"
            }
          }
        }
      }
    }
  }
}

错误传播追踪

追踪错误在服务间的传播路径：

{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "traceId": "abc123def456ghi789"
          }
        },
        {
          "terms": {
            "level": ["ERROR", "FATAL"]
          }
        }
      ]
    }
  }
}

总结

跨服务日志关联是实现分布式系统可观察性的关键技术。通过合理设计日志收集架构、使用专业的日志聚合工具，并遵循最佳实践，我们可以构建完整的请求追踪视图，为系统的监控、调试和优化提供强有力的支持。

在下一章中，我们将探讨日志的安全性与合规性问题，包括日志中的敏感信息管理、日志加密与访问控制以及合规性要求等内容。