# loki配置示例
auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
    final_sleep: 0s
  chunk_idle_period: 5m
  chunk_retain_period: 30s

schema_config:
  configs:
    - from: 2020-05-15
      store: boltdb
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 168h

storage_config:
  boltdb:
    directory: /tmp/loki/index

  filesystem:
    directory: /tmp/loki/chunks

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h

Prometheus（指标收集）

# prometheus配置示例
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "alert.rules"

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'api-server'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app]
        action: keep
        regex: api-server

Tempo（分布式追踪）

# tempo配置示例
server:
  http_listen_port: 3200

distributor:
  receivers:
    jaeger:
      protocols:
        thrift_http:
        grpc:
        thrift_binary:
        thrift_compact:
    zipkin:

ingester:
  trace_idle_period: 10s
  max_block_bytes: 1_000_000
  max_block_duration: 5m

compactor:
  compaction:
    block_retention: 1h

storage:
  trace:
    backend: local
    local:
      path: /tmp/tempo/blocks

标签一致性设计

统一标签命名规范

为了实现数据关联，需要在三个系统中使用一致的标签命名：

# 推荐的标签命名规范
labels:
  service: user-service        # 服务名称
  namespace: production        # 命名空间
  pod: user-service-7d5b8c9c4-xl2v9  # Pod名称
  instance: 10.0.1.23:8080     # 实例地址
  version: v1.2.3             # 版本号
  cluster: prod-cluster        # 集群名称

Trace ID 贯穿三系统

确保Trace ID在日志、指标和追踪中保持一致：

// Go示例：在日志中包含Trace ID
logger.Info("User login successful",
    zap.String("trace_id", traceID),
    zap.String("service", "user-service"),
    zap.String("user_id", userID))

# Prometheus指标中包含Trace ID标签
http_requests_total{service="user-service", trace_id="a1b2c3d4-e5f6-7890"}

Grafana统一界面配置

数据源配置

{
  "name": "Loki",
  "type": "loki",
  "url": "http://loki:3100",
  "access": "proxy",
  "jsonData": {
    "maxLines": 1000
  }
}

{
  "name": "Prometheus",
  "type": "prometheus",
  "url": "http://prometheus:9090",
  "access": "proxy"
}

{
  "name": "Tempo",
  "type": "tempo",
  "url": "http://tempo:3200",
  "access": "proxy"
}

仪表板集成示例

{
  "title": "Unified Observability Dashboard",
  "panels": [
    {
      "title": "Service Metrics",
      "type": "graph",
      "datasource": "Prometheus",
      "targets": [
        {
          "expr": "rate(http_requests_total[5m])",
          "legendFormat": "{{service}} - {{method}}"
        }
      ]
    },
    {
      "title": "Service Logs",
      "type": "logs",
      "datasource": "Loki",
      "targets": [
        {
          "expr": "{service=\"$service\"} |~ \"$search\"",
          "refId": "A"
        }
      ]
    },
    {
      "title": "Trace Overview",
      "type": "traces",
      "datasource": "Tempo",
      "targets": [
        {
          "queryType": "traceql",
          "query": "{ service=\"$service\" }"
        }
      ]
    }
  ]
}

跨系统关联查询

从指标到日志

# 1. 在Prometheus中发现异常指标
rate(http_request_duration_seconds_sum[5m]) / rate(http_request_duration_seconds_count[5m]) > 1

# 2. 在Grafana中使用变量传递服务名称到Loki查询
{service="$service", status=~"5.."} |~ "timeout"

从追踪到日志

# 1. 在Tempo中查找慢查询
{ service="user-service", span_name="/api/users" } | duration > 1s

# 2. 获取Trace ID后，在Loki中查询相关日志
{trace_id="a1b2c3d4-e5f6-7890"} |~ "error"

从日志到指标

# 1. 在Loki中查找错误日志
{service="user-service"} |~ "ERROR" |~ "database connection failed"

# 2. 提取相关信息后，在Prometheus中查看相关指标
up{job="database"} == 0

统一告警策略

多维度告警规则

# Prometheus告警规则
groups:
- name: unified-alerts
  rules:
  - alert: HighErrorRate
    expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High error rate for {{ $labels.service }}"
      description: "Error rate is above 5% for service {{ $labels.service }}"

  - alert: ServiceDown
    expr: up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Service {{ $labels.job }} is down"
      description: "Service {{ $labels.job }} has been down for more than 2 minutes"

告警关联与抑制

# Alertmanager配置
route:
  group_by: ['alertname', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
  - match:
      severity: critical
    receiver: 'critical'

inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['service', 'namespace']

数据关联实现

应用层集成

// Java示例：在应用中统一Trace ID
@Component
public class ObservabilityContext {
    private static final ThreadLocal<String> traceId = new ThreadLocal<>();
    
    public static void setTraceId(String id) {
        traceId.set(id);
    }
    
    public static String getTraceId() {
        return traceId.get();
    }
    
    public static void clear() {
        traceId.remove();
    }
}

// 日志配置中包含Trace ID
<configuration>
    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
        <encoder class="net.logstash.logback.encoder.LoggingEventCompositeJsonEncoder">
            <providers>
                <timestamp/>
                <logLevel/>
                <message/>
                <mdc/>
                <pattern>
                    <pattern>
                        {
                            "trace_id": "%mdc{traceId}",
                            "span_id": "%mdc{spanId}"
                        }
                    </pattern>
                </pattern>
            </providers>
        </encoder>
    </appender>
</configuration>

基础设施层集成

# Fluentd配置：从日志中提取Trace ID
<source>
  @type tail
  path /var/log/application/*.log
  pos_file /var/log/fluentd.pos
  tag application.log
  <parse>
    @type json
    time_key timestamp
  </parse>
</source>

<filter application.log>
  @type record_transformer
  <record>
    trace_id ${record["trace_id"]}
    service ${record["service"]}
  </record>
</filter>

<match application.log>
  @type loki
  url "http://loki:3100/loki/api/v1/push"
  extra_labels {"job": "application-logs"}
</match>

性能优化策略

查询优化

Loki查询优化

# 优化前：全量扫描
{job="application-logs"} |~ "error"

# 优化后：使用标签过滤减少扫描量
{job="application-logs", service="user-service"} |= "error"

Prometheus查询优化

# 优化前：高基数查询
http_requests_total{method!="", handler!="", status!="", user_agent!=""}

# 优化后：合理使用标签
http_requests_total{job="api-server", method="POST"}

存储优化

# Loki存储策略
compactor:
  working_directory: /tmp/loki/compactor
  shared_store: filesystem
  compaction_interval: 10m
  retention_enabled: true
  retention_delete_delay: 2h
  retention_delete_worker_count: 150

limits_config:
  retention_period: 168h  # 7天保留期
  max_query_length: 12000h  # 最大查询时间

安全与权限管理

数据访问控制

# Grafana权限配置
apiVersion: 1
providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    editable: true
    options:
      path: /var/lib/grafana/dashboards

多租户支持

# Loki多租户配置
auth_enabled: true

server:
  http_listen_port: 3100
  grpc_listen_port: 9095

common:
  path_prefix: /tmp/loki
  storage:
    filesystem:
      chunks_directory: /tmp/loki/chunks
      rules_directory: /tmp/loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

runtime_config:
  file: /etc/loki/runtime-config.yaml

最佳实践总结

1. 设计原则

一致性：保持标签命名和数据结构的一致性
可关联性：确保不同系统间的数据可以关联
可扩展性：设计支持未来扩展的架构

2. 实施步骤

标准化标签：定义统一的标签命名规范
集成基础设施：配置日志、指标、追踪系统的集成
构建仪表板：创建统一的可视化界面
配置告警：设置跨系统的告警策略
优化性能：持续优化查询和存储性能

3. 维护建议

定期审查：定期审查标签使用情况和查询性能
文档化：维护完整的集成文档和操作手册
培训团队：确保团队成员理解统一监控的价值和使用方法

总结

构建统一的监控视图是实现全面系统可观察性的关键步骤。通过整合日志、指标和追踪数据，我们可以获得更深入的系统洞察，更高效地进行问题排查。Loki、Prometheus和Tempo的组合为这一目标提供了强大的技术支撑，但成功的关键在于良好的架构设计、一致的标签规范和持续的优化维护。

在下一节中，我们将深入探讨OpenTelemetry标准在跨语言监控中的应用，学习如何构建统一的可观察性基础设施。