global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "alert.rules"

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']

  - job_name: 'my-app'
    static_configs:
      - targets: ['my-app:8080']
    metrics_path: '/metrics'
    scrape_interval: 10s

应用指标暴露：

// Java应用中使用Micrometer暴露指标
@RestController
public class MetricsController {
    
    private final Counter requestCounter;
    private final Timer requestTimer;
    
    public MetricsController(MeterRegistry meterRegistry) {
        this.requestCounter = Counter.builder("http.requests")
            .tag("method", "GET")
            .register(meterRegistry);
            
        this.requestTimer = Timer.builder("http.request.duration")
            .register(meterRegistry);
    }
    
    @GetMapping("/api/data")
    public ResponseEntity<String> getData() {
        return requestTimer.recordCallable(() -> {
            requestCounter.increment();
            // 业务逻辑
            return ResponseEntity.ok("data");
        });
    }
}

告警规则配置

alert.rules：

groups:
- name: example
  rules:
  - alert: HighCPUUsage
    expr: rate(node_cpu_seconds_total{mode!="idle"}[5m]) > 0.8
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "实例 {{ $labels.instance }} CPU使用率过高"
      description: "{{ $labels.instance }} CPU使用率超过80%达2分钟"

  - alert: ServiceDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "服务 {{ $labels.job }} 不可用"
      description: "服务 {{ $labels.job }} 已经宕机1分钟"

使用Grafana可视化监控数据

Grafana是领先的开源可视化平台，与Prometheus等数据源完美集成，提供丰富的图表和仪表板功能。

Grafana核心特性

多样化面板：

图表、表格、热力图等多种可视化方式
灵活的布局和样式配置
支持变量和模板

告警通知：

集成多种通知渠道
灵活的告警规则配置
告警抑制和分组

插件生态：

丰富的数据源插件
社区贡献的面板插件
自定义插件开发支持

仪表板配置示例

应用性能监控仪表板：

{
  "dashboard": {
    "title": "应用性能监控",
    "panels": [
      {
        "title": "HTTP请求速率",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "rate(http_requests_total[5m])",
            "legendFormat": "{{method}} {{path}}"
          }
        ]
      },
      {
        "title": "错误率",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "rate(http_requests_total{status=~\"5..\"}[5m]) / rate(http_requests_total[5m])",
            "legendFormat": "错误率"
          }
        ]
      },
      {
        "title": "响应时间",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "95th percentile"
          },
          {
            "expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "50th percentile"
          }
        ]
      }
    ]
  }
}

变量和模板使用

{
  "templating": {
    "list": [
      {
        "name": "job",
        "type": "query",
        "datasource": "Prometheus",
        "label": "服务",
        "query": "label_values(up, job)",
        "refresh": 1
      },
      {
        "name": "instance",
        "type": "query",
        "datasource": "Prometheus",
        "label": "实例",
        "query": "label_values(up{job=\"$job\"}, instance)",
        "refresh": 1
      }
    ]
  }
}

集成ELK堆栈进行日志管理

ELK（Elasticsearch、Logstash、Kibana）堆栈是业界广泛使用的日志管理和分析解决方案。

ELK组件介绍

Elasticsearch：

分布式搜索引擎
实时全文搜索能力
水平扩展支持

Logstash：

数据收集和处理管道
多种输入、过滤和输出插件
实时数据处理

Kibana：

数据可视化平台
丰富的图表和仪表板
强大的搜索和分析功能

Logstash配置示例

logstash.conf：

input {
  beats {
    port => 5044
  }
}

filter {
  grok {
    match => { "message" => "%{TIMESTAMP_ISO8601:timestamp} \[%{LOGLEVEL:loglevel}\] %{GREEDYDATA:logger} - %{GREEDYDATA:message}" }
  }
  
  date {
    match => [ "timestamp", "ISO8601" ]
  }
  
  mutate {
    remove_field => [ "timestamp" ]
  }
  
  if [loglevel] == "ERROR" {
    mutate {
      add_tag => [ "error" ]
    }
  }
}

output {
  elasticsearch {
    hosts => ["elasticsearch:9200"]
    index => "app-logs-%{+YYYY.MM.dd}"
  }
  
  stdout {
    codec => rubydebug
  }
}

Filebeat配置

filebeat.yml：

filebeat.inputs:
- type: log
  enabled: true
  paths:
    - /var/log/my-app/*.log
  fields:
    service: my-app
  fields_under_root: true

processors:
- add_host_metadata: ~
- add_docker_metadata: ~

output.logstash:
  hosts: ["logstash:5044"]

Kibana仪表板配置

日志分析仪表板：

{
  "dashboard": {
    "title": "应用日志分析",
    "panels": [
      {
        "title": "日志级别分布",
        "type": "pie",
        "query": {
          "query": "*",
          "language": "lucene"
        },
        "buckets": {
          "aggType": "terms",
          "field": "loglevel.keyword",
          "size": 10
        }
      },
      {
        "title": "错误日志趋势",
        "type": "line",
        "query": {
          "query": "tags:error",
          "language": "lucene"
        },
        "buckets": {
          "aggType": "date_histogram",
          "field": "@timestamp",
          "interval": "1h"
        }
      }
    ]
  }
}

使用Loki和Fluentd进行日志收集与聚合

Loki是Grafana Labs推出的轻量级日志聚合系统，与Prometheus和Grafana深度集成。Fluentd是统一的日志收集器，支持多种数据源和输出。

Loki架构特点

低成本：

不对日志内容建立全文索引
基于标签的索引机制
高效的压缩存储

易集成：

与Prometheus查询语法一致
与Grafana无缝集成
支持多种客户端

Loki配置示例

loki.yml：

auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
    final_sleep: 0s
  chunk_idle_period: 5m
  chunk_retain_period: 30s

schema_config:
  configs:
    - from: 2020-05-15
      store: boltdb
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 168h

storage_config:
  boltdb:
    directory: /tmp/loki/index

  filesystem:
    directory: /tmp/loki/chunks

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h

Promtail配置

promtail.yml：

server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
- job_name: system
  static_configs:
  - targets:
      - localhost
    labels:
      job: varlogs
      __path__: /var/log/*log
  - targets:
      - localhost
    labels:
      job: applogs
      __path__: /var/log/my-app/*.log
      service: my-app

Fluentd配置

fluent.conf：

<source>
  @type tail
  path /var/log/my-app/*.log
  pos_file /var/log/td-agent/my-app.log.pos
  tag my-app.access
  <parse>
    @type json
    time_key timestamp
    time_format %Y-%m-%dT%H:%M:%S.%NZ
  </parse>
</source>

<filter my-app.access>
  @type record_transformer
  <record>
    service my-app
    environment ${ENVIRONMENT}
  </record>
</filter>

<match my-app.access>
  @type elasticsearch
  host elasticsearch
  port 9200
  logstash_format true
  logstash_prefix my-app
</match>

监控系统的告警与自动化响应

有效的告警机制和自动化响应能力是保障系统稳定性的关键。

告警策略设计

告警级别：

Critical：严重问题，需要立即处理
Warning：潜在问题，需要关注
Info：信息性告警，用于记录

告警维度：

业务维度：影响用户体验的指标
系统维度：CPU、内存、磁盘等系统指标
应用维度：应用性能、错误率等指标

Alertmanager配置

alertmanager.yml：

global:
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alert@example.com'
  smtp_auth_username: 'alert'
  smtp_auth_password: 'password'

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'team-mails'

receivers:
- name: 'team-mails'
  email_configs:
  - to: 'team@example.com'
    send_resolved: true

- name: 'webhook'
  webhook_configs:
  - url: 'http://webhook.example.com/alert'
    send_resolved: true

inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'dev', 'instance']

自动化响应机制

自动修复脚本：

#!/bin/bash
# 自动重启失败的服务
SERVICE_NAME=$1
MAX_RESTARTS=3
RESTART_COUNT=0

while [ $RESTART_COUNT -lt $MAX_RESTARTS ]; do
  if systemctl is-active --quiet $SERVICE_NAME; then
    echo "服务 $SERVICE_NAME 正常运行"
    exit 0
  else
    echo "服务 $SERVICE_NAME 未运行，尝试重启..."
    systemctl restart $SERVICE_NAME
    sleep 10
    RESTART_COUNT=$((RESTART_COUNT + 1))
  fi
done

echo "服务 $SERVICE_NAME 重启失败，发送告警"
# 发送告警通知
curl -X POST -d "服务 $SERVICE_NAME 重启失败" http://alert.example.com/notify

Kubernetes自动修复：

apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-app
spec:
  replicas: 3
  selector:
    matchLabels:
      app: my-app
  template:
    metadata:
      labels:
        app: my-app
    spec:
      containers:
      - name: my-app
        image: my-app:latest
        livenessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 30
          periodSeconds: 10
        readinessProbe:
          httpGet:
            path: /ready
            port: 8080
          initialDelaySeconds: 5
          periodSeconds: 5

最佳实践

为了构建高效的监控和日志管理体系，建议遵循以下最佳实践：

1. 指标设计

遵循四大黄金信号：延迟、流量、错误、饱和度
区分业务指标和系统指标
建立统一的命名规范

2. 日志规范

结构化日志输出
合理的日志级别划分
包含足够的上下文信息

3. 告警管理

避免告警疲劳，设置合理的告警阈值
建立告警抑制和分组机制
定期审查和优化告警规则

4. 可视化设计

设计直观易懂的仪表板
关注关键业务指标
提供多维度的数据分析能力

总结

监控和日志管理是现代DevOps实践的核心组成部分。通过Prometheus、Grafana、ELK、Loki等工具，团队可以构建全面的系统可观测性体系。合理的指标设计、日志规范和告警策略能够有效提升系统的稳定性和可维护性。随着云原生技术的发展，监控和日志管理也在不断演进，需要持续学习和实践以适应新的需求和挑战。

在下一章中，我们将探讨事件响应与故障排查的实践，了解如何建立高效的故障处理机制。