Prometheus 告警规则模板:MySQL、nginx、node
阅读原文 时间:2023年07月10日 阅读:1

rules_up.yml

groups:
- name: up
  rules:
  # Target-down alert for the MySQL scrape target (instance "db1", job "mysql").
  # Prometheus sets `up` to 1 after a successful scrape and 0 after a failed
  # one, so the "down" condition is `up == 0`.
  - alert: mysql
    expr: up{instance="db1",job="mysql"} == 0
    for: 10s
    labels:
      instance: '{{$labels.instance}}'
      priority: "3"
      type: 'mysql'
    annotations:
      description: '{{ $labels.instance }} mysql  is down please handle'
      summary: 'jobname: {{$labels.instance}} mysql is down please handle'

  # Target-down alert for every target of the "node" job.
  # `up` is 1 after a successful scrape and 0 after a failed one, so the
  # "down" condition is `up == 0`.
  - alert: node
    expr: up{job="node"} == 0
    for: 10s
    labels:
      instance: '{{$labels.instance}}'
      priority: "3"
      type: 'node'
    annotations:
      description: '{{ $labels.instance }} node  is down please handle'
      summary: 'jobname: {{$labels.instance}} node is down please handle'

  # Target-down alert for the nginx scrape target (instance "web", job "nginx").
  # `up` is 1 after a successful scrape and 0 after a failed one, so the
  # "down" condition is `up == 0`.
  - alert: nginx
    expr: up{instance="web",job="nginx"} == 0
    for: 10s
    labels:
      instance: '{{$labels.instance}}'
      priority: "3"
      type: 'nginx'
    annotations:
      description: '{{ $labels.instance }} nginx-vts-web  is down please handle'
      summary: 'jobname: {{$labels.instance}} nginx-vts-web is down please handle'

rules_mysql.yml

groups:
- name: mysql.rules
  rules:
  # seconds_behind_master minus sql_delay, stored as one series.
  - record: mysql_slave_lag_seconds
    expr: >-
      mysql_slave_status_seconds_behind_master -
      mysql_slave_status_sql_delay
  # Heartbeat "now" timestamp minus the stored heartbeat timestamp.
  - record: mysql_heartbeat_lag_seconds
    expr: >-
      mysql_heartbeat_now_timestamp_seconds -
      mysql_heartbeat_stored_timestamp_seconds
  # 5m rate of commit and rollback commands, summed with the `command`
  # label dropped.
  - record: job:mysql_transactions:rate5m
    expr: >-
      sum(rate(mysql_global_status_commands_total{command=~"(commit|rollback)"}[5m]))
      WITHOUT (command)
  # wsrep_ready has been something other than 1 for 5 minutes.
  - alert: MySQLGaleraNotReady
    expr: 'mysql_global_status_wsrep_ready != 1'
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      summary: 'Galera cluster node not ready'
      description: >-
        {{$labels.job}} on {{$labels.instance}} is not ready.
  # wsrep_local_state is not 4 while wsrep_desync is 0, sustained for 5 minutes.
  - alert: MySQLGaleraOutOfSync
    expr: >-
      (mysql_global_status_wsrep_local_state != 4
      and mysql_global_variables_wsrep_desync == 0)
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      summary: 'Galera cluster node out of sync'
      description: >-
        {{$labels.job}} on {{$labels.instance}} is not in sync
        ({{$value}} != 4).
  # wsrep_local_state is 2 and the local receive queue is above 100,
  # sustained for 5 minutes.
  - alert: MySQLGaleraDonorFallingBehind
    expr: >-
      (mysql_global_status_wsrep_local_state == 2
      and mysql_global_status_wsrep_local_recv_queue > 100)
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      summary: 'xtradb cluster donor node falling behind'
      description: >-
        {{$labels.job}} on {{$labels.instance}} is a donor (hotbackup)
        and is falling behind (queue size {{$value}}).
  # Either the slave IO thread or the slave SQL thread metric has read 0
  # for 2 minutes.
  - alert: MySQLReplicationNotRunning
    expr: >-
      mysql_slave_status_slave_io_running == 0
      or mysql_slave_status_slave_sql_running == 0
    for: 2m
    labels:
      severity: 'critical'
    annotations:
      summary: 'Slave replication is not running'
      description: >-
        Slave replication (IO or SQL) has been down for more than 2 minutes.
  # mysql_slave_lag_seconds is above 30 and, per instance, its predict_linear
  # extrapolation from the last 5m is still positive 2 minutes ahead.
  - alert: MySQLReplicationLag
    expr: >-
      (mysql_slave_lag_seconds > 30)
      and ON(instance)
      (predict_linear(mysql_slave_lag_seconds[5m], 60 * 2) > 0)
    for: 1m
    labels:
      severity: 'critical'
    annotations:
      summary: 'MySQL slave replication is lagging'
      description: >-
        The mysql slave replication has fallen behind and is not recovering
  # Heartbeat-based variant of the replication-lag alert:
  # mysql_heartbeat_lag_seconds is above 30 and, per instance, its
  # predict_linear extrapolation from the last 5m is still positive 2 minutes
  # ahead.
  #
  # This rule shares its alert name and severity with the
  # seconds_behind_master-based MySQLReplicationLag rule in this group. The
  # `lag_source` label keeps the two rules from being an exact duplicate
  # (same name + same labels) and lets responders see which signal fired;
  # the alert name itself is kept so existing routing on alertname/severity
  # continues to match.
  - alert: MySQLReplicationLag
    expr: (mysql_heartbeat_lag_seconds > 30) and ON(instance) (predict_linear(mysql_heartbeat_lag_seconds[5m],
      60 * 2) > 0)
    for: 1m
    labels:
      severity: critical
      lag_source: heartbeat
    annotations:
      description: The mysql slave replication (measured via heartbeat) has fallen
        behind and is not recovering
      summary: MySQL slave replication is lagging
  # The 15m rate of innodb_log_waits is above 10 per second. There is no
  # `for` clause, so no hold period is configured for this rule.
  - alert: MySQLInnoDBLogWaits
    expr: 'rate(mysql_global_status_innodb_log_waits[15m]) > 10'
    labels:
      severity: 'warning'
    annotations:
      summary: 'MySQL innodb log writes stalling'
      description: >-
        The innodb logs are waiting for disk at a rate of
        {{$value}} / second

rules_nginx.yml

groups:
- name: aws_ec2_nginx-vts-web
  rules:
  # Target-down alert: `up` for the nginx-vts-web job has been 0 for 1 minute.
  - alert: nginx-vts-web-status
    expr: 'up{job="nginx-vts-web"} == 0'
    for: 1m
    labels:
      type: "nginx"
      priority: '3'
      instance: "{{$labels.instance}}"
    annotations:
      summary: "jobname: {{$labels.instance}} nginx-vts-web is down please handle"
      description: "{{ $labels.instance }} nginx-vts-web  is down please handle"
  # Difference between the current 5xx request counter and its value 1 minute
  # ago, summed per (application, direction); alerts above 10.
  - alert: nginx application 5xx gt 10 per min
    expr: >-
      sum(nginx_vts_filter_requests_total{direction="5xx",job="nginx-vts-web"} -
      nginx_vts_filter_requests_total{direction="5xx",job="nginx-vts-web"} offset 1m)
      by (application,direction) > 10
    for: 1m
    labels:
      priority: '4'
      type: "nginx"
    annotations:
      summary: "  {{$labels.application}} {{$labels.direction}}  >10/m"
      description: " {{$labels.application}} {{$labels.direction}} >10/m"
  # Per-endpoint variant of the 5xx alert: the 1-minute counter difference is
  # summed per (application, direction, filter_name) and alerts above 1.
  # The annotations state the threshold this rule actually uses (>1/m).
  - alert: nginx application 5xx gt 1 per min
    expr: sum(nginx_vts_filter_requests_total{direction="5xx",job="nginx-vts-web"} - nginx_vts_filter_requests_total{direction="5xx",job="nginx-vts-web"} offset 1m) by (application,direction,filter_name) > 1
    for: 1m
    labels:
      type: 'nginx-detail'
      priority: "4"
    annotations:
      description: ' {{$labels.application}} {{$labels.direction}} {{$labels.filter_name}} >1/m'
      summary: '  {{$labels.application}} {{$labels.direction}}  >1/m'
 # - alert: nginx application 2xx gt 10 per min
 #   expr: sum(nginx_vts_filter_requests_total{direction="2xx",job="nginx-vts-web"} - nginx_vts_filter_requests_total{direction="2xx",job="nginx-vts-web"} offset 1m) by (application,direction) > 10
 #   for: 1m
 #   labels:
 #     test: 'yes'
 #     priority: "4"
 #   annotations:
 #     description: ' {{$labels.application}} {{$labels.direction}} >10/m'
 #     summary: '  {{$labels.application}} {{$labels.direction}}  >10/m'

  # Request time (seconds * 1000) above 300 for every filter_name except
  # /v1/users/kyc/upload, sustained for 1 minute.
  - alert: nginx interface time  gt 300ms
    expr: >-
      nginx_vts_filter_request_seconds{job="nginx-vts-web",filter_name!="/v1/users/kyc/upload"}*1000
      > 300
    for: 1m
    labels:
      priority: '4'
      type: "nginx"
    annotations:
      summary: " {{$labels.application}} {{$labels.filter}} {{$labels.filter_name}} >300ms"
      description: " {{$labels.application}} {{$labels.filter}} {{$labels.filter_name}}  >300ms"

  # Request time (seconds * 1000) above 2000 for filter_name
  # /v1/users/kyc/upload only, sustained for 1 minute.
  # The annotations state the threshold this rule actually uses (2s).
  - alert: nginx interface time  gt 2s
    expr: nginx_vts_filter_request_seconds{job="nginx-vts-web",filter_name="/v1/users/kyc/upload"}*1000 > 2000
    for: 1m
    labels:
      type: 'nginx'
      priority: "4"
    annotations:
      description: ' {{$labels.application}} {{$labels.filter}} {{$labels.filter_name}}  >2s'
      summary: ' {{$labels.application}} {{$labels.filter}} {{$labels.filter_name}} >2s'

rules_node.yml

groups:
- name: example-node-exporter-rules
  rules:
  # Counts the idle-mode node_cpu_seconds_total series per target (one per
  # CPU, with the `cpu` and `mode` labels dropped) and alerts when that count
  # is above 2.
  # NOTE(review): the alert name follows recording-rule naming
  # (level:metric:operation); it is kept unchanged so existing routing on
  # alertname continues to match.
  - alert: instance:node_cpus:count
    expr: count(node_cpu_seconds_total{mode="idle"}) without (cpu,mode) > 2
    for: 1s
    annotations:
      description: '{{$labels.instance}} has {{$value}} CPUs (more than 2)'
      summary: '{{$labels.instance}} CPU count is above 2'
- name: node_up
  rules:
  # Target-down alert for instance "node" of job "node": fires when `up` is
  # anything other than 1 (1 means the last scrape succeeded).
  # NOTE(review): the alert name is the same as the CPU-count alert in the
  # example-node-exporter-rules group and does not describe this rule; it is
  # kept unchanged so existing routing on alertname continues to match.
  - alert: instance:node_cpus:count
    expr: up{instance="node",job="node"} != 1
    for: 1s
    annotations:
      description: ' {{$labels.application}} {{$labels.instance}}  is down'
      summary: ' {{$labels.application}} {{$labels.instance}}  is down'