| Rule | State | Error | Last Evaluation | Evaluation Time |
| alert: Found data file with inconsistent last_sync_offset
expr: sum by(path) (changes(abgw_file_sync_offset_mismatch_errs_total{job="abgw"}[3h])) != 0
for: 1m
labels:
component: abgw
severity: error
annotations:
summary: The size of the "{{ $labels.path }}" file is less than the last_sync_offset value stored in the info file.
| ok | | 57.712s ago | 226.1us |
| alert: Backup storage throttling is activated
expr: job:abgw_append_throttle_delay_ms:rate5m != 0
for: 1m
labels:
component: abgw
severity: warning
annotations:
summary: Backup storage started to throttle write operations due to the lack of free space. Visit https://kb.acronis.com/content/62823 to learn how to troubleshoot this issue.
| ok | | 57.712s ago | 97.73us |
| alert: Backup storage SSL certificate will expire in less than 21 days
expr: label_join(instance_path_reg_type:abgw_next_certificate_expiration:min - time() < 86400 * 21 and instance_path_reg_type:abgw_next_certificate_expiration:min - time() >= 86400 * 14 and instance_path_reg_type:abgw_next_certificate_expiration:min{type!="crl"}, "object_id", "-", "reg_name", "type", "path")
labels:
component: abgw
severity: info
annotations:
summary: 'The {{$labels.type}} certificate will expire in less than 21 days. Path: {{$labels.path}}. Registration name: {{$labels.reg_name}}.'
| ok | | 57.712s ago | 298.8us |
| alert: Backup storage SSL certificate will expire in less than 14 days
expr: label_join(instance_path_reg_type:abgw_next_certificate_expiration:min - time() < 86400 * 14 and instance_path_reg_type:abgw_next_certificate_expiration:min - time() >= 86400 * 7 and instance_path_reg_type:abgw_next_certificate_expiration:min{type!="crl"}, "object_id", "-", "reg_name", "type", "path")
labels:
component: abgw
severity: warning
annotations:
summary: 'The {{$labels.type}} certificate will expire in less than 14 days. Path: {{$labels.path}}. Registration name: {{$labels.reg_name}}.'
| ok | | 57.711s ago | 265.5us |
| alert: Backup storage SSL certificate will expire in less than 7 days
expr: label_join(instance_path_reg_type:abgw_next_certificate_expiration:min - time() < 86400 * 7 and instance_path_reg_type:abgw_next_certificate_expiration:min - time() >= 0 and instance_path_reg_type:abgw_next_certificate_expiration:min{type!="crl"}, "object_id", "-", "reg_name", "type", "path")
labels:
component: abgw
severity: critical
annotations:
summary: 'The {{$labels.type}} certificate will expire in less than 7 days. Path: {{$labels.path}}. Registration name: {{$labels.reg_name}}.'
| ok | | 57.711s ago | 248.5us |
| alert: Backup storage SSL certificate has expired
expr: label_join(instance_path_reg_type:abgw_next_certificate_expiration:min - time() < 0 and instance_path_reg_type:abgw_next_certificate_expiration:min{type!="crl"}, "object_id", "-", "reg_name", "type", "path")
labels:
component: abgw
severity: critical
annotations:
summary: 'The {{$labels.type}} certificate has expired. Path: {{$labels.path}}. Registration name: {{$labels.reg_name}}.'
| ok | | 57.711s ago | 152.6us |
| alert: Backup storage CRL is not up to date
expr: label_join(time() - instance_path_reg_type:abgw_next_certificate_expiration:min > 86400 * 2 and instance_path_reg_type:abgw_next_certificate_expiration:min{path!~".*root\\.crl$",type="crl"}, "object_id", "-", "reg_name", "type", "path")
labels:
component: abgw
severity: warning
annotations:
summary: 'The CRL has not been updated for more than 2 days. Path: {{$labels.path}}. Registration name: {{$labels.reg_name}}.'
| ok | | 57.711s ago | 173.2us |
| alert: Different number of collaborating backup storage services
expr: sum(abgw_neighbors{type="count"}) != 0 and count(count(abgw_neighbors{type="count"} != scalar(count(abgw_neighbors{type="count"}))) or (((count(abgw_neighbors{type="index"}) - 1) * count(abgw_neighbors{type="index"}) / 2) != sum(abgw_neighbors{type="index"}))) > 0
for: 30m
labels:
component: abgw
severity: error
annotations:
summary: Some backup storage services report a different number of collaborating services. Please contact technical support.
| ok | | 57.711s ago | 263.9us |
| alert: Attempt to use migrated accounts
expr: instance:abgw_inst_outdated:count > 0
for: 1m
labels:
component: abgw
severity: warning
annotations:
summary: One or more attempts to use migrated accounts have been detected in the last 24 hours. Please contact technical support.
| ok | | 57.711s ago | 82.29us |
| alert: Storage I/O error
expr: instance:abgw_io_errors:count > 0
for: 1m
labels:
component: abgw
severity: error
annotations:
summary: One or more errors have been detected during storage I/O operations in the last 24 hours. Please contact technical support.
| ok | | 57.711s ago | 66.97us |
| alert: Backup storage has high replica open error rate
expr: err:abgw_file_replica_open_errs:rate5m{err!="OK"} / on() group_left() sum(err:abgw_file_replica_open_errs:rate5m) > 0.05
for: 1m
labels:
component: abgw
severity: error
annotations:
summary: Backup storage has an error rate higher than 5% when opening replica files ("{{$labels.err}}").
| ok | | 57.711s ago | 133us |
| alert: Backup storage has high replica write error rate
expr: err:abgw_push_replica_errs:rate5m{err!="OK"} / on() group_left() sum(err:abgw_push_replica_errs:rate5m) > 0.05
for: 1m
labels:
component: abgw
severity: error
annotations:
summary: Backup storage has an error rate higher than 5% when writing replica files ("{{$labels.err}}").
| ok | | 57.711s ago | 117.3us |
| alert: Backup storage has high replica removal error rate
expr: err:abgw_rm_file_push_errs:rate5m{err!="OK"} / on() group_left() sum(err:abgw_rm_file_push_errs:rate5m) > 0.05
for: 1m
labels:
component: abgw
severity: error
annotations:
summary: Backup storage has an error rate higher than 5% when removing secondary replica files ("{{$labels.err}}").
| ok | | 57.711s ago | 107.9us |
| alert: Backup storage service is down
expr: label_replace(node_systemd_unit_state{name=~"abgw-kvstore-proxy.service|abgw-setting.service|vstorage-abgw.service",state="active"}, "name", "$1", "name", "(.*)\\.service") != 1 and on(node) backend_node_abgw == 1
for: 5m
labels:
component: abgw
object_id: '{{ $labels.name }} - {{ $labels.instance }}'
severity: critical
annotations:
summary: Service {{ $labels.name }} is down on host {{ $labels.instance }}.
| ok | | 57.711s ago | 654.5us |
| 3.643s ago | 37.47ms |
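The rules in this table are standard Prometheus alerting rules. As a reference, here is a minimal sketch of how one of them, "Backup storage throttling is activated", would be laid out in a standalone rule file; the file and group names are illustrative, not taken from the deployment. A file in this form can be syntax-checked with `promtool check rules <file>`.

```yaml
# abgw-rules.yml -- illustrative file and group names; the rule body is copied from the table above
groups:
  - name: abgw-alerts
    rules:
      - alert: Backup storage throttling is activated
        expr: job:abgw_append_throttle_delay_ms:rate5m != 0
        for: 1m
        labels:
          component: abgw
          severity: warning
        annotations:
          summary: Backup storage started to throttle write operations due to the lack of free space. Visit https://kb.acronis.com/content/62823 to learn how to troubleshoot this issue.
```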
| Rule | State | Error | Last Evaluation | Evaluation Time |
| alert: Software updates exist
expr: sum by(job, version, available_version) (softwareupdates_node_available) > 0 or sum by(job, version, available_version) (softwareupdates_management_panel_available) > 0
labels:
component: cluster
object_id: '{{ $labels.available_version }}'
severity: info
annotations:
summary: 'Software updates exist for the cluster. Available version: {{$labels.available_version}}.'
| ok | | 3.643s ago | 302.5us |
| alert: Update check failed
expr: softwareupdates_node_info{state="check_failed"} * on(node) group_left(instance) up{job="node"} and on() backend_node_internet_connected == 1
for: 10m
labels:
component: node
object_id: '{{ $labels.node }}'
severity: warning
annotations:
summary: Update check failed on the node {{$labels.instance}}. Please check access to the update repository.
| ok | | 3.642s ago | 465.7us |
| alert: Multiple update checks failed
expr: (sum_over_time(softwareupdates_node_info{state="check_failed"}[3d]) and softwareupdates_node_info{state="check_failed"}) * on(node) group_left(instance) up{job="node"} / (60 * 24 * 3) >= 0.9
labels:
component: node
severity: critical
annotations:
summary: Update checks failed multiple times on the node {{$labels.instance}}. Please check access to the update repository.
| ok | | 3.642s ago | 1.76ms |
| alert: Update download failed
expr: softwareupdates_node_info{state="download_failed"} * on(node) group_left(instance) up{job="node"}
labels:
component: node
severity: critical
annotations:
summary: Update download failed on the node {{$labels.instance}}.
| ok | | 3.64s ago | 198.9us |
| alert: Node update failed
expr: softwareupdates_node_info{state="update_failed"} * on(node) group_left(instance) up{job="node"}
labels:
component: node
severity: critical
annotations:
summary: Software update failed on the node {{$labels.instance}}.
| ok | | 3.64s ago | 154.8us |
| alert: Update failed
expr: sum by(job, instance, version, available_version) (softwareupdates_node_info{state="update_ctrl_plane_failed"})
labels:
component: node
severity: critical
annotations:
summary: Update failed for the management panel and compute API.
| ok | | 3.64s ago | 64.23us |
| alert: Cluster update failed
expr: count(softwareupdates_cluster_info{state="failed"}) + count(softwareupdates_node_info{state="idle"}) == count(up{job="node"}) + 1
labels:
component: cluster
severity: critical
annotations:
summary: Update failed for the cluster.
| ok | | 3.64s ago | 180.4us |
| alert: Entering maintenance for update failed
expr: softwareupdates_node_info{state="maintenance_failed"} * on(node) group_left(instance) up{job="node"}
labels:
component: node
severity: critical
annotations:
summary: Entering maintenance failed while updating the node {{$labels.instance}}.
| ok | | 3.64s ago | 112.2us |
| alert: Network connectivity failed
expr: sum by(network_name) (increase(network_connectivity_received_packets_total[10m])) == 0 and sum by(network_name) (increase(network_connectivity_sent_packets_total[10m])) > 0 and on(network_name) label_replace(cluster_network_info_total, "network_name", "$1", "network", "(.*)")
labels:
component: node
object_id: '{{$labels.network_name}}'
severity: critical
annotations:
summary: No network traffic has been detected via network "{{$labels.network_name}}" from all nodes.
| ok | | 3.64s ago | 3.674ms |
| alert: Node network connectivity problem
expr: sum by(src_host, dest_host, network_name) (increase(network_connectivity_received_packets_total{probe_type="ord"}[10m])) == 0 and sum by(src_host, dest_host, network_name) (increase(network_connectivity_sent_packets_total{probe_type="ord"}[10m])) > 0 and on(dest_host) label_replace(sum_over_time(softwareupdates_node_state{state="rebooting"}[10m]) * on(node) group_left(hostname) (backend_node_online + 1), "dest_host", "$1", "hostname", "(.*)") == 0 and on(dest_host) label_replace(backend_node_online, "dest_host", "$1", "hostname", "(.*)") == 1 and on(network_name) label_replace(cluster_network_info_total, "network_name", "$1", "network", "(.*)")
labels:
component: node
object_id: '{{$labels.network_name}}-{{$labels.src_host}}-{{$labels.dest_host}}'
severity: critical
annotations:
summary: Node "{{$labels.src_host}}" has no network connectivity to node "{{$labels.dest_host}}" via network "{{$labels.network_name}}".
| ok | | 3.636s ago | 2.716ms |
| alert: Node network packet loss
expr: sum by(src_host, dest_host, network_name) (increase(network_connectivity_sent_packets_total{probe_type="ord"}[10m])) > 15 and (sum by(src_host, dest_host, network_name) (increase(network_connectivity_sent_packets_total{probe_type="ord"}[10m])) - sum by(src_host, dest_host, network_name) (increase(network_connectivity_received_packets_total{probe_type="ord"}[10m]))) > 5
labels:
component: node
object_id: '{{$labels.network_name}}-{{$labels.src_host}}-{{$labels.dest_host}}'
severity: warning
annotations:
summary: Node "{{$labels.src_host}}" has a problem with network connectivity to node "{{$labels.dest_host}}" via network "{{$labels.network_name}}" due to the loss of some packets.
| ok | | 3.634s ago | 2.574ms |
| alert: Node network persistent packet loss
expr: sum by(src_host, dest_host, network_name) (increase(network_connectivity_sent_packets_total{probe_type="ord"}[2h])) > 450 and (sum by(src_host, dest_host, network_name) (increase(network_connectivity_sent_packets_total{probe_type="ord"}[2h])) - sum by(src_host, dest_host, network_name) (increase(network_connectivity_received_packets_total{probe_type="ord"}[2h]))) > 50
labels:
component: node
object_id: '{{$labels.network_name}}-{{$labels.src_host}}-{{$labels.dest_host}}'
severity: warning
annotations:
summary: Node "{{$labels.src_host}}" has a problem with network connectivity to node "{{$labels.dest_host}}" via network "{{$labels.network_name}}" due to the persistent loss of some packets over the last two hours.
| ok | | 3.631s ago | 6.549ms |
| alert: Node network unstable connectivity
expr: sum by(src_host, dest_host, network_name) (increase(network_connectivity_received_packets_total{probe_type="mtu"}[10m])) == 0 and sum by(src_host, dest_host, network_name) (increase(network_connectivity_received_packets_total{probe_type="ord"}[10m])) > 0 and sum by(src_host, dest_host, network_name) (increase(network_connectivity_sent_packets_total{probe_type="mtu"}[10m])) > 0
labels:
component: node
object_id: '{{$labels.network_name}}-{{$labels.src_host}}-{{$labels.dest_host}}'
severity: critical
annotations:
summary: Node "{{$labels.src_host}}" has a problem with network connectivity to node "{{$labels.dest_host}}" via network "{{$labels.network_name}}" due to the loss of all MTU-sized packets.
| ok | | 3.625s ago | 2.45ms |
| alert: Node network MTU packet loss
expr: sum by(src_host, dest_host, network_name) (increase(network_connectivity_sent_packets_total{probe_type="mtu"}[10m])) > 15 and (sum by(src_host, dest_host, network_name) (increase(network_connectivity_sent_packets_total{probe_type="mtu"}[10m])) - sum by(src_host, dest_host, network_name) (increase(network_connectivity_received_packets_total{probe_type="mtu"}[10m]))) > 5
labels:
component: node
object_id: '{{$labels.network_name}}-{{$labels.src_host}}-{{$labels.dest_host}}'
severity: warning
annotations:
summary: Node "{{$labels.src_host}}" has a problem with network connectivity to node "{{$labels.dest_host}}" via network "{{$labels.network_name}}" due to the loss of some MTU-sized packets.
| ok | | 3.623s ago | 2.508ms |
| alert: Node network persistent MTU packet loss
expr: sum by(src_host, dest_host, network_name) (increase(network_connectivity_sent_packets_total{probe_type="mtu"}[2h])) > 450 and (sum by(src_host, dest_host, network_name) (increase(network_connectivity_sent_packets_total{probe_type="mtu"}[2h])) - sum by(src_host, dest_host, network_name) (increase(network_connectivity_received_packets_total{probe_type="mtu"}[2h]))) > 50
labels:
component: node
object_id: '{{$labels.network_name}}-{{$labels.src_host}}-{{$labels.dest_host}}'
severity: warning
annotations:
summary: Node "{{$labels.src_host}}" has a problem with network connectivity to node "{{$labels.dest_host}}" via network "{{$labels.network_name}}" due to the persistent loss of some MTU-sized packets over the last two hours.
| ok | | 3.62s ago | 6.745ms |
| alert: Management node HA has four nodes
expr: count(backend_node_ha == 1) == 4
for: 10m
labels:
component: cluster
severity: warning
annotations:
summary: The management node HA configuration has four nodes. It is recommended to include three or five nodes.
| ok | | 3.613s ago | 159.5us |
| alert: Incompatible hardware detected
expr: backend_node_cpu_info{iommu="False",model=~"AMD EPYC.*"} * on(node_id) group_right(model) label_join(backend_node_nic_info{model=~"MT27800 Family \\[ConnectX\\-5\\]",type=~"Infiniband controller|Ethernet controller"}, "nic_model", "", "model")
for: 10m
labels:
component: node
severity: warning
annotations:
summary: '{{- if query "backend_vendor_info{vendor='acronis'}" -}} Incompatible hardware detected on node {{$labels.node_id}}: {{$labels.model}} & {{$labels.nic_model}}. Using Mellanox and AMD may lead to data loss. Please double check that SR-IOV is properly enabled. Visit https://kb.acronis.com/content/64948 to learn how to troubleshoot this issue. {{- else if query "backend_vendor_info{vendor='virtuozzo'}" -}} Incompatible hardware detected on node {{$labels.node_id}}: {{$labels.model}} & {{$labels.nic_model}}. Using Mellanox and AMD may lead to data loss. Please double check that SR-IOV is properly enabled. Visit https://support.virtuozzo.com/hc/en-us/articles/19764365143953 to learn how to troubleshoot this issue. {{- end -}}'
| ok | | 3.613s ago | 235us |
| alert: Node had a fenced state for 1 hour
expr: sum_over_time(hci_compute_node_crashed_fenced[2h]) > 60
for: 5m
labels:
component: compute
severity: critical
annotations:
summary: Node {{$labels.hostname}} with ID {{$labels.node}} has been in the fenced state for at least 1 hour out of the last 2 hours.
| ok | | 3.613s ago | 107.2us |
| alert: Node failed to return to operation
expr: hci_compute_node_crashed_fenced == 1 and on(node) backend_node_online
for: 30m
labels:
component: node
severity: warning
annotations:
summary: Node {{$labels.hostname}} has failed to automatically return to operation within 30 minutes after a crash. Check the node's hardware, and then try returning it to operation manually.
| ok | | 3.613s ago | 144.2us |
| alert: Node crash detected
expr: hci_compute_node_crashed_fenced == 1
for: 30s
labels:
component: node
severity: critical
annotations:
summary: Node {{$labels.hostname}} crashed, which started the VM evacuation.
| ok | | 3.613s ago | 78.88us |
| alert: License is not loaded
expr: cluster_license_info{status="unknown"} == 1
for: 30m
labels:
component: cluster
severity: warning
annotations:
summary: License is not loaded
| ok | | 3.613s ago | 89.58us |
| alert: License expired
expr: cluster_license_info{status=~"(expired|invalid|error|inactive)"} == 1
for: 30m
labels:
component: cluster
severity: critical
annotations:
summary: The license of cluster "{{$labels.cluster_name}}" has expired. Contact your reseller to update your license immediately!
| ok | | 3.613s ago | 121.4us |
| alert: License is not updated
expr: cluster_license_info{expire_in_days=~"[7-9]|(1[0-9])|20",is_spla="False"} == 1
for: 30m
labels:
component: cluster
severity: warning
annotations:
summary: The license cannot be updated automatically and will expire in less than 21 days. Check the cluster connectivity to the license server or contact technical support.
| ok | | 3.613s ago | 97.84us |
| alert: License will expire soon
expr: cluster_license_info{expire_in_days=~"[1-6]"} == 1
for: 30m
labels:
component: cluster
severity: critical
annotations:
summary: The license has not been updated automatically and will expire in less than 7 days. Check the cluster connectivity to the license server and contact technical support immediately.
| ok | | 3.613s ago | 75.54us |
| alert: Kafka SSL CA certificate will expire in less than 30 days
expr: (kafka_ssl_ca_cert_expire_in_days > 0) <= 30
labels:
component: compute
severity: warning
value: '{{ $value }}'
annotations:
summary: Kafka SSL CA certificate will expire in {{ $value }} days. Please renew the certificate.
| ok | | 3.613s ago | 79.93us |
| alert: Kafka SSL CA certificate has expired
expr: kafka_ssl_ca_cert_expire_in_days <= 0
labels:
component: compute
severity: critical
annotations:
summary: Kafka SSL CA certificate has expired. Please renew the certificate.
| ok | | 3.613s ago | 88.51us |
| alert: Kafka SSL client certificate will expire in less than 30 days
expr: (kafka_ssl_client_cert_expire_in_days > 0) <= 30
labels:
component: compute
severity: warning
value: '{{ $value }}'
annotations:
summary: Kafka SSL client certificate will expire in {{ $value }} days. Please renew the certificate.
| ok | | 3.613s ago | 60.98us |
| alert: Kafka SSL client certificate has expired
expr: kafka_ssl_client_cert_expire_in_days <= 0
labels:
component: compute
severity: critical
annotations:
summary: Kafka SSL client certificate has expired. Please renew the certificate.
| ok | | 3.613s ago | 56.49us |
| alert: Compute cluster has failed
expr: compute_status == 2
labels:
component: compute
severity: critical
annotations:
summary: Compute cluster has failed. Unable to manage virtual machines.
| ok | | 3.613s ago | 74.53us |
| alert: Changes to the management database are not replicated
expr: db_replication_status == 2 and on(node) softwareupdates_node_state{state!~"updat.*"} == 1
for: 10m
labels:
component: node
severity: critical
annotations:
summary: Changes to the management database are not replicated to the node "{{ $labels.host }}" because it is offline. Check the node's state and connectivity.
| ok | | 3.613s ago | 526.4us |
| alert: Changes to the management database are not replicated
expr: db_replication_status == 1
labels:
component: node
severity: critical
annotations:
summary: Changes to the management database are not replicated to the node "{{ $labels.host }}". Please contact technical support.
| ok | | 3.612s ago | 43.51us |
| alert: Management panel SSL certificate will expire in less than 30 days
expr: (backend_ui_ssl_cert_expire_in_days > 7) <= 30
labels:
component: cluster
severity: warning
value: '{{ $value }}'
annotations:
summary: The SSL certificate for the admin and self-service panels will expire in {{ $value }} days. Renew the certificate, as described in the product documentation, or contact technical support.
| ok | | 3.612s ago | 74.28us |
| alert: Management panel SSL certificate will expire in less than 7 days
expr: (backend_ui_ssl_cert_expire_in_days > 0) <= 7
labels:
component: cluster
severity: critical
value: '{{ $value }}'
annotations:
summary: The SSL certificate for the admin and self-service panels will expire in {{ $value }} days. Renew the certificate, as described in the product documentation, or contact technical support.
| ok | | 3.612s ago | 58.49us |
| alert: Management panel SSL certificate has expired
expr: backend_ui_ssl_cert_expire_in_days < 1
labels:
component: cluster
severity: critical
annotations:
summary: The SSL certificate for the admin and self-service panels has expired. Renew the certificate, as described in the product documentation, or contact technical support.
| ok | | 3.612s ago | 80.11us |
| alert: Kernel is outdated
expr: backend_node_kernel_outdated == 1 and on(node) softwareupdates_node_state{state="uptodate"} == 1
labels:
component: node
severity: warning
annotations:
summary: Node "{{ $labels.instance }}" is not running the latest kernel.
| ok | | 3.612s ago | 188.2us |
| alert: Unable to push space usage statistics
expr: cluster_spla_last_action_days{action_type="report"} > 1
labels:
component: cluster
severity: warning
annotations:
summary: Unable to push space usage statistics for the cluster. Check the internet connection on the management node.
| ok | | 3.612s ago | 56.3us |
| alert: Unable to apply SPLA license
expr: cluster_spla_last_action_days{action_type="update_key"} > 1
labels:
component: cluster
severity: error
annotations:
summary: Unable to apply the SPLA license for the cluster. Contact your reseller to resolve this issue.
| ok | | 3.612s ago | 45.34us |
| alert: Unable to get space usage
expr: cluster_spla_last_action_days{action_type="get_usage"} > 1
labels:
component: cluster
severity: error
annotations:
summary: Unable to get space usage for the cluster.
| ok | | 3.612s ago | 92.02us |
| alert: Disk SMART warning
expr: backend_node_disk_status{role!="unassigned",smart_status="failed"}
labels:
component: cluster
object_id: '{{ $labels.device }}-{{ $labels.serial_number }}-{{ $labels.instance }}'
severity: error
annotations:
summary: Disk "{{ $labels.device }}" ({{ $labels.serial_number }}) on node "{{ $labels.instance }}" has failed a S.M.A.R.T. check.
| ok | | 3.612s ago | 59.75us |
| alert: Disk error
expr: backend_node_disk_status{disk_status=~"unavail|failed",role!="unassigned"}
labels:
component: cluster
object_id: '{{ $labels.device }}-{{ $labels.serial_number }}-{{ $labels.instance }}'
severity: error
annotations:
summary: Disk "{{ $labels.device }}" ({{ $labels.serial_number }}) has failed on node "{{ $labels.instance }}".
| ok | | 3.612s ago | 87.56us |
| alert: Management node backup does not exist
expr: backend_database_backup_age{last_backup_date="None"} == 0
labels:
component: cluster
severity: error
annotations:
summary: The last management node backup has failed or does not exist!
| ok | | 3.612s ago | 92.91us |
| alert: Management node backup is old
expr: (backend_database_backup_age > 0) < 3
for: 1h
labels:
component: cluster
severity: warning
value: '{{ $value }}'
annotations:
summary: Management node backup is older than {{ $value }} day(s).
| ok | | 3.612s ago | 62.21us |
| alert: Management node backup is too old
expr: backend_database_backup_age > 3
labels:
component: cluster
severity: error
value: '{{ $value }}'
annotations:
summary: Management node backup is older than {{ $value }} days.
| ok | | 3.612s ago | 54.92us |
| alert: Node has no internet access
expr: backend_node_internet_connected == 0
for: 10m
labels:
component: node
severity: warning
annotations:
summary: Node "{{ $labels.instance }}" cannot reach the repository. Ensure the node has a working internet connection.
| ok | | 3.612s ago | 77.93us |
| alert: High availability for the admin panel must be configured
expr: count by(cluster_id) (backend_node_master) >= 3 and on(cluster_id) backend_ha_up == 0
for: 15m
labels:
component: cluster
severity: error
annotations:
summary: |
Configure high availability for the admin panel in SETTINGS > System settings > Management node high availability. Otherwise the admin panel will be a single point of failure.
| ok | | 3.612s ago | 135.8us |
| alert: Identity provider validation error
expr: backend_idp_error{error_type="validation_error"} == 1
labels:
component: cluster
severity: error
annotations:
summary: Invalid identity provider configuration "{{ $labels.idp_name }}" in domain "{{ $labels.domain_name }}".
| ok | | 3.612s ago | 103.2us |
| alert: Identity provider connection error
expr: backend_idp_error{error_type="connection_error"} == 1
for: 10m
labels:
component: cluster
severity: error
annotations:
summary: Unable to connect to identity provider "{{ $labels.idp_name }}" in domain "{{ $labels.domain_name }}".
| ok | | 3.612s ago | 105us |
| alert: Backend service is down
expr: label_replace(sum by(name) (node_systemd_unit_state{name=~"vstorage-ui-backend.service",state="active"}), "name", "$1", "name", "(.*)\\.service") == 0
for: 5m
labels:
component: cluster
object_id: '{{ $labels.name }}'
severity: critical
annotations:
summary: Service {{$labels.name}} is down.
| ok | | 3.612s ago | 457.2us |
| alert: Primary management node service is down
expr: label_replace(node_systemd_unit_state{name=~"vstorage-ui-backend-s3-proxy.service|vstorage-ui-backend-consul.service|vstorage-ui-backend-celery_periodic.service",state="active"}, "name", "$1", "name", "(.*)\\.service") != 1 and on(node) backend_node_master == 1
for: 5m
labels:
component: cluster
object_id: '{{ $labels.name }}'
severity: critical
annotations:
summary: Service {{$labels.name}} is down on host {{$labels.instance}}.
| ok | | 3.611s ago | 519.4us |
| alert: High availability service is down
expr: label_replace(node_systemd_unit_state{name=~"vstorage-ui-backend-raftor.service",state="active"}, "name", "$1", "name", "(.*)\\.service") != 1 and on(node) backend_node_management == 1 and on() backend_ha_up == 1
for: 5m
labels:
component: cluster
object_id: '{{ $labels.name }}'
severity: critical
annotations:
summary: Service {{$labels.name}} is down on host {{$labels.instance}}.
| ok | | 3.611s ago | 454.1us |
| alert: Management node service is down
expr: label_replace(node_systemd_unit_state{name=~"alertmanager.service|prometheus.service|postgresql.service|pgbouncer.service",state="active"}, "name", "$1", "name", "(.*)\\.service") != 1 and on(node) backend_node_management == 1
for: 5m
labels:
component: cluster
object_id: '{{ $labels.name }} - {{ $labels.instance }}'
severity: critical
annotations:
summary: Service {{ $labels.name }} is down on host {{ $labels.instance }}.
| ok | | 3.611s ago | 566.2us |
| alert: Node service is down
expr: label_replace(node_systemd_unit_state{name=~"mtail.service|chronyd.service|multipathd.service|sshd.service|disp-helper.service|abrtd.service|abrt-oops.service",state="active"}, "name", "$1", "name", "(.*)\\.service") != 1
for: 5m
labels:
component: node
object_id: '{{ $labels.name }} - {{ $labels.instance }}'
severity: warning
annotations:
summary: Service {{ $labels.name }} is down on host {{ $labels.instance }}.
| ok | | 3.61s ago | 681us |
| alert: Critical node service is down
expr: label_replace(node_systemd_unit_state{name=~"nginx.service|vcmmd.service|vstorage-ui-agent.service",state="active"}, "name", "$1", "name", "(.*)\\.service") != 1
for: 5m
labels:
component: node
object_id: '{{ $labels.name }} - {{ $labels.instance }}'
severity: critical
annotations:
summary: Service {{ $labels.name }} is down on host {{ $labels.instance }}.
| ok | | 3.61s ago | 389us |
| alert: Compute node service is down
expr: label_replace(node_systemd_unit_state{name=~"openvswitch.service|ovs-vswitchd.service|ovsdb-server.service",state="active"}, "name", "$1", "name", "(.*)\\.service") != 1 and on(node) backend_node_compute == 1
for: 5m
labels:
component: compute
object_id: '{{ $labels.name }} - {{ $labels.instance }}'
severity: critical
annotations:
summary: Service {{ $labels.name }} is down on host {{ $labels.instance }}.
| ok | | 3.609s ago | 464.5us |
| 59.955s ago | 37.59ms |
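Individual rules from this table can also be unit-tested offline with `promtool test rules`. Below is a hedged sketch for the "Node has no internet access" alert; the file names, the 1-minute evaluation interval, and the `node1` instance value are assumptions for the example, not values taken from the cluster.

```yaml
# internet-access-test.yml -- hypothetical test file; rules.yml is assumed to hold the rules above
rule_files:
  - rules.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # node1 reports no internet access for the whole 20-minute window
      - series: 'backend_node_internet_connected{instance="node1"}'
        values: '0x20'
    alert_rule_test:
      - eval_time: 15m   # well past the 10m "for" duration, so the alert should be firing
        alertname: Node has no internet access
        exp_alerts:
          - exp_labels:
              instance: node1
              component: node
              severity: warning
            exp_annotations:
              summary: Node "node1" cannot reach the repository. Ensure the node has a working internet connection.
```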
| Rule | State | Error | Last Evaluation | Evaluation Time |
| alert: Node has high CPU usage
expr: round(100 - (avg by(instance) (irate(node_cpu_seconds_total{job="node",mode="idle"}[5m])) * 100)) > 90
for: 15m
labels:
component: node
severity: critical
annotations:
summary: Node {{ $labels.instance}} has CPU usage higher than 90%. The current value is {{ $value }}.
value: '{{ $value }}'
| ok | | 59.955s ago | 7.355ms |
| alert: Node has high memory usage
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 95 and on(instance) label_replace(openstack_placement_resource_allocation_ratio{resourcetype="MEMORY_MB"}, "instance", "$1", "hostname", "(.*).vstoragedomain") == 1 and (rate(node_vmstat_pswpin[5m]) + rate(node_vmstat_pswpout[5m])) > 100
for: 5m
labels:
component: node
severity: warning
annotations:
summary: Node {{$labels.instance}} has memory usage higher than 95%. The current value is {{ $value }}.
value: '{{ $value }}'
| ok | | 59.947s ago | 613us |
| alert: Node has high disk I/O usage
expr: round(rate(node_disk_io_time_seconds_total{device=~".+",job="node"}[2m]) * 100) > 85
for: 15m
labels:
component: node
severity: critical
annotations:
summary: Disk /dev/{{$labels.device}} on node {{$labels.instance}} has I/O usage higher than 85%. The current value is {{ $value }}.
value: '{{ $value }}'
| ok | | 59.947s ago | 1.008ms |
| alert: Node has high swap usage
expr: (node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / node_memory_SwapTotal_bytes * 100 > 70 and (node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / node_memory_SwapTotal_bytes * 100 < 95 and (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 90 and (rate(node_vmstat_pswpin[5m]) + rate(node_vmstat_pswpout[5m])) > 100 and delta(node_memory_SwapFree_bytes[5m]) < 0
for: 5m
labels:
component: node
severity: warning
annotations:
summary: Node {{$labels.instance}} has swap usage higher than 70%. The current value is {{ $value }}.
value: '{{ $value }}'
| ok | | 59.946s ago | 859us |
| alert: Node has critically high swap usage
expr: (node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / node_memory_SwapTotal_bytes * 100 >= 95 and (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 90 and (rate(node_vmstat_pswpin[5m]) + rate(node_vmstat_pswpout[5m])) > 100 and delta(node_memory_SwapFree_bytes[5m]) < 0
for: 5m
labels:
component: node
severity: warning
annotations:
summary: Node {{$labels.instance}} has critically high swap usage, exceeding 95%. The current value is {{ $value }}.
value: '{{ $value }}'
| ok | | 59.945s ago | 1.107ms |
| alert: Node time not synced
expr: floor((abs(backend_node_time_seconds{role="node"} - scalar(backend_node_time_seconds{role="backend"})) > 5) and (abs(backend_node_time_seconds{role="node"} - scalar(backend_node_time_seconds{role="backend"})) < 30))
for: 6m
labels:
component: node
severity: warning
annotations:
summary: Time on node {{$labels.instance}} differs from the time on the backend node by more than {{ $value }} seconds.
value: '{{ $value }}'
| ok | | 59.944s ago | 316.4us |
| alert: Node time critically unsynced
expr: floor(abs(backend_node_time_seconds{role="node"} - scalar(backend_node_time_seconds{role="backend"})) > 30)
for: 6m
labels:
component: node
severity: critical
annotations:
summary: Time on node {{$labels.instance}} is critically unsynced, differing from the time on the backend node by more than {{ $value }} seconds.
value: '{{ $value }}'
| ok | | 59.944s ago | 191.3us |
| alert: Not enough cluster nodes
expr: sum by(cluster_id) (backend_node_online) < 3
for: 5m
labels:
component: cluster
severity: warning
value: '{{ $value }}'
annotations:
summary: Cluster has only {{ $value }} node(s) instead of the recommended minimum of 3. Add more nodes to the cluster.
| ok | | 59.944s ago | 91.64us |
| alert: Software RAID is not fully synced
expr: round(((node_md_blocks_synced / node_md_blocks) * 100) < 100) and on() (node_md_state{state="active"} == 1)
for: 5m
labels:
component: node
severity: warning
annotations:
summary: Software RAID {{ $labels.device }} on node {{ $labels.instance }} is {{ $value }}% synced.
value: '{{ $value }}'
| ok | | 59.944s ago | 152.4us |
| alert: Licensed storage capacity is low
expr: ((job:mdsd_fs_allocated_size_bytes:sum > job:mdsd_cluster_licensed_space_bytes:sum * 0.8) < job:mdsd_cluster_licensed_space_bytes:sum * 0.9) / 1024 ^ 3
for: 5m
labels:
component: cluster
severity: warning
annotations:
summary: '{{- if query "backend_vendor_info{vendor='acronis'}" -}} Licensed storage capacity is low as the cluster has reached 80% of licensed storage capacity. Please switch to the SPLA licensing model. {{- else if query "backend_vendor_info{vendor='virtuozzo'}" -}} Cluster has reached 80% of licensed storage capacity. {{- end -}}'
| ok | | 59.944s ago | 169us |
| alert: Licensed storage capacity is critically low
expr: (job:mdsd_fs_allocated_size_bytes:sum >= job:mdsd_cluster_licensed_space_bytes:sum * 0.9) / 1024 ^ 3
for: 5m
labels:
component: cluster
severity: critical
annotations:
summary: '{{- if query "backend_vendor_info{vendor='acronis'}" -}} Licensed storage capacity is critically low as the cluster has reached 90% of licensed storage capacity. Please switch to the SPLA licensing model. {{- else if query "backend_vendor_info{vendor='virtuozzo'}" -}} Cluster has reached 90% of licensed storage capacity. {{- end -}}'
| ok | | 59.944s ago | 86.95us |
| alert: Network interface is flapping
expr: round(increase(node_network_carrier_changes_total{job="node"}[15m])) > 5
for: 5m
labels:
component: node
severity: warning
annotations:
summary: Network interface {{$labels.device}} on node {{$labels.instance}} is flapping.
| ok | | 59.944s ago | 1.794ms |
| alert: MTU mismatch
expr: count without(mtu) (count by(mtu, network) (count_values without(job) ("mtu", node_network_mtu_bytes{job="node"}) * on(node, device) group_left(network) count without(job) (cluster_network_info_created{network!="<unspecified>"}))) > 1
for: 5m
labels:
component: node
severity: critical
annotations:
summary: Network {{$labels.network}} has assigned interfaces with different MTU.
| ok | | 59.942s ago | 2.576ms |
| alert: Node has high receive packet loss rate
expr: instance_device:node_network_receive_drop:rate5m{device!="br-int"} > 1000
for: 5m
labels:
component: node
severity: warning
annotations:
summary: Node {{ $labels.instance }} has a high receive packet loss rate ({{ humanize $value }}). Please check the node network settings.
value: '{{ $value }}'
| ok | | 59.939s ago | 1.354ms |
| alert: Node has high transmit packet loss rate
expr: instance_device:node_network_transmit_drop:rate5m{device!="br-int"} > 1000
for: 5m
labels:
component: node
severity: warning
annotations:
summary: Node {{ $labels.instance }} has a high transmit packet loss rate ({{ humanize $value }}). Please check the node network settings.
value: '{{ $value }}'
| ok | | 59.938s ago | 1.28ms |
| alert: Node has high receive packet error rate
expr: instance_device:node_network_receive_errs:rate5m{device!="br-int"} > 1000
for: 5m
labels:
component: node
severity: warning
annotations:
summary: Node {{ $labels.instance }} has a high receive packet error rate ({{ humanize $value }}). Please check the node network settings.
value: '{{ $value }}'
| ok | | 59.937s ago | 1.422ms |
| alert: Node has high transmit packet error rate
expr: instance_device:node_network_transmit_errs:rate5m{device!="br-int"} > 1000
for: 5m
labels:
component: node
severity: warning
annotations:
summary: Node {{ $labels.instance }} has a high transmit packet error rate ({{ humanize $value }}). Please check the node network settings.
value: '{{ $value }}'
| ok | | 59.936s ago | 1.363ms |
| alert: Network bond is not redundant
expr: node_bonding_slaves - node_bonding_active > 0
for: 5m
labels:
component: node
severity: critical
value: '{{ $value }}'
annotations:
summary: Network bond {{ $labels.master }} on node {{ $labels.instance }} is missing {{ $value }} subordinate interface(s).
| ok | | 59.934s ago | 252us |
| alert: Node is offline
expr: label_replace(backend_node_online, "hostname", "$1", "hostname", "([^.]*).*") == 0 unless on(node) softwareupdates_node_state{state=~"updating|rebooting"} == 1
for: 5m
labels:
component: node
severity: critical
annotations:
summary: Node {{ $labels.hostname }} with ID {{ $labels.node }} is offline.
| ok | | 59.934s ago | 436.3us |
| alert: Node got offline too many times
expr: changes(backend_node_online[1h]) > (3 * 2)
for: 5m
labels:
component: node
severity: critical
annotations:
summary: Node "{{ $labels.hostname }}" got offline too many times for the last hour.
| ok | | 59.934s ago | 132.1us |
| alert: Systemd service is flapping
expr: changes(node_systemd_unit_state{state="failed"}[5m]) > 5 or (changes(node_systemd_unit_state{state="failed"}[1h]) > 15 unless changes(node_systemd_unit_state{state="failed"}[30m]) < 7)
for: 5m
labels:
component: node
severity: critical
annotations:
summary: Systemd service {{ $labels.name }} on node {{ $labels.instance }} has changed its state more than 5 times in 5 minutes or 15 times in one hour.
| ok | | 59.934s ago | 4.151ms |
| alert: Cluster is out of physical space
expr: round((job:mdsd_cluster_raw_space_free:sum / job:mdsd_cluster_raw_space_total:sum) * 100, 0.01) < 10
for: 5m
labels:
component: cluster
severity: critical
annotations:
summary: Cluster has just {{ $value }}% of physical storage space left. You may want to free some space or add more storage capacity.
value: '{{ $value }}'
| ok | | 59.93s ago | 117.1us |
| alert: Cluster is out of licensed space
expr: round((cluster_logical_free_space_size_bytes / cluster_logical_total_space_size_bytes) * 100, 0.01) < 0.1
for: 5m
labels:
component: cluster
severity: critical
annotations:
summary: Cluster "{{ $labels.cluster_name }}" has run out of the storage space allowed by the license. No more data can be written. Please contact your reseller to update your license immediately!
| ok | | 59.929s ago | 128.2us |
| alert: Disk is running out of space
expr: round(node_filesystem_free_bytes{job="node",mountpoint="/"} / node_filesystem_size_bytes{job="node",mountpoint="/"} * 100, 0.01) < 10 or node_filesystem_free_bytes{job="node",mountpoint="/"} < 5 * 1024 ^ 3
for: 5m
labels:
component: node
severity: warning
annotations:
summary: Root partition on node "{{ $labels.instance }}" is running out of space.
| ok | | 59.929s ago | 553.6us |
| alert: Compute node disk is out of space
expr: (node_filesystem_free_bytes{job="node",mountpoint="/"} and on(node) (backend_node_compute_worker == 1)) < 1024 ^ 3 or (node_filesystem_free_bytes{job="node",mountpoint="/"} and on(node) (backend_node_compute_controller == 1)) < 10 * 1024 ^ 3
for: 5m
labels:
component: node
severity: critical
annotations:
summary: Root partition on compute node "{{ $labels.instance }}" is running out of space.
| ok | | 59.929s ago | 412us |
| alert: Disk is out of space
expr: (round(node_filesystem_free_bytes{job="node",mountpoint="/"} / node_filesystem_size_bytes{job="node",mountpoint="/"} * 100, 0.01) and on(node) ((backend_node_compute == 1 and backend_node_compute_controller == 0) or backend_node_compute == 0)) < 5
for: 5m
labels:
component: node
severity: critical
annotations:
summary: Root partition on node "{{ $labels.instance }}" is running out of space.
| ok | | 59.929s ago | 386.7us |
| alert: Low network interface speed
expr: node_network_speed_bytes > 0 and node_network_speed_bytes / 125000 < 1000 and on(device, node) cluster_network_info_total{network!="<unspecified>"}
for: 5m
labels:
component: node
severity: warning
annotations:
summary: Network interface "{{ $labels.device}}" on node "{{ $labels.instance }}" has a speed lower than the required minimum of 1 Gbps.
| ok | | 59.928s ago | 2.615ms |
| alert: Network interface half duplex
expr: node_network_info{duplex="half",operstate="up"} and on(device, node) cluster_network_info_total{network!="<unspecified>"}
for: 5m
labels:
component: node
severity: warning
annotations:
summary: Network interface "{{$labels.device}}" on node "{{$labels.instance}}" is not in full duplex mode.
| ok | | 59.926s ago | 304.5us |
| alert: Four metadata services in cluster
expr: count(cluster_mdsd_info) == 4 and count(cluster_mdsd_info) <= count(backend_node_master)
for: 5m
labels:
component: cluster
severity: warning
annotations:
summary: Cluster has four metadata services. This configuration slows down the cluster performance and does not improve its availability. For a cluster of four nodes, it is enough to configure three MDSes. Delete an extra MDS from one of the cluster nodes.
| ok | | 59.926s ago | 204.2us |
| alert: Over five metadata services in cluster
expr: count(cluster_mdsd_info) > 5 and count(cluster_mdsd_info) <= count(backend_node_master)
for: 5m
labels:
component: cluster
severity: warning
annotations:
summary: Cluster has more than five metadata services. This configuration slows down the cluster performance and does not improve its availability. For a large cluster, it is enough to configure five MDSes. Delete extra MDSes from the cluster nodes.
| ok | | 59.925s ago | 233us |
| alert: More than one metadata service per node
expr: count by(node, hostname) (cluster_mdsd_info * on(node) group_left(hostname) backend_node_master) > 1
for: 5m
labels:
component: cluster
severity: warning
annotations:
summary: Node "{{ $labels.hostname }}" has more than one metadata service located on it. It is recommended to have only one metadata service per node. Delete the extra metadata services from this node and create them on other nodes instead.
| ok | | 59.925s ago | 172.7us |
| alert: Not enough metadata disks
expr: count(cluster_mdsd_disk_info) == 2
for: 5m
labels:
component: cluster
severity: warning
annotations:
summary: Cluster requires more disks with the metadata role. Losing one more MDS will halt cluster operation.
| ok | | 59.925s ago | 80.69us |
| alert: Only one metadata disk in cluster
expr: count(cluster_mdsd_disk_info) == 1
for: 5m
labels:
component: cluster
severity: warning
annotations:
summary: Cluster has only one MDS. There is only one disk with the metadata role at the moment. Losing this disk will completely destroy all cluster data, irrespective of the redundancy scheme.
| ok | | 59.925s ago | 60.16us |
| alert: Not enough storage disks
expr: cluster_min_req_redundancy_number{failure_domain="host"} > scalar(count(count by(node) (cluster_csd_info))) or cluster_min_req_redundancy_number{failure_domain="disk"} > scalar(count(cluster_csd_info))
for: 5m
labels:
component: cluster
object_id: '{{ $labels.service }}'
severity: warning
annotations:
summary: Cluster requires more disks with the storage role to be able to provide the required level of redundancy for '{{ $labels.service }}' service.
| ok | | 59.925s ago | 667.5us |
| alert: Zero storage disks
expr: absent(cluster_csd_info) and on() count(up{job="mds"}) > 0 and on() up{job="backend"} == 1
for: 5m
labels:
component: cluster
severity: warning
annotations:
summary: Cluster has zero disks with the storage role and cannot provide the required level of redundancy.
| ok | | 59.925s ago | 364.6us |
| alert: Shaman service is down
expr: up{job="shaman"} == 0
for: 5m
labels:
component: node
object_id: '{{ $labels.instance }}'
severity: critical
annotations:
summary: Shaman service is down on host {{$labels.instance}}.
| ok | | 59.924s ago | 82.78us |
| alert: Infrastructure interface has high receive packet drop rate
expr: ((rate(node_network_receive_drop_total{device!~"lo|tap.*",job="node"}[5m])) / (rate(node_network_receive_packets_total{device!~"lo|tap.*",job="node"}[5m]) != 0)) * 100 > 5
for: 10m
labels:
component: node
object_id: '{{ $labels.device }}'
severity: critical
annotations:
summary: |
Network interface {{ $labels.device }} on node {{ $labels.instance }} has a receive packet drop rate higher than 5%. Please check the connectivity of the physical network devices.
| ok | | 59.924s ago | 2.138ms |
| alert: Infrastructure interface has high transmit packet drop rate
expr: ((rate(node_network_transmit_drop_total{device!~"lo|tap.*",job="node"}[5m])) / (rate(node_network_transmit_packets_total{device!~"lo|tap.*",job="node"}[5m]) != 0)) * 100 > 5
for: 10m
labels:
component: node
object_id: '{{ $labels.device }}'
severity: critical
annotations:
summary: |
Network interface {{ $labels.device }} on node {{ $labels.instance }} has a transmit packet drop rate higher than 5%. Please check the connectivity of the physical network devices.
| ok | | 59.922s ago | 2.208ms |
| alert: Connection tracking table is full
expr: increase(kernel_conntrack_table_full_total[10m]) > 0
labels:
component: node
severity: critical
annotations:
summary: The kernel connection tracking table on node {{ $labels.instance}} has reached its maximum capacity. This may lead to network issues.
| ok | | 59.92s ago | 95.33us |
| 6.142s ago | 1.087ms |
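When any of the rules above fire, Prometheus also exposes them through the built-in `ALERTS` series, which carries the rule labels shown in this table (for example `component` and `severity`). A quick way to see what is currently pending or firing from these node-level rules, written as plain PromQL:

```promql
# All alerts carrying component="node" that are currently pending or firing
ALERTS{component="node"}

# Only the alerts that have passed their "for" duration and are actively firing
ALERTS{component="node", alertstate="firing"}
```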
| Rule | State | Error | Last Evaluation | Evaluation Time |
| alert: Cluster is running out of vCPU resources
expr: 80 < sum(openstack_placement_resource_usage{resourcetype="VCPU"}) / sum(openstack_placement_resource_allocation_ratio{resourcetype="VCPU"} * (openstack_placement_resource_total{resourcetype="VCPU"} - openstack_placement_resource_reserved{resourcetype="VCPU"})) * 100 < 95
for: 10m
labels:
component: compute
object_id: openstack_exporter
severity: info
annotations:
summary: Cluster has reached 80% of the vCPU allocation limit.
| ok | | 18.035s ago | 807.1us |
| alert: Cluster is out of vCPU resources
expr: sum(openstack_placement_resource_usage{resourcetype="VCPU"}) / sum(openstack_placement_resource_allocation_ratio{resourcetype="VCPU"} * (openstack_placement_resource_total{resourcetype="VCPU"} - openstack_placement_resource_reserved{resourcetype="VCPU"})) * 100 > 95
for: 10m
labels:
component: compute
object_id: openstack_exporter
severity: warning
annotations:
summary: Cluster has reached 95% of the vCPU allocation limit.
| ok | | 18.035s ago | 522.4us |
| alert: Cluster is running out of memory
expr: 80 < sum(openstack_placement_resource_usage{resourcetype="MEMORY_MB"}) / sum(openstack_placement_resource_allocation_ratio{resourcetype="MEMORY_MB"} * (openstack_placement_resource_total{resourcetype="MEMORY_MB"} - openstack_placement_resource_reserved{resourcetype="MEMORY_MB"})) * 100 < 95
for: 10m
labels:
component: compute
object_id: openstack_exporter
severity: info
annotations:
summary: Cluster has reached 80% of the memory allocation limit.
| ok | | 18.034s ago | 615.9us |
| alert: Cluster is out of memory
expr: sum(openstack_placement_resource_usage{resourcetype="MEMORY_MB"}) / sum(openstack_placement_resource_allocation_ratio{resourcetype="MEMORY_MB"} * (openstack_placement_resource_total{resourcetype="MEMORY_MB"} - openstack_placement_resource_reserved{resourcetype="MEMORY_MB"})) * 100 > 95
for: 10m
labels:
component: compute
object_id: openstack_exporter
severity: warning
annotations:
summary: Cluster has reached 95% of the memory allocation limit.
| ok | | 18.034s ago | 473.9us |
| alert: Virtual machine error
expr: label_replace(openstack_nova_server_status{status="ERROR"}, "object_id", "$1", "id", "(.*)")
labels:
component: compute
severity: critical
annotations:
summary: Virtual machine {{$labels.name}} with ID {{$labels.id}} is in the 'Error' state.
| ok | | 18.033s ago | 190.7us |
| alert: Virtual machine state mismatch
expr: label_join((count_over_time(nova:libvirt:server:diff[2h]) > 60) and (nova:libvirt:server:diff), "object_id", "", "id")
for: 10m
labels:
component: compute
severity: critical
annotations:
summary: State of virtual machine {{$labels.name}} with ID {{$labels.id}} differs between the Nova databases and the libvirt configuration.
| ok | | 18.033s ago | 281.4us |
| alert: Virtual machine is not responding
expr: sum by(project_name, name, domain_uuid) (instance_domain:libvirt_domain_block_stats_read_bytes:rate5m) == 0 and sum by(project_name, name, domain_uuid) (instance_domain:libvirt_domain_block_stats_write_bytes:rate5m) == 0 and sum by(project_name, name, domain_uuid) (instance_domain:libvirt_domain_interface_stats_receive_bytes:rate5m) == 0 and sum by(project_name, name, domain_uuid) (instance_domain:libvirt_domain_interface_stats_transmit_bytes:rate5m) == 0 and sum by(project_name, name, domain_uuid) (instance_domain:libvirt_domain_info_cpu_time_seconds:rate5m) > 0.1
for: 10m
labels:
component: compute
object_id: '{{$labels.domain_uuid}}'
severity: critical
annotations:
summary: |
Virtual machine {{$labels.name}} in project {{$labels.project_name}} has stopped responding. Consider VM restart.
| ok | | 18.033s ago | 3.795ms |
| alert: Virtual machine has crashed
expr: (libvirt_domain_info_state == 5 and libvirt_domain_info_state_reason == 3) or (libvirt_domain_info_state == 6 and libvirt_domain_info_state_reason == 1) or (libvirt_domain_info_state == 1 and libvirt_domain_info_state_reason == 9) or (libvirt_domain_info_state == 3 and libvirt_domain_info_state_reason == 10)
for: 10m
labels:
component: compute
object_id: '{{$labels.domain_uuid}}'
severity: critical
annotations:
summary: |
Virtual machine with ID {{$labels.domain_uuid}} in project {{$labels.project_name}} has crashed. Restart the VM.
| ok | | 18.029s ago | 6.978ms |
| alert: Volume attachment details mismatch
expr: label_replace((count_over_time(cinder:libvirt:volume:diff[2h]) > 60) and (cinder:libvirt:volume:diff), "object_id", "$1", "volume_id", "(.*)")
for: 10m
labels:
component: compute
severity: critical
annotations:
summary: Attachment details for the volume with ID {{$labels.volume_id}} differ between the Nova and libvirt databases. This may also indicate an uncommitted temporary snapshot.
| ok | | 18.022s ago | 257.6us |
| alert: Volume is stuck in transitional state
expr: openstack_cinder_volume_gb{status=~"attaching|detaching|deleting|extending|reserved"}
for: 15m
labels:
component: compute
object_id: '{{ $labels.id }}'
severity: warning
annotations:
summary: Volume {{ $labels.id }} has been stuck with the {{ $labels.status }} status for more than 15 minutes.
| ok | | 18.022s ago | 146us |
| alert: Volume has incorrect status
expr: openstack_cinder_volume_gb{status=~"error|error_deleting|error_managing|error_restoring|error_backing-up|error_extending"}
for: 10m
labels:
component: compute
object_id: '{{ $labels.id }}'
severity: critical
annotations:
summary: Volume {{ $labels.id }} has the {{ $labels.status }} status.
| ok | | 18.022s ago | 162.6us |
| alert: Virtual network port check failed
expr: neutron_port_status_failed{check!="dhcp",device_owner!="network:dhcp"} == 1 unless on(device_id) label_join(openstack_nova_server_status{status="SHELVED_OFFLOADED"}, "device_id", "", "uuid")
for: 10m
labels:
component: compute
object_id: '{{$labels.port_id}}'
severity: critical
annotations:
summary: Neutron port with ID {{$labels.port_id}} failed the {{$labels.check}} check. The port type is {{$labels.device_owner}}, with owner ID {{$labels.device_id}}.
| ok | | 18.022s ago | 14.14ms |
| alert: Virtual network port check failed
expr: neutron_port_status_failed{check!="dhcp",device_owner="network:dhcp"} == 1
for: 10m
labels:
component: compute
object_id: '{{$labels.port_id}}'
severity: warning
annotations:
summary: Neutron port with ID {{$labels.port_id}} failed the {{$labels.check}} check. The port type is {{$labels.device_owner}}, with owner ID {{$labels.device_id}}.
| ok | | 18.008s ago | 2.51ms |
| alert: Virtual network port check failed
expr: neutron_port_status_failed{check="dhcp"} == 1 unless on(device_id) label_join(openstack_nova_server_status{status="SHELVED_OFFLOADED"}, "device_id", "", "uuid")
for: 10m
labels:
component: compute
object_id: '{{$labels.port_id}}'
severity: info
annotations:
summary: Neutron port with ID {{$labels.port_id}} failed the {{$labels.check}} check. The port type is {{$labels.device_owner}}, with owner ID {{$labels.device_id}}.
| ok | | 18.005s ago | 169us |
| alert: Backup plan failed
expr: openstack_freezer_backup_plan_status == 1
for: 20m
labels:
component: compute
object_id: '{{$labels.id}}'
severity: warning
annotations:
summary: Backup plan {{$labels.name}} for compute volumes has three consecutive failures.
| ok | | 18.005s ago | 349.7us |
| alert: Virtual router HA has more than one active L3 agent
expr: count by(ha_state, router_id) (openstack_neutron_l3_agent_of_router{ha_state="active"}) > 1
for: 10m
labels:
component: compute
object_id: '{{$labels.router_id}}'
severity: critical
annotations:
summary: |
Virtual router HA with ID {{$labels.router_id}} has more than one active L3 agent. Please contact technical support.
| ok | | 18.005s ago | 449.8us |
| alert: Virtual router HA has no active L3 agent
expr: count by(router_id) (openstack_neutron_l3_agent_of_router) - on(router_id) count by(router_id) (openstack_neutron_l3_agent_of_router{ha_state!~"active"}) == 0
for: 10m
labels:
component: compute
object_id: '{{$labels.router_id}}'
severity: critical
annotations:
summary: |
Virtual router HA with ID {{$labels.router_id}} has no active L3 agent. Please contact technical support.
| ok | | 18.005s ago | 2.538ms |
| alert: Virtual router SNAT-related port has invalid host binding
expr: openstack_neutron_port{device_owner="network:router_centralized_snat"} and on(device_id, binding_host_id) (label_replace(label_replace(openstack_neutron_l3_agent_of_router{ha_state="standby"}, "device_id", "$1", "router_id", "(.+)"), "binding_host_id", "$1", "agent_host", "(.+)"))
for: 10m
labels:
component: compute
object_id: '{{$labels.uuid}}'
severity: critical
annotations:
summary: |
Virtual router SNAT-related port with ID {{$labels.uuid}} is bound to the standby HA router node. Please contact technical support.
| ok | | 18.002s ago | 1.893ms |
| alert: Virtual router gateway port has invalid host binding
expr: openstack_neutron_port{device_owner="network:router_gateway"} and on(device_id, binding_host_id) (label_replace(label_replace(openstack_neutron_l3_agent_of_router{ha_state="standby"}, "device_id", "$1", "router_id", "(.+)"), "binding_host_id", "$1", "agent_host", "(.+)"))
for: 10m
labels:
component: compute
object_id: '{{$labels.uuid}}'
severity: critical
annotations:
summary: |
Virtual router gateway port with ID {{$labels.uuid}} is bound to the standby HA router node. Please contact technical support.
| ok | | 18s ago | 1.248ms |
| alert: Neutron bridge mapping not found
expr: label_replace(openstack_neutron_network_bridge_mapping * on(hostname) group_left(node) (backend_node_compute), "object_id", "$1", "provider_physical_network", "(.*)") == 0
for: 20m
labels:
component: compute
severity: critical
annotations:
summary: |
Physical network "{{$labels.provider_physical_network}}" is not found in the bridge mapping on node "{{$labels.hostname}}". Virtual network "{{$labels.network_name}}" on this node is most likely not functioning. Please contact technical support.
| ok | | 17.999s ago | 426.4us |
| alert: Virtual DHCP server is unavailable from node
expr: neutron_network_dhcp_reply_count == 0 and on(network_id) (count by(network_id) (neutron_network_dhcp_reply_count == 0) < 2) and on() (backend_ha_up == 1)
for: 10m
labels:
component: compute
object_id: '{{$labels.network_id}}'
severity: warning
annotations:
summary: |
Built-in DHCP server for virtual network "{{$labels.network_id}}" is not available from node "{{$labels.host}}". Please check the neutron-dhcp-agent service or contact technical support.
| ok | | 17.999s ago | 1.759ms |
| alert: Virtual DHCP server is unavailable
expr: group by(network_id) (neutron_network_dhcp_reply_count == 0) and on(network_id) (count by(network_id) (neutron_network_dhcp_reply_count == 0) >= 2) and on() (backend_ha_up == 1)
for: 10m
labels:
component: compute
object_id: '{{$labels.network_id}}'
severity: warning
annotations:
summary: |
Built-in DHCP server for virtual network "{{$labels.network_id}}" is not available from cluster nodes. Please check the neutron-dhcp-agent service or contact the technical support.
|
ok
|
|
17.997s ago
|
1.67ms |
| alert: Virtual DHCP server HA degraded on node
expr: neutron_network_dhcp_reply_count == 1 and on(network_id) (count by(network_id) (neutron_network_dhcp_reply_count == 1) < 2) and on() (backend_ha_up == 1)
for: 10m
labels:
component: compute
object_id: '{{$labels.network_id}}'
severity: warning
annotations:
summary: |
Only one built-in DHCP server for virtual network "{{$labels.network_id}}" is reachable from node "{{$labels.host}}". DHCP high availability entered the degraded state. Please check the neutron-dhcp-agent service or contact the technical support.
|
ok
|
|
17.996s ago
|
1.267ms |
| alert: Virtual DHCP server HA degraded
expr: group by(network_id) (neutron_network_dhcp_reply_count == 1) and on(network_id) (count by(network_id) (neutron_network_dhcp_reply_count == 1) >= 2) and on() (backend_ha_up == 1)
for: 10m
labels:
component: compute
object_id: '{{$labels.network_id}}'
severity: warning
annotations:
summary: |
Only one built-in DHCP server for virtual network "{{$labels.network_id}}" is reachable from cluster nodes. DHCP high availability entered the degraded state. Please check the neutron-dhcp-agent service or contact the technical support.
|
ok
|
|
17.994s ago
|
1.181ms |
| alert: Unrecognized DHCP servers detected from node
expr: neutron_network_dhcp_reply_count >= 3 and on(network_id) (count by(network_id) (neutron_network_dhcp_reply_count >= 3) < 2) and on() (backend_ha_up == 1)
for: 10m
labels:
component: compute
object_id: '{{$labels.network_id}}'
severity: warning
annotations:
summary: |
Built-in DHCP service for virtual network "{{$labels.network_id}}" may be malfunctioning on node "{{$labels.host}}". Please ensure that virtual machines are receiving correct DHCP addresses or contact the technical support.
|
ok
|
|
17.993s ago
|
1.509ms |
| alert: Unrecognized DHCP servers detected
expr: group by(network_id) (neutron_network_dhcp_reply_count >= 3) and on(network_id) (count by(network_id) (neutron_network_dhcp_reply_count >= 3) >= 2) and on() (backend_ha_up == 1)
for: 10m
labels:
component: compute
object_id: '{{$labels.network_id}}'
severity: warning
annotations:
summary: |
Built-in DHCP service for virtual network "{{$labels.network_id}}" may be malfunctioning. Please ensure that virtual machines are receiving correct DHCP addresses or contact the technical support.
|
ok
|
|
17.992s ago
|
1.383ms |
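The four DHCP alerts above all key off neutron_network_dhcp_reply_count: 0 replies means the built-in DHCP server is unreachable, exactly 1 reply means HA is degraded, and 3 or more replies indicate unrecognized DHCP servers; the "< 2" / ">= 2" node counts only select between the per-node and cluster-wide variant of each alert. A hedged helper query for seeing how many nodes currently get no DHCP reply per network (not one of the shipped rules):
# nodes from which the built-in DHCP server of each virtual network is unreachable
count by (network_id) (neutron_network_dhcp_reply_count == 0)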
| alert: Licensed core limit exceeded
expr: (sum(openstack_nova_phys_cores_available) - on() licensed_core_number) > 0 and on() licensed_core_number > 0
for: 10m
labels:
component: compute
severity: critical
annotations:
core_number: '{{ printf `licensed_core_number`|query|first|value }}'
core_number_used: '{{ printf `sum(openstack_nova_phys_cores_available)`|query|first|value }}'
summary: |
The number of physical cores used in the cluster is "{{ $labels.core_number_used }}", which exceeds the licensed core limit of "{{ $labels.core_number }}".
|
ok
|
|
17.991s ago
|
191.5us |
| alert: Load balancer is stuck in pending state
expr: openstack_loadbalancer_loadbalancer_status{is_stale="true"}
labels:
component: compute
object_id: '{{$labels.id}}'
severity: error
annotations:
summary: |
Load balancer with ID "{{$labels.id}}" is stuck with the "{{$labels.provisioning_status}}" status. Ensure that the load balancer configuration is consistent and perform a failover.
|
ok
|
|
17.991s ago
|
51.81us |
| alert: Load balancer error
expr: openstack_loadbalancer_loadbalancer_status{provisioning_status="ERROR"}
labels:
component: compute
object_id: '{{$labels.id}}'
severity: warning
annotations:
summary: |
Load balancer with ID "{{$labels.id}}" has the 'ERROR' provisioning status. Please check the Octavia service logs or contact the technical support.
|
ok
|
|
17.991s ago
|
55.48us |
| alert: Kubernetes cluster update failed
expr: openstack_container_infra_cluster_status == 4
for: 5m
labels:
component: compute
object_id: '{{ $labels.uuid }}'
severity: warning
annotations:
summary: Kubernetes cluster with ID "{{ $labels.uuid }}" has the "{{ $labels.status }}" status.
|
ok
|
|
17.991s ago
|
101.5us |
|
4.404s ago |
2.554ms |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| alert: Keystone API service is down
expr: (openstack_service_up{service=~"keystone.*"} == 0) and on(cluster_id) (backend_ha_reconfigure == 0) and on(cluster_id) (backend_compute_reconfigure == 0) and on(cluster_id) (backend_compute_deploy == 0)
for: 10m
labels:
component: compute
object_id: '{{$labels.service}}'
severity: critical
annotations:
summary: '{{$labels.service}} API service is down.'
|
ok
|
|
10.229s ago
|
433.1us |
| alert: OpenStack service API upstream is down
expr: (sum by(cluster_id, service) (openstack_service_up{service!~"keystone.*|gnocchi*"}) > 0 < scalar(sum by(cluster_id) (backend_node_compute_controller == 1))) and on(cluster_id) (backend_ha_reconfigure == 0) and on(cluster_id) (backend_compute_reconfigure == 0) and on(cluster_id) (backend_compute_deploy == 0)
for: 10m
labels:
component: compute
object_id: '{{$labels.service}}'
severity: warning
annotations:
summary: One or more OpenStack {{$labels.service}} API upstreams are down.
|
ok
|
|
10.228s ago
|
967.9us |
| alert: All OpenStack service API upstreams are down
expr: (sum by(cluster_id, service) (openstack_service_up{service!~"keystone.*"}) <= 0) and on(cluster_id) (backend_ha_reconfigure == 0) and on(cluster_id) (backend_compute_reconfigure == 0) and on(cluster_id) (backend_compute_deploy == 0) and on() (compute_cluster_state{state="reconfiguring"} == 0)
for: 10m
labels:
component: compute
object_id: '{{$labels.service}}'
severity: critical
annotations:
summary: All OpenStack {{$labels.service}} API upstreams are down.
|
ok
|
|
10.228s ago
|
678.4us |
| alert: OpenStack Cinder Scheduler is down
expr: sum without(uuid) (label_replace(openstack_cinder_agent_state{adminState="enabled",service="cinder-scheduler"}, "nodename", "$1", "hostname", "(.*vstoragedomain).*") * on(nodename) group_left(node) node_uname_info{job="node"} * on(node) group_left() (backend_node_management == 1) * on(node) group_left(instance) up{job="node"} + on(node) group_left() max by(node) (softwareupdates_node_state{state=~"updating|rebooting"}) + scalar(backend_ha_reconfigure == bool 1)) == 0
for: 10m
labels:
component: compute
object_id: '{{$labels.instance}}'
severity: critical
annotations:
summary: OpenStack Block Storage (Cinder) Scheduler agent is down on host {{$labels.instance}}.
|
ok
|
|
10.227s ago
|
925.6us |
| alert: OpenStack Cinder Volume agent is down
expr: sum without(uuid) (label_replace(label_replace(openstack_cinder_agent_state{adminState="enabled",service="cinder-volume"}, "nodename", "$1", "hostname", "(.*vstoragedomain).*"), "storage_name", "$1", "hostname", ".*@(.*)") * on(nodename) group_left(node) node_uname_info{job="node"} * on(node) group_left() (backend_node_management == 1) * on(node) group_left(instance) up{job="node"} + on(node) group_left() max by(node) (softwareupdates_node_state{state=~"updating|rebooting"}) + scalar(backend_ha_reconfigure == bool 1)) == 0
for: 10m
labels:
component: compute
object_id: '{{$labels.instance}}-{{$labels.storage_name}}'
severity: critical
annotations:
summary: OpenStack Block Storage (Cinder) Volume agent is down on host {{$labels.instance}} for storage {{$labels.storage_name}}.
|
ok
|
|
10.226s ago
|
1.018ms |
| alert: OpenStack Neutron L3 agent is down
expr: (label_replace(openstack_neutron_agent_state{service="neutron-l3-agent"}, "nodename", "$1", "hostname", "(.*vstoragedomain).*") * on(nodename) group_left(node) node_uname_info{job="node"} * on(node) group_left() (backend_node_compute == 1) * on(node) group_left(instance) up{job="node"} + on(node) group_left() max by(node) (softwareupdates_node_state{state=~"updating|rebooting"}) + scalar(backend_ha_reconfigure == bool 1)) == 0
for: 10m
labels:
component: compute
object_id: '{{$labels.instance}}'
severity: critical
annotations:
summary: OpenStack Networking (Neutron) L3 agent is down on host {{$labels.instance}}.
|
ok
|
|
10.225s ago
|
906.4us |
| alert: OpenStack Neutron OpenvSwitch agent is down
expr: (label_replace(openstack_neutron_agent_state{service="neutron-openvswitch-agent"}, "nodename", "$1", "hostname", "(.*vstoragedomain).*") * on(nodename) group_left(node) node_uname_info{job="node"} * on(node) group_left() (backend_node_compute == 1) * on(node) group_left(instance) up{job="node"} + on(node) group_left() max by(node) (softwareupdates_node_state{state=~"updating|rebooting"}) + scalar(backend_ha_reconfigure == bool 1)) == 0
for: 10m
labels:
component: compute
object_id: '{{$labels.instance}}'
severity: critical
annotations:
summary: OpenStack Networking (Neutron) OpenvSwitch agent is down on host {{$labels.instance}}.
|
ok
|
|
10.224s ago
|
758.5us |
| alert: OpenStack Neutron Metadata agent is down
expr: (label_replace(openstack_neutron_agent_state{service="neutron-metadata-agent"}, "nodename", "$1", "hostname", "(.*vstoragedomain).*") * on(nodename) group_left(node) node_uname_info{job="node"} * on(node) group_left() (backend_node_compute == 1) * on(node) group_left(instance) up{job="node"} + on(node) group_left() max by(node) (softwareupdates_node_state{state=~"updating|rebooting"}) + scalar(backend_ha_reconfigure == bool 1)) == 0
for: 10m
labels:
component: compute
object_id: '{{$labels.instance}}'
severity: critical
annotations:
summary: OpenStack Networking (Neutron) Metadata agent is down on host {{$labels.instance}}.
|
ok
|
|
10.224s ago
|
718.8us |
| alert: OpenStack Neutron DHCP agent is down
expr: (label_replace(openstack_neutron_agent_state{service="neutron-dhcp-agent"}, "nodename", "$1", "hostname", "(.*vstoragedomain).*") * on(nodename) group_left(node) node_uname_info{job="node"} * on(node) group_left() (backend_node_management == 1) * on(node) group_left(instance) up{job="node"} + on(node) group_left() max by(node) (softwareupdates_node_state{state=~"updating|rebooting"}) + scalar(backend_ha_reconfigure == bool 1)) == 0
for: 10m
labels:
component: compute
object_id: '{{$labels.instance}}'
severity: critical
annotations:
summary: OpenStack Networking (Neutron) DHCP agent is down on host {{$labels.instance}}.
|
ok
|
|
10.223s ago
|
557us |
| alert: OpenStack Nova Compute is down
expr: (label_replace(openstack_nova_agent_state{adminState="enabled",service="nova-compute"}, "nodename", "$1", "hostname", "(.*vstoragedomain).*") * on(nodename) group_left(node) node_uname_info{job="node"} * on(node) group_left() (backend_node_management == 1) * on(node) group_left(instance) up{job="node"} + on(node) group_left() max by(node) (softwareupdates_node_state{state=~"updating|rebooting"}) + scalar(backend_ha_reconfigure == bool 1)) == 0
for: 10m
labels:
component: compute
object_id: '{{$labels.instance}}'
severity: critical
annotations:
summary: OpenStack Compute (Nova) agent is down on host {{$labels.instance}}.
|
ok
|
|
10.223s ago
|
893.3us |
| alert: OpenStack Nova Conductor is down
expr: (label_replace(openstack_nova_agent_state{adminState="enabled",service="nova-conductor"}, "nodename", "$1", "hostname", "(.*vstoragedomain).*") * on(nodename) group_left(node) node_uname_info{job="node"} * on(node) group_left() (backend_node_management == 1) * on(node) group_left(instance) up{job="node"} + on(node) group_left() max by(node) (softwareupdates_node_state{state=~"updating|rebooting"}) + scalar(backend_ha_reconfigure == bool 1)) == 0
for: 10m
labels:
component: compute
object_id: '{{$labels.instance}}'
severity: critical
annotations:
summary: OpenStack Compute (Nova) Conductor agent is down on host {{$labels.instance}}.
|
ok
|
|
10.222s ago
|
576.5us |
| alert: OpenStack Nova Scheduler is down
expr: (label_replace(openstack_nova_agent_state{adminState="enabled",service="nova-scheduler"}, "nodename", "$1", "hostname", "(.*vstoragedomain).*") * on(nodename) group_left(node) node_uname_info{job="node"} * on(node) group_left() (backend_node_management == 1) * on(node) group_left(instance) up{job="node"} + on(node) group_left() max by(node) (softwareupdates_node_state{state=~"updating|rebooting"}) + scalar(backend_ha_reconfigure == bool 1)) == 0
for: 10m
labels:
component: compute
object_id: '{{$labels.instance}}'
severity: critical
annotations:
summary: OpenStack Compute (Nova) Scheduler agent is down on host {{$labels.instance}}.
|
ok
|
|
10.222s ago
|
674.7us |
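The Cinder, Neutron, and Nova agent alerts above share one expression skeleton: the agent state metric is joined to the node exporter through the nodename/node labels, restricted to management or compute nodes, multiplied by up{job="node"}, and then offset by the software-update state and the HA-reconfigure flag so the alert stays silent while a node is updating or rebooting, or while management HA is being reconfigured. A reduced sketch of that pattern (editor's illustration; <agent_state_metric> is a placeholder for metrics such as openstack_nova_agent_state):
(
  label_replace(<agent_state_metric>, "nodename", "$1", "hostname", "(.*vstoragedomain).*")
  * on (nodename) group_left(node) node_uname_info{job="node"}
  * on (node) group_left() (backend_node_management == 1)
  * on (node) group_left(instance) up{job="node"}
  + on (node) group_left() max by (node) (softwareupdates_node_state{state=~"updating|rebooting"})
  + scalar(backend_ha_reconfigure == bool 1)
) == 0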
| alert: High request error rate for OpenStack API requests detected
expr: label_replace(sum by(instance, log_file) (rate(openstack_request_count{status=~"5.."}[1h])) / sum by(instance, log_file) (rate(openstack_request_count[1h])), "object_id", "$1", "log_file", "(.*).log") * 100 > 5
for: 10m
labels:
component: compute
severity: warning
annotations:
summary: A request error rate of more than 5% was detected for {{$labels.object_id}} over the last hour. Check the resource usage of {{$labels.object_id}}.
|
ok
|
|
10.221s ago
|
3.873ms |
| alert: OpenStack Octavia Provisioning Worker is down
expr: (label_replace(openstack_loadbalancer_service_state{service="octavia_worker"}, "nodename", "$1", "hostname", "(.*vstoragedomain).*") * on(nodename) group_left(node) node_uname_info{job="node"} * on(node) group_left() (backend_node_compute_controller == 1) * on(node) group_left(instance) up{job="node"} + on(node) group_left() max by(node) (softwareupdates_node_state{state=~"updating|rebooting"}) + scalar(backend_ha_reconfigure == bool 1)) == 0
for: 10m
labels:
component: compute
object_id: '{{$labels.instance}}'
severity: critical
annotations:
summary: OpenStack Loadbalancing (Octavia) provisioning worker is down on host {{$labels.instance}}.
|
ok
|
|
10.217s ago
|
666us |
| alert: OpenStack Octavia Housekeeping service is down
expr: (label_replace(openstack_loadbalancer_service_state{service="octavia_housekeeping"}, "nodename", "$1", "hostname", "(.*vstoragedomain).*") * on(nodename) group_left(node) node_uname_info{job="node"} * on(node) group_left() (backend_node_compute_controller == 1) * on(node) group_left(instance) up{job="node"} + on(node) group_left() max by(node) (softwareupdates_node_state{state=~"updating|rebooting"}) + scalar(backend_ha_reconfigure == bool 1)) == 0
for: 10m
labels:
component: compute
object_id: '{{$labels.instance}}'
severity: critical
annotations:
summary: OpenStack Loadbalancing (Octavia) housekeeping service is down on host {{$labels.instance}}.
|
ok
|
|
10.217s ago
|
663.6us |
| alert: OpenStack Octavia HealthManager service is down
expr: (label_replace(openstack_loadbalancer_service_state{service="octavia_health_manager"}, "nodename", "$1", "hostname", "(.*vstoragedomain).*") * on(nodename) group_left(node) node_uname_info{job="node"} * on(node) group_left() (backend_node_compute_controller == 1) * on(node) group_left(instance) up{job="node"} + on(node) group_left() max by(node) (softwareupdates_node_state{state=~"updating|rebooting"}) + scalar(backend_ha_reconfigure == bool 1)) == 0
for: 10m
labels:
component: compute
object_id: '{{$labels.instance}}'
severity: critical
annotations:
summary: OpenStack Loadbalancing (Octavia) health manager service is down on host {{$labels.instance}}.
|
ok
|
|
10.216s ago
|
551.2us |
|
33.285s ago |
15.04ms |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| alert: Node has stuck I/O requests
expr: fused_stuck_reqs_30s > 0 or fused_stuck_reqs_10s > 0
for: 1m
labels:
component: core storage
severity: critical
annotations:
summary: Some I/O requests are stuck on {{$labels.instance}}.
|
ok
|
|
33.285s ago
|
496.2us |
| alert: Cluster has blocked or slow replication
expr: increase(mdsd_cluster_replication_stuck_chunks[5m]) > 0 or increase(mdsd_cluster_replication_touts_total[5m]) > 0
for: 1m
labels:
component: core storage
severity: critical
annotations:
summary: Chunk replication is blocked or too slow.
|
ok
|
|
33.284s ago
|
241.5us |
| alert: Node has failed map requests
expr: fused_maps_failed > 0 or rate(fused_map_failures_total[5m]) > 0
for: 1m
labels:
component: core storage
severity: critical
annotations:
summary: Some map requests on {{$labels.instance}} have failed.
|
ok
|
|
33.284s ago
|
197us |
| alert: Cluster has too many chunks
expr: (job:mdsd_fs_chunk_maps:sum > 1e+07) < 1.5e+07
for: 1m
labels:
component: core storage
severity: warning
annotations:
summary: There are too many chunks in the cluster, which slows down the metadata service.
|
ok
|
|
33.284s ago
|
73.77us |
| alert: Cluster has critically high number of chunks
expr: job:mdsd_fs_chunk_maps:sum >= 1.5e+07
for: 1m
labels:
component: core storage
severity: critical
annotations:
summary: There are too many chunks in the cluster, which slows down the metadata service.
|
ok
|
|
33.284s ago
|
55.11us |
| alert: Cluster has too many files
expr: (job:mdsd_fs_files:sum > 4e+06) < 1e+07
for: 1m
labels:
component: core storage
severity: warning
annotations:
summary: There are too many files in the cluster, which slows down the metadata service.
|
ok
|
|
33.284s ago
|
215.4us |
| alert: Cluster has critically high number of files
expr: job:mdsd_fs_files:sum >= 1e+07
for: 1m
labels:
component: core storage
severity: critical
annotations:
summary: There are too many files in the cluster, which slows down the metadata service.
|
ok
|
|
33.284s ago
|
109.1us |
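The chunk and file count alerts above pair a warning band with a critical threshold: the warning expression (metric > warn) < crit only fires while the value sits between the two limits, and the critical rule takes over at >= crit, so the two never fire at the same time. Restated with the file thresholds used here (copied from the rules above, comments added for illustration):
# warning: between 4 million and 10 million files
(job:mdsd_fs_files:sum > 4e+06) < 1e+07
# critical: 10 million files or more
job:mdsd_fs_files:sum >= 1e+07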
| alert: Metadata service has high CPU usage
expr: (sum by(instance) (rate(process_cpu_seconds_total{job="mds"}[5m])) * 100) > 80
for: 1m
labels:
component: core storage
severity: warning
annotations:
summary: Metadata service on {{$labels.instance}} has CPU usage higher than 80%. The service may be overloaded.
|
ok
|
|
33.284s ago
|
266.4us |
| alert: Metadata service has high commit latency
expr: 5 > histogram_quantile(0.95, instance_le:rjournal_commit_duration_seconds_bucket:rate5m{job="mds"}) > 1
for: 1m
labels:
component: core storage
severity: warning
annotations:
summary: Metadata service on {{$labels.instance}} has the 95th percentile latency higher than 1 second.
|
ok
|
|
33.284s ago
|
874.4us |
| alert: Metadata service has critically high commit latency
expr: histogram_quantile(0.95, instance_le:rjournal_commit_duration_seconds_bucket:rate5m{job="mds"}) >= 5
for: 1m
labels:
component: core storage
severity: critical
annotations:
summary: Metadata service on {{$labels.instance}} has the 95th percentile latency higher than 5 seconds.
|
ok
|
|
33.283s ago
|
744.9us |
| alert: Cluster has failed mount points
expr: job:up_not_being_updated:count{job="fused"} - job:up_not_being_updated_with_restart:count{job="fused"} > 0
for: 1m
labels:
component: core storage
severity: critical
annotations:
summary: Some mount points stopped working and need to be recovered.
|
ok
|
|
33.282s ago
|
120.6us |
| alert: Storage disk is unresponsive
expr: sum by(csid) (mdsd_cs_status{status="ill"}) * on(csid) group_right() label_replace(cluster_csd_disk_info, "object_id", "$1", "csid", "(.*)") > 0
for: 1m
labels:
component: core storage
object_id: '{{ $labels.csid }}'
severity: warning
annotations:
summary: Disk '{{$labels.device}}' (CS#{{$labels.csid}}) on node {{$labels.instance}} is unresponsive. Check or replace this disk.
|
ok
|
|
33.282s ago
|
538.1us |
| alert: Cluster has offline chunk services
expr: sum by(csid) (mdsd_cs_status{status="offline"}) * on(csid) group_right() label_replace(cluster_csd_disk_info, "object_id", "$1", "csid", "(.*)") > 0
for: 5m
labels:
component: core storage
object_id: '{{ $labels.csid }}'
severity: warning
annotations:
summary: 'Chunk service #{{ $labels.csid }} is in the 'offline' state on node {{ $labels.instance }}. Check and restart it.'
|
ok
|
|
33.282s ago
|
721.8us |
| alert: Cluster has failed chunk services
expr: sum by(csid) (mdsd_cs_status{status=~"failed|failed rel"}) * on(csid) group_right() label_replace(cluster_csd_disk_info, "object_id", "$1", "csid", "(.*)") > 0
for: 5m
labels:
component: core storage
object_id: '{{ $labels.csid }}'
severity: warning
annotations:
summary: 'Chunk service #{{ $labels.csid }} is in the 'failed' state on node {{ $labels.instance }}. Replace the disk or contact the technical support.'
|
ok
|
|
33.281s ago
|
570us |
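The three disk and chunk service alerts above use the same join: the per-CS status reported by the metadata service is multiplied onto the disk metadata via the csid label, so the resulting alert carries the device, node, and csid of the affected disk. A generalized sketch (editor's illustration; <status> is a placeholder for ill, offline, or failed|failed rel):
sum by (csid) (mdsd_cs_status{status="<status>"})
  * on (csid) group_right() label_replace(cluster_csd_disk_info, "object_id", "$1", "csid", "(.*)") > 0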
| alert: Cluster has unavailable metadata services
expr: up{job="mds"} unless on(mdsid) (job:up_with_restart{job="mds"} == 1 or job:up_with_restart{job="mds"} == bool 0 and on(node) (instance:being_updated))
for: 5m
labels:
component: core storage
object_id: '{{ $labels.mdsid }}'
severity: warning
annotations:
summary: 'Metadata service #{{ $labels.mdsid }} is offline or has failed on node {{ $labels.instance }}. Check and restart it.'
|
ok
|
|
33.281s ago
|
397.1us |
| alert: Cluster is running out of physical space on tier
expr: label_replace(sum by(tier) (mdsd_cluster_free_space_bytes) / sum by(tier) (mdsd_cluster_space_bytes), "object_id", "tier-$1", "tier", "(.*)") < 0.2
for: 5m
labels:
component: core storage
severity: warning
annotations:
summary: There is little free physical space left on storage tier {{ $labels.tier }}.
|
ok
|
|
33.28s ago
|
230.7us |
| alert: Cluster is out of physical space on tier
expr: label_replace(sum by(tier) (mdsd_cluster_free_space_bytes) / sum by(tier) (mdsd_cluster_space_bytes), "object_id", "tier-$1", "tier", "(.*)") < 0.1
for: 5m
labels:
component: core storage
severity: critical
annotations:
summary: There is not enough free physical space on storage tier {{ $labels.tier }}.
|
ok
|
|
33.28s ago
|
247.8us |
| alert: Master metadata service changes too often
expr: topk(1, mdsd_is_master) and (delta(mdsd_master_uptime[1h]) < 300000) and on(node) softwareupdates_node_state{state!~"updat.*"} == 1
for: 10m
labels:
component: core storage
severity: warning
annotations:
summary: Master metadata service has changed more than once in 5 minutes.
|
ok
|
|
33.28s ago
|
604.1us |
| alert: Reached "node crash per hour" threshold
expr: shaman_node_crash_threshold == 1
for: 5m
labels:
component: node
severity: critical
annotations:
summary: '{{- if query "backend_vendor_info{vendor='acronis'}" -}} Node {{$labels.hostname}} with shaman node id {{$labels.client_node}} has reached the "node crash per hour" threshold. Visit https://kb.acronis.com/content/68797 to learn how to troubleshoot this issue. {{- else if query "backend_vendor_info{vendor='virtuozzo'}" -}} Node {{$labels.hostname}} with shaman node id {{$labels.client_node}} has reached the "node crash per hour" threshold. {{- end -}}'
|
ok
|
|
33.279s ago
|
42.89us |
| alert: Number of CSes per device does not match configuration
expr: label_replace(backend_node_online == 1, "host", "$1", "hostname", "([^.]*).*") and on(node) (count by(device, instance, node, tier) (cluster_csd_info) - on(tier) group_left() (cluster_cs_per_tier_info)) != 0
for: 10m
labels:
component: storage
severity: warning
annotations:
summary: The number of CSes per device on node {{$labels.host}} (ID {{$labels.node}}) does not match the configuration. Check your disk configuration.
|
ok
|
|
33.279s ago
|
872.9us |
| alert: CS has excessive journal size
expr: cluster_csd_journal_size{journal_type="inner_cache"} * on(csid) group_left(instance, device) cluster_csd_disk_info > 512
for: 10m
labels:
component: storage
object_id: '{{ $labels.csid }}'
severity: warning
value: '{{ $value }}'
annotations:
summary: The journal on CS#{{ $labels.csid }} on host {{ $labels.instance }}, disk {{ $labels.device }}, is {{ $value }} MiB. The recommended size is 256 MiB.
|
ok
|
|
33.279s ago
|
761.6us |
| alert: CS has inconsistent encryption settings
expr: count by(tier) (count by(tier, encryption) (cluster_csd_journal_size * on(csid) group_left(instance) cluster_csd_disk_info * on(csid) group_left(tier) cluster_csd_info)) > 1
for: 10m
labels:
component: storage
severity: warning
annotations:
summary: Encryption is disabled for some CSes in tier {{ $labels.tier }} but enabled for others in the same tier.
|
ok
|
|
33.278s ago
|
1.402ms |
| alert: CS missing journal configuration
expr: cluster_csd_disk_info unless on(csid) cluster_csd_journal_size and on() count(up{job="mds"}) > 0
for: 10m
labels:
component: storage
object_id: '{{ $labels.csid }}'
severity: warning
annotations:
summary: The journal is not configured for CS#{{ $labels.csid }} on node {{ $labels.instance }}.
|
ok
|
|
33.277s ago
|
736.2us |
| alert: Possible lack of allocatable space
expr: (cluster_space_ok_without_node == 0) * on(node) group_right() backend_node_online
labels:
component: storage
object_id: '{{ $labels.node }}'
severity: warning
annotations:
summary: Losing node {{ $labels.hostname }} will lead to a lack of allocatable space or failure domains in the storage cluster. Add more storage disks or nodes to the cluster, depending on your failure domain configuration.
|
ok
|
|
33.276s ago
|
1.447ms |
| alert: CS journal device shared across multiple tiers
expr: count by(instance, device) (count by(instance, device, tier) (cluster_csd_journal_size{journal_type="external_cache"} * on(csid) group_left(tier) (cluster_csd_info) * on(node) group_left(instance) (up{job="node"}))) >= 2
for: 10m
labels:
component: storage
object_id: '{{ $labels.instance }}-{{ $labels.device }}'
severity: warning
annotations:
summary: CSes from multiple tiers are using the same journal '{{ $labels.device }}' on node {{ $labels.instance }}.
|
ok
|
|
33.275s ago
|
924.7us |
| alert: Disk cache settings are not optimal
expr: count by(node, instance, tier) (cluster_csd_journal_size{journal_type="inner_cache"} * on(csid) group_left(tier) cluster_csd_info * on(node) group_left(instance) up{job="node"}) + count by(node, instance, tier) (cluster_csd_journal_size{journal_type="external_cache"} * on(csid) group_left(tier) cluster_csd_info * on(node) group_left(instance) up{job="node"}) >= 2
for: 10m
labels:
component: storage
object_id: '{{ $labels.node }}-{{ $labels.tier }}'
severity: warning
annotations:
summary: CSes in tier '{{ $labels.tier }}' on node {{ $labels.instance }} are set up with different cache settings.
|
ok
|
|
33.274s ago
|
1.564ms |
| alert: Core storage service is down
expr: label_replace(node_systemd_unit_state{name="vstorage-disks-monitor.service",state="active"}, "name", "$1", "name", "(.*)\\.service") != 1 and on(node) backend_node_master == 1 and on() backend_virtual_cluster == 0
for: 5m
labels:
component: storage
object_id: '{{ $labels.name }} - {{ $labels.instance }}'
severity: warning
annotations:
summary: Service {{ $labels.name }} is down on host {{ $labels.instance }}.
|
ok
|
|
33.272s ago
|
539us |
|
58.703s ago |
957.7us |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| alert: S3 Gateway service has high GET request latency
expr: (histogram_quantile(0.5, sum by(instance, svc_id, le) (instance_vol_svc:ostor_s3gw_get_req_latency_ms_bucket:rate5m)) > 1000) < 5000
for: 1m
labels:
component: S3
severity: warning
annotations:
summary: S3 Gateway service ({{$labels.svc_id}}) on {{$labels.instance}} has the median GET request latency higher than 1 second.
|
ok
|
|
22.109s ago
|
1.918ms |
| alert: S3 Gateway service has critically high GET request latency
expr: histogram_quantile(0.5, sum by(instance, svc_id, le) (instance_vol_svc:ostor_s3gw_get_req_latency_ms_bucket:rate5m)) >= 5000
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: S3 Gateway service ({{$labels.svc_id}}) on {{$labels.instance}} has the median GET request latency higher than 5 seconds.
|
ok
|
|
22.107s ago
|
1.558ms |
| alert: Object service has critically high request latency
expr: histogram_quantile(0.5, sum by(instance, svc_id, le) (instance_vol_svc_req:ostor_os_req_latency_ms_bucket:rate5m)) >= 5000
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: Object service ({{$labels.svc_id}}) on {{$labels.instance}} has the median request latency higher than 5 seconds.
|
ok
|
|
22.106s ago
|
7.114ms |
| alert: Object service has high request latency
expr: (histogram_quantile(0.5, sum by(instance, svc_id, le) (instance_vol_svc_req:ostor_os_req_latency_ms_bucket:rate5m)) > 1000) < 5000
for: 1m
labels:
component: S3
severity: warning
annotations:
summary: Object service ({{$labels.svc_id}}) on {{$labels.instance}} has the median request latency higher than 1 second.
|
ok
|
|
22.099s ago
|
6.618ms |
| alert: Name service has critically high request latency
expr: histogram_quantile(0.5, sum by(instance, svc_id, le) (instance_vol_svc_req:ostor_ns_req_latency_ms_bucket:rate5m)) >= 5000
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: Name service ({{$labels.svc_id}}) on {{$labels.instance}} has the median request latency higher than 5 seconds.
|
ok
|
|
22.092s ago
|
5.428ms |
| alert: Name service has high request latency
expr: histogram_quantile(0.5, sum by(instance, svc_id, le) (instance_vol_svc_req:ostor_ns_req_latency_ms_bucket:rate5m)) > 1000
for: 1m
labels:
component: S3
severity: warning
annotations:
summary: Name service ({{$labels.svc_id}}) on {{$labels.instance}} has the median request latency higher than 1 second.
|
ok
|
|
22.087s ago
|
4.954ms |
| alert: Name service has high commit latency
expr: (histogram_quantile(0.5, sum by(instance, svc_id, le) (instance_vol_svc:ostor_commit_latency_us_bucket:rate5m{job="ns"})) > 1e+06) < 1e+07
for: 1m
labels:
component: S3
severity: warning
annotations:
summary: Name service ({{$labels.svc_id}}) on {{$labels.instance}} has the median commit latency higher than 1 second. Check the storage performance.
|
ok
|
|
22.082s ago
|
1.369ms |
| alert: Name service has critically high commit latency
expr: histogram_quantile(0.5, sum by(instance, svc_id, le) (instance_vol_svc:ostor_commit_latency_us_bucket:rate5m{job="ns"})) >= 1e+07
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: Name service ({{$labels.svc_id}}) on {{$labels.instance}} has the median commit latency higher than 10 seconds. Check the storage performance.
|
ok
|
|
22.08s ago
|
1.113ms |
| alert: Object service has high commit latency
expr: (histogram_quantile(0.5, sum by(instance, svc_id, le) (instance_vol_svc:ostor_commit_latency_us_bucket:rate5m{job="os"})) > 1e+06) < 1e+07
for: 1m
labels:
component: S3
severity: warning
annotations:
summary: Object service ({{$labels.svc_id}}) on {{$labels.instance}} has the median commit latency higher than 1 second. Check the storage performance.
|
ok
|
|
22.079s ago
|
1.996ms |
| alert: Object service has critically high commit latency
expr: histogram_quantile(0.5, sum by(instance, svc_id, le) (instance_vol_svc:ostor_commit_latency_us_bucket:rate5m{job="os"})) >= 1e+07
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: Object service ({{$labels.svc_id}}) on {{$labels.instance}} has the median commit latency higher than 10 seconds. Check the storage performance.
|
ok
|
|
22.077s ago
|
1.725ms |
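The commit-latency histograms for the Name and Object services are recorded in microseconds (…_commit_latency_us_bucket), so the thresholds translate to 1e+06 µs = 1 second for the warning bands and 1e+07 µs = 10 seconds for the critical rules, matching the summaries. The Object service warning band, restated with the unit spelled out (same expression as above, comment added):
# median commit latency between 1 s (1e+06 µs) and 10 s (1e+07 µs)
(histogram_quantile(0.5, sum by (instance, svc_id, le) (instance_vol_svc:ostor_commit_latency_us_bucket:rate5m{job="os"})) > 1e+06) < 1e+07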
| alert: S3 Gateway service has high cancel request rate
expr: 30 > (sum by(svc_id, instance) (instance_vol_svc:ostor_s3gw_req:rate5m)) > 1 and ((sum by(svc_id, instance) (instance_vol_svc:ostor_s3gw_req_cancelled:rate5m)) / (sum by(svc_id, instance) (instance_vol_svc:ostor_s3gw_req:rate5m))) * 100 > 5 and (sum by(svc_id, instance) (instance_vol_svc:ostor_s3gw_req:rate5m)) > (30 / 300)
for: 3m
labels:
component: S3
severity: warning
annotations:
summary: S3 Gateway service ({{$labels.svc_id}}) on {{$labels.instance}} has the cancel request rate higher than 5%. It may be caused by connectivity issues, request timeouts, or a low limit on pending requests.
|
ok
|
|
22.076s ago
|
828.9us |
| alert: S3 Gateway service has critically high cancel request rate
expr: (sum by(svc_id, instance) (instance_vol_svc:ostor_s3gw_req:rate5m)) > 1 and ((sum by(svc_id, instance) (instance_vol_svc:ostor_s3gw_req_cancelled:rate5m)) / (sum by(svc_id, instance) (instance_vol_svc:ostor_s3gw_req:rate5m))) * 100 >= 30 and (sum by(svc_id, instance) (instance_vol_svc:ostor_s3gw_req:rate5m)) > (30 / 300)
for: 3m
labels:
component: S3
severity: critical
annotations:
summary: S3 Gateway service ({{$labels.svc_id}}) on {{$labels.instance}} has the cancel request rate higher than 30%. It may be caused by connectivity issues, request timeouts, or a low limit on pending requests.
|
ok
|
|
22.075s ago
|
721.9us |
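Both cancel-rate alerts gate on a minimum amount of traffic before evaluating the percentage: 30 / 300 equals 0.1 requests per second, i.e., roughly 30 requests over a 5-minute window, which keeps a single cancelled request on an idle gateway from firing the alert. The percentage itself is the cancelled-request rate divided by the total request rate (sub-expression copied from the rules above):
# cancelled requests as a percentage of all S3 Gateway requests
(
  sum by (svc_id, instance) (instance_vol_svc:ostor_s3gw_req_cancelled:rate5m)
  / sum by (svc_id, instance) (instance_vol_svc:ostor_s3gw_req:rate5m)
) * 100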
| alert: Object storage agent is frozen for a long time
expr: increase(pcs_process_inactive_seconds_total{job="ostor"}[5m]) > 0
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: Object storage agent on {{$labels.instance}} has the event loop inactive for more than 1 minute.
|
ok
|
|
22.074s ago
|
137us |
| alert: S3 service is frozen for a long time
expr: increase(pcs_process_inactive_seconds_total{job=~"s3gw|os|ns"}[5m]) > 0
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: S3 service ({{$labels.job}}, {{$labels.svc_id}}) on {{$labels.instance}} has the event loop inactive for more than 1 minute.
|
ok
|
|
22.074s ago
|
632.4us |
| alert: S3 Gateway service has high CPU usage
expr: ((sum by(instance, svc_id) (rate(process_cpu_seconds_total{job="s3gw"}[5m])) * 100) > 75) < 90
for: 1m
labels:
component: S3
severity: warning
annotations:
summary: S3 Gateway service ({{$labels.svc_id}}) on {{$labels.instance}} has CPU usage higher than 75%. The service may be overloaded.
|
ok
|
|
22.074s ago
|
316.5us |
| alert: S3 Gateway service has critically high CPU usage
expr: (sum by(instance, svc_id) (rate(process_cpu_seconds_total{job="s3gw"}[5m])) * 100) >= 90
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: S3 Gateway service ({{$labels.svc_id}}) on {{$labels.instance}} has CPU usage higher than 90%. The service may be overloaded.
|
ok
|
|
22.074s ago
|
224.2us |
| alert: S3 Gateway service has too many failed requests
expr: ((sum by(instance, svc_id) (instance_vol_svc:ostor_req_server_err:rate5m)) / (sum by(instance, svc_id) (instance_vol_svc:ostor_s3gw_req:rate5m))) * 100 > 5
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: S3 Gateway service ({{$labels.svc_id}}) on {{$labels.instance}} has a lot of failed requests with a server error (5XX status code).
|
ok
|
|
22.073s ago
|
299.9us |
| alert: S3 service failed to start
expr: increase(ostor_svc_start_failed_count_total{service=~"os|ns|s3gw",storage_type="S3"}[5m]) > 10
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: Object storage agent failed to start {{$labels.job}} ({{$labels.svc_id}}) on {{$labels.instance}}.
|
ok
|
|
22.073s ago
|
1.011ms |
| alert: NFS service failed to start
expr: increase(ostor_svc_start_failed_count_total{service=~"os|ns|s3gw",storage_type="NFS"}[5m]) > 10
for: 1m
labels:
component: NFS
severity: critical
annotations:
summary: Object storage agent failed to start {{$labels.job}} ({{$labels.svc_id}}) on {{$labels.instance}}.
|
ok
|
|
22.072s ago
|
86.94us |
| alert: FSMDS service failed to start
expr: increase(ostor_svc_start_failed_count_total{service="fs"}[5m]) > 10
for: 1m
labels:
component: NFS
severity: critical
annotations:
summary: Object storage agent failed to start file service on {{$labels.instance}}.
|
ok
|
|
22.072s ago
|
60.05us |
| alert: Object storage agent is offline
expr: up{job="ostor"} == 0
for: 1m
labels:
component: S3
severity: warning
annotations:
summary: Object storage agent is offline on {{$labels.instance}}.
|
ok
|
|
22.072s ago
|
124.9us |
| alert: Object storage agent is not connected to configuration service
expr: increase(ostor_svc_registry_cfg_failed_total[5m]) > 3 and on(node) (instance:not_being_updated)
for: 5m
labels:
component: S3
severity: critical
annotations:
summary: Object storage agent failed to connect to the configuration service on {{$labels.instance}}.
|
ok
|
|
22.072s ago
|
252.2us |
| alert: S3 cluster has unavailable object services
expr: count by(instance) (up{job="os"}) > sum by(instance) (up{job="os"} == 1 or (up{job="os"} == bool 0 and on(instance) (instance:being_updated)))
for: 1m
labels:
component: S3
severity: warning
annotations:
summary: Some Object services are not running on {{$labels.instance}}. Check the service status in the command-line interface.
|
ok
|
|
22.072s ago
|
800.8us |
| alert: S3 cluster has unavailable name services
expr: count by(instance) (up{job="ns"}) > sum by(instance) (up{job="ns"} == 1 or (up{job="ns"} == bool 0 and on(instance) (instance:being_updated)))
for: 1m
labels:
component: S3
severity: warning
annotations:
summary: Some Name services are not running on {{$labels.instance}}. Check the service status in the command-line interface.
|
ok
|
|
22.071s ago
|
620.8us |
| alert: S3 cluster has unavailable S3 Gateway services
expr: count by(instance) (up{job="s3gw"}) > sum by(instance) (up{job="s3gw"})
for: 1m
labels:
component: S3
severity: warning
annotations:
summary: Some S3 Gateway services are not running on {{$labels.instance}}. Check the service status in the command-line interface.
|
ok
|
|
22.071s ago
|
339.2us |
| alert: S3 cluster has unavailable Geo-replication services
expr: count by(instance) (up{job="gr"}) > sum by(instance) (up{job="gr"} == 1 or (up{job="gr"} == bool 0 and on(instance) (instance:being_updated)))
for: 1m
labels:
component: S3
severity: warning
annotations:
summary: Some Geo-replication services are not running on {{$labels.instance}}. Check the service status in the command-line interface.
|
ok
|
|
22.07s ago
|
555.2us |
| alert: NFS service has unavailable FS services
expr: count by(instance) (up{job="fs"}) > sum by(instance) (up{job="fs"})
for: 1m
labels:
component: NFS
severity: warning
annotations:
summary: Some File services are not running on {{$labels.instance}}. Check the service status in the command-line interface.
|
ok
|
|
22.07s ago
|
120.3us |
| alert: S3 cluster has too many open file descriptors
expr: (sum by(instance) (process_open_fds{job=~"gr|acc|s3gw|ns|os|ostor"})) > 9000
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: There are more than 9000 open file descriptors on {{$labels.instance}}. Please contact the technical support.
|
ok
|
|
22.07s ago
|
592.9us |
| alert: S3 node is in the automatic maintenance mode
expr: auto_maintenance_status > 0
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: '{{- if query "backend_vendor_info{vendor='acronis'}" -}} S3 services have been evacuated from {{$labels.instance}} because of too many failed S3 requests. Check the service logs. Visit https://kb.acronis.com/content/72408 to learn how to troubleshoot this issue. {{- else if query "backend_vendor_info{vendor='virtuozzo'}" -}} S3 services have been evacuated from {{$labels.instance}} because of too many failed S3 requests. Check the service logs. {{- end -}}'
|
ok
|
|
22.069s ago
|
62.98us |
| alert: S3 NDS service has high notification processing error rate
expr: 15 > ((sum by(svc_id, instance) (instance_vol_svc:ostor_nds_error_total:rate5m)) / (sum by(svc_id, instance) (instance_vol_svc:ostor_nds_total:rate5m))) * 100 >= 5
for: 1m
labels:
component: S3
severity: warning
annotations:
summary: S3 NDS service ({{$labels.svc_id}}) on {{$labels.instance}} has the notification processing error rate higher than 5%. It may be caused by connectivity issues, request timeouts, or an S3 topics misconfiguration.
|
ok
|
|
22.069s ago
|
200.2us |
| alert: S3 NDS service has critically high notification processing error rate
expr: ((sum by(svc_id, instance) (instance_vol_svc:ostor_nds_error_total:rate5m)) / (sum by(svc_id, instance) (instance_vol_svc:ostor_nds_total:rate5m))) * 100 >= 15
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: S3 NDS service ({{$labels.svc_id}}) on {{$labels.instance}} has the notification processing error rate higher than 15%. It may be caused by connectivity issues, request timeouts, or an S3 topics misconfiguration.
|
ok
|
|
22.069s ago
|
229us |
| alert: S3 NDS service has high notification deletion error rate
expr: ((sum by(svc_id, instance) (instance_vol_svc:ostor_nds_delete_error_total:rate5m)) / (sum by(svc_id, instance) (instance_vol_svc:ostor_nds_total:rate5m))) * 100 > 5
for: 1m
labels:
component: S3
severity: warning
annotations:
summary: S3 NDS service ({{$labels.svc_id}}) on {{$labels.instance}} has the notification deletion error rate higher than 5%. It may be caused by a storage misconfiguration, storage performance degradation, or other storage issues.
|
ok
|
|
22.069s ago
|
185.1us |
| alert: S3 NDS service has high notification repetition rate
expr: ((sum by(svc_id, instance) (instance_vol_svc:ostor_nds_repeat_total:rate5m)) / (sum by(svc_id, instance) (instance_vol_svc:ostor_nds_total:rate5m))) * 100 > 5
for: 1m
labels:
component: S3
severity: warning
annotations:
summary: S3 NDS service ({{$labels.svc_id}}) on {{$labels.instance}} has the notification repetition rate higher than 5%. It may be caused by a storage misconfiguration or other storage issues.
|
ok
|
|
22.069s ago
|
163.7us |
| alert: S3 NDS service has too many staged unprocessed notifications
expr: nds_staged_messages_count > 1000
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: S3 NDS service ({{$labels.svc_id}}) on {{$labels.instance}} has a lot of unprocessed notifications staged on the storage. It may be caused by connectivity or storage issues.
|
ok
|
|
22.069s ago
|
76.63us |
| alert: S3 NDS service has too many messages in simultaneous processing
expr: nds_endpoint_process_count > 1000
for: 1m
labels:
component: S3
severity: critical
annotations:
summary: S3 NDS service ({{$labels.svc_id}}) on {{$labels.instance}} has a lot of notifications in simultaneous processing on the endpoint. It may be caused by connectivity issues or an S3 topics misconfiguration.
|
ok
|
|
22.069s ago
|
88.92us |
| alert: S3 service is experiencing many network problems
expr: instance_vol_svc:rpc_errors_total:rate5m{job=~"s3gw|os|ns",vol_id=~"01.*"} > (10 / (5 * 60))
for: 2m
labels:
component: S3
object_id: '{{$labels.svc_id}}-{{$labels.instance}}'
severity: critical
annotations:
summary: S3 service ({{$labels.job}}, {{$labels.svc_id}}) on {{$labels.instance}} has many RPC errors. Check your network configuration.
|
ok
|
|
22.069s ago
|
287.5us |
| alert: S3 service is experiencing some network problems
expr: instance_vol_svc:rpc_errors_total:rate5m{job=~"s3gw|os|ns",vol_id=~"01.*"} > (5 / (5 * 60)) and instance_vol_svc:rpc_errors_total:rate5m{job=~"s3gw|os|ns",vol_id=~"01.*"} <= (10 / (5 * 60))
for: 2m
labels:
component: S3
object_id: '{{$labels.svc_id}}-{{$labels.instance}}'
severity: warning
annotations:
summary: S3 service ({{$labels.job}}, {{$labels.svc_id}}) on {{$labels.instance}} has some RPC errors. Check your network configuration.
|
ok
|
|
22.069s ago
|
411us |
| alert: NFS service is experiencing many network problems
expr: instance_vol_svc:rpc_errors_total:rate5m{job=~"fs|os",vol_id=~"02.*"} > (10 / (5 * 60))
for: 2m
labels:
component: NFS
object_id: '{{$labels.svc_id}}-{{$labels.instance}}'
severity: critical
annotations:
summary: NFS service ({{$labels.job}}, {{$labels.svc_id}}) on {{$labels.instance}} has many RPC errors. Check your network configuration.
|
ok
|
|
22.068s ago
|
110us |
| alert: NFS service is experiencing some network problems
expr: instance_vol_svc:rpc_errors_total:rate5m{job=~"fs|os",vol_id=~"02.*"} > (5 / (5 * 60)) and instance_vol_svc:rpc_errors_total:rate5m{job=~"fs|os",vol_id=~"02.*"} <= (10 / (5 * 60))
for: 2m
labels:
component: NFS
object_id: '{{$labels.svc_id}}-{{$labels.instance}}'
severity: warning
annotations:
summary: NFS service ({{$labels.job}}, {{$labels.svc_id}}) on {{$labels.instance}} has some RPC errors. Check your network configuration.
|
ok
|
|
22.068s ago
|
272.6us |
| alert: S3 redundancy warning
expr: storage_redundancy_threshold{failure_domain="disk",type="s3"} > 0 and storage_redundancy_threshold{failure_domain="disk",type="s3"} <= scalar(count(backend_node_master))
for: 10m
labels:
component: S3
severity: warning
annotations:
summary: |
S3 is set to failure domain "disk" even though there are enough available nodes. It is recommended to set the failure domain to "host" so that S3 can survive host failures in addition to disk failures.
|
ok
|
|
22.068s ago
|
310.3us |
| alert: S3 cluster misconfiguration
expr: count(up{job="ostor"}) > 1 and count(ostor_svc_registry_cfg_failed_total) < 2
labels:
component: S3
severity: error
annotations:
summary: |
{{ if query "backend_vendor_info{vendor='acronis'}" }} The S3 cluster configuration is not highly available. If one S3 node fails, the entire S3 cluster may become non-operational. To ensure high availability, update the S3 cluster configuration, as described in the Knowledge Base at https://kb.acronis.com/node/64033 {{ else if query "backend_vendor_info{vendor='virtuozzo'}" }} The S3 cluster configuration is not highly available. If one S3 node fails, the entire S3 cluster may become non-operational. To ensure high availability, update the S3 cluster configuration, as described in the Knowledge Base at https://support.virtuozzo.com/hc/en-us/articles/27536517316753-Virtuozzo-Hybrid-Infrastructure-Alert-S3-cluster-misconfiguration {{ end }}
|
ok
|
|
22.068s ago
|
132.4us |
| alert: Object storage account control service is offline
expr: up{job="acc"} == 0
for: 5m
labels:
component: S3
severity: critical
annotations:
summary: Object storage account control service is down on host {{$labels.instance}}.
|
ok
|
|
22.068s ago
|
90.03us |
|
38.4s ago |
7.284ms |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| record: instance_proxied:abgw_read_bytes:rate5m
expr: rate(abgw_read_bytes_total[5m])
|
ok
|
|
38.4s ago
|
177us |
| record: job:abgw_read_bytes:rate5m
expr: sum by(job) (instance_proxied:abgw_read_bytes:rate5m)
|
ok
|
|
38.4s ago
|
108.9us |
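The recording rules in this group follow the level:metric:operations naming convention: an instance-level …:rate5m rule captures the per-series 5-minute rate, and a job:… rule aggregates it. In a Prometheus rule file, the pair above would look roughly like the following (an editor's sketch of the equivalent YAML; the group name abgw.records is an assumption):
groups:
  - name: abgw.records
    rules:
      - record: instance_proxied:abgw_read_bytes:rate5m
        expr: rate(abgw_read_bytes_total[5m])
      - record: job:abgw_read_bytes:rate5m
        expr: sum by (job) (instance_proxied:abgw_read_bytes:rate5m)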
| record: instance_proxied:abgw_write_bytes:rate5m
expr: rate(abgw_write_bytes_total[5m])
|
ok
|
|
38.4s ago
|
114.1us |
| record: job:abgw_write_bytes:rate5m
expr: sum by(job) (instance_proxied:abgw_write_bytes:rate5m)
|
ok
|
|
38.4s ago
|
70.44us |
| record: instance:abgw_write_rollback_bytes:rate5m
expr: rate(abgw_write_rollback_bytes_total[5m])
|
ok
|
|
38.4s ago
|
47.31us |
| record: job:abgw_write_rollback_bytes:rate5m
expr: sum by(job) (instance:abgw_write_rollback_bytes:rate5m)
|
ok
|
|
38.4s ago
|
61.25us |
| record: instance:abgw_append_throttle_delay_ms:rate5m
expr: rate(abgw_append_throttle_delay_ms_total[5m])
|
ok
|
|
38.4s ago
|
79.98us |
| record: job:abgw_append_throttle_delay_ms:rate5m
expr: sum by(job) (instance:abgw_append_throttle_delay_ms:rate5m)
|
ok
|
|
38.4s ago
|
80.72us |
| record: instance:abgw_read_reqs:rate5m
expr: rate(abgw_read_reqs_total[5m])
|
ok
|
|
38.4s ago
|
110.1us |
| record: job:abgw_read_reqs:rate5m
expr: sum by(job) (instance:abgw_read_reqs:rate5m)
|
ok
|
|
38.4s ago
|
125.1us |
| record: instance:abgw_write_reqs:rate5m
expr: rate(abgw_write_reqs_total[5m])
|
ok
|
|
38.399s ago
|
127.5us |
| record: job:abgw_write_reqs:rate5m
expr: sum by(job) (instance:abgw_write_reqs:rate5m)
|
ok
|
|
38.399s ago
|
145.6us |
| record: instance:abgw_stat_reqs:rate5m
expr: rate(abgw_stat_reqs_total[5m])
|
ok
|
|
38.399s ago
|
128.7us |
| record: job:abgw_stat_reqs:rate5m
expr: sum by(job) (instance:abgw_stat_reqs:rate5m)
|
ok
|
|
38.399s ago
|
81.07us |
| record: job:abgw_read_bufs:sum
expr: sum by(job) (abgw_read_bufs)
|
ok
|
|
38.399s ago
|
66.58us |
| record: job:abgw_read_bufs_bytes:sum
expr: sum by(job) (abgw_read_bufs_bytes)
|
ok
|
|
38.399s ago
|
42.65us |
| record: job:abgw_write_bufs:sum
expr: sum by(job) (abgw_write_bufs)
|
ok
|
|
38.399s ago
|
35.96us |
| record: job:abgw_write_bufs_bytes:sum
expr: sum by(job) (abgw_write_bufs_bytes)
|
ok
|
|
38.399s ago
|
31.62us |
| record: instance_type:abgw_push_progress_bytes:rate5m
expr: rate(abgw_push_progress_bytes_total[5m])
|
ok
|
|
38.399s ago
|
43.75us |
| record: type:abgw_push_progress_bytes:rate5m
expr: sum by(job, type) (instance_type:abgw_push_progress_bytes:rate5m)
|
ok
|
|
38.399s ago
|
68.74us |
| record: type:abgw_push_backlog_bytes:sum
expr: sum by(job, type) (abgw_push_backlog_bytes)
|
ok
|
|
38.399s ago
|
72.94us |
| record: brand:abgw_push_replica_total_size_by_brand:sum
expr: sum by(job, brand) (abgw_push_replica_total_size_by_brand)
|
ok
|
|
38.399s ago
|
43.53us |
| record: brand:abgw_push_progress_by_brand:sum
expr: sum by(job, brand) (abgw_push_progress_by_brand)
|
ok
|
|
38.399s ago
|
35.33us |
| record: instance_err:abgw_file_replica_open_errs:rate5m
expr: rate(abgw_file_replica_open_errs_total[5m])
|
ok
|
|
38.399s ago
|
45.31us |
| record: err:abgw_file_replica_open_errs:rate5m
expr: sum by(job, err) (instance_err:abgw_file_replica_open_errs:rate5m)
|
ok
|
|
38.399s ago
|
72.51us |
| record: instance_err:abgw_push_replica_errs:rate5m
expr: rate(abgw_push_replica_errs_total[5m])
|
ok
|
|
38.399s ago
|
92.23us |
| record: err:abgw_push_replica_errs:rate5m
expr: sum by(job, err) (instance_err:abgw_push_replica_errs:rate5m)
|
ok
|
|
38.399s ago
|
43.11us |
| record: instance_err:abgw_rm_file_push_errs:rate5m
expr: rate(abgw_rm_file_push_errs_total[5m])
|
ok
|
|
38.399s ago
|
72.28us |
| record: err:abgw_rm_file_push_errs:rate5m
expr: sum by(job, err) (instance_err:abgw_rm_file_push_errs:rate5m)
|
ok
|
|
38.399s ago
|
40.21us |
| record: instance:abgw_pull_progress_bytes:rate5m
expr: rate(abgw_pull_progress_bytes_total[5m])
|
ok
|
|
38.399s ago
|
85.57us |
| record: job:abgw_pull_progress_bytes:rate5m
expr: sum by(job) (instance:abgw_pull_progress_bytes:rate5m)
|
ok
|
|
38.399s ago
|
111.9us |
| record: job:abgw_nr_files_to_pull:sum
expr: sum by(job) (abgw_nr_files_to_pull)
|
ok
|
|
38.399s ago
|
99.85us |
| record: job:abgw_pull_backlog_bytes:sum
expr: sum by(job) (abgw_pull_backlog_bytes)
|
ok
|
|
38.399s ago
|
117.4us |
| record: job:abgw_nr_accounts_pull_pending:sum
expr: sum by(job) (abgw_nr_accounts_pull_pending)
|
ok
|
|
38.398s ago
|
100us |
| record: job:abgw_nr_accounts_pull_started:sum
expr: sum by(job) (abgw_nr_accounts_pull_started)
|
ok
|
|
38.398s ago
|
53.02us |
| record: job:abgw_nr_accounts_pull_errors:sum
expr: sum by(job) (abgw_nr_accounts_pull_errors)
|
ok
|
|
38.398s ago
|
58.69us |
| record: instance:abgw_conns_total:rate5m
expr: rate(abgw_conns_total[5m])
|
ok
|
|
38.398s ago
|
86.54us |
| record: job:abgw_conns_total:rate5m
expr: sum by(job) (instance:abgw_conns_total:rate5m)
|
ok
|
|
38.398s ago
|
67.25us |
| record: job:abgw_conns:sum
expr: sum by(job) (abgw_conns)
|
ok
|
|
38.398s ago
|
78.61us |
| record: instance_type:abgw_io_limiting_failures:rate5m
expr: rate(abgw_io_limiting_failures_total[5m])
|
ok
|
|
38.398s ago
|
83.99us |
| record: type:abgw_io_limiting_failures:rate5m
expr: sum by(job, type) (instance_type:abgw_io_limiting_failures:rate5m)
|
ok
|
|
38.398s ago
|
39.04us |
| record: instance_iop:abgw_iop_wd_timeouts:rate5m
expr: rate(abgw_iop_wd_timeouts[5m])
|
ok
|
|
38.398s ago
|
59.68us |
| record: iop:abgw_iop_wd_timeouts:rate5m
expr: sum by(job, iop) (instance_iop:abgw_iop_wd_timeouts:rate5m)
|
ok
|
|
38.398s ago
|
55.63us |
| record: instance_req_err:abgw_req_errs:rate5m
expr: rate(abgw_req_errs_total[5m])
|
ok
|
|
38.398s ago
|
38.04us |
| record: req_err:abgw_req_errs:rate5m
expr: sum by(job, req, err) (instance_req_err:abgw_req_errs:rate5m)
|
ok
|
|
38.398s ago
|
52.37us |
| record: instance_req_le:abgw_req_latency_ms_bucket:rate5m
expr: rate(abgw_req_latency_ms_bucket[5m])
|
ok
|
|
38.398s ago
|
73.77us |
| record: req_le:abgw_req_latency_ms_bucket:rate5m
expr: sum by(job, req, le) (instance_req_le:abgw_req_latency_ms_bucket:rate5m)
|
ok
|
|
38.398s ago
|
64.64us |
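The …_bucket recording rules keep the le label in their aggregation precisely so that latency quantiles can still be derived from the pre-aggregated series. A hedged example of such a downstream query (not one of the shipped rules):
# 95th percentile ABGW request latency in milliseconds, per request type
histogram_quantile(0.95, req_le:abgw_req_latency_ms_bucket:rate5m)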
| record: instance_req:abgw_req_latency_ms_count:rate5m
expr: rate(abgw_req_latency_ms_count[5m])
|
ok
|
|
38.398s ago
|
64.55us |
| record: req:abgw_req_latency_ms_count:rate5m
expr: sum by(job, req) (instance_req:abgw_req_latency_ms_count:rate5m)
|
ok
|
|
38.398s ago
|
90.86us |
| record: instance_req:abgw_req_latency_ms_sum:rate5m
expr: rate(abgw_req_latency_ms_sum[5m])
|
ok
|
|
38.398s ago
|
72.67us |
| record: req:abgw_req_latency_ms_sum:rate5m
expr: sum by(job, req) (instance_req:abgw_req_latency_ms_sum:rate5m)
|
ok
|
|
38.398s ago
|
68.25us |
| record: instance_req_err:abgw_v2_ireq_errs:rate5m
expr: rate(abgw_v2_ireq_errs_total[5m])
|
ok
|
|
38.398s ago
|
71.02us |
| record: req_err:abgw_v2_ireq_errs:rate5m
expr: sum by(job, req, err) (instance_req_err:abgw_v2_ireq_errs:rate5m)
|
ok
|
|
38.398s ago
|
46.38us |
| record: instance_req_lat_le:abgw_v2_ireq_latency_ms_bucket:rate5m
expr: rate(abgw_v2_ireq_latency_ms_bucket[5m])
|
ok
|
|
38.398s ago
|
56.33us |
| record: req_lat_le:abgw_v2_ireq_latency_ms_bucket:rate5m
expr: sum by(job, req, lat, le) (instance_req_lat_le:abgw_v2_ireq_latency_ms_bucket:rate5m)
|
ok
|
|
38.398s ago
|
55.43us |
| record: instance_req_lat:abgw_v2_ireq_latency_ms_count:rate5m
expr: rate(abgw_v2_ireq_latency_ms_count[5m])
|
ok
|
|
38.398s ago
|
54.11us |
| record: req_lat:abgw_v2_ireq_latency_ms_count:rate5m
expr: sum by(job, req, lat) (instance_req_lat:abgw_v2_ireq_latency_ms_count:rate5m)
|
ok
|
|
38.398s ago
|
43.64us |
| record: instance_req_lat:abgw_v2_ireq_latency_ms_sum:rate5m
expr: rate(abgw_v2_ireq_latency_ms_sum[5m])
|
ok
|
|
38.398s ago
|
41.02us |
| record: req_lat:abgw_v2_ireq_latency_ms_sum:rate5m
expr: sum by(job, req, lat) (instance_req_lat:abgw_v2_ireq_latency_ms_sum:rate5m)
|
ok
|
|
38.398s ago
|
36.11us |
| record: instance_fop_proxied_err:abgw_fop_errs:rate5m
expr: rate(abgw_fop_latency_ms_count[5m])
|
ok
|
|
38.398s ago
|
37.98us |
| record: fop_proxied_err:abgw_fop_errs:rate5m
expr: sum by(job, fop, proxied, err) (instance_fop_proxied_err:abgw_fop_errs:rate5m)
|
ok
|
|
38.398s ago
|
85.42us |
| record: instance_fop_proxied_err_le:abgw_fop_latency_ms_bucket:rate5m
expr: rate(abgw_fop_latency_ms_bucket[5m])
|
ok
|
|
38.398s ago
|
133.7us |
| record: fop_proxied_err_le:abgw_fop_latency_ms_bucket:rate5m
expr: sum by(job, fop, proxied, err, le) (instance_fop_proxied_err_le:abgw_fop_latency_ms_bucket:rate5m)
|
ok
|
|
38.397s ago
|
88.58us |
| record: instance_fop_proxied_err:abgw_fop_latency_ms_count:rate5m
expr: instance_fop_proxied_err:abgw_fop_errs:rate5m
|
ok
|
|
38.397s ago
|
54.58us |
| record: fop_proxied_err:abgw_fop_latency_ms_count:rate5m
expr: fop_proxied_err:abgw_fop_errs:rate5m
|
ok
|
|
38.397s ago
|
37.39us |
| record: instance_fop_proxied_err:abgw_fop_latency_ms_sum:rate5m
expr: rate(abgw_fop_latency_ms_sum[5m])
|
ok
|
|
38.397s ago
|
56.25us |
| record: fop_proxied_err:abgw_fop_latency_ms_sum:rate5m
expr: sum by(job, fop, proxied, err) (instance_fop_proxied_err:abgw_fop_latency_ms_sum:rate5m)
|
ok
|
|
38.397s ago
|
62.79us |
| record: instance_iop_proxied_err:abgw_iop_errs:rate5m
expr: rate(abgw_iop_latency_ms_count[5m])
|
ok
|
|
38.397s ago
|
57.51us |
| record: iop_proxied_err:abgw_iop_errs:rate5m
expr: sum by(job, iop, proxied, err) (instance_iop_proxied_err:abgw_iop_errs:rate5m)
|
ok
|
|
38.397s ago
|
49.75us |
| record: instance_iop_proxied_err_le:abgw_iop_latency_ms_bucket:rate5m
expr: rate(abgw_iop_latency_ms_bucket[5m])
|
ok
|
|
38.397s ago
|
50.36us |
| record: iop_proxied_err_le:abgw_iop_latency_ms_bucket:rate5m
expr: sum by(job, iop, proxied, err, le) (instance_iop_proxied_err_le:abgw_iop_latency_ms_bucket:rate5m)
|
ok
|
|
38.397s ago
|
63.13us |
| record: instance_iop_proxied_err:abgw_iop_latency_ms_count:rate5m
expr: instance_iop_proxied_err:abgw_iop_errs:rate5m
|
ok
|
|
38.397s ago
|
48.54us |
| record: iop_proxied_err:abgw_iop_latency_ms_count:rate5m
expr: iop_proxied_err:abgw_iop_errs:rate5m
|
ok
|
|
38.397s ago
|
45.57us |
| record: instance_iop_proxied_err:abgw_iop_latency_ms_sum:rate5m
expr: rate(abgw_iop_latency_ms_sum[5m])
|
ok
|
|
38.397s ago
|
64.92us |
| record: iop_proxied_err:abgw_iop_latency_ms_sum:rate5m
expr: sum by(job, iop, proxied, err) (instance_iop_proxied_err:abgw_iop_latency_ms_sum:rate5m)
|
ok
|
|
38.397s ago
|
56.15us |
| record: instance_err:abgw_account_lookup_errs:rate5m
expr: rate(abgw_account_lookup_errs_total[5m])
|
ok
|
|
38.397s ago
|
86.68us |
| record: err:abgw_account_lookup_errs:rate5m
expr: sum by(job) (instance_err:abgw_account_lookup_errs:rate5m)
|
ok
|
|
38.397s ago
|
68.91us |
| record: instance_err:abgw_account_pull_errs:rate5m
expr: rate(abgw_account_pull_errs_total[5m])
|
ok
|
|
38.397s ago
|
60.64us |
| record: err:abgw_account_pull_errs:rate5m
expr: sum by(job) (instance_err:abgw_account_pull_errs:rate5m)
|
ok
|
|
38.397s ago
|
58.16us |
| record: job:abgw_accounts:sum
expr: sum by(job) (abgw_accounts)
|
ok
|
|
38.397s ago
|
59.15us |
| record: job:abgw_fds:sum
expr: sum by(job) (abgw_fds)
|
ok
|
|
38.397s ago
|
66.91us |
| record: job:abgw_files:sum
expr: sum by(job) (abgw_files)
|
ok
|
|
38.397s ago
|
55.34us |
| record: job:abgw_detached_files:sum
expr: sum by(job) (abgw_detached_files)
|
ok
|
|
38.397s ago
|
45.94us |
| record: instance_err:abgw_file_lookup_errs:rate5m
expr: rate(abgw_file_lookup_errs_total[5m])
|
ok
|
|
38.397s ago
|
58.09us |
| record: err:abgw_file_lookup_errs:rate5m
expr: sum by(job, err) (instance_err:abgw_file_lookup_errs:rate5m)
|
ok
|
|
38.397s ago
|
51.13us |
| record: instance_err:abgw_file_migration_source_read_errs:rate5m
expr: rate(abgw_file_migration_source_read_errs_total[5m])
|
ok
|
|
38.397s ago
|
64.93us |
| record: err:abgw_file_migration_source_read_errs:rate5m
expr: sum by(job, err) (instance_err:abgw_file_migration_source_read_errs:rate5m)
|
ok
|
|
38.397s ago
|
73.08us |
| record: instance_err:abgw_file_migration_source_open_errs:rate5m
expr: rate(abgw_file_migration_source_open_errs_total[5m])
|
ok
|
|
38.397s ago
|
56.83us |
| record: err:abgw_file_migration_source_open_errs:rate5m
expr: sum by(job, err) (instance_err:abgw_file_migration_source_open_errs:rate5m)
|
ok
|
|
38.397s ago
|
51.31us |
| record: instance:abgw_cert_update_fail:rate5m
expr: rate(abgw_cert_update_fail_total[5m])
|
ok
|
|
38.397s ago
|
53.26us |
| record: job:abgw_cert_update_fail:rate5m
expr: sum by(job) (instance:abgw_cert_update_fail:rate5m)
|
ok
|
|
38.397s ago
|
47.59us |
| record: instance_err:abgw_verify_certs_errors:rate5m
expr: rate(abgw_verify_certs_errors_total[5m])
|
ok
|
|
38.397s ago
|
53.88us |
| record: err:abgw_verify_certs_errors:rate5m
expr: sum by(job, err) (instance_err:abgw_verify_certs_errors:rate5m)
|
ok
|
|
38.397s ago
|
58.53us |
| record: instance:abgw_crl_download_fail:rate5m
expr: rate(abgw_crl_download_fail_total[5m])
|
ok
|
|
38.397s ago
|
38.87us |
| record: job:abgw_crl_download_fail:rate5m
expr: sum by(job) (instance:abgw_crl_download_fail:rate5m)
|
ok
|
|
38.397s ago
|
34.29us |
| record: instance_path_reg_type:abgw_next_certificate_expiration:min
expr: min by(job, path, reg_name, type) (abgw_next_certificate_expiration)
|
ok
|
|
38.397s ago
|
58.53us |
| record: instance:abgw_containers_validate_segments_fail:rate5m
expr: rate(abgw_containers_validate_segments_fail_total[5m])
|
ok
|
|
38.397s ago
|
46.43us |
| record: job:abgw_containers_validate_segments_fail:rate5m
expr: sum by(job) (instance:abgw_containers_validate_segments_fail:rate5m)
|
ok
|
|
38.397s ago
|
56.14us |
| record: instance:abgw_containers_validate_trees_fail:rate5m
expr: rate(abgw_containers_validate_trees_fail_total[5m])
|
ok
|
|
38.397s ago
|
39.05us |
| record: job:abgw_containers_validate_trees_fail:rate5m
expr: sum by(job) (instance:abgw_containers_validate_trees_fail:rate5m)
|
ok
|
|
38.397s ago
|
53.1us |
| record: job:abgw_ostor_space_usage_total:sum
expr: sum(abgw_ostor_used_space_bytes)
|
ok
|
|
38.397s ago
|
52.57us |
| record: instance:abgw_inst_outdated:count
expr: count(changes(abgw_req_errs_total{err="INST_OUTDATED"}[1d]) > 0 or changes(abgw_v2_ireq_errs_total{err="INST_OUTDATED"}[1d]) > 0)
|
ok
|
|
38.397s ago
|
251us |
| record: instance:abgw_io_errors:count
expr: count(changes(abgw_req_errs_total{err="IO"}[1d]) > 0 or changes(abgw_v2_ireq_errs_total{err="IO"}[1d]) > 0)
|
ok
|
|
38.396s ago
|
184.3us |
|
2.407s ago |
2.745ms |
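The abgw rules above follow a repeating two-step pattern: a per-instance rate is recorded from the raw counter first, then summed into a job-level series. A minimal rule-file sketch of one such pair, with the record names and expressions taken verbatim from the rows above; the group name and evaluation interval are assumptions:

```yaml
groups:
  - name: abgw.recording          # group name is an assumption
    interval: 30s                 # interval is an assumption
    rules:
      # Step 1: per-instance error rate from the raw counter.
      - record: instance_err:abgw_account_lookup_errs:rate5m
        expr: rate(abgw_account_lookup_errs_total[5m])
      # Step 2: job-level aggregate built on top of the instance-level record.
      - record: err:abgw_account_lookup_errs:rate5m
        expr: sum by(job) (instance_err:abgw_account_lookup_errs:rate5m)
```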
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| record: job:libvirt_up:count
expr: count by(job) (libvirt_up)
|
ok
|
|
20.779s ago
|
343.4us |
| record: job:libvirt_up:sum
expr: sum by(job) (libvirt_up)
|
ok
|
|
20.779s ago
|
224.3us |
| record: instance:libvirt_domain_info_virtual_cpus:sum
expr: sum without(domain, domain_uuid) (libvirt_domain_info_virtual_cpus)
|
ok
|
|
20.779s ago
|
1.941ms |
| record: job:libvirt_domain_info_virtual_cpus:sum
expr: sum by(job) (instance:libvirt_domain_info_virtual_cpus:sum)
|
ok
|
|
20.777s ago
|
573.3us |
| record: instance_domain:libvirt_domain_info_cpu_time_seconds:rate5m
expr: rate(libvirt_domain_info_cpu_time_seconds_total[5m])
|
ok
|
|
20.776s ago
|
1.796ms |
| record: instance:libvirt_domain_info_cpu_time_seconds:rate5m
expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_info_cpu_time_seconds:rate5m)
|
ok
|
|
20.775s ago
|
2.246ms |
| record: job:libvirt_domain_info_cpu_time_seconds:rate5m
expr: sum by(job) (instance:libvirt_domain_info_cpu_time_seconds:rate5m)
|
ok
|
|
20.772s ago
|
658us |
| record: instance_domain_target_device:libvirt_domain_block_stats_read_requests:rate5m
expr: rate(libvirt_domain_block_stats_read_requests_total[5m])
|
ok
|
|
20.772s ago
|
3.085ms |
| record: instance_domain:libvirt_domain_block_stats_read_requests:rate5m
expr: sum without(source_file, target_device) (instance_domain_target_device:libvirt_domain_block_stats_read_requests:rate5m)
|
ok
|
|
20.769s ago
|
1.809ms |
| record: instance:libvirt_domain_block_stats_read_requests:rate5m
expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_block_stats_read_requests:rate5m)
|
ok
|
|
20.767s ago
|
1.316ms |
| record: job:libvirt_domain_block_stats_read_requests:rate5m
expr: sum by(job) (instance:libvirt_domain_block_stats_read_requests:rate5m)
|
ok
|
|
20.766s ago
|
483.5us |
| record: instance_domain_target_device:libvirt_domain_block_stats_write_requests:rate5m
expr: rate(libvirt_domain_block_stats_write_requests_total[5m])
|
ok
|
|
20.765s ago
|
2.741ms |
| record: instance_domain:libvirt_domain_block_stats_write_requests:rate5m
expr: sum without(source_file, target_device) (instance_domain_target_device:libvirt_domain_block_stats_write_requests:rate5m)
|
ok
|
|
20.762s ago
|
1.744ms |
| record: instance:libvirt_domain_block_stats_write_requests:rate5m
expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_block_stats_write_requests:rate5m)
|
ok
|
|
20.761s ago
|
1.565ms |
| record: job:libvirt_domain_block_stats_write_requests:rate5m
expr: sum by(job) (instance:libvirt_domain_block_stats_write_requests:rate5m)
|
ok
|
|
20.759s ago
|
992.5us |
| record: instance_domain_target_device:libvirt_domain_block_stats_flush_requests:rate5m
expr: rate(libvirt_domain_block_stats_flush_requests_total[5m])
|
ok
|
|
20.758s ago
|
2.868ms |
| record: instance_domain:libvirt_domain_block_stats_flush_requests:rate5m
expr: sum without(source_file, target_device) (instance_domain_target_device:libvirt_domain_block_stats_flush_requests:rate5m)
|
ok
|
|
20.755s ago
|
1.985ms |
| record: instance:libvirt_domain_block_stats_flush_requests:rate5m
expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_block_stats_flush_requests:rate5m)
|
ok
|
|
20.753s ago
|
1.331ms |
| record: job:libvirt_domain_block_stats_flush_requests:rate5m
expr: sum by(job) (instance:libvirt_domain_block_stats_flush_requests:rate5m)
|
ok
|
|
20.752s ago
|
593.3us |
| record: instance_domain_target_device:libvirt_domain_block_stats_read_bytes:rate5m
expr: rate(libvirt_domain_block_stats_read_bytes_total[5m])
|
ok
|
|
20.752s ago
|
2.689ms |
| record: instance_domain:libvirt_domain_block_stats_read_bytes:rate5m
expr: sum without(source_file, target_device) (instance_domain_target_device:libvirt_domain_block_stats_read_bytes:rate5m)
|
ok
|
|
20.749s ago
|
1.67ms |
| record: instance:libvirt_domain_block_stats_read_bytes:rate5m
expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_block_stats_read_bytes:rate5m)
|
ok
|
|
20.747s ago
|
1.208ms |
| record: job:libvirt_domain_block_stats_read_bytes:rate5m
expr: sum by(job) (instance:libvirt_domain_block_stats_read_bytes:rate5m)
|
ok
|
|
20.746s ago
|
475.9us |
| record: instance_domain_target_device:libvirt_domain_block_stats_write_bytes:rate5m
expr: rate(libvirt_domain_block_stats_write_bytes_total[5m])
|
ok
|
|
20.746s ago
|
2.605ms |
| record: instance_domain:libvirt_domain_block_stats_write_bytes:rate5m
expr: sum without(source_file, target_device) (instance_domain_target_device:libvirt_domain_block_stats_write_bytes:rate5m)
|
ok
|
|
20.743s ago
|
1.641ms |
| record: instance:libvirt_domain_block_stats_write_bytes:rate5m
expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_block_stats_write_bytes:rate5m)
|
ok
|
|
20.742s ago
|
1.279ms |
| record: job:libvirt_domain_block_stats_write_bytes:rate5m
expr: sum by(job) (instance:libvirt_domain_block_stats_write_bytes:rate5m)
|
ok
|
|
20.74s ago
|
518.1us |
| record: instance_domain_target_device:libvirt_domain_block_stats_read_seconds:rate5m
expr: rate(libvirt_domain_block_stats_read_seconds_total[5m])
|
ok
|
|
20.74s ago
|
2.639ms |
| record: instance_domain_target_device:libvirt_domain_block_stats_write_seconds:rate5m
expr: rate(libvirt_domain_block_stats_write_seconds_total[5m])
|
ok
|
|
20.737s ago
|
2.871ms |
| record: instance_domain_target_device:libvirt_domain_block_stats_flush_seconds:rate5m
expr: rate(libvirt_domain_block_stats_flush_seconds_total[5m])
|
ok
|
|
20.734s ago
|
2.58ms |
| record: instance_domain_target_device:libvirt_domain_interface_stats_receive_bytes:rate5m
expr: rate(libvirt_domain_interface_stats_receive_bytes_total[5m])
|
ok
|
|
20.732s ago
|
1.86ms |
| record: instance_domain:libvirt_domain_interface_stats_receive_bytes:rate5m
expr: sum without(target_device) (instance_domain_target_device:libvirt_domain_interface_stats_receive_bytes:rate5m)
|
ok
|
|
20.73s ago
|
1.758ms |
| record: instance:libvirt_domain_interface_stats_receive_bytes:rate5m
expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_interface_stats_receive_bytes:rate5m)
|
ok
|
|
20.728s ago
|
1.653ms |
| record: job:libvirt_domain_interface_stats_receive_bytes:rate5m
expr: sum by(job) (instance:libvirt_domain_interface_stats_receive_bytes:rate5m)
|
ok
|
|
20.727s ago
|
620.7us |
| record: instance_domain_target_device:libvirt_domain_interface_stats_transmit_bytes:rate5m
expr: rate(libvirt_domain_interface_stats_transmit_bytes_total[5m])
|
ok
|
|
20.726s ago
|
1.845ms |
| record: instance_domain:libvirt_domain_interface_stats_transmit_bytes:rate5m
expr: sum without(target_device) (instance_domain_target_device:libvirt_domain_interface_stats_transmit_bytes:rate5m)
|
ok
|
|
20.724s ago
|
2.197ms |
| record: instance:libvirt_domain_interface_stats_transmit_bytes:rate5m
expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_interface_stats_transmit_bytes:rate5m)
|
ok
|
|
20.722s ago
|
1.586ms |
| record: job:libvirt_domain_interface_stats_transmit_bytes:rate5m
expr: sum by(job) (instance:libvirt_domain_interface_stats_transmit_bytes:rate5m)
|
ok
|
|
20.721s ago
|
586us |
| record: instance_domain_target_device:libvirt_domain_interface_stats_receive_packets:rate5m
expr: rate(libvirt_domain_interface_stats_receive_packets_total[5m])
|
ok
|
|
20.72s ago
|
1.873ms |
| record: instance_domain:libvirt_domain_interface_stats_receive_packets:rate5m
expr: sum without(target_device) (instance_domain_target_device:libvirt_domain_interface_stats_receive_packets:rate5m)
|
ok
|
|
20.718s ago
|
1.646ms |
| record: instance:libvirt_domain_interface_stats_receive_packets:rate5m
expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_interface_stats_receive_packets:rate5m)
|
ok
|
|
20.716s ago
|
1.453ms |
| record: job:libvirt_domain_interface_stats_receive_packets:rate5m
expr: sum by(job) (instance:libvirt_domain_interface_stats_receive_packets:rate5m)
|
ok
|
|
20.715s ago
|
561.6us |
| record: instance_domain_target_device:libvirt_domain_interface_stats_transmit_packets:rate5m
expr: rate(libvirt_domain_interface_stats_transmit_packets_total[5m])
|
ok
|
|
20.715s ago
|
1.749ms |
| record: instance_domain:libvirt_domain_interface_stats_transmit_packets:rate5m
expr: sum without(target_device) (instance_domain_target_device:libvirt_domain_interface_stats_transmit_packets:rate5m)
|
ok
|
|
20.713s ago
|
1.574ms |
| record: instance:libvirt_domain_interface_stats_transmit_packets:rate5m
expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_interface_stats_transmit_packets:rate5m)
|
ok
|
|
20.711s ago
|
1.461ms |
| record: job:libvirt_domain_interface_stats_transmit_packets:rate5m
expr: sum by(job) (instance:libvirt_domain_interface_stats_transmit_packets:rate5m)
|
ok
|
|
20.71s ago
|
560.4us |
| record: instance_domain_target_device:libvirt_domain_interface_stats_receive_drops:rate5m
expr: rate(libvirt_domain_interface_stats_receive_drops_total[5m])
|
ok
|
|
20.709s ago
|
1.688ms |
| record: instance_domain:libvirt_domain_interface_stats_receive_drops:rate5m
expr: sum without(target_device) (instance_domain_target_device:libvirt_domain_interface_stats_receive_drops:rate5m)
|
ok
|
|
20.708s ago
|
1.54ms |
| record: instance:libvirt_domain_interface_stats_receive_drops:rate5m
expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_interface_stats_receive_drops:rate5m)
|
ok
|
|
20.706s ago
|
1.908ms |
| record: job:libvirt_domain_interface_stats_receive_drops:rate5m
expr: sum by(job) (instance:libvirt_domain_interface_stats_receive_drops:rate5m)
|
ok
|
|
20.704s ago
|
516.3us |
| record: instance_domain_target_device:libvirt_domain_interface_stats_transmit_drops:rate5m
expr: rate(libvirt_domain_interface_stats_transmit_drops_total[5m])
|
ok
|
|
20.704s ago
|
1.627ms |
| record: instance_domain:libvirt_domain_interface_stats_transmit_drops:rate5m
expr: sum without(target_device) (instance_domain_target_device:libvirt_domain_interface_stats_transmit_drops:rate5m)
|
ok
|
|
20.702s ago
|
1.585ms |
| record: instance:libvirt_domain_interface_stats_transmit_drops:rate5m
expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_interface_stats_transmit_drops:rate5m)
|
ok
|
|
20.701s ago
|
1.465ms |
| record: job:libvirt_domain_interface_stats_transmit_drops:rate5m
expr: sum by(job) (instance:libvirt_domain_interface_stats_transmit_drops:rate5m)
|
ok
|
|
20.699s ago
|
487.5us |
| record: instance_domain_target_device:libvirt_domain_interface_stats_receive_errors:rate5m
expr: rate(libvirt_domain_interface_stats_receive_errors_total[5m])
|
ok
|
|
20.699s ago
|
1.601ms |
| record: instance_domain:libvirt_domain_interface_stats_receive_errors:rate5m
expr: sum without(target_device) (instance_domain_target_device:libvirt_domain_interface_stats_receive_errors:rate5m)
|
ok
|
|
20.697s ago
|
1.528ms |
| record: instance:libvirt_domain_interface_stats_receive_errors:rate5m
expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_interface_stats_receive_errors:rate5m)
|
ok
|
|
20.696s ago
|
1.405ms |
| record: job:libvirt_domain_interface_stats_receive_errors:rate5m
expr: sum by(job) (instance:libvirt_domain_interface_stats_receive_errors:rate5m)
|
ok
|
|
20.694s ago
|
526.4us |
| record: instance_domain_target_device:libvirt_domain_interface_stats_transmit_errors:rate5m
expr: rate(libvirt_domain_interface_stats_transmit_errors_total[5m])
|
ok
|
|
20.694s ago
|
1.702ms |
| record: instance_domain:libvirt_domain_interface_stats_transmit_errors:rate5m
expr: sum without(target_device) (instance_domain_target_device:libvirt_domain_interface_stats_transmit_errors:rate5m)
|
ok
|
|
20.692s ago
|
1.747ms |
| record: instance:libvirt_domain_interface_stats_transmit_errors:rate5m
expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_interface_stats_transmit_errors:rate5m)
|
ok
|
|
20.69s ago
|
1.51ms |
| record: job:libvirt_domain_interface_stats_transmit_errors:rate5m
expr: sum by(job) (instance:libvirt_domain_interface_stats_transmit_errors:rate5m)
|
ok
|
|
20.689s ago
|
536.4us |
|
46.428s ago |
357.8ms |
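The libvirt block-stats rules collapse labels step by step: per target device, per domain, per instance, and finally per job. A sketch of that chain for read requests, with expressions copied from the rows above; the group name is an assumption:

```yaml
groups:
  - name: libvirt.recording       # group name is an assumption
    rules:
      # Raw per-device rate.
      - record: instance_domain_target_device:libvirt_domain_block_stats_read_requests:rate5m
        expr: rate(libvirt_domain_block_stats_read_requests_total[5m])
      # Drop device-level labels to get a per-domain series.
      - record: instance_domain:libvirt_domain_block_stats_read_requests:rate5m
        expr: sum without(source_file, target_device) (instance_domain_target_device:libvirt_domain_block_stats_read_requests:rate5m)
      # Drop domain labels to get a per-instance series.
      - record: instance:libvirt_domain_block_stats_read_requests:rate5m
        expr: sum without(domain, domain_uuid) (instance_domain:libvirt_domain_block_stats_read_requests:rate5m)
      # Final job-level roll-up.
      - record: job:libvirt_domain_block_stats_read_requests:rate5m
        expr: sum by(job) (instance:libvirt_domain_block_stats_read_requests:rate5m)
```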
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| record: instance_csid:csd_read_bytes:rate5m
expr: rate(csd_read_bytes_total[5m])
|
ok
|
|
46.428s ago
|
853.4us |
| record: instance:csd_read_bytes:rate5m
expr: sum without(csid) (instance_csid:csd_read_bytes:rate5m)
|
ok
|
|
46.428s ago
|
359.6us |
| record: job:csd_read_bytes:rate5m
expr: sum by(job) (instance:csd_read_bytes:rate5m)
|
ok
|
|
46.427s ago
|
95.65us |
| record: instance_csid:csd_write_bytes:rate5m
expr: rate(csd_write_bytes_total[5m])
|
ok
|
|
46.427s ago
|
638.8us |
| record: instance:csd_write_bytes:rate5m
expr: sum without(csid) (instance_csid:csd_write_bytes:rate5m)
|
ok
|
|
46.427s ago
|
340.7us |
| record: job:csd_write_bytes:rate5m
expr: sum by(job) (instance:csd_write_bytes:rate5m)
|
ok
|
|
46.426s ago
|
91.68us |
| record: instance_csid:csd_read_reqs:rate5m
expr: rate(csd_read_reqs_total[5m])
|
ok
|
|
46.426s ago
|
599.1us |
| record: instance:csd_read_reqs:rate5m
expr: sum without(csid) (instance_csid:csd_read_reqs:rate5m)
|
ok
|
|
46.426s ago
|
541.6us |
| record: job:csd_read_reqs:rate5m
expr: sum by(job) (instance:csd_read_reqs:rate5m)
|
ok
|
|
46.425s ago
|
160.7us |
| record: instance_csid:csd_write_reqs:rate5m
expr: rate(csd_write_reqs_total[5m])
|
ok
|
|
46.425s ago
|
578.6us |
| record: instance:csd_write_reqs:rate5m
expr: sum without(csid) (instance_csid:csd_write_reqs:rate5m)
|
ok
|
|
46.424s ago
|
439.5us |
| record: job:csd_write_reqs:rate5m
expr: sum by(job) (instance:csd_write_reqs:rate5m)
|
ok
|
|
46.424s ago
|
105.8us |
| record: instance_csid:csd_map_reqs:rate5m
expr: rate(csd_map_reqs_total[5m])
|
ok
|
|
46.424s ago
|
713.6us |
| record: instance:csd_map_reqs:rate5m
expr: sum without(csid) (instance_csid:csd_map_reqs:rate5m)
|
ok
|
|
46.423s ago
|
358.8us |
| record: job:csd_map_reqs:rate5m
expr: sum by(job) (instance:csd_map_reqs:rate5m)
|
ok
|
|
46.423s ago
|
120.9us |
| record: instance_csid:csd_rmw_reqs:rate5m
expr: rate(csd_rmw_reqs_total[5m])
|
ok
|
|
46.423s ago
|
611.5us |
| record: instance:csd_rmw_reqs:rate5m
expr: sum without(csid) (instance_csid:csd_rmw_reqs:rate5m)
|
ok
|
|
46.422s ago
|
313.2us |
| record: job:csd_rmw_reqs:rate5m
expr: sum by(job) (instance:csd_rmw_reqs:rate5m)
|
ok
|
|
46.422s ago
|
86.37us |
| record: instance_csid:csd_io_time_seconds:rate5m
expr: rate(csd_io_time_seconds_total[5m])
|
ok
|
|
46.422s ago
|
574.2us |
| record: instance:csd_io_time_seconds:avg_rate5m
expr: avg without(csid) (instance_csid:csd_io_time_seconds:rate5m)
|
ok
|
|
46.421s ago
|
413.1us |
| record: job:csd_io_time_seconds:avg_rate5m
expr: avg by(job) (instance:csd_io_time_seconds:avg_rate5m)
|
ok
|
|
46.421s ago
|
116.5us |
| record: csd_io_op_time_seconds:with_tier
expr: csd_io_op_time_seconds * on(csid) group_left(tier) cluster_csd_info
|
ok
|
|
46.421s ago
|
1.058ms |
| record: instance_csid:csd_queue_time_seconds:rate5m
expr: rate(csd_queue_time_seconds_total[5m])
|
ok
|
|
46.42s ago
|
608.6us |
| record: instance:csd_queue_time_seconds:rate5m
expr: sum without(csid) (instance_csid:csd_queue_time_seconds:rate5m)
|
ok
|
|
46.419s ago
|
384.1us |
| record: job:csd_queue_time_seconds:rate5m
expr: sum by(job) (instance:csd_queue_time_seconds:rate5m)
|
ok
|
|
46.419s ago
|
116.6us |
| record: instance_csid:csd_journal_usage_ratio:rate5m
expr: rate(csd_journal_usage_ratio_total[5m])
|
ok
|
|
46.419s ago
|
598.4us |
| record: instance:csd_journal_usage_ratio:avg_rate5m
expr: avg without(csid) (instance_csid:csd_journal_usage_ratio:rate5m)
|
ok
|
|
46.418s ago
|
325.4us |
| record: job:csd_journal_usage_ratio:avg_rate5m
expr: avg by(job) (instance:csd_journal_usage_ratio:avg_rate5m)
|
ok
|
|
46.418s ago
|
190.7us |
| record: instance_csid:csd_repl_read_bytes:rate5m
expr: rate(csd_repl_read_bytes_total[5m])
|
ok
|
|
46.418s ago
|
789.8us |
| record: instance:csd_repl_read_bytes:rate5m
expr: sum without(csid) (instance_csid:csd_repl_read_bytes:rate5m)
|
ok
|
|
46.417s ago
|
404.7us |
| record: job:csd_repl_read_bytes:rate5m
expr: sum by(job) (instance:csd_repl_read_bytes:rate5m)
|
ok
|
|
46.417s ago
|
116.6us |
| record: instance_csid:csd_repl_write_bytes:rate5m
expr: rate(csd_repl_write_bytes_total[5m])
|
ok
|
|
46.417s ago
|
593.7us |
| record: instance:csd_repl_write_bytes:rate5m
expr: sum without(csid) (instance_csid:csd_repl_write_bytes:rate5m)
|
ok
|
|
46.416s ago
|
367.1us |
| record: job:csd_repl_write_bytes:rate5m
expr: sum by(job) (instance:csd_repl_write_bytes:rate5m)
|
ok
|
|
46.416s ago
|
99.9us |
| record: instance_csid:csd_sync:rate5m
expr: rate(csd_sync_total[5m])
|
ok
|
|
46.416s ago
|
611.6us |
| record: instance:csd_sync:rate5m
expr: sum without(csid) (instance_csid:csd_sync:rate5m)
|
ok
|
|
46.415s ago
|
369.5us |
| record: job:csd_sync:rate5m
expr: sum by(job) (instance:csd_sync:rate5m)
|
ok
|
|
46.415s ago
|
131.5us |
| record: instance_csid:csd_fsync:rate5m
expr: rate(csd_fsync_total[5m])
|
ok
|
|
46.415s ago
|
633.4us |
| record: instance:csd_fsync:rate5m
expr: sum without(csid) (instance_csid:csd_fsync:rate5m)
|
ok
|
|
46.414s ago
|
375.7us |
| record: job:csd_fsync:rate5m
expr: sum by(job) (instance:csd_fsync:rate5m)
|
ok
|
|
46.414s ago
|
178.2us |
| record: instance_csid:csd_sync_time_seconds:rate5m
expr: rate(csd_sync_time_seconds_total[5m])
|
ok
|
|
46.413s ago
|
721.6us |
| record: instance:csd_sync_time_seconds:avg_rate5m
expr: avg without(csid) (instance_csid:csd_sync_time_seconds:rate5m)
|
ok
|
|
46.413s ago
|
358.9us |
| record: job:csd_sync_time_seconds:avg_rate5m
expr: avg by(job) (instance:csd_sync_time_seconds:avg_rate5m)
|
ok
|
|
46.412s ago
|
133.7us |
| record: instance_csid_req_err:csd_reqs:rate5m
expr: rate(csd_reqs_total[5m])
|
ok
|
|
46.412s ago
|
6.899ms |
| record: instance_req_err:csd_reqs:rate5m
expr: sum without(csid) (instance_csid_req_err:csd_reqs:rate5m)
|
ok
|
|
46.405s ago
|
3.349ms |
| record: req_err:csd_reqs:rate5m
expr: sum by(job, req, err) (instance_req_err:csd_reqs:rate5m)
|
ok
|
|
46.402s ago
|
553.2us |
| record: instance_csid_req_le:csd_req_duration_seconds_bucket:rate5m
expr: rate(csd_req_duration_seconds_bucket[5m])
|
ok
|
|
46.402s ago
|
189.1ms |
| record: instance_req_le:csd_req_duration_seconds_bucket:rate5m
expr: sum without(csid) (instance_csid_req_le:csd_req_duration_seconds_bucket:rate5m)
|
ok
|
|
46.213s ago
|
113.7ms |
| record: req_le:csd_req_duration_seconds_bucket:rate5m
expr: sum by(job, req, le) (instance_req_le:csd_req_duration_seconds_bucket:rate5m)
|
ok
|
|
46.099s ago
|
9.612ms |
| record: instance_csid_req:csd_req_duration_seconds_sum:rate5m
expr: rate(csd_req_duration_seconds_sum[5m])
|
ok
|
|
46.089s ago
|
5.847ms |
| record: instance_csid_req:csd_req_duration_seconds_count:rate5m
expr: rate(csd_req_duration_seconds_count[5m])
|
ok
|
|
46.084s ago
|
5.591ms |
| record: cluster_csd_disk_info
expr: sum without(fstype, mountpoint) (cluster_csd_info * on(node, mountpoint) group_right(csid) label_replace(node_filesystem_free_bytes{job="node"}, "device", "$1", "device", "/dev/([[:alpha:]]+(([[:digit:]]+n[[:digit:]]+)|(-[[:digit:]]+))?)p?[[:digit:]]*")) > bool 1
|
ok
|
|
46.078s ago
|
4.591ms |
| record: cluster_min_req_redundancy_number
expr: count by(service, failure_domain) (storage_policy_allocatable_space{scheme="17+3"}) * 20 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="17+2"}) * 19 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="17+1"}) * 18 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="15+3"}) * 18 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="15+2"}) * 17 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="15+1"}) * 16 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="13+3"}) * 16 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="13+2"}) * 15 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="13+1"}) * 14 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="11+3"}) * 14 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="11+2"}) * 13 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="11+1"}) * 12 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="9+3"}) * 12 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="9+2"}) * 11 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="9+1"}) * 10 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="7+3"}) * 10 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="7+2"}) * 9 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="7+1"}) * 8 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="5+3"}) * 8 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="5+2"}) * 7 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="5+1"}) * 6 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="3+2"}) * 5 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="3+1"}) * 4 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="2+1"}) * 3 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="1+2"}) * 3 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="1+1"}) * 2 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="1+0"}) or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="5"}) * 5 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="4"}) * 4 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="3"}) * 3 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="2"}) * 2 or count by(service, failure_domain) (storage_policy_allocatable_space{scheme="1"})
|
ok
|
|
46.074s ago
|
1.253ms |
|
52.06s ago |
227.4ms |
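Besides the usual instance-to-job roll-ups, the csd group joins per-chunk-server metrics with cluster_csd_info so the tier label becomes available on latency data. A sketch of that join rule as it appears above; the group name is an assumption:

```yaml
groups:
  - name: csd.recording           # group name is an assumption
    rules:
      # Attach the tier label from cluster_csd_info to per-CS latency data,
      # matching series on the csid label (one-to-many via group_left).
      - record: csd_io_op_time_seconds:with_tier
        expr: csd_io_op_time_seconds * on(csid) group_left(tier) cluster_csd_info
```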
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| record: instance:fused_read_bytes:rate5m
expr: rate(fused_read_bytes_total[5m])
|
ok
|
|
52.06s ago
|
311.5us |
| record: job:fused_read_bytes:rate5m
expr: sum by(job) (instance:fused_read_bytes:rate5m)
|
ok
|
|
52.06s ago
|
127.6us |
| record: instance:fused_write_bytes:rate5m
expr: rate(fused_write_bytes_total[5m])
|
ok
|
|
52.06s ago
|
272.3us |
| record: job:fused_write_bytes:rate5m
expr: sum by(job) (instance:fused_write_bytes:rate5m)
|
ok
|
|
52.06s ago
|
145.1us |
| record: instance:fused_reads:rate5m
expr: rate(fused_reads_total[5m])
|
ok
|
|
52.06s ago
|
174.3us |
| record: job:fused_reads:rate5m
expr: sum by(job) (instance:fused_reads:rate5m)
|
ok
|
|
52.06s ago
|
101.6us |
| record: instance:fused_writes:rate5m
expr: rate(fused_writes_total[5m])
|
ok
|
|
52.06s ago
|
138.7us |
| record: job:fused_writes:rate5m
expr: sum by(job) (instance:fused_writes:rate5m)
|
ok
|
|
52.059s ago
|
94.67us |
| record: instance:fused_flushes:rate5m
expr: rate(fused_flushes_total[5m])
|
ok
|
|
52.059s ago
|
124.4us |
| record: job:fused_flushes:rate5m
expr: sum by(job) (instance:fused_flushes:rate5m)
|
ok
|
|
52.059s ago
|
89.02us |
| record: instance:fused_unaligned_reads:rate5m
expr: rate(fused_unaligned_reads_total[5m])
|
ok
|
|
52.059s ago
|
130.1us |
| record: job:fused_unaligned_reads:rate5m
expr: sum by(job) (instance:fused_unaligned_reads:rate5m)
|
ok
|
|
52.059s ago
|
83.31us |
| record: instance:fused_unaligned_writes:rate5m
expr: rate(fused_unaligned_writes_total[5m])
|
ok
|
|
52.059s ago
|
122.5us |
| record: job:fused_unaligned_writes:rate5m
expr: sum by(job) (instance:fused_unaligned_writes:rate5m)
|
ok
|
|
52.059s ago
|
76.61us |
| record: instance:fused_cache_read_bytes:rate5m
expr: rate(fused_cache_read_bytes_total[5m])
|
ok
|
|
52.059s ago
|
256us |
| record: job:fused_cache_read_bytes:rate5m
expr: sum by(job) (instance:fused_cache_read_bytes:rate5m)
|
ok
|
|
52.059s ago
|
115.1us |
| record: instance:fused_cache_write_bytes:rate5m
expr: rate(fused_cache_write_bytes_total[5m])
|
ok
|
|
52.059s ago
|
144.6us |
| record: job:fused_cache_write_bytes:rate5m
expr: sum by(job) (instance:fused_cache_write_bytes:rate5m)
|
ok
|
|
52.059s ago
|
158.9us |
| record: instance:fused_cache_reads:rate5m
expr: rate(fused_cache_reads_total[5m])
|
ok
|
|
52.058s ago
|
159.5us |
| record: job:fused_cache_reads:rate5m
expr: sum by(job) (instance:fused_cache_reads:rate5m)
|
ok
|
|
52.058s ago
|
93.73us |
| record: instance:fused_cache_writes:rate5m
expr: rate(fused_cache_writes_total[5m])
|
ok
|
|
52.058s ago
|
193.1us |
| record: job:fused_cache_writes:rate5m
expr: sum by(job) (instance:fused_cache_writes:rate5m)
|
ok
|
|
52.058s ago
|
117.3us |
| record: job:fused_cache_read_pending_blocks:sum
expr: sum by(job) (fused_cache_read_pending_blocks)
|
ok
|
|
52.058s ago
|
121.4us |
| record: job:fused_cache_write_pending_blocks:sum
expr: sum by(job) (fused_cache_write_pending_blocks)
|
ok
|
|
52.058s ago
|
164.8us |
| record: instance:fused_ls_gc_processed_bytes:rate5m
expr: rate(fused_ls_gc_processed_bytes_total[5m])
|
ok
|
|
52.058s ago
|
157us |
| record: job:fused_ls_gc_processed_bytes:rate5m
expr: sum by(job) (instance:fused_ls_gc_processed_bytes:rate5m)
|
ok
|
|
52.058s ago
|
110.9us |
| record: instance:fused_ls_gc_reencoded_bytes:rate5m
expr: rate(fused_ls_gc_reencoded_bytes_total[5m])
|
ok
|
|
52.058s ago
|
125.3us |
| record: job:fused_ls_gc_reencoded_bytes:rate5m
expr: sum by(job) (instance:fused_ls_gc_reencoded_bytes:rate5m)
|
ok
|
|
52.057s ago
|
123.9us |
| record: job:fused_ls_gc_pending_bytes:sum
expr: sum by(job) (fused_ls_gc_pending_bytes)
|
ok
|
|
52.057s ago
|
152.2us |
| record: job:fused_ls_gc_pending_chunks:sum
expr: sum by(job) (fused_ls_gc_pending_chunks)
|
ok
|
|
52.057s ago
|
96.17us |
| record: job:fused_ls_gc_processing_chunks:sum
expr: sum by(job) (fused_ls_gc_processing_chunks)
|
ok
|
|
52.057s ago
|
117.2us |
| record: job:fused_ls_gc_reencoding_chunks:sum
expr: sum by(job) (fused_ls_gc_reencoding_chunks)
|
ok
|
|
52.057s ago
|
100.5us |
| record: job:fused_ls_open_containers:sum
expr: sum by(job) (fused_ls_open_containers)
|
ok
|
|
52.057s ago
|
119.6us |
| record: instance_op:fused_ops:rate5m
expr: rate(fused_op_duration_seconds_count[5m])
|
ok
|
|
52.057s ago
|
2.931ms |
| record: op:fused_ops:rate5m
expr: sum by(job, op) (instance_op:fused_ops:rate5m)
|
ok
|
|
52.054s ago
|
1.424ms |
| record: instance_op_le:fused_op_duration_seconds_bucket:rate5m
expr: rate(fused_op_duration_seconds_bucket[5m])
|
ok
|
|
52.053s ago
|
118.4ms |
| record: op_le:fused_op_duration_seconds_bucket:rate5m
expr: sum by(job, op, le) (instance_op_le:fused_op_duration_seconds_bucket:rate5m)
|
ok
|
|
51.934s ago
|
64.34ms |
| record: instance_op:fused_op_duration_seconds_count:rate5m
expr: rate(fused_op_duration_seconds_count[5m])
|
ok
|
|
51.87s ago
|
2.391ms |
| record: op:fused_op_duration_seconds_count:rate5m
expr: sum by(job, op) (instance_op:fused_op_duration_seconds_count:rate5m)
|
ok
|
|
51.868s ago
|
1.356ms |
| record: instance_op:fused_op_duration_seconds_sum:rate5m
expr: rate(fused_op_duration_seconds_sum[5m])
|
ok
|
|
51.866s ago
|
2.253ms |
| record: op:fused_op_duration_seconds_sum:rate5m
expr: sum by(job, op) (instance_op:fused_op_duration_seconds_sum:rate5m)
|
ok
|
|
51.864s ago
|
1.329ms |
| record: instance_req:fused_reqs:rate5m
expr: rate(fused_req_duration_seconds_count[5m])
|
ok
|
|
51.863s ago
|
616.9us |
| record: req:fused_reqs:rate5m
expr: sum by(job, req) (instance_req:fused_reqs:rate5m)
|
ok
|
|
51.862s ago
|
265.7us |
| record: instance_req_le:fused_req_duration_seconds_bucket:rate5m
expr: rate(fused_req_duration_seconds_bucket[5m])
|
ok
|
|
51.862s ago
|
16.15ms |
| record: req_le:fused_req_duration_seconds_bucket:rate5m
expr: sum by(job, req, le) (instance_req_le:fused_req_duration_seconds_bucket:rate5m)
|
ok
|
|
51.846s ago
|
9.793ms |
| record: instance_req:fused_req_duration_seconds_count:rate5m
expr: rate(fused_req_duration_seconds_count[5m])
|
ok
|
|
51.836s ago
|
484.6us |
| record: req:fused_req_duration_seconds_count:rate5m
expr: sum by(job, req) (instance_req:fused_req_duration_seconds_count:rate5m)
|
ok
|
|
51.836s ago
|
251.3us |
| record: instance_req:fused_req_duration_seconds_sum:rate5m
expr: rate(fused_req_duration_seconds_sum[5m])
|
ok
|
|
51.836s ago
|
459.2us |
| record: req:fused_req_duration_seconds_sum:rate5m
expr: sum by(job, req) (instance_req:fused_req_duration_seconds_sum:rate5m)
|
ok
|
|
51.835s ago
|
253.4us |
|
38.291s ago |
1.757ms |
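The fused histogram rules keep the le label when aggregating buckets, which is what allows latency quantiles to be computed later from the recorded series. A hypothetical downstream rule (not part of the rule set above; the record name and group name are assumptions) showing how req_le:fused_req_duration_seconds_bucket:rate5m could feed a per-request p99:

```yaml
groups:
  - name: fused.latency.derived   # hypothetical group, not in the table above
    rules:
      # p99 request latency per (job, req), computed from the recorded buckets;
      # this works only because the le label was preserved by the sum.
      - record: req:fused_req_duration_seconds:p99_5m   # hypothetical record name
        expr: histogram_quantile(0.99, req_le:fused_req_duration_seconds_bucket:rate5m)
```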
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| record: instance:mdsd_is_master:top1
expr: topk(1, mdsd_is_master)
|
ok
|
|
59.276s ago
|
272.8us |
| record: job:mdsd_cluster_raw_space_total:sum
expr: sum by(job) (mdsd_cluster_raw_space_total and instance:mdsd_is_master:top1)
|
ok
|
|
59.276s ago
|
186.2us |
| record: job:mdsd_cluster_raw_space_free:sum
expr: sum by(job) (mdsd_cluster_raw_space_free and instance:mdsd_is_master:top1)
|
ok
|
|
59.276s ago
|
179.4us |
| record: job:mdsd_cluster_licensed_space_bytes:sum
expr: sum by(job) (mdsd_cluster_licensed_space_bytes and instance:mdsd_is_master:top1)
|
ok
|
|
59.275s ago
|
200.9us |
| record: job:mdsd_cluster_to_replicate_chunks:sum
expr: sum by(job) (mdsd_cluster_to_replicate_chunks and instance:mdsd_is_master:top1)
|
ok
|
|
59.275s ago
|
165.6us |
| record: job:mdsd_cluster_replicated_chunks:rate5m
expr: sum by(job) (rate(mdsd_cluster_replicated_chunks_total[5m]) and instance:mdsd_is_master:top1)
|
ok
|
|
59.275s ago
|
158.7us |
| record: job:mdsd_cluster_rebalance_pending_chunks:sum
expr: sum by(job) (mdsd_cluster_rebalance_pending_chunks and instance:mdsd_is_master:top1)
|
ok
|
|
59.275s ago
|
199.8us |
| record: job:mdsd_cluster_rebalance_committing_chunks:sum
expr: sum by(job) (mdsd_cluster_rebalance_committing_chunks and instance:mdsd_is_master:top1)
|
ok
|
|
59.275s ago
|
131.3us |
| record: job:mdsd_fs_files:sum
expr: sum by(job) (mdsd_fs_files and instance:mdsd_is_master:top1)
|
ok
|
|
59.275s ago
|
105.2us |
| record: job:mdsd_fs_inodes:sum
expr: sum by(job) (mdsd_fs_inodes and instance:mdsd_is_master:top1)
|
ok
|
|
59.275s ago
|
103.7us |
| record: job:mdsd_fs_leases:sum
expr: sum by(job) (mdsd_fs_leases and instance:mdsd_is_master:top1)
|
ok
|
|
59.275s ago
|
160.1us |
| record: job:mdsd_fs_file_maps:sum
expr: sum by(job) (mdsd_fs_file_maps and instance:mdsd_is_master:top1)
|
ok
|
|
59.274s ago
|
140.6us |
| record: job:mdsd_fs_chunk_maps:sum
expr: sum by(job) (mdsd_fs_chunk_maps and instance:mdsd_is_master:top1)
|
ok
|
|
59.274s ago
|
109.8us |
| record: job:mdsd_fs_chunk_replicas:sum
expr: sum by(job) (mdsd_fs_chunk_replicas and instance:mdsd_is_master:top1)
|
ok
|
|
59.274s ago
|
146.2us |
| record: job:mdsd_fs_lease_clients:sum
expr: sum by(job) (mdsd_fs_lease_clients and instance:mdsd_is_master:top1)
|
ok
|
|
59.274s ago
|
137.3us |
| record: tier:mdsd_fs_space_bytes:sum
expr: sum by(job, tier) (mdsd_fs_space_bytes and ignoring(tier) instance:mdsd_is_master:top1)
|
ok
|
|
59.274s ago
|
183.3us |
| record: job:mdsd_fs_space_bytes:sum
expr: sum by(job) (tier:mdsd_fs_space_bytes:sum)
|
ok
|
|
59.274s ago
|
131.9us |
| record: tier:mdsd_fs_free_space_bytes:sum
expr: sum by(job, tier) (mdsd_fs_free_space_bytes and ignoring(tier) instance:mdsd_is_master:top1)
|
ok
|
|
59.274s ago
|
184us |
| record: job:mdsd_fs_free_space_bytes:sum
expr: sum by(job) (tier:mdsd_fs_free_space_bytes:sum)
|
ok
|
|
59.274s ago
|
112.1us |
| record: job:mdsd_fs_logical_size_bytes:sum
expr: sum by(job) (mdsd_fs_logical_size_bytes and instance:mdsd_is_master:top1)
|
ok
|
|
59.274s ago
|
157.2us |
| record: job:mdsd_fs_physical_size_bytes:sum
expr: sum by(job) (mdsd_fs_physical_size_bytes and instance:mdsd_is_master:top1)
|
ok
|
|
59.273s ago
|
116.3us |
| record: job:mdsd_fs_allocated_size_bytes:sum
expr: sum by(job) (mdsd_fs_allocated_size_bytes and instance:mdsd_is_master:top1)
|
ok
|
|
59.273s ago
|
137.7us |
| record: master:mdsd_cs_status
expr: mdsd_cs_status and ignoring(csid, status) instance:mdsd_is_master:top1
|
ok
|
|
59.273s ago
|
756.6us |
| record: csid_status:mdsd_cs_chunks:sum
expr: sum by(job, csid, status) (mdsd_cs_chunks and ignoring(csid, status) instance:mdsd_is_master:top1)
|
ok
|
|
59.273s ago
|
8.205ms |
| record: csid:mdsd_cs_chunks:sum
expr: sum by(job, csid) (csid_status:mdsd_cs_chunks:sum)
|
ok
|
|
59.264s ago
|
3.355ms |
| record: status:mdsd_cs_chunks:sum
expr: sum by(job, status) (csid_status:mdsd_cs_chunks:sum)
|
ok
|
|
59.261s ago
|
2.987ms |
| record: csid:mdsd_cs_unique_chunks:sum
expr: mdsd_cs_unique_chunks and ignoring(csid) instance:mdsd_is_master:top1
|
ok
|
|
59.258s ago
|
966.6us |
| record: csid_type:mdsd_cs_chunk_allocation_cost:sum
expr: sum by(job, csid, type) (mdsd_cs_chunk_allocation_cost and ignoring(csid, type) instance:mdsd_is_master:top1)
|
ok
|
|
59.257s ago
|
1.854ms |
| record: csid:mdsd_cs_chunk_allocation_cost:sum
expr: sum by(job, csid) (csid_type:mdsd_cs_chunk_allocation_cost:sum)
|
ok
|
|
59.255s ago
|
967.5us |
| record: csid:mdsd_cs_space_bytes:sum
expr: sum by(job, csid) (mdsd_cs_space_bytes and ignoring(csid) instance:mdsd_is_master:top1)
|
ok
|
|
59.254s ago
|
796.9us |
| record: job:mdsd_cs_space_bytes:sum
expr: sum by(job) (csid:mdsd_cs_space_bytes:sum)
|
ok
|
|
59.254s ago
|
343.3us |
| record: csid:mdsd_cs_free_space_bytes:sum
expr: sum by(job, csid) (mdsd_cs_free_space_bytes and ignoring(csid) instance:mdsd_is_master:top1)
|
ok
|
|
59.253s ago
|
974.6us |
| record: job:mdsd_cs_free_space_bytes:sum
expr: sum by(job) (csid:mdsd_cs_free_space_bytes:sum)
|
ok
|
|
59.252s ago
|
460.7us |
| record: csid:mdsd_cs_avail_space_bytes:sum
expr: sum by(job, csid) (mdsd_cs_avail_space_bytes and ignoring(csid) instance:mdsd_is_master:top1)
|
ok
|
|
59.252s ago
|
965.2us |
| record: job:mdsd_cs_avail_space_bytes:sum
expr: sum by(job) (csid:mdsd_cs_avail_space_bytes:sum)
|
ok
|
|
59.251s ago
|
474us |
| record: instance_req_err:mdsd_requests:rate5m
expr: rate(mdsd_requests_total[5m])
|
ok
|
|
59.251s ago
|
349.7us |
| record: req_err:mdsd_requests:rate5m
expr: sum by(job, req, err) (instance_req_err:mdsd_requests:rate5m)
|
ok
|
|
59.25s ago
|
362.2us |
| record: instance_req:mdsd_requests_sent:rate5m
expr: rate(mdsd_requests_sent_total[5m])
|
ok
|
|
59.25s ago
|
111us |
| record: req:mdsd_requests_sent:rate5m
expr: sum by(job, req) (instance_req:mdsd_requests_sent:rate5m)
|
ok
|
|
59.25s ago
|
92.91us |
| record: instance_req_le:mdsd_request_duration_seconds_bucket:rate5m
expr: rate(mdsd_request_duration_seconds_bucket[5m])
|
ok
|
|
59.25s ago
|
5.452ms |
| record: req_le:mdsd_request_duration_seconds_bucket:rate5m
expr: sum by(job, req, le) (instance_req_le:mdsd_request_duration_seconds_bucket:rate5m)
|
ok
|
|
59.244s ago
|
4.732ms |
| record: instance_req:mdsd_request_duration_seconds_sum:rate5m
expr: rate(mdsd_request_duration_seconds_sum[5m])
|
ok
|
|
59.24s ago
|
298.2us |
| record: instance_req:mdsd_request_duration_seconds_count:rate5m
expr: rate(mdsd_request_duration_seconds_count[5m])
|
ok
|
|
59.239s ago
|
264.3us |
| record: cluster_mdsd_disk_info
expr: sum without(fstype, mountpoint) (cluster_mdsd_info * on(node, mountpoint) group_right(mdsid) label_replace(node_filesystem_free_bytes{job="node"}, "device", "$1", "device", "/dev/([[:alpha:]]+)[[:digit:]]+")) > bool 1
|
ok
|
|
59.239s ago
|
4.269ms |
|
22.582s ago |
863us |
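Most mdsd rules gate cluster-wide gauges on the current MDS master by and-ing them with instance:mdsd_is_master:top1, so only the master's copy of each metric survives the aggregation. A sketch of that pattern, with expressions copied from the rows above; the group name is an assumption:

```yaml
groups:
  - name: mdsd.recording          # group name is an assumption
    rules:
      # Pick the single master instance.
      - record: instance:mdsd_is_master:top1
        expr: topk(1, mdsd_is_master)
      # Keep only the master's view of the cluster-wide gauge, then sum per job.
      - record: job:mdsd_cluster_raw_space_total:sum
        expr: sum by(job) (mdsd_cluster_raw_space_total and instance:mdsd_is_master:top1)
```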
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| record: instance:node_up
expr: (1 - changes(node_boot_time_seconds{job="node"}[5m]) > bool 0) and up{job="node"}
|
ok
|
|
2.866s ago
|
449.5us |
| record: job:node_up:sum
expr: sum by(job) (instance:node_up{job="node"})
|
ok
|
|
2.865s ago
|
169.7us |
| record: instance_cpu_mode:node_cpu_seconds:rate5m
expr: rate(node_cpu_seconds_total{job="node"}[5m])
|
ok
|
|
2.865s ago
|
85.04ms |
| record: instance_mode:node_cpu_seconds:avg_rate5m
expr: avg without(cpu) (instance_cpu_mode:node_cpu_seconds:rate5m{job="node"})
|
ok
|
|
2.78s ago
|
44.46ms |
| record: instance:node_cpu_seconds:rate5m
expr: sum by(job, instance, node) (1 - instance_mode:node_cpu_seconds:avg_rate5m{job="node",mode="idle"})
|
ok
|
|
2.736s ago
|
182.3us |
| record: instance:node_cpu_seconds:count
expr: count by(job, instance, node) (node_cpu_seconds_total{job="node",mode="idle"})
|
ok
|
|
2.736s ago
|
5.325ms |
| record: instance:node_context_switches:rate5m
expr: rate(node_context_switches_total{job="node"}[5m])
|
ok
|
|
2.73s ago
|
137.2us |
| record: job:node_context_switches:rate5m
expr: sum by(job) (instance:node_context_switches:rate5m{job="node"})
|
ok
|
|
2.73s ago
|
89.48us |
| record: instance:node_forks:rate5m
expr: rate(node_forks_total{job="node"}[5m])
|
ok
|
|
2.73s ago
|
296.7us |
| record: job:node_forks:rate5m
expr: sum by(job) (instance:node_forks:rate5m{job="node"})
|
ok
|
|
2.73s ago
|
235.2us |
| record: instance:node_intr:rate5m
expr: rate(node_intr_total{job="node"}[5m])
|
ok
|
|
2.73s ago
|
266.5us |
| record: job:node_intr:rate5m
expr: sum by(job) (instance:node_intr:rate5m{job="node"})
|
ok
|
|
2.73s ago
|
139.6us |
| record: job:node_procs_running:sum
expr: sum by(job) (node_procs_running{job="node"})
|
ok
|
|
2.729s ago
|
135.9us |
| record: job:node_procs_blocked:sum
expr: sum by(job) (node_procs_blocked{job="node"})
|
ok
|
|
2.729s ago
|
101.6us |
| record: job:node_load1:sum
expr: sum by(job) (node_load1{job="node"})
|
ok
|
|
2.729s ago
|
134.3us |
| record: job:node_load5:sum
expr: sum by(job) (node_load5{job="node"})
|
ok
|
|
2.729s ago
|
158.8us |
| record: job:node_load15:sum
expr: sum by(job) (node_load15{job="node"})
|
ok
|
|
2.729s ago
|
143.6us |
| record: job:node_filefd_allocated:sum
expr: sum by(job) (node_filefd_allocated{job="node"})
|
ok
|
|
2.729s ago
|
130.5us |
| record: job:node_filefd_maximum:sum
expr: sum by(job) (node_filefd_maximum{job="node"})
|
ok
|
|
2.729s ago
|
96.33us |
| record: instance:node_vmstat_pswpout_bytes:rate5m
expr: rate(node_vmstat_pswpout{job="node"}[5m]) * 1024
|
ok
|
|
2.729s ago
|
180.4us |
| record: job:node_vmstat_pswpout_bytes:rate5m
expr: sum by(job) (instance:node_vmstat_pswpout_bytes:rate5m{job="node"})
|
ok
|
|
2.729s ago
|
89.65us |
| record: instance:node_vmstat_pswpin_bytes:rate5m
expr: rate(node_vmstat_pswpin{job="node"}[5m]) * 1024
|
ok
|
|
2.729s ago
|
131.1us |
| record: job:node_vmstat_pswpin_bytes:rate5m
expr: sum by(job) (instance:node_vmstat_pswpin_bytes:rate5m{job="node"})
|
ok
|
|
2.728s ago
|
127us |
| record: instance_device:node_disk_read_bytes:rate5m
expr: rate(node_disk_read_bytes_total{job="node"}[5m])
|
ok
|
|
2.728s ago
|
1.075ms |
| record: instance:node_disk_read_bytes:rate5m
expr: sum without(device) (instance_device:node_disk_read_bytes:rate5m{job="node"})
|
ok
|
|
2.727s ago
|
574.2us |
| record: job:node_disk_read_bytes:rate5m
expr: sum by(job) (instance:node_disk_read_bytes:rate5m{job="node"})
|
ok
|
|
2.727s ago
|
147.4us |
| record: instance_device:node_disk_written_bytes:rate5m
expr: rate(node_disk_written_bytes_total{job="node"}[5m])
|
ok
|
|
2.727s ago
|
1.116ms |
| record: instance:node_disk_written_bytes:rate5m
expr: sum without(device) (instance_device:node_disk_written_bytes:rate5m{job="node"})
|
ok
|
|
2.726s ago
|
560.3us |
| record: job:node_disk_written_bytes:rate5m
expr: sum by(job) (instance:node_disk_written_bytes:rate5m{job="node"})
|
ok
|
|
2.725s ago
|
125.4us |
| record: instance_device:node_disk_reads_completed:rate5m
expr: rate(node_disk_reads_completed_total{job="node"}[5m])
|
ok
|
|
2.725s ago
|
1.013ms |
| record: instance:node_disk_reads_completed:rate5m
expr: sum without(device) (instance_device:node_disk_reads_completed:rate5m{job="node"})
|
ok
|
|
2.724s ago
|
535.5us |
| record: job:node_disk_reads_completed:rate5m
expr: sum by(job) (instance:node_disk_reads_completed:rate5m{job="node"})
|
ok
|
|
2.724s ago
|
97.44us |
| record: instance_device:node_disk_writes_completed:rate5m
expr: rate(node_disk_writes_completed_total{job="node"}[5m])
|
ok
|
|
2.723s ago
|
1ms |
| record: instance:node_disk_writes_completed:rate5m
expr: sum without(device) (instance_device:node_disk_writes_completed:rate5m{job="node"})
|
ok
|
|
2.723s ago
|
541.6us |
| record: job:node_disk_writes_completed:rate5m
expr: sum by(job) (instance:node_disk_writes_completed:rate5m{job="node"})
|
ok
|
|
2.722s ago
|
87.84us |
| record: instance_device:node_disk_io_time_seconds:rate5m
expr: rate(node_disk_io_time_seconds_total{job="node"}[5m])
|
ok
|
|
2.722s ago
|
973.6us |
| record: instance_device:node_disk_read_time_seconds:rate5m
expr: rate(node_disk_read_time_seconds_total{job="node"}[5m])
|
ok
|
|
2.721s ago
|
943.5us |
| record: instance_device:node_disk_write_time_seconds:rate5m
expr: rate(node_disk_write_time_seconds_total{job="node"}[5m])
|
ok
|
|
2.72s ago
|
974.1us |
| record: instance_device:node_disk_io_time_weighted_seconds:rate5m
expr: rate(node_disk_io_time_weighted_seconds_total{job="node"}[5m])
|
ok
|
|
2.719s ago
|
973.3us |
| record: instance_device:node_network_receive_bytes:rate5m
expr: rate(node_network_receive_bytes_total{job="node"}[5m])
|
ok
|
|
2.718s ago
|
2.527ms |
| record: instance:node_network_receive_bytes:rate5m
expr: sum without(device) (instance_device:node_network_receive_bytes:rate5m{job="node"})
|
ok
|
|
2.716s ago
|
1.238ms |
| record: job:node_network_receive_bytes:rate5m
expr: sum by(job) (instance:node_network_receive_bytes:rate5m{job="node"})
|
ok
|
|
2.715s ago
|
96.89us |
| record: instance_device:node_network_transmit_bytes:rate5m
expr: rate(node_network_transmit_bytes_total{job="node"}[5m])
|
ok
|
|
2.714s ago
|
2.367ms |
| record: instance:node_network_transmit_bytes:rate5m
expr: sum without(device) (instance_device:node_network_transmit_bytes:rate5m{job="node"})
|
ok
|
|
2.712s ago
|
1.255ms |
| record: job:node_network_transmit_bytes:rate5m
expr: sum by(job) (instance:node_network_transmit_bytes:rate5m{job="node"})
|
ok
|
|
2.711s ago
|
107.8us |
| record: instance_device:node_network_receive_packets:rate5m
expr: rate(node_network_receive_packets_total{job="node"}[5m])
|
ok
|
|
2.711s ago
|
2.352ms |
| record: instance:node_network_receive_packets:rate5m
expr: sum without(device) (instance_device:node_network_receive_packets:rate5m{job="node"})
|
ok
|
|
2.709s ago
|
1.246ms |
| record: job:node_network_receive_packets:rate5m
expr: sum by(job) (instance:node_network_receive_packets:rate5m{job="node"})
|
ok
|
|
2.707s ago
|
97.17us |
| record: instance_device:node_network_transmit_packets:rate5m
expr: rate(node_network_transmit_packets_total{job="node"}[5m])
|
ok
|
|
2.707s ago
|
2.32ms |
| record: instance:node_network_transmit_packets:rate5m
expr: sum without(device) (instance_device:node_network_transmit_packets:rate5m{job="node"})
|
ok
|
|
2.705s ago
|
1.226ms |
| record: job:node_network_transmit_packets:rate5m
expr: sum by(job) (instance:node_network_transmit_packets:rate5m{job="node"})
|
ok
|
|
2.704s ago
|
91.89us |
| record: instance_device:node_network_receive_drop:rate5m
expr: rate(node_network_receive_drop_total{job="node"}[5m])
|
ok
|
|
2.704s ago
|
2.175ms |
| record: instance:node_network_receive_drop:rate5m
expr: sum without(device) (instance_device:node_network_receive_drop:rate5m{job="node"})
|
ok
|
|
2.702s ago
|
1.098ms |
| record: job:node_network_receive_drop:rate5m
expr: sum by(job) (instance:node_network_receive_drop:rate5m{job="node"})
|
ok
|
|
2.701s ago
|
169.3us |
| record: instance_device:node_network_transmit_drop:rate5m
expr: rate(node_network_transmit_drop_total{job="node"}[5m])
|
ok
|
|
2.7s ago
|
2.649ms |
| record: instance:node_network_transmit_drop:rate5m
expr: sum without(device) (instance_device:node_network_transmit_drop:rate5m{job="node"})
|
ok
|
|
2.698s ago
|
1.161ms |
| record: job:node_network_transmit_drop:rate5m
expr: sum by(job) (instance:node_network_transmit_drop:rate5m{job="node"})
|
ok
|
|
2.697s ago
|
134.2us |
| record: instance_device:node_network_receive_errs:rate5m
expr: rate(node_network_receive_errs_total{job="node"}[5m])
|
ok
|
|
2.697s ago
|
2.269ms |
| record: instance:node_network_receive_errs:rate5m
expr: sum without(device) (instance_device:node_network_receive_errs:rate5m{job="node"})
|
ok
|
|
2.694s ago
|
1.095ms |
| record: job:node_network_receive_errs:rate5m
expr: sum by(job) (instance:node_network_receive_errs:rate5m{job="node"})
|
ok
|
|
2.693s ago
|
89.08us |
| record: instance_device:node_network_transmit_errs:rate5m
expr: rate(node_network_transmit_errs_total{job="node"}[5m])
|
ok
|
|
2.693s ago
|
2.301ms |
| record: instance:node_network_transmit_errs:rate5m
expr: sum without(device) (instance_device:node_network_transmit_errs:rate5m{job="node"})
|
ok
|
|
2.691s ago
|
1.093ms |
| record: job:node_network_transmit_errs:rate5m
expr: sum by(job) (instance:node_network_transmit_errs:rate5m{job="node"})
|
ok
|
|
2.69s ago
|
147.6us |
| record: instance_device:node_network_receive_fifo:rate5m
expr: rate(node_network_receive_fifo_total{job="node"}[5m])
|
ok
|
|
2.69s ago
|
2.489ms |
| record: instance:node_network_receive_fifo:rate5m
expr: sum without(device) (instance_device:node_network_receive_fifo:rate5m{job="node"})
|
ok
|
|
2.687s ago
|
1.097ms |
| record: job:node_network_receive_fifo:rate5m
expr: sum by(job) (instance:node_network_receive_fifo:rate5m{job="node"})
|
ok
|
|
2.686s ago
|
121.8us |
| record: instance_device:node_network_transmit_fifo:rate5m
expr: rate(node_network_transmit_fifo_total{job="node"}[5m])
|
ok
|
|
2.686s ago
|
2.49ms |
| record: instance:node_network_transmit_fifo:rate5m
expr: sum without(device) (instance_device:node_network_transmit_fifo:rate5m{job="node"})
|
ok
|
|
2.684s ago
|
1.106ms |
| record: job:node_network_transmit_fifo:rate5m
expr: sum by(job) (instance:node_network_transmit_fifo:rate5m{job="node"})
|
ok
|
|
2.683s ago
|
89.58us |
| record: instance_device:node_roce_rx_bytes:rate5m
expr: rate(node_roce_rx_bytes{job="node"}[5m])
|
ok
|
|
2.683s ago
|
47.45us |
| record: instance_device:node_roce_tx_bytes:rate5m
expr: rate(node_roce_tx_bytes{job="node"}[5m])
|
ok
|
|
2.683s ago
|
39.43us |
| record: instance_device:node_roce_rx_pkts:rate5m
expr: rate(node_roce_rx_pkts{job="node"}[5m])
|
ok
|
|
2.683s ago
|
38.25us |
| record: instance_device:node_roce_tx_pkts:rate5m
expr: rate(node_roce_tx_pkts{job="node"}[5m])
|
ok
|
|
2.683s ago
|
34.05us |
| record: instance_device:node_roce_rx_roce_drops:rate5m
expr: rate(node_roce_rx_roce_drops{job="node"}[5m])
|
ok
|
|
2.683s ago
|
75.44us |
| record: instance_device:node_roce_tx_roce_drops:rate5m
expr: rate(node_roce_tx_roce_drops{job="node"}[5m])
|
ok
|
|
2.683s ago
|
86.14us |
| record: instance_device:node_roce_bad_resp_err:rate5m
expr: rate(node_roce_bad_resp_err[5m])
|
ok
|
|
2.682s ago
|
71.2us |
| record: instance_device:node_roce_local_protection_err:rate5m
expr: rate(node_roce_local_protection_err[5m])
|
ok
|
|
2.682s ago
|
88.38us |
| record: instance_device:node_roce_local_qp_op_err:rate5m
expr: rate(node_roce_local_qp_op_err[5m])
|
ok
|
|
2.682s ago
|
46.63us |
| record: instance_device:node_roce_mem_mgmt_op_err:rate5m
expr: rate(node_roce_mem_mgmt_op_err[5m])
|
ok
|
|
2.682s ago
|
54.5us |
| record: instance_device:node_roce_recoverable_errors:rate5m
expr: rate(node_roce_recoverable_errors[5m])
|
ok
|
|
2.682s ago
|
44.68us |
| record: instance_device:node_roce_remote_access_err:rate5m
expr: rate(node_roce_remote_access_err[5m])
|
ok
|
|
2.682s ago
|
32.93us |
| record: instance_device:node_roce_remote_invalid_req_err:rate5m
expr: rate(node_roce_remote_invalid_req_err[5m])
|
ok
|
|
2.682s ago
|
35.26us |
| record: instance_device:node_roce_remote_op_err:rate5m
expr: rate(node_roce_remote_op_err[5m])
|
ok
|
|
2.682s ago
|
33.59us |
| record: instance_device:node_roce_res_cmp_err:rate5m
expr: rate(node_roce_res_cmp_err[5m])
|
ok
|
|
2.682s ago
|
26.67us |
| record: instance_device:node_roce_res_cq_load_err:rate5m
expr: rate(node_roce_res_cq_load_err[5m])
|
ok
|
|
2.682s ago
|
29.64us |
| record: instance_device:node_roce_res_mem_err:rate5m
expr: rate(node_roce_res_mem_err[5m])
|
ok
|
|
2.682s ago
|
30.45us |
| record: instance_device:node_roce_res_opcode_err:rate5m
expr: rate(node_roce_res_opcode_err[5m])
|
ok
|
|
2.682s ago
|
32.86us |
| record: instance_device:node_roce_res_rem_inv_err:rate5m
expr: rate(node_roce_res_rem_inv_err[5m])
|
ok
|
|
2.682s ago
|
31.47us |
| record: instance_device:node_roce_res_rx_domain_err:rate5m
expr: rate(node_roce_res_rx_domain_err[5m])
|
ok
|
|
2.682s ago
|
58.65us |
| record: instance_device:node_roce_res_rx_pci_err:rate5m
expr: rate(node_roce_res_rx_pci_err[5m])
|
ok
|
|
2.682s ago
|
66.55us |
| record: instance_device:node_roce_res_rx_range_err:rate5m
expr: rate(node_roce_res_rx_range_err[5m])
|
ok
|
|
2.682s ago
|
72.09us |
| record: instance_device:node_roce_res_srq_err:rate5m
expr: rate(node_roce_res_srq_err[5m])
|
ok
|
|
2.682s ago
|
62.97us |
| record: instance_device:node_roce_res_srq_load_err:rate5m
expr: rate(node_roce_res_srq_load_err[5m])
|
ok
|
|
2.682s ago
|
50.58us |
| record: instance_device:node_roce_res_tx_domain_err:rate5m
expr: rate(node_roce_res_tx_domain_err[5m])
|
ok
|
|
2.682s ago
|
64.57us |
| record: instance_device:node_roce_res_tx_pci_err:rate5m
expr: rate(node_roce_res_tx_pci_err[5m])
|
ok
|
|
2.682s ago
|
64.42us |
| record: instance_device:node_roce_res_tx_range_err:rate5m
expr: rate(node_roce_res_tx_range_err[5m])
|
ok
|
|
2.682s ago
|
77.83us |
| record: instance_device:node_roce_res_wqe_format_err:rate5m
expr: rate(node_roce_res_wqe_format_err[5m])
|
ok
|
|
2.682s ago
|
64.75us |
| record: instance_device:node_roce_seq_err_naks_rcvd:rate5m
expr: rate(node_roce_seq_err_naks_rcvd[5m])
|
ok
|
|
2.682s ago
|
56.65us |
| record: instance_device:node_roce_unrecoverable_err:rate5m
expr: rate(node_roce_unrecoverable_err[5m])
|
ok
|
|
2.682s ago
|
48.31us |
| record: instance_device:node_infiniband_port_data_received_bytes_total:rate5m
expr: rate(node_infiniband_port_data_received_bytes_total{job="node"}[5m])
|
ok
|
|
2.682s ago
|
261.1us |
| record: instance_device:node_infiniband_port_data_transmitted_bytes_total:rate5m
expr: rate(node_infiniband_port_data_transmitted_bytes_total{job="node"}[5m])
|
ok
|
|
2.682s ago
|
284.8us |
| record: instance_device:node_infiniband_port_packets_received_total:rate5m
expr: rate(node_infiniband_port_packets_received_total{job="node"}[5m])
|
ok
|
|
2.682s ago
|
70.68us |
| record: instance_device:node_infiniband_port_packets_transmitted_total:rate5m
expr: rate(node_infiniband_port_packets_transmitted_total{job="node"}[5m])
|
ok
|
|
2.682s ago
|
285.6us |
| record: instance_device:node_infiniband_excessive_buffer_overrun_errors_total:rate5m
expr: rate(node_infiniband_excessive_buffer_overrun_errors_total[5m])
|
ok
|
|
2.681s ago
|
253.1us |
| record: instance_device:node_infiniband_link_error_recovery_total:rate5m
expr: rate(node_infiniband_link_error_recovery_total[5m])
|
ok
|
|
2.681s ago
|
227.2us |
| record: instance_device:node_infiniband_local_link_integrity_errors_total:rate5m
expr: rate(node_infiniband_local_link_integrity_errors_total[5m])
|
ok
|
|
2.681s ago
|
223.9us |
| record: instance_device:node_infiniband_port_constraint_errors_received_total:rate5m
expr: rate(node_infiniband_port_constraint_errors_received_total[5m])
|
ok
|
|
2.681s ago
|
241.2us |
| record: instance_device:node_infiniband_port_constraint_errors_transmitted_total:rate5m
expr: rate(node_infiniband_port_constraint_errors_transmitted_total[5m])
|
ok
|
|
2.681s ago
|
277us |
| record: instance_device:node_infiniband_port_errors_received_total:rate5m
expr: rate(node_infiniband_port_errors_received_total[5m])
|
ok
|
|
2.68s ago
|
242.5us |
| record: instance_device:node_infiniband_port_receive_remote_physical_errors_total:rate5m
expr: rate(node_infiniband_port_receive_remote_physical_errors_total[5m])
|
ok
|
|
2.68s ago
|
244.2us |
| record: instance_device:node_infiniband_port_receive_switch_relay_errors_total:rate5m
expr: rate(node_infiniband_port_receive_switch_relay_errors_total[5m])
|
ok
|
|
2.68s ago
|
244us |
| record: instance_device:node_infiniband_symbol_error_total:rate5m
expr: rate(node_infiniband_symbol_error_total[5m])
|
ok
|
|
2.68s ago
|
235.2us |
| record: instance:node_netstat_Tcp_ActiveOpens:rate5m
expr: rate(node_netstat_Tcp_ActiveOpens{job="node"}[5m])
|
ok
|
|
2.679s ago
|
130.8us |
| record: job:node_netstat_Tcp_ActiveOpens:rate5m
expr: sum by(job) (instance:node_netstat_Tcp_ActiveOpens:rate5m{job="node"})
|
ok
|
|
2.679s ago
|
95.21us |
| record: instance:node_netstat_Tcp_PassiveOpens:rate5m
expr: rate(node_netstat_Tcp_PassiveOpens{job="node"}[5m])
|
ok
|
|
2.679s ago
|
123.4us |
| record: job:node_netstat_Tcp_PassiveOpens:rate5m
expr: sum by(job) (instance:node_netstat_Tcp_PassiveOpens:rate5m{job="node"})
|
ok
|
|
2.679s ago
|
92.64us |
| record: instance:node_netstat_Udp_InDatagrams:rate5m
expr: rate(node_netstat_Udp_InDatagrams{job="node"}[5m])
|
ok
|
|
2.679s ago
|
127us |
| record: job:node_netstat_Udp_InDatagrams:rate5m
expr: sum by(job) (instance:node_netstat_Udp_InDatagrams:rate5m{job="node"})
|
ok
|
|
2.679s ago
|
128.2us |
| record: instance:node_netstat_Udp_OutDatagrams:rate5m
expr: rate(node_netstat_Udp_OutDatagrams{job="node"}[5m])
|
ok
|
|
2.679s ago
|
181.8us |
| record: job:node_netstat_Udp_OutDatagrams:rate5m
expr: sum by(job) (instance:node_netstat_Udp_OutDatagrams:rate5m{job="node"})
|
ok
|
|
2.679s ago
|
96.76us |
| record: instance:node_netstat_Udp_InErrors:rate5m
expr: rate(node_netstat_Udp_InErrors{job="node"}[5m])
|
ok
|
|
2.679s ago
|
150.1us |
| record: job:node_netstat_Udp_InErrors:rate5m
expr: sum by(job) (instance:node_netstat_Udp_InErrors:rate5m{job="node"})
|
ok
|
|
2.679s ago
|
117.3us |
|
39.892s ago |
8.002ms |
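The netstat recordings in this group come in pairs: a per-instance rate() and a job-level sum over the recorded series. A minimal sketch of how one such pair would appear in a Prometheus rule file (the group name and file layout are assumptions; the expressions are the ones listed above):

    groups:
      - name: node-netstat.rules
        rules:
          - record: instance:node_netstat_Udp_InErrors:rate5m
            expr: rate(node_netstat_Udp_InErrors{job="node"}[5m])
          - record: job:node_netstat_Udp_InErrors:rate5m
            expr: sum by (job) (instance:node_netstat_Udp_InErrors:rate5m{job="node"})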
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| record: openstack_exporter:openstack_identity_project_info:relabel
expr: openstack_identity_domain_info * on(domain_id) group_right(domain_name) label_replace(openstack_identity_project_info, "tenant_id", "$1", "id", "(.*)")
|
ok
|
|
35.741s ago
|
2.521ms |
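This rule builds the join helper used by the domain-level rules below: label_replace() copies the project's id label into tenant_id, and the many-to-one multiplication with group_right(domain_name) attaches the owning domain's name to every project-info series. Assuming both _info metrics have the value 1, the result looks roughly like this (label names other than tenant_id, domain_id, and domain_name are assumptions):

    openstack_exporter:openstack_identity_project_info:relabel{
        tenant_id="<project id>", domain_id="<domain id>", domain_name="<domain name>"
    }  1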
| record: openstack_exporter:project_vcpu_quota:ratio
expr: sum by(tenant, tenant_id) (openstack_nova_limits_vcpus_used / (openstack_nova_limits_vcpus_max > 0) * 100 or openstack_nova_limits_vcpus_used * 0)
|
ok
|
|
35.739s ago
|
2.627ms |
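The :ratio rules in this group all follow the same idiom: divide usage by the limit only where the limit is positive, then fall back to 0 via or so the series is still present for projects with an unlimited (or zero) quota. The general shape, with placeholder metric names:

    sum by (tenant, tenant_id) (
        <used> / (<max> > 0) * 100   # percentage only where a positive limit exists
      or
        <used> * 0                   # otherwise emit 0 and keep the series
    )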
| record: openstack_exporter:project_vcpu_quota:usage
expr: sum by(tenant, tenant_id) (openstack_nova_limits_vcpus_used)
|
ok
|
|
35.736s ago
|
852.2us |
| record: openstack_exporter:project_vcpu_quota:max
expr: sum by(tenant, tenant_id) (openstack_nova_limits_vcpus_max)
|
ok
|
|
35.735s ago
|
838.9us |
| record: openstack_exporter:project_vcpus_quota_usage:relabel
expr: label_replace(openstack_nova_limits_vcpus_used, "graph_type", "project-usage", "", "")
|
ok
|
|
35.734s ago
|
1.173ms |
| record: openstack_exporter:project_vcpus_quota_max:relabel
expr: label_replace(openstack_nova_limits_vcpus_max, "graph_type", "project-max", "", "")
|
ok
|
|
35.733s ago
|
1.072ms |
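The :relabel rules rely on a label_replace() detail: a missing source label has the value "", and the empty (anchored) regular expression always matches it, so the call simply attaches a constant graph_type label and leaves everything else untouched. For example:

    label_replace(openstack_nova_limits_vcpus_max, "graph_type", "project-max", "", "")
        # => openstack_nova_limits_vcpus_max{..., graph_type="project-max"}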
| record: openstack_exporter:domain_vcpus_quota:usage
expr: sum by(domain_id, domain_name) (openstack_nova_limits_vcpus_used * on(tenant_id) group_left(domain_name) openstack_exporter:openstack_identity_project_info:relabel)
|
ok
|
|
35.732s ago
|
1.146ms |
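The domain-level :usage rules do the opposite join: per-project usage is matched to the relabelled project-info helper on tenant_id, group_left(domain_name) copies the domain name from the info series onto each usage series, and the result is summed per domain. The general shape, with a placeholder usage metric:

    sum by (domain_id, domain_name) (
        <project_usage_metric>
          * on (tenant_id) group_left (domain_name)
            openstack_exporter:openstack_identity_project_info:relabel
    )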
| record: openstack_exporter:domain_vcpus_quota_usage:relabel
expr: label_replace(openstack_exporter:domain_vcpus_quota:usage, "graph_type", "domain-usage", "", "")
|
ok
|
|
35.731s ago
|
268.1us |
| record: openstack_exporter:project_memory_quota:ratio
expr: sum by(tenant, tenant_id) (openstack_nova_limits_memory_used / (openstack_nova_limits_memory_max > 0) * 100 or openstack_nova_limits_memory_used * 0)
|
ok
|
|
35.731s ago
|
3.051ms |
| record: openstack_exporter:project_memory_quota:usage
expr: sum by(tenant, tenant_id) (openstack_nova_limits_memory_used)
|
ok
|
|
35.728s ago
|
888.4us |
| record: openstack_exporter:project_memory_quota:max
expr: sum by(tenant, tenant_id) (openstack_nova_limits_memory_max)
|
ok
|
|
35.727s ago
|
886.5us |
| record: openstack_exporter:project_memory_quota_usage:relabel
expr: label_replace(openstack_nova_limits_memory_used, "graph_type", "project-usage", "", "")
|
ok
|
|
35.726s ago
|
1.167ms |
| record: openstack_exporter:project_memory_quota_max:relabel
expr: label_replace(openstack_nova_limits_memory_max, "graph_type", "project-max", "", "")
|
ok
|
|
35.725s ago
|
1.185ms |
| record: openstack_exporter:domain_memory_quota:usage
expr: sum by(domain_id, domain_name) (openstack_nova_limits_memory_used * on(tenant_id) group_left(domain_name) openstack_exporter:openstack_identity_project_info:relabel)
|
ok
|
|
35.724s ago
|
1.605ms |
| record: openstack_exporter:domain_memory_quota_usage:relabel
expr: label_replace(openstack_exporter:domain_memory_quota:usage, "graph_type", "domain-usage", "", "")
|
ok
|
|
35.722s ago
|
140.7us |
| record: openstack_exporter:project_storage_policy_quota:ratio
expr: sum by(tenant, tenant_id, volume_type) (openstack_cinder_limits_volume_storage_policy_used_gb{volume_type!=""} / on(tenant, tenant_id, volume_type) (openstack_cinder_limits_volume_storage_policy_max_gb{volume_type!=""} > 0) * 100 or openstack_cinder_limits_volume_storage_policy_used_gb{volume_type!=""} * 0)
|
ok
|
|
35.722s ago
|
4.592ms |
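The storage-policy ratio is the one variant that matches on an explicit label set: both operands carry a volume_type label, empty volume types are filtered out on both sides, and the division pairs series on tenant, tenant_id, and volume_type rather than on the full label set. Conceptually, with placeholder metric names:

    <used_gb>{volume_type!=""}
      / on (tenant, tenant_id, volume_type)
        (<max_gb>{volume_type!=""} > 0)
      * 100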
| record: openstack_exporter:project_storage_policy_quota:usage
expr: sum by(tenant, tenant_id, volume_type) (openstack_cinder_limits_volume_storage_policy_used_gb{volume_type!=""})
|
ok
|
|
35.718s ago
|
1.726ms |
| record: openstack_exporter:project_storage_policy_quota:max
expr: sum by(tenant, tenant_id, volume_type) (openstack_cinder_limits_volume_storage_policy_max_gb{volume_type!=""})
|
ok
|
|
35.716s ago
|
1.734ms |
| record: openstack_exporter:project_storage_policy_quota_usage:relabel
expr: label_replace(openstack_cinder_limits_volume_storage_policy_used_gb{volume_type!=""}, "graph_type", "project-usage", "", "")
|
ok
|
|
35.714s ago
|
2.174ms |
| record: openstack_exporter:project_storage_policy_quota_max:relabel
expr: label_replace(openstack_cinder_limits_volume_storage_policy_max_gb{volume_type!=""}, "graph_type", "project-max", "", "")
|
ok
|
|
35.712s ago
|
2.139ms |
| record: openstack_exporter:domain_storage_policy_quota:usage
expr: sum by(domain_id, domain_name, volume_type) (openstack_cinder_limits_volume_storage_policy_used_gb{volume_type!=""} * on(tenant_id) group_left(domain_name) openstack_exporter:openstack_identity_project_info:relabel)
|
ok
|
|
35.71s ago
|
1.891ms |
| record: openstack_exporter:domain_storage_policy_quota_usage:relabel
expr: label_replace(openstack_exporter:domain_storage_policy_quota:usage, "graph_type", "domain-usage", "", "")
|
ok
|
|
35.708s ago
|
146.7us |
| record: openstack_exporter:project_floating_ip_quota:ratio
expr: sum by(tenant, tenant_id) (openstack_neutron_quotas_floating_ips_used / (openstack_neutron_quotas_floating_ips_limit > 0) * 100 or openstack_neutron_quotas_floating_ips_used * 0)
|
ok
|
|
35.708s ago
|
2.229ms |
| record: openstack_exporter:project_floating_ip_quota:usage
expr: sum by(tenant, tenant_id) (openstack_neutron_quotas_floating_ips_used)
|
ok
|
|
35.706s ago
|
810.1us |
| record: openstack_exporter:project_floating_ip_quota:max
expr: sum by(tenant, tenant_id) (openstack_neutron_quotas_floating_ips_limit)
|
ok
|
|
35.705s ago
|
786.6us |
| record: openstack_exporter:project_floating_ip_quota_usage:relabel
expr: label_replace(openstack_neutron_quotas_floating_ips_used, "graph_type", "project-usage", "", "")
|
ok
|
|
35.704s ago
|
1.028ms |
| record: openstack_exporter:project_floating_ip_quota_max:relabel
expr: label_replace(openstack_neutron_quotas_floating_ips_limit, "graph_type", "project-max", "", "")
|
ok
|
|
35.703s ago
|
1.037ms |
| record: openstack_exporter:domain_floating_ip_quota:usage
expr: sum by(domain_id, domain_name) (openstack_neutron_quotas_floating_ips_used * on(tenant_id) group_left(domain_name) openstack_exporter:openstack_identity_project_info:relabel)
|
ok
|
|
35.702s ago
|
1.059ms |
| record: openstack_exporter:domain_floating_ip_quota_usage:relabel
expr: label_replace(openstack_exporter:domain_floating_ip_quota:usage, "graph_type", "domain-usage", "", "")
|
ok
|
|
35.701s ago
|
159.6us |
| record: openstack_exporter:project_container_infra_quota:ratio
expr: sum by(tenant, tenant_id) (openstack_container_infra_cluster_used / (openstack_container_infra_cluster_max > 0) * 100 or openstack_container_infra_cluster_used * 0)
|
ok
|
|
35.701s ago
|
2.411ms |
| record: openstack_exporter:project_container_infra_quota:usage
expr: sum by(tenant, tenant_id) (openstack_container_infra_cluster_used)
|
ok
|
|
35.699s ago
|
794.9us |
| record: openstack_exporter:project_container_infra_quota:max
expr: sum by(tenant, tenant_id) (openstack_container_infra_cluster_max)
|
ok
|
|
35.698s ago
|
779.5us |
| record: openstack_exporter:project_container_infra_quota_usage:relabel
expr: label_replace(openstack_container_infra_cluster_used, "graph_type", "project-usage", "", "")
|
ok
|
|
35.697s ago
|
1.02ms |
| record: openstack_exporter:project_container_infra_quota_max:relabel
expr: label_replace(openstack_container_infra_cluster_max, "graph_type", "project-max", "", "")
|
ok
|
|
35.696s ago
|
1.013ms |
| record: openstack_exporter:domain_container_infra_quota:usage
expr: sum by(domain_id, domain_name) (openstack_container_infra_cluster_used * on(tenant_id) group_left(domain_name) openstack_exporter:openstack_identity_project_info:relabel)
|
ok
|
|
35.695s ago
|
1.055ms |
| record: openstack_exporter:domain_container_infra_quota_usage:relabel
expr: label_replace(openstack_exporter:domain_container_infra_quota:usage, "graph_type", "domain-usage", "", "")
|
ok
|
|
35.694s ago
|
111.5us |
| record: openstack_exporter:project_loadbalancer_quota:ratio
expr: sum by(tenant, tenant_id) (openstack_loadbalancer_loadbalancer_used / (openstack_loadbalancer_loadbalancer_max > 0) * 100 or openstack_loadbalancer_loadbalancer_used * 0)
|
ok
|
|
35.694s ago
|
2.311ms |
| record: openstack_exporter:project_loadbalancer_quota:usage
expr: sum by(tenant, tenant_id) (openstack_loadbalancer_loadbalancer_used)
|
ok
|
|
35.692s ago
|
1.009ms |
| record: openstack_exporter:project_loadbalancer_quota:max
expr: sum by(tenant, tenant_id) (openstack_loadbalancer_loadbalancer_max)
|
ok
|
|
35.691s ago
|
954.5us |
| record: openstack_exporter:project_loadbalancer_quota_usage:relabel
expr: label_replace(openstack_loadbalancer_loadbalancer_used, "graph_type", "project-usage", "", "")
|
ok
|
|
35.69s ago
|
1.103ms |
| record: openstack_exporter:project_loadbalancer_quota_max:relabel
expr: label_replace(openstack_loadbalancer_loadbalancer_max, "graph_type", "project-max", "", "")
|
ok
|
|
35.689s ago
|
1.092ms |
| record: openstack_exporter:domain_loadbalancer_quota:usage
expr: sum by(domain_id, domain_name) (openstack_loadbalancer_loadbalancer_used * on(tenant_id) group_left(domain_name) openstack_exporter:openstack_identity_project_info:relabel)
|
ok
|
|
35.688s ago
|
1.171ms |
| record: openstack_exporter:domain_loadbalancer_quota_usage:relabel
expr: label_replace(openstack_exporter:domain_loadbalancer_quota:usage, "graph_type", "domain-usage", "", "")
|
ok
|
|
35.687s ago
|
155us |
| record: openstack_exporter:project_vpnaas_quota:ratio
expr: sum by(tenant, tenant_id) (openstack_neutron_vpnaas_used / (openstack_neutron_vpnaas_max > 0) * 100 or openstack_neutron_vpnaas_used * 0)
|
ok
|
|
35.687s ago
|
2.147ms |
| record: openstack_exporter:project_vpnaas_quota:usage
expr: sum by(tenant, tenant_id) (openstack_neutron_vpnaas_used)
|
ok
|
|
35.685s ago
|
781.7us |
| record: openstack_exporter:project_vpnaas_quota:max
expr: sum by(tenant, tenant_id) (openstack_neutron_vpnaas_max)
|
ok
|
|
35.684s ago
|
750.7us |
| record: openstack_exporter:project_vpnaas_quota_usage:relabel
expr: label_replace(openstack_neutron_vpnaas_used, "graph_type", "project-usage", "", "")
|
ok
|
|
35.683s ago
|
1.046ms |
| record: openstack_exporter:project_vpnaas_quota_max:relabel
expr: label_replace(openstack_neutron_vpnaas_max, "graph_type", "project-max", "", "")
|
ok
|
|
35.682s ago
|
1.075ms |
| record: openstack_exporter:domain_vpnaas_quota:usage
expr: sum by(domain_id, domain_name) (openstack_neutron_vpnaas_used * on(tenant_id) group_left(domain_name) openstack_exporter:openstack_identity_project_info:relabel)
|
ok
|
|
35.681s ago
|
1.045ms |
| record: openstack_exporter:domain_vpnaas_quota_usage:relabel
expr: label_replace(openstack_exporter:domain_vpnaas_quota:usage, "graph_type", "domain-usage", "", "")
|
ok
|
|
35.68s ago
|
107.2us |
|
4.267s ago |
1.114ms |