Illumina Innovates with Rancher and Kubernetes
CPU Utilization
1 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance))
1 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])))
Load Average
sum(node_load1) by (instance) / count(node_cpu_seconds_total{mode="system"}) by (instance)
sum(node_load5) by (instance) / count(node_cpu_seconds_total{mode="system"}) by (instance)
sum(node_load15) by (instance) / count(node_cpu_seconds_total{mode="system"}) by (instance)
sum(node_load1) by (instance) / count(node_cpu_seconds_total{mode="system"})
sum(node_load5) by (instance) / count(node_cpu_seconds_total{mode="system"})
sum(node_load15) by (instance) / count(node_cpu_seconds_total{mode="system"})
Memory Utilization
1 - sum(node_memory_MemAvailable_bytes) by (instance) / sum(node_memory_MemTotal_bytes) by (instance)
1 - sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)
Disk Utilization
(sum(node_filesystem_size_bytes{device!="rootfs"}) by (instance) - sum(node_filesystem_free_bytes{device!="rootfs"}) by (instance)) / sum(node_filesystem_size_bytes{device!="rootfs"}) by (instance)
(sum(node_filesystem_size_bytes{device!="rootfs"}) - sum(node_filesystem_free_bytes{device!="rootfs"})) / sum(node_filesystem_size_bytes{device!="rootfs"})
Disk I/O
sum(rate(node_disk_read_bytes_total[5m])) by (instance)
sum(rate(node_disk_written_bytes_total[5m])) by (instance)
sum(rate(node_disk_read_bytes_total[5m]))
sum(rate(node_disk_written_bytes_total[5m]))
Network Packets
sum(rate(node_network_receive_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)
sum(rate(node_network_receive_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)
sum(rate(node_network_receive_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)
sum(rate(node_network_transmit_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)
sum(rate(node_network_transmit_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)
sum(rate(node_network_transmit_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)
sum(rate(node_network_receive_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))
sum(rate(node_network_receive_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))
sum(rate(node_network_receive_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))
sum(rate(node_network_transmit_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))
sum(rate(node_network_transmit_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))
sum(rate(node_network_transmit_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))
Network I/O
sum(rate(node_network_receive_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)
sum(rate(node_network_transmit_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)
sum(rate(node_network_receive_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))
sum(rate(node_network_transmit_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))
avg(irate(node_cpu_seconds_total{mode!="idle", instance=~"$instance"}[5m])) by (mode)
1 - (avg(irate(node_cpu_seconds_total{mode="idle", instance=~"$instance"}[5m])))
sum(node_load1{instance=~"$instance"}) / count(node_cpu_seconds_total{mode="system",instance=~"$instance"})
sum(node_load5{instance=~"$instance"}) / count(node_cpu_seconds_total{mode="system",instance=~"$instance"})
sum(node_load15{instance=~"$instance"}) / count(node_cpu_seconds_total{mode="system",instance=~"$instance"})
1 - sum(node_memory_MemAvailable_bytes{instance=~"$instance"}) / sum(node_memory_MemTotal_bytes{instance=~"$instance"})
(sum(node_filesystem_size_bytes{device!="rootfs",instance=~"$instance"}) by (device) - sum(node_filesystem_free_bytes{device!="rootfs",instance=~"$instance"}) by (device)) / sum(node_filesystem_size_bytes{device!="rootfs",instance=~"$instance"}) by (device)
(sum(node_filesystem_size_bytes{device!="rootfs",instance=~"$instance"}) - sum(node_filesystem_free_bytes{device!="rootfs",instance=~"$instance"})) / sum(node_filesystem_size_bytes{device!="rootfs",instance=~"$instance"})
sum(rate(node_disk_read_bytes_total{instance=~"$instance"}[5m]))
sum(rate(node_disk_written_bytes_total{instance=~"$instance"}[5m]))
sum(rate(node_network_receive_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)
sum(rate(node_network_receive_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)
sum(rate(node_network_receive_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)
sum(rate(node_network_transmit_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)
sum(rate(node_network_transmit_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)
sum(rate(node_network_transmit_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)
sum(rate(node_network_receive_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))
sum(rate(node_network_receive_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))
sum(rate(node_network_receive_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))
sum(rate(node_network_transmit_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))
sum(rate(node_network_transmit_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))
sum(rate(node_network_transmit_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))
sum(rate(node_network_receive_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)
sum(rate(node_network_transmit_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)
sum(rate(node_network_receive_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))
sum(rate(node_network_transmit_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))
Etcd has a leader
max(etcd_server_has_leader)
Number of leader changes
max(etcd_server_leader_changes_seen_total)
Number of failed proposals
sum(etcd_server_proposals_failed_total)
GRPC Client Traffic
sum(rate(etcd_network_client_grpc_received_bytes_total[5m])) by (instance)
sum(rate(etcd_network_client_grpc_sent_bytes_total[5m])) by (instance)
sum(rate(etcd_network_client_grpc_received_bytes_total[5m]))
sum(rate(etcd_network_client_grpc_sent_bytes_total[5m]))
Peer Traffic
sum(rate(etcd_network_peer_received_bytes_total[5m])) by (instance)
sum(rate(etcd_network_peer_sent_bytes_total[5m])) by (instance)
sum(rate(etcd_network_peer_received_bytes_total[5m]))
sum(rate(etcd_network_peer_sent_bytes_total[5m]))
DB Size
sum(etcd_debugging_mvcc_db_total_size_in_bytes) by (instance)
sum(etcd_debugging_mvcc_db_total_size_in_bytes)
Active Streams
sum(grpc_server_started_total{grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) by (instance) - sum(grpc_server_handled_total{grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) by (instance)
sum(grpc_server_started_total{grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) by (instance) - sum(grpc_server_handled_total{grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) by (instance)
sum(grpc_server_started_total{grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})
sum(grpc_server_started_total{grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})
Raft Proposals
sum(increase(etcd_server_proposals_applied_total[5m])) by (instance)
sum(increase(etcd_server_proposals_committed_total[5m])) by (instance)
sum(increase(etcd_server_proposals_pending[5m])) by (instance)
sum(increase(etcd_server_proposals_failed_total[5m])) by (instance)
sum(increase(etcd_server_proposals_applied_total[5m]))
sum(increase(etcd_server_proposals_committed_total[5m]))
sum(increase(etcd_server_proposals_pending[5m]))
sum(increase(etcd_server_proposals_failed_total[5m]))
RPC Rate
sum(rate(grpc_server_started_total{grpc_type="unary"}[5m])) by (instance)
sum(rate(grpc_server_handled_total{grpc_type="unary",grpc_code!="OK"}[5m])) by (instance)
sum(rate(grpc_server_started_total{grpc_type="unary"}[5m]))
sum(rate(grpc_server_handled_total{grpc_type="unary",grpc_code!="OK"}[5m]))
Disk Operations
sum(rate(etcd_disk_backend_commit_duration_seconds_sum[1m])) by (instance)
sum(rate(etcd_disk_wal_fsync_duration_seconds_sum[1m])) by (instance)
sum(rate(etcd_disk_backend_commit_duration_seconds_sum[1m]))
sum(rate(etcd_disk_wal_fsync_duration_seconds_sum[1m]))
Disk Sync Duration
histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le))
histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le))
sum(histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le)))
sum(histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le)))
API Server Request Latency
avg(apiserver_request_latencies_sum / apiserver_request_latencies_count) by (instance, verb) /1e+06
avg(apiserver_request_latencies_sum / apiserver_request_latencies_count) by (instance) /1e+06
API Server Request Rate
sum(rate(apiserver_request_count[5m])) by (instance, code)
sum(rate(apiserver_request_count[5m])) by (instance)
Scheduling Failed Pods
sum(kube_pod_status_scheduled{condition="false"})
Controller Manager Queue Depth
sum(volumes_depth) by (instance)
sum(deployment_depth) by (instance)
sum(replicaset_depth) by (instance)
sum(service_depth) by (instance)
sum(serviceaccount_depth) by (instance)
sum(endpoint_depth) by (instance)
sum(daemonset_depth) by (instance)
sum(statefulset_depth) by (instance)
sum(replicationmanager_depth) by (instance)
sum(volumes_depth)
sum(deployment_depth)
sum(replicaset_depth)
sum(service_depth)
sum(serviceaccount_depth)
sum(endpoint_depth)
sum(daemonset_depth)
sum(statefulset_depth)
sum(replicationmanager_depth)
Scheduler E2E Scheduling Latency
histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) by (le, instance)) / 1e+06
sum(histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) by (le, instance)) / 1e+06)
Scheduler Preemption Attempts
sum(rate(scheduler_total_preemption_attempts[5m])) by (instance)
sum(rate(scheduler_total_preemption_attempts[5m]))
Ingress Controller Connections
sum(nginx_ingress_controller_nginx_process_connections{state="reading"}) by (instance)
sum(nginx_ingress_controller_nginx_process_connections{state="waiting"}) by (instance)
sum(nginx_ingress_controller_nginx_process_connections{state="writing"}) by (instance)
sum(ceil(increase(nginx_ingress_controller_nginx_process_connections_total{state="accepted"}[5m]))) by (instance)
sum(ceil(increase(nginx_ingress_controller_nginx_process_connections_total{state="active"}[5m]))) by (instance)
sum(ceil(increase(nginx_ingress_controller_nginx_process_connections_total{state="handled"}[5m]))) by (instance)
sum(nginx_ingress_controller_nginx_process_connections{state="reading"})
sum(nginx_ingress_controller_nginx_process_connections{state="waiting"})
sum(nginx_ingress_controller_nginx_process_connections{state="writing"})
sum(ceil(increase(nginx_ingress_controller_nginx_process_connections_total{state="accepted"}[5m])))
sum(ceil(increase(nginx_ingress_controller_nginx_process_connections_total{state="active"}[5m])))
sum(ceil(increase(nginx_ingress_controller_nginx_process_connections_total{state="handled"}[5m])))
Ingress Controller Request Process Time
topk(10, histogram_quantile(0.95,sum by (le, host, path)(rate(nginx_ingress_controller_request_duration_seconds_bucket{host!="_"}[5m]))))
topk(10, histogram_quantile(0.95,sum by (le, host)(rate(nginx_ingress_controller_request_duration_seconds_bucket{host!="_"}[5m]))))
Fluentd Buffer Queue Rate
sum(rate(fluentd_output_status_buffer_queue_length[5m])) by (instance)
sum(rate(fluentd_output_status_buffer_queue_length[5m]))
Fluentd Input Rate
sum(rate(fluentd_input_status_num_records_total[5m])) by (instance)
sum(rate(fluentd_input_status_num_records_total[5m]))
Fluentd Output Errors Rate
sum(rate(fluentd_output_status_num_errors[5m])) by (type)
sum(rate(fluentd_output_status_num_errors[5m]))
Fluentd Output Rate
sum(rate(fluentd_output_status_num_records_total[5m])) by (instance)
sum(rate(fluentd_output_status_num_records_total[5m]))
sum(rate(container_cpu_cfs_throttled_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)
sum(rate(container_cpu_user_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)
sum(rate(container_cpu_system_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)
sum(rate(container_cpu_usage_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)
sum(rate(container_cpu_cfs_throttled_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))
sum(rate(container_cpu_user_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))
sum(rate(container_cpu_system_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))
sum(rate(container_cpu_usage_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))
sum(container_memory_working_set_bytes{namespace="$namespace",pod_name=~"$podName", container_name!=""}) by (pod_name)
sum(container_memory_working_set_bytes{namespace="$namespace",pod_name=~"$podName", container_name!=""})
sum(rate(container_network_receive_packets_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)
sum(rate(container_network_receive_packets_dropped_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)
sum(rate(container_network_receive_errors_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)
sum(rate(container_network_transmit_packets_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)
sum(rate(container_network_transmit_packets_dropped_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)
sum(rate(container_network_transmit_errors_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)
sum(rate(container_network_receive_packets_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))
sum(rate(container_network_receive_packets_dropped_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))
sum(rate(container_network_receive_errors_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))
sum(rate(container_network_transmit_packets_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))
sum(rate(container_network_transmit_packets_dropped_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))
sum(rate(container_network_transmit_errors_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))
sum(rate(container_network_receive_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)
sum(rate(container_network_transmit_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)
sum(rate(container_network_receive_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))
sum(rate(container_network_transmit_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))
sum(rate(container_fs_reads_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)
sum(rate(container_fs_writes_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)
sum(rate(container_fs_reads_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))
sum(rate(container_fs_writes_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))
sum(rate(container_cpu_cfs_throttled_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m])) by (container_name)
sum(rate(container_cpu_usage_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m])) by (container_name)
sum(rate(container_cpu_system_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m])) by (container_name)
sum(rate(container_cpu_user_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m])) by (container_name)
sum(rate(container_cpu_cfs_throttled_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m]))
sum(rate(container_cpu_usage_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m]))
sum(rate(container_cpu_system_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m]))
sum(rate(container_cpu_user_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m]))
sum(container_memory_working_set_bytes{container_name!="POD",namespace="$namespace",pod_name="$podName",container_name!=""}) by (container_name)
sum(container_memory_working_set_bytes{container_name!="POD",namespace="$namespace",pod_name="$podName",container_name!=""})
sum(rate(container_network_receive_packets_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))
sum(rate(container_network_receive_packets_dropped_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))
sum(rate(container_network_receive_errors_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))
sum(rate(container_network_transmit_packets_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))
sum(rate(container_network_transmit_packets_dropped_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))
sum(rate(container_network_transmit_errors_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))
sum(rate(container_network_receive_bytes_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))
sum(rate(container_network_transmit_bytes_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))
sum(rate(container_fs_reads_bytes_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m])) by (container_name)
sum(rate(container_fs_writes_bytes_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m])) by (container_name)
sum(rate(container_fs_reads_bytes_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))
sum(rate(container_fs_writes_bytes_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))
sum(rate(container_cpu_cfs_throttled_seconds_total{namespace="$namespace",pod_name="$podName",container_name="$containerName"}[5m]))
sum(rate(container_cpu_usage_seconds_total{namespace="$namespace",pod_name="$podName",container_name="$containerName"}[5m]))
sum(rate(container_cpu_system_seconds_total{namespace="$namespace",pod_name="$podName",container_name="$containerName"}[5m]))
sum(rate(container_cpu_user_seconds_total{namespace="$namespace",pod_name="$podName",container_name="$containerName"}[5m]))
sum(container_memory_working_set_bytes{namespace="$namespace",pod_name="$podName",container_name="$containerName"})
Disk IO
sum(rate(container_fs_reads_bytes_total{namespace="$namespace",pod_name="$podName",container_name="$containerName"}[5m]))
sum(rate(container_fs_writes_bytes_total{namespace="$namespace",pod_name="$podName",container_name="$containerName"}[5m]))