package metrics

import (
	"strconv"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	log "github.com/sirupsen/logrus"
	"lostak.dev/pve-exporter/proxmox"
)

// PveNodeStatusCollector collects per-node status metrics (state, uptime,
// CPU, memory, KSM, load averages, root filesystem, CPU/system info and
// node time) from the PVE API and publishes them through TTL gauge vectors.
type PveNodeStatusCollector struct {
	apiClient *proxmox.PveApiClient // PVE API client instance.
	registry  *TTLRegistry          // TTL metrics registry.

	state        *TTLGaugeVec // Node state gauge.
	uptime       *TTLGaugeVec // Node uptime gauge.
	cpus         *TTLGaugeVec // Node CPU count gauge.
	cpuUsage     *TTLGaugeVec // Node CPU usage gauge.
	memBytes     *TTLGaugeVec // Node total memory in bytes gauge.
	memBytesUsed *TTLGaugeVec // Node used memory in bytes gauge.
	memBytesFree *TTLGaugeVec // Node free memory in bytes gauge.
	ksmShared    *TTLGaugeVec // Node kernel samepage shared bytes gauge.
	cgroupMode   *TTLGaugeVec // Node cgroup mode gauge.
	load1        *TTLGaugeVec // Node 1 minute load average gauge.
	load5        *TTLGaugeVec // Node 5 minutes load average gauge.
	load15       *TTLGaugeVec // Node 15 minutes load average gauge.
	fSFree       *TTLGaugeVec // Node root filesystem free bytes gauge.
	fSUsed       *TTLGaugeVec // Node root filesystem used bytes gauge.
	fSTotal      *TTLGaugeVec // Node root filesystem total bytes gauge.
	fSAvail      *TTLGaugeVec // Node root filesystem available bytes gauge.
	cpuInfo      *TTLGaugeVec // Node CPU info gauge (details carried in labels).
	systemInfo   *TTLGaugeVec // Node system info gauge (details carried in labels).
	time         *TTLGaugeVec // Node time gauge.
	localTime    *TTLGaugeVec // Node localtime gauge.
}

// NewPveNodeStatusCollector creates a node status collector that reads from
// apiClient and registers all of its gauge vectors in registry.
//
// Every gauge vector carries the "cluster" and "node" labels; the two info
// gauges carry additional descriptive labels. Gauges are registered in the
// order they are created below.
func NewPveNodeStatusCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveNodeStatusCollector {
	// Duration handed to NewTTLGaugeVec for every gauge of this collector
	// (presumably the time after which untouched series expire — confirm
	// against TTLGaugeVec).
	const ttl = 5 * time.Minute

	// newGauge builds a gauge vector labelled with "cluster", "node" and any
	// extra labels, registers it and returns it. A fresh label slice is
	// built on every call so gauge vectors never share a backing array.
	newGauge := func(name, help string, extraLabels ...string) *TTLGaugeVec {
		labels := append([]string{"cluster", "node"}, extraLabels...)
		g := NewTTLGaugeVec(prometheus.GaugeOpts{Name: name, Help: help}, labels, ttl)
		registry.Register(g)
		return g
	}

	c := &PveNodeStatusCollector{apiClient: apiClient, registry: registry}

	// Node state, uptime and CPU.
	c.state = newGauge("pve_node_state", "Node state.")
	c.uptime = newGauge("pve_node_uptime", "Node uptime.")
	c.cpus = newGauge("pve_node_cpu_count", "Node CPU count.")
	c.cpuUsage = newGauge("pve_node_cpu_usage", "Cluster node CPU usage %.")

	// Node memory.
	c.memBytes = newGauge("pve_node_memory_total_bytes", "Node total memory in bytes.")
	c.memBytesUsed = newGauge("pve_node_memory_used_bytes", "Node used memory in bytes.")
	c.memBytesFree = newGauge("pve_node_memory_free_bytes", "Node free memory in bytes.")
	c.ksmShared = newGauge("pve_node_ksm_bytes", "Node kernel samepage shares in bytes.")

	// Node cgroup mode.
	// NOTE(review): this gauge is registered but CollectMetrics never sets it.
	c.cgroupMode = newGauge("pve_node_cgroup_mode", "Node cgroup mode.")

	// Node load averages.
	c.load1 = newGauge("pve_node_load1", "Node CPU load 1 minute average.")
	c.load5 = newGauge("pve_node_load5", "Node CPU load 5 minutes average.")
	c.load15 = newGauge("pve_node_load15", "Node CPU load 15 minutes average.")

	// Node root filesystem.
	c.fSFree = newGauge("pve_node_rootfs_free_bytes", "Node RootFS free bytes.")
	c.fSUsed = newGauge("pve_node_rootfs_used_bytes", "Node root filesystem used bytes.")
	c.fSTotal = newGauge("pve_node_rootfs_total_bytes", "Node root filesystem total bytes.")
	c.fSAvail = newGauge("pve_node_rootfs_avail_bytes", "Node root filesystem avail bytes.")

	// Node CPU and system info; the information lives in the labels.
	c.cpuInfo = newGauge("pve_node_cpuinfo", "Node CPU info.",
		"flags", "cores", "model", "sockets", "cpus", "hvm")
	c.systemInfo = newGauge("pve_node_systeminfo", "Node system info.",
		"kversion", "pveversion", "machine", "sysname", "release")

	// Node time.
	c.time = newGauge("pve_node_time", "Node time.")
	c.localTime = newGauge("pve_node_localtime", "Node localtime.")

	return c
}

// CollectMetrics fetches the cluster status and, for every node in it, the
// node time and detailed node status, and updates the collector's gauges.
// It is the PveMetricsCollector interface implementation.
//
// The first API error is returned immediately, so nodes that come after the
// failing one are not updated in that run. Load average values that are
// missing or cannot be parsed are logged and skipped; they never abort the
// collection.
func (c *PveNodeStatusCollector) CollectMetrics() error {
	cluster, err := c.apiClient.GetClusterStatus()
	if err != nil {
		return err
	}
	clusterName := cluster.GetClusterName()

	// Load average gauges in the order the values are read from LoadAvg.
	loadGauges := [...]struct {
		name  string
		gauge *TTLGaugeVec
	}{
		{"load1", c.load1},
		{"load5", c.load5},
		{"load15", c.load15},
	}

	for _, node := range cluster.NodeStatuses {
		labels := prometheus.Labels{
			"cluster": clusterName,
			"node":    node.Name,
		}

		// Node time. Named nodeTime so the time package is not shadowed.
		nodeTime, err := c.apiClient.GetNodeTime(node.Name)
		if err != nil {
			return err
		}
		c.time.With(labels).Set(float64(nodeTime.Time))
		c.localTime.With(labels).Set(float64(nodeTime.LocalTime))

		// Detailed node status.
		status, err := c.apiClient.GetNodeStatusDetail(node.Name)
		if err != nil {
			return err
		}

		c.state.With(labels).Set(float64(node.Online))
		c.cpus.With(labels).Set(float64(status.CPUInfo.CPUs))
		c.cpuUsage.With(labels).Set(float64(status.CPU))
		c.uptime.With(labels).Set(float64(status.Uptime))
		c.memBytes.With(labels).Set(float64(status.Memory.Total))
		c.memBytesUsed.With(labels).Set(float64(status.Memory.Used))
		c.memBytesFree.With(labels).Set(float64(status.Memory.Free))
		c.ksmShared.With(labels).Set(float64(status.Ksm.Shared))
		c.fSFree.With(labels).Set(float64(status.Rootfs.Free))
		c.fSUsed.With(labels).Set(float64(status.Rootfs.Used))
		c.fSTotal.With(labels).Set(float64(status.Rootfs.Total))
		c.fSAvail.With(labels).Set(float64(status.Rootfs.Avail))

		// CPU load averages. Each index is bounds-checked on its own: a
		// list with fewer than three entries must not panic the collector.
		switch n := len(status.LoadAvg); {
		case n == 0:
			log.Error("CPU load stats are empty.")
		case n < len(loadGauges):
			log.Errorf("CPU load stats are incomplete. Expected %d values, got %d.", len(loadGauges), n)
		}
		for i, lg := range loadGauges {
			if i >= len(status.LoadAvg) {
				break
			}
			f, err := strconv.ParseFloat(status.LoadAvg[i], 64)
			if err != nil {
				log.Errorf("Unable to parse %s. Error: %s.", lg.name, err)
				continue
			}
			lg.gauge.With(labels).Set(f)
		}

		// Node CPU info: constant value 1, details are in the labels.
		cpuLabels := prometheus.Labels{
			"cluster": clusterName,
			"node":    node.Name,
			"flags":   status.CPUInfo.Flags,
			"cores":   strconv.Itoa(status.CPUInfo.Cores),
			"model":   status.CPUInfo.Model,
			"sockets": strconv.Itoa(status.CPUInfo.Sockets),
			"cpus":    strconv.Itoa(status.CPUInfo.CPUs),
			"hvm":     status.CPUInfo.HVM,
		}
		c.cpuInfo.With(cpuLabels).Set(1)

		// Node system info: constant value 1, details are in the labels.
		sysLabels := prometheus.Labels{
			"cluster":    clusterName,
			"node":       node.Name,
			"kversion":   status.Kversion,
			"pveversion": status.PveVersion,
			"machine":    status.CurrentKernel.Machine,
			"sysname":    status.CurrentKernel.Sysname,
			"release":    status.CurrentKernel.Release,
		}
		c.systemInfo.With(sysLabels).Set(1)
	}

	return nil
}

// GetName returns the human readable name of this collector.
// It is the PveMetricsCollector interface implementation.
func (c *PveNodeStatusCollector) GetName() string {
	return "Node State"
}