diff --git a/metrics/pve_cluster_state_collector.go b/metrics/pve_cluster_state_collector.go index f11c86b..4320ee0 100644 --- a/metrics/pve_cluster_state_collector.go +++ b/metrics/pve_cluster_state_collector.go @@ -1,40 +1,47 @@ package metrics import ( + "time" + "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" "lostak.dev/pve-exporter/proxmox" ) // PVE cluster state collector. type PveClusterStateCollector struct { apiClient *proxmox.PveApiClient // PVE API client instance. + registry *TTLRegistry // TTL metrics registry. - nodes *prometheus.GaugeVec // Count of nodes prometheus gauge. - quorate *prometheus.GaugeVec // Cluster quorum state prometheus gauge. + nodes *TTLGaugeVec // Count of nodes prometheus gauge. + quorate *TTLGaugeVec // Cluster quorum state prometheus gauge. } // Create new instance of PVE cluster state collector. -func NewPveClusterStateCollector(apiClient *proxmox.PveApiClient) *PveClusterStateCollector { +func NewPveClusterStateCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveClusterStateCollector { c := PveClusterStateCollector{apiClient: apiClient} + c.registry = registry // Cluster meta gauge. - c.nodes = promauto.NewGaugeVec( + c.nodes = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_cluster_nodes", Help: "Cluster nodes count.", }, []string{"cluster"}, + 1*time.Minute, ) + c.registry.Register(c.nodes) // Cluster quorate gauge. 
- c.quorate = promauto.NewGaugeVec( + c.quorate = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_cluster_quorate", Help: "Cluster quorum state.", }, []string{"cluster"}, + 1*time.Minute, ) + c.registry.Register(c.quorate) return &c } @@ -46,9 +53,6 @@ func (c *PveClusterStateCollector) CollectMetrics() error { return err } - c.nodes.Reset() - c.quorate.Reset() - l := prometheus.Labels{"cluster": cluster.Name} c.nodes.With(l).Set(float64(cluster.Nodes)) c.quorate.With(l).Set(float64(cluster.Quorate)) diff --git a/metrics/pve_metrics_manager.go b/metrics/pve_metrics_manager.go index 3673d65..8aaea88 100644 --- a/metrics/pve_metrics_manager.go +++ b/metrics/pve_metrics_manager.go @@ -21,6 +21,7 @@ type PveMetricsCollector interface { type PveMetricsManager struct { apiClient *proxmox.PveApiClient // Proxmox virtual environment API client instance. collectors []PveMetricsCollector // Metrics collector instances. + registry *TTLRegistry // Registry which handles automatic dangling metrics deletion. latencySummary *prometheus.SummaryVec // Collection latency summary. interval int // Collection interval. @@ -32,45 +33,46 @@ type PveMetricsManager struct { func NewPveMetricsManager(apiClient *proxmox.PveApiClient, conf *configuration.PveConfiguration) *PveMetricsManager { c := PveMetricsManager{apiClient: apiClient, interval: conf.Interval} metricsCf := conf.Metrics + c.registry = NewTTLRegistry() // Cluster state metrics collector. if metricsCf.ClusterState { - c.RegisterCollector(NewPveClusterStateCollector(apiClient)) + c.RegisterCollector(NewPveClusterStateCollector(apiClient, c.registry)) } // Node state metrics collector. if metricsCf.NodeStatus { - c.RegisterCollector(NewPveNodeStatusCollector(apiClient)) + c.RegisterCollector(NewPveNodeStatusCollector(apiClient, c.registry)) } // Node subscription state collector. 
if metricsCf.Subscription { - c.RegisterCollector(NewPveSubscriptionCollector(apiClient)) + c.RegisterCollector(NewPveSubscriptionCollector(apiClient, c.registry)) } // Node disk collector. if metricsCf.Disk { - c.RegisterCollector(NewPveNodeDiskCollector(apiClient)) + c.RegisterCollector(NewPveNodeDiskCollector(apiClient, c.registry)) } // Node SDN collector. if metricsCf.SDN { - c.RegisterCollector(NewPveSdnCollector(apiClient)) + c.RegisterCollector(NewPveSdnCollector(apiClient, c.registry)) } // Node storage collector. if metricsCf.Storage { - c.RegisterCollector(NewPveStorageCollector(apiClient)) + c.RegisterCollector(NewPveStorageCollector(apiClient, c.registry)) } // Node container collector. if metricsCf.LXC { - c.RegisterCollector(NewPveContainerCollector(apiClient)) + c.RegisterCollector(NewPveContainerCollector(apiClient, c.registry)) } // Node virtual machine collector. if metricsCf.QEMU { - c.RegisterCollector(NewPveVirtualMachineCollector(apiClient)) + c.RegisterCollector(NewPveVirtualMachineCollector(apiClient, c.registry)) } // Metrics collection latency summary. @@ -79,6 +81,8 @@ func NewPveMetricsManager(apiClient *proxmox.PveApiClient, conf *configuration.P Help: "Summary of metrics collection latency milliseconds from PVE API.", }, []string{"collector"}) + c.registry.StartCleanupLoop(5 * time.Second) + return &c } diff --git a/metrics/pve_node_container_collector.go b/metrics/pve_node_container_collector.go index b18b8ed..08da477 100644 --- a/metrics/pve_node_container_collector.go +++ b/metrics/pve_node_container_collector.go @@ -1,155 +1,184 @@ package metrics import ( + "time" + "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" "lostak.dev/pve-exporter/proxmox" ) // PVE container collector. type PveContainerCollector struct { apiClient *proxmox.PveApiClient // PVE API client instance. + registry *TTLRegistry // TTL metrics registry. 
- state *prometheus.GaugeVec // Container state prometheus gauge. - uptime *prometheus.GaugeVec // Container uptime prometheus gauge. + state *TTLGaugeVec // Container state prometheus gauge. + uptime *TTLGaugeVec // Container uptime prometheus gauge. - cpu *prometheus.GaugeVec // Container count of CPUs prometheus gauge. - cpuUsage *prometheus.GaugeVec // Container CPU usage % prometheus gauge. + cpu *TTLGaugeVec // Container count of CPUs prometheus gauge. + cpuUsage *TTLGaugeVec // Container CPU usage % prometheus gauge. - memBytes *prometheus.GaugeVec // Container memory in bytes prometheus gauge. - memBytesUsed *prometheus.GaugeVec // Container memory usage in bytes prometheus gauge. + memBytes *TTLGaugeVec // Container memory in bytes prometheus gauge. + memBytesUsed *TTLGaugeVec // Container memory usage in bytes prometheus gauge. - netReceive *prometheus.GaugeVec // Container network RX in bytes prometheus gauge. - netTransmit *prometheus.GaugeVec // Container network TX in bytes prometheus gauge. + netReceive *TTLGaugeVec // Container network RX in bytes prometheus gauge. + netTransmit *TTLGaugeVec // Container network TX in bytes prometheus gauge. - diskWrite *prometheus.GaugeVec // Container disk written in bytes prometheus gauge. - diskRead *prometheus.GaugeVec // Container disk read in bytes prometheus gauge. + diskWrite *TTLGaugeVec // Container disk written in bytes prometheus gauge. + diskRead *TTLGaugeVec // Container disk read in bytes prometheus gauge. - disk *prometheus.GaugeVec // Container disk space usage in bytes prometheus gauge. - diskMax *prometheus.GaugeVec // Container disk size in bytes prometheus gauge. - swap *prometheus.GaugeVec // Container swap usage in bytes prometheus gauge. + disk *TTLGaugeVec // Container disk space usage in bytes prometheus gauge. + diskMax *TTLGaugeVec // Container disk size in bytes prometheus gauge. + swap *TTLGaugeVec // Container swap usage in bytes prometheus gauge. 
} // Create new instance of PVE container collector. -func NewPveContainerCollector(apiClient *proxmox.PveApiClient) *PveContainerCollector { +func NewPveContainerCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveContainerCollector { c := PveContainerCollector{apiClient: apiClient} + c.registry = registry // Container state. - c.state = promauto.NewGaugeVec( + c.state = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_ct_state", Help: "Container state.", }, []string{"cluster", "node", "vmid", "name"}, + 1*time.Minute, ) + c.registry.Register(c.state) // Container uptime. - c.uptime = promauto.NewGaugeVec( + c.uptime = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_ct_uptime", Help: "Container uptime.", }, []string{"cluster", "node", "vmid", "name"}, + 1*time.Minute, ) + c.registry.Register(c.uptime) // Container CPU count. - c.cpu = promauto.NewGaugeVec( + c.cpu = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_ct_cpu_count", Help: "Container CPU count.", }, []string{"cluster", "node", "vmid", "name"}, + 1*time.Minute, ) + c.registry.Register(c.cpu) // Container CPU usage. - c.cpuUsage = promauto.NewGaugeVec( + c.cpuUsage = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_ct_cpu_usage", Help: "Container CPU usage.", }, []string{"cluster", "node", "vmid", "name"}, + 1*time.Minute, ) + c.registry.Register(c.cpuUsage) // Container memory total. - c.memBytes = promauto.NewGaugeVec( + c.memBytes = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_ct_mem_total_bytes", Help: "Container total memory in bytes.", }, []string{"cluster", "node", "vmid", "name"}, + 1*time.Minute, ) + c.registry.Register(c.memBytes) // Container memory usage. - c.memBytesUsed = promauto.NewGaugeVec( + c.memBytesUsed = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_ct_mem_used_bytes", Help: "Container used memory in bytes.", }, []string{"cluster", "node", "vmid", "name"}, + 1*time.Minute, ) + c.registry.Register(c.memBytesUsed) // Container network RX. 
- c.netReceive = promauto.NewGaugeVec( + c.netReceive = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_ct_network_in_bytes", Help: "Container network RX bytes.", }, []string{"cluster", "node", "vmid", "name"}, + 1*time.Minute, ) + c.registry.Register(c.netReceive) // Container network TX. - c.netTransmit = promauto.NewGaugeVec( + c.netTransmit = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_ct_network_out_bytes", Help: "Container network TX bytes.", }, []string{"cluster", "node", "vmid", "name"}, + 1*time.Minute, ) + c.registry.Register(c.netTransmit) // Container disk written. - c.diskWrite = promauto.NewGaugeVec( + c.diskWrite = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_ct_disk_wr_bytes", Help: "Container disk written bytes.", }, []string{"cluster", "node", "vmid", "name"}, + 1*time.Minute, ) + c.registry.Register(c.diskWrite) // Container disk read. - c.diskRead = promauto.NewGaugeVec( + c.diskRead = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_ct_disk_rd_bytes", Help: "Container disk read bytes.", }, []string{"cluster", "node", "vmid", "name"}, + 1*time.Minute, ) + c.registry.Register(c.diskRead) // Container disk size. - c.disk = promauto.NewGaugeVec( + c.disk = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_ct_disk_usage_bytes", Help: "Container disk read bytes.", }, []string{"cluster", "node", "vmid", "name"}, + 1*time.Minute, ) + c.registry.Register(c.disk) // Container disk size. - c.diskMax = promauto.NewGaugeVec( + c.diskMax = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_ct_disk_size_bytes", Help: "Container disk size bytes.", }, []string{"cluster", "node", "vmid", "name"}, + 1*time.Minute, ) + c.registry.Register(c.diskMax) // Container swap usage. 
- c.swap = promauto.NewGaugeVec( + c.swap = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_ct_swap_used_bytes", Help: "Container swap usage bytes.", }, []string{"cluster", "node", "vmid", "name"}, + 1*time.Minute, ) + c.registry.Register(c.swap) return &c } @@ -161,20 +190,6 @@ func (c *PveContainerCollector) CollectMetrics() error { return err } - c.state.Reset() - c.cpu.Reset() - c.memBytes.Reset() - c.diskMax.Reset() - c.uptime.Reset() - c.cpuUsage.Reset() - c.memBytesUsed.Reset() - c.netReceive.Reset() - c.netTransmit.Reset() - c.diskRead.Reset() - c.diskWrite.Reset() - c.disk.Reset() - c.swap.Reset() - for _, node := range cluster.NodeStatuses { containers, err := c.apiClient.GetNodeContainerList(node.Name) if err != nil { diff --git a/metrics/pve_node_disk_collector.go b/metrics/pve_node_disk_collector.go index ab1d199..6af06d8 100644 --- a/metrics/pve_node_disk_collector.go +++ b/metrics/pve_node_disk_collector.go @@ -3,51 +3,59 @@ package metrics import ( "strconv" "strings" + "time" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" "lostak.dev/pve-exporter/proxmox" ) // PVE cluster state collector. type PveNodeDiskCollector struct { apiClient *proxmox.PveApiClient // PVE API client instance. + registry *TTLRegistry // TTL metrics registry. - healthy *prometheus.GaugeVec // Node disk SMART passed state prometheus gauge. - wearout *prometheus.GaugeVec // Node disk wearout % prometheus gauge. - sizeBytes *prometheus.GaugeVec // Node disk size in bytes prometheus gauge. + healthy *TTLGaugeVec // Node disk SMART passed state prometheus gauge. + wearout *TTLGaugeVec // Node disk wearout % prometheus gauge. + sizeBytes *TTLGaugeVec // Node disk size in bytes prometheus gauge. } // Create new instance of PVE cluster state collector. 
-func NewPveNodeDiskCollector(apiClient *proxmox.PveApiClient) *PveNodeDiskCollector { +func NewPveNodeDiskCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveNodeDiskCollector { c := PveNodeDiskCollector{apiClient: apiClient} + c.registry = registry // Node disk healthy state. - c.healthy = promauto.NewGaugeVec( + c.healthy = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_disk_healthy", Help: "Node disk healthy state.", }, []string{"cluster", "node", "wwn", "type", "model", "serial", "vendor", "used", "osd_id"}, + 1*time.Minute, ) + c.registry.Register(c.healthy) // Node disk wearout. - c.wearout = promauto.NewGaugeVec( + c.wearout = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_disk_wearout", Help: "Node disk wearout percent.", }, []string{"cluster", "node", "wwn", "type", "model", "serial", "vendor", "used", "osd_id"}, + 1*time.Minute, ) + c.registry.Register(c.wearout) // Node disk size in bytes. - c.sizeBytes = promauto.NewGaugeVec( + c.sizeBytes = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_disk_size_bytes", Help: "Node disk size in bytes.", }, []string{"cluster", "node", "wwn", "type", "model", "serial", "vendor", "used", "osd_id"}, + 1*time.Minute, ) + c.registry.Register(c.sizeBytes) return &c } @@ -59,10 +67,6 @@ func (c *PveNodeDiskCollector) CollectMetrics() error { return err } - c.healthy.Reset() - c.wearout.Reset() - c.sizeBytes.Reset() - for _, node := range cluster.NodeStatuses { disks, err := c.apiClient.GetNodeDisksList(node.Name) if err != nil { diff --git a/metrics/pve_node_sdn_collector.go b/metrics/pve_node_sdn_collector.go index 862211d..291beaa 100644 --- a/metrics/pve_node_sdn_collector.go +++ b/metrics/pve_node_sdn_collector.go @@ -1,30 +1,35 @@ package metrics import ( + "time" + "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" "lostak.dev/pve-exporter/proxmox" ) // PVE SDN state collector. 
type PveSdnCollector struct { apiClient *proxmox.PveApiClient // PVE API client instance. + registry *TTLRegistry // TTL metrics registry. - state *prometheus.GaugeVec // SDN state prometheus gauge. + state *TTLGaugeVec // SDN state prometheus gauge. } // Create new instance of PVE SDN collector. -func NewPveSdnCollector(apiClient *proxmox.PveApiClient) *PveSdnCollector { +func NewPveSdnCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveSdnCollector { c := PveSdnCollector{apiClient: apiClient} + c.registry = registry // SDN Up state. - c.state = promauto.NewGaugeVec( + c.state = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_sdn_state", Help: "Node software defined network state.", }, []string{"cluster", "node", "sdn", "sdn_id"}, + 1*time.Minute, ) + c.registry.Register(c.state) return &c } @@ -41,8 +46,6 @@ func (c *PveSdnCollector) CollectMetrics() error { return err } - c.state.Reset() - for _, node := range cluster.NodeStatuses { sdns := resources.FindNodeSDN(node.Name) if len(*sdns) > 0 { diff --git a/metrics/pve_node_status_collector.go b/metrics/pve_node_status_collector.go index 6d72aa9..1deb504 100644 --- a/metrics/pve_node_status_collector.go +++ b/metrics/pve_node_status_collector.go @@ -2,9 +2,9 @@ package metrics import ( "strconv" + "time" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" log "github.com/sirupsen/logrus" "lostak.dev/pve-exporter/proxmox" ) @@ -12,212 +12,254 @@ import ( // PVE cluster state collector. type PveNodeStatusCollector struct { apiClient *proxmox.PveApiClient // PVE API client instance. + registry *TTLRegistry // TTL metrics registry. - state *prometheus.GaugeVec // Node state prometheus gauge. - uptime *prometheus.GaugeVec // Node uptime in seconds prometheus gauge. - cpus *prometheus.GaugeVec // Node CPU count prometheus gauge. - cpuUsage *prometheus.GaugeVec // Node CPU usage in percent prometheus gauge. 
- memBytes *prometheus.GaugeVec // Node total RAM capacity in bytes prometheus gauge. - memBytesUsed *prometheus.GaugeVec // Node RAM usage in bytes prometheus gauge. - memBytesFree *prometheus.GaugeVec // Node RAM free in bytes prometheus gauge. - ksmShared *prometheus.GaugeVec // Node Kernel samepage shared in bytes prometheus gauge. - cgroupMode *prometheus.GaugeVec // Node CGroups mode prometheus gauge. - load1 *prometheus.GaugeVec // Node load1 unix like (CPU seconds) prometheus gauge. - load5 *prometheus.GaugeVec // Node load5 unix like (CPU seconds) prometheus gauge. - load15 *prometheus.GaugeVec // Node load15 unix like (CPU seconds) prometheus gauge. - fSFree *prometheus.GaugeVec // Node filesystem free space in bytes prometheus gauge. - fSUsed *prometheus.GaugeVec // Node filesystem used space in bytes prometheus gauge. - fSTotal *prometheus.GaugeVec // Node filesystem total space in bytes prometheus gauge. - fSAvail *prometheus.GaugeVec // Node filesystem available capacity in bytes prometheus gauge. - cpuInfo *prometheus.GaugeVec // Node CPU info prometheus gauge. - systemInfo *prometheus.GaugeVec // Node system info prometheus gauge. - time *prometheus.GaugeVec // Node time prometheus gauge. - localTime *prometheus.GaugeVec // Node localtime prometheus gauge. + state *TTLGaugeVec // Node state prometheus gauge. + uptime *TTLGaugeVec // Node uptime in seconds prometheus gauge. + cpus *TTLGaugeVec // Node CPU count prometheus gauge. + cpuUsage *TTLGaugeVec // Node CPU usage in percent prometheus gauge. + memBytes *TTLGaugeVec // Node total RAM capacity in bytes prometheus gauge. + memBytesUsed *TTLGaugeVec // Node RAM usage in bytes prometheus gauge. + memBytesFree *TTLGaugeVec // Node RAM free in bytes prometheus gauge. + ksmShared *TTLGaugeVec // Node Kernel samepage shared in bytes prometheus gauge. + cgroupMode *TTLGaugeVec // Node CGroups mode prometheus gauge. + load1 *TTLGaugeVec // Node load1 unix like (CPU seconds) prometheus gauge. 
+ load5 *TTLGaugeVec // Node load5 unix like (CPU seconds) prometheus gauge. + load15 *TTLGaugeVec // Node load15 unix like (CPU seconds) prometheus gauge. + fSFree *TTLGaugeVec // Node filesystem free space in bytes prometheus gauge. + fSUsed *TTLGaugeVec // Node filesystem used space in bytes prometheus gauge. + fSTotal *TTLGaugeVec // Node filesystem total space in bytes prometheus gauge. + fSAvail *TTLGaugeVec // Node filesystem available capacity in bytes prometheus gauge. + cpuInfo *TTLGaugeVec // Node CPU info prometheus gauge. + systemInfo *TTLGaugeVec // Node system info prometheus gauge. + time *TTLGaugeVec // Node time prometheus gauge. + localTime *TTLGaugeVec // Node localtime prometheus gauge. } // Create new instance of PVE cluster state collector. -func NewPveNodeStatusCollector(apiClient *proxmox.PveApiClient) *PveNodeStatusCollector { +func NewPveNodeStatusCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveNodeStatusCollector { c := PveNodeStatusCollector{apiClient: apiClient} + c.registry = registry // Node state. - c.state = promauto.NewGaugeVec( + c.state = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_state", Help: "Node state.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.state) // Node uptime. - c.uptime = promauto.NewGaugeVec( + c.uptime = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_uptime", Help: "Node uptime.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.uptime) // Node cpu count. - c.cpus = promauto.NewGaugeVec( + c.cpus = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_cpu_count", Help: "Node CPU count.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.cpus) // Node CPU usage. 
- c.cpuUsage = promauto.NewGaugeVec( + c.cpuUsage = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_cpu_usage", Help: "Cluster node CPU usage %.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.cpuUsage) // Node memory in bytes. - c.memBytes = promauto.NewGaugeVec( + c.memBytes = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_memory_total_bytes", Help: "Node total memory in bytes.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.memBytes) // Cluster node memory used in bytes. - c.memBytesUsed = promauto.NewGaugeVec( + c.memBytesUsed = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_memory_used_bytes", Help: "Node used memory in bytes.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.memBytesUsed) // Node memory free in bytes. - c.memBytesFree = promauto.NewGaugeVec( + c.memBytesFree = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_memory_free_bytes", Help: "Node free memory in bytes.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.memBytesFree) // Kernel samepage shared in bytes. - c.ksmShared = promauto.NewGaugeVec( + c.ksmShared = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_ksm_bytes", Help: "Node kernel samepage shares in bytes.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.ksmShared) // Node memory cgroup mode. - c.cgroupMode = promauto.NewGaugeVec( + c.cgroupMode = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_cgroup_mode", Help: "Node cgroup mode.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.cgroupMode) // Node load 1. - c.load1 = promauto.NewGaugeVec( + c.load1 = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_load1", Help: "Node CPU load 1 minute average.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.load1) // Node load 5. 
- c.load5 = promauto.NewGaugeVec( + c.load5 = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_load5", Help: "Node CPU load 5 minutes average.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.load5) // Cluster node load 15. - c.load15 = promauto.NewGaugeVec( + c.load15 = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_load15", Help: "Node CPU load 15 minutes average.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.load15) // Node root FS free bytes. - c.fSFree = promauto.NewGaugeVec( + c.fSFree = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_rootfs_free_bytes", Help: "Node RootFS free bytes.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.fSFree) // Node root filesystem used bytes. - c.fSUsed = promauto.NewGaugeVec( + c.fSUsed = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_rootfs_used_bytes", Help: "Node root filesystem used bytes.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.fSUsed) // Node root filesystem total bytes. - c.fSTotal = promauto.NewGaugeVec( + c.fSTotal = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_rootfs_total_bytes", Help: "Node root filesystem total bytes.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.fSTotal) // Node root filesystem avail bytes. - c.fSAvail = promauto.NewGaugeVec( + c.fSAvail = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_rootfs_avail_bytes", Help: "Node root filesystem avail bytes.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.fSAvail) // Node CPU info. - c.cpuInfo = promauto.NewGaugeVec( + c.cpuInfo = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_cpuinfo", Help: "Node CPU info.", }, []string{"cluster", "node", "flags", "cores", "model", "sockets", "cpus", "hvm"}, + 1*time.Minute, ) + c.registry.Register(c.cpuInfo) // Node system info metrics. 
- c.systemInfo = promauto.NewGaugeVec( + c.systemInfo = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_systeminfo", Help: "Node system info.", }, []string{"cluster", "node", "kversion", "pveversion", "machine", "sysname", "release"}, + 1*time.Minute, ) + c.registry.Register(c.systemInfo) // Node time info. - c.time = promauto.NewGaugeVec( + c.time = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_time", Help: "Node time.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.time) // Node localtime info. - c.localTime = promauto.NewGaugeVec( + c.localTime = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_localtime", Help: "Node localtime.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.localTime) return &c } @@ -229,27 +271,6 @@ func (c *PveNodeStatusCollector) CollectMetrics() error { return err } - c.state.Reset() - c.uptime.Reset() - c.cpus.Reset() - c.cpuUsage.Reset() - c.memBytes.Reset() - c.memBytesUsed.Reset() - c.memBytesFree.Reset() - c.ksmShared.Reset() - c.cgroupMode.Reset() - c.load1.Reset() - c.load5.Reset() - c.load15.Reset() - c.fSFree.Reset() - c.fSUsed.Reset() - c.fSTotal.Reset() - c.fSAvail.Reset() - c.cpuInfo.Reset() - c.systemInfo.Reset() - c.time.Reset() - c.localTime.Reset() - for _, node := range cluster.NodeStatuses { labels := prometheus.Labels{ "cluster": cluster.GetClusterName(), diff --git a/metrics/pve_node_storage_collector.go b/metrics/pve_node_storage_collector.go index 5ef4eff..068ade2 100644 --- a/metrics/pve_node_storage_collector.go +++ b/metrics/pve_node_storage_collector.go @@ -2,61 +2,71 @@ package metrics import ( "strconv" + "time" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" "lostak.dev/pve-exporter/proxmox" ) // PVE Storage state collector. type PveStorageCollector struct { apiClient *proxmox.PveApiClient // PVE API client instance. + registry *TTLRegistry // TTL metrics registry. 
- state *prometheus.GaugeVec // Storage state prometheus gauge. - total *prometheus.GaugeVec // Storage total bytes prometheus gauge. - avail *prometheus.GaugeVec // Storage available bytes prometheus gauge. - used *prometheus.GaugeVec // Storage used bytes prometheus gauge. + state *TTLGaugeVec // Storage state prometheus gauge. + total *TTLGaugeVec // Storage total bytes prometheus gauge. + avail *TTLGaugeVec // Storage available bytes prometheus gauge. + used *TTLGaugeVec // Storage used bytes prometheus gauge. } // Create new instance of PVE SDN collector. -func NewPveStorageCollector(apiClient *proxmox.PveApiClient) *PveStorageCollector { +func NewPveStorageCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveStorageCollector { c := PveStorageCollector{apiClient: apiClient} + c.registry = registry // Storage state. - c.state = promauto.NewGaugeVec( + c.state = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_storage_up", Help: "Node storage UP state.", }, []string{"cluster", "node", "storage", "type", "content", "shared"}, + 1*time.Minute, ) + c.registry.Register(c.state) // Storage total bytes. - c.total = promauto.NewGaugeVec( + c.total = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_storage_total_bytes", Help: "Node storage total capacity in bytes.", }, []string{"cluster", "node", "storage", "type", "content", "shared"}, + 1*time.Minute, ) + c.registry.Register(c.total) // Storage available bytes. - c.avail = promauto.NewGaugeVec( + c.avail = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_storage_avail_bytes", Help: "Node storage available capacity in bytes.", }, []string{"cluster", "node", "storage", "type", "content", "shared"}, + 1*time.Minute, ) + c.registry.Register(c.avail) // Storage used bytes. 
- c.used = promauto.NewGaugeVec( + c.used = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_storage_used_bytes", Help: "Node storage used capacity in bytes.", }, []string{"cluster", "node", "storage", "type", "content", "shared"}, + 1*time.Minute, ) + c.registry.Register(c.used) return &c } @@ -68,11 +78,6 @@ func (c *PveStorageCollector) CollectMetrics() error { return err } - c.state.Reset() - c.total.Reset() - c.avail.Reset() - c.used.Reset() - for _, node := range cluster.NodeStatuses { storages, err := c.apiClient.GetNodeStorages(node.Name) if err != nil { diff --git a/metrics/pve_node_subscription_collector.go b/metrics/pve_node_subscription_collector.go index 92892e7..2c57034 100644 --- a/metrics/pve_node_subscription_collector.go +++ b/metrics/pve_node_subscription_collector.go @@ -4,69 +4,80 @@ import ( "time" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" "lostak.dev/pve-exporter/proxmox" ) // PVE subscription state collector. type PveSubscriptionCollector struct { apiClient *proxmox.PveApiClient // PVE API client instance. + registry *TTLRegistry // TTL metrics registry. - info *prometheus.GaugeVec // Node subscription info prometheus gauge. - status *prometheus.GaugeVec // Node subscription status prometheus gauge. - nextDueDate *prometheus.GaugeVec // Node subscription next due date prometheus gauge. - regDate *prometheus.GaugeVec // Node subscription registration date prometheus gauge. - sockets *prometheus.GaugeVec // Node subscription sockets count prometheus gauge. + info *TTLGaugeVec // Node subscription info prometheus gauge. + status *TTLGaugeVec // Node subscription status prometheus gauge. + nextDueDate *TTLGaugeVec // Node subscription next due date prometheus gauge. + regDate *TTLGaugeVec // Node subscription registration date prometheus gauge. + sockets *TTLGaugeVec // Node subscription sockets count prometheus gauge. } // Create new instance of PVE cluster state collector. 
-func NewPveSubscriptionCollector(apiClient *proxmox.PveApiClient) *PveSubscriptionCollector { +func NewPveSubscriptionCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveSubscriptionCollector { c := PveSubscriptionCollector{apiClient: apiClient} + c.registry = registry // Node subscription info. - c.info = promauto.NewGaugeVec( + c.info = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_subscription_info", Help: "Node subscription info.", }, []string{"cluster", "node", "productname", "serverid"}, + 1*time.Minute, ) + c.registry.Register(c.info) // Node subscription status. - c.status = promauto.NewGaugeVec( + c.status = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_subscription_status", Help: "Node subscription status.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.status) // Node subscription registration date. - c.regDate = promauto.NewGaugeVec( + c.regDate = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_subscription_regdate", Help: "Node subscription registration date.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.regDate) // Node subscription next due date. - c.nextDueDate = promauto.NewGaugeVec( + c.nextDueDate = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_subscription_nextduedate", Help: "Node subscription next due date.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.nextDueDate) // Node subscription count of sockets. 
- c.sockets = promauto.NewGaugeVec( + c.sockets = NewTTLGaugeVec( prometheus.GaugeOpts{ Name: "pve_node_subscription_sockets", Help: "Node subscription count of sockets.", }, []string{"cluster", "node"}, + 1*time.Minute, ) + c.registry.Register(c.sockets) return &c } @@ -78,12 +89,6 @@ func (c *PveSubscriptionCollector) CollectMetrics() error { return err } - c.info.Reset() - c.status.Reset() - c.nextDueDate.Reset() - c.regDate.Reset() - c.sockets.Reset() - for _, node := range cluster.NodeStatuses { labels := prometheus.Labels{ "cluster": cluster.GetClusterName(), diff --git a/metrics/pve_node_virtual_machine_collector.go b/metrics/pve_node_virtual_machine_collector.go index da4e27e..e7a0bec 100644 --- a/metrics/pve_node_virtual_machine_collector.go +++ b/metrics/pve_node_virtual_machine_collector.go @@ -2,221 +2,261 @@ package metrics import ( "strconv" + "time" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" "lostak.dev/pve-exporter/proxmox" ) // PVE virtual machine collector. type PveVirtualMachineCollector struct { apiClient *proxmox.PveApiClient // PVE API client instance. + registry *TTLRegistry // TTL metrics registry. - state *prometheus.GaugeVec // Virtual machine state prometheus gauge. - uptime *prometheus.GaugeVec // Virtual machine uptime prometheus gauge. + state *TTLGaugeVec // Virtual machine state prometheus gauge. + uptime *TTLGaugeVec // Virtual machine uptime prometheus gauge. - cpu *prometheus.GaugeVec // Virtual machine count of CPUs prometheus gauge. - cpuUsage *prometheus.GaugeVec // Virtual machine CPU usage % prometheus gauge. + cpu *TTLGaugeVec // Virtual machine count of CPUs prometheus gauge. + cpuUsage *TTLGaugeVec // Virtual machine CPU usage % prometheus gauge. - memBytes *prometheus.GaugeVec // Virtual machine memory in bytes prometheus gauge. - memBytesUsed *prometheus.GaugeVec // Virtual machine memory usage in bytes prometheus gauge. 
+	memBytes     *TTLGaugeVec // Virtual machine memory in bytes prometheus gauge.
+	memBytesUsed *TTLGaugeVec // Virtual machine memory usage in bytes prometheus gauge.
 
-	disk    *prometheus.GaugeVec // Virtual machine disk space usage in bytes prometheus gauge.
-	diskMax *prometheus.GaugeVec // Virtual machine disk size in bytes prometheus gauge.
-	swap    *prometheus.GaugeVec // Virtual machine swap usage in bytes prometheus gauge.
+	disk    *TTLGaugeVec // Virtual machine disk space usage in bytes prometheus gauge.
+	diskMax *TTLGaugeVec // Virtual machine disk size in bytes prometheus gauge.
+	swap    *TTLGaugeVec // Virtual machine swap usage in bytes prometheus gauge.
 
-	netReceive  *prometheus.GaugeVec // Virtual machine network receive in bytes prometheus gauge.
-	netTransmit *prometheus.GaugeVec // Virtual machine network transmit in bytes prometheus gauge.
+	netReceive  *TTLGaugeVec // Virtual machine network receive in bytes prometheus gauge.
+	netTransmit *TTLGaugeVec // Virtual machine network transmit in bytes prometheus gauge.
 
-	diskReadOps  *prometheus.GaugeVec // Virtual machine disk read ops prometheus gauge.
-	diskWriteOps *prometheus.GaugeVec // Virtual machine disk write ops prometheus gauge.
+	diskReadOps  *TTLGaugeVec // Virtual machine disk read ops prometheus gauge.
+	diskWriteOps *TTLGaugeVec // Virtual machine disk write ops prometheus gauge.
 
-	diskReadBytes  *prometheus.GaugeVec // Virtual machine disk read bytes prometheus gauge.
-	diskWriteBytes *prometheus.GaugeVec // Virtual machine disk write bytes prometheus gauge.
+	diskReadBytes  *TTLGaugeVec // Virtual machine disk read bytes prometheus gauge.
+	diskWriteBytes *TTLGaugeVec // Virtual machine disk write bytes prometheus gauge.
 
-	diskReadTimeNs  *prometheus.GaugeVec // Virtual machine disk read time total prometheus gauge.
-	diskWriteTimeNs *prometheus.GaugeVec // Virtual machine disk write time total prometheus gauge.
+	diskReadTimeNs  *TTLGaugeVec // Virtual machine disk read time total prometheus gauge.
+	diskWriteTimeNs *TTLGaugeVec // Virtual machine disk write time total prometheus gauge.
 
-	diskFailedReadOps  *prometheus.GaugeVec // Virtual machine disk failed read ops prometheus gauge.
-	diskFailedWriteOps *prometheus.GaugeVec // Virtual machine disk failed write ops prometheus gauge.
+	diskFailedReadOps  *TTLGaugeVec // Virtual machine disk failed read ops prometheus gauge.
+	diskFailedWriteOps *TTLGaugeVec // Virtual machine disk failed write ops prometheus gauge.
 
-	agent *prometheus.GaugeVec // Virtual machine agent enabled prometheus gauge.
+	agent *TTLGaugeVec // Virtual machine agent enabled prometheus gauge.
 }
 
 // Create new instance of PVE virtual machine collector.
-func NewPveVirtualMachineCollector(apiClient *proxmox.PveApiClient) *PveVirtualMachineCollector {
+func NewPveVirtualMachineCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveVirtualMachineCollector {
 	c := PveVirtualMachineCollector{apiClient: apiClient}
+	c.registry = registry
 
 	// Virtual machine state.
-	c.state = promauto.NewGaugeVec(
+	c.state = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_state",
 			Help: "Virtual machine state.",
 		},
 		[]string{"cluster", "node", "vmid", "name"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.state)
 
 	// Virtual machine uptime.
-	c.uptime = promauto.NewGaugeVec(
+	c.uptime = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_uptime",
 			Help: "Virtual machine uptime.",
 		},
 		[]string{"cluster", "node", "vmid", "name"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.uptime)
 
 	// Virtual machine agent state.
-	c.agent = promauto.NewGaugeVec(
+	c.agent = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_agent",
 			Help: "Virtual machine agent state.",
 		},
 		[]string{"cluster", "node", "vmid", "name"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.agent)
 
 	// Virtual machine CPU count.
-	c.cpu = promauto.NewGaugeVec(
+	c.cpu = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_cpu_count",
 			Help: "Virtual machine CPU count.",
 		},
 		[]string{"cluster", "node", "vmid", "name"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.cpu)
 
 	// Virtual machine CPU usage.
-	c.cpuUsage = promauto.NewGaugeVec(
+	c.cpuUsage = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_cpu_usage",
 			Help: "Virtual machine CPU usage.",
 		},
 		[]string{"cluster", "node", "vmid", "name"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.cpuUsage)
 
 	// Virtual machine memory total.
-	c.memBytes = promauto.NewGaugeVec(
+	c.memBytes = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_mem_total_bytes",
 			Help: "Virtual machine total memory in bytes.",
 		},
 		[]string{"cluster", "node", "vmid", "name"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.memBytes)
 
 	// Virtual machine memory usage.
-	c.memBytesUsed = promauto.NewGaugeVec(
+	c.memBytesUsed = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_mem_used_bytes",
 			Help: "Virtual machine used memory in bytes.",
 		},
 		[]string{"cluster", "node", "vmid", "name"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.memBytesUsed)
 
 	// Virtual machine disk size.
-	c.disk = promauto.NewGaugeVec(
+	c.disk = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_disk_usage_bytes",
 			Help: "Virtual machine disk read bytes.",
 		},
 		[]string{"cluster", "node", "vmid", "name"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.disk)
 
 	// Virtual machine disk size.
-	c.diskMax = promauto.NewGaugeVec(
+	c.diskMax = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_disk_size_bytes",
 			Help: "Virtual machine disk size bytes.",
 		},
 		[]string{"cluster", "node", "vmid", "name"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.diskMax)
 
 	// Virtual machine network receive bytes.
-	c.netReceive = promauto.NewGaugeVec(
+	c.netReceive = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_network_in_bytes",
 			Help: "Virtual machine network receive in bytes.",
 		},
 		[]string{"cluster", "node", "vmid", "name", "interface"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.netReceive)
 
 	// Virtual machine network transmit bytes.
-	c.netTransmit = promauto.NewGaugeVec(
+	c.netTransmit = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_network_out_bytes",
 			Help: "Virtual machine network transmit in bytes.",
 		},
 		[]string{"cluster", "node", "vmid", "name", "interface"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.netTransmit)
 
 	// Virtual machine disk read ops.
-	c.diskReadOps = promauto.NewGaugeVec(
+	c.diskReadOps = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_disk_rd_operations",
 			Help: "Virtual machine disk read ops.",
 		},
 		[]string{"cluster", "node", "vmid", "name", "device"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.diskReadOps)
 
 	// Virtual machine disk write ops.
-	c.diskWriteOps = promauto.NewGaugeVec(
+	c.diskWriteOps = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_disk_wr_operations",
 			Help: "Virtual machine disk write ops.",
 		},
 		[]string{"cluster", "node", "vmid", "name", "device"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.diskWriteOps)
 
 	// Virtual machine disk read bytes.
-	c.diskReadBytes = promauto.NewGaugeVec(
+	c.diskReadBytes = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_disk_rd_bytes",
 			Help: "Virtual machine disk read bytes.",
 		},
 		[]string{"cluster", "node", "vmid", "name", "device"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.diskReadBytes)
 
 	// Virtual machine disk write bytes.
-	c.diskWriteBytes = promauto.NewGaugeVec(
+	c.diskWriteBytes = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_disk_wr_bytes",
 			Help: "Virtual machine disk write bytes.",
 		},
 		[]string{"cluster", "node", "vmid", "name", "device"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.diskWriteBytes)
 
 	// Virtual machine failed disk read ops.
-	c.diskFailedReadOps = promauto.NewGaugeVec(
+	c.diskFailedReadOps = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_disk_failed_rd_ops",
 			Help: "Virtual machine failed disk read ops.",
 		},
 		[]string{"cluster", "node", "vmid", "name", "device"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.diskFailedReadOps)
 
 	// Virtual machine failed disk write ops.
-	c.diskFailedWriteOps = promauto.NewGaugeVec(
+	c.diskFailedWriteOps = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_disk_failed_wr_ops",
 			Help: "Virtual machine failed disk write ops.",
 		},
 		[]string{"cluster", "node", "vmid", "name", "device"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.diskFailedWriteOps)
 
 	// Virtual machine disk read time total nanoseconds.
-	c.diskReadTimeNs = promauto.NewGaugeVec(
+	c.diskReadTimeNs = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_disk_rd_time_total_ns",
 			Help: "Virtual machine disk read time total in nanoseconds.",
 		},
 		[]string{"cluster", "node", "vmid", "name", "device"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.diskReadTimeNs)
 
 	// Virtual machine disk write time total nanoseconds.
-	c.diskWriteTimeNs = promauto.NewGaugeVec(
+	c.diskWriteTimeNs = NewTTLGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "pve_vm_disk_wr_time_total_ns",
 			Help: "Virtual machine disk write time total in nanoseconds.",
 		},
 		[]string{"cluster", "node", "vmid", "name", "device"},
+		1*time.Minute,
 	)
+	c.registry.Register(c.diskWriteTimeNs)
 
 	return &c
 }
 
@@ -228,24 +268,6 @@ func (c *PveVirtualMachineCollector) CollectMetrics() error {
 		return err
 	}
 
-	c.state.Reset()
-	c.cpu.Reset()
-	c.memBytes.Reset()
-	c.diskMax.Reset()
-	c.uptime.Reset()
-	c.cpuUsage.Reset()
-	c.memBytesUsed.Reset()
-	c.netReceive.Reset()
-	c.netTransmit.Reset()
-	c.diskReadOps.Reset()
-	c.diskWriteOps.Reset()
-	c.diskReadBytes.Reset()
-	c.diskWriteBytes.Reset()
-	c.diskFailedReadOps.Reset()
-	c.diskFailedWriteOps.Reset()
-	c.diskReadTimeNs.Reset()
-	c.diskWriteTimeNs.Reset()
-
 	for _, node := range cluster.NodeStatuses {
 		qemus, err := c.apiClient.GetNodeQemuList(node.Name)
 		if err != nil {
diff --git a/metrics/pve_ttl_metrics.go b/metrics/pve_ttl_metrics.go
new file mode 100644
index 0000000..03177a2
--- /dev/null
+++ b/metrics/pve_ttl_metrics.go
@@ -0,0 +1,144 @@
+package metrics
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
+)
+
+// TTLMetric is an interface for metrics that support time-to-live cleanup.
+// Any metric that implements Cleanup() can be registered in the TTLRegistry.
+type TTLMetric interface {
+	// Cleanup removes stale metric label sets that have exceeded their TTL.
+	Cleanup()
+}
+
+// ttlEntry pairs a label set with the time it was last written.
+// Storing the labels themselves (instead of re-parsing the map key)
+// keeps Cleanup correct for label values that contain "," or "=".
+type ttlEntry struct {
+	labels     prometheus.Labels // Label set as passed to With/Delete.
+	lastUpdate time.Time         // Time of the most recent With call.
+}
+
+// TTLGaugeVec wraps a Prometheus GaugeVec and tracks the last update time
+// for each set of labels. When a set of labels is not updated within the TTL,
+// it is automatically removed from the underlying GaugeVec.
+type TTLGaugeVec struct {
+	gaugeVec *prometheus.GaugeVec // Underlying Prometheus GaugeVec.
+	ttl      time.Duration        // Duration after which an unused label set is considered stale.
+	entries  sync.Map             // Map of label-set key -> *ttlEntry.
+}
+
+// NewTTLGaugeVec creates a new TTLGaugeVec using the provided GaugeOpts, label names, and TTL.
+// The underlying GaugeVec is registered using promauto.
+func NewTTLGaugeVec(opts prometheus.GaugeOpts, labelNames []string, ttl time.Duration) *TTLGaugeVec {
+	return &TTLGaugeVec{
+		gaugeVec: promauto.NewGaugeVec(opts, labelNames),
+		ttl:      ttl,
+	}
+}
+
+// With returns the gauge for the given label set and records the current time
+// as the last update for those labels.
+func (t *TTLGaugeVec) With(labels prometheus.Labels) prometheus.Gauge {
+	t.entries.Store(labelsKey(labels), &ttlEntry{labels: labels, lastUpdate: time.Now()})
+	return t.gaugeVec.With(labels)
+}
+
+// Delete removes the metric associated with the given label set from both the underlying
+// GaugeVec and the tracking map. It returns true if the deletion was successful.
+func (t *TTLGaugeVec) Delete(labels prometheus.Labels) bool {
+	t.entries.Delete(labelsKey(labels))
+	return t.gaugeVec.Delete(labels)
+}
+
+// Cleanup deletes every tracked label set that has not been updated within
+// the TTL duration.
+func (t *TTLGaugeVec) Cleanup() {
+	now := time.Now()
+	t.entries.Range(func(key, value interface{}) bool {
+		entry, ok := value.(*ttlEntry)
+		if ok && now.Sub(entry.lastUpdate) > t.ttl {
+			// Use the stored labels; no lossy round-trip through the string key.
+			t.gaugeVec.Delete(entry.labels)
+			t.entries.Delete(key)
+		}
+		return true
+	})
+}
+
+// labelsKey creates a deterministic key from a Prometheus labels map.
+// Keys are sorted and concatenated as "key=value" pairs separated by commas.
+// Separator characters are escaped so distinct label sets cannot collide.
+func labelsKey(labels prometheus.Labels) string {
+	keys := make([]string, 0, len(labels))
+	for k := range labels {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	parts := make([]string, 0, len(keys))
+	for _, k := range keys {
+		parts = append(parts, fmt.Sprintf("%s=%s", escapeLabel(k), escapeLabel(labels[k])))
+	}
+	return strings.Join(parts, ",")
+}
+
+// escapeLabel escapes the separator characters used by labelsKey.
+func escapeLabel(s string) string {
+	s = strings.ReplaceAll(s, `\`, `\\`)
+	s = strings.ReplaceAll(s, ",", `\,`)
+	s = strings.ReplaceAll(s, "=", `\=`)
+	return s
+}
+
+// TTLRegistry manages multiple TTLMetric instances and periodically cleans them up.
+type TTLRegistry struct {
+	mu       sync.RWMutex // Guards metrics.
+	metrics  []TTLMetric  // Registered metrics to clean up.
+	loopOnce sync.Once    // Ensures only one cleanup goroutine is started.
+}
+
+// NewTTLRegistry creates and returns a new TTLRegistry.
+func NewTTLRegistry() *TTLRegistry {
+	return &TTLRegistry{
+		metrics: make([]TTLMetric, 0),
+	}
+}
+
+// Register adds a TTLMetric to the registry for periodic cleanup.
+func (r *TTLRegistry) Register(metric TTLMetric) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.metrics = append(r.metrics, metric)
+}
+
+// Cleanup calls the Cleanup method on each registered TTLMetric.
+func (r *TTLRegistry) Cleanup() {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+	for _, metric := range r.metrics {
+		metric.Cleanup()
+	}
+}
+
+// StartCleanupLoop starts a background goroutine that periodically cleans up
+// stale metrics at the given interval. Multiple collectors call this from their
+// constructors, so it is idempotent: only the first call starts the loop (the
+// interval of that first call wins). The goroutine runs for the process lifetime.
+func (r *TTLRegistry) StartCleanupLoop(interval time.Duration) {
+	r.loopOnce.Do(func() {
+		go func() {
+			ticker := time.NewTicker(interval)
+			defer ticker.Stop()
+			for range ticker.C {
+				r.Cleanup()
+			}
+		}()
+	})
+}