Metrics now have expiration if not updated

This commit is contained in:
Jan Lošťák
2025-02-22 21:24:03 +01:00
parent 2ed310eef7
commit f78df9d3e3
10 changed files with 455 additions and 232 deletions

View File

@@ -1,40 +1,48 @@
package metrics package metrics
import ( import (
"time"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"lostak.dev/pve-exporter/proxmox" "lostak.dev/pve-exporter/proxmox"
) )
// PVE cluster state collector. // PVE cluster state collector.
type PveClusterStateCollector struct { type PveClusterStateCollector struct {
apiClient *proxmox.PveApiClient // PVE API client instance. apiClient *proxmox.PveApiClient // PVE API client instance.
registry *TTLRegistry // TTL metrics registry.
nodes *prometheus.GaugeVec // Count of nodes prometheus gauge. nodes *TTLGaugeVec // Count of nodes prometheus gauge.
quorate *prometheus.GaugeVec // Cluster quorum state prometheus gauge. quorate *TTLGaugeVec // Cluster quorum state prometheus gauge.
} }
// Create new instance of PVE cluster state collector. // Create new instance of PVE cluster state collector.
func NewPveClusterStateCollector(apiClient *proxmox.PveApiClient) *PveClusterStateCollector { func NewPveClusterStateCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveClusterStateCollector {
c := PveClusterStateCollector{apiClient: apiClient} c := PveClusterStateCollector{apiClient: apiClient}
c.registry = registry
// Cluster meta gauge. // Cluster meta gauge.
c.nodes = promauto.NewGaugeVec( c.nodes = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_cluster_nodes", Name: "pve_cluster_nodes",
Help: "Cluster nodes count.", Help: "Cluster nodes count.",
}, },
[]string{"cluster"}, []string{"cluster"},
1*time.Minute,
) )
c.registry.Register(c.nodes)
// Cluster quorate gauge. // Cluster quorate gauge.
c.quorate = promauto.NewGaugeVec( c.quorate = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_cluster_quorate", Name: "pve_cluster_quorate",
Help: "Cluster quorum state.", Help: "Cluster quorum state.",
}, },
[]string{"cluster"}, []string{"cluster"},
1*time.Minute,
) )
c.registry.Register(c.quorate)
c.registry.StartCleanupLoop(5 * time.Second)
return &c return &c
} }
@@ -46,9 +54,6 @@ func (c *PveClusterStateCollector) CollectMetrics() error {
return err return err
} }
c.nodes.Reset()
c.quorate.Reset()
l := prometheus.Labels{"cluster": cluster.Name} l := prometheus.Labels{"cluster": cluster.Name}
c.nodes.With(l).Set(float64(cluster.Nodes)) c.nodes.With(l).Set(float64(cluster.Nodes))
c.quorate.With(l).Set(float64(cluster.Quorate)) c.quorate.With(l).Set(float64(cluster.Quorate))

View File

@@ -21,6 +21,7 @@ type PveMetricsCollector interface {
type PveMetricsManager struct { type PveMetricsManager struct {
apiClient *proxmox.PveApiClient // Proxmox virtual environment API client instance. apiClient *proxmox.PveApiClient // Proxmox virtual environment API client instance.
collectors []PveMetricsCollector // Metrics collector instances. collectors []PveMetricsCollector // Metrics collector instances.
registry *TTLRegistry // Registry which handles automatic deletion of dangling metrics.
latencySummary *prometheus.SummaryVec // Collection latency summary. latencySummary *prometheus.SummaryVec // Collection latency summary.
interval int // Collection interval. interval int // Collection interval.
@@ -32,45 +33,46 @@ type PveMetricsManager struct {
func NewPveMetricsManager(apiClient *proxmox.PveApiClient, conf *configuration.PveConfiguration) *PveMetricsManager { func NewPveMetricsManager(apiClient *proxmox.PveApiClient, conf *configuration.PveConfiguration) *PveMetricsManager {
c := PveMetricsManager{apiClient: apiClient, interval: conf.Interval} c := PveMetricsManager{apiClient: apiClient, interval: conf.Interval}
metricsCf := conf.Metrics metricsCf := conf.Metrics
c.registry = NewTTLRegistry()
// Cluster state metrics collector. // Cluster state metrics collector.
if metricsCf.ClusterState { if metricsCf.ClusterState {
c.RegisterCollector(NewPveClusterStateCollector(apiClient)) c.RegisterCollector(NewPveClusterStateCollector(apiClient, c.registry))
} }
// Node state metrics collector. // Node state metrics collector.
if metricsCf.NodeStatus { if metricsCf.NodeStatus {
c.RegisterCollector(NewPveNodeStatusCollector(apiClient)) c.RegisterCollector(NewPveNodeStatusCollector(apiClient, c.registry))
} }
// Node subscription state collector. // Node subscription state collector.
if metricsCf.Subscription { if metricsCf.Subscription {
c.RegisterCollector(NewPveSubscriptionCollector(apiClient)) c.RegisterCollector(NewPveSubscriptionCollector(apiClient, c.registry))
} }
// Node disk collector. // Node disk collector.
if metricsCf.Disk { if metricsCf.Disk {
c.RegisterCollector(NewPveNodeDiskCollector(apiClient)) c.RegisterCollector(NewPveNodeDiskCollector(apiClient, c.registry))
} }
// Node SDN collector. // Node SDN collector.
if metricsCf.SDN { if metricsCf.SDN {
c.RegisterCollector(NewPveSdnCollector(apiClient)) c.RegisterCollector(NewPveSdnCollector(apiClient, c.registry))
} }
// Node storage collector. // Node storage collector.
if metricsCf.Storage { if metricsCf.Storage {
c.RegisterCollector(NewPveStorageCollector(apiClient)) c.RegisterCollector(NewPveStorageCollector(apiClient, c.registry))
} }
// Node container collector. // Node container collector.
if metricsCf.LXC { if metricsCf.LXC {
c.RegisterCollector(NewPveContainerCollector(apiClient)) c.RegisterCollector(NewPveContainerCollector(apiClient, c.registry))
} }
// Node virtual machine collector. // Node virtual machine collector.
if metricsCf.QEMU { if metricsCf.QEMU {
c.RegisterCollector(NewPveVirtualMachineCollector(apiClient)) c.RegisterCollector(NewPveVirtualMachineCollector(apiClient, c.registry))
} }
// Metrics collection latency summary. // Metrics collection latency summary.
@@ -79,6 +81,8 @@ func NewPveMetricsManager(apiClient *proxmox.PveApiClient, conf *configuration.P
Help: "Summary of metrics collection latency milliseconds from PVE API.", Help: "Summary of metrics collection latency milliseconds from PVE API.",
}, []string{"collector"}) }, []string{"collector"})
c.registry.StartCleanupLoop(5 * time.Second)
return &c return &c
} }

View File

@@ -1,155 +1,184 @@
package metrics package metrics
import ( import (
"time"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"lostak.dev/pve-exporter/proxmox" "lostak.dev/pve-exporter/proxmox"
) )
// PVE container collector. // PVE container collector.
type PveContainerCollector struct { type PveContainerCollector struct {
apiClient *proxmox.PveApiClient // PVE API client instance. apiClient *proxmox.PveApiClient // PVE API client instance.
registry *TTLRegistry // TTL metrics registry.
state *prometheus.GaugeVec // Container state prometheus gauge. state *TTLGaugeVec // Container state prometheus gauge.
uptime *prometheus.GaugeVec // Container uptime prometheus gauge. uptime *TTLGaugeVec // Container uptime prometheus gauge.
cpu *prometheus.GaugeVec // Container count of CPUs prometheus gauge. cpu *TTLGaugeVec // Container count of CPUs prometheus gauge.
cpuUsage *prometheus.GaugeVec // Container CPU usage % prometheus gauge. cpuUsage *TTLGaugeVec // Container CPU usage % prometheus gauge.
memBytes *prometheus.GaugeVec // Container memory in bytes prometheus gauge. memBytes *TTLGaugeVec // Container memory in bytes prometheus gauge.
memBytesUsed *prometheus.GaugeVec // Container memory usage in bytes prometheus gauge. memBytesUsed *TTLGaugeVec // Container memory usage in bytes prometheus gauge.
netReceive *prometheus.GaugeVec // Container network RX in bytes prometheus gauge. netReceive *TTLGaugeVec // Container network RX in bytes prometheus gauge.
netTransmit *prometheus.GaugeVec // Container network TX in bytes prometheus gauge. netTransmit *TTLGaugeVec // Container network TX in bytes prometheus gauge.
diskWrite *prometheus.GaugeVec // Container disk written in bytes prometheus gauge. diskWrite *TTLGaugeVec // Container disk written in bytes prometheus gauge.
diskRead *prometheus.GaugeVec // Container disk read in bytes prometheus gauge. diskRead *TTLGaugeVec // Container disk read in bytes prometheus gauge.
disk *prometheus.GaugeVec // Container disk space usage in bytes prometheus gauge. disk *TTLGaugeVec // Container disk space usage in bytes prometheus gauge.
diskMax *prometheus.GaugeVec // Container disk size in bytes prometheus gauge. diskMax *TTLGaugeVec // Container disk size in bytes prometheus gauge.
swap *prometheus.GaugeVec // Container swap usage in bytes prometheus gauge. swap *TTLGaugeVec // Container swap usage in bytes prometheus gauge.
} }
// Create new instance of PVE container collector. // Create new instance of PVE container collector.
func NewPveContainerCollector(apiClient *proxmox.PveApiClient) *PveContainerCollector { func NewPveContainerCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveContainerCollector {
c := PveContainerCollector{apiClient: apiClient} c := PveContainerCollector{apiClient: apiClient}
c.registry = registry
// Container state. // Container state.
c.state = promauto.NewGaugeVec( c.state = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_ct_state", Name: "pve_ct_state",
Help: "Container state.", Help: "Container state.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.state)
// Container uptime. // Container uptime.
c.uptime = promauto.NewGaugeVec( c.uptime = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_ct_uptime", Name: "pve_ct_uptime",
Help: "Container uptime.", Help: "Container uptime.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.uptime)
// Container CPU count. // Container CPU count.
c.cpu = promauto.NewGaugeVec( c.cpu = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_ct_cpu_count", Name: "pve_ct_cpu_count",
Help: "Container CPU count.", Help: "Container CPU count.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.cpu)
// Container CPU usage. // Container CPU usage.
c.cpuUsage = promauto.NewGaugeVec( c.cpuUsage = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_ct_cpu_usage", Name: "pve_ct_cpu_usage",
Help: "Container CPU usage.", Help: "Container CPU usage.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.cpuUsage)
// Container memory total. // Container memory total.
c.memBytes = promauto.NewGaugeVec( c.memBytes = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_ct_mem_total_bytes", Name: "pve_ct_mem_total_bytes",
Help: "Container total memory in bytes.", Help: "Container total memory in bytes.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.memBytes)
// Container memory usage. // Container memory usage.
c.memBytesUsed = promauto.NewGaugeVec( c.memBytesUsed = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_ct_mem_used_bytes", Name: "pve_ct_mem_used_bytes",
Help: "Container used memory in bytes.", Help: "Container used memory in bytes.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.memBytesUsed)
// Container network RX. // Container network RX.
c.netReceive = promauto.NewGaugeVec( c.netReceive = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_ct_network_in_bytes", Name: "pve_ct_network_in_bytes",
Help: "Container network RX bytes.", Help: "Container network RX bytes.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.netReceive)
// Container network TX. // Container network TX.
c.netTransmit = promauto.NewGaugeVec( c.netTransmit = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_ct_network_out_bytes", Name: "pve_ct_network_out_bytes",
Help: "Container network TX bytes.", Help: "Container network TX bytes.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.netTransmit)
// Container disk written. // Container disk written.
c.diskWrite = promauto.NewGaugeVec( c.diskWrite = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_ct_disk_wr_bytes", Name: "pve_ct_disk_wr_bytes",
Help: "Container disk written bytes.", Help: "Container disk written bytes.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.diskWrite)
// Container disk read. // Container disk read.
c.diskRead = promauto.NewGaugeVec( c.diskRead = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_ct_disk_rd_bytes", Name: "pve_ct_disk_rd_bytes",
Help: "Container disk read bytes.", Help: "Container disk read bytes.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.diskRead)
// Container disk size. // Container disk size.
c.disk = promauto.NewGaugeVec( c.disk = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_ct_disk_usage_bytes", Name: "pve_ct_disk_usage_bytes",
Help: "Container disk read bytes.", Help: "Container disk read bytes.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.disk)
// Container disk size. // Container disk size.
c.diskMax = promauto.NewGaugeVec( c.diskMax = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_ct_disk_size_bytes", Name: "pve_ct_disk_size_bytes",
Help: "Container disk size bytes.", Help: "Container disk size bytes.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.diskMax)
// Container swap usage. // Container swap usage.
c.swap = promauto.NewGaugeVec( c.swap = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_ct_swap_used_bytes", Name: "pve_ct_swap_used_bytes",
Help: "Container swap usage bytes.", Help: "Container swap usage bytes.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.swap)
return &c return &c
} }
@@ -161,20 +190,6 @@ func (c *PveContainerCollector) CollectMetrics() error {
return err return err
} }
c.state.Reset()
c.cpu.Reset()
c.memBytes.Reset()
c.diskMax.Reset()
c.uptime.Reset()
c.cpuUsage.Reset()
c.memBytesUsed.Reset()
c.netReceive.Reset()
c.netTransmit.Reset()
c.diskRead.Reset()
c.diskWrite.Reset()
c.disk.Reset()
c.swap.Reset()
for _, node := range cluster.NodeStatuses { for _, node := range cluster.NodeStatuses {
containers, err := c.apiClient.GetNodeContainerList(node.Name) containers, err := c.apiClient.GetNodeContainerList(node.Name)
if err != nil { if err != nil {

View File

@@ -3,51 +3,59 @@ package metrics
import ( import (
"strconv" "strconv"
"strings" "strings"
"time"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"lostak.dev/pve-exporter/proxmox" "lostak.dev/pve-exporter/proxmox"
) )
// PVE cluster state collector. // PVE cluster state collector.
type PveNodeDiskCollector struct { type PveNodeDiskCollector struct {
apiClient *proxmox.PveApiClient // PVE API client instance. apiClient *proxmox.PveApiClient // PVE API client instance.
registry *TTLRegistry // TTL metrics registry.
healthy *prometheus.GaugeVec // Node disk SMART passed state prometheus gauge. healthy *TTLGaugeVec // Node disk SMART passed state prometheus gauge.
wearout *prometheus.GaugeVec // Node disk wearout % prometheus gauge. wearout *TTLGaugeVec // Node disk wearout % prometheus gauge.
sizeBytes *prometheus.GaugeVec // Node disk size in bytes prometheus gauge. sizeBytes *TTLGaugeVec // Node disk size in bytes prometheus gauge.
} }
// Create new instance of PVE cluster state collector. // Create new instance of PVE cluster state collector.
func NewPveNodeDiskCollector(apiClient *proxmox.PveApiClient) *PveNodeDiskCollector { func NewPveNodeDiskCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveNodeDiskCollector {
c := PveNodeDiskCollector{apiClient: apiClient} c := PveNodeDiskCollector{apiClient: apiClient}
c.registry = registry
// Node disk healthy state. // Node disk healthy state.
c.healthy = promauto.NewGaugeVec( c.healthy = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_disk_healthy", Name: "pve_node_disk_healthy",
Help: "Node disk healthy state.", Help: "Node disk healthy state.",
}, },
[]string{"cluster", "node", "wwn", "type", "model", "serial", "vendor", "used", "osd_id"}, []string{"cluster", "node", "wwn", "type", "model", "serial", "vendor", "used", "osd_id"},
1*time.Minute,
) )
c.registry.Register(c.healthy)
// Node disk wearout. // Node disk wearout.
c.wearout = promauto.NewGaugeVec( c.wearout = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_disk_wearout", Name: "pve_node_disk_wearout",
Help: "Node disk wearout percent.", Help: "Node disk wearout percent.",
}, },
[]string{"cluster", "node", "wwn", "type", "model", "serial", "vendor", "used", "osd_id"}, []string{"cluster", "node", "wwn", "type", "model", "serial", "vendor", "used", "osd_id"},
1*time.Minute,
) )
c.registry.Register(c.healthy)
// Node disk size in bytes. // Node disk size in bytes.
c.sizeBytes = promauto.NewGaugeVec( c.sizeBytes = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_disk_size_bytes", Name: "pve_node_disk_size_bytes",
Help: "Node disk size in bytes.", Help: "Node disk size in bytes.",
}, },
[]string{"cluster", "node", "wwn", "type", "model", "serial", "vendor", "used", "osd_id"}, []string{"cluster", "node", "wwn", "type", "model", "serial", "vendor", "used", "osd_id"},
1*time.Minute,
) )
c.registry.Register(c.sizeBytes)
return &c return &c
} }
@@ -59,10 +67,6 @@ func (c *PveNodeDiskCollector) CollectMetrics() error {
return err return err
} }
c.healthy.Reset()
c.wearout.Reset()
c.sizeBytes.Reset()
for _, node := range cluster.NodeStatuses { for _, node := range cluster.NodeStatuses {
disks, err := c.apiClient.GetNodeDisksList(node.Name) disks, err := c.apiClient.GetNodeDisksList(node.Name)
if err != nil { if err != nil {

View File

@@ -1,30 +1,35 @@
package metrics package metrics
import ( import (
"time"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"lostak.dev/pve-exporter/proxmox" "lostak.dev/pve-exporter/proxmox"
) )
// PVE SDN state collector. // PVE SDN state collector.
type PveSdnCollector struct { type PveSdnCollector struct {
apiClient *proxmox.PveApiClient // PVE API client instance. apiClient *proxmox.PveApiClient // PVE API client instance.
registry *TTLRegistry // TTL metrics registry.
state *prometheus.GaugeVec // SDN state prometheus gauge. state *TTLGaugeVec // SDN state prometheus gauge.
} }
// Create new instance of PVE SDN collector. // Create new instance of PVE SDN collector.
func NewPveSdnCollector(apiClient *proxmox.PveApiClient) *PveSdnCollector { func NewPveSdnCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveSdnCollector {
c := PveSdnCollector{apiClient: apiClient} c := PveSdnCollector{apiClient: apiClient}
c.registry = registry
// SDN Up state. // SDN Up state.
c.state = promauto.NewGaugeVec( c.state = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_sdn_state", Name: "pve_sdn_state",
Help: "Node software defined network state.", Help: "Node software defined network state.",
}, },
[]string{"cluster", "node", "sdn", "sdn_id"}, []string{"cluster", "node", "sdn", "sdn_id"},
1*time.Minute,
) )
c.registry.Register(c.state)
return &c return &c
} }
@@ -41,8 +46,6 @@ func (c *PveSdnCollector) CollectMetrics() error {
return err return err
} }
c.state.Reset()
for _, node := range cluster.NodeStatuses { for _, node := range cluster.NodeStatuses {
sdns := resources.FindNodeSDN(node.Name) sdns := resources.FindNodeSDN(node.Name)
if len(*sdns) > 0 { if len(*sdns) > 0 {

View File

@@ -2,9 +2,9 @@ package metrics
import ( import (
"strconv" "strconv"
"time"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"lostak.dev/pve-exporter/proxmox" "lostak.dev/pve-exporter/proxmox"
) )
@@ -12,212 +12,254 @@ import (
// PVE cluster state collector. // PVE cluster state collector.
type PveNodeStatusCollector struct { type PveNodeStatusCollector struct {
apiClient *proxmox.PveApiClient // PVE API client instance. apiClient *proxmox.PveApiClient // PVE API client instance.
registry *TTLRegistry // TTL metrics registry.
state *prometheus.GaugeVec // Node state prometheus gauge. state *TTLGaugeVec // Node state prometheus gauge.
uptime *prometheus.GaugeVec // Node uptime in seconds prometheus gauge. uptime *TTLGaugeVec // Node uptime in seconds prometheus gauge.
cpus *prometheus.GaugeVec // Node CPU count prometheus gauge. cpus *TTLGaugeVec // Node CPU count prometheus gauge.
cpuUsage *prometheus.GaugeVec // Node CPU usage in percent prometheus gauge. cpuUsage *TTLGaugeVec // Node CPU usage in percent prometheus gauge.
memBytes *prometheus.GaugeVec // Node total RAM capacity in bytes prometheus gauge. memBytes *TTLGaugeVec // Node total RAM capacity in bytes prometheus gauge.
memBytesUsed *prometheus.GaugeVec // Node RAM usage in bytes prometheus gauge. memBytesUsed *TTLGaugeVec // Node RAM usage in bytes prometheus gauge.
memBytesFree *prometheus.GaugeVec // Node RAM free in bytes prometheus gauge. memBytesFree *TTLGaugeVec // Node RAM free in bytes prometheus gauge.
ksmShared *prometheus.GaugeVec // Node Kernel samepage shared in bytes prometheus gauge. ksmShared *TTLGaugeVec // Node Kernel samepage shared in bytes prometheus gauge.
cgroupMode *prometheus.GaugeVec // Node CGroups mode prometheus gauge. cgroupMode *TTLGaugeVec // Node CGroups mode prometheus gauge.
load1 *prometheus.GaugeVec // Node load1 unix like (CPU seconds) prometheus gauge. load1 *TTLGaugeVec // Node load1 unix like (CPU seconds) prometheus gauge.
load5 *prometheus.GaugeVec // Node load5 unix like (CPU seconds) prometheus gauge. load5 *TTLGaugeVec // Node load5 unix like (CPU seconds) prometheus gauge.
load15 *prometheus.GaugeVec // Node load15 unix like (CPU seconds) prometheus gauge. load15 *TTLGaugeVec // Node load15 unix like (CPU seconds) prometheus gauge.
fSFree *prometheus.GaugeVec // Node filesystem free space in bytes prometheus gauge. fSFree *TTLGaugeVec // Node filesystem free space in bytes prometheus gauge.
fSUsed *prometheus.GaugeVec // Node filesystem used space in bytes prometheus gauge. fSUsed *TTLGaugeVec // Node filesystem used space in bytes prometheus gauge.
fSTotal *prometheus.GaugeVec // Node filesystem total space in bytes prometheus gauge. fSTotal *TTLGaugeVec // Node filesystem total space in bytes prometheus gauge.
fSAvail *prometheus.GaugeVec // Node filesystem available capacity in bytes prometheus gauge. fSAvail *TTLGaugeVec // Node filesystem available capacity in bytes prometheus gauge.
cpuInfo *prometheus.GaugeVec // Node CPU info prometheus gauge. cpuInfo *TTLGaugeVec // Node CPU info prometheus gauge.
systemInfo *prometheus.GaugeVec // Node system info prometheus gauge. systemInfo *TTLGaugeVec // Node system info prometheus gauge.
time *prometheus.GaugeVec // Node time prometheus gauge. time *TTLGaugeVec // Node time prometheus gauge.
localTime *prometheus.GaugeVec // Node localtime prometheus gauge. localTime *TTLGaugeVec // Node localtime prometheus gauge.
} }
// Create new instance of PVE cluster state collector. // Create new instance of PVE cluster state collector.
func NewPveNodeStatusCollector(apiClient *proxmox.PveApiClient) *PveNodeStatusCollector { func NewPveNodeStatusCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveNodeStatusCollector {
c := PveNodeStatusCollector{apiClient: apiClient} c := PveNodeStatusCollector{apiClient: apiClient}
c.registry = registry
// Node state. // Node state.
c.state = promauto.NewGaugeVec( c.state = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_state", Name: "pve_node_state",
Help: "Node state.", Help: "Node state.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.state)
// Node uptime. // Node uptime.
c.uptime = promauto.NewGaugeVec( c.uptime = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_uptime", Name: "pve_node_uptime",
Help: "Node uptime.", Help: "Node uptime.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.uptime)
// Node cpu count. // Node cpu count.
c.cpus = promauto.NewGaugeVec( c.cpus = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_cpu_count", Name: "pve_node_cpu_count",
Help: "Node CPU count.", Help: "Node CPU count.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.cpus)
// Node CPU usage. // Node CPU usage.
c.cpuUsage = promauto.NewGaugeVec( c.cpuUsage = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_cpu_usage", Name: "pve_node_cpu_usage",
Help: "Cluster node CPU usage %.", Help: "Cluster node CPU usage %.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.cpuUsage)
// Node memory in bytes. // Node memory in bytes.
c.memBytes = promauto.NewGaugeVec( c.memBytes = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_memory_total_bytes", Name: "pve_node_memory_total_bytes",
Help: "Node total memory in bytes.", Help: "Node total memory in bytes.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.memBytes)
// Cluster node memory used in bytes. // Cluster node memory used in bytes.
c.memBytesUsed = promauto.NewGaugeVec( c.memBytesUsed = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_memory_used_bytes", Name: "pve_node_memory_used_bytes",
Help: "Node used memory in bytes.", Help: "Node used memory in bytes.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.memBytesUsed)
// Node memory free in bytes. // Node memory free in bytes.
c.memBytesFree = promauto.NewGaugeVec( c.memBytesFree = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_memory_free_bytes", Name: "pve_node_memory_free_bytes",
Help: "Node free memory in bytes.", Help: "Node free memory in bytes.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.memBytesFree)
// Kernel samepage shared in bytes. // Kernel samepage shared in bytes.
c.ksmShared = promauto.NewGaugeVec( c.ksmShared = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_ksm_bytes", Name: "pve_node_ksm_bytes",
Help: "Node kernel samepage shares in bytes.", Help: "Node kernel samepage shares in bytes.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.ksmShared)
// Node memory cgroup mode. // Node memory cgroup mode.
c.cgroupMode = promauto.NewGaugeVec( c.cgroupMode = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_cgroup_mode", Name: "pve_node_cgroup_mode",
Help: "Node cgroup mode.", Help: "Node cgroup mode.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.cgroupMode)
// Node load 1. // Node load 1.
c.load1 = promauto.NewGaugeVec( c.load1 = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_load1", Name: "pve_node_load1",
Help: "Node CPU load 1 minute average.", Help: "Node CPU load 1 minute average.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.load1)
// Node load 5. // Node load 5.
c.load5 = promauto.NewGaugeVec( c.load5 = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_load5", Name: "pve_node_load5",
Help: "Node CPU load 5 minutes average.", Help: "Node CPU load 5 minutes average.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.load5)
// Cluster node load 15. // Cluster node load 15.
c.load15 = promauto.NewGaugeVec( c.load15 = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_load15", Name: "pve_node_load15",
Help: "Node CPU load 15 minutes average.", Help: "Node CPU load 15 minutes average.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.load15)
// Node root FS free bytes. // Node root FS free bytes.
c.fSFree = promauto.NewGaugeVec( c.fSFree = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_rootfs_free_bytes", Name: "pve_node_rootfs_free_bytes",
Help: "Node RootFS free bytes.", Help: "Node RootFS free bytes.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.fSFree)
// Node root filesystem used bytes. // Node root filesystem used bytes.
c.fSUsed = promauto.NewGaugeVec( c.fSUsed = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_rootfs_used_bytes", Name: "pve_node_rootfs_used_bytes",
Help: "Node root filesystem used bytes.", Help: "Node root filesystem used bytes.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.fSUsed)
// Node root filesystem total bytes. // Node root filesystem total bytes.
c.fSTotal = promauto.NewGaugeVec( c.fSTotal = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_rootfs_total_bytes", Name: "pve_node_rootfs_total_bytes",
Help: "Node root filesystem total bytes.", Help: "Node root filesystem total bytes.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.fSTotal)
// Node root filesystem avail bytes. // Node root filesystem avail bytes.
c.fSAvail = promauto.NewGaugeVec( c.fSAvail = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_rootfs_avail_bytes", Name: "pve_node_rootfs_avail_bytes",
Help: "Node root filesystem avail bytes.", Help: "Node root filesystem avail bytes.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.fSAvail)
// Node CPU info. // Node CPU info.
c.cpuInfo = promauto.NewGaugeVec( c.cpuInfo = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_cpuinfo", Name: "pve_node_cpuinfo",
Help: "Node CPU info.", Help: "Node CPU info.",
}, },
[]string{"cluster", "node", "flags", "cores", "model", "sockets", "cpus", "hvm"}, []string{"cluster", "node", "flags", "cores", "model", "sockets", "cpus", "hvm"},
1*time.Minute,
) )
c.registry.Register(c.cpuInfo)
// Node system info metrics. // Node system info metrics.
c.systemInfo = promauto.NewGaugeVec( c.systemInfo = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_systeminfo", Name: "pve_node_systeminfo",
Help: "Node system info.", Help: "Node system info.",
}, },
[]string{"cluster", "node", "kversion", "pveversion", "machine", "sysname", "release"}, []string{"cluster", "node", "kversion", "pveversion", "machine", "sysname", "release"},
1*time.Minute,
) )
c.registry.Register(c.systemInfo)
// Node time info. // Node time info.
c.time = promauto.NewGaugeVec( c.time = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_time", Name: "pve_node_time",
Help: "Node time.", Help: "Node time.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.time)
// Node localtime info. // Node localtime info.
c.localTime = promauto.NewGaugeVec( c.localTime = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_localtime", Name: "pve_node_localtime",
Help: "Node localtime.", Help: "Node localtime.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.localTime)
return &c return &c
} }
@@ -229,27 +271,6 @@ func (c *PveNodeStatusCollector) CollectMetrics() error {
return err return err
} }
c.state.Reset()
c.uptime.Reset()
c.cpus.Reset()
c.cpuUsage.Reset()
c.memBytes.Reset()
c.memBytesUsed.Reset()
c.memBytesFree.Reset()
c.ksmShared.Reset()
c.cgroupMode.Reset()
c.load1.Reset()
c.load5.Reset()
c.load15.Reset()
c.fSFree.Reset()
c.fSUsed.Reset()
c.fSTotal.Reset()
c.fSAvail.Reset()
c.cpuInfo.Reset()
c.systemInfo.Reset()
c.time.Reset()
c.localTime.Reset()
for _, node := range cluster.NodeStatuses { for _, node := range cluster.NodeStatuses {
labels := prometheus.Labels{ labels := prometheus.Labels{
"cluster": cluster.GetClusterName(), "cluster": cluster.GetClusterName(),

View File

@@ -2,61 +2,71 @@ package metrics
import ( import (
"strconv" "strconv"
"time"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"lostak.dev/pve-exporter/proxmox" "lostak.dev/pve-exporter/proxmox"
) )
// PVE Storage state collector. // PVE Storage state collector.
type PveStorageCollector struct { type PveStorageCollector struct {
apiClient *proxmox.PveApiClient // PVE API client instance. apiClient *proxmox.PveApiClient // PVE API client instance.
registry *TTLRegistry // TTL metrics registry.
state *prometheus.GaugeVec // Storage state prometheus gauge. state *TTLGaugeVec // Storage state prometheus gauge.
total *prometheus.GaugeVec // Storage total bytes prometheus gauge. total *TTLGaugeVec // Storage total bytes prometheus gauge.
avail *prometheus.GaugeVec // Storage available bytes prometheus gauge. avail *TTLGaugeVec // Storage available bytes prometheus gauge.
used *prometheus.GaugeVec // Storage used bytes prometheus gauge. used *TTLGaugeVec // Storage used bytes prometheus gauge.
} }
// Create new instance of PVE storage collector. // Create new instance of PVE storage collector.
func NewPveStorageCollector(apiClient *proxmox.PveApiClient) *PveStorageCollector { func NewPveStorageCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveStorageCollector {
c := PveStorageCollector{apiClient: apiClient} c := PveStorageCollector{apiClient: apiClient}
c.registry = registry
// Storage state. // Storage state.
c.state = promauto.NewGaugeVec( c.state = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_storage_up", Name: "pve_storage_up",
Help: "Node storage UP state.", Help: "Node storage UP state.",
}, },
[]string{"cluster", "node", "storage", "type", "content", "shared"}, []string{"cluster", "node", "storage", "type", "content", "shared"},
1*time.Minute,
) )
c.registry.Register(c.state)
// Storage total bytes. // Storage total bytes.
c.total = promauto.NewGaugeVec( c.total = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_storage_total_bytes", Name: "pve_storage_total_bytes",
Help: "Node storage total capacity in bytes.", Help: "Node storage total capacity in bytes.",
}, },
[]string{"cluster", "node", "storage", "type", "content", "shared"}, []string{"cluster", "node", "storage", "type", "content", "shared"},
1*time.Minute,
) )
c.registry.Register(c.total)
// Storage available bytes. // Storage available bytes.
c.avail = promauto.NewGaugeVec( c.avail = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_storage_avail_bytes", Name: "pve_storage_avail_bytes",
Help: "Node storage available capacity in bytes.", Help: "Node storage available capacity in bytes.",
}, },
[]string{"cluster", "node", "storage", "type", "content", "shared"}, []string{"cluster", "node", "storage", "type", "content", "shared"},
1*time.Minute,
) )
c.registry.Register(c.avail)
// Storage used bytes. // Storage used bytes.
c.used = promauto.NewGaugeVec( c.used = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_storage_used_bytes", Name: "pve_storage_used_bytes",
Help: "Node storage used capacity in bytes.", Help: "Node storage used capacity in bytes.",
}, },
[]string{"cluster", "node", "storage", "type", "content", "shared"}, []string{"cluster", "node", "storage", "type", "content", "shared"},
1*time.Minute,
) )
c.registry.Register(c.used)
return &c return &c
} }
@@ -68,11 +78,6 @@ func (c *PveStorageCollector) CollectMetrics() error {
return err return err
} }
c.state.Reset()
c.total.Reset()
c.avail.Reset()
c.used.Reset()
for _, node := range cluster.NodeStatuses { for _, node := range cluster.NodeStatuses {
storages, err := c.apiClient.GetNodeStorages(node.Name) storages, err := c.apiClient.GetNodeStorages(node.Name)
if err != nil { if err != nil {

View File

@@ -4,69 +4,80 @@ import (
"time" "time"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"lostak.dev/pve-exporter/proxmox" "lostak.dev/pve-exporter/proxmox"
) )
// PVE subscription state collector. // PVE subscription state collector.
type PveSubscriptionCollector struct { type PveSubscriptionCollector struct {
apiClient *proxmox.PveApiClient // PVE API client instance. apiClient *proxmox.PveApiClient // PVE API client instance.
registry *TTLRegistry // TTL metrics registry.
info *prometheus.GaugeVec // Node subscription info prometheus gauge. info *TTLGaugeVec // Node subscription info prometheus gauge.
status *prometheus.GaugeVec // Node subscription status prometheus gauge. status *TTLGaugeVec // Node subscription status prometheus gauge.
nextDueDate *prometheus.GaugeVec // Node subscription next due date prometheus gauge. nextDueDate *TTLGaugeVec // Node subscription next due date prometheus gauge.
regDate *prometheus.GaugeVec // Node subscription registration date prometheus gauge. regDate *TTLGaugeVec // Node subscription registration date prometheus gauge.
sockets *prometheus.GaugeVec // Node subscription sockets count prometheus gauge. sockets *TTLGaugeVec // Node subscription sockets count prometheus gauge.
} }
// Create new instance of PVE subscription state collector. // Create new instance of PVE subscription state collector.
func NewPveSubscriptionCollector(apiClient *proxmox.PveApiClient) *PveSubscriptionCollector { func NewPveSubscriptionCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveSubscriptionCollector {
c := PveSubscriptionCollector{apiClient: apiClient} c := PveSubscriptionCollector{apiClient: apiClient}
c.registry = registry
// Node subscription info. // Node subscription info.
c.info = promauto.NewGaugeVec( c.info = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_subscription_info", Name: "pve_node_subscription_info",
Help: "Node subscription info.", Help: "Node subscription info.",
}, },
[]string{"cluster", "node", "productname", "serverid"}, []string{"cluster", "node", "productname", "serverid"},
1*time.Minute,
) )
c.registry.Register(c.info)
// Node subscription status. // Node subscription status.
c.status = promauto.NewGaugeVec( c.status = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_subscription_status", Name: "pve_node_subscription_status",
Help: "Node subscription status.", Help: "Node subscription status.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.status)
// Node subscription registration date. // Node subscription registration date.
c.regDate = promauto.NewGaugeVec( c.regDate = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_subscription_regdate", Name: "pve_node_subscription_regdate",
Help: "Node subscription registration date.", Help: "Node subscription registration date.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.regDate)
// Node subscription next due date. // Node subscription next due date.
c.nextDueDate = promauto.NewGaugeVec( c.nextDueDate = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_subscription_nextduedate", Name: "pve_node_subscription_nextduedate",
Help: "Node subscription next due date.", Help: "Node subscription next due date.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.nextDueDate)
// Node subscription count of sockets. // Node subscription count of sockets.
c.sockets = promauto.NewGaugeVec( c.sockets = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_node_subscription_sockets", Name: "pve_node_subscription_sockets",
Help: "Node subscription count of sockets.", Help: "Node subscription count of sockets.",
}, },
[]string{"cluster", "node"}, []string{"cluster", "node"},
1*time.Minute,
) )
c.registry.Register(c.sockets)
return &c return &c
} }
@@ -78,12 +89,6 @@ func (c *PveSubscriptionCollector) CollectMetrics() error {
return err return err
} }
c.info.Reset()
c.status.Reset()
c.nextDueDate.Reset()
c.regDate.Reset()
c.sockets.Reset()
for _, node := range cluster.NodeStatuses { for _, node := range cluster.NodeStatuses {
labels := prometheus.Labels{ labels := prometheus.Labels{
"cluster": cluster.GetClusterName(), "cluster": cluster.GetClusterName(),

View File

@@ -2,221 +2,261 @@ package metrics
import ( import (
"strconv" "strconv"
"time"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"lostak.dev/pve-exporter/proxmox" "lostak.dev/pve-exporter/proxmox"
) )
// PVE virtual machine collector. // PVE virtual machine collector.
type PveVirtualMachineCollector struct { type PveVirtualMachineCollector struct {
apiClient *proxmox.PveApiClient // PVE API client instance. apiClient *proxmox.PveApiClient // PVE API client instance.
registry *TTLRegistry // TTL metrics registry.
state *prometheus.GaugeVec // Virtual machine state prometheus gauge. state *TTLGaugeVec // Virtual machine state prometheus gauge.
uptime *prometheus.GaugeVec // Virtual machine uptime prometheus gauge. uptime *TTLGaugeVec // Virtual machine uptime prometheus gauge.
cpu *prometheus.GaugeVec // Virtual machine count of CPUs prometheus gauge. cpu *TTLGaugeVec // Virtual machine count of CPUs prometheus gauge.
cpuUsage *prometheus.GaugeVec // Virtual machine CPU usage % prometheus gauge. cpuUsage *TTLGaugeVec // Virtual machine CPU usage % prometheus gauge.
memBytes *prometheus.GaugeVec // Virtual machine memory in bytes prometheus gauge. memBytes *TTLGaugeVec // Virtual machine memory in bytes prometheus gauge.
memBytesUsed *prometheus.GaugeVec // Virtual machine memory usage in bytes prometheus gauge. memBytesUsed *TTLGaugeVec // Virtual machine memory usage in bytes prometheus gauge.
disk *prometheus.GaugeVec // Virtual machine disk space usage in bytes prometheus gauge. disk *TTLGaugeVec // Virtual machine disk space usage in bytes prometheus gauge.
diskMax *prometheus.GaugeVec // Virtual machine disk size in bytes prometheus gauge. diskMax *TTLGaugeVec // Virtual machine disk size in bytes prometheus gauge.
swap *prometheus.GaugeVec // Virtual machine swap usage in bytes prometheus gauge. swap *TTLGaugeVec // Virtual machine swap usage in bytes prometheus gauge.
netReceive *prometheus.GaugeVec // Virtual machine network receive in bytes prometheus gauge. netReceive *TTLGaugeVec // Virtual machine network receive in bytes prometheus gauge.
netTransmit *prometheus.GaugeVec // Virtual machine network transmit in bytes prometheus gauge. netTransmit *TTLGaugeVec // Virtual machine network transmit in bytes prometheus gauge.
diskReadOps *prometheus.GaugeVec // Virtual machine disk read ops prometheus gauge. diskReadOps *TTLGaugeVec // Virtual machine disk read ops prometheus gauge.
diskWriteOps *prometheus.GaugeVec // Virtual machine disk write ops prometheus gauge. diskWriteOps *TTLGaugeVec // Virtual machine disk write ops prometheus gauge.
diskReadBytes *prometheus.GaugeVec // Virtual machine disk read bytes prometheus gauge. diskReadBytes *TTLGaugeVec // Virtual machine disk read bytes prometheus gauge.
diskWriteBytes *prometheus.GaugeVec // Virtual machine disk write bytes prometheus gauge. diskWriteBytes *TTLGaugeVec // Virtual machine disk write bytes prometheus gauge.
diskReadTimeNs *prometheus.GaugeVec // Virtual machine disk read time total prometheus gauge. diskReadTimeNs *TTLGaugeVec // Virtual machine disk read time total prometheus gauge.
diskWriteTimeNs *prometheus.GaugeVec // Virtual machine disk write time total prometheus gauge. diskWriteTimeNs *TTLGaugeVec // Virtual machine disk write time total prometheus gauge.
diskFailedReadOps *prometheus.GaugeVec // Virtual machine disk failed read ops prometheus gauge. diskFailedReadOps *TTLGaugeVec // Virtual machine disk failed read ops prometheus gauge.
diskFailedWriteOps *prometheus.GaugeVec // Virtual machine disk failed write ops prometheus gauge. diskFailedWriteOps *TTLGaugeVec // Virtual machine disk failed write ops prometheus gauge.
agent *prometheus.GaugeVec // Virtual machine agent enabled prometheus gauge. agent *TTLGaugeVec // Virtual machine agent enabled prometheus gauge.
} }
// Create new instance of PVE virtual machine collector. // Create new instance of PVE virtual machine collector.
func NewPveVirtualMachineCollector(apiClient *proxmox.PveApiClient) *PveVirtualMachineCollector { func NewPveVirtualMachineCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveVirtualMachineCollector {
c := PveVirtualMachineCollector{apiClient: apiClient} c := PveVirtualMachineCollector{apiClient: apiClient}
c.registry = registry
// Virtual machine state. // Virtual machine state.
c.state = promauto.NewGaugeVec( c.state = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_state", Name: "pve_vm_state",
Help: "Virtual machine state.", Help: "Virtual machine state.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.state)
// Virtual machine uptime. // Virtual machine uptime.
c.uptime = promauto.NewGaugeVec( c.uptime = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_uptime", Name: "pve_vm_uptime",
Help: "Virtual machine uptime.", Help: "Virtual machine uptime.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.uptime)
// Virtual machine agent state. // Virtual machine agent state.
c.agent = promauto.NewGaugeVec( c.agent = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_agent", Name: "pve_vm_agent",
Help: "Virtual machine agent state.", Help: "Virtual machine agent state.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.agent)
// Virtual machine CPU count. // Virtual machine CPU count.
c.cpu = promauto.NewGaugeVec( c.cpu = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_cpu_count", Name: "pve_vm_cpu_count",
Help: "Virtual machine CPU count.", Help: "Virtual machine CPU count.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.cpu)
// Virtual machine CPU usage. // Virtual machine CPU usage.
c.cpuUsage = promauto.NewGaugeVec( c.cpuUsage = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_cpu_usage", Name: "pve_vm_cpu_usage",
Help: "Virtual machine CPU usage.", Help: "Virtual machine CPU usage.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.cpuUsage)
// Virtual machine memory total. // Virtual machine memory total.
c.memBytes = promauto.NewGaugeVec( c.memBytes = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_mem_total_bytes", Name: "pve_vm_mem_total_bytes",
Help: "Virtual machine total memory in bytes.", Help: "Virtual machine total memory in bytes.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.memBytes)
// Virtual machine memory usage. // Virtual machine memory usage.
c.memBytesUsed = promauto.NewGaugeVec( c.memBytesUsed = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_mem_used_bytes", Name: "pve_vm_mem_used_bytes",
Help: "Virtual machine used memory in bytes.", Help: "Virtual machine used memory in bytes.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.memBytesUsed)
// Virtual machine disk usage. // Virtual machine disk usage.
c.disk = promauto.NewGaugeVec( c.disk = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_disk_usage_bytes", Name: "pve_vm_disk_usage_bytes",
Help: "Virtual machine disk read bytes.", Help: "Virtual machine disk read bytes.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.disk)
// Virtual machine disk size. // Virtual machine disk size.
c.diskMax = promauto.NewGaugeVec( c.diskMax = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_disk_size_bytes", Name: "pve_vm_disk_size_bytes",
Help: "Virtual machine disk size bytes.", Help: "Virtual machine disk size bytes.",
}, },
[]string{"cluster", "node", "vmid", "name"}, []string{"cluster", "node", "vmid", "name"},
1*time.Minute,
) )
c.registry.Register(c.diskMax)
// Virtual machine network receive bytes. // Virtual machine network receive bytes.
c.netReceive = promauto.NewGaugeVec( c.netReceive = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_network_in_bytes", Name: "pve_vm_network_in_bytes",
Help: "Virtual machine network receive in bytes.", Help: "Virtual machine network receive in bytes.",
}, },
[]string{"cluster", "node", "vmid", "name", "interface"}, []string{"cluster", "node", "vmid", "name", "interface"},
1*time.Minute,
) )
c.registry.Register(c.netReceive)
// Virtual machine network transmit bytes. // Virtual machine network transmit bytes.
c.netTransmit = promauto.NewGaugeVec( c.netTransmit = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_network_out_bytes", Name: "pve_vm_network_out_bytes",
Help: "Virtual machine network transmit in bytes.", Help: "Virtual machine network transmit in bytes.",
}, },
[]string{"cluster", "node", "vmid", "name", "interface"}, []string{"cluster", "node", "vmid", "name", "interface"},
1*time.Minute,
) )
c.registry.Register(c.netTransmit)
// Virtual machine disk read ops. // Virtual machine disk read ops.
c.diskReadOps = promauto.NewGaugeVec( c.diskReadOps = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_disk_rd_operations", Name: "pve_vm_disk_rd_operations",
Help: "Virtual machine disk read ops.", Help: "Virtual machine disk read ops.",
}, },
[]string{"cluster", "node", "vmid", "name", "device"}, []string{"cluster", "node", "vmid", "name", "device"},
1*time.Minute,
) )
c.registry.Register(c.diskReadOps)
// Virtual machine disk write ops. // Virtual machine disk write ops.
c.diskWriteOps = promauto.NewGaugeVec( c.diskWriteOps = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_disk_wr_operations", Name: "pve_vm_disk_wr_operations",
Help: "Virtual machine disk write ops.", Help: "Virtual machine disk write ops.",
}, },
[]string{"cluster", "node", "vmid", "name", "device"}, []string{"cluster", "node", "vmid", "name", "device"},
1*time.Minute,
) )
c.registry.Register(c.diskWriteOps)
// Virtual machine disk read bytes. // Virtual machine disk read bytes.
c.diskReadBytes = promauto.NewGaugeVec( c.diskReadBytes = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_disk_rd_bytes", Name: "pve_vm_disk_rd_bytes",
Help: "Virtual machine disk read bytes.", Help: "Virtual machine disk read bytes.",
}, },
[]string{"cluster", "node", "vmid", "name", "device"}, []string{"cluster", "node", "vmid", "name", "device"},
1*time.Minute,
) )
c.registry.Register(c.diskReadBytes)
// Virtual machine disk write bytes. // Virtual machine disk write bytes.
c.diskWriteBytes = promauto.NewGaugeVec( c.diskWriteBytes = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_disk_wr_bytes", Name: "pve_vm_disk_wr_bytes",
Help: "Virtual machine disk write bytes.", Help: "Virtual machine disk write bytes.",
}, },
[]string{"cluster", "node", "vmid", "name", "device"}, []string{"cluster", "node", "vmid", "name", "device"},
1*time.Minute,
) )
c.registry.Register(c.diskWriteBytes)
// Virtual machine failed disk read ops. // Virtual machine failed disk read ops.
c.diskFailedReadOps = promauto.NewGaugeVec( c.diskFailedReadOps = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_disk_failed_rd_ops", Name: "pve_vm_disk_failed_rd_ops",
Help: "Virtual machine failed disk read ops.", Help: "Virtual machine failed disk read ops.",
}, },
[]string{"cluster", "node", "vmid", "name", "device"}, []string{"cluster", "node", "vmid", "name", "device"},
1*time.Minute,
) )
c.registry.Register(c.diskFailedReadOps)
// Virtual machine failed disk write ops. // Virtual machine failed disk write ops.
c.diskFailedWriteOps = promauto.NewGaugeVec( c.diskFailedWriteOps = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_disk_failed_wr_ops", Name: "pve_vm_disk_failed_wr_ops",
Help: "Virtual machine failed disk write ops.", Help: "Virtual machine failed disk write ops.",
}, },
[]string{"cluster", "node", "vmid", "name", "device"}, []string{"cluster", "node", "vmid", "name", "device"},
1*time.Minute,
) )
c.registry.Register(c.diskFailedWriteOps)
// Virtual machine disk read time total nanoseconds. // Virtual machine disk read time total nanoseconds.
c.diskReadTimeNs = promauto.NewGaugeVec( c.diskReadTimeNs = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_disk_rd_time_total_ns", Name: "pve_vm_disk_rd_time_total_ns",
Help: "Virtual machine disk read time total in nanoseconds.", Help: "Virtual machine disk read time total in nanoseconds.",
}, },
[]string{"cluster", "node", "vmid", "name", "device"}, []string{"cluster", "node", "vmid", "name", "device"},
1*time.Minute,
) )
c.registry.Register(c.diskReadTimeNs)
// Virtual machine disk write time total nanoseconds. // Virtual machine disk write time total nanoseconds.
c.diskWriteTimeNs = promauto.NewGaugeVec( c.diskWriteTimeNs = NewTTLGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Name: "pve_vm_disk_wr_time_total_ns", Name: "pve_vm_disk_wr_time_total_ns",
Help: "Virtual machine disk write time total in nanoseconds.", Help: "Virtual machine disk write time total in nanoseconds.",
}, },
[]string{"cluster", "node", "vmid", "name", "device"}, []string{"cluster", "node", "vmid", "name", "device"},
1*time.Minute,
) )
c.registry.Register(c.diskWriteTimeNs)
return &c return &c
} }
@@ -228,24 +268,6 @@ func (c *PveVirtualMachineCollector) CollectMetrics() error {
return err return err
} }
c.state.Reset()
c.cpu.Reset()
c.memBytes.Reset()
c.diskMax.Reset()
c.uptime.Reset()
c.cpuUsage.Reset()
c.memBytesUsed.Reset()
c.netReceive.Reset()
c.netTransmit.Reset()
c.diskReadOps.Reset()
c.diskWriteOps.Reset()
c.diskReadBytes.Reset()
c.diskWriteBytes.Reset()
c.diskFailedReadOps.Reset()
c.diskFailedWriteOps.Reset()
c.diskReadTimeNs.Reset()
c.diskWriteTimeNs.Reset()
for _, node := range cluster.NodeStatuses { for _, node := range cluster.NodeStatuses {
qemus, err := c.apiClient.GetNodeQemuList(node.Name) qemus, err := c.apiClient.GetNodeQemuList(node.Name)
if err != nil { if err != nil {

139
metrics/pve_ttl_metrics.go Normal file
View File

@@ -0,0 +1,139 @@
package metrics
import (
"fmt"
"sort"
"strings"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
// TTLMetric is implemented by metrics that support time-to-live cleanup.
// Any value with a Cleanup method can be registered in a TTLRegistry, which
// invokes Cleanup on every registered metric.
type TTLMetric interface {
	// Cleanup removes stale metric label sets that have exceeded their TTL.
	Cleanup()
}
// TTLGaugeVec wraps a Prometheus GaugeVec and tracks the last update time
// for each set of labels. A label set that has not been requested through
// With for longer than the TTL is removed from the underlying GaugeVec the
// next time Cleanup runs; nothing is removed until Cleanup is called.
type TTLGaugeVec struct {
	gaugeVec   *prometheus.GaugeVec // Underlying Prometheus GaugeVec.
	ttl        time.Duration        // Duration after which an unused label set is considered stale.
	lastUpdate sync.Map             // Last update time (time.Time) per label set, keyed by the string produced by labelsKey.
}
// NewTTLGaugeVec builds a TTLGaugeVec from the given GaugeOpts, label names and TTL.
// The wrapped GaugeVec is created through promauto.NewGaugeVec, so it is
// registered by promauto as part of construction.
func NewTTLGaugeVec(opts prometheus.GaugeOpts, labelNames []string, ttl time.Duration) *TTLGaugeVec {
	vec := promauto.NewGaugeVec(opts, labelNames)

	wrapped := new(TTLGaugeVec)
	wrapped.gaugeVec = vec
	wrapped.ttl = ttl
	return wrapped
}
// With stamps the given label set with the current time as its last update
// and then returns the gauge for those labels from the underlying GaugeVec.
func (t *TTLGaugeVec) With(labels prometheus.Labels) prometheus.Gauge {
	// The timestamp is recorded before the gauge lookup, exactly once per call.
	t.lastUpdate.Store(labelsKey(labels), time.Now())

	gauge := t.gaugeVec.With(labels)
	return gauge
}
// Delete drops the given label set from the lastUpdate tracking map and from
// the underlying GaugeVec. The result is whatever the GaugeVec reports for
// its own deletion.
func (t *TTLGaugeVec) Delete(labels prometheus.Labels) bool {
	t.lastUpdate.Delete(labelsKey(labels))

	removed := t.gaugeVec.Delete(labels)
	return removed
}
// Cleanup walks every tracked label set and removes those whose last update
// is older than the TTL, both from the underlying GaugeVec and from the
// tracking map. Entries whose stored value is not a time.Time are left alone.
func (t *TTLGaugeVec) Cleanup() {
	// A single reference time is used for the whole pass.
	started := time.Now()

	expire := func(k, v interface{}) bool {
		stamp, isTime := v.(time.Time)
		if !isTime || started.Sub(stamp) <= t.ttl {
			// Not a timestamp, or still fresh: keep it and continue iterating.
			return true
		}

		// The label set is rebuilt from the tracking key to address the series.
		t.gaugeVec.Delete(parseLabels(k.(string)))
		t.lastUpdate.Delete(k)
		return true
	}

	t.lastUpdate.Range(expire)
}
// labelsKey creates a deterministic key from a Prometheus labels map.
// Label names are sorted and emitted as "name=value" pairs separated by
// commas. Backslashes, commas and equals signs occurring inside a name or a
// value are backslash-escaped so that parseLabels can always recover the
// exact label set (label values such as comma-separated lists would
// otherwise be split apart). Keys for labels that contain none of these
// characters are identical to the plain "name=value,..." form.
func labelsKey(labels prometheus.Labels) string {
	names := make([]string, 0, len(labels))
	for name := range labels {
		names = append(names, name)
	}
	sort.Strings(names)

	pairs := make([]string, 0, len(names))
	for _, name := range names {
		pairs = append(pairs, fmt.Sprintf("%s=%s", escapeLabelPart(name), escapeLabelPart(labels[name])))
	}
	return strings.Join(pairs, ",")
}

// escapeLabelPart returns s with every backslash, comma and equals sign
// prefixed by a backslash. Strings without those characters are returned as is.
func escapeLabelPart(s string) string {
	if !strings.ContainsAny(s, `\,=`) {
		return s
	}
	var b strings.Builder
	b.Grow(len(s) + 1)
	// Byte-wise iteration is safe for UTF-8: the escaped characters are ASCII
	// and never appear inside a multi-byte sequence.
	for i := 0; i < len(s); i++ {
		switch s[i] {
		case '\\', ',', '=':
			b.WriteByte('\\')
		}
		b.WriteByte(s[i])
	}
	return b.String()
}

// parseLabels converts a key produced by labelsKey back into a Prometheus
// labels map. It is the exact inverse of labelsKey: a backslash makes the
// following byte literal, the first unescaped '=' of a pair separates name
// from value, and an unescaped ',' ends the pair. Pairs without an unescaped
// '=' are ignored, so an empty key yields an empty map.
func parseLabels(key string) prometheus.Labels {
	labels := prometheus.Labels{}

	var (
		name    string          // Name of the pair being parsed, valid once hasName is true.
		hasName bool            // Whether the pair's separating '=' has been seen.
		cur     strings.Builder // Unescaped bytes of the part currently being read.
	)
	// flush stores the pair read so far (if it is complete) and resets the state.
	flush := func() {
		if hasName {
			labels[name] = cur.String()
		}
		cur.Reset()
		name, hasName = "", false
	}

	for i := 0; i < len(key); i++ {
		c := key[i]
		switch {
		case c == '\\' && i+1 < len(key):
			// Escaped byte: take the next byte literally.
			i++
			cur.WriteByte(key[i])
		case c == '=' && !hasName:
			name = cur.String()
			cur.Reset()
			hasName = true
		case c == ',':
			flush()
		default:
			cur.WriteByte(c)
		}
	}
	flush()
	return labels
}
// TTLRegistry holds a list of TTLMetric instances and cleans them up together.
// Cleanup runs one pass over all registered metrics; StartCleanupLoop repeats
// that pass periodically in a background goroutine.
type TTLRegistry struct {
	mu      sync.RWMutex // Guards metrics: write-locked by Register, read-locked by Cleanup.
	metrics []TTLMetric  // Registered metrics, in registration order.
}
// NewTTLRegistry returns a fresh TTLRegistry with an empty (non-nil) metric list.
func NewTTLRegistry() *TTLRegistry {
	reg := new(TTLRegistry)
	reg.metrics = []TTLMetric{}
	return reg
}
// Register appends a TTLMetric to the registry so that later Cleanup passes
// include it. The metric list is modified under the write lock.
func (r *TTLRegistry) Register(metric TTLMetric) {
	r.mu.Lock()
	r.metrics = append(r.metrics, metric)
	r.mu.Unlock()
}
// Cleanup runs one cleanup pass: it invokes Cleanup on every registered
// TTLMetric in registration order while holding the read lock.
func (r *TTLRegistry) Cleanup() {
	r.mu.RLock()
	defer r.mu.RUnlock()

	for i := range r.metrics {
		r.metrics[i].Cleanup()
	}
}
// StartCleanupLoop launches a background goroutine that calls Cleanup once
// per interval. The call returns immediately. The goroutine has no exit
// path, so it keeps running for as long as the process does. The interval is
// handed to time.NewTicker, which panics for non-positive durations.
func (r *TTLRegistry) StartCleanupLoop(interval time.Duration) {
	loop := func() {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()

		for {
			<-ticker.C
			r.Cleanup()
		}
	}

	go loop()
}