Files
pve-exporter/metrics/pve_node_virtual_machine_collector.go
2025-03-04 12:05:05 +01:00

352 lines
10 KiB
Go

package metrics
import (
"strconv"
"time"
"github.com/prometheus/client_golang/prometheus"
"lostak.dev/pve-exporter/proxmox"
)
// PveVirtualMachineCollector collects per-virtual-machine metrics through the
// PVE API client and exposes them as TTL gauge vectors.
type PveVirtualMachineCollector struct {
	// PVE API client instance used to query cluster, node and VM data.
	apiClient *proxmox.PveApiClient
	// TTL metrics registry the gauges are registered with.
	registry *TTLRegistry

	// Virtual machine state and uptime.
	state, uptime *TTLGaugeVec

	// Virtual machine CPU count and CPU usage.
	cpu, cpuUsage *TTLGaugeVec

	// Virtual machine memory in bytes: total and used.
	memBytes, memBytesUsed *TTLGaugeVec

	// Virtual machine disk space in bytes: usage and size.
	// NOTE(review): disk is registered by the constructor but no value is
	// ever set on it in CollectMetrics.
	disk, diskMax *TTLGaugeVec

	// Virtual machine swap usage in bytes.
	// NOTE(review): the constructor in this file never creates this gauge,
	// so it stays nil; nothing in this file reads it either.
	swap *TTLGaugeVec

	// Virtual machine network traffic in bytes: received and transmitted.
	netReceive, netTransmit *TTLGaugeVec

	// Virtual machine disk read/write operation counts.
	diskReadOps, diskWriteOps *TTLGaugeVec

	// Virtual machine disk read/write byte counts.
	diskReadBytes, diskWriteBytes *TTLGaugeVec

	// Virtual machine total disk read/write time in nanoseconds.
	diskReadTimeNs, diskWriteTimeNs *TTLGaugeVec

	// Virtual machine failed disk read/write operation counts.
	diskFailedReadOps, diskFailedWriteOps *TTLGaugeVec

	// Virtual machine agent enabled flag.
	agent *TTLGaugeVec
}
// NewPveVirtualMachineCollector creates a new PVE virtual machine collector.
//
// Every gauge is created with a 5 minute TTL and registered with the given
// registry in the order it is declared below. All gauges carry the "cluster",
// "node", "vmid" and "name" labels; network gauges additionally carry
// "interface" and disk I/O gauges additionally carry "device".
//
// The returned collector keeps references to apiClient and registry.
func NewPveVirtualMachineCollector(apiClient *proxmox.PveApiClient, registry *TTLRegistry) *PveVirtualMachineCollector {
	c := &PveVirtualMachineCollector{
		apiClient: apiClient,
		registry:  registry,
	}

	// Lifetime shared by all virtual machine gauges.
	const ttl = 5 * time.Minute

	// newGauge builds a gauge vector with the common VM labels plus any extra
	// labels, registers it and returns it. A fresh label slice is built on
	// every call so no two gauges share backing storage.
	newGauge := func(name, help string, extraLabels ...string) *TTLGaugeVec {
		labelNames := append([]string{"cluster", "node", "vmid", "name"}, extraLabels...)
		gauge := NewTTLGaugeVec(
			prometheus.GaugeOpts{
				Name: name,
				Help: help,
			},
			labelNames,
			ttl,
		)
		c.registry.Register(gauge)
		return gauge
	}

	// General state.
	c.state = newGauge("pve_vm_state", "Virtual machine state.")
	c.uptime = newGauge("pve_vm_uptime", "Virtual machine uptime.")
	c.agent = newGauge("pve_vm_agent", "Virtual machine agent state.")

	// CPU.
	c.cpu = newGauge("pve_vm_cpu_count", "Virtual machine CPU count.")
	c.cpuUsage = newGauge("pve_vm_cpu_usage", "Virtual machine CPU usage.")

	// Memory.
	c.memBytes = newGauge("pve_vm_mem_total_bytes", "Virtual machine total memory in bytes.")
	c.memBytesUsed = newGauge("pve_vm_mem_used_bytes", "Virtual machine used memory in bytes.")

	// Disk space. The usage gauge previously carried the help text of the
	// disk read bytes metric; it now describes what the metric name says.
	c.disk = newGauge("pve_vm_disk_usage_bytes", "Virtual machine disk usage in bytes.")
	c.diskMax = newGauge("pve_vm_disk_size_bytes", "Virtual machine disk size bytes.")

	// Network traffic, one series per interface.
	c.netReceive = newGauge("pve_vm_network_in_bytes", "Virtual machine network receive in bytes.", "interface")
	c.netTransmit = newGauge("pve_vm_network_out_bytes", "Virtual machine network transmit in bytes.", "interface")

	// Disk I/O, one series per block device.
	c.diskReadOps = newGauge("pve_vm_disk_rd_operations", "Virtual machine disk read ops.", "device")
	c.diskWriteOps = newGauge("pve_vm_disk_wr_operations", "Virtual machine disk write ops.", "device")
	c.diskReadBytes = newGauge("pve_vm_disk_rd_bytes", "Virtual machine disk read bytes.", "device")
	c.diskWriteBytes = newGauge("pve_vm_disk_wr_bytes", "Virtual machine disk write bytes.", "device")
	c.diskFailedReadOps = newGauge("pve_vm_disk_failed_rd_ops", "Virtual machine failed disk read ops.", "device")
	c.diskFailedWriteOps = newGauge("pve_vm_disk_failed_wr_ops", "Virtual machine failed disk write ops.", "device")
	c.diskReadTimeNs = newGauge("pve_vm_disk_rd_time_total_ns", "Virtual machine disk read time total in nanoseconds.", "device")
	c.diskWriteTimeNs = newGauge("pve_vm_disk_wr_time_total_ns", "Virtual machine disk write time total in nanoseconds.", "device")

	return c
}
// CollectMetrics implements the PveMetricsCollector interface.
//
// It fetches the cluster status, then walks every node and every QEMU virtual
// machine on that node, skipping templates. State, CPU count, total memory and
// disk size are set for every remaining VM. Uptime, CPU usage, used memory,
// agent flag, per-interface network counters and per-device block statistics
// are set only for running VMs.
//
// The first error returned by the API client aborts the collection and is
// returned unchanged; values set before that point are left as they are.
func (c *PveVirtualMachineCollector) CollectMetrics() error {
	cluster, err := c.apiClient.GetClusterStatus()
	if err != nil {
		return err
	}

	for _, node := range cluster.NodeStatuses {
		vms, err := c.apiClient.GetNodeQemuList(node.Name)
		if err != nil {
			return err
		}

		for _, vm := range *vms {
			// Skip templates because they are always offline.
			if vm.Template == 1 {
				continue
			}

			// Label values shared by every series of this VM.
			clusterName := cluster.GetClusterName()
			vmid := strconv.Itoa(vm.VMID)
			base := prometheus.Labels{
				"cluster": clusterName,
				"node":    node.Name,
				"vmid":    vmid,
				"name":    vm.Name,
			}

			// extended returns a new label set: the base labels plus one more.
			extended := func(key, value string) prometheus.Labels {
				out := make(prometheus.Labels, len(base)+1)
				for k, v := range base {
					out[k] = v
				}
				out[key] = value
				return out
			}

			// Metrics available regardless of the VM state.
			c.state.With(base).Set(vm.GetStatusNumeric())
			c.cpu.With(base).Set(float64(vm.CPUs))
			c.memBytes.With(base).Set(float64(vm.MaxMem))
			c.diskMax.With(base).Set(float64(vm.MaxDisk))

			// Everything below applies to running virtual machines only.
			if !vm.IsRunning() {
				continue
			}

			c.uptime.With(base).Set(float64(vm.Uptime))
			c.cpuUsage.With(base).Set(float64(vm.CPU))
			c.memBytesUsed.With(base).Set(float64(vm.Mem))

			// The detail call supplies the agent flag, NICs and block stats.
			detail, err := c.apiClient.GetNodeQemu(node.Name, vmid)
			if err != nil {
				return err
			}

			c.agent.With(base).Set(float64(detail.Agent))

			for iface, nic := range detail.Nics {
				nicLabels := extended("interface", iface)
				c.netReceive.With(nicLabels).Set(float64(nic.NetIn))
				c.netTransmit.With(nicLabels).Set(float64(nic.NetOut))
			}

			for device, stat := range detail.BlockStat {
				devLabels := extended("device", device)
				c.diskReadOps.With(devLabels).Set(float64(stat.RdOperations))
				c.diskWriteOps.With(devLabels).Set(float64(stat.WrOperations))
				c.diskReadBytes.With(devLabels).Set(float64(stat.RdBytes))
				c.diskWriteBytes.With(devLabels).Set(float64(stat.WrBytes))
				c.diskFailedReadOps.With(devLabels).Set(float64(stat.FailedRdOperations))
				c.diskFailedWriteOps.With(devLabels).Set(float64(stat.FailedWrOperations))
				c.diskReadTimeNs.With(devLabels).Set(float64(stat.RdTotalTimeNs))
				c.diskWriteTimeNs.With(devLabels).Set(float64(stat.WrTotalTimeNs))
			}
		}
	}

	return nil
}
// GetName implements the PveMetricsCollector interface and returns the
// human-readable name of this collector.
func (c *PveVirtualMachineCollector) GetName() string {
	const collectorName = "Virtual Machine"
	return collectorName
}