fast_ilo_exporter/main.py
2024-05-20 01:36:33 -07:00

240 lines
9.9 KiB
Python

from prometheus_client import start_http_server, Gauge, Counter
from prometheus_client.core import REGISTRY, GaugeMetricFamily
from prometheus_client.registry import Collector
from pysnmp.entity.engine import SnmpEngine
from pysnmp.hlapi import CommunityData, UdpTransportTarget, ContextData
from snmp import SnmpConfiguration, snmp_get
import scrape
from snmp_groups import BulkValues, BulkDummyValue
from targets.temp import *
from targets.fan import *
from targets.cpu import *
from targets.drive import *
from targets.memory import *
from targets.power import *
import argparse
NAMESPACE = 'ilo'
arg_parser = argparse.ArgumentParser(
'ilo_exporter',
description='A fast(er) prometheus exporter for applicable HP servers using SNMP via the ILO controller.',
)
arg_parser.add_argument('-i', '--ilo-address', help='ILO IP address to scan.', required=True)
arg_parser.add_argument('-a', '--server-address', default='0.0.0.0', help='Address to bind for hosting the metrics endpoint.')
arg_parser.add_argument('-p', '--server-port', default=6969, help='Port to bind for the metrics endpoint.')
arg_parser.add_argument('-c', '--snmp-community', default='public', help='SNMP community to read.')
arg_parser.add_argument('--snmp-port', default=161, help='SNMP port to use.')
arg_parser.add_argument('-o', '--scan-once', action='store_true', help='Only scan for SNMP variables on init, instead of on each collection (except hard drives, see --scan-drives-once). This is a small optimizaion that can be used if your sever configuration never changes.')
arg_parser.add_argument('--scan-drives-once', action='store_true', help='When combined with --scan-once, this also prevents hard drives from being rescanned on collection. This is not recommeded.')
arg_parser.add_argument('-v', '--verbose', action='store_true', help='Increases verbosity.')
arg_parser.add_argument('-q', '--quiet', action='store_true', help='Tells the exporter to stfu under normal operation unless there is an error/warning.')
args = arg_parser.parse_args()
if args.quiet and args.verbose:
print('stop it. (--quiet and --verbose do not mix)')
exit(1)
SCAN_FAIL_COUNTER = Counter('scrape_failures', 'Number of times scraping the iLO for SNMP variables has failed.', namespace=NAMESPACE, subsystem='exporter')
def noisy(*a, **kwa):
if not args.quiet:
print(*a, **kwa)
def verbose(*a, **kwa):
if args.verbose:
print(*a, **kwa)
class BulkCollector(Collector):
def __init__(self, snmp_config: SnmpConfiguration, index_oid_template: str, target_name: str, scan_on_collect: bool, *metrics_groups: tuple[str, BulkValues, list[BulkEnums]], scan_method: any = scrape.detect_things):
self._snmp_config = snmp_config
self._metrics_groups = metrics_groups
self._target_name = target_name
self._name_template = '%s_%s_' % (NAMESPACE, target_name) + '%s'
self._ids = []
self._index_oid_template = index_oid_template
self._scan_on_collect = scan_on_collect
self._scan_method = scan_method
if not scan_on_collect:
self.scan()
def scan(self):
verbose('scanning target', self._target_name)
self._ids = self._scan_method(self._snmp_config, self._index_oid_template)
noisy('found', len(self._ids), 'items for target', self._target_name)
def collect(self):
cache = {}
try:
if self._scan_on_collect:
self.scan()
for documentation, bulk_values, bulk_labels in self._metrics_groups:
metric_name = self._name_template % bulk_values.name
verbose('collecting', metric_name)
label_names = ['id']
label_maps = []
for label in bulk_labels:
# the labels are cached since they may be reused
if label.name not in cache:
cache[label.name] = label.get_values(self._snmp_config, self._ids)
label_names.append(label.name)
label_maps.append(cache[label.name])
metric = GaugeMetricFamily(
metric_name,
documentation,
labels=label_names
)
# values are not reused
value_map = bulk_values.get_values(self._snmp_config, self._ids)
# do some fuckery (bad design, I know.)
for i in self._ids:
labels = [str(i)] # id is first
for label_map in label_maps:
label_value = label_map[i]
labels.append(str(label_value))
value = value_map[i]
metric.add_metric(labels, value)
yield metric
except Exception as e:
print('Failed to scan SNMP, aborting collection')
SCAN_FAIL_COUNTER.inc()
raise e
class PowerCollector(Collector):
def collect(self) -> float:
verbose('collecting ilo_server_power_draw')
try:
reading = snmp_get(config, POWER_METER_READING)
support = snmp_get(config, POWER_METER_SUPPORT)
status = snmp_get(config, POWER_METER_STATUS)
if not isinstance(reading, int):
print('expected power meter reading to be an int, got', type(reading))
print('value in question:', reading)
reading = -1
if not isinstance(support, int):
print('expected power meter support to be an int, got', type(support))
print('value in question:', support)
support = 1
if not isinstance(status, int):
print('expected power meter status to be an int, got', type(status))
print('value in question:', status)
status = 1
if support not in POWER_METER_SUPPORT_MAP:
print('ILO returned a value outside of the expected range for POWER_METER_SUPPORT:', support)
support_s = 'unknown'
else:
support_s = POWER_METER_SUPPORT_MAP[support]
if status not in POWER_METER_STATUS_MAP:
print('ILO returned a value outside of the expected range for POWER_METER_STATUS:', status)
status_s = 'unknown'
else:
status_s = POWER_METER_STATUS_MAP[status]
metric = GaugeMetricFamily('ilo_server_power_draw', 'Power draw of the server in watts', labels=['support', 'status'])
metric.add_metric([support_s, status_s], reading)
yield metric
except Exception as e:
print('Failed to scan SNMP, aborting collection')
SCAN_FAIL_COUNTER.inc()
raise e
if __name__ == '__main__':
config = SnmpConfiguration(
SnmpEngine(),
CommunityData(args.snmp_community),
UdpTransportTarget((args.ilo_address, args.snmp_port)),
ContextData(),
)
REGISTRY.register(PowerCollector())
no_value = BulkDummyValue('info')
REGISTRY.register(BulkCollector(
config,
TEMP_INDEX,
'temperature',
not args.scan_once,
('Temperatures readings of each temperature sensor in celsius', TEMP_CELSIUS, [TEMP_SENSOR_LOCALE, TEMP_CONDITION]),
('Temperature thresholds for each temperature sensor in celsius', TEMP_THRESHOLD, [TEMP_SENSOR_LOCALE, TEMP_THRESHOLD_TYPE]),
))
REGISTRY.register(BulkCollector(
config,
FAN_INDEX,
'fan',
not args.scan_once,
('Information about system fans', no_value, [FAN_LOCALE, FAN_CONDITION, FAN_SPEED, FAN_PRESENT, FAN_PRESENCE_TEST]),
))
REGISTRY.register(BulkCollector(
config,
CPU_INDEX,
'cpu',
not args.scan_once,
('Information about CPUs', no_value, [CPU_NAME, CPU_STATUS, CPU_POWER_STATUS]),
('Speed of CPUs in megahertz', CPU_SPEED, [CPU_NAME]),
('CPU step', CPU_STEP, [CPU_NAME]), # I dunno
('Number of enabled cores', CORES_ENABLED, [CPU_NAME]),
('Number of available threads', THREADS_AVAILABLE, [CPU_NAME]),
))
# logical drives are for v2 if it ever exists (I don't use logical drives, sorry)
REGISTRY.register(BulkCollector(
config,
DRIVE_INDEX,
'drive',
not args.scan_drives_once,
('Information about installed drives', no_value, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL, DRIVE_FIRMWARE, DRIVE_LINK_RATE, DRIVE_SUPPORTS_PREDICTIVE_FAILURE_MONITORING, DRIVE_SMART_STATUS, DRIVE_MEDIA_TYPE, DRIVE_ROTATIONAL_SPEED, DRIVE_STATUS, DRIVE_CONDITION]),
('Sizes of installed drives in megabytes', DRIVE_SIZE, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]),
('Temperatures of installed drives in celsius', DRIVE_TEMP, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]),
('Temperature thresholds of installed drives in celsius', DRIVE_TEMP_THRESHOLD, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]),
('Maximum temperatures of installed drives in celsius', DRIVE_TEMP_MAX, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]),
('Reference time of installed drives in hours', DRIVE_REFERENCE_TIME, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]),
scan_method=scrape.detect_complex,
))
REGISTRY.register(BulkCollector(
config,
MEMORY_INDEX,
'memory',
not args.scan_once,
('Information about system memory', no_value, [MEMORY_LOCATION, MEMORY_MANUFACTURER, MEMORY_PART_NUMBER, MEMORY_STATUS, MEMORY_CONDITION]),
('Sizes of system memory modules in kilobytes', MEMORY_SIZE, [MEMORY_LOCATION]),
))
# start metrics endpoint
addr = args.server_address
port = args.server_port
print('starting metrics server on http://%s:%s' % (addr, port))
server, thread = start_http_server(port, addr)
print('ready!')
thread.join()
print('thread died!')