fast_ilo_exporter/main.py
Benjamin Wiegand 95f7fa7c03 fix typos
2024-05-22 16:01:45 -07:00

367 lines
16 KiB
Python

from prometheus_client import start_http_server, Counter
from prometheus_client.core import REGISTRY, GaugeMetricFamily
from prometheus_client.registry import Collector
from pysnmp.entity.engine import SnmpEngine
from pysnmp.hlapi import CommunityData, UdpTransportTarget, ContextData
from https import HttpsConfiguration
from snmp import SnmpConfiguration, snmp_get
import scrape
from snmp_groups import BulkValues, BulkDummyValue, BulkPredeterminedValues
from targets.temp import *
from targets.fan import *
from targets.cpu import *
from targets.drive import *
from targets.memory import *
from targets.power import *
import argparse
import os
import traceback
# Prefix shared by every metric name this exporter publishes.
NAMESPACE = 'ilo'
# Exporter version, shown in the command line description.
VERSION = '1.0.0'

# Command line interface of the exporter.
# Numeric options declare an explicit ``type`` so that values given on the
# command line are converted to numbers, matching the type of their defaults
# (without it argparse hands back the raw string, e.g. '9100' instead of 9100).
arg_parser = argparse.ArgumentParser(
    'ilo_exporter',
    description='A fast(er) prometheus exporter for applicable HP servers using SNMP via the ILO controller. v%s' % VERSION,
)

# where to scrape from / where to serve the metrics
arg_parser.add_argument('-i', '--ilo-address', help='ILO IP address to scan.', required=True)
arg_parser.add_argument('-a', '--server-address', default='0.0.0.0', help='Address to bind for hosting the metrics endpoint.')
arg_parser.add_argument('-p', '--server-port', type=int, default=6969, help='Port to bind for the metrics endpoint.')

# SNMP options
arg_parser.add_argument('-c', '--snmp-community', default='public', help='SNMP community to read.')
arg_parser.add_argument('--snmp-port', type=int, default=161, help='SNMP port to use.')

# scan behaviour
arg_parser.add_argument('-o', '--scan-once', action='store_true', help='Only scan for SNMP variables on init, instead of on each collection (except hard drives, see --scan-drives-once). This is a small optimization that can be used if your server configuration never changes.')
arg_parser.add_argument('--scan-drives-once', action='store_true', help='When combined with --scan-once, this also prevents hard drives from being rescanned on collection. This is not recommended.')

# output verbosity (the two flags are rejected together after parsing)
arg_parser.add_argument('-v', '--verbose', action='store_true', help='Increases verbosity. Incompatible with --quiet')
arg_parser.add_argument('-q', '--quiet', action='store_true', help='Tells the exporter to stfu under normal operation unless there is an error/warning. Incompatible with --verbose')

# optional extra data fetched over https
arg_parser.add_argument('--https-temperature', action='store_true', help='Attempt to fetch and combine additional temperature sensor info over https, such as sensor names. Requires ILO_USERNAME and ILO_PASSWORD environment variables.')
arg_parser.add_argument('--https-fans', action='store_true', help='Attempt to fetch the fan speed of each fan in percent over https. Requires ILO_USERNAME and ILO_PASSWORD environment variables.')
arg_parser.add_argument('--https-verify', action='store_true', help='Enable SSL verification with ILO for https requests. You can optionally specify a specific certificate to use with the ILO_CERTIFICATE environment variable.')
arg_parser.add_argument('--https-timeout', type=float, default=5, help='Set the timeout for getting metrics over https. This sets both the connect timeout and the response timeout, meaning the actual maximum amount of allowed time is double this value, while the minimum amount of time is equal to it.')
# Counts failed SNMP scrapes; incremented by the collectors' error handlers.
SCAN_FAIL_COUNTER = Counter(
    'scrape_failures',
    'Number of times scraping the ILO for SNMP variables has failed.',
    subsystem='exporter',
    namespace=NAMESPACE,
)

# Placeholder for the HTTPS failure counter. It stays None unless the script
# section below replaces it with a real Counter when an https option is used.
HTTPS_FAIL_COUNTER = None
def noisy(*a, **kwa):
    """Forward the arguments to ``print`` unless ``--quiet`` was given.

    Reads the module-level ``args`` namespace, so it can only be called once
    the command line has been parsed.
    """
    if args.quiet:
        return
    print(*a, **kwa)
def verbose(*a, **kwa):
    """Forward the arguments to ``print`` only when ``--verbose`` was given.

    Reads the module-level ``args`` namespace, so it can only be called once
    the command line has been parsed.
    """
    if not args.verbose:
        return
    print(*a, **kwa)
class BulkCollector(Collector):
    """Collector that exports groups of per-item values for one scan target.

    The items of the target are discovered with ``scan_method``; every metrics
    group then becomes one gauge family named
    ``<NAMESPACE>_<target_name>_<values name>`` with one sample per item.
    """

    def __init__(self, snmp_config: SnmpConfiguration, index_oid_template: str, target_name: str, scan_on_collect: bool, *metrics_groups: tuple[str, BulkValues, list[BulkEnums]], scan_method: any = scrape.detect_things):
        """Set up the collector, scanning right away if scans are not done per collection.

        :param snmp_config: SNMP configuration handed to the scan method and to
            every ``get_values`` call.
        :param index_oid_template: passed to ``scan_method`` to discover item ids.
        :param target_name: name of the target; becomes part of the metric names.
        :param scan_on_collect: when true, items are rescanned on every
            ``collect()``; when false they are scanned once, here.
        :param metrics_groups: ``(documentation, values, labels)`` tuples, one
            per exported metric family.
        :param scan_method: callable invoked as
            ``scan_method(snmp_config, index_oid_template)`` returning the ids.
        """
        self._snmp_config = snmp_config
        self._metrics_groups = metrics_groups
        self._target_name = target_name
        # the trailing '%s' is filled with the values name of each group
        self._name_template = '%s_%s_' % (NAMESPACE, target_name) + '%s'
        self._ids = []
        self._index_oid_template = index_oid_template
        self._scan_on_collect = scan_on_collect
        self._scan_method = scan_method

        if not scan_on_collect:
            self.scan()

    def scan(self):
        """Rediscover the item ids of this target and remember them."""
        verbose('scanning target', self._target_name)
        self._ids = self._scan_method(self._snmp_config, self._index_oid_template)
        noisy('found', len(self._ids), 'items for target', self._target_name)

    def collect(self):
        """Yield one gauge family per metrics group.

        Any exception is reported, counted in ``SCAN_FAIL_COUNTER`` and then
        re-raised, which aborts the collection.
        """
        # label values are shared between the groups of a single collection
        label_cache = {}
        try:
            if self._scan_on_collect:
                self.scan()

            for documentation, bulk_values, bulk_labels in self._metrics_groups:
                yield self._collect_group(documentation, bulk_values, bulk_labels, label_cache)
        except Exception:
            print('Failed to scan SNMP, aborting collection')
            SCAN_FAIL_COUNTER.inc()
            raise

    def _collect_group(self, documentation, bulk_values, bulk_labels, label_cache):
        """Build the gauge family of a single metrics group.

        ``label_cache`` maps label names to already fetched label values and is
        filled in as a side effect.
        """
        metric_name = self._name_template % bulk_values.name
        verbose('collecting', metric_name)

        label_names = ['id']
        label_maps = []
        for label in bulk_labels:
            # the labels are cached since they may be reused
            if label.name not in label_cache:
                label_cache[label.name] = label.get_values(self._snmp_config, self._ids)
            label_names.append(label.name)
            label_maps.append(label_cache[label.name])

        metric = GaugeMetricFamily(
            metric_name,
            documentation,
            labels=label_names
        )

        # values are not reused
        value_map = bulk_values.get_values(self._snmp_config, self._ids)

        # map everything
        for item_id in self._ids:
            labels = [str(item_id)]  # id is first
            # label maps are subscripted, so an id without a label value raises
            # and aborts the collection (unlike a missing value, handled below)
            labels.extend(str(label_map[item_id]) for label_map in label_maps)

            value = value_map.get(item_id)
            if value is None:
                print('missing value! metric:', metric_name, 'id:', item_id)
                # export a real float NaN rather than the string 'nan'
                value = float('nan')

            metric.add_metric(labels, value)

        return metric
class PowerCollector(Collector):
    """Collector for the server power meter, read over SNMP.

    Exports a single ``ilo_server_power_draw`` gauge labelled with the power
    meter's support and status. Uses the module-level ``config`` SNMP
    configuration, so it can only collect once that has been created.
    """

    @staticmethod
    def _int_or_default(value, what, default):
        """Return ``value`` if it is an int, otherwise report it and return ``default``."""
        if isinstance(value, int):
            return value
        print('expected power meter %s to be an int, got' % what, type(value))
        print('value in question:', value)
        return default

    @staticmethod
    def _describe(code, mapping, oid_name):
        """Translate ``code`` through ``mapping``; report and return 'unknown' if absent."""
        if code in mapping:
            return mapping[code]
        print('ILO returned a value outside of the expected range for %s:' % oid_name, code)
        return 'unknown'

    def collect(self):
        """Yield the power draw gauge.

        Any exception is reported, counted in ``SCAN_FAIL_COUNTER`` and then
        re-raised, which aborts the collection.
        """
        verbose('collecting ilo_server_power_draw')
        try:
            reading = snmp_get(config, POWER_METER_READING)
            support = snmp_get(config, POWER_METER_SUPPORT)
            status = snmp_get(config, POWER_METER_STATUS)

            # unexpected types are replaced by fallbacks instead of failing
            reading = self._int_or_default(reading, 'reading', -1)
            support = self._int_or_default(support, 'support', 1)
            status = self._int_or_default(status, 'status', 1)

            support_s = self._describe(support, POWER_METER_SUPPORT_MAP, 'POWER_METER_SUPPORT')
            status_s = self._describe(status, POWER_METER_STATUS_MAP, 'POWER_METER_STATUS')

            metric = GaugeMetricFamily('ilo_server_power_draw', 'Power draw of the server in watts', labels=['support', 'status'])
            metric.add_metric([support_s, status_s], reading)
            yield metric
        except Exception:
            print('Failed to scan SNMP, aborting collection')
            SCAN_FAIL_COUNTER.inc()
            raise
class FanSpeedCollector(Collector):
    """Collector for the detailed fan speeds fetched from the ILO over https.

    Exports a single ``ilo_fan_speed`` gauge with one sample per fan, labelled
    with the fan id and the units of the reading.
    """

    def __init__(self, https_config: HttpsConfiguration):
        """
        :param https_config: https configuration passed to ``scrape.get_fan_speeds``.
        """
        self._https_config = https_config

    def collect(self):
        """Yield the fan speed gauge, or nothing if fetching the speeds failed.

        Failures are best-effort: they are printed and counted, never raised.
        """
        verbose('collecting ilo_fan_speed')
        try:
            metric = GaugeMetricFamily('ilo_fan_speed', 'Detailed fan speed as returned from the ILO over https', labels=['id', 'units'])
            fan_speeds = scrape.get_fan_speeds(self._https_config)
            for fan in fan_speeds:
                speed, units = fan_speeds[fan]
                metric.add_metric([str(fan), units], speed)
            yield metric
        except Exception as e:
            print('Failed to fetch fan speed')
            traceback.print_exception(e)
            # This is an https failure, so count it in the https counter like
            # the temperature info fetch does. Fall back to the SNMP scrape
            # counter if the https counter was never created.
            failure_counter = HTTPS_FAIL_COUNTER if HTTPS_FAIL_COUNTER is not None else SCAN_FAIL_COUNTER
            failure_counter.inc()
# Script entry point: parse the command line, register one collector per
# hardware target and serve the metrics until the server thread ends.
# Everything assigned here is a module-level name; `args`, `config` and
# `HTTPS_FAIL_COUNTER` are read by the functions and collectors defined above.
if __name__ == '__main__':
    args = arg_parser.parse_args()

    # validate args
    if args.quiet and args.verbose:
        print('--quiet and --verbose do not mix')
        # SystemExit instead of exit(): the latter is a helper injected by the
        # site module for interactive use
        raise SystemExit(1)

    using_https = args.https_temperature or args.https_fans

    if using_https:
        https_user = os.getenv('ILO_USERNAME')
        https_pass = os.getenv('ILO_PASSWORD')
        if https_user is None or https_pass is None:
            print('Fetching values over https requires setting the ILO_USERNAME and ILO_PASSWORD environment variables.')
            raise SystemExit(1)

        if args.https_verify:
            ssl_cert = os.getenv('ILO_CERTIFICATE')
            if ssl_cert is not None:
                ssl_verify = ssl_cert
            else:
                ssl_verify = True  # use system certificates
        else:
            ssl_verify = False
            # disable insecure request warning if not verifying requests. Instead, give a single warning at init
            from urllib3 import disable_warnings
            from urllib3.exceptions import InsecureRequestWarning
            disable_warnings(InsecureRequestWarning)
            print('Warning! Not verifying SSL certificate for https requests to the ILO.')
    else:
        https_user = None
        https_pass = None
        ssl_verify = None

    # Coerce numeric options defensively: argparse hands back the raw string
    # for a value given on the command line when the option declares no type.
    snmp_port = int(args.snmp_port)
    https_timeout = args.https_timeout
    if isinstance(https_timeout, str):
        https_timeout = float(https_timeout)

    # init everything
    config = SnmpConfiguration(
        SnmpEngine(),
        CommunityData(args.snmp_community),
        UdpTransportTarget((args.ilo_address, snmp_port)),
        ContextData(),
    )

    if using_https:
        https_config = HttpsConfiguration(
            args.ilo_address,
            https_user,
            https_pass,
            ssl_verify,
            https_timeout
        )
        HTTPS_FAIL_COUNTER = Counter('https_failures', 'Number of times scraping the ILO over HTTPS has failed.', namespace=NAMESPACE, subsystem='exporter')
    else:
        https_config = None

    REGISTRY.register(PowerCollector())

    # stand-in "values" for the info metrics, which only carry labels
    no_value = BulkDummyValue('info')

    https_temp_labels = []
    https_temp_groups = []
    temp_scan_method = scrape.detect_things

    if args.https_temperature:
        temp_label = BulkPredeterminedValues('label')
        temp_x_pos = BulkPredeterminedValues('x_pos')
        temp_y_pos = BulkPredeterminedValues('y_pos')
        https_temp_labels = [temp_label, temp_x_pos, temp_y_pos]

        temp_caution_threshold = BulkPredeterminedValues('threshold_caution')
        temp_critical_threshold = BulkPredeterminedValues('threshold_critical')
        https_temp_groups = [
            ('Temperature caution thresholds for each temperature sensor in celsius as returned by the ILO over HTTPS', temp_caution_threshold, []),
            ('Temperature critical thresholds for each temperature sensor in celsius as returned by the ILO over HTTPS', temp_critical_threshold, []),
        ]

        def scan_temperature_info(c: SnmpConfiguration, base_oid: str):
            """Scan the temperature sensors and refresh the https-provided sensor info.

            Used as the scan method of the temperature collector. An https
            failure is printed and counted but does not prevent the detected
            sensors from being returned.
            """
            sensors = scrape.detect_things(c, base_oid)

            try:
                # clear old mappings
                temp_label.values = {}
                temp_x_pos.values = {}
                temp_y_pos.values = {}
                temp_caution_threshold.values = {}
                temp_critical_threshold.values = {}

                # get new mappings
                label_map = scrape.get_temp_sensor_info(https_config)
                for sensor in sensors:
                    # sensors unknown to the https endpoint get placeholder values
                    labels = label_map.get(sensor, {})
                    temp_label.values[sensor] = labels.get('label', 'unknown')
                    temp_x_pos.values[sensor] = labels.get('xposition', '-1')
                    temp_y_pos.values[sensor] = labels.get('yposition', '-1')
                    temp_caution_threshold.values[sensor] = labels.get('caution', -1)
                    temp_critical_threshold.values[sensor] = labels.get('critical', -1)
            except Exception as e:
                print('failed to fetch additional temperature sensor data over HTTPS')
                HTTPS_FAIL_COUNTER.inc()
                traceback.print_exception(e)

            return sensors

        temp_scan_method = scan_temperature_info

    REGISTRY.register(BulkCollector(
        config,
        TEMP_INDEX,
        'temperature',
        not args.scan_once,
        ('Information temperature sensors', no_value, [TEMP_SENSOR_LOCALE, TEMP_CONDITION, TEMP_THRESHOLD_TYPE, *https_temp_labels]),
        ('Temperatures readings of each temperature sensor in celsius', TEMP_CELSIUS, []),
        ('Temperature thresholds for each temperature sensor in celsius', TEMP_THRESHOLD, []),
        *https_temp_groups,
        scan_method=temp_scan_method
    ))

    REGISTRY.register(BulkCollector(
        config,
        FAN_INDEX,
        'fan',
        not args.scan_once,
        ('Information about system fans', no_value, [FAN_LOCALE, FAN_CONDITION, FAN_SPEED, FAN_PRESENT, FAN_PRESENCE_TEST]),
    ))

    # enhanced fan metrics over https
    if args.https_fans:
        REGISTRY.register(FanSpeedCollector(https_config))

    REGISTRY.register(BulkCollector(
        config,
        CPU_INDEX,
        'cpu',
        not args.scan_once,
        ('Information about CPUs', no_value, [CPU_NAME, CPU_STATUS, CPU_POWER_STATUS]),
        ('Speed of CPUs in megahertz', CPU_SPEED, []),
        ('CPU step', CPU_STEP, []),  # revision?
        ('Number of enabled cores', CORES_ENABLED, []),
        ('Number of available threads', THREADS_AVAILABLE, []),
    ))

    # logical drives are for v2 if it ever exists (I don't use logical drives, sorry)
    # NOTE(review): the --scan-drives-once help text says it applies "when
    # combined with --scan-once", but the flag is evaluated on its own here -
    # confirm which of the two is intended.
    REGISTRY.register(BulkCollector(
        config,
        DRIVE_INDEX,
        'drive',
        not args.scan_drives_once,
        ('Information about installed drives', no_value, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL, DRIVE_FIRMWARE, DRIVE_LINK_RATE, DRIVE_SUPPORTS_PREDICTIVE_FAILURE_MONITORING, DRIVE_SMART_STATUS, DRIVE_MEDIA_TYPE, DRIVE_ROTATIONAL_SPEED, DRIVE_STATUS, DRIVE_CONDITION]),
        ('Sizes of installed drives in megabytes', DRIVE_SIZE, []),
        ('Temperatures of installed drives in celsius', DRIVE_TEMP, []),
        ('Temperature thresholds of installed drives in celsius', DRIVE_TEMP_THRESHOLD, []),
        ('Maximum temperatures of installed drives in celsius', DRIVE_TEMP_MAX, []),
        ('Reference time of installed drives in hours', DRIVE_REFERENCE_TIME, []),
        scan_method=scrape.detect_complex,
    ))

    REGISTRY.register(BulkCollector(
        config,
        MEMORY_INDEX,
        'memory',
        not args.scan_once,
        ('Information about system memory', no_value, [MEMORY_LOCATION, MEMORY_MANUFACTURER, MEMORY_PART_NUMBER, MEMORY_STATUS, MEMORY_CONDITION]),
        ('Sizes of system memory modules in kilobytes', MEMORY_SIZE, []),
    ))

    # start metrics endpoint
    addr = args.server_address
    # the http server needs a numeric port (see the coercion note above)
    port = int(args.server_port)
    print('starting metrics server on http://%s:%s' % (addr, port))
    server, thread = start_http_server(port, addr)
    print('ready!')
    # block on the server thread so the process keeps serving
    thread.join()
    print('thread died!')