17b94b1419
I only just found out this is a thing: https://grafana.com/blog/2021/08/04/how-to-use-promql-joins-for-more-effective-queries-of-prometheus-metrics-at-scale/
241 lines
9.6 KiB
Python
241 lines
9.6 KiB
Python
import argparse
from collections.abc import Callable, Iterator
from typing import Any

from prometheus_client import start_http_server, Counter
from prometheus_client.core import REGISTRY, GaugeMetricFamily
from prometheus_client.registry import Collector
from pysnmp.entity.engine import SnmpEngine
from pysnmp.hlapi import CommunityData, UdpTransportTarget, ContextData

from snmp import SnmpConfiguration, snmp_get
import scrape
from snmp_groups import BulkValues, BulkDummyValue
from targets.temp import *
from targets.fan import *
from targets.cpu import *
from targets.drive import *
from targets.memory import *
from targets.power import *
|
|
|
|
# Prefix used when building exported metric names.
NAMESPACE = 'ilo'

# Command-line interface of the exporter.
arg_parser = argparse.ArgumentParser(
    'ilo_exporter',
    description='A fast(er) prometheus exporter for applicable HP servers using SNMP via the ILO controller.',
)

arg_parser.add_argument('-i', '--ilo-address', help='ILO IP address to scan.', required=True)
arg_parser.add_argument('-a', '--server-address', default='0.0.0.0', help='Address to bind for hosting the metrics endpoint.')
# Ports are parsed as integers so a value given on the command line has the
# same type as the default; without `type=int` a user-supplied port would be a
# string and be handed to the socket layer as-is.
arg_parser.add_argument('-p', '--server-port', type=int, default=6969, help='Port to bind for the metrics endpoint.')
arg_parser.add_argument('-c', '--snmp-community', default='public', help='SNMP community to read.')
arg_parser.add_argument('--snmp-port', type=int, default=161, help='SNMP port to use.')
arg_parser.add_argument('-o', '--scan-once', action='store_true', help='Only scan for SNMP variables on init, instead of on each collection (except hard drives, see --scan-drives-once). This is a small optimizaion that can be used if your sever configuration never changes.')
arg_parser.add_argument('--scan-drives-once', action='store_true', help='When combined with --scan-once, this also prevents hard drives from being rescanned on collection. This is not recommeded.')
arg_parser.add_argument('-v', '--verbose', action='store_true', help='Increases verbosity.')
arg_parser.add_argument('-q', '--quiet', action='store_true', help='Tells the exporter to stfu under normal operation unless there is an error/warning.')

# Parsed once at import time; `noisy` and `verbose` read this object.
args = arg_parser.parse_args()
if args.quiet and args.verbose:
    print('stop it. (--quiet and --verbose do not mix)')
    # `exit` is a helper injected by the `site` module for interactive use;
    # raising SystemExit gives the same exit status without relying on it.
    raise SystemExit(1)

# Incremented by the collectors whenever a collection run raises.
SCAN_FAIL_COUNTER = Counter('scrape_failures', 'Number of times scraping the iLO for SNMP variables has failed.', namespace=NAMESPACE, subsystem='exporter')
|
|
|
|
|
|
def noisy(*a, **kwa):
    """Print like the builtin ``print`` unless ``--quiet`` was given.

    All positional and keyword arguments are forwarded to ``print`` unchanged.
    """
    if args.quiet:
        return
    print(*a, **kwa)
|
|
|
|
|
|
def verbose(*a, **kwa):
    """Print like the builtin ``print``, but only when ``--verbose`` was given.

    All positional and keyword arguments are forwarded to ``print`` unchanged.
    """
    if not args.verbose:
        return
    print(*a, **kwa)
|
|
|
|
|
|
class BulkCollector(Collector):
    """Collector that exports one gauge family per metrics group of a target.

    The ids of the items belonging to the target are obtained by calling
    ``scan_method(snmp_config, index_oid_template)``. For every metrics group a
    ``GaugeMetricFamily`` named ``<NAMESPACE>_<target_name>_<values name>`` is
    produced, holding one sample per id. Every sample carries an ``id`` label
    followed by one label per entry of the group's label list.
    """

    def __init__(
        self,
        snmp_config: SnmpConfiguration,
        index_oid_template: str,
        target_name: str,
        scan_on_collect: bool,
        *metrics_groups: tuple[str, BulkValues, list[BulkEnums]],
        scan_method: Callable[[SnmpConfiguration, str], Any] = scrape.detect_things,
    ):
        """Store the configuration and, unless rescanning is enabled, scan once.

        Args:
            snmp_config: SNMP configuration passed to every scan and lookup.
            index_oid_template: Passed verbatim to ``scan_method``.
            target_name: Middle part of every metric name of this collector.
            scan_on_collect: When true, ids are rescanned at the start of every
                ``collect`` call; when false, they are scanned once, here.
            *metrics_groups: ``(documentation, values, labels)`` tuples, one
                per exported metric family.
            scan_method: Callable invoked as
                ``scan_method(snmp_config, index_oid_template)``. Its result is
                stored as the id collection, so it must support ``len()`` and
                iteration.
        """
        self._snmp_config = snmp_config
        self._metrics_groups = metrics_groups
        self._target_name = target_name
        # Leaves a single '%s' placeholder that `collect` fills with the name
        # of each group's values object.
        self._name_template = '%s_%s_' % (NAMESPACE, target_name) + '%s'
        self._ids = []
        self._index_oid_template = index_oid_template
        self._scan_on_collect = scan_on_collect
        self._scan_method = scan_method

        if not scan_on_collect:
            self.scan()

    def scan(self):
        """Re-run the scan method and replace the stored ids with its result."""
        verbose('scanning target', self._target_name)
        self._ids = self._scan_method(self._snmp_config, self._index_oid_template)
        noisy('found', len(self._ids), 'items for target', self._target_name)

    def collect(self) -> Iterator[GaugeMetricFamily]:
        """Yield one ``GaugeMetricFamily`` per configured metrics group.

        Raises:
            Exception: Any error raised while scanning or reading values is
                re-raised after the failure has been printed and counted in
                ``SCAN_FAIL_COUNTER``.
        """
        # Label values fetched during this call, keyed by label name, so a
        # label used by several groups is only fetched once per collection.
        label_cache = {}

        try:
            if self._scan_on_collect:
                self.scan()

            for documentation, bulk_values, bulk_labels in self._metrics_groups:
                metric_name = self._name_template % bulk_values.name
                verbose('collecting', metric_name)

                label_names = ['id']
                label_maps = []

                for label in bulk_labels:
                    if label.name not in label_cache:
                        label_cache[label.name] = label.get_values(self._snmp_config, self._ids)
                    label_names.append(label.name)
                    label_maps.append(label_cache[label.name])

                metric = GaugeMetricFamily(
                    metric_name,
                    documentation,
                    labels=label_names,
                )

                # Values, unlike labels, are fetched anew for every group.
                value_map = bulk_values.get_values(self._snmp_config, self._ids)

                for item_id in self._ids:
                    # The id always comes first, matching `label_names`.
                    label_values = [str(item_id)]
                    label_values.extend(str(label_map[item_id]) for label_map in label_maps)
                    metric.add_metric(label_values, value_map[item_id])

                yield metric

        except Exception:
            print('Failed to scan SNMP, aborting collection')
            SCAN_FAIL_COUNTER.inc()
            # Bare `raise` re-raises the active exception unchanged.
            raise
|
|
|
|
|
|
class PowerCollector(Collector):
    """Collector that exports the power meter reading as a single gauge.

    The gauge carries two labels, ``support`` and ``status``, whose text is
    looked up in ``POWER_METER_SUPPORT_MAP`` and ``POWER_METER_STATUS_MAP``.
    """

    # Built from NAMESPACE so the prefix matches the other collectors' metrics.
    _METRIC_NAME = '%s_server_power_draw' % NAMESPACE

    @staticmethod
    def _int_or(value, what: str, fallback: int) -> int:
        """Return ``value`` if it is an int, else report it and return ``fallback``."""
        if isinstance(value, int):
            return value
        print('expected power meter', what, 'to be an int, got', type(value))
        print('value in question:', value)
        return fallback

    @staticmethod
    def _label_for(code: int, mapping, constant_name: str) -> str:
        """Return ``mapping[code]``, or report the code and return ``'unknown'``."""
        if code in mapping:
            return mapping[code]
        print('ILO returned a value outside of the expected range for %s:' % constant_name, code)
        return 'unknown'

    def collect(self) -> Iterator[GaugeMetricFamily]:
        """Yield the power draw gauge.

        Non-int SNMP results are reported on stdout and replaced by fallback
        values (``-1`` for the reading, ``1`` for support and status).

        Raises:
            Exception: Any error raised while reading or building the metric is
                re-raised after the failure has been printed and counted in
                ``SCAN_FAIL_COUNTER``.
        """
        verbose('collecting', self._METRIC_NAME)
        try:
            # NOTE: `config` is the module-level SNMP configuration assigned in
            # the `__main__` section of this file; it is not passed in.
            reading = snmp_get(config, POWER_METER_READING)
            support = snmp_get(config, POWER_METER_SUPPORT)
            status = snmp_get(config, POWER_METER_STATUS)

            reading = self._int_or(reading, 'reading', -1)
            support = self._int_or(support, 'support', 1)
            status = self._int_or(status, 'status', 1)

            support_s = self._label_for(support, POWER_METER_SUPPORT_MAP, 'POWER_METER_SUPPORT')
            status_s = self._label_for(status, POWER_METER_STATUS_MAP, 'POWER_METER_STATUS')

            metric = GaugeMetricFamily(self._METRIC_NAME, 'Power draw of the server in watts', labels=['support', 'status'])
            metric.add_metric([support_s, status_s], reading)
            yield metric
        except Exception:
            print('Failed to scan SNMP, aborting collection')
            SCAN_FAIL_COUNTER.inc()
            # Bare `raise` re-raises the active exception unchanged.
            raise
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: build the SNMP configuration, register every
    # collector with the default registry, then serve the metrics endpoint.

    # This name must stay `config`: PowerCollector.collect reads it as a global.
    config = SnmpConfiguration(
        SnmpEngine(),
        CommunityData(args.snmp_community),
        UdpTransportTarget((args.ilo_address, args.snmp_port)),
        ContextData(),
    )

    REGISTRY.register(PowerCollector())

    # Values object shared by every "info" metrics group below.
    no_value = BulkDummyValue('info')

    # Rescan flag used by every target except drives, which use their own.
    rescan_hardware = not args.scan_once

    def register_bulk(index_oid, target, rescan, *groups, **options):
        """Build a BulkCollector for one target and register it right away."""
        REGISTRY.register(BulkCollector(config, index_oid, target, rescan, *groups, **options))

    register_bulk(
        TEMP_INDEX,
        'temperature',
        rescan_hardware,
        ('Information temperature sensors', no_value, [TEMP_SENSOR_LOCALE, TEMP_CONDITION, TEMP_THRESHOLD_TYPE]),
        ('Temperatures readings of each temperature sensor in celsius', TEMP_CELSIUS, []),
        ('Temperature thresholds for each temperature sensor in celsius', TEMP_THRESHOLD, []),
    )

    register_bulk(
        FAN_INDEX,
        'fan',
        rescan_hardware,
        ('Information about system fans', no_value, [FAN_LOCALE, FAN_CONDITION, FAN_SPEED, FAN_PRESENT, FAN_PRESENCE_TEST]),
    )

    register_bulk(
        CPU_INDEX,
        'cpu',
        rescan_hardware,
        ('Information about CPUs', no_value, [CPU_NAME, CPU_STATUS, CPU_POWER_STATUS]),
        ('Speed of CPUs in megahertz', CPU_SPEED, []),
        ('CPU step', CPU_STEP, []),  # revision?
        ('Number of enabled cores', CORES_ENABLED, []),
        ('Number of available threads', THREADS_AVAILABLE, []),
    )

    # logical drives are for v2 if it ever exists (I don't use logical drives, sorry)

    # Drive rescanning depends only on --scan-drives-once, not on --scan-once.
    register_bulk(
        DRIVE_INDEX,
        'drive',
        not args.scan_drives_once,
        ('Information about installed drives', no_value, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL, DRIVE_FIRMWARE, DRIVE_LINK_RATE, DRIVE_SUPPORTS_PREDICTIVE_FAILURE_MONITORING, DRIVE_SMART_STATUS, DRIVE_MEDIA_TYPE, DRIVE_ROTATIONAL_SPEED, DRIVE_STATUS, DRIVE_CONDITION]),
        ('Sizes of installed drives in megabytes', DRIVE_SIZE, []),
        ('Temperatures of installed drives in celsius', DRIVE_TEMP, []),
        ('Temperature thresholds of installed drives in celsius', DRIVE_TEMP_THRESHOLD, []),
        ('Maximum temperatures of installed drives in celsius', DRIVE_TEMP_MAX, []),
        ('Reference time of installed drives in hours', DRIVE_REFERENCE_TIME, []),
        scan_method=scrape.detect_complex,
    )

    register_bulk(
        MEMORY_INDEX,
        'memory',
        rescan_hardware,
        ('Information about system memory', no_value, [MEMORY_LOCATION, MEMORY_MANUFACTURER, MEMORY_PART_NUMBER, MEMORY_STATUS, MEMORY_CONDITION]),
        ('Sizes of system memory modules in kilobytes', MEMORY_SIZE, []),
    )

    # start metrics endpoint
    bind_address = args.server_address
    bind_port = args.server_port
    print('starting metrics server on http://%s:%s' % (bind_address, bind_port))
    httpd, serve_thread = start_http_server(bind_port, bind_address)
    print('ready!')

    # Block the main thread for as long as the server thread is alive.
    serve_thread.join()
    print('thread died!')
|