add https support for fetching a few additional metrics not available over SNMP, and do a bit of cleaning up

This commit is contained in:
Benjamin Wiegand 2024-05-21 18:04:47 -07:00
parent 17b94b1419
commit c4dc960fc3
8 changed files with 272 additions and 20 deletions

21
https.py Normal file
View File

@ -0,0 +1,21 @@
import requests as r
import json
class HttpsConfiguration(object):
def __init__(self, host: str, username: str, password: str, ssl_verify: str | bool, timeout: int):
self.host = host
self.username = username
self.password = password
self.ssl_verify = ssl_verify
self.timeout = timeout
def get_json_response(c: HttpsConfiguration, endpoint: str):
    """Fetch ``https://<host>/<endpoint>`` and return the decoded JSON body.

    :param c: connection settings (host, credentials, verification, timeout)
    :param endpoint: path appended to the host, without a leading slash
    :return: whatever the JSON body decodes to
    :raises requests.exceptions.RequestException: on connection problems,
        timeouts, or a 4xx/5xx status
    :raises ValueError: if the body is not valid JSON
    """
    response = r.get(
        'https://%s/%s' % (c.host, endpoint),
        auth=(c.username, c.password),
        verify=c.ssl_verify,
        # (connect timeout, read timeout)
        timeout=(c.timeout, c.timeout),
    )

    # Without this an error response (bad credentials, server error, ...) is
    # parsed as if it were data, and only shows up later as a confusing
    # "unexpected response" or a JSON decode error.
    response.raise_for_status()

    return response.json()

147
main.py
View File

@ -5,10 +5,11 @@ from prometheus_client.registry import Collector
from pysnmp.entity.engine import SnmpEngine
from pysnmp.hlapi import CommunityData, UdpTransportTarget, ContextData
from https import HttpsConfiguration
from snmp import SnmpConfiguration, snmp_get
import scrape
from snmp_groups import BulkValues, BulkDummyValue
from snmp_groups import BulkValues, BulkDummyValue, BulkPredeterminedValues
from targets.temp import *
from targets.fan import *
from targets.cpu import *
@ -17,6 +18,8 @@ from targets.memory import *
from targets.power import *
import argparse
import os
import traceback
NAMESPACE = 'ilo'
@ -32,15 +35,17 @@ arg_parser.add_argument('-c', '--snmp-community', default='public', help='SNMP c
arg_parser.add_argument('--snmp-port', default=161, help='SNMP port to use.')
arg_parser.add_argument('-o', '--scan-once', action='store_true', help='Only scan for SNMP variables on init, instead of on each collection (except hard drives, see --scan-drives-once). This is a small optimizaion that can be used if your sever configuration never changes.')
arg_parser.add_argument('--scan-drives-once', action='store_true', help='When combined with --scan-once, this also prevents hard drives from being rescanned on collection. This is not recommeded.')
arg_parser.add_argument('-v', '--verbose', action='store_true', help='Increases verbosity.')
arg_parser.add_argument('-q', '--quiet', action='store_true', help='Tells the exporter to stfu under normal operation unless there is an error/warning.')
arg_parser.add_argument('-v', '--verbose', action='store_true', help='Increases verbosity. Incompatible with --quiet')
arg_parser.add_argument('-q', '--quiet', action='store_true', help='Tells the exporter to stfu under normal operation unless there is an error/warning. Incompatible with --verbose')
args = arg_parser.parse_args()
if args.quiet and args.verbose:
print('stop it. (--quiet and --verbose do not mix)')
exit(1)
arg_parser.add_argument('--https-temperature', action='store_true', help='Attempt to fetch and combine additional temperature sensor info over https, such as sensor names. Requires ILO_USERNAME and ILO_PASSWORD environment variables.')
arg_parser.add_argument('--https-fans', action='store_true', help='Attempt to fetch the fan speed of each fan in percent over https. Requires ILO_USERNAME and ILO_PASSWORD environment variables.')
arg_parser.add_argument('--https-verify', action='store_true', help='Enable SSL verification with ILO for https requests. You can optionally specify a specific certificate to use with the ILO_CERTIFICATE environment variable.')
arg_parser.add_argument('--https-timeout', default=5, help='Set the timeout for getting metrics over https. This sets both the connect timeout and the response timeout, meaning the actual maximum amount of allowed time is double this value, while the minimum amount of time is equal to it.')
SCAN_FAIL_COUNTER = Counter('scrape_failures', 'Number of times scraping the iLO for SNMP variables has failed.', namespace=NAMESPACE, subsystem='exporter')
SCAN_FAIL_COUNTER = Counter('scrape_failures', 'Number of times scraping the ILO for SNMP variables has failed.', namespace=NAMESPACE, subsystem='exporter')
HTTPS_FAIL_COUNTER = None
def noisy(*a, **kwa):
@ -102,14 +107,17 @@ class BulkCollector(Collector):
# values are not reused
value_map = bulk_values.get_values(self._snmp_config, self._ids)
# do some fuckery (bad design, I know.)
# map everything
for i in self._ids:
labels = [str(i)] # id is first
for label_map in label_maps:
label_value = label_map[i]
labels.append(str(label_value))
value = value_map[i]
value = value_map.get(i)
if value is None:
print('missing value! metric:', metric_name, 'id:', i)
value = 'nan'
metric.add_metric(labels, value)
yield metric
@ -161,8 +169,64 @@ class PowerCollector(Collector):
raise e
class FanSpeedCollector(Collector):
    """Collector for the ``ilo_fan_speed`` gauge, fetched from the ILO over https."""

    def __init__(self, https_config: HttpsConfiguration):
        """
        :param https_config: connection settings handed to ``scrape.get_fan_speeds``
        """
        self._https_config = https_config

    def collect(self):
        """Yield one gauge family holding a sample per fan (labels: id, units).

        This is a generator. Any exception raised while fetching or building
        the metric is printed and counted, and nothing is yielded for that
        collection.
        """
        verbose('collecting ilo_fan_speed')

        try:
            metric = GaugeMetricFamily('ilo_fan_speed', 'Detailed fan speed as returned from the ILO over https', labels=['id', 'units'])

            fan_speeds = scrape.get_fan_speeds(self._https_config)
            for fan, (speed, units) in fan_speeds.items():
                metric.add_metric([str(fan), units], speed)

            yield metric
        except Exception as e:
            print('Failed to fetch fan speed')
            traceback.print_exception(e)
            # This is an https failure, so count it on the https counter (the
            # scan counter is documented as SNMP failures). The https counter
            # is only created when an https option is enabled, so fall back to
            # the scan counter if it does not exist.
            failure_counter = HTTPS_FAIL_COUNTER if HTTPS_FAIL_COUNTER is not None else SCAN_FAIL_COUNTER
            failure_counter.inc()
if __name__ == '__main__':
args = arg_parser.parse_args()
# validate args
if args.quiet and args.verbose:
print('--quiet and --verbose do not mix')
exit(1)
using_https = args.https_temperature or args.https_fans
if using_https:
https_user = os.getenv('ILO_USERNAME')
https_pass = os.getenv('ILO_PASSWORD')
if https_user is None or https_pass is None:
print('Fetching values over https requires setting the ILO_USERNAME and ILO_PASSWORD environment variables.')
exit(1)
if args.https_verify:
ssl_cert = os.getenv('ILO_CERTIFICATE')
if ssl_cert is not None:
ssl_verify = ssl_cert
else:
ssl_verify = True # use system certificates
else:
ssl_verify = False
# disable insecure request warning if not verifying requests. Instead, give a single warning at init
from urllib3 import disable_warnings
from urllib3.exceptions import InsecureRequestWarning
disable_warnings(InsecureRequestWarning)
print('Warning! Not verifying SSL certificate for https requests to the ILO.')
else:
https_user = None
https_pass = None
ssl_verify = None
# init everything
config = SnmpConfiguration(
SnmpEngine(),
CommunityData(args.snmp_community),
@ -170,18 +234,75 @@ if __name__ == '__main__':
ContextData(),
)
if using_https:
https_config = HttpsConfiguration(
args.ilo_address,
https_user,
https_pass,
ssl_verify,
args.https_timeout
)
HTTPS_FAIL_COUNTER = Counter('https_failures', 'Number of times scraping the ILO over HTTPS has failed.', namespace=NAMESPACE, subsystem='exporter')
else:
https_config = None
REGISTRY.register(PowerCollector())
no_value = BulkDummyValue('info')
https_temp_labels = []
https_temp_groups = []
temp_scan_method = scrape.detect_things
if args.https_temperature:
temp_label = BulkPredeterminedValues('label')
temp_x_pos = BulkPredeterminedValues('x_pos')
temp_y_pos = BulkPredeterminedValues('y_pos')
https_temp_labels = [temp_label, temp_x_pos, temp_y_pos]
temp_caution_threshold = BulkPredeterminedValues('threshold_caution')
temp_critical_threshold = BulkPredeterminedValues('threshold_critical')
https_temp_groups = [
('Temperature caution thresholds for each temperature sensor in celsius as returned by the ILO over HTTPS', temp_caution_threshold, []),
('Temperature critical thresholds for each temperature sensor in celsius as returned by the ILO over HTTPS', temp_critical_threshold, []),
]
def scan_temperature_info(c: SnmpConfiguration, base_oid: str):
    """Scan for temperature sensors and refresh the https-derived mappings.

    Used as the ``scan_method`` of the temperature collector. The sensor ids
    come from ``scrape.detect_things``; the label, position and threshold
    mappings of the enclosing scope are rebuilt for exactly those ids.

    :param c: SNMP configuration forwarded to ``scrape.detect_things``
    :param base_oid: base OID forwarded to ``scrape.detect_things``
    :return: the detected sensor ids, unchanged
    """
    sensors = scrape.detect_things(c, base_oid)

    try:
        label_map = scrape.get_temp_sensor_info(https_config)
    except Exception as e:
        print('failed to fetch additional temperature sensor data over HTTPS')
        HTTPS_FAIL_COUNTER.inc()
        traceback.print_exception(e)
        # Treat a failed fetch like an empty response, so every sensor still
        # gets the fallback values below instead of having no entry at all.
        label_map = {}

    # clear old mappings
    temp_label.values = {}
    temp_x_pos.values = {}
    temp_y_pos.values = {}
    temp_caution_threshold.values = {}
    temp_critical_threshold.values = {}

    # get new mappings; sensors unknown to the https endpoint get placeholders
    for sensor in sensors:
        info = label_map.get(sensor, {})
        temp_label.values[sensor] = info.get('label', 'unknown')
        temp_x_pos.values[sensor] = info.get('xposition', '-1')
        temp_y_pos.values[sensor] = info.get('yposition', '-1')
        temp_caution_threshold.values[sensor] = info.get('caution', -1)
        temp_critical_threshold.values[sensor] = info.get('critical', -1)

    return sensors
temp_scan_method = scan_temperature_info
REGISTRY.register(BulkCollector(
config,
TEMP_INDEX,
'temperature',
not args.scan_once,
('Information temperature sensors', no_value, [TEMP_SENSOR_LOCALE, TEMP_CONDITION, TEMP_THRESHOLD_TYPE]),
('Information temperature sensors', no_value, [TEMP_SENSOR_LOCALE, TEMP_CONDITION, TEMP_THRESHOLD_TYPE, *https_temp_labels]),
('Temperatures readings of each temperature sensor in celsius', TEMP_CELSIUS, []),
('Temperature thresholds for each temperature sensor in celsius', TEMP_THRESHOLD, []),
*https_temp_groups,
scan_method=temp_scan_method
))
REGISTRY.register(BulkCollector(
@ -192,6 +313,10 @@ if __name__ == '__main__':
('Information about system fans', no_value, [FAN_LOCALE, FAN_CONDITION, FAN_SPEED, FAN_PRESENT, FAN_PRESENCE_TEST]),
))
# enhanced fan metrics over https
if args.https_fans:
REGISTRY.register(FanSpeedCollector(https_config))
REGISTRY.register(BulkCollector(
config,
CPU_INDEX,

View File

@ -1,2 +1,4 @@
prometheus_client~=0.20.0
pysnmp~=4.4.12
prometheus_client
pysnmp
urllib3
requests

View File

@ -1,4 +1,8 @@
from snmp import snmp_get_all, snmp_walk, SnmpConfiguration, SnmpEngine, CommunityData, UdpTransportTarget, ContextData
from snmp import snmp_walk, SnmpConfiguration, SnmpEngine, CommunityData, UdpTransportTarget, ContextData
from https import get_json_response, HttpsConfiguration
from targets.fan import FAN_ENDPOINT
from targets.temp import TEMP_ENDPOINT
import traceback
def detect_things(c: SnmpConfiguration, base_oid: str) -> list[int]:
@ -24,6 +28,83 @@ def detect_complex(c: SnmpConfiguration, base_oid: str) -> list[tuple[int]]:
return drives
# since there's only two that get fetched via https, these are just kinda hacked together
def get_fan_speeds(c: HttpsConfiguration) -> dict[int, tuple[int | float, str]]:  # {id: (speed, unit)}
    """Fetch the fan list over https and map each fan id to ``(speed, unit)``.

    Entries that are not objects, or whose id cannot be derived from their
    label, are skipped. A missing or non-numeric speed becomes ``-1`` and a
    missing or non-string unit becomes ``'unknown'``.

    :param c: https connection settings
    :return: ``{fan id: (speed, unit)}``; empty or partial if the response
        does not have the expected structure
    """
    speed_map = {}
    response = get_json_response(c, FAN_ENDPOINT)

    try:
        fans = response['fans']
        for fan in fans:
            # a malformed entry should only cost us that entry
            if not isinstance(fan, dict):
                continue

            # try to derive fan id
            label = fan.get('label')
            if not isinstance(label, str):
                continue

            fan_id = label[4:]  # label seems to follow a format of "Fan X"
            # isdecimal() rather than isnumeric(): the latter also accepts
            # characters such as superscripts that int() refuses to parse
            if not fan_id.isdecimal():
                continue

            fan_id = int(fan_id)

            # get speed
            speed = fan.get('speed')
            if not isinstance(speed, (int, float)):
                speed = -1

            # get units
            unit = fan.get('speed_unit')
            if not isinstance(unit, str):
                unit = 'unknown'

            speed_map[fan_id] = (speed, unit)

    except (KeyError, TypeError) as e:
        print('unexpected response from ILO')
        traceback.print_exception(e)
        print('response:', response)

    return speed_map
def get_temp_sensor_info(c: HttpsConfiguration) -> dict[int, dict[str, str | int | float]]:
    """Fetch temperature sensor details over https, keyed by sensor id.

    For each usable entry the result holds ``label``, ``xposition`` and
    ``yposition`` as strings (when present) and ``caution`` and ``critical``
    as numbers (when numeric). Entries that are not objects, or whose id
    cannot be derived from their label, are skipped.

    :param c: https connection settings
    :return: ``{sensor id: {name: value}}``; empty or partial if the response
        does not have the expected structure
    """
    # {id: {name: value}}
    labels = {}
    response = get_json_response(c, TEMP_ENDPOINT)

    try:
        sensors = response['temperature']
        for sensor in sensors:
            # a malformed entry should only cost us that entry
            if not isinstance(sensor, dict):
                continue

            sensor_labels = {}

            # try to derive sensor id
            label = sensor.get('label')
            if not isinstance(label, str):
                continue

            sensor_id = label[:2]  # label seems to follow a format of "XX-Some Sensor"
            # isdecimal() rather than isnumeric(): the latter also accepts
            # characters such as superscripts that int() refuses to parse
            if not sensor_id.isdecimal():
                continue

            sensor_id = int(sensor_id)

            # descriptive fields are stored as strings
            for key in ['label', 'xposition', 'yposition']:
                value = sensor.get(key)
                if value is not None:
                    sensor_labels[key] = str(value)

            # thresholds are only kept when they are actual numbers
            for key in ['caution', 'critical']:
                value = sensor.get(key)
                if isinstance(value, (int, float)):
                    sensor_labels[key] = value

            labels[sensor_id] = sensor_labels

    except (KeyError, TypeError) as e:
        print('unexpected response from ILO')
        traceback.print_exception(e)
        print('response:', response)

    return labels
if __name__ == '__main__':
from targets.fan import FAN_VALUES, FAN_INDEX
from targets.temp import TEMP_VALUES, TEMP_INDEX
@ -34,7 +115,7 @@ if __name__ == '__main__':
config = SnmpConfiguration(
SnmpEngine(),
CommunityData('deeznuts'),
CommunityData('public'),
UdpTransportTarget(('192.168.100.88', 161)),
ContextData(),
)
@ -78,3 +159,14 @@ if __name__ == '__main__':
print('memory slot', slot, value.name, 'is', states[slot])
print()
print('asdf')
conf = HttpsConfiguration(
'192.168.100.88',
'some username',
'some password',
'./ilo.pem',
5
)
print(get_fan_speeds(conf))
print(get_temp_sensor_info(conf))

View File

@ -45,10 +45,6 @@ class BulkDummyValue(BulkValues):
super().__init__(None, name)
self._name = name
@property
def name(self):
return self._name
def get_values(self, _: SnmpConfiguration, indexes: list) -> dict:
result_dict = {}
for index in indexes:
@ -57,6 +53,16 @@ class BulkDummyValue(BulkValues):
return result_dict
class BulkPredeterminedValues(BulkValues):
    """Value group whose values are set by the caller instead of being fetched.

    ``get_values`` returns the ``values`` dict as it currently is; callers
    update or replace that attribute to change what is reported.
    """

    def __init__(self, name: str, values: dict | None = None):
        """
        :param name: name of the value group
        :param values: initial mapping; a new empty dict is created when omitted
        """
        super().__init__(None, name)
        self._name = name
        # A ``{}`` default in the signature would be a single dict shared by
        # every instance created without ``values``, so create one per instance.
        self.values = {} if values is None else values

    def get_values(self, c: SnmpConfiguration, indexes: list) -> dict:
        """Return the stored mapping; ``c`` and ``indexes`` are not used."""
        return self.values
class BulkNumbers(BulkValues):
def __init__(self, oid_template, name: str):
super().__init__(oid_template, name)

View File

@ -110,7 +110,7 @@ DRIVE_REFERENCE_TIME = BulkNumbers(
DRIVE_SUPPORTS_PREDICTIVE_FAILURE_MONITORING = BulkEnums(
(lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 52) + i),
'supports_predictive_failure_monitoring',
'predictive_failure',
{
1: 'other',
2: 'notAvailable',

View File

@ -68,6 +68,9 @@ FAN_CONDITION = BulkEnums(
}
)
# https
# Path of the fan data JSON document on the ILO, relative to the host
# (requested as https://<host>/<FAN_ENDPOINT>).
FAN_ENDPOINT = 'json/health_fans'
# for debugging
FAN_VALUES = [
FAN_LOCALE,

View File

@ -55,6 +55,9 @@ TEMP_CONDITION = BulkEnums(
}
)
# https
# Path of the temperature sensor JSON document on the ILO, relative to the host
# (requested as https://<host>/<TEMP_ENDPOINT>).
TEMP_ENDPOINT = 'json/health_temperature'
# for debugging
TEMP_VALUES = [
TEMP_SENSOR_LOCALE,