Add HTTPS support for fetching a few additional metrics not available over SNMP, and do a bit of cleaning up
This commit is contained in:
parent
17b94b1419
commit
c4dc960fc3
21
https.py
Normal file
21
https.py
Normal file
@ -0,0 +1,21 @@
|
||||
import requests as r
|
||||
import json
|
||||
|
||||
|
||||
class HttpsConfiguration(object):
|
||||
def __init__(self, host: str, username: str, password: str, ssl_verify: str | bool, timeout: int):
|
||||
self.host = host
|
||||
self.username = username
|
||||
self.password = password
|
||||
self.ssl_verify = ssl_verify
|
||||
self.timeout = timeout
|
||||
|
||||
|
||||
def get_json_response(c: HttpsConfiguration, endpoint: str):
    """GET ``https://<host>/<endpoint>`` and return the body decoded as JSON.

    Args:
        c: Connection settings (host, credentials, verification, timeout).
        endpoint: Path appended after the host, without a leading slash.

    Returns:
        Whatever ``json.loads`` produces for the response text.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
        Any exception raised by ``requests.get`` is propagated unchanged.

    Note:
        The HTTP status code is not inspected; an error reply with a JSON
        body is returned like any other reply.
    """
    url = f'https://{c.host}/{endpoint}'
    credentials = (c.username, c.password)
    # the same limit is applied to both elements of the timeout pair
    timeouts = (c.timeout, c.timeout)

    reply = r.get(url, auth=credentials, verify=c.ssl_verify, timeout=timeouts)
    return json.loads(reply.text)
|
147
main.py
147
main.py
@ -5,10 +5,11 @@ from prometheus_client.registry import Collector
|
||||
from pysnmp.entity.engine import SnmpEngine
|
||||
from pysnmp.hlapi import CommunityData, UdpTransportTarget, ContextData
|
||||
|
||||
from https import HttpsConfiguration
|
||||
from snmp import SnmpConfiguration, snmp_get
|
||||
import scrape
|
||||
|
||||
from snmp_groups import BulkValues, BulkDummyValue
|
||||
from snmp_groups import BulkValues, BulkDummyValue, BulkPredeterminedValues
|
||||
from targets.temp import *
|
||||
from targets.fan import *
|
||||
from targets.cpu import *
|
||||
@ -17,6 +18,8 @@ from targets.memory import *
|
||||
from targets.power import *
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import traceback
|
||||
|
||||
NAMESPACE = 'ilo'
|
||||
|
||||
@ -32,15 +35,17 @@ arg_parser.add_argument('-c', '--snmp-community', default='public', help='SNMP c
|
||||
arg_parser.add_argument('--snmp-port', default=161, help='SNMP port to use.')
|
||||
arg_parser.add_argument('-o', '--scan-once', action='store_true', help='Only scan for SNMP variables on init, instead of on each collection (except hard drives, see --scan-drives-once). This is a small optimizaion that can be used if your sever configuration never changes.')
|
||||
arg_parser.add_argument('--scan-drives-once', action='store_true', help='When combined with --scan-once, this also prevents hard drives from being rescanned on collection. This is not recommeded.')
|
||||
arg_parser.add_argument('-v', '--verbose', action='store_true', help='Increases verbosity.')
|
||||
arg_parser.add_argument('-q', '--quiet', action='store_true', help='Tells the exporter to stfu under normal operation unless there is an error/warning.')
|
||||
arg_parser.add_argument('-v', '--verbose', action='store_true', help='Increases verbosity. Incompatible with --quiet')
|
||||
arg_parser.add_argument('-q', '--quiet', action='store_true', help='Tells the exporter to stfu under normal operation unless there is an error/warning. Incompatible with --verbose')
|
||||
|
||||
args = arg_parser.parse_args()
|
||||
if args.quiet and args.verbose:
|
||||
print('stop it. (--quiet and --verbose do not mix)')
|
||||
exit(1)
|
||||
arg_parser.add_argument('--https-temperature', action='store_true', help='Attempt to fetch and combine additional temperature sensor info over https, such as sensor names. Requires ILO_USERNAME and ILO_PASSWORD environment variables.')
|
||||
arg_parser.add_argument('--https-fans', action='store_true', help='Attempt to fetch the fan speed of each fan in percent over https. Requires ILO_USERNAME and ILO_PASSWORD environment variables.')
|
||||
arg_parser.add_argument('--https-verify', action='store_true', help='Enable SSL verification with ILO for https requests. You can optionally specify a specific certificate to use with the ILO_CERTIFICATE environment variable.')
|
||||
# argparse passes command-line values through as strings unless a type is given;
# the timeout is forwarded to requests, which needs a number, so convert it here.
arg_parser.add_argument('--https-timeout', type=float, default=5, help='Set the timeout for getting metrics over https. This sets both the connect timeout and the response timeout, meaning the actual maximum amount of allowed time is double this value, while the minimum amount of time is equal to it.')
|
||||
|
||||
SCAN_FAIL_COUNTER = Counter('scrape_failures', 'Number of times scraping the iLO for SNMP variables has failed.', namespace=NAMESPACE, subsystem='exporter')
|
||||
|
||||
SCAN_FAIL_COUNTER = Counter('scrape_failures', 'Number of times scraping the ILO for SNMP variables has failed.', namespace=NAMESPACE, subsystem='exporter')
|
||||
HTTPS_FAIL_COUNTER = None
|
||||
|
||||
|
||||
def noisy(*a, **kwa):
|
||||
@ -102,14 +107,17 @@ class BulkCollector(Collector):
|
||||
# values are not reused
|
||||
value_map = bulk_values.get_values(self._snmp_config, self._ids)
|
||||
|
||||
# do some fuckery (bad design, I know.)
|
||||
# map everything
|
||||
for i in self._ids:
|
||||
labels = [str(i)] # id is first
|
||||
for label_map in label_maps:
|
||||
label_value = label_map[i]
|
||||
labels.append(str(label_value))
|
||||
|
||||
value = value_map[i]
|
||||
value = value_map.get(i)
|
||||
if value is None:
|
||||
print('missing value! metric:', metric_name, 'id:', i)
|
||||
value = 'nan'
|
||||
metric.add_metric(labels, value)
|
||||
|
||||
yield metric
|
||||
@ -161,8 +169,64 @@ class PowerCollector(Collector):
|
||||
raise e
|
||||
|
||||
|
||||
class FanSpeedCollector(Collector):
    """Collector that exposes per-fan speeds fetched from the ILO over HTTPS."""

    def __init__(self, https_config: HttpsConfiguration):
        """Remember the HTTPS connection settings used on every collection."""
        self._https_config = https_config

    def collect(self):
        """Yield the ``ilo_fan_speed`` gauge family with one sample per fan.

        This is a generator. Each sample is labelled with the fan id and the
        unit string returned by ``scrape.get_fan_speeds``. If anything goes
        wrong while fetching or building the metric, the error is printed,
        the failure counter is incremented and nothing is yielded.
        """
        verbose('collecting ilo_fan_speed')
        try:
            metric = GaugeMetricFamily('ilo_fan_speed', 'Detailed fan speed as returned from the ILO over https', labels=['id', 'units'])
            fan_speeds = scrape.get_fan_speeds(self._https_config)
            for fan, (speed, units) in fan_speeds.items():
                metric.add_metric([str(fan), units], speed)
        except Exception as e:
            print('Failed to fetch fan speed')
            traceback.print_exception(e)
            # This is an HTTPS failure, so count it on the HTTPS counter. That
            # counter only exists once HTTPS support has been initialised, so
            # fall back to the generic scrape counter rather than lose the event.
            counter = HTTPS_FAIL_COUNTER if HTTPS_FAIL_COUNTER is not None else SCAN_FAIL_COUNTER
            counter.inc()
        else:
            yield metric
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
args = arg_parser.parse_args()
|
||||
|
||||
# validate args
|
||||
if args.quiet and args.verbose:
|
||||
print('--quiet and --verbose do not mix')
|
||||
exit(1)
|
||||
|
||||
using_https = args.https_temperature or args.https_fans
|
||||
if using_https:
|
||||
https_user = os.getenv('ILO_USERNAME')
|
||||
https_pass = os.getenv('ILO_PASSWORD')
|
||||
if https_user is None or https_pass is None:
|
||||
print('Fetching values over https requires setting the ILO_USERNAME and ILO_PASSWORD environment variables.')
|
||||
exit(1)
|
||||
|
||||
if args.https_verify:
|
||||
ssl_cert = os.getenv('ILO_CERTIFICATE')
|
||||
if ssl_cert is not None:
|
||||
ssl_verify = ssl_cert
|
||||
else:
|
||||
ssl_verify = True # use system certificates
|
||||
else:
|
||||
ssl_verify = False
|
||||
# disable insecure request warning if not verifying requests. Instead, give a single warning at init
|
||||
from urllib3 import disable_warnings
|
||||
from urllib3.exceptions import InsecureRequestWarning
|
||||
|
||||
disable_warnings(InsecureRequestWarning)
|
||||
print('Warning! Not verifying SSL certificate for https requests to the ILO.')
|
||||
else:
|
||||
https_user = None
|
||||
https_pass = None
|
||||
ssl_verify = None
|
||||
|
||||
# init everything
|
||||
config = SnmpConfiguration(
|
||||
SnmpEngine(),
|
||||
CommunityData(args.snmp_community),
|
||||
@ -170,18 +234,75 @@ if __name__ == '__main__':
|
||||
ContextData(),
|
||||
)
|
||||
|
||||
if using_https:
|
||||
https_config = HttpsConfiguration(
|
||||
args.ilo_address,
|
||||
https_user,
|
||||
https_pass,
|
||||
ssl_verify,
|
||||
args.https_timeout
|
||||
)
|
||||
HTTPS_FAIL_COUNTER = Counter('https_failures', 'Number of times scraping the ILO over HTTPS has failed.', namespace=NAMESPACE, subsystem='exporter')
|
||||
else:
|
||||
https_config = None
|
||||
|
||||
REGISTRY.register(PowerCollector())
|
||||
|
||||
no_value = BulkDummyValue('info')
|
||||
|
||||
https_temp_labels = []
|
||||
https_temp_groups = []
|
||||
temp_scan_method = scrape.detect_things
|
||||
if args.https_temperature:
|
||||
temp_label = BulkPredeterminedValues('label')
|
||||
temp_x_pos = BulkPredeterminedValues('x_pos')
|
||||
temp_y_pos = BulkPredeterminedValues('y_pos')
|
||||
https_temp_labels = [temp_label, temp_x_pos, temp_y_pos]
|
||||
|
||||
temp_caution_threshold = BulkPredeterminedValues('threshold_caution')
|
||||
temp_critical_threshold = BulkPredeterminedValues('threshold_critical')
|
||||
https_temp_groups = [
|
||||
('Temperature caution thresholds for each temperature sensor in celsius as returned by the ILO over HTTPS', temp_caution_threshold, []),
|
||||
('Temperature critical thresholds for each temperature sensor in celsius as returned by the ILO over HTTPS', temp_critical_threshold, []),
|
||||
]
|
||||
|
||||
def scan_temperature_info(c: SnmpConfiguration, base_oid: str):
    """Detect temperature sensors and refresh their HTTPS-derived labels and thresholds.

    The sensor list comes from ``scrape.detect_things``. Extra per-sensor
    information is then requested with ``scrape.get_temp_sensor_info`` and
    stored in the predetermined value groups (label, x/y position, caution
    and critical thresholds).

    Every detected sensor always receives an entry in every group. When the
    HTTPS request fails, or a sensor is absent from the reply, the defaults
    are used (``'unknown'`` / ``'-1'`` for labels, ``-1`` for thresholds), so
    later lookups by sensor id never hit a missing key.

    Args:
        c: SNMP configuration handed to ``scrape.detect_things``.
        base_oid: Base OID handed to ``scrape.detect_things``.

    Returns:
        The sensor ids returned by ``scrape.detect_things``.
    """
    sensors = scrape.detect_things(c, base_oid)

    try:
        label_map = scrape.get_temp_sensor_info(https_config)
    except Exception as e:
        print('failed to fetch additional temperature sensor data over HTTPS')
        HTTPS_FAIL_COUNTER.inc()
        traceback.print_exception(e)
        # no extra data available: fall through so every sensor gets the defaults
        label_map = {}

    # build the new mappings completely before publishing them
    names, x_positions, y_positions, cautions, criticals = {}, {}, {}, {}, {}
    for sensor in sensors:
        info = label_map.get(sensor, {})
        names[sensor] = info.get('label', 'unknown')
        x_positions[sensor] = info.get('xposition', '-1')
        y_positions[sensor] = info.get('yposition', '-1')
        cautions[sensor] = info.get('caution', -1)
        criticals[sensor] = info.get('critical', -1)

    # replace the old mappings so entries for vanished sensors are dropped
    temp_label.values = names
    temp_x_pos.values = x_positions
    temp_y_pos.values = y_positions
    temp_caution_threshold.values = cautions
    temp_critical_threshold.values = criticals

    return sensors
|
||||
|
||||
temp_scan_method = scan_temperature_info
|
||||
|
||||
REGISTRY.register(BulkCollector(
|
||||
config,
|
||||
TEMP_INDEX,
|
||||
'temperature',
|
||||
not args.scan_once,
|
||||
('Information temperature sensors', no_value, [TEMP_SENSOR_LOCALE, TEMP_CONDITION, TEMP_THRESHOLD_TYPE]),
|
||||
('Information temperature sensors', no_value, [TEMP_SENSOR_LOCALE, TEMP_CONDITION, TEMP_THRESHOLD_TYPE, *https_temp_labels]),
|
||||
('Temperatures readings of each temperature sensor in celsius', TEMP_CELSIUS, []),
|
||||
('Temperature thresholds for each temperature sensor in celsius', TEMP_THRESHOLD, []),
|
||||
*https_temp_groups,
|
||||
scan_method=temp_scan_method
|
||||
))
|
||||
|
||||
REGISTRY.register(BulkCollector(
|
||||
@ -192,6 +313,10 @@ if __name__ == '__main__':
|
||||
('Information about system fans', no_value, [FAN_LOCALE, FAN_CONDITION, FAN_SPEED, FAN_PRESENT, FAN_PRESENCE_TEST]),
|
||||
))
|
||||
|
||||
# enhanced fan metrics over https
|
||||
if args.https_fans:
|
||||
REGISTRY.register(FanSpeedCollector(https_config))
|
||||
|
||||
REGISTRY.register(BulkCollector(
|
||||
config,
|
||||
CPU_INDEX,
|
||||
|
@ -1,2 +1,4 @@
|
||||
prometheus_client~=0.20.0
|
||||
pysnmp~=4.4.12
|
||||
prometheus_client
|
||||
pysnmp
|
||||
urllib3
|
||||
requests
|
96
scrape.py
96
scrape.py
@ -1,4 +1,8 @@
|
||||
from snmp import snmp_get_all, snmp_walk, SnmpConfiguration, SnmpEngine, CommunityData, UdpTransportTarget, ContextData
|
||||
from snmp import snmp_walk, SnmpConfiguration, SnmpEngine, CommunityData, UdpTransportTarget, ContextData
|
||||
from https import get_json_response, HttpsConfiguration
|
||||
from targets.fan import FAN_ENDPOINT
|
||||
from targets.temp import TEMP_ENDPOINT
|
||||
import traceback
|
||||
|
||||
|
||||
def detect_things(c: SnmpConfiguration, base_oid: str) -> list[int]:
|
||||
@ -24,6 +28,83 @@ def detect_complex(c: SnmpConfiguration, base_oid: str) -> list[tuple[int]]:
|
||||
return drives
|
||||
|
||||
|
||||
# since there's only two that get fetched via https, these are just kinda hacked together
|
||||
def get_fan_speeds(c: HttpsConfiguration) -> dict[int, tuple[int | float, str]]:  # {id: (speed, unit)}
    """Fetch fan speeds over HTTPS and return them keyed by fan id.

    The fan id is parsed from the entry's ``label``, which is expected to
    look like ``"Fan X"``. Entries that are not JSON objects, have no string
    label, or whose label does not end in a decimal number are skipped.

    Args:
        c: HTTPS connection settings passed to ``get_json_response``.

    Returns:
        A mapping ``{fan_id: (speed, unit)}``. ``speed`` is ``-1`` when the
        entry has no numeric ``speed``; ``unit`` is ``'unknown'`` when the
        entry has no string ``speed_unit``. The mapping may be empty or
        partial if the response does not have the expected structure.

    Raises:
        Whatever ``get_json_response`` raises; the request itself is not
        guarded here.
    """
    speed_map = {}

    response = get_json_response(c, FAN_ENDPOINT)
    try:
        fans = response['fans']
        for fan in fans:
            # anything that is not a JSON object cannot carry the fields below
            if not isinstance(fan, dict):
                continue

            # try to derive fan id
            label = fan.get('label')
            if not isinstance(label, str):
                continue

            fan_id = label[4:]  # label seems to follow a format of "Fan X"
            # isdecimal() only accepts characters int() can parse; isnumeric()
            # also accepts e.g. superscripts and fractions, which make int() raise
            if not fan_id.isdecimal():
                continue
            fan_id = int(fan_id)

            # get speed (bool is an int subclass but is not a speed)
            speed = fan.get('speed')
            if isinstance(speed, bool) or not isinstance(speed, (int, float)):
                speed = -1

            # get units
            unit = fan.get('speed_unit')
            if not isinstance(unit, str):
                unit = 'unknown'

            speed_map[fan_id] = (speed, unit)
    except (KeyError, TypeError) as e:
        print('unexpected response from ILO')
        traceback.print_exception(e)
        print('response:', response)

    return speed_map
|
||||
|
||||
|
||||
def get_temp_sensor_info(c: HttpsConfiguration) -> dict[int, dict[str, str | int | float]]:
    """Fetch extra temperature sensor details over HTTPS, keyed by sensor id.

    The sensor id is parsed from the first two characters of the entry's
    ``label``, which is expected to look like ``"XX-Some Sensor"``. Entries
    that are not JSON objects, have no string label, or whose label does not
    start with a two-digit decimal number are skipped.

    Args:
        c: HTTPS connection settings passed to ``get_json_response``.

    Returns:
        A mapping ``{sensor_id: {name: value}}``. ``label``, ``xposition``
        and ``yposition`` are included as strings when present; ``caution``
        and ``critical`` are included only when they are numbers. The mapping
        may be empty or partial if the response does not have the expected
        structure.

    Raises:
        Whatever ``get_json_response`` raises; the request itself is not
        guarded here.
    """
    # {id: {name: value}}
    labels = {}

    response = get_json_response(c, TEMP_ENDPOINT)
    try:
        sensors = response['temperature']
        for sensor in sensors:
            # anything that is not a JSON object cannot carry the fields below
            if not isinstance(sensor, dict):
                continue

            # try to derive sensor id
            label = sensor.get('label')
            if not isinstance(label, str):
                continue

            sensor_id = label[:2]  # label seems to follow a format of "XX-Some Sensor"
            # isdecimal() only accepts characters int() can parse; isnumeric()
            # also accepts e.g. superscripts and fractions, which make int() raise
            if not sensor_id.isdecimal():
                continue
            sensor_id = int(sensor_id)

            sensor_labels = {}

            # descriptive fields are always stored as strings
            for key in ('label', 'xposition', 'yposition'):
                value = sensor.get(key)
                if value is not None:
                    sensor_labels[key] = str(value)

            # thresholds are kept only when numeric (bool is an int subclass, so exclude it)
            for key in ('caution', 'critical'):
                value = sensor.get(key)
                if isinstance(value, (int, float)) and not isinstance(value, bool):
                    sensor_labels[key] = value

            labels[sensor_id] = sensor_labels
    except (KeyError, TypeError) as e:
        print('unexpected response from ILO')
        traceback.print_exception(e)
        print('response:', response)

    return labels
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from targets.fan import FAN_VALUES, FAN_INDEX
|
||||
from targets.temp import TEMP_VALUES, TEMP_INDEX
|
||||
@ -34,7 +115,7 @@ if __name__ == '__main__':
|
||||
|
||||
config = SnmpConfiguration(
|
||||
SnmpEngine(),
|
||||
CommunityData('deeznuts'),
|
||||
CommunityData('public'),
|
||||
UdpTransportTarget(('192.168.100.88', 161)),
|
||||
ContextData(),
|
||||
)
|
||||
@ -78,3 +159,14 @@ if __name__ == '__main__':
|
||||
print('memory slot', slot, value.name, 'is', states[slot])
|
||||
print()
|
||||
|
||||
print('asdf')
|
||||
conf = HttpsConfiguration(
|
||||
'192.168.100.88',
|
||||
'some username',
|
||||
'some password',
|
||||
'./ilo.pem',
|
||||
5
|
||||
)
|
||||
|
||||
print(get_fan_speeds(conf))
|
||||
print(get_temp_sensor_info(conf))
|
||||
|
@ -45,10 +45,6 @@ class BulkDummyValue(BulkValues):
|
||||
super().__init__(None, name)
|
||||
self._name = name
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
return self._name
|
||||
|
||||
def get_values(self, _: SnmpConfiguration, indexes: list) -> dict:
|
||||
result_dict = {}
|
||||
for index in indexes:
|
||||
@ -57,6 +53,16 @@ class BulkDummyValue(BulkValues):
|
||||
return result_dict
|
||||
|
||||
|
||||
class BulkPredeterminedValues(BulkValues):
    """Value group that returns a stored mapping instead of querying anything.

    The mapping is exposed as the public ``values`` attribute so callers can
    replace or update it between collections.
    """

    def __init__(self, name: str, values: dict | None = None):
        """Create the group.

        Args:
            name: Name of the value group, forwarded to the base class.
            values: Initial mapping. When omitted, a new empty dict is created
                for this instance, so instances never share one default
                mapping.
        """
        super().__init__(None, name)
        self._name = name
        self.values = {} if values is None else values

    def get_values(self, c: SnmpConfiguration, indexes: list) -> dict:
        """Return the stored mapping unchanged; ``c`` and ``indexes`` are not used."""
        return self.values
|
||||
|
||||
|
||||
class BulkNumbers(BulkValues):
|
||||
def __init__(self, oid_template, name: str):
|
||||
super().__init__(oid_template, name)
|
||||
|
@ -110,7 +110,7 @@ DRIVE_REFERENCE_TIME = BulkNumbers(
|
||||
|
||||
DRIVE_SUPPORTS_PREDICTIVE_FAILURE_MONITORING = BulkEnums(
|
||||
(lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 52) + i),
|
||||
'supports_predictive_failure_monitoring',
|
||||
'predictive_failure',
|
||||
{
|
||||
1: 'other',
|
||||
2: 'notAvailable',
|
||||
|
@ -68,6 +68,9 @@ FAN_CONDITION = BulkEnums(
|
||||
}
|
||||
)
|
||||
|
||||
# https
|
||||
FAN_ENDPOINT = 'json/health_fans'
|
||||
|
||||
# for debugging
|
||||
FAN_VALUES = [
|
||||
FAN_LOCALE,
|
||||
|
@ -55,6 +55,9 @@ TEMP_CONDITION = BulkEnums(
|
||||
}
|
||||
)
|
||||
|
||||
# https
|
||||
TEMP_ENDPOINT = 'json/health_temperature'
|
||||
|
||||
# for debugging
|
||||
TEMP_VALUES = [
|
||||
TEMP_SENSOR_LOCALE,
|
||||
|
Loading…
Reference in New Issue
Block a user