add/correct a bunch of metrics

This commit is contained in:
Benjamin Wiegand 2024-05-20 01:36:33 -07:00
parent ff206c24cb
commit 4b7683e9d9
4 changed files with 111 additions and 22 deletions

60
main.py
View File

@ -14,7 +14,7 @@ from targets.fan import *
from targets.cpu import * from targets.cpu import *
from targets.drive import * from targets.drive import *
from targets.memory import * from targets.memory import *
import targets.power from targets.power import *
import argparse import argparse
@ -120,10 +120,45 @@ class BulkCollector(Collector):
raise e raise e
class PowerCollector(Collector):
    """Collector that publishes the iLO power meter as ``ilo_server_power_draw``.

    Each collection reads three scalar OIDs through ``snmp_get``:
    ``POWER_METER_READING`` (the gauge value), ``POWER_METER_SUPPORT`` and
    ``POWER_METER_STATUS`` (both translated to text through their ``*_MAP``
    tables and attached as the ``support`` / ``status`` labels).
    """

    @staticmethod
    def _int_or_default(value, what, default):
        """Return ``value`` if it is an int, else log it and return ``default``."""
        if isinstance(value, int):
            return value
        print(f'expected power meter {what} to be an int, got', type(value))
        print('value in question:', value)
        return default

    @staticmethod
    def _label_for(value, mapping, oid_name):
        """Translate ``value`` through ``mapping``; log and return 'unknown' if absent."""
        if value in mapping:
            return mapping[value]
        print(f'ILO returned a value outside of the expected range for {oid_name}:', value)
        return 'unknown'

    def collect(self):
        """Yield one ``GaugeMetricFamily`` holding the current power reading.

        Yields:
            The ``ilo_server_power_draw`` gauge with a single sample labelled
            by the textual ``support`` and ``status`` values.

        Raises:
            Exception: whatever the SNMP reads or metric construction raised;
                ``SCAN_FAIL_COUNTER`` is incremented before re-raising.
        """
        verbose('collecting ilo_server_power_draw')
        try:
            # Fetch all three values first so that an SNMP failure aborts the
            # collection before any validation messages are printed.
            raw_reading = snmp_get(config, POWER_METER_READING)
            raw_support = snmp_get(config, POWER_METER_SUPPORT)
            raw_status = snmp_get(config, POWER_METER_STATUS)

            # Non-integer replies are reported and replaced with fallbacks
            # (-1 for the reading, 1 for the two enumerations).
            reading = self._int_or_default(raw_reading, 'reading', -1)
            support = self._int_or_default(raw_support, 'support', 1)
            status = self._int_or_default(raw_status, 'status', 1)

            support_s = self._label_for(support, POWER_METER_SUPPORT_MAP, 'POWER_METER_SUPPORT')
            status_s = self._label_for(status, POWER_METER_STATUS_MAP, 'POWER_METER_STATUS')

            metric = GaugeMetricFamily(
                'ilo_server_power_draw',
                'Power draw of the server in watts',
                labels=['support', 'status'],
            )
            metric.add_metric([support_s, status_s], reading)
            yield metric
        except Exception:
            print('Failed to scan SNMP, aborting collection')
            SCAN_FAIL_COUNTER.inc()
            # Bare raise re-raises the active exception with its original traceback.
            raise
if __name__ == '__main__': if __name__ == '__main__':
@ -135,8 +170,7 @@ if __name__ == '__main__':
ContextData(), ContextData(),
) )
power = Gauge("ilo_server_power_draw", "Power draw of the server in watts") REGISTRY.register(PowerCollector())
power.set_function(get_power_draw)
no_value = BulkDummyValue('info') no_value = BulkDummyValue('info')
@ -176,12 +210,12 @@ if __name__ == '__main__':
DRIVE_INDEX, DRIVE_INDEX,
'drive', 'drive',
not args.scan_drives_once, not args.scan_drives_once,
('Information about installed drives', no_value, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_MODEL, DRIVE_SERIAL, DRIVE_LINK_RATE, DRIVE_STATUS, DRIVE_CONDITION]), ('Information about installed drives', no_value, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL, DRIVE_FIRMWARE, DRIVE_LINK_RATE, DRIVE_SUPPORTS_PREDICTIVE_FAILURE_MONITORING, DRIVE_SMART_STATUS, DRIVE_MEDIA_TYPE, DRIVE_ROTATIONAL_SPEED, DRIVE_STATUS, DRIVE_CONDITION]),
('Sizes of installed drives in megabytes', DRIVE_SIZE, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_MODEL, DRIVE_SERIAL]), ('Sizes of installed drives in megabytes', DRIVE_SIZE, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]),
('Temperatures of installed drives in celsius', DRIVE_TEMP, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_MODEL, DRIVE_SERIAL]), ('Temperatures of installed drives in celsius', DRIVE_TEMP, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]),
('Temperature thresholds of installed drives in celsius', DRIVE_TEMP_THRESHOLD, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_MODEL, DRIVE_SERIAL]), ('Temperature thresholds of installed drives in celsius', DRIVE_TEMP_THRESHOLD, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]),
('Maximum temperatures of installed drives in celsius', DRIVE_TEMP_MAX, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_MODEL, DRIVE_SERIAL]), ('Maximum temperatures of installed drives in celsius', DRIVE_TEMP_MAX, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]),
('Reference time of installed drives in hours', DRIVE_REFERENCE_TIME, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_MODEL, DRIVE_SERIAL]), ('Reference time of installed drives in hours', DRIVE_REFERENCE_TIME, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]),
scan_method=scrape.detect_complex, scan_method=scrape.detect_complex,
)) ))

View File

@ -2,7 +2,7 @@ from snmp_groups import BulkEnums, BulkNumbers, BulkStrings
DRIVE_INDEX = '1.3.6.1.4.1.232.3.2.5.1.1.2' DRIVE_INDEX = '1.3.6.1.4.1.232.3.2.5.1.1.2'
# controller index? # controller index
# DRIVE_CONTROLLER = BulkNumbers( # DRIVE_CONTROLLER = BulkNumbers(
# (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 1) + i), # (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 1) + i),
# 'controller' # 'controller'
@ -39,9 +39,9 @@ DRIVE_SERIAL = BulkStrings(
'serial', 'serial',
) )
DRIVE_MODEL = BulkStrings( DRIVE_FIRMWARE = BulkStrings(
(lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 4) + i), (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 4) + i),
'model', 'firmware',
) )
DRIVE_SIZE = BulkNumbers( DRIVE_SIZE = BulkNumbers(
@ -107,3 +107,49 @@ DRIVE_REFERENCE_TIME = BulkNumbers(
(lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 9) + i), (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 9) + i),
'reference_time' 'reference_time'
) )
# Per-drive enumeration read from column 52 of the table at
# 1.3.6.1.4.1.232.3.2.5.1.1. The lambda appends the row index tuple `i` to the
# column OID; the dict translates the raw integer into a text label.
# NOTE(review): how BulkEnums consumes these arguments is defined in snmp_groups.
DRIVE_SUPPORTS_PREDICTIVE_FAILURE_MONITORING = BulkEnums(
    (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 52) + i),
    'supports_predictive_failure_monitoring',
    {
        1: 'other',
        2: 'notAvailable',
        3: 'available',
    },
)
# Per-drive enumeration read from column 57 of the table at
# 1.3.6.1.4.1.232.3.2.5.1.1. The lambda appends the row index tuple `i` to the
# column OID; the dict translates the raw integer into a text label.
# NOTE(review): how BulkEnums consumes these arguments is defined in snmp_groups.
DRIVE_SMART_STATUS = BulkEnums(
    (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 57) + i),
    'smart_status',
    {
        1: 'other',
        2: 'ok',
        3: 'replaceDrive',
    },
)
# Per-drive enumeration read from column 59 of the table at
# 1.3.6.1.4.1.232.3.2.5.1.1. The lambda appends the row index tuple `i` to the
# column OID; the dict translates the raw integer into a text label
# (value 5 is reported as 'ssd' rather than a spindle speed).
# NOTE(review): how BulkEnums consumes these arguments is defined in snmp_groups.
DRIVE_ROTATIONAL_SPEED = BulkEnums(
    (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 59) + i),
    'rotational_speed',
    {
        1: 'other',
        2: '7200 rpm',
        3: '10k rpm',
        4: '15k rpm',
        5: 'ssd',
    },
)
# Per-drive enumeration read from column 69 of the table at
# 1.3.6.1.4.1.232.3.2.5.1.1. The lambda appends the row index tuple `i` to the
# column OID; the dict translates the raw integer into a text label.
# NOTE(review): how BulkEnums consumes these arguments is defined in snmp_groups.
DRIVE_MEDIA_TYPE = BulkEnums(
    (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 69) + i),
    'media_type',
    {
        1: 'other',
        2: 'rotatingPlatters',
        3: 'solidState',
    },
)
# This table appears to expose many more columns than are mapped here; they have not all been added yet.
# Reference: https://oidref.com/1.3.6.1.4.1.232.3.2.5.1.1

View File

@ -1,7 +1,15 @@
POWER_METER_READING = '1.3.6.1.4.1.232.6.2.15.3.0' POWER_METER_READING = '1.3.6.1.4.1.232.6.2.15.3.0'
POWER_METER_SUPPORT = '1.3.6.1.4.1.232.6.2.15.1.0'
# Text labels for the integer returned by POWER_METER_SUPPORT.
# The order follows the cpqHePowerMeterSupport enumeration in HPE's
# CPQHLTH-MIB: other(1), unsupported(2), supported(3).
# NOTE(review): confirm against the MIB version shipped with your iLO firmware.
POWER_METER_SUPPORT_MAP = {
    1: 'other',
    2: 'unsupported',
    3: 'supported',
}
# I have no idea what these values mean (or map to). any help would be appreciated POWER_METER_STATUS = '1.3.6.1.4.1.232.6.2.15.2.0'
# POWER_METER_SUPPORT = '1.3.6.1.4.1.232.6.2.15.1' POWER_METER_STATUS_MAP = {
# POWER_METER_STATUS = '1.3.6.1.4.1.232.6.2.15.2' 1: 'other',
# POWER_METER_PREVIOUS_READING = '1.3.6.1.4.1.232.6.2.15.4' 2: 'present',
3: 'absent',
}

View File

@ -49,8 +49,9 @@ TEMP_CONDITION = BulkEnums(
'condition', 'condition',
{ {
1: 'other', 1: 'other',
2: 'normal', 2: 'ok',
3: 'high', 3: 'degraded',
4: 'failed',
} }
) )