diff --git a/main.py b/main.py index 7b06954..c7ceb71 100644 --- a/main.py +++ b/main.py @@ -14,7 +14,7 @@ from targets.fan import * from targets.cpu import * from targets.drive import * from targets.memory import * -import targets.power +from targets.power import * import argparse @@ -120,10 +120,45 @@ class BulkCollector(Collector): raise e -def get_power_draw() -> float: - verbose('collecting ilo_server_power_draw') - val = snmp_get(config, targets.power.POWER_METER_READING) - return val +class PowerCollector(Collector): + def collect(self) -> float: + verbose('collecting ilo_server_power_draw') + try: + reading = snmp_get(config, POWER_METER_READING) + support = snmp_get(config, POWER_METER_SUPPORT) + status = snmp_get(config, POWER_METER_STATUS) + + if not isinstance(reading, int): + print('expected power meter reading to be an int, got', type(reading)) + print('value in question:', reading) + reading = -1 + if not isinstance(support, int): + print('expected power meter support to be an int, got', type(support)) + print('value in question:', support) + support = 1 + if not isinstance(status, int): + print('expected power meter status to be an int, got', type(status)) + print('value in question:', status) + status = 1 + + if support not in POWER_METER_SUPPORT_MAP: + print('ILO returned a value outside of the expected range for POWER_METER_SUPPORT:', support) + support_s = 'unknown' + else: + support_s = POWER_METER_SUPPORT_MAP[support] + if status not in POWER_METER_STATUS_MAP: + print('ILO returned a value outside of the expected range for POWER_METER_STATUS:', status) + status_s = 'unknown' + else: + status_s = POWER_METER_STATUS_MAP[status] + + metric = GaugeMetricFamily('ilo_server_power_draw', 'Power draw of the server in watts', labels=['support', 'status']) + metric.add_metric([support_s, status_s], reading) + yield metric + except Exception as e: + print('Failed to scan SNMP, aborting collection') + SCAN_FAIL_COUNTER.inc() + raise e if __name__ == '__main__': @@ -135,8 +170,7 @@ if __name__ == '__main__': ContextData(), ) - power = Gauge("ilo_server_power_draw", "Power draw of the server in watts") - power.set_function(get_power_draw) + REGISTRY.register(PowerCollector()) no_value = BulkDummyValue('info') @@ -176,12 +210,12 @@ if __name__ == '__main__': DRIVE_INDEX, 'drive', not args.scan_drives_once, - ('Information about installed drives', no_value, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_MODEL, DRIVE_SERIAL, DRIVE_LINK_RATE, DRIVE_STATUS, DRIVE_CONDITION]), - ('Sizes of installed drives in megabytes', DRIVE_SIZE, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_MODEL, DRIVE_SERIAL]), - ('Temperatures of installed drives in celsius', DRIVE_TEMP, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_MODEL, DRIVE_SERIAL]), - ('Temperature thresholds of installed drives in celsius', DRIVE_TEMP_THRESHOLD, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_MODEL, DRIVE_SERIAL]), - ('Maximum temperatures of installed drives in celsius', DRIVE_TEMP_MAX, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_MODEL, DRIVE_SERIAL]), - ('Reference time of installed drives in hours', DRIVE_REFERENCE_TIME, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_MODEL, DRIVE_SERIAL]), + ('Information about installed drives', no_value, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL, DRIVE_FIRMWARE, DRIVE_LINK_RATE, DRIVE_SUPPORTS_PREDICTIVE_FAILURE_MONITORING, DRIVE_SMART_STATUS, DRIVE_MEDIA_TYPE, DRIVE_ROTATIONAL_SPEED, DRIVE_STATUS, DRIVE_CONDITION]), + ('Sizes of installed drives in megabytes', DRIVE_SIZE, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]), + ('Temperatures of installed drives in celsius', DRIVE_TEMP, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]), + ('Temperature thresholds of installed drives in celsius', DRIVE_TEMP_THRESHOLD, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]), + ('Maximum temperatures of installed drives in celsius', DRIVE_TEMP_MAX, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]), + ('Reference time of installed drives in hours', DRIVE_REFERENCE_TIME, [DRIVE_PORT, DRIVE_BOX, DRIVE_BAY, DRIVE_VENDOR, DRIVE_SERIAL]), scan_method=scrape.detect_complex, )) diff --git a/targets/drive.py b/targets/drive.py index 39dcdeb..ec577a9 100644 --- a/targets/drive.py +++ b/targets/drive.py @@ -2,7 +2,7 @@ from snmp_groups import BulkEnums, BulkNumbers, BulkStrings DRIVE_INDEX = '1.3.6.1.4.1.232.3.2.5.1.1.2' -# controller index? +# controller index # DRIVE_CONTROLLER = BulkNumbers( # (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 1) + i), # 'controller' @@ -39,9 +39,9 @@ DRIVE_SERIAL = BulkStrings( 'serial', ) -DRIVE_MODEL = BulkStrings( +DRIVE_FIRMWARE = BulkStrings( (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 4) + i), - 'model', + 'firmware', ) DRIVE_SIZE = BulkNumbers( @@ -107,3 +107,49 @@ DRIVE_REFERENCE_TIME = BulkNumbers( (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 9) + i), 'reference_time' ) + +DRIVE_SUPPORTS_PREDICTIVE_FAILURE_MONITORING = BulkEnums( + (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 52) + i), + 'supports_predictive_failure_monitoring', + { + 1: 'other', + 2: 'notAvailable', + 3: 'available' + } +) + +DRIVE_SMART_STATUS = BulkEnums( + (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 57) + i), + 'smart_status', + { + 1: 'other', + 2: 'ok', + 3: 'replaceDrive' + } +) + +DRIVE_ROTATIONAL_SPEED = BulkEnums( + (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 59) + i), + 'rotational_speed', + { + 1: 'other', + 2: '7200 rpm', + 3: '10k rpm', + 4: '15k rpm', + 5: 'ssd', + } +) + +DRIVE_MEDIA_TYPE = BulkEnums( + (lambda i: (1, 3, 6, 1, 4, 1, 232, 3, 2, 5, 1, 1, 69) + i), + 'media_type', + { + 1: 'other', + 2: 'rotatingPlatters', + 3: 'solidState', + } +) + + +# there appear to be a hell of a lot more, but I don't have the time to add them all right now +# here is a reference: https://oidref.com/1.3.6.1.4.1.232.3.2.5.1.1 diff --git a/targets/power.py b/targets/power.py index c3ee80e..f1df96b 100644 --- a/targets/power.py +++ b/targets/power.py @@ -1,7 +1,15 @@ POWER_METER_READING = '1.3.6.1.4.1.232.6.2.15.3.0' +POWER_METER_SUPPORT = '1.3.6.1.4.1.232.6.2.15.1.0' +POWER_METER_SUPPORT_MAP = { + 1: 'other', + 2: 'supported', + 3: 'unsupported', +} -# I have no idea what these values mean (or map to). any help would be appreciated -# POWER_METER_SUPPORT = '1.3.6.1.4.1.232.6.2.15.1' -# POWER_METER_STATUS = '1.3.6.1.4.1.232.6.2.15.2' -# POWER_METER_PREVIOUS_READING = '1.3.6.1.4.1.232.6.2.15.4' +POWER_METER_STATUS = '1.3.6.1.4.1.232.6.2.15.2.0' +POWER_METER_STATUS_MAP = { + 1: 'other', + 2: 'present', + 3: 'absent', +} diff --git a/targets/temp.py b/targets/temp.py index 1358e16..58aa61e 100644 --- a/targets/temp.py +++ b/targets/temp.py @@ -49,8 +49,9 @@ TEMP_CONDITION = BulkEnums( 'condition', { 1: 'other', - 2: 'normal', - 3: 'high', + 2: 'ok', + 3: 'degraded', + 4: 'failed', } )