#!/usr/bin/env python3
import os
import re
import subprocess
import sys
import json

# check_bluefs_db_usage - Icinga monitoring plugin
#
# Default warn (90) and crit (95) thresholds can be adjusted
# per OSD by changing the arguments fed to this script:
#   check_bluefs_db_usage osd.NN:WARN:CRIT osd.XX:WARN:CRIT [..]
#
# eg:
#   check_bluefs_db_usage osd.81:96:98 osd.72:10:20
#
# would check osd.81 on this system with warn 96 and crit 98 thresholds
# and osd.72 with warn 10 crit 20 thresholds. All other discovered OSDs
# will be checked with the default warn (90) and crit (95) thresholds.
#
# Usage percentages applied to every OSD that has no explicit override.
default_warn_threshold = 90
default_crit_threshold = 95


def parse_threshold_overrides(args):
    """Extract per-OSD threshold overrides from command line arguments.

    Every argument of the form ``osd.NN:WARN:CRIT`` produces one entry.
    Arguments in any other form are ignored. When the same OSD is named more
    than once, the last occurrence wins.

    Args:
        args: iterable of argument strings.

    Returns:
        dict mapping the OSD name (e.g. ``"osd.81"``) to a dict holding the
        integer percentages under the keys ``'warn'`` and ``'crit'``.
    """
    pattern = re.compile(r'^(osd\.\d+):(\d+):(\d+)$')
    overrides = {}
    for arg in args:
        res = pattern.match(arg)
        if res:
            # Stored as int so overrides have the same type as the defaults.
            overrides[res.group(1)] = {
                'warn': int(res.group(2)),
                'crit': int(res.group(3)),
            }
    return overrides


# argv[0] is the script path and can never be an override, so it is skipped.
override_thresholds = parse_threshold_overrides(sys.argv[1:])


def fatal(msg):
    """Report *msg* as a CRITICAL check result and terminate with status 2."""
    print("CRITICAL - {}".format(msg))
    raise SystemExit(2)

def warning(msg):
    """Report *msg* as a WARNING check result and terminate with status 1."""
    print("WARNING - {}".format(msg))
    raise SystemExit(1)

def ok(msg):
    """Report *msg* as an OK check result and terminate with status 0."""
    print("OK - {}".format(msg))
    raise SystemExit(0)


# Find all Ceph OSD 'asok' admin socket files by first looking up the fsid's
# and then looking for all matching socket files inside that fsid's directory.
# Each entry of osd_asoks is a two-element list: [osd name, socket path],
# e.g. ['osd.81', '/var/run/ceph/<fsid>/ceph-osd.81.asok'].
osd_asoks = []
ceph_fsid_path = "/var/run/ceph/"
fsid_pattern = re.compile(r'^[a-f0-9]{8}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{12}$')
# Group 1 is the OSD name ("osd.NN"), group 2 its numeric id.
osd_asok_pattern = re.compile(r'^ceph\-(osd\.(\d+))\.asok$')

try:
    # The context manager closes the directory handle as soon as the listing
    # has been consumed instead of leaving that to garbage collection.
    with os.scandir(ceph_fsid_path) as fsid_dents:
        # Entries that merely look like an fsid but are not directories are
        # skipped; sorting keeps the check output stable between runs.
        fsid_dirs = sorted(
            fsid_dent.path for fsid_dent in fsid_dents
            if fsid_pattern.match(fsid_dent.name) and fsid_dent.is_dir()
        )

    for ceph_osd_asok_path in fsid_dirs:
        found = []
        with os.scandir(ceph_osd_asok_path) as osd_asok_dents:
            for osd_asok_dent in osd_asok_dents:
                asok_match = osd_asok_pattern.match(osd_asok_dent.name)
                if asok_match:
                    # DirEntry.path joins directory and name without
                    # doubling the path separator.
                    found.append((int(asok_match.group(2)),
                                  asok_match.group(1),
                                  osd_asok_dent.path))
        # Sort by numeric id so that osd.2 is reported before osd.10.
        for _, osd_name, osd_asok_path in sorted(found):
            osd_asoks.append([osd_name, osd_asok_path])
except OSError as e:
    # Without this, a missing or unreadable directory ends the plugin with a
    # traceback instead of a well-formed check result.
    fatal(f"could not list OSD admin sockets below {ceph_fsid_path}: '{e}'")


# For each Ceph OSD 'asok' admin socket, query the perf data and process the
# bluefs db usage. Each OSD ends up in exactly one of the three lists below;
# a failure to obtain or interpret the perf data ends the check immediately
# through fatal().
fatals = []
warnings = []
oks = []

for osd_asok_entry in osd_asoks:
    osd_name = osd_asok_entry[0]
    osd_asok_path = osd_asok_entry[1]

    cmd = ['/usr/bin/ceph', 'daemon', osd_asok_path, 'perf', 'dump']

    # check=True is what makes a non-zero exit status raise
    # CalledProcessError; OSError covers a missing or non-executable binary.
    try:
        process = subprocess.run(cmd, capture_output=True, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        fatal(f"ceph daemon {osd_asok_path} perf dump failed: '{e}'")

    # ValueError is the common base of json.JSONDecodeError and
    # UnicodeDecodeError, so undecodable output is reported the same way.
    try:
        json_obj = json.loads(process.stdout.decode('utf-8'))
    except ValueError as e:
        fatal(f"could not parse {osd_asok_path} perf dump data as json: '{e}'")

    if not isinstance(json_obj, dict) or 'bluefs' not in json_obj:
        fatal("key 'bluefs' is not in perf dump data")
    bluefs = json_obj['bluefs']
    for key in ['db_total_bytes', 'db_used_bytes']:
        if not isinstance(bluefs, dict) or key not in bluefs:
            fatal(f"key '{key}' is not in 'bluefs' dict in perf dump data")

    db_total_bytes = bluefs['db_total_bytes']
    db_used_bytes = bluefs['db_used_bytes']
    # A total of 0 (or a non-numeric value) makes the percentage undefined;
    # report that explicitly rather than dying with a traceback.
    try:
        usage_percentage = db_used_bytes / db_total_bytes * 100
    except (ZeroDivisionError, TypeError):
        fatal(f"cannot compute bluefs db usage for {osd_name}: "
              f"db_used_bytes={db_used_bytes!r}, db_total_bytes={db_total_bytes!r}")

    check_warn_threshold = default_warn_threshold
    check_crit_threshold = default_crit_threshold
    if osd_name in override_thresholds:
        # int() accepts both string and integer override values.
        check_warn_threshold = int(override_thresholds[osd_name]['warn'])
        check_crit_threshold = int(override_thresholds[osd_name]['crit'])

    if usage_percentage >= check_crit_threshold:
        fatals.append(f"{osd_name} usage {usage_percentage:.2f} >= {check_crit_threshold}")
    elif usage_percentage >= check_warn_threshold:
        warnings.append(f"{osd_name} usage {usage_percentage:.2f} >= {check_warn_threshold}")
    else:
        oks.append(f"{osd_name} usage ok: {usage_percentage:.0f}%")


# Construct a message to show as check output and exit accordingly.
# Only sections that actually have entries are included, most severe first,
# so the output never carries an empty "OK: " section.
sections = []
if fatals:
    sections.append("CRITICAL: " + '; '.join(fatals))
if warnings:
    sections.append("WARNING: " + '; '.join(warnings))
if oks:
    sections.append("OK: " + '; '.join(oks))

if sections:
    message = '; '.join(sections)
else:
    # Nothing was checked at all; say so instead of printing an empty result.
    message = "no OSD admin sockets found"

# The exit status reflects the most severe section present.
if fatals:
    fatal(message)

if warnings:
    warning(message)

ok(message)
