#!/usr/bin/env python3
# ***************************************************************************
#  IIIIII NNN  NNN  Copyright (C) 2024 Innovative Networks, Inc.
#    II    NNN  N   All Rights Reserved. Any redistribution or reproduction
#    II    N NN N   of part or all of the content of this program in any form
#    II    N  NNN   without expressed written consent of the copyright holder
#  IIIIII NNN  NNN  is strictly prohibited.  Please contact admins@in-kc.com
#   Be Innovative.  for additional information.
# ***************************************************************************
#  check_disk_health.py
#  Author - Ian Perry <iperry@indigex.com>
#
#  Purpose:  Checks disk lifetime and wear level on an ubuntu/debian host.
#
#  Version History:
#       2024.12.09 - Initial Creation
# ***************************************************************************

# pylint: disable=bare-except
# pylint: disable=broad-exception-caught
# pylint: disable=redefined-outer-name
# pylint: disable=missing-module-docstring
# pylint: disable=too-many-locals

import sys

# inmon_utils supplies the check/result classes used for all output below, so the
# script cannot report anything meaningful without it. Exit with status 3, the
# same status this script uses for its "[UNKNOWN]" results.
try:
    import inmon_utils as inmon
except Exception as e:
    # print() performs no %-interpolation; format the message explicitly so the
    # exception text is part of the sentence instead of a trailing argument.
    print(f"Failed to import inmon_utils: {e}")
    sys.exit(3)

import argparse
import logging
import subprocess
import json
import re

import inmon_quant as quant

# Command line interface. Only the device is mandatory; every threshold has a default.
parser = argparse.ArgumentParser(description="Checks disk health on a Ubuntu/Debian host.")

parser.add_argument("-d", "--debug", dest="debug", action="store_true", help="Enable debug logging")
parser.add_argument(
    "-D", "--device",
    dest="device",
    required=True,
    help="Device to check e.g. /dev/sdX or /dev/disk/by-id/X",
)

# The four threshold options all have the same shape, so they are declared from a
# table of (flag, dest, default, help) rows, in the order they appear in --help.
# No type= is given: values supplied on the command line stay strings, while the
# defaults are ints.
for opt_flag, opt_dest, opt_default, opt_help in (
    (
        "--lifetime-percent-warn",
        "lifetime_percent_warn",
        80,
        "Warn if lifetime percent is greater or equal to this value and less than lifetime-percent-crit",
    ),
    (
        "--lifetime-percent-crit",
        "lifetime_percent_crit",
        90,
        "Critical if lifetime percent is greater or equal to this value",
    ),
    (
        "--wear-level-warn",
        "wear_level_warn",
        80,
        "Warn if wear level is greater or equal to this value and less than wear-level-crit",
    ),
    (
        "--wear-level-crit",
        "wear_level_crit",
        90,
        "Critical if wear level is greater or equal to this value",
    ),
):
    parser.add_argument(opt_flag, dest=opt_dest, help=opt_help, default=opt_default)

args = parser.parse_args()

# -d/--debug switches the root logger from INFO to DEBUG.
logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
logger = logging.getLogger(__name__)

# Grab the data from smartctl: "-a" requests all SMART information and "--json"
# makes it machine readable. check=False because the exit status is inspected below.
try:
    smartctl_output = subprocess.run(
        ["/usr/bin/sudo", "/usr/sbin/smartctl", args.device, "--json=s", "-a"],
        capture_output=True,
        text=True,
        check=False
    )
except OSError as err:
    # Raised when the executable itself cannot be started (missing, not executable, ...).
    print(f"[UNKNOWN] - Unable to execute smartctl: {err}")
    sys.exit(3)

# Load into json. stdout can be empty or plain text when sudo or smartctl fails
# before any JSON is produced, so a parse failure is reported as UNKNOWN instead
# of escaping as a traceback.
try:
    data = json.loads(smartctl_output.stdout)
    if not isinstance(data, dict):
        raise ValueError("smartctl JSON output is not an object")
except ValueError:  # json.JSONDecodeError is a ValueError subclass
    raw_output = (smartctl_output.stderr or smartctl_output.stdout or "").strip()
    detail = raw_output.splitlines()[0] if raw_output else "no output"
    print(
        f"[UNKNOWN] - smartctl (exit status {smartctl_output.returncode}) "
        f"did not return valid JSON: {detail}"
    )
    sys.exit(3)

# Check for run errors
# NOTE(review): smartctl's exit status is a bit mask and the higher bits describe
# the disk's health (failing status, logged errors, ...) rather than a failed run.
# Any non-zero status is still treated as UNKNOWN here, as before; consider only
# treating the low "could not run / could not open device" bits as fatal.
if smartctl_output.returncode != 0:
    # The messages list is not guaranteed to be present, so fall back to the exit status.
    messages = (data.get('smartctl') or {}).get('messages') or [{}]
    reason = messages[0].get('string') or f"smartctl exited with status {smartctl_output.returncode}"
    print(f"[UNKNOWN] - {reason}")
    sys.exit(3)

# The protocol reported by smartctl decides which branch below handles the device;
# it is lowercased so the comparisons are case-insensitive.
protocol = data['device']['protocol']
device_type = protocol.lower()

# Collects every individual check result produced below.
container = inmon.MultiActiveCheck()

# Bytes represented by one NVMe "data unit". The NVMe health log reports data units in
# thousands of 512-byte units regardless of the LBA size the namespace is formatted with.
NVME_DATA_UNIT_BYTES = 512 * 1000

# Finds a "_<n>mib" size marker in a lowercased statistic name, e.g. "host_writes_32mib".
# The count is optional so that a bare "_mib" marker is read as 1 MiB per count.
MIB_NAME_RE = re.compile(r'\S+_(?P<size>\d*)mib')


def _guess_uom(name):
    """Guess the unit of measurement from a lowercased statistic name.

    Returns a ``(uom, d_uom)`` tuple: ``uom`` is attached to the perfdata quantity and
    ``d_uom`` is appended to the human readable text. Both are None when the name gives
    no hint.
    """
    if "percent" in name:
        return "%", "%"
    if "hour" in name:
        return "h", "h"
    if "fahrenheit" in name:
        return "F", "F"
    if "temperature" in name or "celsius" in name:
        return "C", "C"
    return None, None


def _mib_scale(name):
    """Split a "_<n>mib" size marker out of a lowercased statistic name.

    Returns ``(name_without_marker, bytes_per_count)``, or None when the name has no such
    marker (the caller then leaves the value unscaled instead of crashing).
    """
    match = MIB_NAME_RE.search(name)
    if match is None:
        return None
    size = match.group('size')
    mib_per_count = int(size) if size else 1
    return name.replace(f"_{size}mib", ""), mib_per_count * 1024 * 1024


# NVME devices don't have a wear level statistic nor do they support attributes. They have a health log
# that we can use instead.
if device_type == "nvme":
    health_log = data['nvme_smart_health_information_log']

    # Thresholds use lifetime percent
    thresholds = quant.parse_thresholds(None, args.lifetime_percent_warn, args.lifetime_percent_crit, None)
    # percentage_used is reported as the share of the drive's life already consumed (higher is
    # worse), so it is compared against the thresholds as-is. It is removed from the log so the
    # generic loop below does not report it a second time.
    expected_lifetime = health_log.pop('percentage_used')
    # Add check result to container
    check = inmon.CheckResult(
        quant.parse_stat(quant.Quantity(value=expected_lifetime), thresholds),
        f"{expected_lifetime}% lifetime used",
    )
    check.add_perfdata(inmon.PerfData("expected_lifetime", quant.Quantity(value=expected_lifetime, uom="%")))
    container.append(check)

    # Add all remaining health log items to perfdata
    for key, val in health_log.items():
        name = key.lower()
        logger.debug("Attribute %s: %s", key, val)
        # Handle uom without value change
        uom, d_uom = _guess_uom(name)

        # Handle value changes where units are in the name or sizes are in the name
        if "unit" in name:
            # Data units are fixed-size (see NVME_DATA_UNIT_BYTES), not LBA-sized.
            val = NVME_DATA_UNIT_BYTES * val
            uom, d_uom = "B", None
        elif "mib" in name:
            scaled = _mib_scale(name)
            if scaled is not None:
                name, bytes_per_count = scaled
                val = bytes_per_count * val
                uom, d_uom = "B", None

        # Some temperature sensors are lists
        if isinstance(val, list):
            val = [x for x in val if x is not None]
            for num, sub_val in enumerate(val):
                check = inmon.CheckResult(0, f"{key}_{num} {sub_val}{d_uom if d_uom else ''}")
                check.add_perfdata(
                    inmon.PerfData(
                        f"{name}_{num}",
                        quant.Quantity(
                            value=int(sub_val),
                            uom=uom
                        )
                    )
                )
                container.append(check)
        else:
            check = inmon.CheckResult(0, f"{key} {val}{d_uom if d_uom else ''}")
            check.add_perfdata(
                inmon.PerfData(
                    name,
                    quant.Quantity(
                        value=int(val),
                        uom=uom
                    )
                )
            )
            container.append(check)

# ATA devices all have attributes.
elif device_type == "ata":
    wear_thresholds = quant.parse_thresholds(None, args.wear_level_warn, args.wear_level_crit, None)
    lifetime_thresholds = quant.parse_thresholds(None, args.lifetime_percent_warn, args.lifetime_percent_crit, None)

    logger.debug("Attributes: %s", data['ata_smart_attributes']['table'])

    # Index the attribute table by numeric SMART ID so individual attributes can be looked up
    # directly. The ID stays inside each entry because it is printed in the output below.
    attributes = {int(entry['id']): entry for entry in data['ata_smart_attributes']['table']}

    # There are three attributes to check for lifetime, 169 is standard, 231 is for SSDs, 202 is vendor specific.
    logger.debug("Attribute 169: %s", attributes.get(169))
    logger.debug("Attribute 231: %s", attributes.get(231))
    logger.debug("Attribute 202: %s", attributes.get(202))
    # First ID present wins. The attribute is popped so the generic loop below skips it.
    for attr_id in (169, 231, 202):
        if attr_id in attributes:
            expected_lifetime = 100 - attributes.pop(attr_id)['raw']['value']
            break
    else:
        print("[UNKNOWN] - Unable to obtain expected lifetime from S.M.A.R.T. attributes 169, 231, or 202.")
        sys.exit(3)

    logger.debug("Expected lifetime: %s", expected_lifetime)

    res = quant.parse_stat(quant.Quantity(value=int(expected_lifetime)), lifetime_thresholds)

    # Create check
    check = inmon.CheckResult(
        res,
        f"{expected_lifetime}% lifetime used",
    )
    check.add_perfdata(inmon.PerfData("expected_lifetime", quant.Quantity(value=int(expected_lifetime), uom="%")))
    container.append(check)

    # Gather wear data if available
    expected_wear = None #pylint: disable=invalid-name

    # attribute 173 is supposed to be the SSD wear leveling count and 177 is wear range delta.
    # However, it appears that on samsung and kingston drives at least, 177 is wear_leveling_count
    # which is a range starting at 100 and going to 0 as write lifetime goes down.
    logger.debug("Attribute 173: %s", attributes.get(173))
    logger.debug("Attribute 177: %s", attributes.get(177))
    # Unlike the lifetime attributes, the normalized 'value' field is used here, not the raw value.
    for attr_id in (173, 177):
        if attr_id in attributes:
            expected_wear = 100 - attributes.pop(attr_id)['value']
            break

    # If we have a value, use it
    if expected_wear is not None:
        logger.debug("Expected wear level: %s", expected_wear)
        res = quant.parse_stat(quant.Quantity(value=int(expected_wear)), wear_thresholds)

        check = inmon.CheckResult(
            res,
            f"{expected_wear}% wear level used",
        )
        container.append(check)

    lba_size = data['logical_block_size']

    # Add all remaining ATA attributes to perfdata in check and to output
    for i, attr in attributes.items():
        name = attr['name'].lower()
        logger.debug("Attribute %s: %s", i, attr)
        raw_value = attr['raw']['value']
        val = raw_value
        # Handle unit of measurement if not modifying value
        uom, d_uom = _guess_uom(name)

        # Handle units and sizes
        if "unit" in name:
            val = lba_size * raw_value
            uom, d_uom = "B", None
        elif "mib" in name:
            scaled = _mib_scale(name)
            if scaled is not None:
                name, bytes_per_count = scaled
                val = bytes_per_count * raw_value
                uom, d_uom = "B", None

        # Add attribute with smart ID to output
        check = inmon.CheckResult(0, f"SMART {attr['id']:03}: {attr['name']} {attr['raw']['value']}{d_uom if d_uom else ''}")
        check.add_perfdata(
            inmon.PerfData(
                name,
                quant.Quantity(
                    value=int(val),
                    uom=uom
                )
            )
        )
        container.append(check)

else:
    # print() performs no %-interpolation, so the device type is formatted explicitly.
    print(f"[UNKNOWN] - Unsupported device type {device_type}")
    sys.exit(3)

# Pull the identifying fields out of the smartctl data for the summary line.
drive_serial = data['serial_number']
drive_model = data['model_name']
# Capacity is converted with quant.parse_bytes using the 'G' unit and rounded to two decimals.
capacity = round(quant.parse_bytes(str(data['user_capacity']['bytes']), 'G', numeric=True), 2)

# Add device information as a secondary check; result code 0 so it never raises the state.
summary = f"Device model {drive_model}, size {capacity}G, serial number {drive_serial}"
container.append(inmon.CheckResult(0, summary))

# Hand the collected results to inmon_utils (presumably prints them and sets the exit code -- confirm).
inmon.handle_exit(container)
