#! /usr/bin/env python3
import os
import sys
import json
import time
import socket
import signal
import platform
import argparse
import requests
import threading
import subprocess
import configparser
import urllib3

# Suppress urllib3's InsecureRequestWarning:
# api-monitoring.dmz.bit.nl does not have a valid SSL certificate.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# TODO-LIST:
# - check if namespaces actually exist before trying to use them
# - rewrite plugins config file to allow additional config options
#    - logging
#    - local echo
# - Errors like "configparser.DuplicateSectionError: While reading from
#   '/etc/bit-monitoring-agent/plugins.cfg' [line 14]: section 'AGENT_UPTIME'
#   already exists" do not show up in /var/log/bit-monitoring-agent
#
# DONE:
# - register in eventlog on Windows
# - run agent in a specific namespace
# - ability to specify a (different) namespace for each plugin
# - plugin specific namespace
# - plugin specific timeout
# - logging to eventlog on Windows
# - forking & killing processes on Windows
# - add HTTP(S) proxy support
# - handle ^C
# - SIGHUP reloads configs
# - move as many defaults to config files as possible
# - comments and documentation
# - checks on robustness and more checks!
# - logging to file instead of (sys|Event)log

# some defaults and internal constants
VERSION = "2.0"                 # version of this tool
# agent version which is used to report to the monitoring server
AGENT_VERSION = "201"
MIN_INTERVAL = 60                   # minimum interval (seconds) between checks
HTTP_TIMEOUT = 10                   # timeout (seconds) for JSON HTTP post requests
DAEMON_NAME = "bit-monitoring-agent"      # name shown in logs and used for reporting

# these values should be overwritten in the agent config file if required
PLUG_TIMEOUT = 15                    # default plugin timeout (seconds)
LOG_TYPE = "FILE"                   # default for *nix; on Windows log() always appends to FILE_LOG

# custom log levels: a lower value is more severe; log() emits a message
# when its level is <= LOG_LEVEL
LOG_INFO = 3
LOG_WARNING = 2
LOG_ERROR = 1

# default log level
LOG_LEVEL = LOG_INFO
# do not log to stdout when not running on a tty
ON_A_TTY = False



# Platform specific paths and log destination setup.
#
# `logfile` is kept as a module-level name for backward compatibility only:
# neither syslog.openlog() nor logging.basicConfig() returns a handle, so
# there is never anything to close.
logfile = None

if platform.system() != "Windows":
    # paths on *nix
    PATH_PID = "/run"
    PATH_CONF = "/etc/bit-monitoring-agent"
    PATH_LOG = "/var/log/"
    FILE_CFG = "%s/agent.cfg" % PATH_CONF
    FILE_PLUG = "%s/plugins.cfg" % PATH_CONF
    FILE_LOG = "%s/%s.log" % (PATH_LOG, DAEMON_NAME)
    FILE_PID = "%s/%s.pid" % (PATH_PID, DAEMON_NAME)
    ON_A_TTY = sys.stdout.isatty()

    # check for a log_type override; ConfigParser.read() silently skips a
    # missing file, so only parse errors end up in the except branch
    config = configparser.ConfigParser()
    try:
        config.read(FILE_CFG)
    except Exception as e:
        print("ERROR: failed to read config: %s" % e)
        sys.exit(1)

    # check for non default log_type; an explicit "FILE" is the default and
    # must not be reported as unknown
    if config.has_option('main', "log_type"):
        requested_log_type = config.get('main', 'log_type')
        if requested_log_type.upper() == "SYSLOG":
            LOG_TYPE = "SYSLOG"
        elif requested_log_type.upper() != "FILE":
            print("Unknown log type '%s', ignoring it." % requested_log_type)

    if LOG_TYPE == "SYSLOG":
        import syslog
        # map custom log levels to syslog priorities
        LOG_LEVELS = {
            LOG_INFO: syslog.LOG_INFO,
            LOG_WARNING: syslog.LOG_WARNING,
            LOG_ERROR: syslog.LOG_ERR,
        }

        try:
            # open a syslog connection
            syslog.openlog(
                DAEMON_NAME, logoption=syslog.LOG_PID | syslog.LOG_NOWAIT)
        except Exception as e:
            print("ERROR: failed to write to syslog: %s" % e)
            sys.exit(1)
    else:
        import logging
        # map custom log levels to the matching logging module functions
        LOG_LEVELS = {
            LOG_INFO: logging.info,
            LOG_WARNING: logging.warning,
            LOG_ERROR: logging.error,
        }

        try:
            # open log file
            # NOTE(review): `level` receives the custom LOG_LEVEL value (3),
            # which is below logging.DEBUG (10), so the root logger itself
            # filters nothing; filtering is done by log() instead.
            logging.basicConfig(
                filename=FILE_LOG, format='[%(asctime)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=LOG_LEVEL)
        except Exception as e:
            print("ERROR: failed to write to log: %s" % e)
            sys.exit(1)

else:
    # paths on Windows
    PATH_PID = "C:\\bit-monitoring-agent\\run"
    PATH_CONF = "C:\\bit-monitoring-agent\\etc"
    FILE_CFG = "%s\\agent.cfg" % PATH_CONF
    FILE_PLUG = "%s\\plugins.cfg" % PATH_CONF
    FILE_PID = "%s\\%s.pid" % (PATH_PID, DAEMON_NAME)
    FILE_LOG = "C:\\bit-monitoring-agent\\%s.log" % (DAEMON_NAME)



def _write_rotating_log(text):
    """Append `text` to FILE_LOG and rotate the file once it exceeds 1 MiB.

    Rotated files are kept as FILE_LOG.1 (newest) through FILE_LOG.9 (oldest).
    """
    with open(FILE_LOG, "a") as f:
        f.write("[%s] %s\n" % (time.strftime("%d-%m-%Y %H:%M:%S", time.localtime()), text))

    if os.stat(FILE_LOG).st_size <= 1048576:
        return

    # drop the oldest file, then shift .8 -> .9, .7 -> .8, ..., .1 -> .2
    oldest = "%s.9" % FILE_LOG
    if os.path.exists(oldest):
        os.unlink(oldest)
    for index in range(8, 0, -1):
        rotated = "%s.%d" % (FILE_LOG, index)
        if os.path.exists(rotated):
            os.replace(rotated, "%s.%d" % (FILE_LOG, index + 1))
    if os.path.exists(FILE_LOG):
        os.replace(FILE_LOG, "%s.1" % FILE_LOG)

    # start a fresh, empty log file
    with open(FILE_LOG, "a"):
        pass


def log(text, level=LOG_INFO, always=False, do_exit=False):
    """Log `text` to the terminal (when on a tty) and the platform's log.

    text    -- message to log
    level   -- one of LOG_INFO, LOG_WARNING, LOG_ERROR; the message is only
               emitted when level <= LOG_LEVEL
    always  -- emit the message regardless of LOG_LEVEL
    do_exit -- call sys.exit(1) afterwards, even if the message was filtered

    On Windows the message is appended to FILE_LOG (with size based
    rotation); on every other platform it goes to syslog or to the logging
    module, depending on LOG_TYPE. This mirrors the module setup, which
    prepares LOG_LEVELS for every non-Windows platform, not just Linux.
    """
    if always or level <= LOG_LEVEL:
        if ON_A_TTY:
            print("[%s] %s" % (time.strftime("%d-%m-%Y %H:%M:%S", time.localtime()), text))

        if platform.system() == "Windows":
            _write_rotating_log(text)
        elif LOG_TYPE == "SYSLOG":
            syslog.syslog(LOG_LEVELS[level], text)
        else:
            LOG_LEVELS[level](text)

    if do_exit:
        sys.exit(1)


class Command(object):
    """Run one plugin command through the shell, bounded by a timeout.

    plugin -- dict with "cmd" (shell command line) and "timeout" (seconds);
              on Linux an optional "ns" names the network namespace to use
    config -- agent configuration dict; only its optional
              "default_namespace" entry is read (Linux only)

    After run(), get_exitcode() returns the command's exit status (124 when
    it was killed because of the timeout, 3 when it could not be started)
    and get_stdout() returns the captured standard output as bytes.
    """

    def __init__(self, plugin, config):
        self.process = None
        # bytes, like the output captured by run(), so callers can always
        # decode the result the same way
        self.stdout = b""
        self.exitcode = -1
        self.timeout = plugin["timeout"]
        self.cmd = plugin["cmd"]

        if platform.system() == "Linux":
            # on linux we use the timeout binary as an extra way to kill
            # the process and its children after the timeout is reached.
            prefix = "/usr/bin/timeout %d " % plugin["timeout"]

            # run in a namespace if specified: the plugin's own namespace
            # wins over the agent wide default
            namespace = plugin.get("ns", "") or config.get("default_namespace", "")
            if namespace:
                prefix += "/sbin/ip netns exec %s " % namespace

            self.cmd = prefix + plugin["cmd"]

    def run(self):
        """Execute the command and record its exit code and stdout."""
        def target():
            try:
                self.process = subprocess.Popen(self.cmd, shell=True, stdout=subprocess.PIPE)
            except OSError as e:
                # the shell itself could not be started; report it as output
                self.stdout = ("UNKNOWN: failed to start plugin: %s" % e).encode("utf-8")
                return
            self.stdout, _ = self.process.communicate()

        thread = threading.Thread(target=target)
        thread.start()
        thread.join(self.timeout)

        if thread.is_alive():
            # kill the process if it hasn't terminated properly; the process
            # may not exist yet if Popen() itself is still busy
            if self.process is not None:
                if platform.system() == "Windows":
                    subprocess.call(
                        ['taskkill', '/F', '/T', '/PID', str(self.process.pid)])
                else:
                    self.process.terminate()
            thread.join()
            # exitcode 124 is used by the timeout binary for indicating the
            # timeout was reached, so we use it here as well
            self.exitcode = 124
        elif self.process is None:
            # the command never started; 3 is the code the agent uses for
            # UNKNOWN results
            self.exitcode = 3
        else:
            self.exitcode = self.process.returncode

    def get_exitcode(self):
        """Return the exit code recorded by run() (-1 before run())."""
        return self.exitcode

    def get_stdout(self):
        """Return the standard output captured by run() as bytes."""
        return self.stdout


class MonitoringAgent(object):
    def __init__(self):
        """Create an agent with empty config/plugin maps and log a startup banner.

        The banner is always logged, regardless of the active log level; on
        Linux it also mentions the PID of the current process.
        """
        self.config = {}
        self.plugins = {}

        banner = ["%s version %s starting up, supported protocol v%s" % (DAEMON_NAME, VERSION, AGENT_VERSION)]
        if platform.system() == "Linux":
            banner.append(" with PID %s" % os.getpid())
        log("".join(banner), LOG_INFO, always=True)

    def read_config(self, fname=None):
        """Load and validate the [main] section of the agent config file.

        fname -- path of the config file; when None the path remembered from
                 a previous call (self.config["configfile"]) is used

        Every option of [main] is copied into self.config as a string, after
        which defaults are filled in and values are validated. Any fatal
        problem is logged at error level and terminates the process through
        log(..., do_exit=True). Also updates the global LOG_LEVEL when a
        valid 'loglevel' option is present.
        """
        global LOG_LEVEL

        if fname is None:
            fname = self.config.get("configfile", "")

        if fname == "":
            log("Don't know which configfile to read.", LOG_ERROR, do_exit=True)

        log("Reading config file '%s'" % fname, LOG_INFO)

        cfg_parser = configparser.ConfigParser()
        try:
            result = cfg_parser.read(fname)
        except configparser.Error as e:
            # e.g. duplicate sections/options; make sure this reaches the log
            log("Unable to parse configuration from %s: %s" % (fname, e), LOG_ERROR, do_exit=True)

        if fname not in result:
            log("Unable to load configuration from %s. Does file exist? Permissions OK?" %
                fname, LOG_ERROR, do_exit=True)

        if not cfg_parser.has_section('main'):
            log("No section 'main' found in config file '%s', quitting!" % (fname), LOG_ERROR, do_exit=True)

        # by default save everything in the config map
        try:
            for option in cfg_parser.options('main'):
                self.config[option] = cfg_parser.get('main', option)
        except configparser.Error as e:
            # e.g. a value containing a bare '%' breaks interpolation
            log("Invalid value in config file '%s': %s" % (fname, e), LOG_ERROR, do_exit=True)

        # remember the file so a later read_config() without arguments works
        self.config["configfile"] = fname

        # check if monitoring is enabled in config (a missing option counts
        # as not enabled)
        if self.config.get('monitoring_enabled', '') not in ['1', 'True', 'true', 'yes']:
            log("Monitoring is not enabled, refusing to start", LOG_ERROR, do_exit=True)

        # default to the hostname if no hostname is specified
        if self.config.get("reporting_hostname", "") == "":
            hostname = socket.gethostname()
            try:
                canonical = socket.getaddrinfo(hostname, 0, flags=socket.AI_CANONNAME)[0][3]
            except (OSError, IndexError):
                # the local hostname does not resolve; use it as is
                canonical = ""
            self.config["reporting_hostname"] = canonical or hostname
            log("no 'reporting_hostname' found in config, defaulting to hostname '%s'" %
                self.config["reporting_hostname"], LOG_INFO)

        # default to the reporting_hostname if no identity is specified
        if self.config.get("identity", "") == "":
            self.config["identity"] = self.config["reporting_hostname"]
            log("no 'identity' found in config, defaulting to reporting_hostname '%s'" %
                self.config["reporting_hostname"], LOG_INFO)

        # both reporting intervals are required and must be whole seconds of
        # at least MIN_INTERVAL; a missing option fails the same check
        for option, label in (("normal_report_interval", "Normal"),
                              ("alarm_report_interval", "Alarm")):
            value = self.config.get(option, "")
            if not value.isdigit() or int(value) < MIN_INTERVAL:
                log("%s reporting interval '%s' does not make any sense. " %
                    (label, value), LOG_ERROR, do_exit=True)

        loglevel = self.config.get("loglevel", "")
        if loglevel != "":
            known_levels = {
                "INFO": LOG_INFO,
                "WARNING": LOG_WARNING,
                "ERROR": LOG_ERROR,
            }
            if loglevel.upper() in known_levels:
                LOG_LEVEL = known_levels[loglevel.upper()]
            else:
                log("Unknown loglevel '%s', ignoring it." % loglevel, LOG_WARNING)

        # check for a non-default plugin timeout. Decide on what the file
        # contains, not on self.config: after a previous load self.config
        # already holds the timeout as an int, which has no isdigit().
        if not cfg_parser.has_option('main', 'plugin_timeout'):
            self.config["plugin_timeout"] = PLUG_TIMEOUT
        else:
            raw_timeout = str(self.config["plugin_timeout"])
            if not raw_timeout.isdigit() or not 1 <= int(raw_timeout) <= 60:
                log("Default plugin timeout '%s' does not make any sense." %
                    raw_timeout, LOG_ERROR, do_exit=True)
            self.config["plugin_timeout"] = int(raw_timeout)

        # check if a default namespace for plugins is specified
        if len(self.config.get("default_namespace", "")) > 0:
            if platform.system() == "Windows":
                log("Warning, 'default_namespace' configured but not supported on Windows.", LOG_WARNING)
            elif os.geteuid() != 0:
                log("Default namespace '%s' is configured, but not running as root, can't switch." %
                    self.config["default_namespace"], LOG_ERROR, do_exit=True)


    def set_pluginfile(self, fname):
        """Store `fname` in the config map under the "pluginfile" key."""
        self.config.update(pluginfile=fname)

    def read_plugins(self):
        """Load the plugin definitions from self.config["pluginfile"].

        Every section of the file describes one plugin. The result is stored
        in self.plugins as {section: {"cmd": str, "ns": str, "timeout": int}}.
        Sections without a 'command' option are skipped; a plugin timeout
        outside 1..60 seconds is replaced by the agent wide default. Fatal
        problems are logged at error level and terminate the process.
        """
        pluginfile = self.config["pluginfile"]
        log("Reading plugins file '%s'" % (pluginfile), LOG_INFO)

        plugins = {}

        plug_parser = configparser.RawConfigParser()
        try:
            result = plug_parser.read(pluginfile)
        except configparser.Error as e:
            # e.g. duplicate sections; make sure this reaches the log
            log("Can't parse plugins file '%s': %s" % (pluginfile, e), LOG_ERROR, do_exit=True)

        if pluginfile not in result:
            log("Can't read plugins file '%s'. Does file exist? Permissions OK?" % pluginfile,
                LOG_ERROR, do_exit=True)

        # each plugin should be a separate section in the plugin config file
        for plugin in plug_parser.sections():

            # command to run is required
            if not plug_parser.has_option(plugin, "command"):
                log("Plugin '%s' has no 'command' entry, skipping it!" % plugin, LOG_WARNING)
                continue

            entry = {"cmd": plug_parser.get(plugin, "command")}

            # possible overwrite of the namespace for the plugin
            namespace = ""
            if plug_parser.has_option(plugin, "namespace"):
                namespace = plug_parser.get(plugin, "namespace")

            if len(namespace) > 0:
                if platform.system() == "Windows":
                    log("No namespace support on Windows but requested for plugin '%s', ignoring it." % plugin,
                        LOG_WARNING)
                    entry["ns"] = ""
                elif os.geteuid() != 0:
                    log("Namespace specified but not running as root, cannot change it.",
                        LOG_ERROR, do_exit=True)
                else:
                    entry["ns"] = namespace
            else:
                # NOTE(review): this reads the "namespace" config key, while
                # the rest of the agent uses "default_namespace" - confirm
                # which one is intended.
                entry["ns"] = self.config.get("namespace", "")

            # possible overwrite for the plugin timeout; same 1..60 second
            # range as the agent wide default
            entry["timeout"] = self.config["plugin_timeout"]
            if plug_parser.has_option(plugin, "timeout"):
                raw_timeout = plug_parser.get(plugin, "timeout")
                if raw_timeout.isdigit() and 1 <= int(raw_timeout) <= 60:
                    entry["timeout"] = int(raw_timeout)
                else:
                    log("Plugin '%s' has a timeout which does not make any sense: '%s', using default of %s seconds" % (
                        plugin, raw_timeout, entry["timeout"]), LOG_WARNING)

            plugins[plugin] = entry

        if len(plugins) == 0:
            log("No plugins configured, check your config.", LOG_ERROR, do_exit=True)

        log("%d plugin definitions found." % len(plugins), LOG_INFO)
        self.plugins = plugins


    def run(self, noloop=False):
        """Run all plugins, submit their results and sleep; repeat forever.

        Each pass runs every configured plugin through Command, submits the
        result with submit_result() and finally submits an AGENT_STATUS
        result. The process exits after the first pass when
        self.config["once"] is set.

        noloop -- NOTE(review): accepted but never read; the loop is only
                  left through the "once" setting.
        """
        while True:
            log("Starting normal run.", LOG_INFO)

            alarms = 0
            plug_alarms = 0
            starttime = time.time()

            # work on a snapshot: a config reload may rebind self.plugins
            # while this pass is still running
            plugins = self.plugins

            # check each plugin
            for plugin, settings in plugins.items():
                log("Checking plugin '%s' with command '%s'" %
                    (plugin, settings["cmd"]), LOG_INFO)

                cmd = Command(settings, self.config)
                cmd.run()

                if cmd.get_exitcode() == 124:
                    # exitcode 124 is used by the 'timeout' binary to indicate the process was killed
                    log("Plugin '%s' didn't finish within %d seconds, killed it." %
                        (plugin, settings["timeout"]), LOG_WARNING)
                    plug_result = "UNKNOWN: Plugin did not return data after %d seconds" % \
                        settings["timeout"]
                    plug_exit = 3
                    plug_alarms += 1
                else:
                    plug_exit = cmd.get_exitcode()

                    # always work with text from here on; undecodable bytes
                    # in plugin output must not crash the agent
                    output = cmd.get_stdout()
                    if isinstance(output, bytes):
                        plug_result = output.decode("utf-8", errors="replace")
                    else:
                        plug_result = str(output)

                    if plug_exit == 0:
                        log("Plugin '%s' finished successfully, result: '%s'" %
                            (plugin, plug_result.rstrip()), LOG_INFO)
                    else:
                        plug_alarms += 1
                        if plug_exit == 127:
                            log("Plugin '%s' command '%s' not found or not executable." %
                                (plugin, settings["cmd"]), LOG_WARNING)
                            plug_result = "Plugin not found or not executable."
                            plug_exit = 3
                        else:
                            log("Plugin '%s' exit code '%d' result: '%s'" %
                                (plugin, plug_exit, plug_result), LOG_WARNING)

                log("Submitting check results for '%s' to '%s'" % (
                    plugin,
                    self.config["destination_server"].replace('"', '')
                ), LOG_INFO)
                alarms += self.submit_result(
                    self.config,
                    "Service",
                    plugin.upper(),
                    plug_exit,
                    "AGENT %s" % plug_result
                )

            log("Completed normal run - submitting to '%s'" %
                self.config["destination_server"].replace('"', ''), LOG_INFO)

            # submit agent status; at this point `alarms` only counts failed
            # submissions
            self.submit_result(
                self.config,
                "Service",
                "AGENT_STATUS",
                1 if alarms > 0 else 0,
                "AGENT has %d alarms and running version %s" % (
                    alarms, AGENT_VERSION)
            )

            # determine how long to sleep: the alarm interval when anything
            # went wrong, otherwise the normal interval minus the time spent
            alarms += plug_alarms
            if alarms > 0:
                sleep_seconds = int(self.config["alarm_report_interval"])
            else:
                sleep_seconds = int(
                    self.config["normal_report_interval"]) - int(time.time() - starttime)
            sleep_seconds = max(sleep_seconds, 1)

            if alarms > 0:
                log("%d alarms active!" % alarms, LOG_WARNING)

            if self.config.get("once", False):
                log("Requested to run only once. We're done!", LOG_ERROR)
                sys.exit(0)

            log("Plugins done in %d seconds, sleeping for %d seconds" %
                (time.time() - starttime, sleep_seconds))

            time.sleep(sleep_seconds)


    def submit_result(self, config, checktype, service, plug_exit, plug_result):
        """POST one check result to the monitoring server.

        config      -- config map; only its optional "proxy" entry is read
                       here, everything else comes from self.config
        checktype   -- value for the "type" field of the request
        service     -- service name used in the request's filter expression
        plug_exit   -- exit status to report
        plug_result -- plugin output to report

        Returns 0 when the server answered with HTTP 200, otherwise 1. Any
        request failure (connection error, timeout, ...) is logged and
        reported as 1 instead of raising.
        """
        destination = self.config["destination_server"].replace('"', '')

        data = {
            "type":             "%s" % checktype,
            "exit_status":      str(plug_exit),
            "plugin_output":    plug_result,
            "filter":           "host.name==\"%s\" && service.name==\"%s\"" % (self.config["reporting_hostname"].replace('"', ''), service),
        }

        headers = {
            'Accept': 'application/json',
            'X-HTTP-Method-Override': 'POST'
        }

        proxies = {}
        proxy = config.get("proxy", "")
        if proxy:
            proxies["http"] = "http://%s" % proxy
            proxies["https"] = "https://%s" % proxy

        # set connection string
        request_url = "https://%s/v1/actions/process-check-result" % destination

        try:
            req = requests.post(
                request_url,
                headers=headers,
                data=json.dumps(data),
                auth=(self.config["identity"].replace('"', ''),
                      self.config["password"].replace('"', '')),
                timeout=HTTP_TIMEOUT,
                proxies=proxies,
                # the destination has no valid certificate
                verify=False,
            )
        except requests.exceptions.RequestException as e:
            # RequestException also covers read timeouts, which are not a
            # ConnectionError and would otherwise terminate the agent
            log("Post to '%s' failed, error: %s" % (destination, e), LOG_WARNING)
            return 1

        if req.status_code == 500:
            log("Post to '%s' returned error code %d, plugin check may not be found" % (destination, req.status_code),
                LOG_INFO)
            return 1

        if req.status_code != 200:
            log("Post to '%s' returned error code %d with reason: ''%s'" % (destination, req.status_code, req.reason),
                LOG_INFO)
            return 1

        log("Succesful post, result: Successfully processed check result for object '%s'" %
            service, LOG_INFO)
        return 0

    def check_pid(self):
        try:
            fh = open(self.config["pidfile"], "r")
            self.pid = fh.readline().strip()
            fh.close()
            if self.pid.isdigit():
                try:
                    # check if the process with given PID is alive, abort if it is
                    os.kill(int(self.pid), 0)
                    log("%s is already running as process '%s'" %
                        (DAEMON_NAME, self.pid), LOG_ERROR, do_exit=True)
                except OSError:
                    self.rm_pid()
        except OSError:
            return

        except IOError:
            return

    def write_pid(self):
        """Write the PID of this process to self.config["pidfile"].

        On success the PID is also stored in self.pid. When the file cannot
        be written the reason is logged at error level and the agent exits.
        """
        pidfile = self.config["pidfile"]
        pid = os.getpid()

        try:
            with open(pidfile, "w") as fh:
                fh.write("%d\n" % pid)
        except OSError as e:
            log("Failed to write current PID to '%s': %s" %
                (pidfile, e), LOG_ERROR, do_exit=True)

        self.pid = pid
        log("Wrote PID %d to '%s'" % (pid, pidfile), LOG_INFO)

    def rm_pid(self):
        """Delete the PID file named by self.config["pidfile"].

        Success is logged at info level together with self.pid. Any failure,
        including one while logging the success message, is logged at error
        level and makes the agent exit.
        """
        try:
            os.remove(self.config["pidfile"])
            removed_msg = "Removed old PID file '%s' for PID %s" % (
                self.config["pidfile"], self.pid)
            log(removed_msg, LOG_INFO)
        except Exception as err:
            failure_msg = "Failed to remove old PID file '%s': %s" % (
                self.config["pidfile"], err)
            log(failure_msg, LOG_ERROR, do_exit=True)

    def sigint(self):
        """Handle an interrupt: log it, remove the PID file and exit.

        The message is logged before the cleanup because rm_pid() exits the
        process itself when the PID file cannot be removed. There is no log
        handle to close here: log() writes through syslog, the logging
        module or a file it opens per message.
        """
        log("SIGINT received, exiting.", LOG_ERROR)

        # no PID file is used on Windows
        if platform.system() != "Windows":
            self.rm_pid()

        sys.exit(0)

###############################
#            main
###############################
def main():
    """Parse the command line, set up the agent and enter its main loop.

    On non-Windows platforms a PID file is checked and written, SIGINT
    makes the agent clean up and exit, and SIGHUP reloads both config
    files. The -v/--verbose flag forces the INFO log level and is
    re-applied after every config (re)load so a 'loglevel' option in the
    config file cannot silently undo it.
    """
    global LOG_LEVEL

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-a", "--agent-file",
        action="store",
        dest="agentfile",
        help="agent config filename",
        default=FILE_CFG,
    )

    # pidfile location cannot be specified on Windows
    if platform.system() != "Windows":
        parser.add_argument(
            "-i", "--pid-file",
            action="store",
            dest="pidfile",
            help="PID file",
            default=FILE_PID,
        )

    parser.add_argument(
        "-o", "--once",
        action="store_true",
        dest="once",
        help="test all plugins once and then stop the agent",
        default=False,
    )

    parser.add_argument(
        "-p", "--plugin-file",
        action="store",
        dest="pluginfile",
        help="plugin config filename",
        default=FILE_PLUG,
    )

    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        dest="verbose",
        help="be more verbose",
        default=False,
    )

    parser.add_argument(
        "-V", "--version",
        action="store_true",
        dest="version",
        help="print version number and exit gracefully",
        default=False,
    )

    ns = parser.parse_args()

    if ns.version:
        print("%s v%s" % (DAEMON_NAME, VERSION))
        sys.exit(0)

    if ns.verbose:
        LOG_LEVEL = LOG_INFO
        log("Verbose logging enabled.", LOG_INFO)

    # initialize the agent and read configuration files
    agent = MonitoringAgent()
    agent.read_config(ns.agentfile)
    if ns.verbose:
        # the command line wins over a 'loglevel' from the config file
        LOG_LEVEL = LOG_INFO
    agent.set_pluginfile(ns.pluginfile)
    agent.config['once'] = ns.once
    agent.read_plugins()

    if platform.system() != "Windows":
        agent.config['pidfile'] = ns.pidfile
        agent.check_pid()
        agent.write_pid()

    def sigint_handler(signum, frame):
        agent.sigint()
        sys.exit(0)

    def sighup_handler(signum, frame):
        global LOG_LEVEL
        agent.read_config(ns.agentfile)
        if ns.verbose:
            LOG_LEVEL = LOG_INFO
        agent.read_plugins()

    if platform.system() != "Windows":
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGHUP, sighup_handler)

    # main run
    agent.run()

# start the agent only when this file is executed as a script
if __name__ == "__main__":
    main()
