diff --git a/.gitignore b/.gitignore index f3396cc..09be2a2 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ share/ *.egg-info MANIFEST dist/ +.idea diff --git a/cachet_url_monitor/configuration.py b/cachet_url_monitor/configuration.py index 54a571a..ee6f008 100644 --- a/cachet_url_monitor/configuration.py +++ b/cachet_url_monitor/configuration.py @@ -1,22 +1,23 @@ #!/usr/bin/env python import abc +import cachet_url_monitor.status import logging import re import requests import time from yaml import load - # This is the mandatory fields that must be in the configuration file in this # same exact structure. configuration_mandatory_fields = { - 'endpoint': ['url', 'method', 'timeout', 'expectation'], - 'cachet': ['api_url', 'token', 'component_id'], - 'frequency': []} + 'endpoint': ['url', 'method', 'timeout', 'expectation'], + 'cachet': ['api_url', 'token', 'component_id'], + 'frequency': []} class ConfigurationValidationError(Exception): """Exception raised when there's a validation error.""" + def __init__(self, value): self.value = value @@ -24,28 +25,62 @@ class ConfigurationValidationError(Exception): return repr(self.value) +class ComponentNonexistentError(Exception): + """Exception raised when the component does not exist.""" + + def __init__(self, component_id): + self.component_id = component_id + + def __str__(self): + return repr('Component with id [%d] does not exist.' % (self.component_id,)) + + class Configuration(object): """Represents a configuration file, but it also includes the functionality of assessing the API and pushing the results to cachet. """ + def __init__(self, config_file): - #TODO(mtakaki#1|2016-04-28): Accept overriding settings using environment + # TODO(mtakaki#1|2016-04-28): Accept overriding settings using environment # variables so we have a more docker-friendly approach. self.logger = logging.getLogger('cachet_url_monitor.configuration.Configuration') self.config_file = config_file self.data = load(file(self.config_file, 'r')) + # We need to validate the configuration is correct and then validate the component actually exists. self.validate() + self.headers = {'X-Cachet-Token': self.data['cachet']['token']} + self.status = self.get_current_status(self.data['cachet']['component_id']) self.logger.info('Monitoring URL: %s %s' % - (self.data['endpoint']['method'], self.data['endpoint']['url'])) + (self.data['endpoint']['method'], self.data['endpoint']['url'])) self.expectations = [Expectaction.create(expectation) for expectation - in self.data['endpoint']['expectation']] + in self.data['endpoint']['expectation']] for expectation in self.expectations: self.logger.info('Registered expectation: %s' % (expectation,)) - self.headers = {'X-Cachet-Token': self.data['cachet']['token']} + + def get_current_status(self, component_id): + get_status_request = requests.get( + '%s/components/%d' % (self.data['cachet']['api_url'], self.data['cachet']['component_id']), + headers=self.headers) + + if get_status_request.ok: + # The component exists. + return get_status_request.json()['data']['status'] + else: + raise ComponentNonexistentError(component_id) + + def is_create_incident(self): + """Will verify if the configuration is set to create incidents or not. + :return True if the configuration is set to create incidents or False it otherwise. + """ + return 'create_incident' in self.data['cachet'] and self.data['cachet']['create_incident'] def validate(self): + """Validates the configuration by verifying the mandatory fields are + present and in the correct format. If the validation fails, a + ConfigurationValidationError is raised. Otherwise nothing will happen. + """ configuration_errors = [] for key, sub_entries in configuration_mandatory_fields.iteritems(): if key not in self.data: @@ -56,99 +91,150 @@ class Configuration(object): configuration_errors.append('%s.%s' % (key, sub_key)) if ('endpoint' in self.data and 'expectation' in - self.data['endpoint']): + self.data['endpoint']): if (not isinstance(self.data['endpoint']['expectation'], list) or - (isinstance(self.data['endpoint']['expectation'], list) and - len(self.data['endpoint']['expectation']) == 0)): + (isinstance(self.data['endpoint']['expectation'], list) and + len(self.data['endpoint']['expectation']) == 0)): configuration_errors.append('endpoint.expectation') if len(configuration_errors) > 0: - raise ConfigurationValidationError(('Config file [%s] failed ' - 'validation. Missing keys: %s') % (self.config_file, - ', '.join(configuration_errors))) + raise ConfigurationValidationError( + 'Config file [%s] failed validation. Missing keys: %s' % (self.config_file, + ', '.join(configuration_errors))) def evaluate(self): """Sends the request to the URL set in the configuration and executes each one of the expectations, one by one. The status will be updated according to the expectation results. """ + if hasattr(self, 'status'): + # Keeping track of the previous status. + self.previous_status = self.status + try: self.request = requests.request(self.data['endpoint']['method'], - self.data['endpoint']['url'], - timeout=self.data['endpoint']['timeout']) + self.data['endpoint']['url'], + timeout=self.data['endpoint']['timeout']) self.current_timestamp = int(time.time()) except requests.ConnectionError: - self.logger.warning('The URL is unreachable: %s %s' % - (self.data['endpoint']['method'], - self.data['endpoint']['url'])) - self.status = 3 + self.message = 'The URL is unreachable: %s %s' % ( + self.data['endpoint']['method'], self.data['endpoint']['url']) + self.logger.warning(self.message) + self.status = cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE return except requests.HTTPError: - self.logger.exception('Unexpected HTTP response') - self.status = 3 + self.message = 'Unexpected HTTP response' + self.logger.exception(self.message) + self.status = cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE return except requests.Timeout: - self.logger.warning('Request timed out') - self.status = 3 + self.message = 'Request timed out' + self.logger.warning(self.message) + self.status = cachet_url_monitor.status.COMPONENT_STATUS_PERFORMANCE_ISSUES return # We initially assume the API is healthy. - self.status = 1 + self.status = cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL + self.message = '' for expectation in self.expectations: status = expectation.get_status(self.request) # The greater the status is, the worse the state of the API is. if status > self.status: self.status = status + self.message = expectation.get_message(self.request) def push_status(self): + """Pushes the status of the component to the cachet server. It will update the component + status based on the previous call to evaluate(). + """ params = {'id': self.data['cachet']['component_id'], 'status': - self.status} + self.status} component_request = requests.put('%s/components/%d' % - (self.data['cachet']['api_url'], - self.data['cachet']['component_id']), - params=params, headers=self.headers) + (self.data['cachet']['api_url'], + self.data['cachet']['component_id']), + params=params, headers=self.headers) if component_request.ok: # Successful update self.logger.info('Component update: status [%d]' % (self.status,)) else: # Failed to update the API status self.logger.warning('Component update failed with status [%d]: API' - ' status: [%d]' % (component_request.status_code, self.status)) + ' status: [%d]' % (component_request.status_code, self.status)) def push_metrics(self): + """Pushes the total amount of seconds the request took to get a response from the URL. + It only will send a request if the metric id was set in the configuration. + """ if 'metric_id' in self.data['cachet'] and hasattr(self, 'request'): params = {'id': self.data['cachet']['metric_id'], 'value': - self.request.elapsed.total_seconds(), 'timestamp': - self.current_timestamp} + self.request.elapsed.total_seconds(), 'timestamp': + self.current_timestamp} metrics_request = requests.post('%s/metrics/%d/points' % - (self.data['cachet']['api_url'], - self.data['cachet']['metric_id']), params=params, - headers=self.headers) + (self.data['cachet']['api_url'], + self.data['cachet']['metric_id']), params=params, + headers=self.headers) if metrics_request.ok: # Successful metrics upload self.logger.info('Metric uploaded: %.6f seconds' % - (self.request.elapsed.total_seconds(),)) + (self.request.elapsed.total_seconds(),)) else: self.logger.warning('Metric upload failed with status [%d]' % - (metrics_request.status_code,)) + (metrics_request.status_code,)) + + def push_incident(self): + if hasattr(self, 'incident_id') and self.status == 1: + # If the incident already exists, it means it's unhealthy. We only update it when it becomes healthy again. + params = {'status': 4, 'visible': 1, 'component_id': self.data['cachet']['component_id'], + 'component_status': self.status, 'notify': True} + + incident_request = requests.put('%s/incidents/%d' % (self.data['cachet']['api_url'], self.incident_id), + params=params, headers=self.headers) + if incident_request.ok: + # Successful metrics upload + self.logger.info( + 'Incident updated, API healthy again: component status [%d], message: "%s"' % ( + self.status, self.message)) + del self.incident_id + else: + self.logger.warning( + 'Incident update failed with status [%d], message: "%s"' % ( + incident_request.status_code, self.message)) + elif not hasattr(self, 'incident_id') and self.status != 1: + # This is the first time the incident is being created. + params = {'name': 'URL unavailable', 'message': self.message, 'status': 1, 'visible': 1, + 'component_id': self.data['cachet']['component_id'], 'component_status': self.status, + 'notify': True} + incident_request = requests.post('%s/incidents' % (self.data['cachet']['api_url'],), params=params, + headers=self.headers) + if incident_request.ok: + # Successful incident upload. + self.incident_id = incident_request.json()['data']['id'] + self.logger.info( + 'Incident uploaded, API unhealthy: component status [%d], message: "%s"' % ( + self.status, self.message)) + else: + self.logger.warning( + 'Incident upload failed with status [%d], message: "%s"' % ( + incident_request.status_code, self.message)) class Expectaction(object): """Base class for URL result expectations. Any new excpectation should extend this class and the name added to create() method. """ + @staticmethod def create(configuration): """Creates a list of expectations based on the configuration types list. """ expectations = { - 'HTTP_STATUS': HttpStatus, - 'LATENCY': Latency, - 'REGEX': Regex - } + 'HTTP_STATUS': HttpStatus, + 'LATENCY': Latency, + 'REGEX': Regex + } return expectations.get(configuration['type'])(configuration) @abc.abstractmethod @@ -168,9 +254,9 @@ class HttpStatus(Expectaction): def get_status(self, response): if response.status_code == self.status: - return 1 + return cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL else: - return 3 + return cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE def get_message(self, response): return 'Unexpected HTTP status (%s)' % (response.status_code,) @@ -185,27 +271,27 @@ class Latency(Expectaction): def get_status(self, response): if response.elapsed.total_seconds() <= self.threshold: - return 1 + return cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL else: - return 2 + return cachet_url_monitor.status.COMPONENT_STATUS_PERFORMANCE_ISSUES def get_message(self, response): - return 'Latency above threshold: %.4f' % (response.elapsed.total_seconds(),) + return 'Latency above threshold: %.4f seconds' % (response.elapsed.total_seconds(),) def __str__(self): - return repr('Latency threshold: %.4f' % (self.threshold,)) + return repr('Latency threshold: %.4f seconds' % (self.threshold,)) class Regex(Expectaction): def __init__(self, configuration): self.regex_string = configuration['regex'] - self.regex = re.compile(configuration['regex']) + self.regex = re.compile(configuration['regex'], re.UNICODE + re.DOTALL) def get_status(self, response): if self.regex.match(response.text): - return 1 + return cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL else: - return 3 + return cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE def get_message(self, response): return 'Regex did not match anything in the body' diff --git a/cachet_url_monitor/scheduler.py b/cachet_url_monitor/scheduler.py index 7fae92c..37c6bb1 100644 --- a/cachet_url_monitor/scheduler.py +++ b/cachet_url_monitor/scheduler.py @@ -10,6 +10,7 @@ class Agent(object): """Monitor agent that will be constantly verifying if the URL is healthy and updating the component. """ + def __init__(self, configuration): self.configuration = configuration @@ -18,7 +19,6 @@ class Agent(object): cachet server. """ self.configuration.evaluate() - self.configuration.push_status() self.configuration.push_metrics() def start(self): @@ -26,11 +26,34 @@ class Agent(object): schedule.every(self.configuration.data['frequency']).seconds.do(self.execute) +class UpdateStatusAgent(Agent): + def __init__(self, configuration): + super(UpdateStatusAgent, self).__init__(configuration) + + def execute(self): + super(UpdateStatusAgent, self).execute() + self.configuration.push_status() + + +class CreateIncidentAgent(Agent): + def __init__(self, configuration): + super(CreateIncidentAgent, self).__init__(configuration) + + def execute(self): + super(CreateIncidentAgent, self).execute() + self.configuration.push_incident() + + class Scheduler(object): def __init__(self, config_file): self.logger = logging.getLogger('cachet_url_monitor.scheduler.Scheduler') self.configuration = Configuration(config_file) - self.agent = Agent(self.configuration) + + if self.configuration.is_create_incident(): + self.agent = CreateIncidentAgent(self.configuration) + else: + self.agent = UpdateStatusAgent(self.configuration) + self.stop = False def start(self): diff --git a/cachet_url_monitor/status.py b/cachet_url_monitor/status.py new file mode 100644 index 0000000..66564de --- /dev/null +++ b/cachet_url_monitor/status.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python + +COMPONENT_STATUS_OPERATIONAL = 1 +COMPONENT_STATUS_PERFORMANCE_ISSUES = 2 +COMPONENT_STATUS_PARTIAL_OUTAGE = 3 +COMPONENT_STATUS_MAJOR_OUTAGE = 4 + +COMPONENT_STATUSES = [COMPONENT_STATUS_OPERATIONAL, + COMPONENT_STATUS_PERFORMANCE_ISSUES, COMPONENT_STATUS_PARTIAL_OUTAGE, + COMPONENT_STATUS_MAJOR_OUTAGE] diff --git a/config.yml b/config.yml index 537d4d2..5ba9817 100644 --- a/config.yml +++ b/config.yml @@ -8,10 +8,11 @@ endpoint: - type: LATENCY threshold: 1 - type: REGEX - regex: '.*.*' + regex: '.*(