Initial attempt at creating incidents when a URL becomes unhealthy. It is not yet called from the scheduler. #3

This commit is contained in:
Mitsuo Takaki
2016-05-16 01:31:53 -07:00
parent 9c8c89c1dd
commit 0f53ff8678
6 changed files with 129 additions and 53 deletions

1
.gitignore vendored
View File

@@ -10,3 +10,4 @@ share/
*.egg-info *.egg-info
MANIFEST MANIFEST
dist/ dist/
.idea

View File

@@ -1,22 +1,23 @@
#!/usr/bin/env python #!/usr/bin/env python
import abc import abc
import cachet_url_monitor.status
import logging import logging
import re import re
import requests import requests
import time import time
from yaml import load from yaml import load
# This is the mandatory fields that must be in the configuration file in this # This is the mandatory fields that must be in the configuration file in this
# same exact structure. # same exact structure.
configuration_mandatory_fields = { configuration_mandatory_fields = {
'endpoint': ['url', 'method', 'timeout', 'expectation'], 'endpoint': ['url', 'method', 'timeout', 'expectation'],
'cachet': ['api_url', 'token', 'component_id'], 'cachet': ['api_url', 'token', 'component_id'],
'frequency': []} 'frequency': []}
class ConfigurationValidationError(Exception): class ConfigurationValidationError(Exception):
"""Exception raised when there's a validation error.""" """Exception raised when there's a validation error."""
def __init__(self, value): def __init__(self, value):
self.value = value self.value = value
@@ -28,8 +29,9 @@ class Configuration(object):
"""Represents a configuration file, but it also includes the functionality """Represents a configuration file, but it also includes the functionality
of assessing the API and pushing the results to cachet. of assessing the API and pushing the results to cachet.
""" """
def __init__(self, config_file): def __init__(self, config_file):
#TODO(mtakaki#1|2016-04-28): Accept overriding settings using environment # TODO(mtakaki#1|2016-04-28): Accept overriding settings using environment
# variables so we have a more docker-friendly approach. # variables so we have a more docker-friendly approach.
self.logger = logging.getLogger('cachet_url_monitor.configuration.Configuration') self.logger = logging.getLogger('cachet_url_monitor.configuration.Configuration')
self.config_file = config_file self.config_file = config_file
@@ -38,14 +40,25 @@ class Configuration(object):
self.validate() self.validate()
self.logger.info('Monitoring URL: %s %s' % self.logger.info('Monitoring URL: %s %s' %
(self.data['endpoint']['method'], self.data['endpoint']['url'])) (self.data['endpoint']['method'], self.data['endpoint']['url']))
self.expectations = [Expectaction.create(expectation) for expectation self.expectations = [Expectaction.create(expectation) for expectation
in self.data['endpoint']['expectation']] in self.data['endpoint']['expectation']]
for expectation in self.expectations: for expectation in self.expectations:
self.logger.info('Registered expectation: %s' % (expectation,)) self.logger.info('Registered expectation: %s' % (expectation,))
self.headers = {'X-Cachet-Token': self.data['cachet']['token']} self.headers = {'X-Cachet-Token': self.data['cachet']['token']}
def is_create_incident(self):
    """Verifies whether the configuration is set to create incidents.

    The flag is read from the optional 'create_incident' key under the 'cachet'
    section of the configuration data.

    :return: True if the key is present and holds a truthy value, False otherwise.
    """
    cachet_settings = self.data['cachet']
    # Coerce to bool so callers always get True/False, even when the configured
    # value is something else (e.g. an empty value or a string).
    return 'create_incident' in cachet_settings and bool(cachet_settings['create_incident'])
def validate(self): def validate(self):
"""Validates the configuration by verifying the mandatory fields are
present and in the correct format. If the validation fails, a
ConfigurationValidationError is raised. Otherwise nothing will happen.
"""
configuration_errors = [] configuration_errors = []
for key, sub_entries in configuration_mandatory_fields.iteritems(): for key, sub_entries in configuration_mandatory_fields.iteritems():
if key not in self.data: if key not in self.data:
@@ -56,99 +69,149 @@ class Configuration(object):
configuration_errors.append('%s.%s' % (key, sub_key)) configuration_errors.append('%s.%s' % (key, sub_key))
if ('endpoint' in self.data and 'expectation' in if ('endpoint' in self.data and 'expectation' in
self.data['endpoint']): self.data['endpoint']):
if (not isinstance(self.data['endpoint']['expectation'], list) or if (not isinstance(self.data['endpoint']['expectation'], list) or
(isinstance(self.data['endpoint']['expectation'], list) and (isinstance(self.data['endpoint']['expectation'], list) and
len(self.data['endpoint']['expectation']) == 0)): len(self.data['endpoint']['expectation']) == 0)):
configuration_errors.append('endpoint.expectation') configuration_errors.append('endpoint.expectation')
if len(configuration_errors) > 0: if len(configuration_errors) > 0:
raise ConfigurationValidationError(('Config file [%s] failed ' raise ConfigurationValidationError(
'validation. Missing keys: %s') % (self.config_file, 'Config file [%s] failed validation. Missing keys: %s' % (self.config_file,
', '.join(configuration_errors))) ', '.join(configuration_errors)))
def evaluate(self): def evaluate(self):
"""Sends the request to the URL set in the configuration and executes """Sends the request to the URL set in the configuration and executes
each one of the expectations, one by one. The status will be updated each one of the expectations, one by one. The status will be updated
according to the expectation results. according to the expectation results.
""" """
if hasattr(self, 'status'):
# Keeping track of the previous status.
self.previous_status = self.status
try: try:
self.request = requests.request(self.data['endpoint']['method'], self.request = requests.request(self.data['endpoint']['method'],
self.data['endpoint']['url'], self.data['endpoint']['url'],
timeout=self.data['endpoint']['timeout']) timeout=self.data['endpoint']['timeout'])
self.current_timestamp = int(time.time()) self.current_timestamp = int(time.time())
except requests.ConnectionError: except requests.ConnectionError:
self.logger.warning('The URL is unreachable: %s %s' % self.message = 'The URL is unreachable: %s %s' % (
(self.data['endpoint']['method'], self.data['endpoint']['method'], self.data['endpoint']['url'])
self.data['endpoint']['url'])) self.logger.warning(self.message)
self.status = 3 self.status = cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
return return
except requests.HTTPError: except requests.HTTPError:
self.logger.exception('Unexpected HTTP response') self.message = 'Unexpected HTTP response'
self.status = 3 self.logger.exception(self.message)
self.status = cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
return return
except requests.Timeout: except requests.Timeout:
self.logger.warning('Request timed out') self.message = 'Request timed out'
self.status = 3 self.logger.warning(self.message)
self.status = cachet_url_monitor.status.COMPONENT_STATUS_PERFORMANCE_ISSUES
return return
# We initially assume the API is healthy. # We initially assume the API is healthy.
self.status = 1 self.status = cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
self.message = ''
for expectation in self.expectations: for expectation in self.expectations:
status = expectation.get_status(self.request) status = expectation.get_status(self.request)
# The greater the status is, the worse the state of the API is. # The greater the status is, the worse the state of the API is.
if status > self.status: if status > self.status:
self.status = status self.status = status
self.message = expectation.get_message(self.request)
def push_status(self): def push_status(self):
"""Pushes the status of the component to the cachet server. It will update the component
status based on the previous call to evaluate().
"""
params = {'id': self.data['cachet']['component_id'], 'status': params = {'id': self.data['cachet']['component_id'], 'status':
self.status} self.status}
component_request = requests.put('%s/components/%d' % component_request = requests.put('%s/components/%d' %
(self.data['cachet']['api_url'], (self.data['cachet']['api_url'],
self.data['cachet']['component_id']), self.data['cachet']['component_id']),
params=params, headers=self.headers) params=params, headers=self.headers)
if component_request.ok: if component_request.ok:
# Successful update # Successful update
self.logger.info('Component update: status [%d]' % (self.status,)) self.logger.info('Component update: status [%d]' % (self.status,))
else: else:
# Failed to update the API status # Failed to update the API status
self.logger.warning('Component update failed with status [%d]: API' self.logger.warning('Component update failed with status [%d]: API'
' status: [%d]' % (component_request.status_code, self.status)) ' status: [%d]' % (component_request.status_code, self.status))
def push_metrics(self): def push_metrics(self):
"""Pushes the total amount of seconds the request took to get a response from the URL.
It only will send a request if the metric id was set in the configuration.
"""
if 'metric_id' in self.data['cachet'] and hasattr(self, 'request'): if 'metric_id' in self.data['cachet'] and hasattr(self, 'request'):
params = {'id': self.data['cachet']['metric_id'], 'value': params = {'id': self.data['cachet']['metric_id'], 'value':
self.request.elapsed.total_seconds(), 'timestamp': self.request.elapsed.total_seconds(), 'timestamp':
self.current_timestamp} self.current_timestamp}
metrics_request = requests.post('%s/metrics/%d/points' % metrics_request = requests.post('%s/metrics/%d/points' %
(self.data['cachet']['api_url'], (self.data['cachet']['api_url'],
self.data['cachet']['metric_id']), params=params, self.data['cachet']['metric_id']), params=params,
headers=self.headers) headers=self.headers)
if metrics_request.ok: if metrics_request.ok:
# Successful metrics upload # Successful metrics upload
self.logger.info('Metric uploaded: %.6f seconds' % self.logger.info('Metric uploaded: %.6f seconds' %
(self.request.elapsed.total_seconds(),)) (self.request.elapsed.total_seconds(),))
else: else:
self.logger.warning('Metric upload failed with status [%d]' % self.logger.warning('Metric upload failed with status [%d]' %
(metrics_request.status_code,)) (metrics_request.status_code,))
def push_incident(self):
    """Creates or resolves an incident on the cachet server, based on the status and
    message set by the previous call to evaluate().

    - If no incident is being tracked and the status is not operational, a new
      incident is posted and its id is stored in self.incident_id.
    - If an incident is being tracked and the status is operational again, the
      incident is updated and self.incident_id is removed.
    - In any other case nothing is sent.

    Failed requests are only logged; the tracked incident id is left untouched so
    the same operation is attempted again on the next call.
    """
    operational = cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
    has_incident = hasattr(self, 'incident_id')
    api_url = self.data['cachet']['api_url']
    component_id = self.data['cachet']['component_id']

    if has_incident and self.status == operational:
        # If the incident already exists, it means it's unhealthy. We only update it when it
        # becomes healthy again.
        # NOTE(review): incident status 4 is presumably cachet's "Fixed" - confirm against the API docs.
        params = {'status': 4, 'visible': 1, 'component_id': component_id,
                  'component_status': self.status, 'notify': True}
        incident_request = requests.put('%s/incidents/%d' % (api_url, self.incident_id),
                                        params=params, headers=self.headers)
        if incident_request.ok:
            # Successful incident update.
            self.logger.info(
                'Incident updated: component status [%d], message: "%s"' % (self.status, self.message))
            # Stop tracking the incident, so a new one is created on the next failure.
            del self.incident_id
        else:
            self.logger.warning(
                'Incident update failed with status [%d], message: "%s"' % (
                    incident_request.status_code, self.message))
    elif not has_incident and self.status != operational:
        # This is the first time the incident is being created.
        # NOTE(review): incident status 1 is presumably cachet's "Investigating" - confirm against the API docs.
        params = {'name': 'URL unavailable', 'message': self.message, 'status': 1, 'visible': 1,
                  'component_id': component_id, 'component_status': self.status,
                  'notify': True}
        incident_request = requests.post('%s/incidents' % (api_url,), params=params,
                                         headers=self.headers)
        if incident_request.ok:
            # Successful incident upload. Keep the id so the incident can be updated later.
            self.incident_id = incident_request.json()['data']['id']
            self.logger.info(
                'Incident uploaded, API unhealthy: component status [%d], message: "%s"' % (
                    self.status, self.message))
        else:
            self.logger.warning(
                'Incident upload failed with status [%d], message: "%s"' % (
                    incident_request.status_code, self.message))
class Expectaction(object): class Expectaction(object):
"""Base class for URL result expectations. Any new excpectation should extend """Base class for URL result expectations. Any new excpectation should extend
this class and the name added to create() method. this class and the name added to create() method.
""" """
@staticmethod @staticmethod
def create(configuration): def create(configuration):
"""Creates a list of expectations based on the configuration types """Creates a list of expectations based on the configuration types
list. list.
""" """
expectations = { expectations = {
'HTTP_STATUS': HttpStatus, 'HTTP_STATUS': HttpStatus,
'LATENCY': Latency, 'LATENCY': Latency,
'REGEX': Regex 'REGEX': Regex
} }
return expectations.get(configuration['type'])(configuration) return expectations.get(configuration['type'])(configuration)
@abc.abstractmethod @abc.abstractmethod
@@ -168,9 +231,9 @@ class HttpStatus(Expectaction):
def get_status(self, response): def get_status(self, response):
if response.status_code == self.status: if response.status_code == self.status:
return 1 return cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
else: else:
return 3 return cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
def get_message(self, response): def get_message(self, response):
return 'Unexpected HTTP status (%s)' % (response.status_code,) return 'Unexpected HTTP status (%s)' % (response.status_code,)
@@ -185,9 +248,9 @@ class Latency(Expectaction):
def get_status(self, response): def get_status(self, response):
if response.elapsed.total_seconds() <= self.threshold: if response.elapsed.total_seconds() <= self.threshold:
return 1 return cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
else: else:
return 2 return cachet_url_monitor.status.COMPONENT_STATUS_PERFORMANCE_ISSUES
def get_message(self, response): def get_message(self, response):
return 'Latency above threshold: %.4f' % (response.elapsed.total_seconds(),) return 'Latency above threshold: %.4f' % (response.elapsed.total_seconds(),)
@@ -199,13 +262,13 @@ class Latency(Expectaction):
class Regex(Expectaction): class Regex(Expectaction):
def __init__(self, configuration): def __init__(self, configuration):
self.regex_string = configuration['regex'] self.regex_string = configuration['regex']
self.regex = re.compile(configuration['regex']) self.regex = re.compile(configuration['regex'], re.UNICODE + re.DOTALL)
def get_status(self, response): def get_status(self, response):
if self.regex.match(response.text): if self.regex.match(response.text):
return 1 return cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
else: else:
return 3 return cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
def get_message(self, response): def get_message(self, response):
return 'Regex did not match anything in the body' return 'Regex did not match anything in the body'

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python
# Numeric component status codes used when reporting a component's health.
# The monitor treats a greater value as a worse state.
COMPONENT_STATUS_OPERATIONAL = 1
COMPONENT_STATUS_PERFORMANCE_ISSUES = 2
COMPONENT_STATUS_PARTIAL_OUTAGE = 3
COMPONENT_STATUS_MAJOR_OUTAGE = 4

# Every known component status, in ascending order of severity.
COMPONENT_STATUSES = [
    COMPONENT_STATUS_OPERATIONAL,
    COMPONENT_STATUS_PERFORMANCE_ISSUES,
    COMPONENT_STATUS_PARTIAL_OUTAGE,
    COMPONENT_STATUS_MAJOR_OUTAGE,
]

View File

@@ -8,10 +8,11 @@ endpoint:
- type: LATENCY - type: LATENCY
threshold: 1 threshold: 1
- type: REGEX - type: REGEX
regex: '.*<body>.*' regex: '.*(<body).*'
cachet: cachet:
api_url: https://demo.cachethq.io/api/v1 api_url: https://demo.cachethq.io/api/v1
token: my_token token: my_token
component_id: 1 component_id: 1
#metric_id: 1 #metric_id: 1
create_incident: true
frequency: 30 frequency: 30

View File

@@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
import cachet_url_monitor.status
import mock import mock
import unittest import unittest
import sys import sys
@@ -37,7 +38,7 @@ class ConfigurationTest(unittest.TestCase):
sys.modules['requests'].request = request sys.modules['requests'].request = request
self.configuration.evaluate() self.configuration.evaluate()
assert self.configuration.status == 1 assert self.configuration.status == cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
def test_evaluate_with_failure(self): def test_evaluate_with_failure(self):
def total_seconds(): def total_seconds():
@@ -54,7 +55,7 @@ class ConfigurationTest(unittest.TestCase):
sys.modules['requests'].request = request sys.modules['requests'].request = request
self.configuration.evaluate() self.configuration.evaluate()
assert self.configuration.status == 3 assert self.configuration.status == cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
def test_evaluate_with_timeout(self): def test_evaluate_with_timeout(self):
def request(method, url, timeout=None): def request(method, url, timeout=None):
@@ -67,7 +68,7 @@ class ConfigurationTest(unittest.TestCase):
sys.modules['requests'].request = request sys.modules['requests'].request = request
self.configuration.evaluate() self.configuration.evaluate()
assert self.configuration.status == 3 assert self.configuration.status == cachet_url_monitor.status.COMPONENT_STATUS_PERFORMANCE_ISSUES
self.mock_logger.warning.assert_called_with('Request timed out') self.mock_logger.warning.assert_called_with('Request timed out')
def test_evaluate_with_connection_error(self): def test_evaluate_with_connection_error(self):

View File

@@ -77,11 +77,11 @@ class RegexTest(unittest.TestCase):
self.expectation = Regex({'type': 'REGEX', 'regex': '.*(find stuff).*'}) self.expectation = Regex({'type': 'REGEX', 'regex': '.*(find stuff).*'})
def test_init(self): def test_init(self):
assert self.expectation.regex == re.compile('.*(find stuff).*') assert self.expectation.regex == re.compile('.*(find stuff).*', re.UNICODE + re.DOTALL)
def test_get_status_healthy(self): def test_get_status_healthy(self):
request = mock.Mock() request = mock.Mock()
request.text = 'We could find stuff in this body.' request.text = 'We could find stuff\n in this body.'
assert self.expectation.get_status(request) == 1 assert self.expectation.get_status(request) == 1