Merge pull request #6 from mtakaki/mtakaki_3_create_incident

Initial attempt at creating incidents when a URL becomes unhealthy #3
This commit is contained in:
mtakaki
2016-05-19 08:43:32 -07:00
8 changed files with 228 additions and 78 deletions

1
.gitignore vendored
View File

@@ -10,3 +10,4 @@ share/
*.egg-info
MANIFEST
dist/
.idea

View File

@@ -1,12 +1,12 @@
#!/usr/bin/env python
import abc
import cachet_url_monitor.status
import logging
import re
import requests
import time
from yaml import load
# These are the mandatory fields that must be in the configuration file in this
# same exact structure.
configuration_mandatory_fields = {
@@ -17,6 +17,7 @@ configuration_mandatory_fields = {
class ConfigurationValidationError(Exception):
"""Exception raised when there's a validation error."""
def __init__(self, value):
self.value = value
@@ -24,10 +25,21 @@ class ConfigurationValidationError(Exception):
return repr(self.value)
class ComponentNonexistentError(Exception):
    """Exception raised when the component does not exist.

    :param component_id: id of the component that could not be retrieved.
    """

    def __init__(self, component_id):
        # Forward the id to Exception so that `args` is populated like any other exception.
        super(ComponentNonexistentError, self).__init__(component_id)
        self.component_id = component_id

    def __str__(self):
        # %s rather than %d: a non-numeric id must not raise a TypeError while the
        # error itself is being rendered. Integer ids produce the same text as before.
        return repr('Component with id [%s] does not exist.' % (self.component_id,))
class Configuration(object):
"""Represents a configuration file, but it also includes the functionality
of assessing the API and pushing the results to cachet.
"""
def __init__(self, config_file):
# TODO(mtakaki#1|2016-04-28): Accept overriding settings using environment
# variables so we have a more docker-friendly approach.
@@ -35,7 +47,10 @@ class Configuration(object):
self.config_file = config_file
self.data = load(file(self.config_file, 'r'))
# We need to validate the configuration is correct and then validate the component actually exists.
self.validate()
self.headers = {'X-Cachet-Token': self.data['cachet']['token']}
self.status = self.get_current_status(self.data['cachet']['component_id'])
self.logger.info('Monitoring URL: %s %s' %
(self.data['endpoint']['method'], self.data['endpoint']['url']))
@@ -43,9 +58,29 @@ class Configuration(object):
in self.data['endpoint']['expectation']]
for expectation in self.expectations:
self.logger.info('Registered expectation: %s' % (expectation,))
self.headers = {'X-Cachet-Token': self.data['cachet']['token']}
def get_current_status(self, component_id):
    """Retrieves the current status of a component from the cachet server.

    :param component_id: id of the cachet component to look up.
    :return the value of data.status in the JSON body returned by the server.
    :raises ComponentNonexistentError: when the response is not ok.
    """
    # Query the component that was asked for, so the URL and the error raised below
    # always refer to the same id.
    get_status_request = requests.get(
        '%s/components/%d' % (self.data['cachet']['api_url'], component_id),
        headers=self.headers)

    if get_status_request.ok:
        # The component exists.
        return get_status_request.json()['data']['status']
    else:
        raise ComponentNonexistentError(component_id)
def is_create_incident(self):
    """Will verify if the configuration is set to create incidents or not.

    :return True if the configuration is set to create incidents or False otherwise.
    """
    # bool() so callers always get the documented True/False, even when the key is
    # present but holds None or some other non-boolean value.
    return bool(self.data['cachet'].get('create_incident', False))
def validate(self):
"""Validates the configuration by verifying the mandatory fields are
present and in the correct format. If the validation fails, a
ConfigurationValidationError is raised. Otherwise nothing will happen.
"""
configuration_errors = []
for key, sub_entries in configuration_mandatory_fields.iteritems():
if key not in self.data:
@@ -63,8 +98,8 @@ class Configuration(object):
configuration_errors.append('endpoint.expectation')
if len(configuration_errors) > 0:
raise ConfigurationValidationError(('Config file [%s] failed '
'validation. Missing keys: %s') % (self.config_file,
raise ConfigurationValidationError(
'Config file [%s] failed validation. Missing keys: %s' % (self.config_file,
', '.join(configuration_errors)))
def evaluate(self):
@@ -72,36 +107,47 @@ class Configuration(object):
each one of the expectations, one by one. The status will be updated
according to the expectation results.
"""
if hasattr(self, 'status'):
# Keeping track of the previous status.
self.previous_status = self.status
try:
self.request = requests.request(self.data['endpoint']['method'],
self.data['endpoint']['url'],
timeout=self.data['endpoint']['timeout'])
self.current_timestamp = int(time.time())
except requests.ConnectionError:
self.logger.warning('The URL is unreachable: %s %s' %
(self.data['endpoint']['method'],
self.data['endpoint']['url']))
self.status = 3
self.message = 'The URL is unreachable: %s %s' % (
self.data['endpoint']['method'], self.data['endpoint']['url'])
self.logger.warning(self.message)
self.status = cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
return
except requests.HTTPError:
self.logger.exception('Unexpected HTTP response')
self.status = 3
self.message = 'Unexpected HTTP response'
self.logger.exception(self.message)
self.status = cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
return
except requests.Timeout:
self.logger.warning('Request timed out')
self.status = 3
self.message = 'Request timed out'
self.logger.warning(self.message)
self.status = cachet_url_monitor.status.COMPONENT_STATUS_PERFORMANCE_ISSUES
return
# We initially assume the API is healthy.
self.status = 1
self.status = cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
self.message = ''
for expectation in self.expectations:
status = expectation.get_status(self.request)
# The greater the status is, the worse the state of the API is.
if status > self.status:
self.status = status
self.message = expectation.get_message(self.request)
def push_status(self):
"""Pushes the status of the component to the cachet server. It will update the component
status based on the previous call to evaluate().
"""
params = {'id': self.data['cachet']['component_id'], 'status':
self.status}
component_request = requests.put('%s/components/%d' %
@@ -117,6 +163,9 @@ class Configuration(object):
' status: [%d]' % (component_request.status_code, self.status))
def push_metrics(self):
"""Pushes the total amount of seconds the request took to get a response from the URL.
It only will send a request if the metric id was set in the configuration.
"""
if 'metric_id' in self.data['cachet'] and hasattr(self, 'request'):
params = {'id': self.data['cachet']['metric_id'], 'value':
self.request.elapsed.total_seconds(), 'timestamp':
@@ -134,11 +183,48 @@ class Configuration(object):
self.logger.warning('Metric upload failed with status [%d]' %
(metrics_request.status_code,))
def push_incident(self):
    """Creates or updates a cachet incident based on the previous call to evaluate().

    When the status is not operational and no incident is being tracked, a new incident
    is created and its id is kept in self.incident_id. When the status is operational
    again and an incident is being tracked, that incident is updated and the id is
    discarded. In every other case nothing is sent.
    """
    operational = cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL

    if hasattr(self, 'incident_id') and self.status == operational:
        # If the incident already exists, it means it's unhealthy. We only update it when it becomes healthy again.
        # NOTE(review): incident status 4 is presumably "fixed" in cachet - confirm against the API docs.
        params = {'status': 4, 'visible': 1, 'component_id': self.data['cachet']['component_id'],
                  'component_status': self.status, 'notify': True}
        incident_request = requests.put('%s/incidents/%d' % (self.data['cachet']['api_url'], self.incident_id),
                                        params=params, headers=self.headers)
        if incident_request.ok:
            # Successful incident update. Dropping the id lets the next failure open a new incident.
            self.logger.info(
                'Incident updated, API healthy again: component status [%d], message: "%s"' % (
                    self.status, self.message))
            del self.incident_id
        else:
            # The id is kept, so the update is attempted again on the next healthy run.
            self.logger.warning(
                'Incident update failed with status [%d], message: "%s"' % (
                    incident_request.status_code, self.message))
    elif not hasattr(self, 'incident_id') and self.status != operational:
        # This is the first time the incident is being created.
        # NOTE(review): incident status 1 is presumably "investigating" in cachet - confirm against the API docs.
        params = {'name': 'URL unavailable', 'message': self.message, 'status': 1, 'visible': 1,
                  'component_id': self.data['cachet']['component_id'], 'component_status': self.status,
                  'notify': True}
        incident_request = requests.post('%s/incidents' % (self.data['cachet']['api_url'],), params=params,
                                         headers=self.headers)
        if incident_request.ok:
            # Successful incident upload.
            self.incident_id = incident_request.json()['data']['id']
            self.logger.info(
                'Incident uploaded, API unhealthy: component status [%d], message: "%s"' % (
                    self.status, self.message))
        else:
            self.logger.warning(
                'Incident upload failed with status [%d], message: "%s"' % (
                    incident_request.status_code, self.message))
class Expectaction(object):
"""Base class for URL result expectations. Any new excpectation should extend
this class and the name added to create() method.
"""
@staticmethod
def create(configuration):
"""Creates a list of expectations based on the configuration types
@@ -168,9 +254,9 @@ class HttpStatus(Expectaction):
def get_status(self, response):
if response.status_code == self.status:
return 1
return cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
else:
return 3
return cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
def get_message(self, response):
return 'Unexpected HTTP status (%s)' % (response.status_code,)
@@ -185,27 +271,27 @@ class Latency(Expectaction):
def get_status(self, response):
if response.elapsed.total_seconds() <= self.threshold:
return 1
return cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
else:
return 2
return cachet_url_monitor.status.COMPONENT_STATUS_PERFORMANCE_ISSUES
def get_message(self, response):
return 'Latency above threshold: %.4f' % (response.elapsed.total_seconds(),)
return 'Latency above threshold: %.4f seconds' % (response.elapsed.total_seconds(),)
def __str__(self):
return repr('Latency threshold: %.4f' % (self.threshold,))
return repr('Latency threshold: %.4f seconds' % (self.threshold,))
class Regex(Expectaction):
def __init__(self, configuration):
self.regex_string = configuration['regex']
self.regex = re.compile(configuration['regex'])
self.regex = re.compile(configuration['regex'], re.UNICODE + re.DOTALL)
def get_status(self, response):
if self.regex.match(response.text):
return 1
return cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
else:
return 3
return cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
def get_message(self, response):
return 'Regex did not match anything in the body'

View File

@@ -10,6 +10,7 @@ class Agent(object):
"""Monitor agent that will be constantly verifying if the URL is healthy
and updating the component.
"""
def __init__(self, configuration):
self.configuration = configuration
@@ -18,7 +19,6 @@ class Agent(object):
cachet server.
"""
self.configuration.evaluate()
self.configuration.push_status()
self.configuration.push_metrics()
def start(self):
@@ -26,11 +26,34 @@ class Agent(object):
schedule.every(self.configuration.data['frequency']).seconds.do(self.execute)
class UpdateStatusAgent(Agent):
    """Agent that pushes the component status after each run of the base execute()."""

    # No __init__ override: the constructor inherited from Agent already takes the
    # configuration.

    def execute(self):
        super(UpdateStatusAgent, self).execute()
        self.configuration.push_status()
class CreateIncidentAgent(Agent):
    """Agent that pushes an incident after each run of the base execute()."""

    # No __init__ override: the constructor inherited from Agent already takes the
    # configuration.

    def execute(self):
        super(CreateIncidentAgent, self).execute()
        self.configuration.push_incident()
class Scheduler(object):
def __init__(self, config_file):
self.logger = logging.getLogger('cachet_url_monitor.scheduler.Scheduler')
self.configuration = Configuration(config_file)
self.agent = Agent(self.configuration)
if self.configuration.is_create_incident():
self.agent = CreateIncidentAgent(self.configuration)
else:
self.agent = UpdateStatusAgent(self.configuration)
self.stop = False
def start(self):

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python
# Status codes assigned to a component.
COMPONENT_STATUS_OPERATIONAL = 1
COMPONENT_STATUS_PERFORMANCE_ISSUES = 2
COMPONENT_STATUS_PARTIAL_OUTAGE = 3
COMPONENT_STATUS_MAJOR_OUTAGE = 4

# Every known status code, in ascending order.
COMPONENT_STATUSES = [
    COMPONENT_STATUS_OPERATIONAL,
    COMPONENT_STATUS_PERFORMANCE_ISSUES,
    COMPONENT_STATUS_PARTIAL_OUTAGE,
    COMPONENT_STATUS_MAJOR_OUTAGE,
]

View File

@@ -8,10 +8,11 @@ endpoint:
- type: LATENCY
threshold: 1
- type: REGEX
regex: '.*<body>.*'
regex: '.*(<body).*'
cachet:
api_url: https://demo.cachethq.io/api/v1
token: my_token
component_id: 1
#metric_id: 1
create_incident: true
frequency: 30

View File

@@ -1,19 +1,33 @@
#!/usr/bin/env python
import mock
import unittest
import sys
import unittest
import cachet_url_monitor.status
import mock
from requests import ConnectionError, HTTPError, Timeout
sys.modules['requests'] = mock.Mock()
sys.modules['logging'] = mock.Mock()
from cachet_url_monitor.configuration import Configuration
class ConfigurationTest(unittest.TestCase):
def setUp(self):
def getLogger(name):
self.mock_logger = mock.Mock()
return self.mock_logger
sys.modules['logging'].getLogger = getLogger
def get(url, headers):
get_return = mock.Mock()
get_return.ok = True
get_return.json = mock.Mock()
get_return.json.return_value = {'data': {'status': 1}}
return get_return
sys.modules['requests'].get = get
self.configuration = Configuration('config.yml')
sys.modules['requests'].Timeout = Timeout
sys.modules['requests'].ConnectionError = ConnectionError
@@ -26,6 +40,7 @@ class ConfigurationTest(unittest.TestCase):
def test_evaluate(self):
def total_seconds():
return 0.1
def request(method, url, timeout=None):
response = mock.Mock()
response.status_code = 200
@@ -37,11 +52,12 @@ class ConfigurationTest(unittest.TestCase):
sys.modules['requests'].request = request
self.configuration.evaluate()
assert self.configuration.status == 1
assert self.configuration.status == cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
def test_evaluate_with_failure(self):
def total_seconds():
return 0.1
def request(method, url, timeout=None):
response = mock.Mock()
# We are expecting a 200 response, so this will fail the expectation.
@@ -54,7 +70,7 @@ class ConfigurationTest(unittest.TestCase):
sys.modules['requests'].request = request
self.configuration.evaluate()
assert self.configuration.status == 3
assert self.configuration.status == cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
def test_evaluate_with_timeout(self):
def request(method, url, timeout=None):
@@ -67,7 +83,7 @@ class ConfigurationTest(unittest.TestCase):
sys.modules['requests'].request = request
self.configuration.evaluate()
assert self.configuration.status == 3
assert self.configuration.status == cachet_url_monitor.status.COMPONENT_STATUS_PERFORMANCE_ISSUES
self.mock_logger.warning.assert_called_with('Request timed out')
def test_evaluate_with_connection_error(self):

View File

@@ -1,9 +1,10 @@
#!/usr/bin/env python
import unittest
import mock
import re
import unittest
from cachet_url_monitor.configuration import Expectaction,Latency
from cachet_url_monitor.configuration import HttpStatus, Regex
from cachet_url_monitor.configuration import Latency
class LatencyTest(unittest.TestCase):
@@ -16,6 +17,7 @@ class LatencyTest(unittest.TestCase):
def test_get_status_healthy(self):
def total_seconds():
return 0.1
request = mock.Mock()
elapsed = mock.Mock()
request.elapsed = elapsed
@@ -26,6 +28,7 @@ class LatencyTest(unittest.TestCase):
def test_get_status_unhealthy(self):
def total_seconds():
return 2
request = mock.Mock()
elapsed = mock.Mock()
request.elapsed = elapsed
@@ -36,13 +39,14 @@ class LatencyTest(unittest.TestCase):
def test_get_message(self):
def total_seconds():
return 0.1
request = mock.Mock()
elapsed = mock.Mock()
request.elapsed = elapsed
elapsed.total_seconds = total_seconds
assert self.expectation.get_message(request) == ('Latency above '
'threshold: 0.1000')
'threshold: 0.1000 seconds')
class HttpStatusTest(unittest.TestCase):
@@ -77,11 +81,11 @@ class RegexTest(unittest.TestCase):
self.expectation = Regex({'type': 'REGEX', 'regex': '.*(find stuff).*'})
def test_init(self):
assert self.expectation.regex == re.compile('.*(find stuff).*')
assert self.expectation.regex == re.compile('.*(find stuff).*', re.UNICODE + re.DOTALL)
def test_get_status_healthy(self):
request = mock.Mock()
request.text = 'We could find stuff in this body.'
request.text = 'We could find stuff\n in this body.'
assert self.expectation.get_status(request) == 1

View File

@@ -1,9 +1,10 @@
#!/usr/bin/env python
import mock
import unittest
import sys
import unittest
import mock
sys.modules['schedule'] = mock.Mock()
sys.modules['cachet_url_monitor.configuration.Configuration'] = mock.Mock()
from cachet_url_monitor.scheduler import Agent, Scheduler
@@ -21,7 +22,7 @@ class AgentTest(unittest.TestCase):
self.agent.execute()
evaluate.assert_called_once()
push_status.assert_called_once()
push_status.assert_not_called()
def test_start(self):
every = sys.modules['schedule'].every
@@ -33,9 +34,17 @@ class AgentTest(unittest.TestCase):
class SchedulerTest(unittest.TestCase):
def setUp(self):
self.mock_configuration = sys.modules[('cachet_url_monitor.configuration'
'.Configuration')]
@mock.patch('requests.get')
def setUp(self, mock_requests):
    """Builds the Scheduler under test while requests.get is replaced by a stub.

    :param mock_requests: the mock that mock.patch installs in place of requests.get.
    """
    def get(url, headers):
        # Stubbed response carrying an ok flag and a component status of 1.
        get_return = mock.Mock()
        get_return.ok = True
        get_return.json = mock.Mock()
        get_return.json.return_value = {'data': {'status': 1}}
        return get_return

    # mock_requests *is* the patched requests.get, so the stub has to be wired in as
    # its side_effect. Assigning it to a `.get` attribute would leave it unused.
    mock_requests.side_effect = get

    self.scheduler = Scheduler('config.yml')
def test_init(self):