Merge pull request #6 from mtakaki/mtakaki_3_create_incident

Initial attempt at creating incidents when a URL becomes unhealthy #3
This commit is contained in:
mtakaki
2016-05-19 08:43:32 -07:00
8 changed files with 228 additions and 78 deletions

1
.gitignore vendored
View File

@@ -10,3 +10,4 @@ share/
*.egg-info
MANIFEST
dist/
.idea

View File

@@ -1,22 +1,23 @@
#!/usr/bin/env python
import abc
import cachet_url_monitor.status
import logging
import re
import requests
import time
from yaml import load
# These are the mandatory fields that must be present in the configuration
# file, in this exact structure.
configuration_mandatory_fields = {
'endpoint': ['url', 'method', 'timeout', 'expectation'],
'cachet': ['api_url', 'token', 'component_id'],
'frequency': []}
'endpoint': ['url', 'method', 'timeout', 'expectation'],
'cachet': ['api_url', 'token', 'component_id'],
'frequency': []}
class ConfigurationValidationError(Exception):
"""Exception raised when there's a validation error."""
def __init__(self, value):
self.value = value
@@ -24,28 +25,62 @@ class ConfigurationValidationError(Exception):
return repr(self.value)
class ComponentNonexistentError(Exception):
    """Exception raised when the component does not exist.

    The id that was looked up is kept in ``component_id``.
    """

    def __init__(self, component_id):
        self.component_id = component_id

    def __str__(self):
        # The message is wrapped in repr(), so str() of the exception includes
        # the surrounding quotes.
        message = 'Component with id [%d] does not exist.' % (self.component_id,)
        return repr(message)
class Configuration(object):
"""Represents a configuration file, but it also includes the functionality
of assessing the API and pushing the results to cachet.
"""
def __init__(self, config_file):
#TODO(mtakaki#1|2016-04-28): Accept overriding settings using environment
# TODO(mtakaki#1|2016-04-28): Accept overriding settings using environment
# variables so we have a more docker-friendly approach.
self.logger = logging.getLogger('cachet_url_monitor.configuration.Configuration')
self.config_file = config_file
self.data = load(file(self.config_file, 'r'))
# We need to validate the configuration is correct and then validate the component actually exists.
self.validate()
self.headers = {'X-Cachet-Token': self.data['cachet']['token']}
self.status = self.get_current_status(self.data['cachet']['component_id'])
self.logger.info('Monitoring URL: %s %s' %
(self.data['endpoint']['method'], self.data['endpoint']['url']))
(self.data['endpoint']['method'], self.data['endpoint']['url']))
self.expectations = [Expectaction.create(expectation) for expectation
in self.data['endpoint']['expectation']]
in self.data['endpoint']['expectation']]
for expectation in self.expectations:
self.logger.info('Registered expectation: %s' % (expectation,))
self.headers = {'X-Cachet-Token': self.data['cachet']['token']}
def get_current_status(self, component_id):
    """Retrieves the current status of a component from the cachet server.

    :param component_id: id of the cachet component to look up.
    :return: the value stored under data.status in the JSON body of the response.
    :raises ComponentNonexistentError: when the cachet server does not answer with a successful response.
    """
    # The URL is built from the argument, so the component that is queried is always the same one that
    # gets reported in the error below.
    component_url = '%s/components/%d' % (self.data['cachet']['api_url'], component_id)
    get_status_request = requests.get(component_url, headers=self.headers)
    if not get_status_request.ok:
        # Any unsuccessful response is reported as a nonexistent component.
        raise ComponentNonexistentError(component_id)
    # The component exists.
    return get_status_request.json()['data']['status']
def is_create_incident(self):
    """Tells whether the configuration asks for incidents to be created.

    :return: the value of the 'create_incident' entry of the 'cachet' section, or False when that entry is
        not present.
    """
    cachet_settings = self.data['cachet']
    if 'create_incident' not in cachet_settings:
        return False
    return cachet_settings['create_incident']
def validate(self):
"""Validates the configuration by verifying the mandatory fields are
present and in the correct format. If the validation fails, a
ConfigurationValidationError is raised. Otherwise nothing will happen.
"""
configuration_errors = []
for key, sub_entries in configuration_mandatory_fields.iteritems():
if key not in self.data:
@@ -56,99 +91,150 @@ class Configuration(object):
configuration_errors.append('%s.%s' % (key, sub_key))
if ('endpoint' in self.data and 'expectation' in
self.data['endpoint']):
self.data['endpoint']):
if (not isinstance(self.data['endpoint']['expectation'], list) or
(isinstance(self.data['endpoint']['expectation'], list) and
len(self.data['endpoint']['expectation']) == 0)):
(isinstance(self.data['endpoint']['expectation'], list) and
len(self.data['endpoint']['expectation']) == 0)):
configuration_errors.append('endpoint.expectation')
if len(configuration_errors) > 0:
raise ConfigurationValidationError(('Config file [%s] failed '
'validation. Missing keys: %s') % (self.config_file,
', '.join(configuration_errors)))
raise ConfigurationValidationError(
'Config file [%s] failed validation. Missing keys: %s' % (self.config_file,
', '.join(configuration_errors)))
def evaluate(self):
"""Sends the request to the URL set in the configuration and executes
each one of the expectations, one by one. The status will be updated
according to the expectation results.
"""
if hasattr(self, 'status'):
# Keeping track of the previous status.
self.previous_status = self.status
try:
self.request = requests.request(self.data['endpoint']['method'],
self.data['endpoint']['url'],
timeout=self.data['endpoint']['timeout'])
self.data['endpoint']['url'],
timeout=self.data['endpoint']['timeout'])
self.current_timestamp = int(time.time())
except requests.ConnectionError:
self.logger.warning('The URL is unreachable: %s %s' %
(self.data['endpoint']['method'],
self.data['endpoint']['url']))
self.status = 3
self.message = 'The URL is unreachable: %s %s' % (
self.data['endpoint']['method'], self.data['endpoint']['url'])
self.logger.warning(self.message)
self.status = cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
return
except requests.HTTPError:
self.logger.exception('Unexpected HTTP response')
self.status = 3
self.message = 'Unexpected HTTP response'
self.logger.exception(self.message)
self.status = cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
return
except requests.Timeout:
self.logger.warning('Request timed out')
self.status = 3
self.message = 'Request timed out'
self.logger.warning(self.message)
self.status = cachet_url_monitor.status.COMPONENT_STATUS_PERFORMANCE_ISSUES
return
# We initially assume the API is healthy.
self.status = 1
self.status = cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
self.message = ''
for expectation in self.expectations:
status = expectation.get_status(self.request)
# The greater the status is, the worse the state of the API is.
if status > self.status:
self.status = status
self.message = expectation.get_message(self.request)
def push_status(self):
"""Pushes the status of the component to the cachet server. It will update the component
status based on the previous call to evaluate().
"""
params = {'id': self.data['cachet']['component_id'], 'status':
self.status}
self.status}
component_request = requests.put('%s/components/%d' %
(self.data['cachet']['api_url'],
self.data['cachet']['component_id']),
params=params, headers=self.headers)
(self.data['cachet']['api_url'],
self.data['cachet']['component_id']),
params=params, headers=self.headers)
if component_request.ok:
# Successful update
self.logger.info('Component update: status [%d]' % (self.status,))
else:
# Failed to update the API status
self.logger.warning('Component update failed with status [%d]: API'
' status: [%d]' % (component_request.status_code, self.status))
' status: [%d]' % (component_request.status_code, self.status))
def push_metrics(self):
"""Pushes the total amount of seconds the request took to get a response from the URL.
It only will send a request if the metric id was set in the configuration.
"""
if 'metric_id' in self.data['cachet'] and hasattr(self, 'request'):
params = {'id': self.data['cachet']['metric_id'], 'value':
self.request.elapsed.total_seconds(), 'timestamp':
self.current_timestamp}
self.request.elapsed.total_seconds(), 'timestamp':
self.current_timestamp}
metrics_request = requests.post('%s/metrics/%d/points' %
(self.data['cachet']['api_url'],
self.data['cachet']['metric_id']), params=params,
headers=self.headers)
(self.data['cachet']['api_url'],
self.data['cachet']['metric_id']), params=params,
headers=self.headers)
if metrics_request.ok:
# Successful metrics upload
self.logger.info('Metric uploaded: %.6f seconds' %
(self.request.elapsed.total_seconds(),))
(self.request.elapsed.total_seconds(),))
else:
self.logger.warning('Metric upload failed with status [%d]' %
(metrics_request.status_code,))
(metrics_request.status_code,))
def push_incident(self):
    """Creates or resolves an incident in the cachet server based on the previous call to evaluate().

    When the component is not operational and no incident is being tracked, a new incident is created and
    its id is stored in self.incident_id. While that incident is tracked nothing else is sent. Once the
    component is operational again the tracked incident is updated and the stored id is discarded, so a
    later failure creates a new incident.
    """
    api_url = self.data['cachet']['api_url']
    component_id = self.data['cachet']['component_id']
    is_operational = self.status == cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
    has_open_incident = hasattr(self, 'incident_id')

    if has_open_incident and is_operational:
        # If the incident already exists, it means it's unhealthy. We only update it when it becomes
        # healthy again.
        # NOTE(review): incident status 4 is expected to be cachet's "Fixed" - confirm against the API docs.
        params = {'status': 4, 'visible': 1, 'component_id': component_id,
                  'component_status': self.status, 'notify': True}
        incident_request = requests.put('%s/incidents/%d' % (api_url, self.incident_id),
                                        params=params, headers=self.headers)
        if incident_request.ok:
            # Successful incident update.
            self.logger.info(
                'Incident updated, API healthy again: component status [%d], message: "%s"' % (
                    self.status, self.message))
            # Forget the incident so the next failure opens a new one.
            del self.incident_id
        else:
            self.logger.warning(
                'Incident update failed with status [%d], message: "%s"' % (
                    incident_request.status_code, self.message))
    elif not has_open_incident and not is_operational:
        # This is the first time the incident is being created.
        # NOTE(review): incident status 1 is expected to be cachet's "Investigating" - confirm against the
        # API docs.
        params = {'name': 'URL unavailable', 'message': self.message, 'status': 1, 'visible': 1,
                  'component_id': component_id, 'component_status': self.status,
                  'notify': True}
        incident_request = requests.post('%s/incidents' % (api_url,), params=params,
                                         headers=self.headers)
        if incident_request.ok:
            # Successful incident upload.
            self.incident_id = incident_request.json()['data']['id']
            self.logger.info(
                'Incident uploaded, API unhealthy: component status [%d], message: "%s"' % (
                    self.status, self.message))
        else:
            self.logger.warning(
                'Incident upload failed with status [%d], message: "%s"' % (
                    incident_request.status_code, self.message))
class Expectaction(object):
"""Base class for URL result expectations. Any new excpectation should extend
this class and the name added to create() method.
"""
@staticmethod
def create(configuration):
"""Creates a list of expectations based on the configuration types
list.
"""
expectations = {
'HTTP_STATUS': HttpStatus,
'LATENCY': Latency,
'REGEX': Regex
}
'HTTP_STATUS': HttpStatus,
'LATENCY': Latency,
'REGEX': Regex
}
return expectations.get(configuration['type'])(configuration)
@abc.abstractmethod
@@ -168,9 +254,9 @@ class HttpStatus(Expectaction):
def get_status(self, response):
if response.status_code == self.status:
return 1
return cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
else:
return 3
return cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
def get_message(self, response):
return 'Unexpected HTTP status (%s)' % (response.status_code,)
@@ -185,27 +271,27 @@ class Latency(Expectaction):
def get_status(self, response):
if response.elapsed.total_seconds() <= self.threshold:
return 1
return cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
else:
return 2
return cachet_url_monitor.status.COMPONENT_STATUS_PERFORMANCE_ISSUES
def get_message(self, response):
return 'Latency above threshold: %.4f' % (response.elapsed.total_seconds(),)
return 'Latency above threshold: %.4f seconds' % (response.elapsed.total_seconds(),)
def __str__(self):
return repr('Latency threshold: %.4f' % (self.threshold,))
return repr('Latency threshold: %.4f seconds' % (self.threshold,))
class Regex(Expectaction):
def __init__(self, configuration):
self.regex_string = configuration['regex']
self.regex = re.compile(configuration['regex'])
self.regex = re.compile(configuration['regex'], re.UNICODE + re.DOTALL)
def get_status(self, response):
if self.regex.match(response.text):
return 1
return cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
else:
return 3
return cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
def get_message(self, response):
return 'Regex did not match anything in the body'

View File

@@ -10,6 +10,7 @@ class Agent(object):
"""Monitor agent that will be constantly verifying if the URL is healthy
and updating the component.
"""
def __init__(self, configuration):
self.configuration = configuration
@@ -18,7 +19,6 @@ class Agent(object):
cachet server.
"""
self.configuration.evaluate()
self.configuration.push_status()
self.configuration.push_metrics()
def start(self):
@@ -26,11 +26,34 @@ class Agent(object):
schedule.every(self.configuration.data['frequency']).seconds.do(self.execute)
class UpdateStatusAgent(Agent):
    """Agent that pushes the component status after each run of the base agent."""

    # No constructor override is needed: Agent.__init__ already receives the
    # configuration.

    def execute(self):
        super(UpdateStatusAgent, self).execute()
        self.configuration.push_status()
class CreateIncidentAgent(Agent):
    """Agent that pushes an incident after each run of the base agent."""

    # No constructor override is needed: Agent.__init__ already receives the
    # configuration.

    def execute(self):
        super(CreateIncidentAgent, self).execute()
        self.configuration.push_incident()
class Scheduler(object):
def __init__(self, config_file):
self.logger = logging.getLogger('cachet_url_monitor.scheduler.Scheduler')
self.configuration = Configuration(config_file)
self.agent = Agent(self.configuration)
if self.configuration.is_create_incident():
self.agent = CreateIncidentAgent(self.configuration)
else:
self.agent = UpdateStatusAgent(self.configuration)
self.stop = False
def start(self):

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python
# Status codes reported for a component. A greater value means a worse state.
COMPONENT_STATUS_OPERATIONAL = 1
COMPONENT_STATUS_PERFORMANCE_ISSUES = 2
COMPONENT_STATUS_PARTIAL_OUTAGE = 3
COMPONENT_STATUS_MAJOR_OUTAGE = 4

# Every known component status, from best to worst.
COMPONENT_STATUSES = [
    COMPONENT_STATUS_OPERATIONAL,
    COMPONENT_STATUS_PERFORMANCE_ISSUES,
    COMPONENT_STATUS_PARTIAL_OUTAGE,
    COMPONENT_STATUS_MAJOR_OUTAGE,
]

View File

@@ -8,10 +8,11 @@ endpoint:
- type: LATENCY
threshold: 1
- type: REGEX
regex: '.*<body>.*'
regex: '.*(<body).*'
cachet:
api_url: https://demo.cachethq.io/api/v1
token: my_token
component_id: 1
#metric_id: 1
create_incident: true
frequency: 30

View File

@@ -1,19 +1,33 @@
#!/usr/bin/env python
import mock
import unittest
import sys
from requests import ConnectionError,HTTPError,Timeout
import unittest
import cachet_url_monitor.status
import mock
from requests import ConnectionError, HTTPError, Timeout
sys.modules['requests'] = mock.Mock()
sys.modules['logging'] = mock.Mock()
from cachet_url_monitor.configuration import Configuration
class ConfigurationTest(unittest.TestCase):
def setUp(self):
def getLogger(name):
self.mock_logger = mock.Mock()
return self.mock_logger
sys.modules['logging'].getLogger = getLogger
def get(url, headers):
get_return = mock.Mock()
get_return.ok = True
get_return.json = mock.Mock()
get_return.json.return_value = {'data': {'status': 1}}
return get_return
sys.modules['requests'].get = get
self.configuration = Configuration('config.yml')
sys.modules['requests'].Timeout = Timeout
sys.modules['requests'].ConnectionError = ConnectionError
@@ -26,6 +40,7 @@ class ConfigurationTest(unittest.TestCase):
def test_evaluate(self):
def total_seconds():
return 0.1
def request(method, url, timeout=None):
response = mock.Mock()
response.status_code = 200
@@ -37,11 +52,12 @@ class ConfigurationTest(unittest.TestCase):
sys.modules['requests'].request = request
self.configuration.evaluate()
assert self.configuration.status == 1
assert self.configuration.status == cachet_url_monitor.status.COMPONENT_STATUS_OPERATIONAL
def test_evaluate_with_failure(self):
def total_seconds():
return 0.1
def request(method, url, timeout=None):
response = mock.Mock()
# We are expecting a 200 response, so this will fail the expectation.
@@ -54,7 +70,7 @@ class ConfigurationTest(unittest.TestCase):
sys.modules['requests'].request = request
self.configuration.evaluate()
assert self.configuration.status == 3
assert self.configuration.status == cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE
def test_evaluate_with_timeout(self):
def request(method, url, timeout=None):
@@ -67,7 +83,7 @@ class ConfigurationTest(unittest.TestCase):
sys.modules['requests'].request = request
self.configuration.evaluate()
assert self.configuration.status == 3
assert self.configuration.status == cachet_url_monitor.status.COMPONENT_STATUS_PERFORMANCE_ISSUES
self.mock_logger.warning.assert_called_with('Request timed out')
def test_evaluate_with_connection_error(self):
@@ -83,7 +99,7 @@ class ConfigurationTest(unittest.TestCase):
assert self.configuration.status == 3
self.mock_logger.warning.assert_called_with(('The URL is '
'unreachable: GET http://localhost:8080/swagger'))
'unreachable: GET http://localhost:8080/swagger'))
def test_evaluate_with_http_error(self):
def request(method, url, timeout=None):
@@ -98,7 +114,7 @@ class ConfigurationTest(unittest.TestCase):
assert self.configuration.status == 3
self.mock_logger.exception.assert_called_with(('Unexpected HTTP '
'response'))
'response'))
def test_push_status(self):
def put(url, params=None, headers=None):

View File

@@ -1,9 +1,10 @@
#!/usr/bin/env python
import unittest
import mock
import re
import unittest
from cachet_url_monitor.configuration import Expectaction,Latency
from cachet_url_monitor.configuration import HttpStatus,Regex
from cachet_url_monitor.configuration import HttpStatus, Regex
from cachet_url_monitor.configuration import Latency
class LatencyTest(unittest.TestCase):
@@ -16,6 +17,7 @@ class LatencyTest(unittest.TestCase):
def test_get_status_healthy(self):
def total_seconds():
return 0.1
request = mock.Mock()
elapsed = mock.Mock()
request.elapsed = elapsed
@@ -26,6 +28,7 @@ class LatencyTest(unittest.TestCase):
def test_get_status_unhealthy(self):
def total_seconds():
return 2
request = mock.Mock()
elapsed = mock.Mock()
request.elapsed = elapsed
@@ -36,13 +39,14 @@ class LatencyTest(unittest.TestCase):
def test_get_message(self):
def total_seconds():
return 0.1
request = mock.Mock()
elapsed = mock.Mock()
request.elapsed = elapsed
elapsed.total_seconds = total_seconds
assert self.expectation.get_message(request) == ('Latency above '
'threshold: 0.1000')
'threshold: 0.1000 seconds')
class HttpStatusTest(unittest.TestCase):
@@ -69,7 +73,7 @@ class HttpStatusTest(unittest.TestCase):
request.status_code = 400
assert self.expectation.get_message(request) == ('Unexpected HTTP '
'status (400)')
'status (400)')
class RegexTest(unittest.TestCase):
@@ -77,11 +81,11 @@ class RegexTest(unittest.TestCase):
self.expectation = Regex({'type': 'REGEX', 'regex': '.*(find stuff).*'})
def test_init(self):
assert self.expectation.regex == re.compile('.*(find stuff).*')
assert self.expectation.regex == re.compile('.*(find stuff).*', re.UNICODE + re.DOTALL)
def test_get_status_healthy(self):
request = mock.Mock()
request.text = 'We could find stuff in this body.'
request.text = 'We could find stuff\n in this body.'
assert self.expectation.get_status(request) == 1
@@ -96,4 +100,4 @@ class RegexTest(unittest.TestCase):
request.text = 'We will not find it here'
assert self.expectation.get_message(request) == ('Regex did not match '
'anything in the body')
'anything in the body')

View File

@@ -1,10 +1,11 @@
#!/usr/bin/env python
import mock
import unittest
import sys
import unittest
import mock
sys.modules['schedule'] = mock.Mock()
sys.modules['cachet_url_monitor.configuration.Configuration'] = mock.Mock()
from cachet_url_monitor.scheduler import Agent,Scheduler
from cachet_url_monitor.scheduler import Agent, Scheduler
class AgentTest(unittest.TestCase):
@@ -21,7 +22,7 @@ class AgentTest(unittest.TestCase):
self.agent.execute()
evaluate.assert_called_once()
push_status.assert_called_once()
push_status.assert_not_called()
def test_start(self):
every = sys.modules['schedule'].every
@@ -33,16 +34,24 @@ class AgentTest(unittest.TestCase):
class SchedulerTest(unittest.TestCase):
def setUp(self):
self.mock_configuration = sys.modules[('cachet_url_monitor.configuration'
'.Configuration')]
@mock.patch('requests.get')
def setUp(self, mock_requests):
def get(url, headers):
get_return = mock.Mock()
get_return.ok = True
get_return.json = mock.Mock()
get_return.json.return_value = {'data': {'status': 1}}
return get_return
mock_requests.get = get
self.scheduler = Scheduler('config.yml')
def test_init(self):
assert self.scheduler.stop == False
def test_start(self):
#TODO(mtakaki|2016-05-01): We need a better way of testing this method.
# TODO(mtakaki|2016-05-01): We need a better way of testing this method.
# Leaving it as a placeholder.
self.scheduler.stop = True
self.scheduler.start()