#72 - Adding the ability to control incident status

This commit is contained in:
Mitsuo Takaki
2019-10-25 08:10:12 -07:00
parent 9e5d42f8b8
commit e132f8660b
6 changed files with 88 additions and 45 deletions
+14
View File
@@ -25,6 +25,7 @@ endpoint:
expectation: expectation:
- type: HTTP_STATUS - type: HTTP_STATUS
status_range: 200-300 status_range: 200-300
incident: MAJOR
- type: LATENCY - type: LATENCY
threshold: 1 threshold: 1
- type: REGEX - type: REGEX
@@ -65,6 +66,19 @@ frequency: 30
- **latency_unit**, the latency unit used when reporting the metrics. It will automatically convert to the specified unit. It's not mandatory and it will default to **seconds**. Available units: `ms`, `s`, `m`, `h`. - **latency_unit**, the latency unit used when reporting the metrics. It will automatically convert to the specified unit. It's not mandatory and it will default to **seconds**. Available units: `ms`, `s`, `m`, `h`.
- **frequency**, how often we'll send a request to the given URL. The unit is in seconds. - **frequency**, how often we'll send a request to the given URL. The unit is in seconds.
Each `expectation` has its own default incident status, which can be overridden by setting the `incident` property to any of the following values:
- `PARTIAL`
- `MAJOR`
- `PERFORMANCE`
Choosing one of these statuses lets you control what kind of incident a failed expectation is reported as. These are the default incident statuses for each `expectation` type:
| Expectation | Incident status |
| ----------- | --------------- |
| HTTP_STATUS | PARTIAL |
| LATENCY | PERFORMANCE |
| REGEX | PARTIAL |
## Setting up ## Setting up
The application should be installed using **virtualenv**, through the following command: The application should be installed using **virtualenv**, through the following command:
+58 -39
View File
@@ -39,7 +39,7 @@ class ComponentNonexistentError(Exception):
self.component_id = component_id self.component_id = component_id
def __str__(self): def __str__(self):
return repr('Component with id [%d] does not exist.' % (self.component_id,)) return repr(f'Component with id [{self.component_id}] does not exist.')
class MetricNonexistentError(Exception): class MetricNonexistentError(Exception):
@@ -49,7 +49,7 @@ class MetricNonexistentError(Exception):
self.metric_id = metric_id self.metric_id = metric_id
def __str__(self): def __str__(self):
return repr('Metric with id [%d] does not exist.' % (self.metric_id,)) return repr(f'Metric with id [{self.metric_id}] does not exist.')
def get_current_status(endpoint_url, component_id, headers): def get_current_status(endpoint_url, component_id, headers):
@@ -57,7 +57,7 @@ def get_current_status(endpoint_url, component_id, headers):
not exist or doesn't respond with the expected data. not exist or doesn't respond with the expected data.
:return component status. :return component status.
""" """
get_status_request = requests.get('%s/components/%s' % (endpoint_url, component_id), headers=headers) get_status_request = requests.get(f'{endpoint_url}/components/{component_id}', headers=headers)
if get_status_request.ok: if get_status_request.ok:
# The component exists. # The component exists.
@@ -69,7 +69,7 @@ def get_current_status(endpoint_url, component_id, headers):
def normalize_url(url): def normalize_url(url):
"""If passed url doesn't include schema return it with default one - http.""" """If passed url doesn't include schema return it with default one - http."""
if not url.lower().startswith('http'): if not url.lower().startswith('http'):
return 'http://%s' % url return f'http://{url}'
return url return url
@@ -120,7 +120,7 @@ class Configuration(object):
os.environ.get('CACHET_PUBLIC_INCIDENTS') or self.data['cachet']['public_incidents']) os.environ.get('CACHET_PUBLIC_INCIDENTS') or self.data['cachet']['public_incidents'])
self.logger.info('Monitoring URL: %s %s' % (self.endpoint_method, self.endpoint_url)) self.logger.info('Monitoring URL: %s %s' % (self.endpoint_method, self.endpoint_url))
self.expectations = [Expectaction.create(expectation) for expectation in self.data['endpoint']['expectation']] self.expectations = [Expectation.create(expectation) for expectation in self.data['endpoint']['expectation']]
for expectation in self.expectations: for expectation in self.expectations:
self.logger.info('Registered expectation: %s' % (expectation,)) self.logger.info('Registered expectation: %s' % (expectation,))
@@ -157,16 +157,15 @@ class Configuration(object):
configuration_errors.append('%s.%s' % (key, sub_key)) configuration_errors.append('%s.%s' % (key, sub_key))
if ('endpoint' in self.data and 'expectation' in if ('endpoint' in self.data and 'expectation' in
self.data['endpoint']): self.data['endpoint']):
if (not isinstance(self.data['endpoint']['expectation'], list) or if (not isinstance(self.data['endpoint']['expectation'], list) or
(isinstance(self.data['endpoint']['expectation'], list) and (isinstance(self.data['endpoint']['expectation'], list) and
len(self.data['endpoint']['expectation']) == 0)): len(self.data['endpoint']['expectation']) == 0)):
configuration_errors.append('endpoint.expectation') configuration_errors.append('endpoint.expectation')
if len(configuration_errors) > 0: if len(configuration_errors) > 0:
raise ConfigurationValidationError( raise ConfigurationValidationError(
'Config file [%s] failed validation. Missing keys: %s' % (self.config_file, f"Config file [{self.config_file}] failed validation. Missing keys: {', '.join(configuration_errors)}")
', '.join(configuration_errors)))
def evaluate(self): def evaluate(self):
"""Sends the request to the URL set in the configuration and executes """Sends the request to the URL set in the configuration and executes
@@ -175,9 +174,10 @@ class Configuration(object):
""" """
try: try:
if self.endpoint_header is not None: if self.endpoint_header is not None:
self.request = requests.request(self.endpoint_method, self.endpoint_url, timeout=self.endpoint_timeout, headers=self.endpoint_header) self.request = requests.request(self.endpoint_method, self.endpoint_url, timeout=self.endpoint_timeout,
headers=self.endpoint_header)
else: else:
self.request = requests.request(self.endpoint_method, self.endpoint_url, timeout=self.endpoint_timeout) self.request = requests.request(self.endpoint_method, self.endpoint_url, timeout=self.endpoint_timeout)
self.current_timestamp = int(time.time()) self.current_timestamp = int(time.time())
except requests.ConnectionError: except requests.ConnectionError:
self.message = 'The URL is unreachable: %s %s' % (self.endpoint_method, self.endpoint_url) self.message = 'The URL is unreachable: %s %s' % (self.endpoint_method, self.endpoint_url)
@@ -208,7 +208,7 @@ class Configuration(object):
self.logger.info(self.message) self.logger.info(self.message)
def print_out(self): def print_out(self):
self.logger.info('Current configuration:\n%s' % (self.__repr__())) self.logger.info(f'Current configuration:\n{self.__repr__()}')
def __repr__(self): def __repr__(self):
temporary_data = copy.deepcopy(self.data) temporary_data = copy.deepcopy(self.data)
@@ -224,7 +224,7 @@ class Configuration(object):
if self.status != 1: if self.status != 1:
self.current_fails = self.current_fails + 1 self.current_fails = self.current_fails + 1
self.logger.info('Failure #%s with threshold set to %s' % (self.current_fails, self.allowed_fails)) self.logger.warning(f'Failure #{self.current_fails} with threshold set to {self.allowed_fails}')
if self.current_fails <= self.allowed_fails: if self.current_fails <= self.allowed_fails:
self.trigger_update = False self.trigger_update = False
return return
@@ -276,8 +276,7 @@ class Configuration(object):
# Successful metrics upload # Successful metrics upload
self.logger.info('Metric uploaded: %.6f %s' % (value, self.latency_unit)) self.logger.info('Metric uploaded: %.6f %s' % (value, self.latency_unit))
else: else:
self.logger.warning('Metric upload failed with status [%d]' % self.logger.warning(f'Metric upload failed with status [{metrics_request.status_code}]')
(metrics_request.status_code,))
def push_incident(self): def push_incident(self):
"""If the component status has changed, we create a new incident (if this is the first time it becomes unstable) """If the component status has changed, we create a new incident (if this is the first time it becomes unstable)
@@ -291,36 +290,33 @@ class Configuration(object):
'component_status': self.status, 'component_status': self.status,
'notify': True} 'notify': True}
incident_request = requests.put('%s/incidents/%d' % (self.api_url, self.incident_id), params=params, incident_request = requests.put(f'{self.api_url}/incidents/{self.incident_id}', params=params,
headers=self.headers) headers=self.headers)
if incident_request.ok: if incident_request.ok:
# Successful metrics upload # Successful metrics upload
self.logger.info( self.logger.info(
'Incident updated, API healthy again: component status [%d], message: "%s"' % ( f'Incident updated, API healthy again: component status [{self.status}], message: "{self.message}"')
self.status, self.message))
del self.incident_id del self.incident_id
else: else:
self.logger.warning('Incident update failed with status [%d], message: "%s"' % ( self.logger.warning(
incident_request.status_code, self.message)) f'Incident update failed with status [{incident_request.status_code}], message: "{self.message}"')
elif not hasattr(self, 'incident_id') and self.status != st.COMPONENT_STATUS_OPERATIONAL: elif not hasattr(self, 'incident_id') and self.status != st.COMPONENT_STATUS_OPERATIONAL:
# This is the first time the incident is being created. # This is the first time the incident is being created.
params = {'name': 'URL unavailable', 'message': self.message, 'status': 1, 'visible': self.public_incidents, params = {'name': 'URL unavailable', 'message': self.message, 'status': 1, 'visible': self.public_incidents,
'component_id': self.component_id, 'component_status': self.status, 'notify': True} 'component_id': self.component_id, 'component_status': self.status, 'notify': True}
incident_request = requests.post('%s/incidents' % (self.api_url,), params=params, headers=self.headers) incident_request = requests.post(f'{self.api_url}/incidents', params=params, headers=self.headers)
if incident_request.ok: if incident_request.ok:
# Successful incident upload. # Successful incident upload.
self.incident_id = incident_request.json()['data']['id'] self.incident_id = incident_request.json()['data']['id']
self.logger.info( self.logger.info(
'Incident uploaded, API unhealthy: component status [%d], message: "%s"' % ( f'Incident uploaded, API unhealthy: component status [{self.status}], message: "{self.message}"')
self.status, self.message))
else: else:
self.logger.warning( self.logger.warning(
'Incident upload failed with status [%d], message: "%s"' % ( f'Incident upload failed with status [{incident_request.status_code}], message: "{self.message}"')
incident_request.status_code, self.message))
class Expectaction(object): class Expectation(object):
"""Base class for URL result expectations. Any new excpectation should extend """Base class for URL result expectations. Any new expectation should extend
this class and the name added to create() method. this class and the name added to create() method.
""" """
@@ -329,6 +325,7 @@ class Expectaction(object):
"""Creates a list of expectations based on the configuration types """Creates a list of expectations based on the configuration types
list. list.
""" """
# If a new expectation is created, this is where we need to add it.
expectations = { expectations = {
'HTTP_STATUS': HttpStatus, 'HTTP_STATUS': HttpStatus,
'LATENCY': Latency, 'LATENCY': Latency,
@@ -336,6 +333,9 @@ class Expectaction(object):
} }
return expectations.get(configuration['type'])(configuration) return expectations.get(configuration['type'])(configuration)
def __init__(self, configuration):
self.incident_status = self.parse_incident_status(configuration)
@abc.abstractmethod @abc.abstractmethod
def get_status(self, response): def get_status(self, response):
"""Returns the status of the API, following cachet's component status """Returns the status of the API, following cachet's component status
@@ -346,43 +346,58 @@ class Expectaction(object):
def get_message(self, response): def get_message(self, response):
"""Gets the error message.""" """Gets the error message."""
@abc.abstractmethod
def get_default_incident(self):
"""Returns the default status when this incident happens."""
class HttpStatus(Expectaction): def parse_incident_status(self, configuration):
return st.INCIDENT_MAP.get(configuration.get('incident', None), self.get_default_incident())
class HttpStatus(Expectation):
def __init__(self, configuration): def __init__(self, configuration):
self.status_range = HttpStatus.parse_range(configuration['status_range']) self.status_range = HttpStatus.parse_range(configuration['status_range'])
super(HttpStatus, self).__init__(configuration)
@staticmethod @staticmethod
def parse_range(range_string): def parse_range(range_string):
statuses = range_string.split("-") statuses = range_string.split("-")
if len(statuses) == 1: if len(statuses) == 1:
# When there was no range given, we should treat the first number as a single status check. # When there was no range given, we should treat the first number as a single status check.
return (int(statuses[0]), int(statuses[0]) + 1) return int(statuses[0]), int(statuses[0]) + 1
else: else:
# We shouldn't look into more than one value, as this is a range value. # We shouldn't look into more than one value, as this is a range value.
return (int(statuses[0]), int(statuses[1])) return int(statuses[0]), int(statuses[1])
def get_status(self, response): def get_status(self, response):
if response.status_code >= self.status_range[0] and response.status_code < self.status_range[1]: if self.status_range[0] <= response.status_code < self.status_range[1]:
return st.COMPONENT_STATUS_OPERATIONAL return st.COMPONENT_STATUS_OPERATIONAL
else: else:
return st.COMPONENT_STATUS_PARTIAL_OUTAGE return self.incident_status
def get_default_incident(self):
return st.COMPONENT_STATUS_PARTIAL_OUTAGE
def get_message(self, response): def get_message(self, response):
return 'Unexpected HTTP status (%s)' % (response.status_code,) return f'Unexpected HTTP status ({response.status_code})'
def __str__(self): def __str__(self):
return repr('HTTP status range: %s' % (self.status_range,)) return repr(f'HTTP status range: {self.status_range}')
class Latency(Expectaction): class Latency(Expectation):
def __init__(self, configuration): def __init__(self, configuration):
self.threshold = configuration['threshold'] self.threshold = configuration['threshold']
super(Latency, self).__init__(configuration)
def get_status(self, response): def get_status(self, response):
if response.elapsed.total_seconds() <= self.threshold: if response.elapsed.total_seconds() <= self.threshold:
return st.COMPONENT_STATUS_OPERATIONAL return st.COMPONENT_STATUS_OPERATIONAL
else: else:
return st.COMPONENT_STATUS_PERFORMANCE_ISSUES return self.incident_status
def get_default_incident(self):
return st.COMPONENT_STATUS_PERFORMANCE_ISSUES
def get_message(self, response): def get_message(self, response):
return 'Latency above threshold: %.4f seconds' % (response.elapsed.total_seconds(),) return 'Latency above threshold: %.4f seconds' % (response.elapsed.total_seconds(),)
@@ -391,19 +406,23 @@ class Latency(Expectaction):
return repr('Latency threshold: %.4f seconds' % (self.threshold,)) return repr('Latency threshold: %.4f seconds' % (self.threshold,))
class Regex(Expectaction): class Regex(Expectation):
def __init__(self, configuration): def __init__(self, configuration):
self.regex_string = configuration['regex'] self.regex_string = configuration['regex']
self.regex = re.compile(configuration['regex'], re.UNICODE + re.DOTALL) self.regex = re.compile(configuration['regex'], re.UNICODE + re.DOTALL)
super(Regex, self).__init__(configuration)
def get_status(self, response): def get_status(self, response):
if self.regex.match(response.text): if self.regex.match(response.text):
return st.COMPONENT_STATUS_OPERATIONAL return st.COMPONENT_STATUS_OPERATIONAL
else: else:
return st.COMPONENT_STATUS_PARTIAL_OUTAGE return self.incident_status
def get_default_incident(self):
return st.COMPONENT_STATUS_PARTIAL_OUTAGE
def get_message(self, response): def get_message(self, response):
return 'Regex did not match anything in the body' return 'Regex did not match anything in the body'
def __str__(self): def __str__(self):
return repr('Regex: %s' % (self.regex_string,)) return repr(f'Regex: {self.regex_string}')
+12 -3
View File
@@ -4,12 +4,21 @@ This file defines all the different status different values.
These are all constants and are coupled to cachet's API configuration. These are all constants and are coupled to cachet's API configuration.
""" """
COMPONENT_STATUS_OPERATIONAL = 1 COMPONENT_STATUS_OPERATIONAL = 1
COMPONENT_STATUS_PERFORMANCE_ISSUES = 2 COMPONENT_STATUS_PERFORMANCE_ISSUES = 2
COMPONENT_STATUS_PARTIAL_OUTAGE = 3 COMPONENT_STATUS_PARTIAL_OUTAGE = 3
COMPONENT_STATUS_MAJOR_OUTAGE = 4 COMPONENT_STATUS_MAJOR_OUTAGE = 4
COMPONENT_STATUSES = [COMPONENT_STATUS_OPERATIONAL, COMPONENT_STATUSES = [COMPONENT_STATUS_OPERATIONAL,
COMPONENT_STATUS_PERFORMANCE_ISSUES, COMPONENT_STATUS_PARTIAL_OUTAGE, COMPONENT_STATUS_PERFORMANCE_ISSUES, COMPONENT_STATUS_PARTIAL_OUTAGE,
COMPONENT_STATUS_MAJOR_OUTAGE] COMPONENT_STATUS_MAJOR_OUTAGE]
# Values accepted by the optional `incident` property of an expectation in the
# configuration file.
INCIDENT_PARTIAL = 'PARTIAL'
INCIDENT_MAJOR = 'MAJOR'
INCIDENT_PERFORMANCE = 'PERFORMANCE'
# Maps each `incident` configuration value to the cachet component status that
# is reported when the expectation fails.
INCIDENT_MAP = {
    INCIDENT_PARTIAL: COMPONENT_STATUS_PARTIAL_OUTAGE,
    INCIDENT_MAJOR: COMPONENT_STATUS_MAJOR_OUTAGE,
    INCIDENT_PERFORMANCE: COMPONENT_STATUS_PERFORMANCE_ISSUES,
}
+1
View File
@@ -7,6 +7,7 @@ endpoint:
expectation: expectation:
- type: HTTP_STATUS - type: HTTP_STATUS
status_range: 200-300 status_range: 200-300
incident: MAJOR
- type: LATENCY - type: LATENCY
threshold: 1 threshold: 1
- type: REGEX - type: REGEX
+1 -1
View File
@@ -3,7 +3,7 @@
from setuptools import setup from setuptools import setup
setup(name='cachet-url-monitor', setup(name='cachet-url-monitor',
version='1.4', version='1.5',
description='Cachet URL monitor plugin', description='Cachet URL monitor plugin',
author='Mitsuo Takaki', author='Mitsuo Takaki',
author_email='mitsuotakaki@gmail.com', author_email='mitsuotakaki@gmail.com',
+2 -2
View File
@@ -96,8 +96,8 @@ class ConfigurationTest(unittest.TestCase):
sys.modules['requests'].request = request sys.modules['requests'].request = request
self.configuration.evaluate() self.configuration.evaluate()
self.assertEqual(self.configuration.status, cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE, self.assertEqual(self.configuration.status, cachet_url_monitor.status.COMPONENT_STATUS_MAJOR_OUTAGE,
'Component status set incorrectly') 'Component status set incorrectly or custom incident status is incorrectly parsed')
def test_evaluate_with_timeout(self): def test_evaluate_with_timeout(self):
def request(method, url, headers, timeout=None): def request(method, url, headers, timeout=None):