Create incident/update component status only after specified amount of failed connection trials

This commit is contained in:
Jacek Szubert
2017-03-09 13:11:08 +11:00
parent ffa141d114
commit 0b1e83eae1
4 changed files with 26 additions and 0 deletions

View File

@@ -28,6 +28,7 @@ endpoint:
threshold: 1 threshold: 1
- type: REGEX - type: REGEX
regex: ".*<body>.*" regex: ".*<body>.*"
allowed_fails: 0
cachet: cachet:
api_url: http://status.cachethq.io/api/v1 api_url: http://status.cachethq.io/api/v1
token: my_token token: my_token
@@ -48,6 +49,7 @@ frequency: 30
- **HTTP_STATUS**, we will verify if the response status code matches what we expect. - **HTTP_STATUS**, we will verify if the response status code matches what we expect.
- **LATENCY**, we measure how long the request took to get a response and fail if it's above the threshold. The unit is in seconds. - **LATENCY**, we measure how long the request took to get a response and fail if it's above the threshold. The unit is in seconds.
- **REGEX**, we verify if the response body matches the given regex. - **REGEX**, we verify if the response body matches the given regex.
- **allowed_fails**, the number of consecutive failed checks to tolerate before creating an incident/updating the component status. Defaults to 0, meaning the first failure is reported immediately.
- **cachet**, this is the settings for our cachet server. - **cachet**, this is the settings for our cachet server.
- **api_url**, the cachet API endpoint. - **api_url**, the cachet API endpoint.
- **token**, the API token. - **token**, the API token.

View File

@@ -63,6 +63,8 @@ class Configuration(object):
self.logger = logging.getLogger('cachet_url_monitor.configuration.Configuration') self.logger = logging.getLogger('cachet_url_monitor.configuration.Configuration')
self.config_file = config_file self.config_file = config_file
self.data = load(file(self.config_file, 'r')) self.data = load(file(self.config_file, 'r'))
self.current_fails = 0
self.trigger_update = True
# Exposing the configuration to confirm it's parsed as expected. # Exposing the configuration to confirm it's parsed as expected.
self.print_out() self.print_out()
@@ -76,6 +78,7 @@ class Configuration(object):
self.endpoint_method = os.environ.get('ENDPOINT_METHOD') or self.data['endpoint']['method'] self.endpoint_method = os.environ.get('ENDPOINT_METHOD') or self.data['endpoint']['method']
self.endpoint_url = os.environ.get('ENDPOINT_URL') or self.data['endpoint']['url'] self.endpoint_url = os.environ.get('ENDPOINT_URL') or self.data['endpoint']['url']
self.endpoint_timeout = os.environ.get('ENDPOINT_TIMEOUT') or self.data['endpoint'].get('timeout') or 1 self.endpoint_timeout = os.environ.get('ENDPOINT_TIMEOUT') or self.data['endpoint'].get('timeout') or 1
self.allowed_fails = os.environ.get('ALLOWED_FAILS') or self.data['endpoint'].get('allowed_fails') or 0
self.api_url = os.environ.get('CACHET_API_URL') or self.data['cachet']['api_url'] self.api_url = os.environ.get('CACHET_API_URL') or self.data['cachet']['api_url']
self.component_id = os.environ.get('CACHET_COMPONENT_ID') or self.data['cachet']['component_id'] self.component_id = os.environ.get('CACHET_COMPONENT_ID') or self.data['cachet']['component_id']
@@ -176,10 +179,27 @@ class Configuration(object):
del temporary_data['cachet']['token'] del temporary_data['cachet']['token']
return dump(temporary_data, default_flow_style=False) return dump(temporary_data, default_flow_style=False)
def if_trigger_update(self):
    """Decide whether the next status/incident push should go through.

    Sets ``self.trigger_update``, which ``push_status`` and the incident
    push check before talking to the cachet server:

    - When the status is operational (``1``), the failure counter is reset
      and the update is always triggered.
    - When the status is anything else, the failure counter is incremented.
      While the counter is less than or equal to ``allowed_fails`` the
      update is suppressed; once it exceeds ``allowed_fails`` the update is
      triggered and the counter starts again from zero.

    Raises:
        ValueError: if ``allowed_fails`` cannot be interpreted as an integer.
    """
    # NOTE(review): 1 is presumably st.COMPONENT_STATUS_OPERATIONAL - confirm.
    if self.status != 1:
        # allowed_fails is a string when it comes from the ALLOWED_FAILS
        # environment variable; comparing an int with a str is always true
        # on Python 2 (so failures would never be reported) and raises
        # TypeError on Python 3. Normalise to int before comparing.
        allowed_fails = int(self.allowed_fails)
        self.current_fails += 1
        self.logger.info('Failure #%s with threshold set to %s', self.current_fails, allowed_fails)
        if self.current_fails <= allowed_fails:
            # Still within the tolerated number of failures: stay quiet.
            self.trigger_update = False
            return
    # Either the endpoint is healthy or the threshold was exceeded.
    self.current_fails = 0
    self.trigger_update = True
def push_status(self): def push_status(self):
"""Pushes the status of the component to the cachet server. It will update the component """Pushes the status of the component to the cachet server. It will update the component
status based on the previous call to evaluate(). status based on the previous call to evaluate().
""" """
if not self.trigger_update:
return
params = {'id': self.component_id, 'status': self.status} params = {'id': self.component_id, 'status': self.status}
component_request = requests.put('%s/components/%d' % (self.api_url, self.component_id), params=params, component_request = requests.put('%s/components/%d' % (self.api_url, self.component_id), params=params,
headers=self.headers) headers=self.headers)
@@ -213,6 +233,8 @@ class Configuration(object):
"""If the component status has changed, we create a new incident (if this is the first time it becomes unstable) """If the component status has changed, we create a new incident (if this is the first time it becomes unstable)
or updates the existing incident once it becomes healthy again. or updates the existing incident once it becomes healthy again.
""" """
if not self.trigger_update:
return
if hasattr(self, 'incident_id') and self.status == st.COMPONENT_STATUS_OPERATIONAL: if hasattr(self, 'incident_id') and self.status == st.COMPONENT_STATUS_OPERATIONAL:
# If the incident already exists, it means it was unhealthy but now it's healthy again. # If the incident already exists, it means it was unhealthy but now it's healthy again.
params = {'status': 4, 'visible': self.public_incidents, 'component_id': self.component_id, 'component_status': self.status, params = {'status': 4, 'visible': self.public_incidents, 'component_id': self.component_id, 'component_status': self.status,

View File

@@ -25,6 +25,7 @@ class Agent(object):
""" """
self.configuration.evaluate() self.configuration.evaluate()
self.configuration.push_metrics() self.configuration.push_metrics()
self.configuration.if_trigger_update()
for decorator in self.decorators: for decorator in self.decorators:
decorator.execute(self.configuration) decorator.execute(self.configuration)

View File

@@ -9,6 +9,7 @@ endpoint:
threshold: 1 threshold: 1
- type: REGEX - type: REGEX
regex: '.*(<body).*' regex: '.*(<body).*'
allowed_fails: 0
cachet: cachet:
api_url: https://demo.cachethq.io/api/v1 api_url: https://demo.cachethq.io/api/v1
token: my_token token: my_token