From 0b1e83eae1b7e56360590df7fb09cd5171e4d77a Mon Sep 17 00:00:00 2001 From: Jacek Szubert Date: Thu, 9 Mar 2017 13:11:08 +1100 Subject: [PATCH] Create incident/update component status only after specified amount of failed connection trials --- README.md | 2 ++ cachet_url_monitor/configuration.py | 22 ++++++++++++++++++++++ cachet_url_monitor/scheduler.py | 1 + config.yml | 1 + 4 files changed, 26 insertions(+) diff --git a/README.md b/README.md index 02b034b..2d016f1 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ endpoint: threshold: 1 - type: REGEX regex: ".*.*" + allowed_fails: 0 cachet: api_url: http://status.cachethq.io/api/v1 token: my_token @@ -48,6 +49,7 @@ frequency: 30 - **HTTP_STATUS**, we will verify if the response status code matches what we expect. - **LATENCY**, we measure how long the request took to get a response and fail if it's above the threshold. The unit is in seconds. - **REGEX**, we verify if the response body matches the given regex. + - **allowed_fails**, create incident/update component status only after specified amount of failed connection trials. - **cachet**, this is the settings for our cachet server. - **api_url**, the cachet API endpoint. - **token**, the API token. diff --git a/cachet_url_monitor/configuration.py b/cachet_url_monitor/configuration.py index 626bb8a..69e581f 100644 --- a/cachet_url_monitor/configuration.py +++ b/cachet_url_monitor/configuration.py @@ -63,6 +63,8 @@ class Configuration(object): self.logger = logging.getLogger('cachet_url_monitor.configuration.Configuration') self.config_file = config_file self.data = load(file(self.config_file, 'r')) + self.current_fails = 0 + self.trigger_update = True # Exposing the configuration to confirm it's parsed as expected. 
self.print_out() @@ -76,6 +78,7 @@ class Configuration(object): self.endpoint_method = os.environ.get('ENDPOINT_METHOD') or self.data['endpoint']['method'] self.endpoint_url = os.environ.get('ENDPOINT_URL') or self.data['endpoint']['url'] self.endpoint_timeout = os.environ.get('ENDPOINT_TIMEOUT') or self.data['endpoint'].get('timeout') or 1 + self.allowed_fails = os.environ.get('ALLOWED_FAILS') or self.data['endpoint'].get('allowed_fails') or 0 self.api_url = os.environ.get('CACHET_API_URL') or self.data['cachet']['api_url'] self.component_id = os.environ.get('CACHET_COMPONENT_ID') or self.data['cachet']['component_id'] @@ -176,10 +179,27 @@ class Configuration(object): del temporary_data['cachet']['token'] return dump(temporary_data, default_flow_style=False) + def if_trigger_update(self): + """ + Checks if update should be triggered - trigger it for all operational states + and only for non-operational ones above the configured threshold (allowed_fails). + """ + + if self.status != 1: + self.current_fails = self.current_fails + 1 + self.logger.info('Failure #%s with threshold set to %s' % (self.current_fails, self.allowed_fails)) + if self.current_fails <= int(self.allowed_fails): + self.trigger_update = False + return + self.current_fails = 0 + self.trigger_update = True + def push_status(self): """Pushes the status of the component to the cachet server. It will update the component status based on the previous call to evaluate(). """ + if not self.trigger_update: + return params = {'id': self.component_id, 'status': self.status} component_request = requests.put('%s/components/%d' % (self.api_url, self.component_id), params=params, headers=self.headers) @@ -213,6 +233,8 @@ class Configuration(object): """If the component status has changed, we create a new incident (if this is the first time it becomes unstable) or updates the existing incident once it becomes healthy again. 
""" + if not self.trigger_update: + return if hasattr(self, 'incident_id') and self.status == st.COMPONENT_STATUS_OPERATIONAL: # If the incident already exists, it means it was unhealthy but now it's healthy again. params = {'status': 4, 'visible': self.public_incidents, 'component_id': self.component_id, 'component_status': self.status, diff --git a/cachet_url_monitor/scheduler.py b/cachet_url_monitor/scheduler.py index f32aa67..02af7b6 100644 --- a/cachet_url_monitor/scheduler.py +++ b/cachet_url_monitor/scheduler.py @@ -25,6 +25,7 @@ class Agent(object): """ self.configuration.evaluate() self.configuration.push_metrics() + self.configuration.if_trigger_update() for decorator in self.decorators: decorator.execute(self.configuration) diff --git a/config.yml b/config.yml index 5015d57..de6747f 100644 --- a/config.yml +++ b/config.yml @@ -9,6 +9,7 @@ endpoint: threshold: 1 - type: REGEX regex: '.*(