|
1 | | -""" |
2 | | -Monitor staging and production Elasticsearch cluster health endpoint. |
3 | | -
|
4 | | -Requests the cluster health and alerts under the following conditions: |
5 | | -
|
6 | | -- Red cluster health |
7 | | -- Unexpected number of nodes |
8 | | -- Unresponsive cluster |
9 | | -
|
10 | | -Additionally, the DAG will notify (rather than alert) when the cluster health is yellow. |
11 | | -Yellow cluster health may or may not be an issue, depending on whether it is expected, |
12 | | -and occurs whenever shards and replicas are being relocated (e.g., during reindexes). |
13 | | -It is worthwhile to notify in these cases, as an assurance, but we could choose to add |
14 | | -logic that ignores yellow cluster health during data refresh or other similar operations. |
15 | | -""" |
16 | | - |
17 | | -import json |
18 | | -import logging |
19 | | -from datetime import datetime |
20 | | -from textwrap import dedent, indent |
| 1 | +from datetime import datetime, timedelta, timezone |
21 | 2 | from typing import Literal |
22 | 3 |
|
23 | 4 | from airflow.decorators import dag, task |
24 | | -from airflow.exceptions import AirflowSkipException |
| 5 | +from airflow.exceptions import AirflowFailException |
| 6 | +from airflow.models import Variable |
| 7 | +from airflow.operators.python import ShortCircuitOperator |
25 | 8 | from airflow.providers.elasticsearch.hooks.elasticsearch import ElasticsearchPythonHook |
26 | | -from elasticsearch import Elasticsearch |
27 | | - |
28 | | -from common.constants import DAG_DEFAULT_ARGS, ENVIRONMENTS, PRODUCTION, Environment |
29 | | -from common.elasticsearch import get_es_host |
30 | | -from common.sensors.utils import is_concurrent_with_any |
31 | | -from common.slack import send_alert, send_message |
32 | | -from legacy_data_refresh.data_refresh_types import DATA_REFRESH_CONFIGS |
33 | | - |
34 | | - |
35 | | -logger = logging.getLogger(__name__) |
36 | | - |
37 | | - |
38 | | -_DAG_ID = "{env}_elasticsearch_cluster_healthcheck" |
39 | | -ES_ICON = ":elasticsearch_bad:" |
40 | | -ES_USERNAME = "{env} ES Cluster (via Airflow)" |
41 | | - |
42 | | -EXPECTED_NODE_COUNT = 6 |
43 | | -EXPECTED_DATA_NODE_COUNT = 3 |
44 | | -EXPECTED_MASTER_NODE_COUNT = 3 |
45 | | -MessageType = Literal["alert", "notification"] |
46 | | - |
47 | | - |
48 | | -def _format_response_body(response_body: dict) -> str: |
49 | | - body_str = indent(json.dumps(response_body, indent=4), prefix=" " * 4) |
50 | | - # body_str is indented in, because the f string added an indentation to |
51 | | - # the front, causing the first curly brace to be incorrectly indented |
52 | | - # and interpolating a multi-line string into the f string led subsequent lines |
53 | | - # to have incorrect indentation (they did not incorporate the f-strings |
54 | | - # own indentation. |
55 | | - # Adding our own indentation using `indent` to match the f-strings |
56 | | - # allows us to correctly dedent later on without issue, with a uniform indentation |
57 | | - # on every line. |
58 | | - return f""" |
59 | | - Full healthcheck response body: |
60 | | - ``` |
61 | | -{body_str} |
62 | | - ``` |
63 | | - """ |
| 9 | +from airflow.utils.trigger_rule import TriggerRule |
64 | 10 |
|
| | +from common.elasticsearch import get_es_host |
| 11 | +from common.slack import send_message |
65 | 12 |
|
66 | | -def _compose_red_status(env: Environment, response_body: dict) -> str: |
67 | | - message = f""" |
68 | | - Elasticsearch {env} cluster status is *red*. |
69 | 13 |
|
70 | | - This is a critical status change, *investigate ASAP*. |
| 14 | +# The name of the Airflow Variable used to track the in-alarm status. |
| 15 | +ELASTICSEARCH_HEALTH_IN_ALARM_VAR = "elasticsearch_health_in_alarm" |
| 16 | +# Time to wait before re-notifying about a continuous failure. |
| 17 | +ALERT_THROTTLE_WINDOW = timedelta(hours=6) |
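| | +# The throttle state can be inspected or reset by hand with the standard Airflow |
| | +# CLI, e.g. `airflow variables get elasticsearch_health_in_alarm` or |
| | +# `airflow variables delete elasticsearch_health_in_alarm`. |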
71 | 18 |
|
72 | | - {_format_response_body(response_body)} |
73 | | - """ |
74 | | - return message |
| 19 | +_DAG_ID = "{env}_elasticsearch_health_check" |
| 20 | +_SCHEDULE = "*/15 * * * *" # Every 15 minutes |
75 | 21 |
|
| 22 | +_SHARED_DAG_ARGS = { |
| 23 | + "schedule": _SCHEDULE, |
| 24 | + "start_date": datetime(2024, 1, 1), |
| 25 | + "catchup": False, |
| 26 | + "doc_md": """ |
| 27 | + ### Elasticsearch Health Check |
76 | 28 |
|
77 | | -def _compose_unexpected_node_count(env: Environment, response_body: dict) -> str: |
78 | | - node_count = response_body["number_of_nodes"] |
79 | | - data_node_count = response_body["number_of_data_nodes"] |
80 | | - master_node_count = node_count - data_node_count |
| 29 | + This DAG checks the health of the Elasticsearch cluster every 15 minutes. |
81 | 30 |
|
82 | | - message = f""" |
83 | | - Elasticsearch {env} cluster node count is *{node_count}*. |
84 | | - Expected {EXPECTED_NODE_COUNT} total nodes. |
| 31 | +    On failure, it sends a Slack alert. To prevent alert fatigue, alerts are |
| 32 | +    throttled via an Airflow Variable: a new alert is only sent if the previous |
| 33 | +    one was sent more than `ALERT_THROTTLE_WINDOW` (6 hours) ago. |
85 | 34 |
|
86 | | - Master nodes: *{master_node_count}* of expected {EXPECTED_MASTER_NODE_COUNT} |
87 | | - Data nodes: *{data_node_count}* of expected {EXPECTED_DATA_NODE_COUNT} |
| 35 | + On success, it clears the 'in-alarm' Variable. |
| 36 | + """, |
| 37 | + "tags": ["elasticsearch", "maintenance", "monitoring"], |
| 38 | +} |
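| | +# Worked example of the throttle window: with ALERT_THROTTLE_WINDOW = 6 hours, a |
| | +# failure at 09:00 UTC alerts and stores that timestamp; failures at 10:00 and 14:00 |
| | +# are throttled; a failure at 15:30 alerts again and refreshes the stored timestamp. |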
88 | 39 |
|
89 | | - This is a critical status change, *investigate ASAP*. |
90 | | - If this is expected (e.g., during controlled node or cluster changes), acknowledge immediately with explanation. |
91 | 40 |
|
92 | | - {_format_response_body(response_body)} |
| 41 | +# Helper functions for the throttling logic |
| 42 | +def _should_alert() -> bool: |
93 | 43 | """ |
94 | | - logger.error(f"Unexpected node count; {json.dumps(response_body)}") |
95 | | - return message |
96 | | - |
97 | | - |
98 | | -def _compose_yellow_cluster_health(env: Environment, response_body: dict) -> str: |
99 | | - message = f""" |
100 | | - Elasticsearch {env} cluster health is *yellow*. |
| 44 | + Check if an alert for a failing cluster should be sent. |
101 | 45 |
|
102 | | - This does not mean something is necessarily wrong, but if this is not expected (e.g., data refresh) then investigate cluster health now. |
| 46 | + An alert is throttled if an 'in-alarm' variable exists and was set within the |
| 47 | + ALERT_THROTTLE_WINDOW. |
103 | 48 |
|
104 | | - {_format_response_body(response_body)} |
| 49 | + :return: True if the alert should be sent (not throttled), False otherwise. |
105 | 50 | """ |
106 | | - logger.info(f"Cluster health was yellow; {json.dumps(response_body)}") |
107 | | - return message |
| 51 | + last_alert_str = Variable.get(ELASTICSEARCH_HEALTH_IN_ALARM_VAR, default_var=None) |
108 | 52 |
|
| 53 | + if not last_alert_str: |
| 54 | +        # No Variable exists yet, so this is the first observed failure; do not throttle. |
| 55 | + print("No existing alarm Variable. Alerting.") |
| 56 | + return True |
109 | 57 |
|
110 | | -@task |
111 | | -def ping_healthcheck(env: str, es_host: str) -> dict: |
112 | | - es_conn: Elasticsearch = ElasticsearchPythonHook(hosts=[es_host]).get_conn |
| 58 | + last_alert_ts = datetime.fromisoformat(last_alert_str) |
| 59 | + time_since_last_alert = datetime.now(timezone.utc) - last_alert_ts |
113 | 60 |
|
114 | | - response = es_conn.cluster.health() |
115 | | - |
116 | | - return response.body |
117 | | - |
118 | | - |
119 | | -@task |
120 | | -def compose_notification( |
121 | | - env: Environment, response_body: dict, is_data_refresh_running: bool |
122 | | -) -> tuple[MessageType, str]: |
123 | | - status = response_body["status"] |
| 61 | + if time_since_last_alert > ALERT_THROTTLE_WINDOW: |
| 62 | +        # Enough time has passed since the last alert, so alert again (not throttled). |
| 63 | + print( |
| 64 | + f"Last alert was at {last_alert_ts}. Throttling window has passed. " |
| 65 | + "Alerting." |
| 66 | + ) |
| 67 | + return True |
| 68 | + else: |
| 69 | +        # Too soon since the last alert, so do not send another one (throttled). |
| 70 | + print(f"Last alert was at {last_alert_ts}. Alert is throttled.") |
| 71 | + return False |
124 | 72 |
|
125 | | - if status == "red": |
126 | | - return "alert", _compose_red_status(env, response_body) |
127 | 73 |
|
128 | | - if response_body["number_of_nodes"] != EXPECTED_NODE_COUNT: |
129 | | - return "alert", _compose_unexpected_node_count(env, response_body) |
| 74 | +def _set_alarm_variable(): |
| 75 | + """ |
| 76 | + Set the 'in-alarm' variable with the current UTC timestamp. |
130 | 77 |
|
131 | | - if status == "yellow": |
132 | | - if is_data_refresh_running and env == PRODUCTION: |
133 | | - raise AirflowSkipException( |
134 | | - "Production cluster health status is yellow during data refresh. " |
135 | | - "This is an expected state, so no alert is sent." |
136 | | - ) |
| 78 | + This is called after a failure alert is sent to begin the throttling window. |
| 79 | + """ |
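| | +    # Store a timezone-aware ISO-8601 string (e.g. "2024-01-01T09:00:00+00:00") so |
| | +    # that datetime.fromisoformat() in the throttle check round-trips the tzinfo. |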
| 80 | + now_iso = datetime.now(timezone.utc).isoformat() |
| 81 | + Variable.set(ELASTICSEARCH_HEALTH_IN_ALARM_VAR, now_iso) |
| 82 | + print(f"Set {ELASTICSEARCH_HEALTH_IN_ALARM_VAR} to {now_iso}.") |
137 | 83 |
|
138 | | - return "notification", _compose_yellow_cluster_health(env, response_body) |
139 | 84 |
|
140 | | - raise AirflowSkipException(f"Cluster health is green; {json.dumps(response_body)}") |
| 85 | +def _clear_alarm_variable(): |
| 86 | + """ |
| 87 | + Delete the 'in-alarm' variable. |
141 | 88 |
|
| 89 | + This is called when the cluster health check succeeds, resetting the alert mechanism. |
| 90 | + """ |
| 91 | + Variable.delete(ELASTICSEARCH_HEALTH_IN_ALARM_VAR) |
| 92 | + print(f"Cleared {ELASTICSEARCH_HEALTH_IN_ALARM_VAR}.") |
142 | 93 |
|
143 | | -@task |
144 | | -def notify(env: str, message_type_and_string: tuple[MessageType, str]): |
145 | | - message_type, message = message_type_and_string |
146 | 94 |
|
147 | | - message_kwargs = { |
148 | | - "dag_id": _DAG_ID.format(env=env), |
149 | | - "username": ES_USERNAME.format(env=env.title()), |
150 | | - "icon_emoji": ES_ICON, |
151 | | - } |
| 95 | +def create_es_health_check_dag(env: Literal["prod", "staging"]): |
| 96 | + """Create the Elasticsearch health check DAG for a given environment.""" |
152 | 97 |
|
153 | | - if message_type == "alert": |
154 | | - send_alert(dedent(message), **message_kwargs) |
155 | | - elif message_type == "notification": |
156 | | - send_message(dedent(message), **message_kwargs) |
157 | | - else: |
158 | | - raise ValueError( |
159 | | - f"Invalid message_type. Expected 'alert' or 'notification', " |
160 | | - f"received {message_type}" |
| 98 | + @dag(dag_id=_DAG_ID.format(env=env), **_SHARED_DAG_ARGS) |
| 99 | + def es_health_check_dag(): |
| 100 | + # This is the primary task. It will fail if the ES cluster is unhealthy. |
| 101 | + @task |
| 102 | +        def check_es_health(es_host: str): |
| 103 | +            # The hook takes a list of hosts and exposes the Elasticsearch client |
| 104 | +            # via its cached `get_conn` property (not a method or a conn id kwarg). |
| 105 | +            hook = ElasticsearchPythonHook(hosts=[es_host]) |
| 106 | +            health = hook.get_conn.cluster.health() |
| 107 | + print(health) |
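| | +            # A "yellow" status (unassigned replicas, e.g. during a reindex) is treated |
| | +            # as healthy here; only "red" or an unexpected status fails the task. |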
| 108 | + if health["status"] not in ("green", "yellow"): |
| 109 | + raise AirflowFailException(f"ES cluster status was {health['status']}!") |
| 110 | + |
| 111 | +        # Resolve the cluster host and instantiate the main health check task. |
| 112 | +        health_check = check_es_health(get_es_host(env)) |
| 113 | + |
| 114 | + # Success path: If the health check succeeds, clear the alarm variable. |
| 115 | + # This task uses the default trigger_rule=TriggerRule.ALL_SUCCESS |
| 116 | + clear_alarm = task(python_callable=_clear_alarm_variable) |
| 117 | + clear_alarm_task = clear_alarm() |
| 118 | + |
| 119 | + # Failure path: These tasks only run if the health check fails. |
| 120 | + # 1. Check if we should send an alert or if it's throttled. |
| 121 | + check_throttle = ShortCircuitOperator( |
| 122 | +            task_id="check_alert_throttle", |
| 123 | +            python_callable=_should_alert, |
| 124 | + trigger_rule=TriggerRule.ALL_FAILED, # Only run on failure of upstream |
161 | 125 | ) |
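| | +        # When the callable returns False, the ShortCircuitOperator skips all of its |
| | +        # downstream tasks, so a throttled failure sends no Slack message and leaves |
| | +        # the stored timestamp untouched. |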
162 | 126 |
|
| 127 | + # 2. Send the actual Slack alert. |
| 128 | + @task |
| 129 | + def notify_failure(): |
| 130 | + send_message( |
| 131 | + f"❌ {env.title()} Elasticsearch cluster health check failed.", |
| 132 | + dag_id=_DAG_ID.format(env=env), |
| 133 | + ) |
163 | 134 |
|
164 | | -_SHARED_DAG_ARGS = { |
165 | | - # Every 15 minutes |
166 | | - "schedule": "*/15 * * * *", |
167 | | - "start_date": datetime(2024, 2, 4), |
168 | | - "catchup": False, |
169 | | - "max_active_runs": 1, |
170 | | - "doc_md": __doc__, |
171 | | - "tags": ["elasticsearch", "monitoring"], |
172 | | - "default_args": DAG_DEFAULT_ARGS, |
173 | | -} |
174 | | - |
175 | | - |
176 | | -_DATA_REFRESH_DAG_IDS = [] |
177 | | -for config in DATA_REFRESH_CONFIGS.values(): |
178 | | - _DATA_REFRESH_DAG_IDS += [config.dag_id, config.filtered_index_dag_id] |
| 135 | + notify_failure_task = notify_failure() |
179 | 136 |
|
| 137 | + # 3. Set the alarm variable to start the throttling window. |
| 138 | + set_alarm = task(python_callable=_set_alarm_variable) |
| 139 | + set_alarm_task = set_alarm() |
180 | 140 |
|
181 | | -for env in ENVIRONMENTS: |
| 141 | + # Define task dependencies |
| 142 | + health_check >> clear_alarm_task |
| 143 | + health_check >> check_throttle >> notify_failure_task >> set_alarm_task |
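| | +        # Resulting task graph (sketch): |
| | +        #   health_check --(success)--> clear_alarm_task |
| | +        #   health_check --(failure)--> check_throttle --> notify_failure_task --> set_alarm_task |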
182 | 144 |
|
183 | | - @dag(dag_id=_DAG_ID.format(env=env), **_SHARED_DAG_ARGS) |
184 | | - def cluster_healthcheck_dag(): |
185 | | - is_data_refresh_running = is_concurrent_with_any(_DATA_REFRESH_DAG_IDS) |
| 145 | + return es_health_check_dag() |
186 | 146 |
|
187 | | - es_host = get_es_host(env) |
188 | | - healthcheck_response = ping_healthcheck(env, es_host) |
189 | | - notification = compose_notification( |
190 | | - env, healthcheck_response, is_data_refresh_running |
191 | | - ) |
192 | | - es_host >> healthcheck_response >> notification >> notify(env, notification) |
193 | 147 |
|
194 | | - cluster_healthcheck_dag() |
| 148 | +# Generate the DAG for each environment |
| 149 | +for env_name in ("prod", "staging"): |
| 150 | + create_es_health_check_dag(env_name) |
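| | +# This registers two DAGs, "prod_elasticsearch_health_check" and |
| | +# "staging_elasticsearch_health_check", both on the */15 schedule from _SHARED_DAG_ARGS. |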