From 03d91943982276792360ffca406dab4521f9d846 Mon Sep 17 00:00:00 2001 From: muthu-mps Date: Mon, 22 Sep 2025 11:08:55 +0530 Subject: [PATCH 01/12] Add alerting rule templates --- packages/azure_openai/changelog.yml | 5 +++ .../alerting_rule_template/latency_spike.json | 38 +++++++++++++++++++ .../provisioned_utilization.json | 38 +++++++++++++++++++ .../quota_error_rates.json | 38 +++++++++++++++++++ packages/azure_openai/manifest.yml | 4 +- 5 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 packages/azure_openai/kibana/alerting_rule_template/latency_spike.json create mode 100644 packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json create mode 100644 packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json diff --git a/packages/azure_openai/changelog.yml b/packages/azure_openai/changelog.yml index 5dd4f44fb67..bdce41e0628 100644 --- a/packages/azure_openai/changelog.yml +++ b/packages/azure_openai/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "1.10.0" + changes: + - description: Add Alerting Rule Templates. + type: enhancement + link: https://github.com/elastic/integrations/pull/15434 - version: "1.9.0" changes: - description: Add a flag `fips_compatible` to control whether the package is allowed in the ECH FedRAMP High environment. diff --git a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json new file mode 100644 index 00000000000..cabef4c93d5 --- /dev/null +++ b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json @@ -0,0 +1,38 @@ +{ + "id": "latency_spike", + "type": "alerting_rule_template", + "attributes": { + "name": "Latency Spike", + "tags": ["Azure AI Foundry", + "Azure AI Foundry Latency"], + "ruleTypeId": ".es-query", + "schedule": { + "interval": "1m" + }, + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "size": 100, + "esqlQuery": { + "esql": "FROM metrics-azure.ai_foundry-default\n| STATS time_to_response = AVG(azure.ai_foundry.time_to_response.avg) by azure.dimensions.model_deployment_name\n| WHERE time_to_response > 5000 and event.dataset == \\\"azure.ai_foundry\\\"" + }, + "aggType": "count", + "groupBy": "all", + "termSize": 5, + "sourceFields": [], + "timeField": "@timestamp", + "excludeHitsFromPreviousRun": true + }, + "alertDelay": { + "active": 1 + } + }, + "managed": true, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" +} \ No newline at end of file diff --git a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json b/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json new file mode 100644 index 00000000000..c50a4ff81ae --- /dev/null +++ b/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json @@ -0,0 +1,38 @@ +{ + "id": "provisioned_utilization", + "type": "alerting_rule_template", + "attributes": { + "name": "Provisioned Utlization", + "tags": ["Azure AI Foundry", + "Azure AI Foundry Provisioned Utilization"], + "ruleTypeId": ".es-query", + "schedule": { + "interval": "1m" + }, + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "size": 100, + "esqlQuery": { + "esql": "FROM metrics-azure.ai_foundry-default\\n| STATS provisioned_utilization = 
AVG(azure.ai_foundry.provisioned_utilization.avg) by azure.dimensions.model_deployment_name\\n| WHERE provisioned_utilization > 85 and event.dataset == \\\"azure.ai_foundry\\\"" }, + "aggType": "count", + "groupBy": "all", + "termSize": 5, + "sourceFields": [], + "timeField": "@timestamp", + "excludeHitsFromPreviousRun": true + }, + "alertDelay": { + "active": 1 + } + }, + "managed": true, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" +} \ No newline at end of file diff --git a/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json b/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json new file mode 100644 index 00000000000..daa644676fd --- /dev/null +++ b/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json @@ -0,0 +1,38 @@ +{ + "id": "quota_error_rates", + "type": "alerting_rule_template", + "attributes": { + "name": "Quota Error Rates", + "tags": ["Azure AI Foundry", + "Azure AI Foundry Quota Error Rates"], + "ruleTypeId": ".es-query", + "schedule": { + "interval": "1m" + }, + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "threshold": [ + 0 + ], + "thresholdComparator": "<", + "size": 100, + "esqlQuery": { + "esql": "FROM logs-azure_openai.logs-default\n| STATS quota_error = COUNT(http.response.status_code == 429) by azure.dimensions.model_deployment_name\n| WHERE quota_error > 0" + }, + "aggType": "count", + "groupBy": "all", + "termSize": 5, + "sourceFields": [], + "timeField": "@timestamp", + "excludeHitsFromPreviousRun": true + }, + "alertDelay": { + "active": 1 + } + }, + "managed": true, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" +} \ No newline at end of file diff --git a/packages/azure_openai/manifest.yml b/packages/azure_openai/manifest.yml index 96cefd5dfca..473971e3d69 100644 --- a/packages/azure_openai/manifest.yml +++ b/packages/azure_openai/manifest.yml @@ -1,7 +1,7 @@ -format_version: 3.1.3 +format_version: 3.5.0 name: azure_openai title: "Azure OpenAI" -version: "1.9.0" +version: "1.10.0" source: license: "Elastic-2.0" description: "Collects Azure OpenAI Logs and Metrics" From 3331bd1cff8ce18967a609a705e8904e641538d1 Mon Sep 17 00:00:00 2001 From: muthu-mps Date: Mon, 22 Sep 2025 11:11:17 +0530 Subject: [PATCH 02/12] update changelog entry --- packages/azure_openai/changelog.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/azure_openai/changelog.yml b/packages/azure_openai/changelog.yml index bdce41e0628..d1c8860102e 100644 --- a/packages/azure_openai/changelog.yml +++ b/packages/azure_openai/changelog.yml @@ -3,7 +3,7 @@ changes: - description: Add Alerting Rule Templates. type: enhancement - link: https://github.com/elastic/integrations/pull/15434 + link: https://github.com/elastic/integrations/pull/15412 - version: "1.9.0" changes: - description: Add a flag `fips_compatible` to control whether the package is allowed in the ECH FedRAMP High environment. 
From 3515dfa847189c5282d620bd00ec7f5c61bbd952 Mon Sep 17 00:00:00 2001 From: muthu-mps Date: Fri, 10 Oct 2025 11:10:01 +0530 Subject: [PATCH 03/12] update max and multiply percent field with 100 --- .../kibana/alerting_rule_template/latency_spike.json | 8 ++++---- .../alerting_rule_template/provisioned_utilization.json | 8 ++++---- .../kibana/alerting_rule_template/quota_error_rates.json | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json index cabef4c93d5..45850b765da 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json +++ b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json @@ -2,9 +2,9 @@ "id": "latency_spike", "type": "alerting_rule_template", "attributes": { - "name": "Latency Spike", - "tags": ["Azure AI Foundry", - "Azure AI Foundry Latency"], + "name": "[Azure OpenAI] Latency Spike", + "tags": ["Azure OpenAI", + "Latency Spike"], "ruleTypeId": ".es-query", "schedule": { "interval": "1m" @@ -19,7 +19,7 @@ "thresholdComparator": ">", "size": 100, "esqlQuery": { - "esql": "FROM metrics-azure.ai_foundry-default\n| STATS time_to_response = AVG(azure.ai_foundry.time_to_response.avg) by azure.dimensions.model_deployment_name\n| WHERE time_to_response > 5000 and event.dataset == \\\"azure.ai_foundry\\\"" + "esql": "FROM metrics-azure.ai_foundry-default\n| STATS time_to_response = AVG(azure.ai_foundry.time_to_response.avg) by azure.dimensions.model_deployment_name\n| WHERE time_to_response > 5000" }, "aggType": "count", "groupBy": "all", diff --git a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json b/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json index c50a4ff81ae..c015a7247a6 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json +++ b/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json @@ -2,9 +2,9 @@ "id": "provisioned_utilization", "type": "alerting_rule_template", "attributes": { - "name": "Provisioned Utlization", - "tags": ["Azure AI Foundry", - "Azure AI Foundry Provisioned Utilization"], + "name": "[Azure OpenAI] Provisioned Utilization", + "tags": ["Azure OpenAI", + "Provisioned Utilization"], "ruleTypeId": ".es-query", "schedule": { "interval": "1m" @@ -19,7 +19,7 @@ "thresholdComparator": ">", "size": 100, "esqlQuery": { - "esql": "FROM metrics-azure.ai_foundry-default\\n| STATS provisioned_utilization = AVG(azure.ai_foundry.provisioned_utilization.avg) by azure.dimensions.model_deployment_name\\n| WHERE provisioned_utilization > 85 and event.dataset == \\\"azure.ai_foundry\\\"" + "esql": "FROM metrics-azure.ai_foundry-default\\n| STATS provisioned_utilization = MAX(azure.ai_foundry.provisioned_utilization.avg) * 100\n by azure.dimensions.model_deployment_name\\n| WHERE provisioned_utilization > 85" }, "aggType": "count", "groupBy": "all", diff --git a/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json b/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json index daa644676fd..3e9b7a6bfa5 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json +++ b/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json @@ -2,9 +2,9 @@ "id": "quota_error_rates", "type": "alerting_rule_template", "attributes": { - "name": "Quota Error Rates", - "tags": 
["Azure AI Foundry", - "Azure AI Foundry Quota Error Rates"], + "name": "[Azure OpenAI] Quota Error Rates", + "tags": ["Azure OpenAI", + "Quota Error Rates"], "ruleTypeId": ".es-query", "schedule": { "interval": "1m" From 39aba3d37853b560de49edf64fff85857b118050 Mon Sep 17 00:00:00 2001 From: muthu-mps Date: Fri, 10 Oct 2025 14:35:24 +0530 Subject: [PATCH 04/12] address review comments --- .../kibana/alerting_rule_template/latency_spike.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json index 45850b765da..4d49d774d20 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json +++ b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json @@ -4,7 +4,7 @@ "attributes": { "name": "[Azure OpenAI] Latency Spike", "tags": ["Azure OpenAI", - "Latency Spike"], + "Latency"], "ruleTypeId": ".es-query", "schedule": { "interval": "1m" From 09ab140c18fa612d3e5da5615065044a0c003e78 Mon Sep 17 00:00:00 2001 From: muthu-mps Date: Fri, 10 Oct 2025 16:31:16 +0530 Subject: [PATCH 05/12] fix datastream name --- .../kibana/alerting_rule_template/latency_spike.json | 2 +- .../kibana/alerting_rule_template/provisioned_utilization.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json index 4d49d774d20..3809147ccf6 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json +++ b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json @@ -19,7 +19,7 @@ "thresholdComparator": ">", "size": 100, "esqlQuery": { - "esql": "FROM metrics-azure.ai_foundry-default\n| STATS time_to_response = AVG(azure.ai_foundry.time_to_response.avg) by azure.dimensions.model_deployment_name\n| WHERE time_to_response > 5000" + "esql": "FROM metrics-azure.open_ai-default\n| STATS time_to_response = AVG(azure.open_ai.time_to_response.avg) by azure.dimensions.model_deployment_name\n| WHERE time_to_response > 5000" }, "aggType": "count", "groupBy": "all", diff --git a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json b/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json index c015a7247a6..b1896663076 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json +++ b/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json @@ -19,7 +19,7 @@ "thresholdComparator": ">", "size": 100, "esqlQuery": { - "esql": "FROM metrics-azure.ai_foundry-default\\n| STATS provisioned_utilization = MAX(azure.ai_foundry.provisioned_utilization.avg) * 100\n by azure.dimensions.model_deployment_name\\n| WHERE provisioned_utilization > 85" + "esql": "FROM metrics-azure.open_ai-default\n| STATS provisioned_utilization = MAX(azure.open_ai.provisioned_managed_utilization_v2.avg) * 100\n by azure.dimensions.model_deployment_name\n| WHERE provisioned_utilization > 85" }, "aggType": "count", "groupBy": "all", From 723017a1162c17207488723889bcedde4efb6012 Mon Sep 17 00:00:00 2001 From: muthu-mps Date: Mon, 20 Oct 2025 09:29:37 +0530 Subject: [PATCH 06/12] update groupby row --- .../kibana/alerting_rule_template/latency_spike.json | 4 ++-- .../alerting_rule_template/provisioned_utilization.json | 4 ++-- .../kibana/alerting_rule_template/quota_error_rates.json | 4 
++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json index 3809147ccf6..ca55605f442 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json +++ b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json @@ -2,7 +2,7 @@ "id": "latency_spike", "type": "alerting_rule_template", "attributes": { - "name": "[Azure OpenAI] Latency Spike", + "name": "[Azure OpenAI] Latency high", "tags": ["Azure OpenAI", "Latency"], "ruleTypeId": ".es-query", @@ -22,7 +22,7 @@ "esql": "FROM metrics-azure.open_ai-default\n| STATS time_to_response = AVG(azure.open_ai.time_to_response.avg) by azure.dimensions.model_deployment_name\n| WHERE time_to_response > 5000" }, "aggType": "count", - "groupBy": "all", + "groupBy": "row", "termSize": 5, "sourceFields": [], "timeField": "@timestamp", diff --git a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json b/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json index b1896663076..8f7f1f79c7e 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json +++ b/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json @@ -2,7 +2,7 @@ "id": "provisioned_utilization", "type": "alerting_rule_template", "attributes": { - "name": "[Azure OpenAI] Provisioned Utilization", + "name": "[Azure OpenAI] Provisioned Utilization above threshold", "tags": ["Azure OpenAI", "Provisioned Utilization"], "ruleTypeId": ".es-query", @@ -22,7 +22,7 @@ "esql": "FROM metrics-azure.open_ai-default\n| STATS provisioned_utilization = MAX(azure.open_ai.provisioned_managed_utilization_v2.avg) * 100\n by azure.dimensions.model_deployment_name\n| WHERE provisioned_utilization > 85" }, "aggType": "count", - "groupBy": "all", + "groupBy": "row", "termSize": 5, "sourceFields": [], "timeField": "@timestamp", diff --git a/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json b/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json index 3e9b7a6bfa5..6f069b69347 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json +++ b/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json @@ -2,7 +2,7 @@ "id": "quota_error_rates", "type": "alerting_rule_template", "attributes": { - "name": "[Azure OpenAI] Quota Error Rates", + "name": "[Azure OpenAI] Quota Error Rates above threshold", "tags": ["Azure OpenAI", "Quota Error Rates"], "ruleTypeId": ".es-query", @@ -22,7 +22,7 @@ "esql": "FROM logs-azure_openai.logs-default\n| STATS quota_error = COUNT(http.response.status_code == 429) by azure.dimensions.model_deployment_name\n| WHERE quota_error > 0" }, "aggType": "count", - "groupBy": "all", + "groupBy": "row", "termSize": 5, "sourceFields": [], "timeField": "@timestamp", From 0f66cc0fc102baffc883ae0efc64778709b9338f Mon Sep 17 00:00:00 2001 From: muthu-mps Date: Wed, 5 Nov 2025 16:46:40 +0530 Subject: [PATCH 07/12] update rules with comments --- .../alerting_rule_template/latency_spike.json | 16 +++------------- .../provisioned_utilization.json | 16 +++------------- .../quota_error_rates.json | 16 +++------------- 3 files changed, 9 insertions(+), 39 deletions(-) diff --git a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json index 
ca55605f442..61ee44b8575 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json +++ b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json @@ -3,8 +3,7 @@ "type": "alerting_rule_template", "attributes": { "name": "[Azure OpenAI] Latency high", - "tags": ["Azure OpenAI", - "Latency"], + "tags": ["Azure OpenAI"], "ruleTypeId": ".es-query", "schedule": { "interval": "1m" @@ -13,20 +12,11 @@ "searchType": "esqlQuery", "timeWindowSize": 15, "timeWindowUnit": "m", - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "size": 100, "esqlQuery": { - "esql": "FROM metrics-azure.open_ai-default\n| STATS time_to_response = AVG(azure.open_ai.time_to_response.avg) by azure.dimensions.model_deployment_name\n| WHERE time_to_response > 5000" + "esql": "// The recommended threshold value for response latency is > 5 seconds, and the alerting rule is grouped by Model Name. You can adjust the threshold value by modifying the time_to_response in the WHERE clause, which is specified in milliseconds. \nFROM metrics-azure.open_ai-default\n| STATS time_to_response = AVG(azure.open_ai.time_to_response.avg) by azure.dimensions.model_deployment_name\n| WHERE time_to_response > 5000" }, - "aggType": "count", "groupBy": "row", - "termSize": 5, - "sourceFields": [], - "timeField": "@timestamp", - "excludeHitsFromPreviousRun": true + "timeField": "@timestamp" }, "alertDelay": { "active": 1 diff --git a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json b/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json index 8f7f1f79c7e..9df79520ff3 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json +++ b/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json @@ -3,8 +3,7 @@ "type": "alerting_rule_template", "attributes": { "name": "[Azure OpenAI] Provisioned Utilization above threshold", - "tags": ["Azure OpenAI", - "Provisioned Utilization"], + "tags": ["Azure OpenAI"], "ruleTypeId": ".es-query", "schedule": { "interval": "1m" @@ -13,20 +12,11 @@ "searchType": "esqlQuery", "timeWindowSize": 15, "timeWindowUnit": "m", - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "size": 100, "esqlQuery": { - "esql": "FROM metrics-azure.open_ai-default\n| STATS provisioned_utilization = MAX(azure.open_ai.provisioned_managed_utilization_v2.avg) * 100\n by azure.dimensions.model_deployment_name\n| WHERE provisioned_utilization > 85" + "esql": "// The recommended threshold value for provisioned utilization is > 85%, and the alerting rule is grouped by Model Name. You can adjust the threshold value by modifying the provisioned_utilization in the WHERE clause, which is specified in percent. 
\n FROM metrics-azure.open_ai-default\n| STATS provisioned_utilization = MAX(azure.open_ai.provisioned_managed_utilization_v2.avg) * 100\n by azure.dimensions.model_deployment_name\n| WHERE provisioned_utilization > 85" }, - "aggType": "count", "groupBy": "row", - "termSize": 5, - "sourceFields": [], - "timeField": "@timestamp", - "excludeHitsFromPreviousRun": true + "timeField": "@timestamp" }, "alertDelay": { "active": 1 diff --git a/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json b/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json index 6f069b69347..0752ccb4b29 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json +++ b/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json @@ -3,8 +3,7 @@ "type": "alerting_rule_template", "attributes": { "name": "[Azure OpenAI] Quota Error Rates above threshold", - "tags": ["Azure OpenAI", - "Quota Error Rates"], + "tags": ["Azure OpenAI"], "ruleTypeId": ".es-query", "schedule": { "interval": "1m" @@ -13,20 +12,11 @@ "searchType": "esqlQuery", "timeWindowSize": 15, "timeWindowUnit": "m", - "threshold": [ - 0 - ], - "thresholdComparator": "<", - "size": 100, "esqlQuery": { - "esql": "FROM logs-azure_openai.logs-default\n| STATS quota_error = COUNT(http.response.status_code == 429) by azure.dimensions.model_deployment_name\n| WHERE quota_error > 0" + "esql": "// The recommended threshold value for quota errors is > 0, and the alerting rule is grouped by Model Name. You can adjust the threshold value by modifying the quota_error count in the WHERE clause. \n FROM logs-azure_openai.logs-default\n| STATS quota_error = COUNT(http.response.status_code == 429) by azure.dimensions.model_deployment_name\n| WHERE quota_error > 0" }, - "aggType": "count", "groupBy": "row", - "termSize": 5, - "sourceFields": [], - "timeField": "@timestamp", - "excludeHitsFromPreviousRun": true + "timeField": "@timestamp" }, "alertDelay": { "active": 1 From 1087ffe564facfb11ce09ee5c99cc94c6b3ff05a Mon Sep 17 00:00:00 2001 From: muthu-mps Date: Wed, 12 Nov 2025 14:38:52 +0530 Subject: [PATCH 08/12] update description --- .../kibana/alerting_rule_template/latency_spike.json | 2 +- .../kibana/alerting_rule_template/provisioned_utilization.json | 2 +- .../kibana/alerting_rule_template/quota_error_rates.json | 2 +- packages/azure_openai/manifest.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json index 61ee44b8575..4b6c5c5ad04 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json +++ b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json @@ -13,7 +13,7 @@ "timeWindowSize": 15, "timeWindowUnit": "m", "esqlQuery": { - "esql": "// The recommended threshold value for response latency is > 5 seconds, and the alerting rule is grouped by Model Name. You can adjust the threshold value by modifying the time_to_response in the WHERE clause, which is specified in milliseconds. 
\nFROM metrics-azure.open_ai-default\n| STATS time_to_response = AVG(azure.open_ai.time_to_response.avg) by azure.dimensions.model_deployment_name\n| WHERE time_to_response > 5000" + "esql": "// Alert triggers when the response latency exceeds the recommended threshold value {5 secs} within the look back time window.\n// The alert is grouped by Model Name.\n// You can adjust the threshold value by modifying the time_to_response in the WHERE clause, which is specified in milliseconds. \nFROM metrics-azure.open_ai-default\n| STATS time_to_response = AVG(azure.open_ai.time_to_response.avg) by azure.dimensions.model_deployment_name\n| WHERE time_to_response > 5000" }, "groupBy": "row", "timeField": "@timestamp" diff --git a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json b/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json index 9df79520ff3..c3e2085dee1 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json +++ b/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json @@ -13,7 +13,7 @@ "timeWindowSize": 15, "timeWindowUnit": "m", "esqlQuery": { - "esql": "// The recommended threshold value for provisioned utilization is > 85%, and the alerting rule is grouped by Model Name. You can adjust the threshold value by modifying the provisioned_utilization in the WHERE clause, which is specified in percent. \n FROM metrics-azure.open_ai-default\n| STATS provisioned_utilization = MAX(azure.open_ai.provisioned_managed_utilization_v2.avg) * 100\n by azure.dimensions.model_deployment_name\n| WHERE provisioned_utilization > 85" + "esql": "// Alert triggers when the provisioned utilization exceeds the recommended threshold value {85%} within the look back time window.\n// The alert is grouped by Model Name.\n// You can adjust the threshold value by modifying the provisioned_utilization in the WHERE clause, which is specified in percent. \n FROM metrics-azure.open_ai-default\n| STATS provisioned_utilization = MAX(azure.open_ai.provisioned_managed_utilization_v2.avg) * 100\n by azure.dimensions.model_deployment_name\n| WHERE provisioned_utilization > 85" }, "groupBy": "row", "timeField": "@timestamp" diff --git a/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json b/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json index 0752ccb4b29..f0a0220e307 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json +++ b/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json @@ -13,7 +13,7 @@ "timeWindowSize": 15, "timeWindowUnit": "m", "esqlQuery": { - "esql": "// The recommended threshold value for quota errors is > 0, and the alerting rule is grouped by Model Name. You can adjust the threshold value by modifying the quota_error count in the WHERE clause. \n FROM logs-azure_openai.logs-default\n| STATS quota_error = COUNT(http.response.status_code == 429) by azure.dimensions.model_deployment_name\n| WHERE quota_error > 0" + "esql": "// Alert triggers when the quota_error is greater than recommended threshold value {0} within the look back time window.\n// The alert is grouped by Model Name.\n// You can adjust the threshold value by modifying the quota_error count in the WHERE clause. 
\n FROM logs-azure_openai.logs-default\n| STATS quota_error = COUNT(http.response.status_code == 429) by azure.dimensions.model_deployment_name\n| WHERE quota_error > 0" }, "groupBy": "row", "timeField": "@timestamp" diff --git a/packages/azure_openai/manifest.yml b/packages/azure_openai/manifest.yml index 473971e3d69..cc704ed4c6d 100644 --- a/packages/azure_openai/manifest.yml +++ b/packages/azure_openai/manifest.yml @@ -1,4 +1,4 @@ -format_version: 3.5.0 +format_version: 3.4.0 name: azure_openai title: "Azure OpenAI" version: "1.10.0" From 9e5a575c430c521f68cfe1c064f06eb8dd39f28b Mon Sep 17 00:00:00 2001 From: muthu-mps Date: Fri, 14 Nov 2025 14:44:07 +0530 Subject: [PATCH 09/12] update query --- .../kibana/alerting_rule_template/latency_spike.json | 6 +++--- .../alerting_rule_template/provisioned_utilization.json | 6 +++--- .../kibana/alerting_rule_template/quota_error_rates.json | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json index 4b6c5c5ad04..2c41d12f641 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json +++ b/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json @@ -10,16 +10,16 @@ }, "params": { "searchType": "esqlQuery", - "timeWindowSize": 15, + "timeWindowSize": 10, "timeWindowUnit": "m", "esqlQuery": { - "esql": "// Alert triggers when the response latency exceeds the recommended threshold value {5 secs} within the look back time window.\n// The alert is grouped by Model Name.\n// You can adjust the threshold value by modifying the time_to_response in the WHERE clause, which is specified in milliseconds. \nFROM metrics-azure.open_ai-default\n| STATS time_to_response = AVG(azure.open_ai.time_to_response.avg) by azure.dimensions.model_deployment_name\n| WHERE time_to_response > 5000" + "esql": "// Alert triggers when the response latency exceeds the recommended threshold value {5000ms} within the look back time window.\n// The alert is grouped by Model Deployment Name.\n// You can adjust the threshold value by modifying the time_to_response in the WHERE clause, which is specified in milliseconds.\nFROM metrics-azure.open_ai-default\n| KEEP azure.open_ai.time_to_response.avg, azure.dimensions.model_deployment_name, @timestamp\n| WHERE azure.dimensions.model_deployment_name IS NOT NULL\n| STATS time_to_response = MAX(azure.open_ai.time_to_response.avg) BY azure.dimensions.model_deployment_name\n| WHERE time_to_response > 5000\n| EVAL time_to_response = ROUND(time_to_response, 2)\n| SORT time_to_response DESC" }, "groupBy": "row", "timeField": "@timestamp" }, "alertDelay": { - "active": 1 + "active": 2 } }, "managed": true, diff --git a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json b/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json index c3e2085dee1..8fe2562f53e 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json +++ b/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json @@ -10,16 +10,16 @@ }, "params": { "searchType": "esqlQuery", - "timeWindowSize": 15, + "timeWindowSize": 10, "timeWindowUnit": "m", "esqlQuery": { - "esql": "// Alert triggers when the provisioned utilization exceeds the recommended threshold value {85%} within the look back time window.\n// The alert is grouped by Model Name.\n// You can adjust the threshold value 
by modifying the provisioned_utilization in the WHERE clause, which is specified in percent. \n FROM metrics-azure.open_ai-default\n| STATS provisioned_utilization = MAX(azure.open_ai.provisioned_managed_utilization_v2.avg) * 100\n by azure.dimensions.model_deployment_name\n| WHERE provisioned_utilization > 85" + "esql": "// Alert triggers when the provisioned utilization exceeds the recommended threshold value {85%} within the look back time window.\n// The alert is grouped by Model Deployment Name.\n// You can adjust the threshold value by modifying the provisioned_utilization in the WHERE clause, which is specified in percent.\nFROM metrics-azure.open_ai-default\n| KEEP azure.open_ai.provisioned_managed_utilization_v2.avg, azure.dimensions.model_deployment_name, @timestamp\n| WHERE azure.dimensions.model_deployment_name IS NOT NULL\n| STATS provisioned_utilization = MAX(azure.open_ai.provisioned_managed_utilization_v2.avg) * 100 BY azure.dimensions.model_deployment_name\n| WHERE provisioned_utilization > 85\n| EVAL provisioned_utilization = ROUND(provisioned_utilization, 2)\n| SORT provisioned_utilization DESC" }, "groupBy": "row", "timeField": "@timestamp" }, "alertDelay": { - "active": 1 + "active": 2 } }, "managed": true, diff --git a/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json b/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json index f0a0220e307..0bc10cf02b6 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json +++ b/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json @@ -10,16 +10,16 @@ }, "params": { "searchType": "esqlQuery", - "timeWindowSize": 15, + "timeWindowSize": 10, "timeWindowUnit": "m", "esqlQuery": { - "esql": "// Alert triggers when the quota_error is greater than recommended threshold value {0} within the look back time window.\n// The alert is grouped by Model Name.\n// You can adjust the threshold value by modifying the quota_error count in the WHERE clause. 
\n FROM logs-azure_openai.logs-default\n| STATS quota_error = COUNT(http.response.status_code == 429) by azure.dimensions.model_deployment_name\n| WHERE quota_error > 0" + "esql": "// Alert triggers when the quota_error count is greater than recommended threshold value {5} within the look back time window.\n// The alert is grouped by Model Deployment Name.\n// You can adjust the threshold value by modifying the quota_error count in the WHERE clause.\nFROM logs-azure_openai.logs-default\n| KEEP http.response.status_code, azure.dimensions.model_deployment_name, @timestamp\n| WHERE azure.dimensions.model_deployment_name IS NOT NULL\n| WHERE http.response.status_code == 429\n| STATS quota_error = COUNT(*) BY azure.dimensions.model_deployment_name\n| WHERE quota_error > 5\n| SORT quota_error DESC" }, "groupBy": "row", "timeField": "@timestamp" }, "alertDelay": { - "active": 1 + "active": 2 } }, "managed": true, From 3f32e014c84e0c8fd38f055c308ea0fbd2f5151b Mon Sep 17 00:00:00 2001 From: muthu-mps Date: Mon, 17 Nov 2025 18:07:06 +0530 Subject: [PATCH 10/12] update stack version --- packages/azure_openai/manifest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/azure_openai/manifest.yml b/packages/azure_openai/manifest.yml index cc704ed4c6d..ce7e13ee84e 100644 --- a/packages/azure_openai/manifest.yml +++ b/packages/azure_openai/manifest.yml @@ -14,7 +14,7 @@ categories: - security conditions: kibana: - version: "^8.17.1 || ^9.0.0" + version: "^8.19.0 || ^9.1.0" elastic: subscription: "basic" vars: From 039e2e8c0ff177e6ff30f995fb6af6d105647f04 Mon Sep 17 00:00:00 2001 From: muthu-mps Date: Tue, 18 Nov 2025 11:36:37 +0530 Subject: [PATCH 11/12] update rule template ids --- .../{latency_spike.json => azure_openai-latency-spike.json} | 2 +- ...ilization.json => azure_openai-provisioned-utilization.json} | 2 +- ...ota_error_rates.json => azure_openai-quota-error-rates.json} | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename packages/azure_openai/kibana/alerting_rule_template/{latency_spike.json => azure_openai-latency-spike.json} (97%) rename packages/azure_openai/kibana/alerting_rule_template/{provisioned_utilization.json => azure_openai-provisioned-utilization.json} (96%) rename packages/azure_openai/kibana/alerting_rule_template/{quota_error_rates.json => azure_openai-quota-error-rates.json} (96%) diff --git a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json b/packages/azure_openai/kibana/alerting_rule_template/azure_openai-latency-spike.json similarity index 97% rename from packages/azure_openai/kibana/alerting_rule_template/latency_spike.json rename to packages/azure_openai/kibana/alerting_rule_template/azure_openai-latency-spike.json index 2c41d12f641..e3c311deffe 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/latency_spike.json +++ b/packages/azure_openai/kibana/alerting_rule_template/azure_openai-latency-spike.json @@ -1,5 +1,5 @@ { - "id": "latency_spike", + "id": "azure_openai-latency-spike", "type": "alerting_rule_template", "attributes": { "name": "[Azure OpenAI] Latency high", diff --git a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json b/packages/azure_openai/kibana/alerting_rule_template/azure_openai-provisioned-utilization.json similarity index 96% rename from packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json rename to packages/azure_openai/kibana/alerting_rule_template/azure_openai-provisioned-utilization.json index 8fe2562f53e..edcba5fd2fd 
100644 --- a/packages/azure_openai/kibana/alerting_rule_template/provisioned_utilization.json +++ b/packages/azure_openai/kibana/alerting_rule_template/azure_openai-provisioned-utilization.json @@ -1,5 +1,5 @@ { - "id": "provisioned_utilization", + "id": "azure_openai-provisioned-utilization", "type": "alerting_rule_template", "attributes": { "name": "[Azure OpenAI] Provisioned Utilization above threshold", diff --git a/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json b/packages/azure_openai/kibana/alerting_rule_template/azure_openai-quota-error-rates.json similarity index 96% rename from packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json rename to packages/azure_openai/kibana/alerting_rule_template/azure_openai-quota-error-rates.json index 0bc10cf02b6..c8cb16dedee 100644 --- a/packages/azure_openai/kibana/alerting_rule_template/quota_error_rates.json +++ b/packages/azure_openai/kibana/alerting_rule_template/azure_openai-quota-error-rates.json @@ -1,5 +1,5 @@ { - "id": "quota_error_rates", + "id": "azure_openai-quota-error-rates", "type": "alerting_rule_template", "attributes": { "name": "[Azure OpenAI] Quota Error Rates above threshold", From caa932731ffe04ce104921f282e8ac46c4f8a221 Mon Sep 17 00:00:00 2001 From: muthu-mps Date: Fri, 21 Nov 2025 15:11:09 +0530 Subject: [PATCH 12/12] update stack version --- packages/azure_openai/manifest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/azure_openai/manifest.yml b/packages/azure_openai/manifest.yml index ce7e13ee84e..153b48a3923 100644 --- a/packages/azure_openai/manifest.yml +++ b/packages/azure_openai/manifest.yml @@ -14,7 +14,7 @@ categories: - security conditions: kibana: - version: "^8.19.0 || ^9.1.0" + version: "^8.19.0 || ^9.2.1" elastic: subscription: "basic" vars: