Skip to content

Commit 3607d0a

Browse files
authored
Add health_status field to status change logs data stream (#15852)
* Add health_status field to status change logs data stream * Add processor for health_status field in status_change_logs data stream * Add agent status alert rules * Use more specific index for system metrics, remove RLIKE clauses, and fix field used for CPU usage in alerting rules
1 parent 3798da2 commit 3607d0a

14 files changed

+401
-20
lines changed

packages/elastic_agent/changelog.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,18 @@
11
# newer versions go on top
2+
- version: "2.6.8"
3+
changes:
4+
- description: Add processor for health_status field to status change logs data stream
5+
type: enhancement
6+
link: https://github.com/elastic/integrations/pull/15852
7+
- description: Add new alerting rules for agent health status changes
8+
type: enhancement
9+
link: https://github.com/elastic/integrations/pull/15852
10+
- description: Use more specific index and remove RLIKE usage for system metrics alerting rules
11+
type: enhancement
12+
link: https://github.com/elastic/integrations/pull/15852
13+
- description: Use system.process.cpu.total.norm.pct for CPU usage alerting rule
14+
type: bugfix
15+
link: https://github.com/elastic/integrations/pull/15852
216
- version: "2.6.7"
317
changes:
418
- description: Add mapping for error fields for beats logs.
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
{
2+
"events": [
3+
{
4+
"@timestamp": "2024-01-15T10:30:00.000Z",
5+
"data_stream": {
6+
"type": "logs",
7+
"dataset": "elastic_agent.status_change",
8+
"namespace": "default"
9+
},
10+
"agent": {
11+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
12+
},
13+
"status": "online",
14+
"policy_id": "test-policy",
15+
"agentless": false,
16+
"space_id": "default",
17+
"hostname": "test-host"
18+
},
19+
{
20+
"@timestamp": "2024-01-15T10:30:00.000Z",
21+
"data_stream": {
22+
"type": "logs",
23+
"dataset": "elastic_agent.status_change",
24+
"namespace": "default"
25+
},
26+
"agent": {
27+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
28+
},
29+
"status": "offline",
30+
"policy_id": "test-policy",
31+
"agentless": false,
32+
"space_id": "default",
33+
"hostname": "test-host"
34+
},
35+
{
36+
"@timestamp": "2024-01-15T10:30:00.000Z",
37+
"data_stream": {
38+
"type": "logs",
39+
"dataset": "elastic_agent.status_change",
40+
"namespace": "default"
41+
},
42+
"agent": {
43+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
44+
},
45+
"status": "error",
46+
"policy_id": "test-policy",
47+
"agentless": false,
48+
"space_id": "default",
49+
"hostname": "test-host"
50+
},
51+
{
52+
"@timestamp": "2024-01-15T10:30:00.000Z",
53+
"data_stream": {
54+
"type": "logs",
55+
"dataset": "elastic_agent.status_change",
56+
"namespace": "default"
57+
},
58+
"agent": {
59+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
60+
},
61+
"status": "degraded",
62+
"policy_id": "test-policy",
63+
"agentless": false,
64+
"space_id": "default",
65+
"hostname": "test-host"
66+
},
67+
{
68+
"@timestamp": "2024-01-15T10:30:00.000Z",
69+
"data_stream": {
70+
"type": "logs",
71+
"dataset": "elastic_agent.status_change",
72+
"namespace": "default"
73+
},
74+
"agent": {
75+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
76+
},
77+
"status": "updating",
78+
"policy_id": "test-policy",
79+
"agentless": false,
80+
"space_id": "default",
81+
"hostname": "test-host"
82+
},
83+
{
84+
"@timestamp": "2024-01-15T10:30:00.000Z",
85+
"data_stream": {
86+
"type": "logs",
87+
"dataset": "elastic_agent.status_change",
88+
"namespace": "default"
89+
},
90+
"agent": {
91+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
92+
},
93+
"status": "enrolling",
94+
"policy_id": "test-policy",
95+
"agentless": false,
96+
"space_id": "default",
97+
"hostname": "test-host"
98+
},
99+
{
100+
"@timestamp": "2024-01-15T10:30:00.000Z",
101+
"data_stream": {
102+
"type": "logs",
103+
"dataset": "elastic_agent.status_change",
104+
"namespace": "default"
105+
},
106+
"agent": {
107+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
108+
},
109+
"status": "unenrolling",
110+
"policy_id": "test-policy",
111+
"agentless": false,
112+
"space_id": "default",
113+
"hostname": "test-host"
114+
}
115+
]
116+
}
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
{
2+
"expected": [
3+
{
4+
"@timestamp": "2024-01-15T10:30:00.000Z",
5+
"data_stream": {
6+
"type": "logs",
7+
"dataset": "elastic_agent.status_change",
8+
"namespace": "default"
9+
},
10+
"agent": {
11+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
12+
},
13+
"status": "online",
14+
"health_status": "healthy",
15+
"policy_id": "test-policy",
16+
"agentless": false,
17+
"space_id": "default",
18+
"hostname": "test-host"
19+
},
20+
{
21+
"@timestamp": "2024-01-15T10:30:00.000Z",
22+
"data_stream": {
23+
"type": "logs",
24+
"dataset": "elastic_agent.status_change",
25+
"namespace": "default"
26+
},
27+
"agent": {
28+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
29+
},
30+
"status": "offline",
31+
"health_status": "offline",
32+
"policy_id": "test-policy",
33+
"agentless": false,
34+
"space_id": "default",
35+
"hostname": "test-host"
36+
},
37+
{
38+
"@timestamp": "2024-01-15T10:30:00.000Z",
39+
"data_stream": {
40+
"type": "logs",
41+
"dataset": "elastic_agent.status_change",
42+
"namespace": "default"
43+
},
44+
"agent": {
45+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
46+
},
47+
"status": "error",
48+
"health_status": "unhealthy",
49+
"policy_id": "test-policy",
50+
"agentless": false,
51+
"space_id": "default",
52+
"hostname": "test-host"
53+
},
54+
{
55+
"@timestamp": "2024-01-15T10:30:00.000Z",
56+
"data_stream": {
57+
"type": "logs",
58+
"dataset": "elastic_agent.status_change",
59+
"namespace": "default"
60+
},
61+
"agent": {
62+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
63+
},
64+
"status": "degraded",
65+
"health_status": "unhealthy",
66+
"policy_id": "test-policy",
67+
"agentless": false,
68+
"space_id": "default",
69+
"hostname": "test-host"
70+
},
71+
{
72+
"@timestamp": "2024-01-15T10:30:00.000Z",
73+
"data_stream": {
74+
"type": "logs",
75+
"dataset": "elastic_agent.status_change",
76+
"namespace": "default"
77+
},
78+
"agent": {
79+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
80+
},
81+
"status": "updating",
82+
"health_status": "updating",
83+
"policy_id": "test-policy",
84+
"agentless": false,
85+
"space_id": "default",
86+
"hostname": "test-host"
87+
},
88+
{
89+
"@timestamp": "2024-01-15T10:30:00.000Z",
90+
"data_stream": {
91+
"type": "logs",
92+
"dataset": "elastic_agent.status_change",
93+
"namespace": "default"
94+
},
95+
"agent": {
96+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
97+
},
98+
"status": "enrolling",
99+
"health_status": "updating",
100+
"policy_id": "test-policy",
101+
"agentless": false,
102+
"space_id": "default",
103+
"hostname": "test-host"
104+
},
105+
{
106+
"@timestamp": "2024-01-15T10:30:00.000Z",
107+
"data_stream": {
108+
"type": "logs",
109+
"dataset": "elastic_agent.status_change",
110+
"namespace": "default"
111+
},
112+
"agent": {
113+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
114+
},
115+
"status": "unenrolling",
116+
"health_status": "updating",
117+
"policy_id": "test-policy",
118+
"agentless": false,
119+
"space_id": "default",
120+
"hostname": "test-host"
121+
}
122+
]
123+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
---
2+
description: Pipeline for Elastic Agent status change logs.
3+
processors:
4+
- script:
5+
description: Derive health_status from status field
6+
if: ctx.status != null
7+
lang: painless
8+
source: |
9+
String status = ctx.status;
10+
String healthStatus;
11+
12+
if (status == 'online') {
13+
healthStatus = 'healthy';
14+
} else if (status == 'error' || status == 'degraded') {
15+
healthStatus = 'unhealthy';
16+
} else if (status == 'updating' || status == 'enrolling' || status == 'unenrolling') {
17+
healthStatus = 'updating';
18+
} else {
19+
healthStatus = status;
20+
}
21+
22+
ctx.health_status = healthStatus;
23+
ignore_failure: true

packages/elastic_agent/data_stream/status_change_logs/fields/fields.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
- name: status
22
type: keyword
3+
- name: health_status
4+
type: keyword
35
- name: policy_id
46
type: keyword
57
- name: agentless
Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
11
{
2-
"@timestamp": 1576280412771,
3-
"data_stream": {
4-
"type": "logs",
5-
"dataset": "elastic_agent.status_change",
6-
"namespace": "default"
7-
},
8-
"agent": {
9-
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
10-
},
11-
"status": "HEALTHY",
12-
"policy_id": "test-policy",
13-
"agentless": false,
14-
"space_id": "default",
15-
"hostname": "test-host"
16-
}
2+
"@timestamp": 1576280412771,
3+
"data_stream": {
4+
"type": "logs",
5+
"dataset": "elastic_agent.status_change",
6+
"namespace": "default"
7+
},
8+
"agent": {
9+
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
10+
},
11+
"status": "online",
12+
"health_status": "healthy",
13+
"policy_id": "test-policy",
14+
"agentless": false,
15+
"space_id": "default",
16+
"hostname": "test-host"
17+
}

packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-cpu-usage-spike-rule.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"thresholdComparator": ">",
1717
"size": 100,
1818
"esqlQuery": {
19-
"esql": "FROM metrics-*, *:metrics-*\n| WHERE process.executable RLIKE \".*[Ee]lastic.*[Aa]gent.*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS cpu_process_pct = MAX(system.process.cpu.total.pct) * 100\n BY elastic_agent.id, process.name,\n time_bucket = BUCKET(@timestamp, 1 minute)\n// Count the 1 minute timebuckets that are above 80% by process and agent\n| WHERE cpu_process_pct >= 80\n| STATS count_above_threshold = COUNT(*)\n BY elastic_agent.id, process.name\n// Alert if there are 5 or more occurences\n| WHERE count_above_threshold >= 5"
19+
"esql": "FROM metrics-system*, *:metrics-system*\n| WHERE TO_LOWER(process.executable) LIKE \"*elastic*agent*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS cpu_process_pct = MAX(system.process.cpu.total.norm.pct) * 100\n BY elastic_agent.id, process.name,\n time_bucket = BUCKET(@timestamp, 1 minute)\n// Count the 1 minute timebuckets that are above 80% by process and agent\n| WHERE cpu_process_pct >= 80\n| STATS count_above_threshold = COUNT(*)\n BY elastic_agent.id, process.name\n// Alert if there are 5 or more occurences\n| WHERE count_above_threshold >= 5"
2020
},
2121
"aggType": "count",
2222
"groupBy": "row",

packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-excessive-memory-usage-rule.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"thresholdComparator": ">",
1717
"size": 100,
1818
"esqlQuery": {
19-
"esql": "FROM metrics-*, *:metrics-*\n| WHERE process.executable RLIKE \".*[Ee]lastic.*[Aa]gent.*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS max_memory_per_process = MAX(system.process.memory.rss.pct * 100) BY agent.id, process.name\n| STATS total_memory_usage = SUM(max_memory_per_process) BY agent.id\n| WHERE total_memory_usage > 50"
19+
"esql": "FROM metrics-system*, *:metrics-system*\n| WHERE TO_LOWER(process.executable) LIKE \"*elastic*agent*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS max_memory_per_process = MAX(system.process.memory.rss.pct * 100) BY agent.id, process.name\n| STATS total_memory_usage = SUM(max_memory_per_process) BY agent.id\n| WHERE total_memory_usage > 50"
2020
},
2121
"aggType": "count",
2222
"groupBy": "row",

packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-excessive-restarts.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"thresholdComparator": ">",
1717
"size": 100,
1818
"esqlQuery": {
19-
"esql": "FROM metrics-*, *:metrics-*\n| WHERE process.executable RLIKE \".*[Ee]lastic.*[Aa]gent.*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS restart_count = COUNT_DISTINCT(process.cpu.start_time) BY host.name, process.name, bucket(@timestamp,5 minute) \n| WHERE restart_count > 10\n| STATS MAX(restart_count) BY host.name, process.name"
19+
"esql": "FROM metrics-system*, *:metrics-system*\n| WHERE TO_LOWER(process.executable) LIKE \"*elastic*agent*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS restart_count = COUNT_DISTINCT(process.cpu.start_time) BY host.name, process.name, bucket(@timestamp,5 minute) \n| WHERE restart_count > 10\n| STATS MAX(restart_count) BY host.name, process.name"
2020
},
2121
"aggType": "count",
2222
"groupBy": "row",
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"id": "elastic-agent-offline-status",
3+
"type": "alerting_rule_template",
4+
"attributes": {
5+
"name": "[Elastic Agent] Offline status",
6+
"tags": ["Elastic Agent"],
7+
"ruleTypeId": ".es-query",
8+
"schedule": {
9+
"interval": "1m"
10+
},
11+
"params": {
12+
"searchType": "esqlQuery",
13+
"timeWindowSize": 5,
14+
"timeWindowUnit": "m",
15+
"threshold": [0],
16+
"thresholdComparator": ">",
17+
"size": 100,
18+
"esqlQuery": {
19+
"esql": "FROM logs-elastic_agent.status_change-default, *:logs-elastic_agent.status_change-default\n| WHERE data_stream.dataset == \"elastic_agent.status_change\" and agentless == false and health_status == \"offline\""
20+
},
21+
"aggType": "count",
22+
"groupBy": "row",
23+
"termSize": 5,
24+
"sourceFields": [],
25+
"timeField": "@timestamp",
26+
"excludeHitsFromPreviousRun": true
27+
},
28+
"alertDelay": {
29+
"active": 1
30+
}
31+
},
32+
"coreMigrationVersion": "8.8.0",
33+
"typeMigrationVersion": "10.1.0"
34+
}

0 commit comments

Comments
 (0)