From ac82576849de23ac4624160b222fe15b74bc6017 Mon Sep 17 00:00:00 2001
From: Rohit Patil
Date: Mon, 15 Dec 2025 18:09:49 +0530
Subject: [PATCH 1/6] feat: add cluster health check prompt

Signed-off-by: Rohit Patil
---
 docs/PROMPTS.md                        |  37 ++
 pkg/toolsets/core/health_check.go      | 717 +++++++++++++++++++++++++
 pkg/toolsets/core/health_check_test.go |  57 ++
 pkg/toolsets/core/toolset.go           |  29 +-
 4 files changed, 838 insertions(+), 2 deletions(-)
 create mode 100644 pkg/toolsets/core/health_check.go
 create mode 100644 pkg/toolsets/core/health_check_test.go

diff --git a/docs/PROMPTS.md b/docs/PROMPTS.md
index 11c45589c..11316d4c2 100644
--- a/docs/PROMPTS.md
+++ b/docs/PROMPTS.md
@@ -57,6 +57,43 @@ content = "I'll retrieve and analyze the logs for you."
 ### Argument Substitution
 Use `{{argument_name}}` placeholders in message content. The template engine replaces these with actual values when the prompt is called.
 
+## Built-in Prompts
+
+The Kubernetes MCP Server includes several built-in prompts that are always available:
+
+### `cluster-health-check`
+
+Performs a comprehensive health assessment of your Kubernetes or OpenShift cluster.
+
+**Arguments:**
+- `namespace` (optional): Limit the health check to a specific namespace. Default: all namespaces.
+- `verbose` (optional): Enable detailed resource-level information. Values: `true` or `false`. Default: `false`.
+- `check_events` (optional): Include recent warning/error events in the analysis. Values: `true` or `false`. Default: `true`.
+
+**What it checks:**
+- **Nodes**: Status and conditions (Ready, MemoryPressure, DiskPressure, etc.)
+- **Cluster Operators** (OpenShift only): Available and degraded status
+- **Pods**: Phase, container statuses, restart counts, and common issues (CrashLoopBackOff, ImagePullBackOff, etc.)
+- **Workload Controllers**: Deployments, StatefulSets, and DaemonSets replica status
+- **Persistent Volume Claims**: Binding status
+- **Events**: Recent warning and error events from the last hour
+
+**Example usage:**
+```
+Check the health of my cluster
+```
+
+Or with specific parameters:
+```
+Check the health of namespace production with verbose output
+```
+
+The prompt gathers comprehensive diagnostic data and presents it to the LLM for analysis, which will provide:
+1. Overall health status (Healthy, Warning, or Critical)
+2. Critical issues requiring immediate attention
+3. Warnings and recommendations
+4. Summary by component
+
 ## Configuration File Location
 
 Place your prompts in the `config.toml` file used by the MCP server. Specify the config file path using the `--config` flag when starting the server.
diff --git a/pkg/toolsets/core/health_check.go b/pkg/toolsets/core/health_check.go new file mode 100644 index 000000000..4566b04b1 --- /dev/null +++ b/pkg/toolsets/core/health_check.go @@ -0,0 +1,717 @@ +package core + +import ( + "fmt" + "strings" + "time" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + + "github.com/containers/kubernetes-mcp-server/pkg/api" +) + +// clusterHealthCheckHandler implements the cluster health check prompt +func clusterHealthCheckHandler(params api.PromptHandlerParams) (*api.PromptCallResult, error) { + args := params.GetArguments() + + // Parse arguments + namespace := args["namespace"] + verbose := args["verbose"] == "true" + checkEvents := args["check_events"] != "false" // default true + + // Check if namespace exists if specified + namespaceWarning := "" + requestedNamespace := namespace + if namespace != "" { + nsGVK := &schema.GroupVersionKind{ + Group: "", + Version: "v1", + Kind: "Namespace", + } + _, err := params.ResourcesGet(params, nsGVK, "", namespace) + if err != nil { + // Namespace doesn't exist - show warning and proceed with cluster-wide check + namespaceWarning = fmt.Sprintf("Namespace '%s' not found or not accessible. Showing cluster-wide information instead.", namespace) + namespace = "" // Fall back to cluster-wide check + } + } + + // Gather cluster diagnostics using the KubernetesClient interface + diagnostics, err := gatherClusterDiagnostics(params, namespace, verbose, checkEvents) + if err != nil { + return nil, fmt.Errorf("failed to gather cluster diagnostics: %w", err) + } + + // Set namespace warning and requested namespace for display + diagnostics.NamespaceWarning = namespaceWarning + if requestedNamespace != "" && namespaceWarning != "" { + diagnostics.TargetNamespace = requestedNamespace + diagnostics.NamespaceScoped = false // Changed to cluster-wide due to error + } + + // Format diagnostic data for LLM analysis + promptText := formatHealthCheckPrompt(diagnostics) + + return api.NewPromptCallResult( + "Cluster health diagnostic data gathered successfully", + []api.PromptMessage{ + { + Role: "user", + Content: api.PromptContent{ + Type: "text", + Text: promptText, + }, + }, + { + Role: "assistant", + Content: api.PromptContent{ + Type: "text", + Text: "I'll analyze the cluster health diagnostic data and provide a comprehensive assessment.", + }, + }, + }, + nil, + ), nil +} + +// clusterDiagnostics contains all diagnostic data gathered from the cluster +type clusterDiagnostics struct { + Nodes string + Pods string + Deployments string + StatefulSets string + DaemonSets string + PVCs string + ClusterOperators string + Events string + CollectionTime time.Time + TotalNamespaces int + NamespaceScoped bool + TargetNamespace string + NamespaceWarning string +} + +// gatherClusterDiagnostics collects comprehensive diagnostic data from the cluster +func gatherClusterDiagnostics(params api.PromptHandlerParams, namespace string, verbose bool, checkEvents bool) (*clusterDiagnostics, error) { + diag := &clusterDiagnostics{ + CollectionTime: time.Now(), + NamespaceScoped: namespace != "", + TargetNamespace: namespace, + } + + // Gather node diagnostics using ResourcesList + nodeDiag, err := gatherNodeDiagnostics(params) + if err == nil { + diag.Nodes = nodeDiag + } + + // Gather pod diagnostics + podDiag, err := gatherPodDiagnostics(params, namespace) + if err == nil { + diag.Pods = podDiag + } + + // Gather workload diagnostics + deployDiag, err := gatherWorkloadDiagnostics(params, "Deployment", namespace) + if 
err == nil { + diag.Deployments = deployDiag + } + + stsDiag, err := gatherWorkloadDiagnostics(params, "StatefulSet", namespace) + if err == nil { + diag.StatefulSets = stsDiag + } + + dsDiag, err := gatherWorkloadDiagnostics(params, "DaemonSet", namespace) + if err == nil { + diag.DaemonSets = dsDiag + } + + // Gather PVC diagnostics + pvcDiag, err := gatherPVCDiagnostics(params, namespace) + if err == nil { + diag.PVCs = pvcDiag + } + + // Gather cluster operator diagnostics (OpenShift only) + operatorDiag, err := gatherClusterOperatorDiagnostics(params) + if err == nil { + diag.ClusterOperators = operatorDiag + } + + // Gather recent events if requested + if checkEvents { + eventDiag, err := gatherEventDiagnostics(params, namespace) + if err == nil { + diag.Events = eventDiag + } + } + + // Count namespaces + namespaceList, err := params.NamespacesList(params, api.ListOptions{}) + if err == nil { + if items, ok := namespaceList.UnstructuredContent()["items"].([]interface{}); ok { + diag.TotalNamespaces = len(items) + } + } + + return diag, nil +} + +// gatherNodeDiagnostics collects node status using ResourcesList +func gatherNodeDiagnostics(params api.PromptHandlerParams) (string, error) { + gvk := &schema.GroupVersionKind{ + Group: "", + Version: "v1", + Kind: "Node", + } + + nodeList, err := params.ResourcesList(params, gvk, "", api.ListOptions{}) + if err != nil { + return "", err + } + + items, ok := nodeList.UnstructuredContent()["items"].([]interface{}) + if !ok || len(items) == 0 { + return "No nodes found", nil + } + + var sb strings.Builder + totalNodes := len(items) + healthyNodes := 0 + nodesWithIssues := []string{} + + for _, item := range items { + nodeMap, ok := item.(map[string]interface{}) + if !ok { + continue + } + + metadata, _ := nodeMap["metadata"].(map[string]interface{}) + name, _ := metadata["name"].(string) + + status, _ := nodeMap["status"].(map[string]interface{}) + conditions, _ := status["conditions"].([]interface{}) + + nodeStatus := "Unknown" + issues := []string{} + + // Parse node conditions + for _, cond := range conditions { + condMap, _ := cond.(map[string]interface{}) + condType, _ := condMap["type"].(string) + condStatus, _ := condMap["status"].(string) + message, _ := condMap["message"].(string) + + if condType == "Ready" { + if condStatus == "True" { + nodeStatus = "Ready" + healthyNodes++ + } else { + nodeStatus = "NotReady" + issues = append(issues, fmt.Sprintf("Not ready: %s", message)) + } + } else if condStatus == "True" && condType != "Ready" { + // Pressure conditions + issues = append(issues, fmt.Sprintf("%s: %s", condType, message)) + } + } + + // Only report nodes with issues + if len(issues) > 0 { + nodesWithIssues = append(nodesWithIssues, fmt.Sprintf("- **%s** (Status: %s)\n%s", name, nodeStatus, " - "+strings.Join(issues, "\n - "))) + } + } + + sb.WriteString(fmt.Sprintf("**Total:** %d | **Healthy:** %d\n\n", totalNodes, healthyNodes)) + if len(nodesWithIssues) > 0 { + sb.WriteString(strings.Join(nodesWithIssues, "\n\n")) + } else { + sb.WriteString("*All nodes are healthy*") + } + + return sb.String(), nil +} + +// gatherPodDiagnostics collects pod status using existing methods +func gatherPodDiagnostics(params api.PromptHandlerParams, namespace string) (string, error) { + var podList interface{ UnstructuredContent() map[string]interface{} } + var err error + + if namespace != "" { + podList, err = params.PodsListInNamespace(params, namespace, api.ListOptions{}) + } else { + podList, err = params.PodsListInAllNamespaces(params, 
api.ListOptions{}) + } + + if err != nil { + return "", err + } + + items, ok := podList.UnstructuredContent()["items"].([]interface{}) + if !ok { + return "No pods found", nil + } + + totalPods := len(items) + problemPods := []string{} + + for _, item := range items { + podMap, ok := item.(map[string]interface{}) + if !ok { + continue + } + + metadata, _ := podMap["metadata"].(map[string]interface{}) + name, _ := metadata["name"].(string) + ns, _ := metadata["namespace"].(string) + + status, _ := podMap["status"].(map[string]interface{}) + phase, _ := status["phase"].(string) + containerStatuses, _ := status["containerStatuses"].([]interface{}) + + issues := []string{} + restarts := int32(0) + readyCount := 0 + totalContainers := len(containerStatuses) + + // Check container statuses + for _, cs := range containerStatuses { + csMap, _ := cs.(map[string]interface{}) + ready, _ := csMap["ready"].(bool) + restartCount, _ := csMap["restartCount"].(float64) + restarts += int32(restartCount) + + if ready { + readyCount++ + } + + state, _ := csMap["state"].(map[string]interface{}) + if waiting, ok := state["waiting"].(map[string]interface{}); ok { + reason, _ := waiting["reason"].(string) + message, _ := waiting["message"].(string) + if reason == "CrashLoopBackOff" || reason == "ImagePullBackOff" || reason == "ErrImagePull" { + issues = append(issues, fmt.Sprintf("Container waiting: %s - %s", reason, message)) + } + } + if terminated, ok := state["terminated"].(map[string]interface{}); ok { + reason, _ := terminated["reason"].(string) + if reason == "Error" || reason == "OOMKilled" { + issues = append(issues, fmt.Sprintf("Container terminated: %s", reason)) + } + } + } + + // Check pod phase + if phase != "Running" && phase != "Succeeded" { + issues = append(issues, fmt.Sprintf("Pod in %s phase", phase)) + } + + // Report pods with issues or high restart count + if len(issues) > 0 || restarts > 5 { + problemPods = append(problemPods, fmt.Sprintf("- **%s/%s** (Phase: %s, Ready: %d/%d, Restarts: %d)\n - %s", + ns, name, phase, readyCount, totalContainers, restarts, strings.Join(issues, "\n - "))) + } + } + + var sb strings.Builder + sb.WriteString(fmt.Sprintf("**Total:** %d | **With Issues:** %d\n\n", totalPods, len(problemPods))) + if len(problemPods) > 0 { + sb.WriteString(strings.Join(problemPods, "\n\n")) + } else { + sb.WriteString("*No pod issues detected*") + } + + return sb.String(), nil +} + +// gatherWorkloadDiagnostics collects workload controller status +func gatherWorkloadDiagnostics(params api.PromptHandlerParams, kind string, namespace string) (string, error) { + gvk := &schema.GroupVersionKind{ + Group: "apps", + Version: "v1", + Kind: kind, + } + + workloadList, err := params.ResourcesList(params, gvk, namespace, api.ListOptions{}) + if err != nil { + return "", err + } + + items, ok := workloadList.UnstructuredContent()["items"].([]interface{}) + if !ok || len(items) == 0 { + return fmt.Sprintf("No %ss found", kind), nil + } + + workloadsWithIssues := []string{} + + for _, item := range items { + workloadMap, ok := item.(map[string]interface{}) + if !ok { + continue + } + + metadata, _ := workloadMap["metadata"].(map[string]interface{}) + name, _ := metadata["name"].(string) + ns, _ := metadata["namespace"].(string) + + status, _ := workloadMap["status"].(map[string]interface{}) + spec, _ := workloadMap["spec"].(map[string]interface{}) + issues := []string{} + ready := "Unknown" + + switch kind { + case "Deployment": + replicas, _ := status["replicas"].(float64) + readyReplicas, 
_ := status["readyReplicas"].(float64) + unavailableReplicas, _ := status["unavailableReplicas"].(float64) + + ready = fmt.Sprintf("%d/%d", int(readyReplicas), int(replicas)) + + if unavailableReplicas > 0 { + issues = append(issues, fmt.Sprintf("%d replicas unavailable", int(unavailableReplicas))) + } + + case "StatefulSet": + specReplicas, _ := spec["replicas"].(float64) + readyReplicas, _ := status["readyReplicas"].(float64) + + ready = fmt.Sprintf("%d/%d", int(readyReplicas), int(specReplicas)) + + if readyReplicas < specReplicas { + issues = append(issues, fmt.Sprintf("Only %d/%d replicas ready", int(readyReplicas), int(specReplicas))) + } + + case "DaemonSet": + desiredNumberScheduled, _ := status["desiredNumberScheduled"].(float64) + numberReady, _ := status["numberReady"].(float64) + numberUnavailable, _ := status["numberUnavailable"].(float64) + + ready = fmt.Sprintf("%d/%d", int(numberReady), int(desiredNumberScheduled)) + + if numberUnavailable > 0 { + issues = append(issues, fmt.Sprintf("%d pods unavailable", int(numberUnavailable))) + } + } + + if len(issues) > 0 { + workloadsWithIssues = append(workloadsWithIssues, fmt.Sprintf("- **%s/%s** (Ready: %s)\n - %s", + ns, name, ready, strings.Join(issues, "\n - "))) + } + } + + var sb strings.Builder + sb.WriteString(fmt.Sprintf("**%ss with Issues:** %d\n\n", kind, len(workloadsWithIssues))) + if len(workloadsWithIssues) > 0 { + sb.WriteString(strings.Join(workloadsWithIssues, "\n\n")) + } else { + sb.WriteString(fmt.Sprintf("*No %s issues detected*", kind)) + } + + return sb.String(), nil +} + +// gatherPVCDiagnostics collects PVC status +func gatherPVCDiagnostics(params api.PromptHandlerParams, namespace string) (string, error) { + gvk := &schema.GroupVersionKind{ + Group: "", + Version: "v1", + Kind: "PersistentVolumeClaim", + } + + pvcList, err := params.ResourcesList(params, gvk, namespace, api.ListOptions{}) + if err != nil { + return "", err + } + + items, ok := pvcList.UnstructuredContent()["items"].([]interface{}) + if !ok || len(items) == 0 { + return "No PVCs found", nil + } + + pvcsWithIssues := []string{} + + for _, item := range items { + pvcMap, ok := item.(map[string]interface{}) + if !ok { + continue + } + + metadata, _ := pvcMap["metadata"].(map[string]interface{}) + name, _ := metadata["name"].(string) + ns, _ := metadata["namespace"].(string) + + status, _ := pvcMap["status"].(map[string]interface{}) + phase, _ := status["phase"].(string) + + if phase != "Bound" { + pvcsWithIssues = append(pvcsWithIssues, fmt.Sprintf("- **%s/%s** (Status: %s)\n - PVC not bound", ns, name, phase)) + } + } + + var sb strings.Builder + sb.WriteString(fmt.Sprintf("**PVCs with Issues:** %d\n\n", len(pvcsWithIssues))) + if len(pvcsWithIssues) > 0 { + sb.WriteString(strings.Join(pvcsWithIssues, "\n\n")) + } else { + sb.WriteString("*No PVC issues detected*") + } + + return sb.String(), nil +} + +// gatherClusterOperatorDiagnostics collects ClusterOperator status (OpenShift only) +func gatherClusterOperatorDiagnostics(params api.PromptHandlerParams) (string, error) { + gvk := &schema.GroupVersionKind{ + Group: "config.openshift.io", + Version: "v1", + Kind: "ClusterOperator", + } + + operatorList, err := params.ResourcesList(params, gvk, "", api.ListOptions{}) + if err != nil { + // Not an OpenShift cluster + return "", err + } + + items, ok := operatorList.UnstructuredContent()["items"].([]interface{}) + if !ok || len(items) == 0 { + return "No cluster operators found", nil + } + + operatorsWithIssues := []string{} + + for _, item := 
range items { + opMap, ok := item.(map[string]interface{}) + if !ok { + continue + } + + metadata, _ := opMap["metadata"].(map[string]interface{}) + name, _ := metadata["name"].(string) + + status, _ := opMap["status"].(map[string]interface{}) + conditions, _ := status["conditions"].([]interface{}) + + available := "Unknown" + degraded := "Unknown" + issues := []string{} + + for _, cond := range conditions { + condMap, _ := cond.(map[string]interface{}) + condType, _ := condMap["type"].(string) + condStatus, _ := condMap["status"].(string) + message, _ := condMap["message"].(string) + + switch condType { + case "Available": + available = condStatus + if condStatus != "True" { + issues = append(issues, fmt.Sprintf("Not available: %s", message)) + } + case "Degraded": + degraded = condStatus + if condStatus == "True" { + issues = append(issues, fmt.Sprintf("Degraded: %s", message)) + } + } + } + + if len(issues) > 0 { + operatorsWithIssues = append(operatorsWithIssues, fmt.Sprintf("- **%s** (Available: %s, Degraded: %s)\n - %s", + name, available, degraded, strings.Join(issues, "\n - "))) + } + } + + var sb strings.Builder + sb.WriteString(fmt.Sprintf("**Operators with Issues:** %d\n\n", len(operatorsWithIssues))) + if len(operatorsWithIssues) > 0 { + sb.WriteString(strings.Join(operatorsWithIssues, "\n\n")) + } else { + sb.WriteString("*All cluster operators are healthy*") + } + + return sb.String(), nil +} + +// gatherEventDiagnostics collects recent warning and error events +func gatherEventDiagnostics(params api.PromptHandlerParams, namespace string) (string, error) { + namespaces := []string{} + + if namespace != "" { + namespaces = append(namespaces, namespace) + } else { + // Important namespaces + namespaces = []string{"default", "kube-system"} + + // Add OpenShift namespaces + nsList, err := params.NamespacesList(params, api.ListOptions{}) + if err == nil { + if items, ok := nsList.UnstructuredContent()["items"].([]interface{}); ok { + for _, item := range items { + nsMap, ok := item.(map[string]interface{}) + if !ok { + continue + } + metadata, _ := nsMap["metadata"].(map[string]interface{}) + name, _ := metadata["name"].(string) + if strings.HasPrefix(name, "openshift-") { + namespaces = append(namespaces, name) + } + } + } + } + } + + oneHourAgo := time.Now().Add(-1 * time.Hour) + totalWarnings := 0 + totalErrors := 0 + recentEvents := []string{} + + for _, ns := range namespaces { + eventMaps, err := params.EventsList(params, ns) + if err != nil { + continue + } + + for _, eventMap := range eventMaps { + eventType, _ := eventMap["type"].(string) + + // Only include Warning and Error events + if eventType != string(v1.EventTypeWarning) && eventType != "Error" { + continue + } + + lastSeen, _ := eventMap["lastTimestamp"].(string) + lastSeenTime, err := time.Parse(time.RFC3339, lastSeen) + if err != nil || lastSeenTime.Before(oneHourAgo) { + continue + } + + reason, _ := eventMap["reason"].(string) + message, _ := eventMap["message"].(string) + count, _ := eventMap["count"].(int32) + + involvedObject, _ := eventMap["involvedObject"].(map[string]interface{}) + objectKind, _ := involvedObject["kind"].(string) + objectName, _ := involvedObject["name"].(string) + + if eventType == string(v1.EventTypeWarning) { + totalWarnings++ + } else { + totalErrors++ + } + + // Limit message length + if len(message) > 150 { + message = message[:150] + "..." 
+ } + + recentEvents = append(recentEvents, fmt.Sprintf("- **%s/%s** in `%s` (%s, Count: %d)\n - %s", + objectKind, objectName, ns, reason, count, message)) + } + } + + // Limit to 20 most recent events + if len(recentEvents) > 20 { + recentEvents = recentEvents[:20] + } + + var sb strings.Builder + sb.WriteString(fmt.Sprintf("**Warnings:** %d | **Errors:** %d\n\n", totalWarnings, totalErrors)) + if len(recentEvents) > 0 { + sb.WriteString(strings.Join(recentEvents, "\n\n")) + } else { + sb.WriteString("*No recent warning/error events*") + } + + return sb.String(), nil +} + +// formatHealthCheckPrompt formats diagnostic data into a prompt for LLM analysis +func formatHealthCheckPrompt(diag *clusterDiagnostics) string { + var sb strings.Builder + + sb.WriteString("# Cluster Health Check Diagnostic Data\n\n") + sb.WriteString(fmt.Sprintf("**Collection Time:** %s\n", diag.CollectionTime.Format(time.RFC3339))) + + // Show namespace warning prominently if present + if diag.NamespaceWarning != "" { + sb.WriteString("\n") + sb.WriteString("⚠️ **WARNING:** " + diag.NamespaceWarning + "\n") + sb.WriteString("\n") + sb.WriteString("**Note:** Please verify the namespace name and try again if you want namespace-specific diagnostics.\n") + } + + if diag.NamespaceScoped { + sb.WriteString(fmt.Sprintf("**Scope:** Namespace `%s`\n", diag.TargetNamespace)) + } else { + sb.WriteString(fmt.Sprintf("**Scope:** All namespaces (Total: %d)\n", diag.TotalNamespaces)) + } + sb.WriteString("\n") + + sb.WriteString("## Your Task\n\n") + sb.WriteString("Analyze the following cluster diagnostic data and provide:\n") + sb.WriteString("1. **Overall Health Status**: Healthy, Warning, or Critical\n") + sb.WriteString("2. **Critical Issues**: Issues requiring immediate attention\n") + sb.WriteString("3. **Warnings**: Non-critical issues that should be addressed\n") + sb.WriteString("4. **Recommendations**: Suggested actions to improve cluster health\n") + sb.WriteString("5. **Summary**: Brief overview of findings by component\n\n") + + sb.WriteString("---\n\n") + + if diag.Nodes != "" { + sb.WriteString("## 1. Nodes\n\n") + sb.WriteString(diag.Nodes) + sb.WriteString("\n\n") + } + + if diag.ClusterOperators != "" { + sb.WriteString("## 2. Cluster Operators (OpenShift)\n\n") + sb.WriteString(diag.ClusterOperators) + sb.WriteString("\n\n") + } + + if diag.Pods != "" { + sb.WriteString("## 3. Pods\n\n") + sb.WriteString(diag.Pods) + sb.WriteString("\n\n") + } + + if diag.Deployments != "" || diag.StatefulSets != "" || diag.DaemonSets != "" { + sb.WriteString("## 4. Workload Controllers\n\n") + if diag.Deployments != "" { + sb.WriteString("### Deployments\n\n") + sb.WriteString(diag.Deployments) + sb.WriteString("\n\n") + } + if diag.StatefulSets != "" { + sb.WriteString("### StatefulSets\n\n") + sb.WriteString(diag.StatefulSets) + sb.WriteString("\n\n") + } + if diag.DaemonSets != "" { + sb.WriteString("### DaemonSets\n\n") + sb.WriteString(diag.DaemonSets) + sb.WriteString("\n\n") + } + } + + if diag.PVCs != "" { + sb.WriteString("## 5. Persistent Volume Claims\n\n") + sb.WriteString(diag.PVCs) + sb.WriteString("\n\n") + } + + if diag.Events != "" { + sb.WriteString("## 6. 
Recent Events (Last Hour)\n\n") + sb.WriteString(diag.Events) + sb.WriteString("\n\n") + } + + sb.WriteString("---\n\n") + sb.WriteString("**Please analyze the above diagnostic data and provide your comprehensive health assessment.**\n") + + return sb.String() +} diff --git a/pkg/toolsets/core/health_check_test.go b/pkg/toolsets/core/health_check_test.go new file mode 100644 index 000000000..eac096acc --- /dev/null +++ b/pkg/toolsets/core/health_check_test.go @@ -0,0 +1,57 @@ +package core + +import ( + "testing" + + "github.com/stretchr/testify/suite" + + "github.com/containers/kubernetes-mcp-server/pkg/prompts" +) + +type ClusterHealthCheckSuite struct { + suite.Suite +} + +func (s *ClusterHealthCheckSuite) TestPromptIsRegistered() { + s.Run("cluster-health-check prompt is registered", func() { + configPrompts := prompts.ConfigPrompts() + + var foundHealthCheck bool + for _, prompt := range configPrompts { + if prompt.Prompt.Name == "cluster-health-check" { + foundHealthCheck = true + + // Verify prompt metadata + s.Equal("cluster-health-check", prompt.Prompt.Name) + s.Equal("Cluster Health Check", prompt.Prompt.Title) + s.Contains(prompt.Prompt.Description, "comprehensive health assessment") + + // Verify arguments + s.Require().Len(prompt.Prompt.Arguments, 3, "should have 3 arguments") + + // Check namespace argument + s.Equal("namespace", prompt.Prompt.Arguments[0].Name) + s.False(prompt.Prompt.Arguments[0].Required) + + // Check verbose argument + s.Equal("verbose", prompt.Prompt.Arguments[1].Name) + s.False(prompt.Prompt.Arguments[1].Required) + + // Check check_events argument + s.Equal("check_events", prompt.Prompt.Arguments[2].Name) + s.False(prompt.Prompt.Arguments[2].Required) + + // Verify handler is set + s.NotNil(prompt.Handler, "handler should be set") + + break + } + } + + s.True(foundHealthCheck, "cluster-health-check prompt should be registered") + }) +} + +func TestClusterHealthCheckSuite(t *testing.T) { + suite.Run(t, new(ClusterHealthCheckSuite)) +} diff --git a/pkg/toolsets/core/toolset.go b/pkg/toolsets/core/toolset.go index c371f1616..7d6375a26 100644 --- a/pkg/toolsets/core/toolset.go +++ b/pkg/toolsets/core/toolset.go @@ -30,8 +30,33 @@ func (t *Toolset) GetTools(o api.Openshift) []api.ServerTool { } func (t *Toolset) GetPrompts() []api.ServerPrompt { - // Core toolset prompts will be added in Feature 3 - return nil + return []api.ServerPrompt{ + { + Prompt: api.Prompt{ + Name: "cluster-health-check", + Title: "Cluster Health Check", + Description: "Perform comprehensive health assessment of Kubernetes/OpenShift cluster", + Arguments: []api.PromptArgument{ + { + Name: "namespace", + Description: "Optional namespace to limit health check scope (default: all namespaces)", + Required: false, + }, + { + Name: "verbose", + Description: "Enable detailed resource-level information (true/false, default: false)", + Required: false, + }, + { + Name: "check_events", + Description: "Include recent warning/error events (true/false, default: true)", + Required: false, + }, + }, + }, + Handler: clusterHealthCheckHandler, + }, + } } func init() { From 30b29f1fc1c3cc03b4e112b233068ca276498502 Mon Sep 17 00:00:00 2001 From: Rohit Patil Date: Mon, 15 Dec 2025 22:40:51 +0530 Subject: [PATCH 2/6] test(core): fix cluster health check prompt test Signed-off-by: Rohit Patil --- pkg/toolsets/core/health_check_test.go | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pkg/toolsets/core/health_check_test.go b/pkg/toolsets/core/health_check_test.go index 
eac096acc..f70351da4 100644 --- a/pkg/toolsets/core/health_check_test.go +++ b/pkg/toolsets/core/health_check_test.go @@ -4,8 +4,6 @@ import ( "testing" "github.com/stretchr/testify/suite" - - "github.com/containers/kubernetes-mcp-server/pkg/prompts" ) type ClusterHealthCheckSuite struct { @@ -13,11 +11,19 @@ type ClusterHealthCheckSuite struct { } func (s *ClusterHealthCheckSuite) TestPromptIsRegistered() { - s.Run("cluster-health-check prompt is registered", func() { - configPrompts := prompts.ConfigPrompts() + s.Run("cluster-health-check prompt is registered via GetPrompts", func() { + // Create a new instance of the core toolset + toolset := &Toolset{} + + // Get prompts from the toolset + prompts := toolset.GetPrompts() + + s.Require().NotNil(prompts, "GetPrompts should not return nil") + s.Require().NotEmpty(prompts, "GetPrompts should return at least one prompt") + // Find the cluster-health-check prompt var foundHealthCheck bool - for _, prompt := range configPrompts { + for _, prompt := range prompts { if prompt.Prompt.Name == "cluster-health-check" { foundHealthCheck = true @@ -31,14 +37,17 @@ func (s *ClusterHealthCheckSuite) TestPromptIsRegistered() { // Check namespace argument s.Equal("namespace", prompt.Prompt.Arguments[0].Name) + s.NotEmpty(prompt.Prompt.Arguments[0].Description) s.False(prompt.Prompt.Arguments[0].Required) // Check verbose argument s.Equal("verbose", prompt.Prompt.Arguments[1].Name) + s.NotEmpty(prompt.Prompt.Arguments[1].Description) s.False(prompt.Prompt.Arguments[1].Required) // Check check_events argument s.Equal("check_events", prompt.Prompt.Arguments[2].Name) + s.NotEmpty(prompt.Prompt.Arguments[2].Description) s.False(prompt.Prompt.Arguments[2].Required) // Verify handler is set From 75b285a103ba7230916b2ddf5b6e8ea1720c57bd Mon Sep 17 00:00:00 2001 From: Rohit Patil Date: Tue, 16 Dec 2025 12:18:14 +0530 Subject: [PATCH 3/6] test: accept prompts change notifications in watch tests Signed-off-by: Rohit Patil --- pkg/mcp/mcp_watch_test.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pkg/mcp/mcp_watch_test.go b/pkg/mcp/mcp_watch_test.go index 0d80716d3..dbbabd2be 100644 --- a/pkg/mcp/mcp_watch_test.go +++ b/pkg/mcp/mcp_watch_test.go @@ -83,7 +83,10 @@ func (s *WatchKubeConfigSuite) TestNotifiesToolsChangeMultipleTimes() { notification := s.WaitForNotification(5*time.Second, "notifications/tools/list_changed") // Then s.NotNil(notification, "WatchKubeConfig did not notify on iteration %d", i) - s.Equalf("notifications/tools/list_changed", notification.Method, "WatchKubeConfig did not notify tools change on iteration %d", i) + s.True( + notification.Method == "notifications/tools/list_changed" || notification.Method == "notifications/prompts/list_changed", + "WatchKubeConfig did not notify tools or prompts change on iteration %d, got: %s", i, notification.Method, + ) } } @@ -190,7 +193,10 @@ func (s *WatchClusterStateSuite) TestNotifiesToolsChangeMultipleTimes() { s.AddAPIGroup(`{"name":"` + name + `.example.com","versions":[{"groupVersion":"` + name + `.example.com/v1","version":"v1"}],"preferredVersion":{"groupVersion":"` + name + `.example.com/v1","version":"v1"}}`) notification := s.WaitForNotification(5*time.Second, "notifications/tools/list_changed") s.NotNil(notification, "cluster state watcher did not notify on iteration %d", i) - s.Equalf("notifications/tools/list_changed", notification.Method, "cluster state watcher did not notify tools change on iteration %d", i) + s.True( + notification.Method == 
"notifications/tools/list_changed" || notification.Method == "notifications/prompts/list_changed", + "cluster state watcher did not notify tools or prompts change on iteration %d, got: %s", i, notification.Method, + ) } } From c3f32174447d7e36dead575bf594ad03e0868c19 Mon Sep 17 00:00:00 2001 From: Rohit Patil Date: Thu, 18 Dec 2025 11:51:15 +0530 Subject: [PATCH 4/6] fix: align tests with PR #591 and update to new API from PR #589 Signed-off-by: Rohit Patil --- pkg/mcp/mcp_watch_test.go | 10 ++-------- pkg/toolsets/core/health_check.go | 21 +++++++++++---------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/pkg/mcp/mcp_watch_test.go b/pkg/mcp/mcp_watch_test.go index dbbabd2be..0d80716d3 100644 --- a/pkg/mcp/mcp_watch_test.go +++ b/pkg/mcp/mcp_watch_test.go @@ -83,10 +83,7 @@ func (s *WatchKubeConfigSuite) TestNotifiesToolsChangeMultipleTimes() { notification := s.WaitForNotification(5*time.Second, "notifications/tools/list_changed") // Then s.NotNil(notification, "WatchKubeConfig did not notify on iteration %d", i) - s.True( - notification.Method == "notifications/tools/list_changed" || notification.Method == "notifications/prompts/list_changed", - "WatchKubeConfig did not notify tools or prompts change on iteration %d, got: %s", i, notification.Method, - ) + s.Equalf("notifications/tools/list_changed", notification.Method, "WatchKubeConfig did not notify tools change on iteration %d", i) } } @@ -193,10 +190,7 @@ func (s *WatchClusterStateSuite) TestNotifiesToolsChangeMultipleTimes() { s.AddAPIGroup(`{"name":"` + name + `.example.com","versions":[{"groupVersion":"` + name + `.example.com/v1","version":"v1"}],"preferredVersion":{"groupVersion":"` + name + `.example.com/v1","version":"v1"}}`) notification := s.WaitForNotification(5*time.Second, "notifications/tools/list_changed") s.NotNil(notification, "cluster state watcher did not notify on iteration %d", i) - s.True( - notification.Method == "notifications/tools/list_changed" || notification.Method == "notifications/prompts/list_changed", - "cluster state watcher did not notify tools or prompts change on iteration %d, got: %s", i, notification.Method, - ) + s.Equalf("notifications/tools/list_changed", notification.Method, "cluster state watcher did not notify tools change on iteration %d", i) } } diff --git a/pkg/toolsets/core/health_check.go b/pkg/toolsets/core/health_check.go index 4566b04b1..c85f451d8 100644 --- a/pkg/toolsets/core/health_check.go +++ b/pkg/toolsets/core/health_check.go @@ -9,6 +9,7 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" "github.com/containers/kubernetes-mcp-server/pkg/api" + "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" ) // clusterHealthCheckHandler implements the cluster health check prompt @@ -29,7 +30,7 @@ func clusterHealthCheckHandler(params api.PromptHandlerParams) (*api.PromptCallR Version: "v1", Kind: "Namespace", } - _, err := params.ResourcesGet(params, nsGVK, "", namespace) + _, err := kubernetes.NewCore(params).ResourcesGet(params, nsGVK, "", namespace) if err != nil { // Namespace doesn't exist - show warning and proceed with cluster-wide check namespaceWarning = fmt.Sprintf("Namespace '%s' not found or not accessible. 
Showing cluster-wide information instead.", namespace) @@ -149,7 +150,7 @@ func gatherClusterDiagnostics(params api.PromptHandlerParams, namespace string, } // Count namespaces - namespaceList, err := params.NamespacesList(params, api.ListOptions{}) + namespaceList, err := kubernetes.NewCore(params).NamespacesList(params, api.ListOptions{}) if err == nil { if items, ok := namespaceList.UnstructuredContent()["items"].([]interface{}); ok { diag.TotalNamespaces = len(items) @@ -167,7 +168,7 @@ func gatherNodeDiagnostics(params api.PromptHandlerParams) (string, error) { Kind: "Node", } - nodeList, err := params.ResourcesList(params, gvk, "", api.ListOptions{}) + nodeList, err := kubernetes.NewCore(params).ResourcesList(params, gvk, "", api.ListOptions{}) if err != nil { return "", err } @@ -240,9 +241,9 @@ func gatherPodDiagnostics(params api.PromptHandlerParams, namespace string) (str var err error if namespace != "" { - podList, err = params.PodsListInNamespace(params, namespace, api.ListOptions{}) + podList, err = kubernetes.NewCore(params).PodsListInNamespace(params, namespace, api.ListOptions{}) } else { - podList, err = params.PodsListInAllNamespaces(params, api.ListOptions{}) + podList, err = kubernetes.NewCore(params).PodsListInAllNamespaces(params, api.ListOptions{}) } if err != nil { @@ -334,7 +335,7 @@ func gatherWorkloadDiagnostics(params api.PromptHandlerParams, kind string, name Kind: kind, } - workloadList, err := params.ResourcesList(params, gvk, namespace, api.ListOptions{}) + workloadList, err := kubernetes.NewCore(params).ResourcesList(params, gvk, namespace, api.ListOptions{}) if err != nil { return "", err } @@ -420,7 +421,7 @@ func gatherPVCDiagnostics(params api.PromptHandlerParams, namespace string) (str Kind: "PersistentVolumeClaim", } - pvcList, err := params.ResourcesList(params, gvk, namespace, api.ListOptions{}) + pvcList, err := kubernetes.NewCore(params).ResourcesList(params, gvk, namespace, api.ListOptions{}) if err != nil { return "", err } @@ -469,7 +470,7 @@ func gatherClusterOperatorDiagnostics(params api.PromptHandlerParams) (string, e Kind: "ClusterOperator", } - operatorList, err := params.ResourcesList(params, gvk, "", api.ListOptions{}) + operatorList, err := kubernetes.NewCore(params).ResourcesList(params, gvk, "", api.ListOptions{}) if err != nil { // Not an OpenShift cluster return "", err @@ -546,7 +547,7 @@ func gatherEventDiagnostics(params api.PromptHandlerParams, namespace string) (s namespaces = []string{"default", "kube-system"} // Add OpenShift namespaces - nsList, err := params.NamespacesList(params, api.ListOptions{}) + nsList, err := kubernetes.NewCore(params).NamespacesList(params, api.ListOptions{}) if err == nil { if items, ok := nsList.UnstructuredContent()["items"].([]interface{}); ok { for _, item := range items { @@ -570,7 +571,7 @@ func gatherEventDiagnostics(params api.PromptHandlerParams, namespace string) (s recentEvents := []string{} for _, ns := range namespaces { - eventMaps, err := params.EventsList(params, ns) + eventMaps, err := kubernetes.NewCore(params).EventsList(params, ns) if err != nil { continue } From 60c6270b14e60bce83abf6955272632a7d1af44a Mon Sep 17 00:00:00 2001 From: Rohit Patil Date: Thu, 18 Dec 2025 21:48:00 +0530 Subject: [PATCH 5/6] refactor(core): add logging and improve health check API - Remove verbose argument, add klog progress logging - Use CoreV1 clientset directly for cleaner code - Extract prompt initialization to initHealthChecks() Signed-off-by: Rohit Patil --- pkg/toolsets/core/health_check.go 
| 160 +++++++++++++++---------- pkg/toolsets/core/health_check_test.go | 11 +- pkg/toolsets/core/toolset.go | 30 +---- 3 files changed, 102 insertions(+), 99 deletions(-) diff --git a/pkg/toolsets/core/health_check.go b/pkg/toolsets/core/health_check.go index c85f451d8..756ba8c4e 100644 --- a/pkg/toolsets/core/health_check.go +++ b/pkg/toolsets/core/health_check.go @@ -6,7 +6,9 @@ import ( "time" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/klog/v2" "github.com/containers/kubernetes-mcp-server/pkg/api" "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" @@ -14,32 +16,32 @@ import ( // clusterHealthCheckHandler implements the cluster health check prompt func clusterHealthCheckHandler(params api.PromptHandlerParams) (*api.PromptCallResult, error) { + // Parse arguments (GetArguments returns map[string]string for prompts) args := params.GetArguments() - - // Parse arguments namespace := args["namespace"] - verbose := args["verbose"] == "true" checkEvents := args["check_events"] != "false" // default true + klog.Info("Starting cluster health check...") + // Check if namespace exists if specified namespaceWarning := "" requestedNamespace := namespace if namespace != "" { - nsGVK := &schema.GroupVersionKind{ - Group: "", - Version: "v1", - Kind: "Namespace", - } - _, err := kubernetes.NewCore(params).ResourcesGet(params, nsGVK, "", namespace) + _, err := params.CoreV1().Namespaces().Get(params.Context, namespace, metav1.GetOptions{}) if err != nil { // Namespace doesn't exist - show warning and proceed with cluster-wide check namespaceWarning = fmt.Sprintf("Namespace '%s' not found or not accessible. Showing cluster-wide information instead.", namespace) namespace = "" // Fall back to cluster-wide check + klog.Warningf("Namespace '%s' not found, performing cluster-wide health check", requestedNamespace) + } else { + klog.Infof("Performing health check for namespace: %s", namespace) } + } else { + klog.Info("Performing cluster-wide health check") } // Gather cluster diagnostics using the KubernetesClient interface - diagnostics, err := gatherClusterDiagnostics(params, namespace, verbose, checkEvents) + diagnostics, err := gatherClusterDiagnostics(params, namespace, checkEvents) if err != nil { return nil, fmt.Errorf("failed to gather cluster diagnostics: %w", err) } @@ -94,7 +96,7 @@ type clusterDiagnostics struct { } // gatherClusterDiagnostics collects comprehensive diagnostic data from the cluster -func gatherClusterDiagnostics(params api.PromptHandlerParams, namespace string, verbose bool, checkEvents bool) (*clusterDiagnostics, error) { +func gatherClusterDiagnostics(params api.PromptHandlerParams, namespace string, checkEvents bool) (*clusterDiagnostics, error) { diag := &clusterDiagnostics{ CollectionTime: time.Now(), NamespaceScoped: namespace != "", @@ -102,126 +104,136 @@ func gatherClusterDiagnostics(params api.PromptHandlerParams, namespace string, } // Gather node diagnostics using ResourcesList + klog.Info("Collecting node diagnostics...") nodeDiag, err := gatherNodeDiagnostics(params) if err == nil { diag.Nodes = nodeDiag + klog.Info("Node diagnostics collected") + } else { + klog.Warningf("Failed to collect node diagnostics: %v", err) } // Gather pod diagnostics + klog.Info("Collecting pod diagnostics...") podDiag, err := gatherPodDiagnostics(params, namespace) if err == nil { diag.Pods = podDiag + klog.Info("Pod diagnostics collected") + } else { + klog.Warningf("Failed to collect pod 
diagnostics: %v", err) } // Gather workload diagnostics + klog.Info("Collecting deployment diagnostics...") deployDiag, err := gatherWorkloadDiagnostics(params, "Deployment", namespace) if err == nil { diag.Deployments = deployDiag + klog.Info("Deployment diagnostics collected") + } else { + klog.Warningf("Failed to collect deployment diagnostics: %v", err) } + klog.Info("Collecting statefulset diagnostics...") stsDiag, err := gatherWorkloadDiagnostics(params, "StatefulSet", namespace) if err == nil { diag.StatefulSets = stsDiag + klog.Info("StatefulSet diagnostics collected") + } else { + klog.Warningf("Failed to collect statefulset diagnostics: %v", err) } + klog.Info("Collecting daemonset diagnostics...") dsDiag, err := gatherWorkloadDiagnostics(params, "DaemonSet", namespace) if err == nil { diag.DaemonSets = dsDiag + klog.Info("DaemonSet diagnostics collected") + } else { + klog.Warningf("Failed to collect daemonset diagnostics: %v", err) } // Gather PVC diagnostics + klog.Info("Collecting PVC diagnostics...") pvcDiag, err := gatherPVCDiagnostics(params, namespace) if err == nil { diag.PVCs = pvcDiag + klog.Info("PVC diagnostics collected") + } else { + klog.Warningf("Failed to collect PVC diagnostics: %v", err) } // Gather cluster operator diagnostics (OpenShift only) + klog.Info("Checking for cluster operators (OpenShift)...") operatorDiag, err := gatherClusterOperatorDiagnostics(params) if err == nil { diag.ClusterOperators = operatorDiag + klog.Info("Cluster operator diagnostics collected") } // Gather recent events if requested if checkEvents { + klog.Info("Collecting recent events...") eventDiag, err := gatherEventDiagnostics(params, namespace) if err == nil { diag.Events = eventDiag + klog.Info("Event diagnostics collected") + } else { + klog.Warningf("Failed to collect event diagnostics: %v", err) } } // Count namespaces + klog.Info("Counting namespaces...") namespaceList, err := kubernetes.NewCore(params).NamespacesList(params, api.ListOptions{}) if err == nil { if items, ok := namespaceList.UnstructuredContent()["items"].([]interface{}); ok { diag.TotalNamespaces = len(items) + klog.Infof("Found %d namespaces", diag.TotalNamespaces) } } + klog.Info("Cluster health check data collection completed") return diag, nil } -// gatherNodeDiagnostics collects node status using ResourcesList +// gatherNodeDiagnostics collects node status using CoreV1 clientset func gatherNodeDiagnostics(params api.PromptHandlerParams) (string, error) { - gvk := &schema.GroupVersionKind{ - Group: "", - Version: "v1", - Kind: "Node", - } - - nodeList, err := kubernetes.NewCore(params).ResourcesList(params, gvk, "", api.ListOptions{}) + nodeList, err := params.CoreV1().Nodes().List(params.Context, metav1.ListOptions{}) if err != nil { return "", err } - items, ok := nodeList.UnstructuredContent()["items"].([]interface{}) - if !ok || len(items) == 0 { + if len(nodeList.Items) == 0 { return "No nodes found", nil } var sb strings.Builder - totalNodes := len(items) + totalNodes := len(nodeList.Items) healthyNodes := 0 nodesWithIssues := []string{} - for _, item := range items { - nodeMap, ok := item.(map[string]interface{}) - if !ok { - continue - } - - metadata, _ := nodeMap["metadata"].(map[string]interface{}) - name, _ := metadata["name"].(string) - - status, _ := nodeMap["status"].(map[string]interface{}) - conditions, _ := status["conditions"].([]interface{}) - + for _, node := range nodeList.Items { nodeStatus := "Unknown" issues := []string{} // Parse node conditions - for _, cond := range conditions 
{ - condMap, _ := cond.(map[string]interface{}) - condType, _ := condMap["type"].(string) - condStatus, _ := condMap["status"].(string) - message, _ := condMap["message"].(string) - - if condType == "Ready" { - if condStatus == "True" { + for _, cond := range node.Status.Conditions { + if cond.Type == v1.NodeReady { + if cond.Status == v1.ConditionTrue { nodeStatus = "Ready" healthyNodes++ } else { nodeStatus = "NotReady" - issues = append(issues, fmt.Sprintf("Not ready: %s", message)) + issues = append(issues, fmt.Sprintf("Not ready: %s", cond.Message)) } - } else if condStatus == "True" && condType != "Ready" { + } else if cond.Status == v1.ConditionTrue { // Pressure conditions - issues = append(issues, fmt.Sprintf("%s: %s", condType, message)) + issues = append(issues, fmt.Sprintf("%s: %s", cond.Type, cond.Message)) } } // Only report nodes with issues if len(issues) > 0 { - nodesWithIssues = append(nodesWithIssues, fmt.Sprintf("- **%s** (Status: %s)\n%s", name, nodeStatus, " - "+strings.Join(issues, "\n - "))) + nodesWithIssues = append(nodesWithIssues, fmt.Sprintf("- **%s** (Status: %s)\n%s", node.Name, nodeStatus, " - "+strings.Join(issues, "\n - "))) } } @@ -571,46 +583,40 @@ func gatherEventDiagnostics(params api.PromptHandlerParams, namespace string) (s recentEvents := []string{} for _, ns := range namespaces { - eventMaps, err := kubernetes.NewCore(params).EventsList(params, ns) + eventList, err := params.CoreV1().Events(ns).List(params.Context, metav1.ListOptions{}) if err != nil { continue } - for _, eventMap := range eventMaps { - eventType, _ := eventMap["type"].(string) - + for _, event := range eventList.Items { // Only include Warning and Error events - if eventType != string(v1.EventTypeWarning) && eventType != "Error" { + if event.Type != string(v1.EventTypeWarning) && event.Type != "Error" { continue } - lastSeen, _ := eventMap["lastTimestamp"].(string) - lastSeenTime, err := time.Parse(time.RFC3339, lastSeen) - if err != nil || lastSeenTime.Before(oneHourAgo) { + // Check timestamp + lastSeenTime := event.LastTimestamp.Time + if lastSeenTime.IsZero() { + lastSeenTime = event.EventTime.Time + } + if lastSeenTime.Before(oneHourAgo) { continue } - reason, _ := eventMap["reason"].(string) - message, _ := eventMap["message"].(string) - count, _ := eventMap["count"].(int32) - - involvedObject, _ := eventMap["involvedObject"].(map[string]interface{}) - objectKind, _ := involvedObject["kind"].(string) - objectName, _ := involvedObject["name"].(string) - - if eventType == string(v1.EventTypeWarning) { + if event.Type == string(v1.EventTypeWarning) { totalWarnings++ } else { totalErrors++ } // Limit message length + message := event.Message if len(message) > 150 { message = message[:150] + "..." 
} recentEvents = append(recentEvents, fmt.Sprintf("- **%s/%s** in `%s` (%s, Count: %d)\n - %s", - objectKind, objectName, ns, reason, count, message)) + event.InvolvedObject.Kind, event.InvolvedObject.Name, ns, event.Reason, event.Count, message)) } } @@ -716,3 +722,29 @@ func formatHealthCheckPrompt(diag *clusterDiagnostics) string { return sb.String() } + +// initHealthChecks initializes the cluster health check prompts +func initHealthChecks() []api.ServerPrompt { + return []api.ServerPrompt{ + { + Prompt: api.Prompt{ + Name: "cluster-health-check", + Title: "Cluster Health Check", + Description: "Perform comprehensive health assessment of Kubernetes/OpenShift cluster", + Arguments: []api.PromptArgument{ + { + Name: "namespace", + Description: "Optional namespace to limit health check scope (default: all namespaces)", + Required: false, + }, + { + Name: "check_events", + Description: "Include recent warning/error events (true/false, default: true)", + Required: false, + }, + }, + }, + Handler: clusterHealthCheckHandler, + }, + } +} diff --git a/pkg/toolsets/core/health_check_test.go b/pkg/toolsets/core/health_check_test.go index f70351da4..64733ec2f 100644 --- a/pkg/toolsets/core/health_check_test.go +++ b/pkg/toolsets/core/health_check_test.go @@ -33,23 +33,18 @@ func (s *ClusterHealthCheckSuite) TestPromptIsRegistered() { s.Contains(prompt.Prompt.Description, "comprehensive health assessment") // Verify arguments - s.Require().Len(prompt.Prompt.Arguments, 3, "should have 3 arguments") + s.Require().Len(prompt.Prompt.Arguments, 2, "should have 2 arguments") // Check namespace argument s.Equal("namespace", prompt.Prompt.Arguments[0].Name) s.NotEmpty(prompt.Prompt.Arguments[0].Description) s.False(prompt.Prompt.Arguments[0].Required) - // Check verbose argument - s.Equal("verbose", prompt.Prompt.Arguments[1].Name) + // Check check_events argument + s.Equal("check_events", prompt.Prompt.Arguments[1].Name) s.NotEmpty(prompt.Prompt.Arguments[1].Description) s.False(prompt.Prompt.Arguments[1].Required) - // Check check_events argument - s.Equal("check_events", prompt.Prompt.Arguments[2].Name) - s.NotEmpty(prompt.Prompt.Arguments[2].Description) - s.False(prompt.Prompt.Arguments[2].Required) - // Verify handler is set s.NotNil(prompt.Handler, "handler should be set") diff --git a/pkg/toolsets/core/toolset.go b/pkg/toolsets/core/toolset.go index 7d6375a26..536b9428c 100644 --- a/pkg/toolsets/core/toolset.go +++ b/pkg/toolsets/core/toolset.go @@ -30,33 +30,9 @@ func (t *Toolset) GetTools(o api.Openshift) []api.ServerTool { } func (t *Toolset) GetPrompts() []api.ServerPrompt { - return []api.ServerPrompt{ - { - Prompt: api.Prompt{ - Name: "cluster-health-check", - Title: "Cluster Health Check", - Description: "Perform comprehensive health assessment of Kubernetes/OpenShift cluster", - Arguments: []api.PromptArgument{ - { - Name: "namespace", - Description: "Optional namespace to limit health check scope (default: all namespaces)", - Required: false, - }, - { - Name: "verbose", - Description: "Enable detailed resource-level information (true/false, default: false)", - Required: false, - }, - { - Name: "check_events", - Description: "Include recent warning/error events (true/false, default: true)", - Required: false, - }, - }, - }, - Handler: clusterHealthCheckHandler, - }, - } + return slices.Concat( + initHealthChecks(), + ) } func init() { From cea59e4e7963f0279ace316759b78758e115d905 Mon Sep 17 00:00:00 2001 From: Rohit Patil Date: Fri, 19 Dec 2025 10:42:32 +0530 Subject: [PATCH 6/6] 
refactor(core): use typed clientsets in health check diagnostics Replace unstructured resource access with typed CoreV1 and AppsV1 clientsets for improved type safety and code clarity. Signed-off-by: Rohit Patil --- pkg/toolsets/core/health_check.go | 212 +++++++++++++----------------- 1 file changed, 88 insertions(+), 124 deletions(-) diff --git a/pkg/toolsets/core/health_check.go b/pkg/toolsets/core/health_check.go index 756ba8c4e..4abfcd792 100644 --- a/pkg/toolsets/core/health_check.go +++ b/pkg/toolsets/core/health_check.go @@ -247,69 +247,44 @@ func gatherNodeDiagnostics(params api.PromptHandlerParams) (string, error) { return sb.String(), nil } -// gatherPodDiagnostics collects pod status using existing methods +// gatherPodDiagnostics collects pod status using CoreV1 clientset func gatherPodDiagnostics(params api.PromptHandlerParams, namespace string) (string, error) { - var podList interface{ UnstructuredContent() map[string]interface{} } - var err error - - if namespace != "" { - podList, err = kubernetes.NewCore(params).PodsListInNamespace(params, namespace, api.ListOptions{}) - } else { - podList, err = kubernetes.NewCore(params).PodsListInAllNamespaces(params, api.ListOptions{}) - } - + podList, err := params.CoreV1().Pods(namespace).List(params.Context, metav1.ListOptions{}) if err != nil { return "", err } - items, ok := podList.UnstructuredContent()["items"].([]interface{}) - if !ok { + if len(podList.Items) == 0 { return "No pods found", nil } - totalPods := len(items) + totalPods := len(podList.Items) problemPods := []string{} - for _, item := range items { - podMap, ok := item.(map[string]interface{}) - if !ok { - continue - } - - metadata, _ := podMap["metadata"].(map[string]interface{}) - name, _ := metadata["name"].(string) - ns, _ := metadata["namespace"].(string) - - status, _ := podMap["status"].(map[string]interface{}) - phase, _ := status["phase"].(string) - containerStatuses, _ := status["containerStatuses"].([]interface{}) - + for _, pod := range podList.Items { issues := []string{} restarts := int32(0) readyCount := 0 - totalContainers := len(containerStatuses) + totalContainers := len(pod.Status.ContainerStatuses) // Check container statuses - for _, cs := range containerStatuses { - csMap, _ := cs.(map[string]interface{}) - ready, _ := csMap["ready"].(bool) - restartCount, _ := csMap["restartCount"].(float64) - restarts += int32(restartCount) - - if ready { + for _, cs := range pod.Status.ContainerStatuses { + if cs.Ready { readyCount++ } + restarts += cs.RestartCount - state, _ := csMap["state"].(map[string]interface{}) - if waiting, ok := state["waiting"].(map[string]interface{}); ok { - reason, _ := waiting["reason"].(string) - message, _ := waiting["message"].(string) + // Check waiting state + if cs.State.Waiting != nil { + reason := cs.State.Waiting.Reason if reason == "CrashLoopBackOff" || reason == "ImagePullBackOff" || reason == "ErrImagePull" { - issues = append(issues, fmt.Sprintf("Container waiting: %s - %s", reason, message)) + issues = append(issues, fmt.Sprintf("Container waiting: %s - %s", reason, cs.State.Waiting.Message)) } } - if terminated, ok := state["terminated"].(map[string]interface{}); ok { - reason, _ := terminated["reason"].(string) + + // Check terminated state + if cs.State.Terminated != nil { + reason := cs.State.Terminated.Reason if reason == "Error" || reason == "OOMKilled" { issues = append(issues, fmt.Sprintf("Container terminated: %s", reason)) } @@ -317,14 +292,14 @@ func gatherPodDiagnostics(params 
api.PromptHandlerParams, namespace string) (str } // Check pod phase - if phase != "Running" && phase != "Succeeded" { - issues = append(issues, fmt.Sprintf("Pod in %s phase", phase)) + if pod.Status.Phase != v1.PodRunning && pod.Status.Phase != v1.PodSucceeded { + issues = append(issues, fmt.Sprintf("Pod in %s phase", pod.Status.Phase)) } // Report pods with issues or high restart count if len(issues) > 0 || restarts > 5 { problemPods = append(problemPods, fmt.Sprintf("- **%s/%s** (Phase: %s, Ready: %d/%d, Restarts: %d)\n - %s", - ns, name, phase, readyCount, totalContainers, restarts, strings.Join(issues, "\n - "))) + pod.Namespace, pod.Name, pod.Status.Phase, readyCount, totalContainers, restarts, strings.Join(issues, "\n - "))) } } @@ -339,79 +314,86 @@ func gatherPodDiagnostics(params api.PromptHandlerParams, namespace string) (str return sb.String(), nil } -// gatherWorkloadDiagnostics collects workload controller status +// gatherWorkloadDiagnostics collects workload controller status using AppsV1 clientset func gatherWorkloadDiagnostics(params api.PromptHandlerParams, kind string, namespace string) (string, error) { - gvk := &schema.GroupVersionKind{ - Group: "apps", - Version: "v1", - Kind: kind, - } - - workloadList, err := kubernetes.NewCore(params).ResourcesList(params, gvk, namespace, api.ListOptions{}) - if err != nil { - return "", err - } - - items, ok := workloadList.UnstructuredContent()["items"].([]interface{}) - if !ok || len(items) == 0 { - return fmt.Sprintf("No %ss found", kind), nil - } - workloadsWithIssues := []string{} - for _, item := range items { - workloadMap, ok := item.(map[string]interface{}) - if !ok { - continue + switch kind { + case "Deployment": + deploymentList, err := params.AppsV1().Deployments(namespace).List(params.Context, metav1.ListOptions{}) + if err != nil { + return "", err + } + if len(deploymentList.Items) == 0 { + return "No Deployments found", nil } - metadata, _ := workloadMap["metadata"].(map[string]interface{}) - name, _ := metadata["name"].(string) - ns, _ := metadata["namespace"].(string) + for _, deployment := range deploymentList.Items { + issues := []string{} + ready := fmt.Sprintf("%d/%d", deployment.Status.ReadyReplicas, deployment.Status.Replicas) - status, _ := workloadMap["status"].(map[string]interface{}) - spec, _ := workloadMap["spec"].(map[string]interface{}) - issues := []string{} - ready := "Unknown" + if deployment.Status.UnavailableReplicas > 0 { + issues = append(issues, fmt.Sprintf("%d replicas unavailable", deployment.Status.UnavailableReplicas)) + } - switch kind { - case "Deployment": - replicas, _ := status["replicas"].(float64) - readyReplicas, _ := status["readyReplicas"].(float64) - unavailableReplicas, _ := status["unavailableReplicas"].(float64) + if len(issues) > 0 { + workloadsWithIssues = append(workloadsWithIssues, fmt.Sprintf("- **%s/%s** (Ready: %s)\n - %s", + deployment.Namespace, deployment.Name, ready, strings.Join(issues, "\n - "))) + } + } - ready = fmt.Sprintf("%d/%d", int(readyReplicas), int(replicas)) + case "StatefulSet": + statefulSetList, err := params.AppsV1().StatefulSets(namespace).List(params.Context, metav1.ListOptions{}) + if err != nil { + return "", err + } + if len(statefulSetList.Items) == 0 { + return "No StatefulSets found", nil + } - if unavailableReplicas > 0 { - issues = append(issues, fmt.Sprintf("%d replicas unavailable", int(unavailableReplicas))) + for _, sts := range statefulSetList.Items { + issues := []string{} + specReplicas := int32(1) + if sts.Spec.Replicas != nil 
{ + specReplicas = *sts.Spec.Replicas } + ready := fmt.Sprintf("%d/%d", sts.Status.ReadyReplicas, specReplicas) - case "StatefulSet": - specReplicas, _ := spec["replicas"].(float64) - readyReplicas, _ := status["readyReplicas"].(float64) - - ready = fmt.Sprintf("%d/%d", int(readyReplicas), int(specReplicas)) + if sts.Status.ReadyReplicas < specReplicas { + issues = append(issues, fmt.Sprintf("Only %d/%d replicas ready", sts.Status.ReadyReplicas, specReplicas)) + } - if readyReplicas < specReplicas { - issues = append(issues, fmt.Sprintf("Only %d/%d replicas ready", int(readyReplicas), int(specReplicas))) + if len(issues) > 0 { + workloadsWithIssues = append(workloadsWithIssues, fmt.Sprintf("- **%s/%s** (Ready: %s)\n - %s", + sts.Namespace, sts.Name, ready, strings.Join(issues, "\n - "))) } + } - case "DaemonSet": - desiredNumberScheduled, _ := status["desiredNumberScheduled"].(float64) - numberReady, _ := status["numberReady"].(float64) - numberUnavailable, _ := status["numberUnavailable"].(float64) + case "DaemonSet": + daemonSetList, err := params.AppsV1().DaemonSets(namespace).List(params.Context, metav1.ListOptions{}) + if err != nil { + return "", err + } + if len(daemonSetList.Items) == 0 { + return "No DaemonSets found", nil + } - ready = fmt.Sprintf("%d/%d", int(numberReady), int(desiredNumberScheduled)) + for _, ds := range daemonSetList.Items { + issues := []string{} + ready := fmt.Sprintf("%d/%d", ds.Status.NumberReady, ds.Status.DesiredNumberScheduled) - if numberUnavailable > 0 { - issues = append(issues, fmt.Sprintf("%d pods unavailable", int(numberUnavailable))) + if ds.Status.NumberUnavailable > 0 { + issues = append(issues, fmt.Sprintf("%d pods unavailable", ds.Status.NumberUnavailable)) } - } - if len(issues) > 0 { - workloadsWithIssues = append(workloadsWithIssues, fmt.Sprintf("- **%s/%s** (Ready: %s)\n - %s", - ns, name, ready, strings.Join(issues, "\n - "))) + if len(issues) > 0 { + workloadsWithIssues = append(workloadsWithIssues, fmt.Sprintf("- **%s/%s** (Ready: %s)\n - %s", + ds.Namespace, ds.Name, ready, strings.Join(issues, "\n - "))) + } } + + default: + return "", fmt.Errorf("unsupported workload kind: %s", kind) } var sb strings.Builder @@ -425,41 +407,23 @@ func gatherWorkloadDiagnostics(params api.PromptHandlerParams, kind string, name return sb.String(), nil } -// gatherPVCDiagnostics collects PVC status +// gatherPVCDiagnostics collects PVC status using CoreV1 clientset func gatherPVCDiagnostics(params api.PromptHandlerParams, namespace string) (string, error) { - gvk := &schema.GroupVersionKind{ - Group: "", - Version: "v1", - Kind: "PersistentVolumeClaim", - } - - pvcList, err := kubernetes.NewCore(params).ResourcesList(params, gvk, namespace, api.ListOptions{}) + pvcList, err := params.CoreV1().PersistentVolumeClaims(namespace).List(params.Context, metav1.ListOptions{}) if err != nil { return "", err } - items, ok := pvcList.UnstructuredContent()["items"].([]interface{}) - if !ok || len(items) == 0 { + if len(pvcList.Items) == 0 { return "No PVCs found", nil } pvcsWithIssues := []string{} - for _, item := range items { - pvcMap, ok := item.(map[string]interface{}) - if !ok { - continue - } - - metadata, _ := pvcMap["metadata"].(map[string]interface{}) - name, _ := metadata["name"].(string) - ns, _ := metadata["namespace"].(string) - - status, _ := pvcMap["status"].(map[string]interface{}) - phase, _ := status["phase"].(string) - - if phase != "Bound" { - pvcsWithIssues = append(pvcsWithIssues, fmt.Sprintf("- **%s/%s** (Status: %s)\n - PVC not bound", ns, 
name, phase)) + for _, pvc := range pvcList.Items { + if pvc.Status.Phase != v1.ClaimBound { + pvcsWithIssues = append(pvcsWithIssues, fmt.Sprintf("- **%s/%s** (Status: %s)\n - PVC not bound", + pvc.Namespace, pvc.Name, pvc.Status.Phase)) } }