diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..fa00365 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,306 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +kAgent Tools is a Go-based MCP (Model Context Protocol) server providing 118+ tools for Kubernetes, cloud-native, and observability operations. Originally migrated from Python, it offers comprehensive functionality for k8s, Helm, Istio, Argo, Cilium, Prometheus, and more. + +## Architecture + +### Core Structure + +``` +cmd/main.go - Entry point, MCP server initialization, tool registration +internal/ - Shared utilities (logger, cache, telemetry, metrics, security) +pkg/ - Tool providers (k8s, helm, istio, argo, cilium, prometheus, utils) +helm/kagent-tools/ - Kubernetes deployment via Helm +test/e2e/ - End-to-end tests using Ginkgo +``` + +### Tool Provider Pattern + +Each `pkg/*/` directory follows this pattern: + +1. **Tool struct** - Holds provider-specific state (e.g., kubeconfig, LLM model) +2. **Handler functions** - Named with a `handle` prefix (e.g., `handleKubectlDescribe`); they parse MCP request parameters and execute operations +3. **RegisterTools function** - Signature: `RegisterTools(s *server.MCPServer, ...)` - Adds all tools from this provider to the MCP server + +Example from `pkg/k8s/k8s.go`: +```go +func RegisterTools(s *server.MCPServer, llm llms.Model, kubeconfig string, readOnly bool) { + k8sTool := NewK8sToolWithConfig(kubeconfig, llm) + + s.AddTool(mcp.Tool{...}, k8sTool.handleKubectlGetEnhanced) + s.AddTool(mcp.Tool{...}, k8sTool.handleKubectlDescribe) + // ... +} +``` + +**Key convention**: All tool providers export a `RegisterTools` function that gets called from `cmd/main.go:registerMCP()`. + +### MCP Server Lifecycle + +1. 
**Initialization** (`cmd/main.go:run()`): + - Parse CLI flags (`--port`, `--metrics-port`, `--tools`, `--read-only`, `--kubeconfig`) + - Create MCP server with `server.NewMCPServer()` + - Initialize Prometheus metrics server + +2. **Tool Registration** (`registerMCP()`): + - Maps provider names to registration functions + - Uses `ListTools()` diff technique to track which tools belong to which provider + - Returns `map[string]string` of tool→provider for metrics + +3. **Handler Wrapping** (`wrapToolHandlersWithMetrics()`): + - Applies middleware pattern to ALL tool handlers + - Increments Prometheus counters (`kagent_tools_mcp_invocations_total`, `kagent_tools_mcp_invocations_failure_total`) + - Uses `SetTools()` to replace handlers - **zero changes to pkg/ files required** + +4. **Server Start**: + - HTTP mode: SSE transport on `--port` + - STDIO mode: Direct stdin/stdout communication + - Metrics server runs concurrently on `--metrics-port` + - Both servers gracefully shutdown on SIGTERM/SIGINT + +### Observability + +**Prometheus Metrics** (`internal/metrics/`): +- `kagent_tools_mcp_server_info` - Server metadata (version, commit, build date) +- `kagent_tools_mcp_registered_tools` - Gauge per tool (tool_name, tool_provider) +- `kagent_tools_mcp_invocations_total` - Counter of all invocations +- `kagent_tools_mcp_invocations_failure_total` - Counter of failures + +**ServiceMonitor** for Prometheus Operator is in `helm/kagent-tools/templates/servicemonitor.yaml`. + +### Read-Only Mode + +When `--read-only` flag is set, write operations are disabled. Tool providers check `readOnly` parameter in `RegisterTools()` and skip registering destructive tools (apply, delete, scale, etc.). 
+ +## Development Commands + +### Build & Run + +```bash +# Build binary +make build +# or manually: +go build -o kagent-tools ./cmd/main.go + +# Run locally +./kagent-tools --port 8084 --metrics-port 9090 + +# Run with specific tools only +./kagent-tools --tools k8s,helm,utils + +# Run in read-only mode +./kagent-tools --read-only + +# Run with custom kubeconfig +./kagent-tools --kubeconfig ~/.kube/my-cluster-config +``` + +### Testing + +```bash +# Run all tests +go test ./... + +# Run specific package tests +go test ./internal/metrics/ +go test ./pkg/k8s/ + +# Run tests with verbose output +go test -v ./... + +# Run E2E tests (requires kind cluster) +cd test/e2e && ginkgo -v +``` + +### Docker & Kubernetes + +```bash +# Build Docker image +make docker-build + +# Build with custom tag +make docker-build TOOLS_IMAGE_TAG=my-test-tag + +# Generate Helm Chart.yaml (required before helm commands) +make helm-version + +# Load image into kind cluster (adjust cluster name) +kind load docker-image ghcr.io/kagent-dev/kagent/tools:TAG --name CLUSTER_NAME + +# Install via Helm +helm upgrade -i kagent-tools ./helm/kagent-tools \ + --namespace kagent \ + --create-namespace \ + --set tools.image.tag=TAG + +# Render Helm templates (verify before install) +helm template kagent-tools ./helm/kagent-tools --namespace kagent +``` + +### Code Quality + +```bash +# Format code +make fmt +# or: +go fmt ./... 
+ +# Run linter (if configured) +golangci-lint run + +# Security scan +make scan +``` + +## Key Implementation Patterns + +### MCP Tool Parameters + +Use `mcp.Parse*` functions to extract typed parameters from `CallToolRequest`: + +```go +func handleMyTool(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) { + requiredParam := mcp.ParseString(request, "param_name", "") + optionalParam := mcp.ParseString(request, "optional", "default_value") + boolParam := mcp.ParseBool(request, "flag", false) + + if requiredParam == "" { + return mcp.NewToolResultError("param_name is required"), nil + } + + // ... execute operation ... + + return mcp.NewToolResultText(output), nil +} +``` + +### Command Execution + +Use `internal/commands` package for shell commands: + +```go +import "github.com/kagent-dev/tools/internal/commands" + +result, err := commands.RunCommand(ctx, "kubectl", "get", "pods") +if err != nil { + return mcp.NewToolResultError(err.Error()), nil +} +return mcp.NewToolResultText(result), nil +``` + +### Cache Invalidation + +Kubernetes operations use `internal/cache` for performance. 
Invalidate after write operations: + +```go +import "github.com/kagent-dev/tools/internal/cache" + +// After kubectl apply/delete/patch/etc: +cache.InvalidateKubernetesCache() +``` + +### Security Validation + +Use `internal/security` for dangerous operation checks: + +```go +import "github.com/kagent-dev/tools/internal/security" + +if err := security.ValidateKubernetesOperation(args); err != nil { + return mcp.NewToolResultError(err.Error()), nil +} +``` + +## Helm Chart Architecture + +The Helm chart (`helm/kagent-tools/`) deploys the tools server to Kubernetes: + +- **Chart.yaml** - Generated from `Chart-template.yaml` via `make helm-version` +- **values.yaml** - Configuration (image, resources, enabled tools, metrics, ServiceMonitor) +- **templates/deployment.yaml** - Main container with `--port` and `--metrics-port` args +- **templates/service.yaml** - Two services: main (`kagent-tools`) and metrics (`kagent-tools-metrics`) +- **templates/servicemonitor.yaml** - Prometheus Operator integration (conditional via `.Values.tools.metrics.servicemonitor.enabled`) + +**Important**: The instance label is typically `kagent`, not `kagent-tools`, due to nameOverride in production values. + +## Version Management + +Version information is injected at build time via `-ldflags`: + +```go +// internal/version/version.go +var ( + Version = "dev" + GitCommit = "none" + BuildDate = "unknown" +) +``` + +Build with version info: +```bash +# Automatic via Makefile (uses git describe) +make docker-build + +# Manual +go build -ldflags "-X github.com/kagent-dev/tools/internal/version.Version=v1.0.0 ..." ./cmd/main.go +``` + +## Adding a New Tool Provider + +1. Create `pkg/newprovider/newprovider.go` +2. Implement tool struct and handlers +3. Export `RegisterTools(s *server.MCPServer, ...)` function +4. Add to `toolProviderMap` in `cmd/main.go:registerMCP()` +5. Add tests in `pkg/newprovider/newprovider_test.go` +6. 
Update `--tools` flag documentation in README.md + +**No metrics code needed** - the handler wrapper automatically instruments all tools. + +## Testing Strategy + +- **Unit tests**: Each `pkg/*/` has `*_test.go` files +- **E2E tests**: `test/e2e/` uses Ginkgo/Gomega, deploys to kind cluster +- **Manual testing**: Build binary, run locally, invoke tools via MCP client + +E2E tests require: +- kind cluster running +- Chart.yaml generated (`make helm-version`) +- Docker image built and loaded into kind + +## Git Workflow + +**Branch naming**: `feature/description`, `fix/description`, `observability/prometheus` + +**Commit signatures**: +``` +Signed-off-by: Name +Co-authored-by: Claude +``` + +**Commit message format** (from git log): +``` +feat(scope): short description + +Longer explanation of what changed and why. + +Signed-off-by: ... +Co-authored-by: ... +``` + +Common scopes: `metrics`, `cli`, `deps`, `helm`, `k8s`, `prometheus` + +## Troubleshooting + +**Port conflicts**: Metrics and main server can share the same port (both serve on 8084 by default, `/metrics` endpoint vs MCP endpoints). + +**Helm Chart.yaml missing**: Run `make helm-version` to generate from template. + +**E2E test failures**: Helm installs fail if Chart.yaml doesn't exist or if custom values override instance names inconsistently. + +**ServiceMonitor not discovered**: Ensure `release: prometheus` label matches your Prometheus Operator's `serviceMonitorSelector`. + +**Metrics not working**: Verify deployment has `--metrics-port` arg and container exposes the port. Check service selector matches pod labels. diff --git a/README.md b/README.md index ae07bae..4b1c518 100644 --- a/README.md +++ b/README.md @@ -188,9 +188,21 @@ go build -o kagent-tools . The server runs using sse transport for MCP communication. 
+#### CLI Flags + +| Flag | Default | Description | +|------|---------|-------------| +| `--port`, `-p` | `8084` | Port to run the MCP server on | +| `--metrics-port`, `-m` | `0` | Port to run the Prometheus metrics server on (`0` means the same port as `--port`) | +| `--stdio` | `false` | Use stdio for communication instead of HTTP | +| `--tools` | `[]` (all) | Comma-separated list of tool providers to register | +| `--read-only` | `false` | Disable tools that perform write operations | +| `--kubeconfig` | `""` | Path to kubeconfig file (defaults to in-cluster config) | +| `--version`, `-v` | `false` | Show version information and exit | + ### Testing ```bash -go test -v +go test -v ./... ``` ## Tool Implementation Details @@ -243,6 +255,25 @@ Tools can be configured through environment variables: - `GRAFANA_URL`: Default Grafana server URL - `GRAFANA_API_KEY`: Default Grafana API key +## Observability + +The MCP server exposes Prometheus metrics on a configurable HTTP endpoint (`/metrics`). By default, the metrics endpoint runs on the same port as the MCP server. To run it on a separate port: + +```bash +./kagent-tools --port 8084 --metrics-port 9090 +``` + +### Exposed Metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `kagent_tools_mcp_server_info` | Gauge | `server_name`, `version`, `git_commit`, `build_date`, `server_mode` | Server metadata (always set to 1) | +| `kagent_tools_mcp_registered_tools` | Gauge | `tool_name`, `tool_provider` | Set to 1 for each registered tool | +| `kagent_tools_mcp_invocations_total` | Counter | `tool_name`, `tool_provider` | Total number of tool invocations | +| `kagent_tools_mcp_invocations_failure_total` | Counter | `tool_name`, `tool_provider` | Total number of failed tool invocations | + +Standard Go runtime and process metrics are also included (goroutines, memory, CPU, file descriptors, etc.). + ## Error Handling and Debugging The tools provide detailed error messages and support verbose output. 
When debugging issues: @@ -258,9 +289,8 @@ Potential areas for future improvement: 1. **Native Client Libraries**: Replace CLI calls with native Go client libraries where possible 2. **Advanced Documentation Search**: Implement full vector search for documentation queries 3. **Caching**: Add caching for frequently accessed data -4. **Metrics and Observability**: Add metrics and tracing for tool usage -5. **Configuration Management**: Enhanced configuration management and validation -6. **Parallel Execution**: Support for parallel execution of related operations +4. **Configuration Management**: Enhanced configuration management and validation +5. **Parallel Execution**: Support for parallel execution of related operations ## Contributing diff --git a/cmd/main.go b/cmd/main.go index 374d7ee..4ab1b32 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -8,6 +8,7 @@ import ( "os" "os/signal" "runtime" + "strconv" "strings" "sync" "syscall" @@ -15,6 +16,7 @@ import ( "github.com/joho/godotenv" "github.com/kagent-dev/tools/internal/logger" + "github.com/kagent-dev/tools/internal/metrics" "github.com/kagent-dev/tools/internal/telemetry" "github.com/kagent-dev/tools/internal/version" "github.com/kagent-dev/tools/pkg/argo" @@ -25,16 +27,19 @@ import ( "github.com/kagent-dev/tools/pkg/kubescape" "github.com/kagent-dev/tools/pkg/prometheus" "github.com/kagent-dev/tools/pkg/utils" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/spf13/cobra" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/codes" + "github.com/mark3labs/mcp-go/mcp" "github.com/mark3labs/mcp-go/server" ) var ( port int + metricsPort int stdio bool tools []string kubeconfig *string @@ -56,6 +61,7 @@ var rootCmd = &cobra.Command{ func init() { rootCmd.Flags().IntVarP(&port, "port", "p", 8084, "Port to run the server on") + rootCmd.Flags().IntVarP(&metricsPort, "metrics-port", "m", 0, "Port to run the metrics server on (default 0: same as --port)") 
rootCmd.Flags().BoolVar(&stdio, "stdio", false, "Use stdio for communication instead of HTTP") rootCmd.Flags().StringSliceVar(&tools, "tools", []string{}, "List of tools to register. If empty, all tools are registered.") rootCmd.Flags().BoolVarP(&showVersion, "version", "v", false, "Show version information and exit") @@ -92,6 +98,11 @@ func run(cmd *cobra.Command, args []string) { return } + // 0 means "same as --port" - resolve it before any server logic uses it + if metricsPort == 0 { + metricsPort = port + } + logger.Init(stdio) defer logger.Sync() @@ -134,8 +145,11 @@ func run(cmd *cobra.Command, args []string) { Version, ) - // Register tools - registerMCP(mcp, tools, *kubeconfig, readOnly) + // Register tools and wrap handlers with metrics instrumentation. + // registerMCP returns a map of tool_name -> tool_provider so that + // wrapToolHandlersWithMetrics knows which provider each tool belongs to. + toolProviders := registerMCP(mcp, tools, *kubeconfig, readOnly) + wrapToolHandlersWithMetrics(mcp, toolProviders) // Create wait group for server goroutines var wg sync.WaitGroup @@ -146,6 +160,7 @@ func run(cmd *cobra.Command, args []string) { // HTTP server reference (only used when not in stdio mode) var httpServer *http.Server + var metricsServer *http.Server // Separate server for metrics if metricsPort is different from main port // Start server based on chosen mode wg.Add(1) @@ -170,17 +185,40 @@ func run(cmd *cobra.Command, args []string) { } }) - // Add metrics endpoint (basic implementation for e2e tests) - mux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "text/plain") - w.WriteHeader(http.StatusOK) - - // Generate real runtime metrics instead of hardcoded values - metrics := generateRuntimeMetrics() - if err := writeResponse(w, []byte(metrics)); err != nil { - logger.Get().Error("Failed to write metrics response", "error", err) + // Add metrics endpoint + registry := metrics.InitServer() // 
Initialize Prometheus metrics before starting the server + + if metricsPort != port { // Only start a separate metrics server if the metrics port is different from the main server port + // Create the metrics server outside the goroutine to avoid a race condition + // between the goroutine assigning metricsServer and the shutdown handler reading it + metricsMux := http.NewServeMux() + metricsMux.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{})) + metricsServer = &http.Server{ + Addr: fmt.Sprintf(":%d", metricsPort), + Handler: metricsMux, } - }) + + wg.Add(1) + go func() { + defer wg.Done() + logger.Get().Info("Starting Prometheus metrics endpoint on /metrics", "port", strconv.Itoa(metricsPort)) + if err := metricsServer.ListenAndServe(); err != nil { + if !errors.Is(err, http.ErrServerClosed) { + logger.Get().Error("Metrics endpoint failed", "error", err) + } else { + logger.Get().Info("Metrics server closed gracefully.") + } + } + }() + } else { + logger.Get().Info("Starting Prometheus metrics endpoint on /metrics", "port", strconv.Itoa(port)) + mux.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{})) + } + serverMode := "read-write" + if readOnly { + serverMode = "read-only" + } + metrics.KagentToolsMCPServerInfo.WithLabelValues(Name, Version, GitCommit, BuildDate, serverMode).Set(1) // Handle all other routes with the MCP server wrapped in telemetry middleware mux.Handle("/", telemetry.HTTPMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -229,6 +267,19 @@ func run(cmd *cobra.Command, args []string) { rootSpan.AddEvent("server.shutdown.completed") } } + + // Gracefully shutdown metrics server if running separately + if !stdio && metricsServer != nil { + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer shutdownCancel() + + if err := metricsServer.Shutdown(shutdownCtx); err != nil { + logger.Get().Error("Failed to shutdown metrics server 
gracefully", "error", err) + rootSpan.RecordError(err) + } else { + logger.Get().Info("Metrics server shutdown completed") + } + } }() // Wait for all server operations to complete @@ -242,47 +293,6 @@ func writeResponse(w http.ResponseWriter, data []byte) error { return err } -// generateRuntimeMetrics generates real runtime metrics for the /metrics endpoint -func generateRuntimeMetrics() string { - var m runtime.MemStats - runtime.ReadMemStats(&m) - - now := time.Now().Unix() - - // Build metrics in Prometheus format - metrics := strings.Builder{} - - // Go runtime info - metrics.WriteString("# HELP go_info Information about the Go environment.\n") - metrics.WriteString("# TYPE go_info gauge\n") - metrics.WriteString(fmt.Sprintf("go_info{version=\"%s\"} 1\n", runtime.Version())) - - // Process start time - metrics.WriteString("# HELP process_start_time_seconds Start time of the process since unix epoch in seconds.\n") - metrics.WriteString("# TYPE process_start_time_seconds gauge\n") - metrics.WriteString(fmt.Sprintf("process_start_time_seconds %d\n", now)) - - // Memory metrics - metrics.WriteString("# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.\n") - metrics.WriteString("# TYPE go_memstats_alloc_bytes gauge\n") - metrics.WriteString(fmt.Sprintf("go_memstats_alloc_bytes %d\n", m.Alloc)) - - metrics.WriteString("# HELP go_memstats_total_alloc_bytes Total number of bytes allocated, even if freed.\n") - metrics.WriteString("# TYPE go_memstats_total_alloc_bytes counter\n") - metrics.WriteString(fmt.Sprintf("go_memstats_total_alloc_bytes %d\n", m.TotalAlloc)) - - metrics.WriteString("# HELP go_memstats_sys_bytes Number of bytes obtained from system.\n") - metrics.WriteString("# TYPE go_memstats_sys_bytes gauge\n") - metrics.WriteString(fmt.Sprintf("go_memstats_sys_bytes %d\n", m.Sys)) - - // Goroutine count - metrics.WriteString("# HELP go_goroutines Number of goroutines that currently exist.\n") - metrics.WriteString("# TYPE 
go_goroutines gauge\n") - metrics.WriteString(fmt.Sprintf("go_goroutines %d\n", runtime.NumGoroutine())) - - return metrics.String() -} - func runStdioServer(ctx context.Context, mcp *server.MCPServer) { logger.Get().Info("Running KAgent Tools Server STDIO:", "tools", strings.Join(tools, ",")) stdioServer := server.NewStdioServer(mcp) @@ -291,7 +301,11 @@ func runStdioServer(ctx context.Context, mcp *server.MCPServer) { } } -func registerMCP(mcp *server.MCPServer, enabledToolProviders []string, kubeconfig string, readOnly bool) { +// registerMCP registers tool providers with the MCP server and returns a mapping +// of tool_name -> tool_provider. This mapping is built using the ListTools() diff +// technique: we snapshot the tool list before and after each provider registers, +// so we know exactly which tools belong to which provider. +func registerMCP(mcp *server.MCPServer, enabledToolProviders []string, kubeconfig string, readOnly bool) map[string]string { // A map to hold tool providers and their registration functions toolProviderMap := map[string]func(*server.MCPServer){ "argo": func(s *server.MCPServer) { argo.RegisterTools(s, readOnly) }, @@ -310,11 +324,78 @@ func registerMCP(mcp *server.MCPServer, enabledToolProviders []string, kubeconfi enabledToolProviders = append(enabledToolProviders, name) } } + + // toolToProvider maps each tool name to its provider (e.g., "kubectl_get" -> "k8s"). + // This is used later by wrapToolHandlersWithMetrics to set the correct tool_provider label. + toolToProvider := make(map[string]string) + for _, toolProviderName := range enabledToolProviders { if registerFunc, ok := toolProviderMap[toolProviderName]; ok { + // Snapshot the tool list before this provider registers its tools. + // We need this because ListTools() returns ALL tools from ALL providers, + // so the only way to know which tools belong to THIS provider is to compare + // the list before and after registration. 
+ toolsBefore := mcp.ListTools() + registerFunc(mcp) + + // Determine which tools were just registered by this provider + // by finding tools that exist now but didn't exist before. + // Record each one in Prometheus so we can observe the full tool inventory. + for toolName := range mcp.ListTools() { + if _, existed := toolsBefore[toolName]; !existed { + metrics.KagentToolsMCPRegisteredTools.WithLabelValues(toolName, toolProviderName).Set(1) + toolToProvider[toolName] = toolProviderName + } + } } else { logger.Get().Error("Unknown tool specified", "provider", toolProviderName) } } + + return toolToProvider +} + +// wrapToolHandlersWithMetrics applies the wrapper/middleware pattern to instrument +// all registered MCP tool handlers with Prometheus invocation counters. +// +// How it works: +// 1. Grab all registered tools from the MCP server using ListTools() +// 2. For each tool, wrap its handler with a function that increments metrics +// 3. Replace all tools in the MCP server using SetTools() +// +// The wrapper function: +// - Increments kagent_tools_mcp_invocations_total on every call +// - Increments kagent_tools_mcp_invocations_failure_total only when the handler returns an error +// - Calls the original handler unchanged - the tool's behaviour is not affected +// +// This uses the standard middleware/decorator pattern: the original handler and the +// wrapped handler have the same function signature, so they are interchangeable. +// No changes are required in any pkg/ file - all instrumentation happens centrally here. 
+func wrapToolHandlersWithMetrics(mcpServer *server.MCPServer, toolToProvider map[string]string) { + allTools := mcpServer.ListTools() + wrapped := make([]server.ServerTool, 0, len(allTools)) + + for name, st := range allTools { + originalHandler := st.Handler + toolName := name // capture for closure + provider := toolToProvider[toolName] + + wrapped = append(wrapped, server.ServerTool{ + Tool: st.Tool, + Handler: func(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { + metrics.KagentToolsMCPInvocationsTotal.WithLabelValues(toolName, provider).Inc() + + result, err := originalHandler(ctx, req) + + if err != nil || (result != nil && result.IsError) { // handlers usually report failures as mcp.NewToolResultError(...) with a nil error, so count IsError results as failures too + metrics.KagentToolsMCPInvocationsFailureTotal.WithLabelValues(toolName, provider).Inc() + } + + return result, err + }, + }) + } + + mcpServer.SetTools(wrapped...) } diff --git a/dashboard/grafana-dash-example.png b/dashboard/grafana-dash-example.png new file mode 100644 index 0000000..6ffe311 Binary files /dev/null and b/dashboard/grafana-dash-example.png differ diff --git a/dashboard/grafana-dashboard.json b/dashboard/grafana-dashboard.json new file mode 100644 index 0000000..801a052 --- /dev/null +++ b/dashboard/grafana-dashboard.json @@ -0,0 +1,819 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 29, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": 
"background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^version$/", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "kagent_tools_mcp_server_info", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Server Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": 0 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "count(kagent_tools_mcp_registered_tools)", + "instant": true, + "legendFormat": "Registered Tools", + "range": false, + "refId": "A" + } + ], + "title": "Total Registered Tools", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + 
"value": 0 + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 100 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(increase(kagent_tools_mcp_invocations_total[5m]))", + "instant": true, + "legendFormat": "Total Invocations (5m)", + "range": false, + "refId": "A" + } + ], + "title": "Invocations (Last 5m)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 95 + }, + { + "color": "green", + "value": 99 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "100 - (sum(rate(kagent_tools_mcp_invocations_failure_total[5m])) / sum(rate(kagent_tools_mcp_invocations_total[5m])) * 100)", + "instant": true, + 
"legendFormat": "Success Rate", + "range": false, + "refId": "A" + } + ], + "title": "Success Rate", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(kagent_tools_mcp_invocations_total[$__rate_interval])) by (tool_provider)", + "legendFormat": "{{tool_provider}}", + "range": true, + "refId": "A" + } + ], + "title": "Invocation Rate by Provider", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(kagent_tools_mcp_invocations_total[$__rate_interval]))", + "legendFormat": "Total", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(kagent_tools_mcp_invocations_failure_total[$__rate_interval]))", + "hide": false, + "legendFormat": "Failures", + "range": true, + "refId": "B" + } + ], + "title": "Total Invocations vs Failures", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + 
"color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 7, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value" + ] + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "sort": "desc", + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by(tool_provider) (kagent_tools_mcp_registered_tools)", + "legendFormat": "{{tool_provider}}", + "range": true, + "refId": "A" + } + ], + "title": "Tools by Provider", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "footer": { + "reducers": [] + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Invocations" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "color-background" + } + }, + { + "id": "color", + "value": { + "mode": "continuous-GrYlRd" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "color-background" + } + }, + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "thresholds" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + 
"steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 10 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 16, + "x": 8, + "y": 12 + }, + "id": 8, + "options": { + "cellHeight": "sm", + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Invocations" + } + ] + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(tool_name, tool_provider) (kagent_tools_mcp_invocations_total)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(tool_name, tool_provider) (kagent_tools_mcp_invocations_failure_total)", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "B" + } + ], + "title": "Top Invoked Tools", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "tool_name" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "tool_provider 2": true + }, + "includeByName": {}, + "indexByName": { + "Time 1": 4, + "Time 2": 5, + "Value #A": 2, + "Value #B": 3, + "tool_name": 0, + "tool_provider 1": 1, + "tool_provider 2": 6 + }, + "renameByName": { + "Value #A": "Invocations", + "Value #B": "Failures", + "tool_name": "Tool Name", + "tool_provider 1": "Provider" + } + } + } + ], + "type": "table" + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 42, + "tags": [ + "kagent", + "mcp", + "tools" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "prometheus" + }, + "includeAll": false, + "label": "Datasource", + "name": "datasource", + "options": 
[], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "kAgent Tools - MCP Observability", + "uid": "kagent-tools-mcp", + "version": 1 +} \ No newline at end of file diff --git a/go.mod b/go.mod index a2f27fc..8461ba7 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,8 @@ require ( github.com/mark3labs/mcp-go v0.43.2 github.com/onsi/ginkgo/v2 v2.27.2 github.com/onsi/gomega v1.38.2 + github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/client_model v0.6.2 github.com/spf13/cobra v1.10.2 github.com/stretchr/testify v1.11.1 github.com/tmc/langchaingo v0.1.14 @@ -129,8 +131,6 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/pkoukk/tiktoken-go v0.1.8 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.23.2 // indirect - github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/procfs v0.19.2 // indirect github.com/sagikazarmark/locafero v0.12.0 // indirect diff --git a/helm/kagent-tools/templates/deployment.yaml b/helm/kagent-tools/templates/deployment.yaml index 001caef..9787694 100644 --- a/helm/kagent-tools/templates/deployment.yaml +++ b/helm/kagent-tools/templates/deployment.yaml @@ -59,6 +59,8 @@ spec: args: - "--port" - "{{ .Values.service.ports.tools.targetPort }}" + - "--metrics-port" + - "{{ .Values.tools.metrics.port | default .Values.service.ports.tools.targetPort }}" {{- if .Values.tools.enabledTools }} - "--tools={{ join "," .Values.tools.enabledTools }}" {{- end }} @@ -98,6 +100,9 @@ spec: - name: http-tools containerPort: {{ .Values.service.ports.tools.targetPort }} protocol: TCP + - name: http-metrics + containerPort: {{ .Values.tools.metrics.port | default .Values.service.ports.tools.targetPort }} + protocol: TCP readinessProbe: tcpSocket: 
port: http-tools diff --git a/helm/kagent-tools/templates/service.yaml b/helm/kagent-tools/templates/service.yaml index 55c7fd2..f578670 100644 --- a/helm/kagent-tools/templates/service.yaml +++ b/helm/kagent-tools/templates/service.yaml @@ -19,3 +19,22 @@ spec: name: tools selector: {{- include "kagent.selectorLabels" . | nindent 4 }} + +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ include "kagent.fullname" . }}-metrics + namespace: {{ include "kagent.namespace" . }} + labels: + {{- include "kagent.labels" . | nindent 4 }} + app.kubernetes.io/component: metrics +spec: + selector: + {{- include "kagent.selectorLabels" . | nindent 4 }} + ports: + - name: prometheus-metrics + protocol: TCP + port: {{ .Values.tools.metrics.port | default .Values.service.ports.tools.targetPort }} + targetPort: {{ .Values.tools.metrics.port | default .Values.service.ports.tools.targetPort }} + \ No newline at end of file diff --git a/helm/kagent-tools/templates/servicemonitor.yaml b/helm/kagent-tools/templates/servicemonitor.yaml new file mode 100644 index 0000000..ded05cd --- /dev/null +++ b/helm/kagent-tools/templates/servicemonitor.yaml @@ -0,0 +1,23 @@ + +{{- if .Values.tools.metrics.servicemonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "kagent.fullname" . }} + namespace: {{ include "kagent.namespace" . }} + labels: + {{- toYaml .Values.tools.metrics.servicemonitor.labels | nindent 4 }} +spec: + selector: + matchLabels: + {{- include "kagent.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: metrics + namespaceSelector: + matchNames: + - {{ include "kagent.namespace" . 
}} + endpoints: + - port: prometheus-metrics + interval: {{ .Values.tools.metrics.servicemonitor.interval | default "30s" }} + scrapeTimeout: {{ .Values.tools.metrics.servicemonitor.scrapeTimeout | default "10s" }} + path: {{ .Values.tools.metrics.servicemonitor.path | default "/metrics" }} +{{- end }} diff --git a/helm/kagent-tools/values.yaml b/helm/kagent-tools/values.yaml index 556f56e..dd9ef09 100644 --- a/helm/kagent-tools/values.yaml +++ b/helm/kagent-tools/values.yaml @@ -5,6 +5,15 @@ global: tag: "" tools: + metrics: + # port defaults to the main --port value (same server). Set explicitly for a dedicated metrics port. + port: "" + servicemonitor: + enabled: false + interval: 30s + scrapeTimeout: 10s + labels: + release: prometheus loglevel: "debug" # List of tool providers to enable. Empty list means all tools are enabled. # Available: k8s, helm, istio, cilium, argo, prometheus, kubescape, utils diff --git a/internal/metrics/monitoring_server.go b/internal/metrics/monitoring_server.go new file mode 100644 index 0000000..275a01f --- /dev/null +++ b/internal/metrics/monitoring_server.go @@ -0,0 +1,69 @@ +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" +) + +// kAgent Tools MCP Server metrics definition +var ( + KagentToolsMCPServerInfo = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "kagent_tools_mcp_server_info", + Help: "Information about the MCP server including version and build details", + }, + []string{ + "server_name", + "version", + "git_commit", + "build_date", + "server_mode", // e.g., "read-only" or "read-write" + }, + ) + + KagentToolsMCPRegisteredTools = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "kagent_tools_mcp_registered_tools", + Help: "Set to 1 for each registered MCP tool provider", + }, + []string{ + "tool_name", + "tool_provider", + }, + ) + + KagentToolsMCPInvocationsTotal = prometheus.NewCounterVec( + 
prometheus.CounterOpts{ + Name: "kagent_tools_mcp_invocations_total", + Help: "Total number of MCP tool invocations", + }, + []string{"tool_name", "tool_provider"}, + ) + + KagentToolsMCPInvocationsFailureTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "kagent_tools_mcp_invocations_failure_total", + Help: "Total number of failed MCP tool invocations", + }, + []string{"tool_name", "tool_provider"}, + ) +) + +func InitServer() *prometheus.Registry { + // New registry for our custom metrics, separate from the default registry + registry := prometheus.NewRegistry() + + // Add Go runtime metrics ( goroutines, GC stats, etc. ) + registry.MustRegister(collectors.NewGoCollector()) + + // Add process metrics (CPU, memory, file descriptors, etc. ) + registry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})) + + // Register kAgent Tools MCP Server metrics + registry.MustRegister(KagentToolsMCPServerInfo) + registry.MustRegister(KagentToolsMCPRegisteredTools) + registry.MustRegister(KagentToolsMCPInvocationsTotal) + registry.MustRegister(KagentToolsMCPInvocationsFailureTotal) + + return registry +} diff --git a/internal/metrics/monitoring_server_test.go b/internal/metrics/monitoring_server_test.go new file mode 100644 index 0000000..495c3e1 --- /dev/null +++ b/internal/metrics/monitoring_server_test.go @@ -0,0 +1,268 @@ +package metrics + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" +) + +func TestInitServer_ReturnsRegistry(t *testing.T) { + registry := InitServer() + if registry == nil { + t.Fatal("InitServer() returned nil registry") + } +} + +func TestInitServer_GathersMetrics(t *testing.T) { + registry := InitServer() + + families, err := registry.Gather() + if err != nil { + t.Fatalf("Failed to gather metrics: %v", err) + } + + if len(families) == 0 { + t.Fatal("Expected at least one metric family from Go/process collectors, got none") + } +} + 
+func TestInitServer_RegistersCustomMetrics(t *testing.T) { + registry := InitServer() + + families, err := registry.Gather() + if err != nil { + t.Fatalf("Failed to gather metrics: %v", err) + } + + // Build a set of metric names for easy lookup + metricNames := make(map[string]bool) + for _, family := range families { + metricNames[family.GetName()] = true + } + + // Go and process collectors should be present + goMetrics := []string{ + "go_goroutines", + "go_memstats_alloc_bytes", + } + for _, name := range goMetrics { + if !metricNames[name] { + t.Errorf("Expected Go collector metric %q to be registered", name) + } + } +} + +func TestKagentToolsMCPServerInfo_SetAndGather(t *testing.T) { + registry := InitServer() + + // Set the server info metric + KagentToolsMCPServerInfo.WithLabelValues( + "test-server", + "v0.0.1", + "abc123", + "2026-02-12", + "read-write", + ).Set(1) + + families, err := registry.Gather() + if err != nil { + t.Fatalf("Failed to gather metrics: %v", err) + } + + found := findMetricFamily(families, "kagent_tools_mcp_server_info") + if found == nil { + t.Fatal("Expected kagent_tools_mcp_server_info metric to be present") + } + + metrics := found.GetMetric() + if len(metrics) != 1 { + t.Fatalf("Expected 1 time series, got %d", len(metrics)) + } + + // Verify label values + expectedLabels := map[string]string{ + "server_name": "test-server", + "version": "v0.0.1", + "git_commit": "abc123", + "build_date": "2026-02-12", + "server_mode": "read-write", + } + + for _, label := range metrics[0].GetLabel() { + expected, ok := expectedLabels[label.GetName()] + if !ok { + t.Errorf("Unexpected label %q", label.GetName()) + continue + } + if label.GetValue() != expected { + t.Errorf("Label %q: expected %q, got %q", label.GetName(), expected, label.GetValue()) + } + } + + // Verify gauge value is 1 + if metrics[0].GetGauge().GetValue() != 1 { + t.Errorf("Expected gauge value 1, got %f", metrics[0].GetGauge().GetValue()) + } +} + +func 
TestKagentToolsMCPRegisteredTools_SetAndGather(t *testing.T) { + registry := InitServer() + + // Register a couple of tool providers + KagentToolsMCPRegisteredTools.WithLabelValues("kubectl_get", "k8s").Set(1) + KagentToolsMCPRegisteredTools.WithLabelValues("helm_list", "helm").Set(1) + + families, err := registry.Gather() + if err != nil { + t.Fatalf("Failed to gather metrics: %v", err) + } + + found := findMetricFamily(families, "kagent_tools_mcp_registered_tools") + if found == nil { + t.Fatal("Expected kagent_tools_mcp_registered_tools metric to be present") + } + + metrics := found.GetMetric() + if len(metrics) != 2 { + t.Fatalf("Expected 2 time series (one per tool), got %d", len(metrics)) + } +} + +func TestKagentToolsMCPInvocationsTotal_IncAndGather(t *testing.T) { + registry := InitServer() + + // Simulate a few tool invocations + KagentToolsMCPInvocationsTotal.WithLabelValues("kubectl_get", "k8s").Inc() + KagentToolsMCPInvocationsTotal.WithLabelValues("kubectl_get", "k8s").Inc() + KagentToolsMCPInvocationsTotal.WithLabelValues("helm_list", "helm").Inc() + + families, err := registry.Gather() + if err != nil { + t.Fatalf("Failed to gather metrics: %v", err) + } + + found := findMetricFamily(families, "kagent_tools_mcp_invocations_total") + if found == nil { + t.Fatal("Expected kagent_tools_mcp_invocations_total metric to be present") + } + + metrics := found.GetMetric() + if len(metrics) != 2 { + t.Fatalf("Expected 2 time series (one per tool), got %d", len(metrics)) + } + + // Find the kubectl_get series and verify its counter value is 2 + for _, m := range metrics { + for _, label := range m.GetLabel() { + if label.GetName() == "tool_name" && label.GetValue() == "kubectl_get" { + if m.GetCounter().GetValue() != 2 { + t.Errorf("Expected kubectl_get counter to be 2, got %f", m.GetCounter().GetValue()) + } + } + } + } +} + +func TestKagentToolsMCPInvocationsFailureTotal_IncAndGather(t *testing.T) { + registry := InitServer() + + // Simulate a tool failure + 
KagentToolsMCPInvocationsFailureTotal.WithLabelValues("helm_install", "helm").Inc() + + families, err := registry.Gather() + if err != nil { + t.Fatalf("Failed to gather metrics: %v", err) + } + + found := findMetricFamily(families, "kagent_tools_mcp_invocations_failure_total") + if found == nil { + t.Fatal("Expected kagent_tools_mcp_invocations_failure_total metric to be present") + } + + metrics := found.GetMetric() + if len(metrics) != 1 { + t.Fatalf("Expected 1 time series, got %d", len(metrics)) + } + + if metrics[0].GetCounter().GetValue() != 1 { + t.Errorf("Expected failure counter to be 1, got %f", metrics[0].GetCounter().GetValue()) + } + + // Verify labels + expectedLabels := map[string]string{ + "tool_name": "helm_install", + "tool_provider": "helm", + } + for _, label := range metrics[0].GetLabel() { + expected, ok := expectedLabels[label.GetName()] + if !ok { + t.Errorf("Unexpected label %q", label.GetName()) + continue + } + if label.GetValue() != expected { + t.Errorf("Label %q: expected %q, got %q", label.GetName(), expected, label.GetValue()) + } + } +} + +// findMetricFamily finds a metric family by name from a gathered slice +func findMetricFamily(families []*dto.MetricFamily, name string) *dto.MetricFamily { + for _, family := range families { + if family.GetName() == name { + return family + } + } + return nil +} + +// resetMetrics replaces the global metric vectors with fresh ones; it is called once from TestMain, so it does not isolate individual tests from each other +func resetMetrics() { + KagentToolsMCPServerInfo = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "kagent_tools_mcp_server_info", + Help: "Information about the MCP server including version and build details", + }, + []string{ + "server_name", + "version", + "git_commit", + "build_date", + "server_mode", + }, + ) + + KagentToolsMCPRegisteredTools = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "kagent_tools_mcp_registered_tools", + Help: "Set to 1 for each registered MCP tool provider", + }, + []string{ + "tool_name", +
"tool_provider", + }, + ) + + KagentToolsMCPInvocationsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "kagent_tools_mcp_invocations_total", + Help: "Total number of MCP tool invocations", + }, + []string{"tool_name", "tool_provider"}, + ) + + KagentToolsMCPInvocationsFailureTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "kagent_tools_mcp_invocations_failure_total", + Help: "Total number of failed MCP tool invocations", + }, + []string{"tool_name", "tool_provider"}, + ) +} + +func TestMain(m *testing.M) { + // Reset metrics once before the test suite runs (not between individual tests) to avoid "duplicate registration" panics + // since InitServer() registers the package-level vars into a new registry each time + resetMetrics() + m.Run() +}