Files
kit/internal/builtin/fetch.go
T
Ed Zynda 0703dd1602 fix: eliminate escape sequence leak from spinner tea.Program instances
Each spinner created a new tea.NewProgram which sent DECRQM queries for
synchronized output mode 2026. When the program exited and restored
cooked terminal mode, the terminal's DECRPM response leaked as visible
^[[?2026;2$y characters. Replace Bubble Tea spinner with a simple
goroutine animation loop writing directly to stderr via lipgloss.
2026-02-25 18:17:25 +03:00

268 lines
7.9 KiB
Go

package builtin
import (
"context"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
"github.com/JohannesKaufmann/html-to-markdown"
"github.com/PuerkitoBio/goquery"
"github.com/mark3labs/mcp-go/mcp"
"github.com/mark3labs/mcp-go/server"
)
// Limits applied by executeFetch to every fetch request.
const (
// maxResponseSize is the largest response body, in bytes, that is accepted;
// larger responses are rejected with a tool error.
maxResponseSize = 5 * 1024 * 1024 // 5MB
// defaultFetchTimeout is the HTTP client timeout used when the caller does
// not supply a positive "timeout" argument.
defaultFetchTimeout = 30 * time.Second
// maxFetchTimeout is the upper bound a caller-supplied timeout is capped to.
maxFetchTimeout = 120 * time.Second
)
// NewFetchServer builds the "fetch-server" MCP server and registers its only
// tool, "fetch", which downloads a URL and returns the content as text,
// markdown, or HTML. Tool calls are handled by executeFetch.
//
// The error result is part of the constructor signature; this implementation
// always returns a nil error.
func NewFetchServer() (*server.MCPServer, error) {
	// Describe each tool argument separately so the schema reads top-down.
	urlArg := mcp.WithString("url",
		mcp.Required(),
		mcp.Description("The URL to fetch content from"),
	)
	formatArg := mcp.WithString("format",
		mcp.Required(),
		mcp.Enum("text", "markdown", "html"),
		mcp.Description("The format to return the content in (text, markdown, or html)"),
	)
	timeoutArg := mcp.WithNumber("timeout",
		mcp.Description("Optional timeout in seconds (max 120)"),
		mcp.Min(0),
		mcp.Max(120),
	)

	tool := mcp.NewTool("fetch",
		mcp.WithDescription(fetchDescription),
		urlArg,
		formatArg,
		timeoutArg,
	)

	srv := server.NewMCPServer("fetch-server", "1.0.0", server.WithToolCapabilities(true))
	srv.AddTool(tool, executeFetch)
	return srv, nil
}
// executeFetch handles a "fetch" tool call: it validates the arguments,
// downloads the URL with a bounded timeout and body size, and returns the
// content as plain text, markdown, or raw HTML.
//
// Arguments read from request:
//   - "url" (required string): the address to fetch; "https://" is prepended
//     when no scheme is present, and http is upgraded to https for non-local hosts.
//   - "format" (required string): one of "text", "markdown", "html".
//   - "timeout" (optional number): seconds, capped at maxFetchTimeout;
//     defaultFetchTimeout is used when absent or not positive.
//
// Every failure is reported as a tool error result; the returned Go error is
// always nil.
func executeFetch(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
	// Extract parameters
	urlStr, err := request.RequireString("url")
	if err != nil {
		return mcp.NewToolResultError("url parameter is required and must be a string"), nil
	}
	format, err := request.RequireString("format")
	if err != nil {
		return mcp.NewToolResultError("format parameter is required and must be a string"), nil
	}

	// Validate format
	if format != "text" && format != "markdown" && format != "html" {
		return mcp.NewToolResultError("format must be 'text', 'markdown', or 'html'"), nil
	}

	// Parse timeout (optional)
	timeout := defaultFetchTimeout
	if timeoutSec := request.GetFloat("timeout", 0); timeoutSec > 0 {
		// Clamp while still a float so huge values cannot overflow the
		// duration conversion, then convert with sub-second precision.
		// Truncating to whole seconds first would turn e.g. 0.5 into 0,
		// and http.Client treats a zero Timeout as "no timeout".
		timeoutSec = min(timeoutSec, maxFetchTimeout.Seconds())
		if d := time.Duration(timeoutSec * float64(time.Second)); d > 0 {
			timeout = d
		}
	}

	// Validate URL
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return mcp.NewToolResultError(fmt.Sprintf("invalid URL: %v", err)), nil
	}

	// Ensure URL has a scheme
	if parsedURL.Scheme == "" {
		urlStr = "https://" + urlStr
		parsedURL, err = url.Parse(urlStr)
		if err != nil {
			return mcp.NewToolResultError(fmt.Sprintf("invalid URL after adding https: %v", err)), nil
		}
	}

	// Only allow HTTP and HTTPS
	if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
		return mcp.NewToolResultError("URL must use http:// or https://"), nil
	}

	// Upgrade HTTP to HTTPS only for external URLs (not localhost/127.0.0.1)
	if parsedURL.Scheme == "http" && !isLocalhost(parsedURL.Host) {
		parsedURL.Scheme = "https"
		urlStr = parsedURL.String()
	}

	// Create HTTP client with timeout
	client := &http.Client{
		Timeout: timeout,
	}

	// Create request with context so caller cancellation aborts the fetch
	req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
	if err != nil {
		return mcp.NewToolResultError(fmt.Sprintf("failed to create request: %v", err)), nil
	}

	// Set headers to mimic a real browser
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8")
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")

	// Make the request
	resp, err := client.Do(req)
	if err != nil {
		return mcp.NewToolResultError(fmt.Sprintf("request failed: %v", err)), nil
	}
	defer resp.Body.Close()

	// Check status code
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return mcp.NewToolResultError(fmt.Sprintf("request failed with status code: %d", resp.StatusCode)), nil
	}

	// Reject early when the server declares an oversized body
	if resp.ContentLength > maxResponseSize {
		return mcp.NewToolResultError("response too large (exceeds 5MB limit)"), nil
	}

	// Read one byte past the limit so an oversized body (with a missing or
	// wrong Content-Length) can be detected without reading all of it.
	limitedReader := io.LimitReader(resp.Body, maxResponseSize+1)
	bodyBytes, err := io.ReadAll(limitedReader)
	if err != nil {
		return mcp.NewToolResultError(fmt.Sprintf("failed to read response: %v", err)), nil
	}

	// Check if we exceeded the size limit
	if len(bodyBytes) > maxResponseSize {
		return mcp.NewToolResultError("response too large (exceeds 5MB limit)"), nil
	}

	content := string(bodyBytes)
	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		contentType = "unknown"
	}

	// Media types are case-insensitive, so normalize before checking for HTML.
	isHTML := strings.Contains(strings.ToLower(contentType), "text/html")

	// Process content based on format
	var output string
	switch format {
	case "text":
		if isHTML {
			output, err = extractTextFromHTML(content)
			if err != nil {
				return mcp.NewToolResultError(fmt.Sprintf("failed to extract text from HTML: %v", err)), nil
			}
		} else {
			output = content
		}
	case "markdown":
		if isHTML {
			output, err = convertHTMLToMarkdown(content)
			if err != nil {
				return mcp.NewToolResultError(fmt.Sprintf("failed to convert HTML to markdown: %v", err)), nil
			}
		} else {
			// Non-HTML content is wrapped in a code fence.
			output = "```\n" + content + "\n```"
		}
	case "html":
		output = content
	default:
		output = content
	}

	// Create result with metadata
	title := fmt.Sprintf("%s (%s)", urlStr, contentType)
	result := mcp.NewToolResultText(output)
	result.Meta = &mcp.Meta{
		AdditionalFields: map[string]any{
			"title": title,
		},
	}
	return result, nil
}
// extractTextFromHTML parses htmlContent, drops non-content elements
// (script, style, noscript, iframe, object, embed), and returns the remaining
// document text with each line trimmed and blank lines removed.
// It returns an error only when the HTML cannot be parsed into a document.
func extractTextFromHTML(htmlContent string) (string, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
	if err != nil {
		return "", err
	}

	// Strip elements whose text is not meant for the reader.
	doc.Find("script, style, noscript, iframe, object, embed").Remove()

	// Collapse whitespace: keep only non-empty trimmed lines, newline-joined.
	var sb strings.Builder
	for _, raw := range strings.Split(doc.Text(), "\n") {
		line := strings.TrimSpace(raw)
		if line == "" {
			continue
		}
		if sb.Len() > 0 {
			sb.WriteString("\n")
		}
		sb.WriteString(line)
	}
	return sb.String(), nil
}
// convertHTMLToMarkdown renders htmlContent as markdown using the md
// converter, after telling it to drop script, style, meta, and link elements.
// On conversion failure it returns an empty string and the converter's error.
func convertHTMLToMarkdown(htmlContent string) (string, error) {
	conv := md.NewConverter("", true, nil)

	// Elements that carry no readable content.
	for _, tag := range []string{"script", "style", "meta", "link"} {
		conv.Remove(tag)
	}

	out, err := conv.ConvertString(htmlContent)
	if err != nil {
		return "", err
	}
	return out, nil
}
// isLocalhost reports whether host — a URL authority such as "localhost",
// "127.0.0.1:8080", or "[::1]:3000" — names the local machine.
//
// The port and any IPv6 brackets are stripped before comparing, and the
// comparison is exact: "localhost.evil.com" or "127.0.0.10" are not treated as
// local. Names are matched case-insensitively, a trailing dot is ignored, and
// subdomains of "localhost" (e.g. "api.localhost") count as local.
func isLocalhost(host string) bool {
	// url.URL.Hostname removes an optional ":port" and the square brackets
	// around IPv6 literals, which is the form url.Parse stores in URL.Host.
	name := (&url.URL{Host: host}).Hostname()
	name = strings.ToLower(strings.TrimSuffix(name, "."))

	switch {
	case name == "localhost", strings.HasSuffix(name, ".localhost"):
		return true
	case name == "127.0.0.1", name == "::1":
		return true
	}

	// A bare, unbracketed "::1" is ambiguous to Hostname (the trailing ":1"
	// looks like a port), so accept that exact spelling directly.
	return host == "::1"
}
// fetchDescription is the description text registered for the "fetch" tool in
// NewFetchServer. It tells the caller what the tool does, which formats it
// supports, and which limits apply.
// NOTE(review): the "Results may be summarized" line does not match
// executeFetch, which rejects responses over 5MB with an error instead of
// summarizing them — confirm and reword.
const fetchDescription = `Fetches content from a specified URL and returns it in the requested format.
- Fetches content from a specified URL
- Takes a URL and format as input
- Fetches the URL content, converts HTML to markdown or text as requested
- Returns the content in the specified format
- Use this tool when you need to retrieve and analyze web content
Usage notes:
- IMPORTANT: If an MCP-provided web fetch tool is available, prefer using that tool instead of this one, as it may have fewer restrictions. All MCP-provided tools start with "mcp__".
- The URL must be a fully-formed valid URL
- HTTP URLs will be automatically upgraded to HTTPS
- This tool is read-only and does not modify any files
- Results may be summarized if the content is very large (max 5MB)
- Supports three output formats:
- "text": Plain text extraction from HTML, or raw content for non-HTML
- "markdown": HTML converted to markdown, or code-wrapped for non-HTML
- "html": Raw HTML content
- Timeout can be specified in seconds (default 30s, max 120s)`