internal/builtin/fetch.go

package builtin

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/JohannesKaufmann/html-to-markdown"
	"github.com/PuerkitoBio/goquery"
	"github.com/mark3labs/mcp-go/mcp"
	"github.com/mark3labs/mcp-go/server"
)

// Limits applied by executeFetch.
const (
	// maxResponseSize is the largest response body, in bytes, that is accepted (5 MiB).
	maxResponseSize = 5 << 20
	// defaultFetchTimeout is used when the caller supplies no positive "timeout" argument.
	defaultFetchTimeout = 30 * time.Second
	// maxFetchTimeout is the upper bound applied to a caller-supplied timeout.
	maxFetchTimeout = 2 * time.Minute
)

// NewFetchServer builds the built-in "fetch-server" MCP server (version "1.0.0")
// and registers its only tool, "fetch", with executeFetch as the handler.
//
// The tool declares three arguments: a required "url" string, a required
// "format" string restricted to "text", "markdown" or "html", and an optional
// numeric "timeout" bounded to the range 0-120. The returned error is always
// nil in the current implementation.
func NewFetchServer() (*server.MCPServer, error) {
	// Argument schema for the fetch tool.
	urlParam := mcp.WithString("url",
		mcp.Required(),
		mcp.Description("The URL to fetch content from"),
	)
	formatParam := mcp.WithString("format",
		mcp.Required(),
		mcp.Enum("text", "markdown", "html"),
		mcp.Description("The format to return the content in (text, markdown, or html)"),
	)
	timeoutParam := mcp.WithNumber("timeout",
		mcp.Description("Optional timeout in seconds (max 120)"),
		mcp.Min(0),
		mcp.Max(120),
	)

	tool := mcp.NewTool("fetch",
		mcp.WithDescription(fetchDescription),
		urlParam,
		formatParam,
		timeoutParam,
	)

	srv := server.NewMCPServer("fetch-server", "1.0.0", server.WithToolCapabilities(true))
	srv.AddTool(tool, executeFetch)

	return srv, nil
}

// executeFetch handles one invocation of the "fetch" tool.
//
// It reads the "url", "format" and optional "timeout" arguments from request,
// normalizes the URL (a missing scheme defaults to https://, and http:// is
// upgraded to https:// unless isLocalhost accepts the host), issues a GET and
// returns the body in the requested format:
//
//   - "text":     text extracted from HTML, or the raw body for non-HTML
//   - "markdown": HTML converted to markdown, or the raw body in a code fence
//   - "html":     the raw body
//
// Responses with a non-2xx status or a body larger than maxResponseSize are
// rejected. Every failure is reported as a tool-level error result; the
// returned Go error is always nil.
func executeFetch(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
	// Extract parameters
	urlStr, err := request.RequireString("url")
	if err != nil {
		return mcp.NewToolResultError("url parameter is required and must be a string"), nil
	}

	format, err := request.RequireString("format")
	if err != nil {
		return mcp.NewToolResultError("format parameter is required and must be a string"), nil
	}

	// Validate format
	if format != "text" && format != "markdown" && format != "html" {
		return mcp.NewToolResultError("format must be 'text', 'markdown', or 'html'"), nil
	}

	// Parse timeout (optional). The value is clamped while still a float and
	// only then converted, so fractional seconds are preserved and very large
	// inputs cannot overflow time.Duration. A zero duration must never reach
	// http.Client.Timeout, because there it means "no timeout".
	timeout := defaultFetchTimeout
	if timeoutSec := request.GetFloat("timeout", 0); timeoutSec > 0 {
		timeoutSec = min(timeoutSec, maxFetchTimeout.Seconds())
		if d := time.Duration(timeoutSec * float64(time.Second)); d > 0 {
			timeout = d
		}
	}

	// Validate URL
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return mcp.NewToolResultError(fmt.Sprintf("invalid URL: %v", err)), nil
	}

	// Ensure URL has a scheme
	if parsedURL.Scheme == "" {
		urlStr = "https://" + urlStr
		parsedURL, err = url.Parse(urlStr)
		if err != nil {
			return mcp.NewToolResultError(fmt.Sprintf("invalid URL after adding https: %v", err)), nil
		}
	}

	// Only allow HTTP and HTTPS
	if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
		return mcp.NewToolResultError("URL must use http:// or https://"), nil
	}

	// Upgrade HTTP to HTTPS only for external URLs (not localhost/127.0.0.1)
	if parsedURL.Scheme == "http" && !isLocalhost(parsedURL.Host) {
		parsedURL.Scheme = "https"
		urlStr = parsedURL.String()
	}

	// Create HTTP client with timeout
	client := &http.Client{
		Timeout: timeout,
	}

	// Create request with context so caller cancellation aborts the fetch
	req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
	if err != nil {
		return mcp.NewToolResultError(fmt.Sprintf("failed to create request: %v", err)), nil
	}

	// Set headers to mimic a real browser
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8")
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")

	// Make the request
	resp, err := client.Do(req)
	if err != nil {
		return mcp.NewToolResultError(fmt.Sprintf("request failed: %v", err)), nil
	}
	defer resp.Body.Close()

	// Check status code
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return mcp.NewToolResultError(fmt.Sprintf("request failed with status code: %d", resp.StatusCode)), nil
	}

	// Reject early when the server announces an oversized body
	if resp.ContentLength > maxResponseSize {
		return mcp.NewToolResultError("response too large (exceeds 5MB limit)"), nil
	}

	// Read at most one byte past the limit: enough to detect an oversized
	// body without buffering all of it.
	limitedReader := io.LimitReader(resp.Body, maxResponseSize+1)
	bodyBytes, err := io.ReadAll(limitedReader)
	if err != nil {
		return mcp.NewToolResultError(fmt.Sprintf("failed to read response: %v", err)), nil
	}

	// Check if we exceeded the size limit
	if len(bodyBytes) > maxResponseSize {
		return mcp.NewToolResultError("response too large (exceeds 5MB limit)"), nil
	}

	content := string(bodyBytes)
	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		contentType = "unknown"
	}

	// Media types are case-insensitive, so compare on a lowercased copy; the
	// original header value is still what gets reported in the title.
	isHTML := strings.Contains(strings.ToLower(contentType), "text/html")

	// Process content based on format
	var output string
	switch {
	case format == "text" && isHTML:
		output, err = extractTextFromHTML(content)
		if err != nil {
			return mcp.NewToolResultError(fmt.Sprintf("failed to extract text from HTML: %v", err)), nil
		}

	case format == "markdown" && isHTML:
		output, err = convertHTMLToMarkdown(content)
		if err != nil {
			return mcp.NewToolResultError(fmt.Sprintf("failed to convert HTML to markdown: %v", err)), nil
		}

	case format == "markdown":
		// Non-HTML content is wrapped in a code fence
		output = "```\n" + content + "\n```"

	default:
		// "html", and "text" for non-HTML content: return the body unchanged
		output = content
	}

	// Create result with metadata
	title := fmt.Sprintf("%s (%s)", urlStr, contentType)
	result := mcp.NewToolResultText(output)
	result.Meta = &mcp.Meta{
		AdditionalFields: map[string]any{
			"title": title,
		},
	}

	return result, nil
}

// extractTextFromHTML parses htmlContent, drops script, style, noscript,
// iframe, object and embed elements, and returns the remaining document text
// with every line trimmed and blank lines removed. It returns an error only
// when the document cannot be parsed.
func extractTextFromHTML(htmlContent string) (string, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
	if err != nil {
		return "", err
	}

	// Strip elements that carry no readable content.
	doc.Find("script, style, noscript, iframe, object, embed").Remove()

	// Collapse whitespace: keep only non-empty trimmed lines, newline-separated.
	var out strings.Builder
	for _, raw := range strings.Split(doc.Text(), "\n") {
		line := strings.TrimSpace(raw)
		if line == "" {
			continue
		}
		if out.Len() > 0 {
			out.WriteByte('\n')
		}
		out.WriteString(line)
	}

	return out.String(), nil
}

// convertHTMLToMarkdown renders htmlContent as markdown using the md
// converter, after telling it to drop script, style, meta and link elements.
// On conversion failure it returns an empty string and the converter's error.
func convertHTMLToMarkdown(htmlContent string) (string, error) {
	conv := md.NewConverter("", true, nil)

	// Elements that should not appear in the markdown output.
	for _, tag := range []string{"script", "style", "meta", "link"} {
		conv.Remove(tag)
	}

	result, convErr := conv.ConvertString(htmlContent)
	if convErr != nil {
		return "", convErr
	}
	return result, nil
}

// isLocalhost reports whether host (in the form found in url.URL.Host, i.e.
// optionally followed by ":port" and with IPv6 literals in square brackets)
// names the local machine: "localhost" (case-insensitive), "127.0.0.1" or
// "::1".
//
// The host name is compared exactly rather than by prefix, so look-alike
// names such as "localhost.evil.com" or "127.0.0.10" are not treated as
// local, and the bracketed IPv6 form "[::1]:8080" is recognized.
func isLocalhost(host string) bool {
	// A bare, unbracketed "::1" would be misread as host "::" plus port "1"
	// by Hostname, so accept it explicitly.
	if host == "::1" {
		return true
	}

	// Hostname strips a trailing port and any IPv6 square brackets.
	name := (&url.URL{Host: host}).Hostname()

	return strings.EqualFold(name, "localhost") ||
		name == "127.0.0.1" ||
		name == "::1"
}

// fetchDescription is the description text passed to mcp.WithDescription when
// the "fetch" tool is registered in NewFetchServer. It is sent to clients
// verbatim, so edits here change what tool consumers see.
//
// NOTE(review): the text says HTTP URLs are always upgraded to HTTPS, but
// executeFetch skips the upgrade for hosts accepted by isLocalhost.
const fetchDescription = `Fetches content from a specified URL and returns it in the requested format.

- Fetches content from a specified URL
- Takes a URL and format as input
- Fetches the URL content, converts HTML to markdown or text as requested
- Returns the content in the specified format
- Use this tool when you need to retrieve and analyze web content

Usage notes:
  - IMPORTANT: If an MCP-provided web fetch tool is available, prefer using that tool instead of this one, as it may have fewer restrictions. All MCP-provided tools start with "mcp__".
  - The URL must be a fully-formed valid URL
  - HTTP URLs will be automatically upgraded to HTTPS
  - This tool is read-only and does not modify any files
  - Results may be summarized if the content is very large (max 5MB)
  - Supports three output formats:
    - "text": Plain text extraction from HTML, or raw content for non-HTML
    - "markdown": HTML converted to markdown, or code-wrapped for non-HTML
    - "html": Raw HTML content
  - Timeout can be specified in seconds (default 30s, max 120s)`
new builtin servers 2025-06-24 16:53:48 +03:00			`package builtin`

			`import (`
			`"context"`
			`"fmt"`
			`"io"`
			`"net/http"`
			`"net/url"`
			`"strings"`
			`"time"`

			`"github.com/JohannesKaufmann/html-to-markdown"`
			`"github.com/PuerkitoBio/goquery"`
			`"github.com/mark3labs/mcp-go/mcp"`
			`"github.com/mark3labs/mcp-go/server"`
			`)`

			`const (`
			`maxResponseSize = 5 * 1024 * 1024 // 5MB`
			`defaultFetchTimeout = 30 * time.Second`
			`maxFetchTimeout = 120 * time.Second`
			`)`

godoc 2025-11-12 16:48:46 +03:00			`// NewFetchServer creates a new MCP server that provides web content fetching capabilities.`
			`// The server includes a single tool "fetch" that retrieves content from URLs and converts`
			`// it to text, markdown, or HTML format. Returns an error if server initialization fails.`
new builtin servers 2025-06-24 16:53:48 +03:00			`func NewFetchServer() (*server.MCPServer, error) {`
			`s := server.NewMCPServer("fetch-server", "1.0.0", server.WithToolCapabilities(true))`

			`// Register the fetch tool`
			`fetchTool := mcp.NewTool("fetch",`
			`mcp.WithDescription(fetchDescription),`
			`mcp.WithString("url",`
			`mcp.Required(),`
			`mcp.Description("The URL to fetch content from"),`
			`),`
			`mcp.WithString("format",`
			`mcp.Required(),`
			`mcp.Enum("text", "markdown", "html"),`
			`mcp.Description("The format to return the content in (text, markdown, or html)"),`
			`),`
			`mcp.WithNumber("timeout",`
			`mcp.Description("Optional timeout in seconds (max 120)"),`
			`mcp.Min(0),`
			`mcp.Max(120),`
			`),`
			`)`

			`s.AddTool(fetchTool, executeFetch)`

			`return s, nil`
			`}`

			`// executeFetch handles the fetch tool execution`
			`func executeFetch(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {`
			`// Extract parameters`
			`urlStr, err := request.RequireString("url")`
			`if err != nil {`
			`return mcp.NewToolResultError("url parameter is required and must be a string"), nil`
			`}`

			`format, err := request.RequireString("format")`
			`if err != nil {`
			`return mcp.NewToolResultError("format parameter is required and must be a string"), nil`
			`}`

			`// Validate format`
			`if format != "text" && format != "markdown" && format != "html" {`
			`return mcp.NewToolResultError("format must be 'text', 'markdown', or 'html'"), nil`
			`}`

			`// Parse timeout (optional)`
			`timeout := defaultFetchTimeout`
			`if timeoutSec := request.GetFloat("timeout", 0); timeoutSec > 0 {`
			`timeoutDuration := time.Duration(timeoutSec) * time.Second`
fix: eliminate escape sequence leak from spinner tea.Program instances 2026-02-25 18:17:25 +03:00			`timeout = min(timeoutDuration, maxFetchTimeout)`
new builtin servers 2025-06-24 16:53:48 +03:00			`}`

			`// Validate URL`
			`parsedURL, err := url.Parse(urlStr)`
			`if err != nil {`
			`return mcp.NewToolResultError(fmt.Sprintf("invalid URL: %v", err)), nil`
			`}`

			`// Ensure URL has a scheme`
			`if parsedURL.Scheme == "" {`
			`urlStr = "https://" + urlStr`
			`parsedURL, err = url.Parse(urlStr)`
			`if err != nil {`
			`return mcp.NewToolResultError(fmt.Sprintf("invalid URL after adding https: %v", err)), nil`
			`}`
			`}`

			`// Only allow HTTP and HTTPS`
			`if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {`
			`return mcp.NewToolResultError("URL must use http:// or https://"), nil`
			`}`

			`// Upgrade HTTP to HTTPS only for external URLs (not localhost/127.0.0.1)`
			`if parsedURL.Scheme == "http" && !isLocalhost(parsedURL.Host) {`
			`parsedURL.Scheme = "https"`
			`urlStr = parsedURL.String()`
			`}`

			`// Create HTTP client with timeout`
			`client := &http.Client{`
			`Timeout: timeout,`
			`}`

			`// Create request with context`
			`req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)`
			`if err != nil {`
			`return mcp.NewToolResultError(fmt.Sprintf("failed to create request: %v", err)), nil`
			`}`

			`// Set headers to mimic a real browser`
			`req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")`
			`req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,/;q=0.8")`
			`req.Header.Set("Accept-Language", "en-US,en;q=0.9")`

			`// Make the request`
			`resp, err := client.Do(req)`
			`if err != nil {`
			`return mcp.NewToolResultError(fmt.Sprintf("request failed: %v", err)), nil`
			`}`
			`defer resp.Body.Close()`

			`// Check status code`
			`if resp.StatusCode < 200 \|\| resp.StatusCode >= 300 {`
			`return mcp.NewToolResultError(fmt.Sprintf("request failed with status code: %d", resp.StatusCode)), nil`
			`}`

			`// Check content length`
			`if resp.ContentLength > maxResponseSize {`
			`return mcp.NewToolResultError("response too large (exceeds 5MB limit)"), nil`
			`}`

			`// Read response body with size limit`
			`limitedReader := io.LimitReader(resp.Body, maxResponseSize+1)`
			`bodyBytes, err := io.ReadAll(limitedReader)`
			`if err != nil {`
			`return mcp.NewToolResultError(fmt.Sprintf("failed to read response: %v", err)), nil`
			`}`

			`// Check if we exceeded the size limit`
			`if len(bodyBytes) > maxResponseSize {`
			`return mcp.NewToolResultError("response too large (exceeds 5MB limit)"), nil`
			`}`

			`content := string(bodyBytes)`
			`contentType := resp.Header.Get("Content-Type")`
			`if contentType == "" {`
			`contentType = "unknown"`
			`}`

			`// Process content based on format`
			`var output string`
			`switch format {`
			`case "text":`
			`if strings.Contains(contentType, "text/html") {`
			`output, err = extractTextFromHTML(content)`
			`if err != nil {`
			`return mcp.NewToolResultError(fmt.Sprintf("failed to extract text from HTML: %v", err)), nil`
			`}`
			`} else {`
			`output = content`
			`}`

			`case "markdown":`
			`if strings.Contains(contentType, "text/html") {`
			`output, err = convertHTMLToMarkdown(content)`
			`if err != nil {`
			`return mcp.NewToolResultError(fmt.Sprintf("failed to convert HTML to markdown: %v", err)), nil`
			`}`
			`} else {`
			output = "```\n" + content + "\n```"
			`}`

			`case "html":`
			`output = content`

			`default:`
			`output = content`
			`}`

			`// Create result with metadata`
			`title := fmt.Sprintf("%s (%s)", urlStr, contentType)`
			`result := mcp.NewToolResultText(output)`
Update mcp-go 2025-08-11 20:21:05 +03:00			`result.Meta = &mcp.Meta{`
			`AdditionalFields: map[string]any{`
			`"title": title,`
			`},`
new builtin servers 2025-06-24 16:53:48 +03:00			`}`

			`return result, nil`
			`}`

			`// extractTextFromHTML extracts plain text from HTML content`
			`func extractTextFromHTML(htmlContent string) (string, error) {`
			`doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))`
			`if err != nil {`
			`return "", err`
			`}`

			`// Remove script, style, and other non-content elements`
			`doc.Find("script, style, noscript, iframe, object, embed").Remove()`

			`// Extract text content`
			`text := doc.Text()`

			`// Clean up whitespace`
			`lines := strings.Split(text, "\n")`
			`var cleanLines []string`
			`for _, line := range lines {`
			`trimmed := strings.TrimSpace(line)`
			`if trimmed != "" {`
			`cleanLines = append(cleanLines, trimmed)`
			`}`
			`}`

			`return strings.Join(cleanLines, "\n"), nil`
			`}`

			`// convertHTMLToMarkdown converts HTML content to markdown`
			`func convertHTMLToMarkdown(htmlContent string) (string, error) {`
			`converter := md.NewConverter("", true, nil)`

			`// Remove unwanted elements`
			`converter.Remove("script")`
			`converter.Remove("style")`
			`converter.Remove("meta")`
			`converter.Remove("link")`

			`markdown, err := converter.ConvertString(htmlContent)`
			`if err != nil {`
			`return "", err`
			`}`

			`return markdown, nil`
			`}`

			`// isLocalhost checks if the host is localhost or 127.0.0.1`
			`func isLocalhost(host string) bool {`
			`return strings.HasPrefix(host, "localhost") \|\|`
			`strings.HasPrefix(host, "127.0.0.1") \|\|`
			`strings.HasPrefix(host, "::1")`
			`}`

			const fetchDescription = `Fetches content from a specified URL and returns it in the requested format.

			`- Fetches content from a specified URL`
			`- Takes a URL and format as input`
			`- Fetches the URL content, converts HTML to markdown or text as requested`
			`- Returns the content in the specified format`
			`- Use this tool when you need to retrieve and analyze web content`

			`Usage notes:`
			`- IMPORTANT: If an MCP-provided web fetch tool is available, prefer using that tool instead of this one, as it may have fewer restrictions. All MCP-provided tools start with "mcp__".`
			`- The URL must be a fully-formed valid URL`
			`- HTTP URLs will be automatically upgraded to HTTPS`
			`- This tool is read-only and does not modify any files`
			`- Results may be summarized if the content is very large (max 5MB)`
			`- Supports three output formats:`
			`- "text": Plain text extraction from HTML, or raw content for non-HTML`
			`- "markdown": HTML converted to markdown, or code-wrapped for non-HTML`
			`- "html": Raw HTML content`
			- Timeout can be specified in seconds (default 30s, max 120s)`