mirror of
https://github.com/mark3labs/kit.git
synced 2026-06-14 03:30:26 +00:00
272 lines
8.0 KiB
Go
272 lines
8.0 KiB
Go
package builtin
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/JohannesKaufmann/html-to-markdown"
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/mark3labs/mcp-go/mcp"
|
|
"github.com/mark3labs/mcp-go/server"
|
|
)
|
|
|
|
// Limits applied to every fetch request.
const (
	// maxResponseSize is the largest response body, in bytes, that a fetch
	// will accept (5 MiB); larger responses are reported as an error.
	maxResponseSize = 5 << 20

	// defaultFetchTimeout is the HTTP client timeout used when the caller
	// does not supply a positive "timeout" argument.
	defaultFetchTimeout = 30 * time.Second

	// maxFetchTimeout is the upper bound a caller-supplied timeout is
	// clamped to.
	maxFetchTimeout = 2 * time.Minute
)
|
|
|
|
// NewFetchServer creates a new MCP server that provides web content fetching capabilities.
|
|
// The server includes a single tool "fetch" that retrieves content from URLs and converts
|
|
// it to text, markdown, or HTML format. Returns an error if server initialization fails.
|
|
func NewFetchServer() (*server.MCPServer, error) {
|
|
s := server.NewMCPServer("fetch-server", "1.0.0", server.WithToolCapabilities(true))
|
|
|
|
// Register the fetch tool
|
|
fetchTool := mcp.NewTool("fetch",
|
|
mcp.WithDescription(fetchDescription),
|
|
mcp.WithString("url",
|
|
mcp.Required(),
|
|
mcp.Description("The URL to fetch content from"),
|
|
),
|
|
mcp.WithString("format",
|
|
mcp.Required(),
|
|
mcp.Enum("text", "markdown", "html"),
|
|
mcp.Description("The format to return the content in (text, markdown, or html)"),
|
|
),
|
|
mcp.WithNumber("timeout",
|
|
mcp.Description("Optional timeout in seconds (max 120)"),
|
|
mcp.Min(0),
|
|
mcp.Max(120),
|
|
),
|
|
)
|
|
|
|
s.AddTool(fetchTool, executeFetch)
|
|
|
|
return s, nil
|
|
}
|
|
|
|
// executeFetch handles the fetch tool execution
|
|
func executeFetch(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
|
|
// Extract parameters
|
|
urlStr, err := request.RequireString("url")
|
|
if err != nil {
|
|
return mcp.NewToolResultError("url parameter is required and must be a string"), nil
|
|
}
|
|
|
|
format, err := request.RequireString("format")
|
|
if err != nil {
|
|
return mcp.NewToolResultError("format parameter is required and must be a string"), nil
|
|
}
|
|
|
|
// Validate format
|
|
if format != "text" && format != "markdown" && format != "html" {
|
|
return mcp.NewToolResultError("format must be 'text', 'markdown', or 'html'"), nil
|
|
}
|
|
|
|
// Parse timeout (optional)
|
|
timeout := defaultFetchTimeout
|
|
if timeoutSec := request.GetFloat("timeout", 0); timeoutSec > 0 {
|
|
timeoutDuration := time.Duration(timeoutSec) * time.Second
|
|
if timeoutDuration > maxFetchTimeout {
|
|
timeout = maxFetchTimeout
|
|
} else {
|
|
timeout = timeoutDuration
|
|
}
|
|
}
|
|
|
|
// Validate URL
|
|
parsedURL, err := url.Parse(urlStr)
|
|
if err != nil {
|
|
return mcp.NewToolResultError(fmt.Sprintf("invalid URL: %v", err)), nil
|
|
}
|
|
|
|
// Ensure URL has a scheme
|
|
if parsedURL.Scheme == "" {
|
|
urlStr = "https://" + urlStr
|
|
parsedURL, err = url.Parse(urlStr)
|
|
if err != nil {
|
|
return mcp.NewToolResultError(fmt.Sprintf("invalid URL after adding https: %v", err)), nil
|
|
}
|
|
}
|
|
|
|
// Only allow HTTP and HTTPS
|
|
if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
|
|
return mcp.NewToolResultError("URL must use http:// or https://"), nil
|
|
}
|
|
|
|
// Upgrade HTTP to HTTPS only for external URLs (not localhost/127.0.0.1)
|
|
if parsedURL.Scheme == "http" && !isLocalhost(parsedURL.Host) {
|
|
parsedURL.Scheme = "https"
|
|
urlStr = parsedURL.String()
|
|
}
|
|
|
|
// Create HTTP client with timeout
|
|
client := &http.Client{
|
|
Timeout: timeout,
|
|
}
|
|
|
|
// Create request with context
|
|
req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
|
|
if err != nil {
|
|
return mcp.NewToolResultError(fmt.Sprintf("failed to create request: %v", err)), nil
|
|
}
|
|
|
|
// Set headers to mimic a real browser
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
|
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8")
|
|
req.Header.Set("Accept-Language", "en-US,en;q=0.9")
|
|
|
|
// Make the request
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return mcp.NewToolResultError(fmt.Sprintf("request failed: %v", err)), nil
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Check status code
|
|
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
|
return mcp.NewToolResultError(fmt.Sprintf("request failed with status code: %d", resp.StatusCode)), nil
|
|
}
|
|
|
|
// Check content length
|
|
if resp.ContentLength > maxResponseSize {
|
|
return mcp.NewToolResultError("response too large (exceeds 5MB limit)"), nil
|
|
}
|
|
|
|
// Read response body with size limit
|
|
limitedReader := io.LimitReader(resp.Body, maxResponseSize+1)
|
|
bodyBytes, err := io.ReadAll(limitedReader)
|
|
if err != nil {
|
|
return mcp.NewToolResultError(fmt.Sprintf("failed to read response: %v", err)), nil
|
|
}
|
|
|
|
// Check if we exceeded the size limit
|
|
if len(bodyBytes) > maxResponseSize {
|
|
return mcp.NewToolResultError("response too large (exceeds 5MB limit)"), nil
|
|
}
|
|
|
|
content := string(bodyBytes)
|
|
contentType := resp.Header.Get("Content-Type")
|
|
if contentType == "" {
|
|
contentType = "unknown"
|
|
}
|
|
|
|
// Process content based on format
|
|
var output string
|
|
switch format {
|
|
case "text":
|
|
if strings.Contains(contentType, "text/html") {
|
|
output, err = extractTextFromHTML(content)
|
|
if err != nil {
|
|
return mcp.NewToolResultError(fmt.Sprintf("failed to extract text from HTML: %v", err)), nil
|
|
}
|
|
} else {
|
|
output = content
|
|
}
|
|
|
|
case "markdown":
|
|
if strings.Contains(contentType, "text/html") {
|
|
output, err = convertHTMLToMarkdown(content)
|
|
if err != nil {
|
|
return mcp.NewToolResultError(fmt.Sprintf("failed to convert HTML to markdown: %v", err)), nil
|
|
}
|
|
} else {
|
|
output = "```\n" + content + "\n```"
|
|
}
|
|
|
|
case "html":
|
|
output = content
|
|
|
|
default:
|
|
output = content
|
|
}
|
|
|
|
// Create result with metadata
|
|
title := fmt.Sprintf("%s (%s)", urlStr, contentType)
|
|
result := mcp.NewToolResultText(output)
|
|
result.Meta = &mcp.Meta{
|
|
AdditionalFields: map[string]any{
|
|
"title": title,
|
|
},
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
// extractTextFromHTML extracts plain text from HTML content
|
|
func extractTextFromHTML(htmlContent string) (string, error) {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
// Remove script, style, and other non-content elements
|
|
doc.Find("script, style, noscript, iframe, object, embed").Remove()
|
|
|
|
// Extract text content
|
|
text := doc.Text()
|
|
|
|
// Clean up whitespace
|
|
lines := strings.Split(text, "\n")
|
|
var cleanLines []string
|
|
for _, line := range lines {
|
|
trimmed := strings.TrimSpace(line)
|
|
if trimmed != "" {
|
|
cleanLines = append(cleanLines, trimmed)
|
|
}
|
|
}
|
|
|
|
return strings.Join(cleanLines, "\n"), nil
|
|
}
|
|
|
|
// convertHTMLToMarkdown converts HTML content to markdown
|
|
func convertHTMLToMarkdown(htmlContent string) (string, error) {
|
|
converter := md.NewConverter("", true, nil)
|
|
|
|
// Remove unwanted elements
|
|
converter.Remove("script")
|
|
converter.Remove("style")
|
|
converter.Remove("meta")
|
|
converter.Remove("link")
|
|
|
|
markdown, err := converter.ConvertString(htmlContent)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return markdown, nil
|
|
}
|
|
|
|
// isLocalhost reports whether host names the local machine: "localhost",
// "127.0.0.1" or the IPv6 loopback "::1".
//
// host is expected in the form found in url.URL.Host, i.e. it may carry a
// ":port" suffix and IPv6 literals may be wrapped in square brackets
// ("[::1]:8080"). The name is compared exactly (case-insensitively, ignoring
// a trailing dot) rather than by prefix, so lookalikes such as
// "localhost.example.com" or "127.0.0.1.example.com" are not treated as local.
func isLocalhost(host string) bool {
	hostname := host
	switch {
	case strings.HasPrefix(host, "["):
		// Bracketed IPv6 literal, optionally followed by ":port".
		end := strings.Index(host, "]")
		if end < 0 {
			return false
		}
		hostname = host[1:end]
	case strings.Count(host, ":") == 1:
		// "name:port" or "ipv4:port". More than one colon without brackets
		// is a bare IPv6 literal and is left untouched.
		hostname, _, _ = strings.Cut(host, ":")
	}

	hostname = strings.ToLower(strings.TrimSuffix(hostname, "."))
	return hostname == "localhost" || hostname == "127.0.0.1" || hostname == "::1"
}
|
|
|
|
// fetchDescription is the description advertised to clients for the "fetch"
// tool. Its usage notes describe what executeFetch actually does (scheme
// defaulting, the localhost exemption from the HTTPS upgrade, and the 5MB
// response limit); keep the two in sync.
const fetchDescription = `Fetches content from a specified URL and returns it in the requested format.

- Fetches content from a specified URL
- Takes a URL and format as input
- Fetches the URL content, converts HTML to markdown or text as requested
- Returns the content in the specified format
- Use this tool when you need to retrieve and analyze web content

Usage notes:
- IMPORTANT: If an MCP-provided web fetch tool is available, prefer using that tool instead of this one, as it may have fewer restrictions. All MCP-provided tools start with "mcp__".
- The URL must be a valid http:// or https:// URL; if the scheme is omitted, https:// is assumed
- HTTP URLs will be automatically upgraded to HTTPS, except for localhost addresses
- This tool is read-only and does not modify any files
- Responses larger than 5MB are rejected with an error
- Supports three output formats:
  - "text": Plain text extraction from HTML, or raw content for non-HTML
  - "markdown": HTML converted to markdown, or code-wrapped for non-HTML
  - "html": Raw HTML content
- Timeout can be specified in seconds (default 30s, max 120s)`
|