Compare commits

...

2 Commits

Author SHA1 Message Date
Arvin Xu cccb01f57d ♻️ refactor: remove redundant update-status call from GatewayStreamNotifier
Gateway now handles session completion directly in pushEvent when it
receives agent_runtime_end, so the separate update-status HTTP call
is no longer needed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 13:21:18 +08:00
Arvin Xu bfa1b70c96 🐛 fix(web-crawler): prevent happy-dom CSS parsing crash in htmlToMarkdown
- Disable CSS file loading and JS evaluation in happy-dom Window (root cause)
- Add try-catch around Readability.parse() for defense in depth
- Add regression tests for invalid CSS selectors and external stylesheet links

Closes LOBE-6869

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 12:40:02 +08:00
3 changed files with 44 additions and 11 deletions
@@ -50,6 +50,39 @@ describe('htmlToMarkdown', () => {
expect(result.content.length).toBeLessThan(html.length);
}, 20000);
it('should not crash on HTML with invalid CSS selectors (LOBE-6869)', () => {
  // Regression guard for LOBE-6869: happy-dom throws a TypeError on pages whose
  // CSS selectors it cannot parse. The error must stay inside htmlToMarkdown,
  // which is expected to fall back to converting the raw HTML instead.
  const markup = `
<html><head>
<style>:is(.foo, :has(> .bar)) { color: red }</style>
</head><body>
<script type="application/ld+json">{"@type":"Article","name":"Test"}</script>
<p>Valid content here</p>
</body></html>`;

  const output = htmlToMarkdown(markup, { url: 'https://example.com', filterOptions: {} });

  // Check that a result exists before reading from it, so a missing result
  // fails with a clear assertion message.
  expect(output).toBeDefined();
  const { content } = output;
  expect(content).toContain('Valid content');
});
it('should not crash on HTML with external stylesheet links (LOBE-6869)', () => {
  // Regression guard for LOBE-6869: happy-dom's HTMLLinkElement.#loadStyleSheet
  // can crash while parsing CSS. With disableCSSFileLoading set, that code path
  // should never be reached for a <link rel="stylesheet"> element.
  const markup = `
<html><head>
<link rel="stylesheet" href="https://example.com/styles.css">
</head><body>
<p>Content with external CSS</p>
</body></html>`;

  const output = htmlToMarkdown(markup, { url: 'https://example.com', filterOptions: {} });

  // Check that a result exists before reading from it, so a missing result
  // fails with a clear assertion message.
  expect(output).toBeDefined();
  const { content } = output;
  expect(content).toContain('Content with external CSS');
});
it('should not truncate HTML under 1 MB', () => {
const html = '<html><body><p>Small content</p></body></html>';
@@ -31,13 +31,21 @@ export const htmlToMarkdown = (
{ url, filterOptions }: { filterOptions: FilterOptions; url: string },
): HtmlToMarkdownOutput => {
const html = rawHtml.length > MAX_HTML_SIZE ? rawHtml.slice(0, MAX_HTML_SIZE) : rawHtml;
const window = new Window({ url });
const window = new Window({
settings: { disableCSSFileLoading: true, disableJavaScriptEvaluation: true },
url,
});
const document = window.document;
document.body.innerHTML = html;
// @ts-expect-error reason: Readability expects a Document type
const parsedContent = new Readability(document).parse();
let parsedContent: ReturnType<Readability<string>['parse']> = null;
try {
// @ts-expect-error reason: Readability expects a Document type
parsedContent = new Readability(document).parse();
} catch {
// happy-dom may throw on pages with invalid CSS selectors — fall back to raw HTML
}
const useReadability = filterOptions.enableReadability ?? true;
@@ -96,14 +96,6 @@ export class GatewayStreamNotifier implements IStreamEventManager {
type: 'agent_runtime_end',
});
const status =
reason === 'error' ? 'error' : reason === 'interrupted' ? 'interrupted' : 'completed';
this.httpPost('/api/operations/update-status', {
operationId,
status,
summary: reasonDetail,
});
return result;
}