Compare commits

...

1 Commits

Author SHA1 Message Date
rdmclin2 c0c31009cb chore: optimize strip markdown & preview 2026-04-24 11:26:48 +08:00
3 changed files with 418 additions and 31 deletions
+109
View File
@@ -0,0 +1,109 @@
/**
* Dev-only preview: run realistic LLM-style markdown through stripMarkdown
* and print before/after side-by-side for eyeball review.
*
* Run: bunx tsx scripts/previewStripMarkdown.ts
*/
import { stripMarkdown } from '../src/server/services/bot/platforms/stripMarkdown';
/** One preview case: a Markdown snippet plus the label printed above its before/after output. */
type Sample = {
  /** Raw Markdown text handed to stripMarkdown. */
  input: string;
  /** Human-readable case label shown in the `CASE:` banner. */
  name: string;
};
// Markdown fixtures for the preview run. Each entry pairs a Markdown `input`
// with the `name` label printed above its before/after output.
const samples: Sample[] = [
// Case 1 — single-column table; the label says it should render as an unordered list.
{
input: ['| 待办 |', '|------|', '| 写 PR 描述 |', '| 跑 E2E |', '| 发布 canary |'].join('\n'),
name: '1 列表格(应渲染为无序列表)',
},
// Case 2 — two-column table; labelled "narrow table, single-line records".
{
input: ['| Name | Age |', '|------|-----|', '| Alice | 30 |', '| Bob | 25 |'].join('\n'),
name: '2 列表格(窄表,单行记录)',
},
// Case 3 — three-column table; labelled as the narrow-table boundary case.
{
input: [
'| Model | Context | Price |',
'|-------|---------|-------|',
'| GPT-5 | 1M | $3/M |',
'| Claude Opus 4.7 | 1M | $15/M |',
'| Gemini 2.5 | 2M | $1.25/M |',
].join('\n'),
name: '3 列表格(窄表临界)',
},
// Case 4 — four-column table; labelled "wide table, multi-line record blocks".
{
input: [
'| 姓名 | 年龄 | 职位 | 城市 |',
'|------|------|------|------|',
'| Alice | 30 | 工程师 | 上海 |',
'| Bob | 25 | 设计师 | 北京 |',
].join('\n'),
name: '4 列表格(宽表,多行记录块)',
},
// Case 5 — labelled "realistic LLM output (all elements)": headings, bold/italic,
// a table, an ordered list with a link and inline code, a fenced code block
// containing `#` and `**`, a blockquote, a horizontal rule and strikethrough.
{
input: [
'# 📊 今日销售简报',
'',
'这是 **Q4 第 3 周** 的关键数据,请重点关注 *华东区*。',
'',
'## 核心指标',
'',
'| 指标 | 值 | 环比 |',
'|------|----|----|',
'| GMV | ¥1.2M | +12% |',
'| 订单数 | 3,421 | +8% |',
'| 客单价 | ¥350 | +3% |',
'',
'## 待办事项',
'',
'1. 跟进 [华东区报告](https://example.com/report-q4w3)',
'2. 回复 `sales@company.com` 的邮件',
'3. 准备 ***周会 PPT***',
'',
'## 示例代码',
'',
'```ts',
'const revenue = orders.reduce((s, o) => s + o.amount, 0);',
'// 注意:这里不应被转换 # 或 **',
'```',
'',
'> 提醒:周五下班前提交',
'',
'---',
'',
'附:~~废弃方案~~已移除。',
].join('\n'),
name: '5 真实 LLM 输出(全要素)',
},
// Case 6 — labelled "list + inline formatting + image + nesting".
{
input: [
'支持的功能:',
'- 文本对话 with `markdown` 渲染',
'- 图片输入: ![示例](https://img.example.com/x.png)',
'- 工具调用(见 [文档](https://docs.example.com))',
' - 嵌套项 A',
' - 嵌套项 B',
'- **重要**:暂不支持语音',
].join('\n'),
name: '6 列表 + 行内格式 + 图片 + 嵌套',
},
// Case 7 — labelled "underscore boundaries": snake_case identifiers next to real _italic_.
{
input: [
'变量命名遵循 `snake_case`,例如 `user_id`、`created_at`。',
'不要与 _italic_ 混淆 —— 后者前后需有空格。',
].join('\n'),
name: '7 下划线边界(snake_case vs italic)',
},
];
/**
 * Build a horizontal rule for the console output.
 *
 * @param char - The string to repeat.
 * @param len - How many repetitions to emit (72 by default).
 * @returns `char` repeated `len` times.
 */
function divider(char: string, len = 72): string {
  return char.repeat(len);
}
// Print each sample as a banner-delimited block: the case label, the raw
// Markdown input, then the stripMarkdown output. A final rule closes the report.
const banner = divider('=');
samples.forEach((sample) => {
  console.log(`\n${banner}`);
  console.log(`CASE: ${sample.name}`);
  console.log(banner);
  console.log('--- INPUT ---');
  console.log(sample.input);
  console.log('--- OUTPUT ---');
  console.log(stripMarkdown(sample.input));
});
console.log(`\n${banner}`);
@@ -33,7 +33,24 @@ describe('stripMarkdown', () => {
it('should remove fenced code block markers but keep content', () => {
const input = '```typescript\nconst x = 1;\n```';
expect(stripMarkdown(input)).toBe('const x = 1;\n');
expect(stripMarkdown(input)).toBe('const x = 1;');
});
it('should not inject extra blank lines after a fenced code block', () => {
// Before the fix, the captured code content carried a trailing \n that
// stacked with the \n\n after the closing fence → two blank lines.
const input = ['```js', 'a', 'b', '```', '', 'next'].join('\n');
expect(stripMarkdown(input)).toBe('a\nb\n\nnext');
});
it('should not mangle markdown-like syntax inside code blocks', () => {
// Content inside a fenced code block must survive verbatim — no heading /
// table / emphasis rewriting should be applied to it.
const input = ['```md', '# Not a heading', '**not bold**', '| a | b |', '```'].join('\n');
const result = stripMarkdown(input);
expect(result).toContain('# Not a heading');
expect(result).toContain('**not bold**');
expect(result).toContain('| a | b |');
});
it('should convert links to text (url) format', () => {
@@ -50,11 +67,175 @@ describe('stripMarkdown', () => {
expect(stripMarkdown('> quoted text')).toBe('| quoted text');
});
it('should handle tables by converting to bullet list', () => {
const input = '| Name | Age |\n|------|-----|\n| Alice | 30 |\n| Bob | 25 |';
const result = stripMarkdown(input);
expect(result).toContain('- Name: Alice, Age: 30');
expect(result).toContain('- Name: Bob, Age: 25');
describe('tables', () => {
it('should convert narrow tables (23 cols) to single-line records', () => {
const input = '| Name | Age |\n|------|-----|\n| Alice | 30 |\n| Bob | 25 |';
const result = stripMarkdown(input);
expect(result).toContain('- Name: Alice, Age: 30');
expect(result).toContain('- Name: Bob, Age: 25');
});
it('should render single-column tables as a plain bullet list', () => {
const input = '| Item |\n|------|\n| Apple |\n| Banana |';
const result = stripMarkdown(input);
expect(result).toBe('- Apple\n- Banana');
});
it('should render wide tables (4+ cols) as multi-line record blocks', () => {
const input = [
'| 姓名 | 年龄 | 职位 | 城市 |',
'|------|------|------|------|',
'| Alice | 30 | 工程师 | 上海 |',
'| Bob | 25 | 设计师 | 北京 |',
].join('\n');
const result = stripMarkdown(input);
expect(result).toContain('【1】');
expect(result).toContain('姓名: Alice');
expect(result).toContain('年龄: 30');
expect(result).toContain('职位: 工程师');
expect(result).toContain('城市: 上海');
expect(result).toContain('【2】');
expect(result).toContain('姓名: Bob');
});
it('should handle escaped pipes inside cells', () => {
const input = '| Key | Value |\n|-----|-------|\n| pipe | a \\| b |';
const result = stripMarkdown(input);
expect(result).toContain('- Key: pipe, Value: a | b');
});
it('should skip empty cells without emitting stray "header: " fragments', () => {
const input = '| Name | Age |\n|------|-----|\n| Alice | |\n| | 25 |';
const result = stripMarkdown(input);
expect(result).toContain('- Name: Alice');
expect(result).toContain('- Age: 25');
expect(result).not.toContain('Age: \n');
expect(result).not.toContain('Name: ,');
});
it('should not treat a pipe-only line as a table', () => {
// `|--|` without a preceding header row should not trigger table parsing.
const input = 'just a line with | pipe characters |\nand another | one |';
const result = stripMarkdown(input);
expect(result).toBe(input);
});
it('should preserve the blank line between a table and the following block', () => {
// Before the fix, the body-row regex ate the trailing \n after the last
// row, collapsing the blank separator before the next heading.
const input = ['| a | b |', '|---|---|', '| 1 | 2 |', '', '## Next'].join('\n');
const result = stripMarkdown(input);
expect(result).toBe('- a: 1, b: 2\n\nNext');
});
});
describe('lists', () => {
it('should preserve unordered list markers', () => {
const input = '- Apple\n- Banana\n- Cherry';
expect(stripMarkdown(input)).toBe('- Apple\n- Banana\n- Cherry');
});
it('should preserve ordered list markers', () => {
const input = '1. First\n2. Second\n3. Third';
expect(stripMarkdown(input)).toBe('1. First\n2. Second\n3. Third');
});
it('should preserve nested list indentation', () => {
const input = '- Parent\n - Child A\n - Child B';
expect(stripMarkdown(input)).toBe('- Parent\n - Child A\n - Child B');
});
it('should strip inline formatting inside list items', () => {
const input =
'- Run `npm install`\n- Visit [docs](https://example.com)\n- **Important** note';
const result = stripMarkdown(input);
expect(result).toContain('- Run npm install');
expect(result).toContain('- Visit docs (https://example.com)');
expect(result).toContain('- Important note');
});
});
describe('horizontal rules', () => {
it('should normalize dash HR', () => {
expect(stripMarkdown('---')).toBe('---');
});
it('should normalize asterisk HR', () => {
expect(stripMarkdown('***')).toBe('---');
});
it('should normalize underscore HR', () => {
expect(stripMarkdown('___')).toBe('---');
});
it('should not swallow the blank line after a horizontal rule', () => {
// A trailing `\s*` in the HR regex would greedily eat the newline after
// `---`, collapsing the intended blank-line separator.
expect(stripMarkdown('before\n\n---\n\nafter')).toBe('before\n\n---\n\nafter');
});
});
describe('mixed inline formatting', () => {
it('should handle bold wrapping italic', () => {
expect(stripMarkdown('**bold _italic_**')).toBe('bold italic');
});
it('should handle multiple emphases on one line', () => {
expect(stripMarkdown('A **bold** and *italic* and `code` here')).toBe(
'A bold and italic and code here',
);
});
it('should not treat underscores inside identifiers as italic', () => {
// `some_snake_case` has underscores flanked by word chars, not whitespace —
// the italic rule must leave these alone or we'd mangle code/variable names.
expect(stripMarkdown('some_snake_case variable')).toBe('some_snake_case variable');
});
});
describe('structure preservation', () => {
it('should preserve blank lines between paragraphs', () => {
const input = 'First paragraph.\n\nSecond paragraph.';
expect(stripMarkdown(input)).toBe(input);
});
it('should pass through emoji and Chinese unchanged', () => {
expect(stripMarkdown('你好 世界 🎉 **重点**')).toBe('你好 世界 🎉 重点');
});
it('should handle a realistic LLM response with table + code + list', () => {
const input = [
'# 功能对比 🎯',
'',
'这里是 **三款工具** 的对比:',
'',
'| 工具 | 价格 |',
'|------|------|',
'| A 工具 | 免费 |',
'| B 工具 | $10 |',
'',
'## 推荐',
'',
'1. 先试试 *免费* 的 A',
'2. 详见 [文档](https://example.com)',
'',
'```bash',
'npm install a-tool',
'```',
].join('\n');
const result = stripMarkdown(input);
expect(result).not.toContain('**');
expect(result).not.toContain('```');
expect(result).not.toMatch(/^#\s/m);
expect(result).toContain('功能对比 🎯');
expect(result).toContain('三款工具');
expect(result).toContain('- 工具: A 工具, 价格: 免费');
expect(result).toContain('- 工具: B 工具, 价格: $10');
expect(result).toContain('1. 先试试 免费 的 A');
expect(result).toContain('2. 详见 文档 (https://example.com)');
expect(result).toContain('npm install a-tool');
});
});
it('should handle a complex mixed markdown document', () => {
@@ -6,39 +6,128 @@
* - Remove syntactic noise (**, `, #, []() etc.)
* - Keep code block content intact (just remove the fences)
* - Convert links to "text (url)" format so URLs are still accessible
* - Convert tables to aligned plain-text representation
* - Convert tables to a readable plain-text layout tuned for mobile chat
* (notably WeChat, which has no Markdown/HTML rendering and uses a
* proportional font that makes column-aligned ASCII tables unreliable).
*/
// Prefix of the sentinel token that stands in for a fenced code block while the
// other transforms run. stripMarkdown appends the block's index and a closing
// NUL character, then swaps the stored content back in at the end.
const CODE_BLOCK_PLACEHOLDER = '\u0000CODEBLOCK_';
/**
 * Split a Markdown table row into its cells on unescaped `|`.
 *
 * One leading and one trailing delimiter pipe are dropped, every cell is
 * trimmed, and an escaped pipe (`\|`) inside a cell becomes a literal `|`.
 *
 * @param row - A single table row, e.g. `| a | b \| c |`.
 * @returns The trimmed cell contents in column order.
 */
function splitTableRow(row: string): string[] {
  let s = row.trim();
  if (s.startsWith('|')) s = s.slice(1);
  // A trailing `\|` is an escaped literal pipe belonging to the last cell, not
  // the row's closing delimiter. Stripping it would leave a dangling backslash.
  if (s.endsWith('|') && !s.endsWith('\\|')) s = s.slice(0, -1);

  const cells: string[] = [];
  let buf = '';
  for (let i = 0; i < s.length; i++) {
    const ch = s[i];
    if (ch === '\\' && s[i + 1] === '|') {
      // Escaped pipe: keep the pipe, drop the backslash, skip both characters.
      buf += '|';
      i++;
      continue;
    }
    if (ch === '|') {
      // Unescaped pipe closes the current cell.
      cells.push(buf);
      buf = '';
      continue;
    }
    buf += ch;
  }
  // Flush the final cell (there is no delimiter after it once the edge pipe is removed).
  cells.push(buf);
  return cells.map((c) => c.trim());
}
/**
 * Render a parsed table as plain text, picking a layout based on column count.
 *
 * - 1 column    → plain bullet list of values
 * - 2–3 columns → single-line "- header: value, header: value" records
 * - 4+ columns  → multi-line record blocks prefixed with 【N】, one field per line
 *
 * WeChat wraps long single-line messages awkwardly on mobile; splitting wide
 * tables into field-per-line blocks keeps each row scannable.
 *
 * @param headers - Header cell texts; their count decides the layout.
 * @param rows - Body rows, each an array of cell texts.
 * @returns The plain-text rendering, or `''` when there are no body rows.
 */
function formatTableAsText(headers: string[], rows: string[][]): string {
  if (rows.length === 0) return '';

  const colCount = headers.length;
  const hasMeaningfulHeaders = headers.some((h) => h.length > 0);

  // Turn one row into "header: value" fragments. Empty values are dropped so no
  // stray "header: " appears; cells without a header fall back to the bare value.
  const joinPairs = (cells: string[]): string[] =>
    cells
      .map((cell, i) => {
        const header = headers[i] ?? '';
        const value = cell ?? '';
        if (!hasMeaningfulHeaders || !header) return value;
        if (!value) return '';
        return `${header}: ${value}`;
      })
      .filter((s) => s.length > 0);

  // Single column: the header adds nothing, so list the values only.
  if (colCount <= 1) {
    return rows
      .map((cells) => `- ${(cells[0] ?? '').trim()}`)
      .filter((line) => line !== '- ')
      .join('\n');
  }

  // Wide table: one numbered block per row, one field per line, blank line between blocks.
  if (colCount >= 4) {
    return rows
      .map((cells, idx) => {
        const fields = joinPairs(cells);
        // The 【N】 marker makes the record index stand out from field lines.
        return [`【${idx + 1}】`, ...fields].join('\n');
      })
      .join('\n\n');
  }

  // Narrow table (2–3 columns): one bullet per row; rows with no content are omitted.
  return rows
    .map((cells) => {
      const pairs = joinPairs(cells);
      return pairs.length > 0 ? `- ${pairs.join(', ')}` : '';
    })
    .filter((line) => line.length > 0)
    .join('\n');
}
export function stripMarkdown(md: string): string {
let text = md;
// --- Block-level transforms (order matters) ---
// --- Step 1: protect fenced code blocks ---
// Keep their content intact by swapping in placeholders before any other
// transform runs. Pipes or `#` inside code would otherwise be mangled by
// the table/heading rules below.
// We strip the captured content's trailing `\n` (always present because
// the closing fence sits on its own line); leaving it in would stack with
// the newline(s) following the closing fence and produce extra blank lines.
const codeBlocks: string[] = [];
text = text.replaceAll(/^```[\w-]*\n([\s\S]*?)^```/gm, (_match, content: string) => {
const idx = codeBlocks.push(content.replace(/\n$/, '')) - 1;
return `${CODE_BLOCK_PLACEHOLDER}${idx}\u0000`;
});
// Fenced code blocks: remove fences, keep content
text = text.replaceAll(/^```[\w-]*\n([\s\S]*?)^```/gm, '$1');
// --- Step 2: block-level transforms ---
// Tables: convert to bullet-style rows
// Tables → readable plain text. The separator row must contain at least one
// `-` per our regex, which prevents accidental matches on stray `|`-only lines.
// The body group `(?:\|.+\|\n?)*` greedily consumes the trailing `\n` after
// the last row when present; we re-emit it so a blank line that originally
// separated the table from the following content survives the rewrite.
text = text.replaceAll(
/^(\|.+\|)\n\|[-\s|:]+\|\n((?:\|.+\|\n?)*)/gm,
(_match, headerRow: string, bodyRows: string) => {
const parseRow = (row: string) =>
row
.split('|')
.slice(1, -1)
.map((c: string) => c.trim());
const headers = parseRow(headerRow);
/^(\|.+\|)\n\|[\s:|]*-[-\s:|]*\|\n((?:\|.+\|\n?)*)/gm,
(match, headerRow: string, bodyRows: string) => {
const headers = splitTableRow(headerRow);
const rows = bodyRows
.trim()
.split('\n')
.filter(Boolean)
.map((r: string) => parseRow(r));
return rows
.map((cells: string[]) =>
cells.map((cell: string, i: number) => `${headers[i]}: ${cell}`).join(', '),
)
.map((line: string) => `- ${line}`)
.join('\n');
.map((r: string) => splitTableRow(r));
const trailingNewline = match.endsWith('\n') ? '\n' : '';
return formatTableAsText(headers, rows) + trailingNewline;
},
);
@@ -48,10 +137,12 @@ export function stripMarkdown(md: string): string {
// Blockquotes: replace > with vertical bar
text = text.replaceAll(/^>\s?/gm, '| ');
// Horizontal rules
text = text.replaceAll(/^[-*_]{3,}\s*$/gm, '---');
// Horizontal rules. Use `[ \t]*$` rather than `\s*$` because `\s` matches
// `\n`, and in /m mode a trailing `\s*$` would greedily consume the line's
// terminating newline, swallowing a blank-line separator from the next block.
text = text.replaceAll(/^[-*_]{3,}[ \t]*$/gm, '---');
// --- Inline transforms ---
// --- Step 3: inline transforms ---
// Images: ![alt](url) → alt
text = text.replaceAll(/!\[([^\]]*)\]\([^)]+\)/g, '$1');
@@ -77,5 +168,11 @@ export function stripMarkdown(md: string): string {
// Inline code: `text`
text = text.replaceAll(/`([^`]+)`/g, '$1');
// --- Step 4: restore protected code blocks ---
text = text.replaceAll(
new RegExp(`${CODE_BLOCK_PLACEHOLDER}(\\d+)\\0`, 'g'),
(_match, idx: string) => codeBlocks[Number(idx)] ?? '',
);
return text;
}