chore: optimize strip markdown & preview

2026-06-14 03:30:19 +00:00 · 2026-04-24 11:26:48 +08:00
3 changed files with 418 additions and 31 deletions
@@ -0,0 +1,109 @@
+/**
+ * Dev-only preview: run realistic LLM-style markdown through stripMarkdown
+ * and print each input followed by its output for eyeball review.
+ *
+ * Run: bunx tsx scripts/previewStripMarkdown.ts
+ */
+import { stripMarkdown } from '../src/server/services/bot/platforms/stripMarkdown';
+
+interface Sample { // one preview fixture
+  input: string; // raw Markdown passed to stripMarkdown and echoed under "--- INPUT ---"
+  name: string; // label printed on the "CASE: " line
+}
+
+const samples: Sample[] = [
+  {
+    input: ['| 待办 |', '|------|', '| 写 PR 描述 |', '| 跑 E2E |', '| 发布 canary |'].join('\n'),
+    name: '1 列表格(应渲染为无序列表)',
+  },
+  {
+    input: ['| Name | Age |', '|------|-----|', '| Alice | 30 |', '| Bob | 25 |'].join('\n'),
+    name: '2 列表格(窄表,单行记录)',
+  },
+  {
+    input: [
+      '| Model | Context | Price |',
+      '|-------|---------|-------|',
+      '| GPT-5 | 1M | $3/M |',
+      '| Claude Opus 4.7 | 1M | $15/M |',
+      '| Gemini 2.5 | 2M | $1.25/M |',
+    ].join('\n'),
+    name: '3 列表格(窄表临界)',
+  },
+  {
+    input: [
+      '| 姓名 | 年龄 | 职位 | 城市 |',
+      '|------|------|------|------|',
+      '| Alice | 30 | 工程师 | 上海 |',
+      '| Bob | 25 | 设计师 | 北京 |',
+    ].join('\n'),
+    name: '4 列表格(宽表,多行记录块)',
+  },
+  {
+    input: [
+      '# 📊 今日销售简报',
+      '',
+      '这是 **Q4 第 3 周** 的关键数据,请重点关注 *华东区*。',
+      '',
+      '## 核心指标',
+      '',
+      '| 指标 | 值 | 环比 |',
+      '|------|----|----|',
+      '| GMV | ¥1.2M | +12% |',
+      '| 订单数 | 3,421 | +8% |',
+      '| 客单价 | ¥350 | +3% |',
+      '',
+      '## 待办事项',
+      '',
+      '1. 跟进 [华东区报告](https://example.com/report-q4w3)',
+      '2. 回复 `sales@company.com` 的邮件',
+      '3. 准备 ***周会 PPT***',
+      '',
+      '## 示例代码',
+      '',
+      '```ts',
+      'const revenue = orders.reduce((s, o) => s + o.amount, 0);',
+      '// 注意:这里不应被转换 # 或 **',
+      '```',
+      '',
+      '> 提醒:周五下班前提交',
+      '',
+      '---',
+      '',
+      '附:~~废弃方案~~已移除。',
+    ].join('\n'),
+    name: '5 真实 LLM 输出(全要素)',
+  },
+  {
+    input: [
+      '支持的功能:',
+      '- 文本对话 with `markdown` 渲染',
+      '- 图片输入: ![示例](https://img.example.com/x.png)',
+      '- 工具调用(见 [文档](https://docs.example.com))',
+      '  - 嵌套项 A',
+      '  - 嵌套项 B',
+      '- **重要**:暂不支持语音',
+    ].join('\n'),
+    name: '6 列表 + 行内格式 + 图片 + 嵌套',
+  },
+  {
+    input: [
+      '变量命名遵循 `snake_case`,例如 `user_id`、`created_at`。',
+      '不要与 _italic_ 混淆 —— 后者前后需有空格。',
+    ].join('\n'),
+    name: '7 下划线边界(snake_case vs italic)',
+  },
+];
+
+const divider = (char: string, len = 72) => char.repeat(len); // rule line: `char` repeated `len` times (72 by default)
+
+for (const { name, input } of samples) { // print every fixture: banner, raw input, stripped output
+  console.log('\n' + divider('='));
+  console.log('CASE: ' + name);
+  console.log(divider('='));
+  console.log('--- INPUT ---');
+  console.log(input);
+  console.log('--- OUTPUT ---');
+  console.log(stripMarkdown(input));
+}
+console.log('\n' + divider('=')); // closing rule after the last case
@@ -33,7 +33,24 @@ describe('stripMarkdown', () => {

  it('should remove fenced code block markers but keep content', () => {
    const input = '```typescript\nconst x = 1;\n```';
-    expect(stripMarkdown(input)).toBe('const x = 1;\n');
+    expect(stripMarkdown(input)).toBe('const x = 1;');
+  });
+
+  it('should not inject extra blank lines after a fenced code block', () => {
+    // Before the fix, the captured code content carried a trailing \n that
+    // stacked with the \n\n after the closing fence → two blank lines.
+    const input = ['```js', 'a', 'b', '```', '', 'next'].join('\n');
+    expect(stripMarkdown(input)).toBe('a\nb\n\nnext');
+  });
+
+  it('should not mangle markdown-like syntax inside code blocks', () => {
+    // Content inside a fenced code block must survive verbatim — no heading /
+    // table / emphasis rewriting should be applied to it.
+    const input = ['```md', '# Not a heading', '**not bold**', '| a | b |', '```'].join('\n');
+    const result = stripMarkdown(input);
+    expect(result).toContain('# Not a heading');
+    expect(result).toContain('**not bold**');
+    expect(result).toContain('| a | b |');
  });

  it('should convert links to text (url) format', () => {
@@ -50,11 +67,175 @@ describe('stripMarkdown', () => {
    expect(stripMarkdown('> quoted text')).toBe('| quoted text');
  });

-  it('should handle tables by converting to bullet list', () => {
-    const input = '| Name | Age |\n|------|-----|\n| Alice | 30 |\n| Bob | 25 |';
-    const result = stripMarkdown(input);
-    expect(result).toContain('- Name: Alice, Age: 30');
-    expect(result).toContain('- Name: Bob, Age: 25');
+  describe('tables', () => {
+    it('should convert narrow tables (2–3 cols) to single-line records', () => {
+      const input = '| Name | Age |\n|------|-----|\n| Alice | 30 |\n| Bob | 25 |';
+      const result = stripMarkdown(input);
+      expect(result).toContain('- Name: Alice, Age: 30');
+      expect(result).toContain('- Name: Bob, Age: 25');
+    });
+
+    it('should render single-column tables as a plain bullet list', () => {
+      const input = '| Item |\n|------|\n| Apple |\n| Banana |';
+      const result = stripMarkdown(input);
+      expect(result).toBe('- Apple\n- Banana');
+    });
+
+    it('should render wide tables (4+ cols) as multi-line record blocks', () => {
+      const input = [
+        '| 姓名 | 年龄 | 职位 | 城市 |',
+        '|------|------|------|------|',
+        '| Alice | 30 | 工程师 | 上海 |',
+        '| Bob | 25 | 设计师 | 北京 |',
+      ].join('\n');
+      const result = stripMarkdown(input);
+      expect(result).toContain('【1】');
+      expect(result).toContain('姓名: Alice');
+      expect(result).toContain('年龄: 30');
+      expect(result).toContain('职位: 工程师');
+      expect(result).toContain('城市: 上海');
+      expect(result).toContain('【2】');
+      expect(result).toContain('姓名: Bob');
+    });
+
+    it('should handle escaped pipes inside cells', () => {
+      const input = '| Key | Value |\n|-----|-------|\n| pipe | a \\| b |';
+      const result = stripMarkdown(input);
+      expect(result).toContain('- Key: pipe, Value: a | b');
+    });
+
+    it('should skip empty cells without emitting stray "header: " fragments', () => {
+      const input = '| Name | Age |\n|------|-----|\n| Alice |  |\n|  | 25 |';
+      const result = stripMarkdown(input);
+      expect(result).toContain('- Name: Alice');
+      expect(result).toContain('- Age: 25');
+      expect(result).not.toContain('Age: \n');
+      expect(result).not.toContain('Name: ,');
+    });
+
+    it('should not treat a pipe-only line as a table', () => {
+      // Prose lines containing `|` but no `|---|` separator row must pass through untouched.
+      const input = 'just a line with | pipe characters |\nand another | one |';
+      const result = stripMarkdown(input);
+      expect(result).toBe(input);
+    });
+
+    it('should preserve the blank line between a table and the following block', () => {
+      // Before the fix, the body-row regex ate the trailing \n after the last
+      // row, collapsing the blank separator before the next heading.
+      const input = ['| a | b |', '|---|---|', '| 1 | 2 |', '', '## Next'].join('\n');
+      const result = stripMarkdown(input);
+      expect(result).toBe('- a: 1, b: 2\n\nNext');
+    });
+  });
+
+  describe('lists', () => {
+    it('should preserve unordered list markers', () => {
+      const input = '- Apple\n- Banana\n- Cherry';
+      expect(stripMarkdown(input)).toBe('- Apple\n- Banana\n- Cherry');
+    });
+
+    it('should preserve ordered list markers', () => {
+      const input = '1. First\n2. Second\n3. Third';
+      expect(stripMarkdown(input)).toBe('1. First\n2. Second\n3. Third');
+    });
+
+    it('should preserve nested list indentation', () => {
+      const input = '- Parent\n  - Child A\n  - Child B';
+      expect(stripMarkdown(input)).toBe('- Parent\n  - Child A\n  - Child B');
+    });
+
+    it('should strip inline formatting inside list items', () => {
+      const input =
+        '- Run `npm install`\n- Visit [docs](https://example.com)\n- **Important** note';
+      const result = stripMarkdown(input);
+      expect(result).toContain('- Run npm install');
+      expect(result).toContain('- Visit docs (https://example.com)');
+      expect(result).toContain('- Important note');
+    });
+  });
+
+  describe('horizontal rules', () => {
+    it('should normalize dash HR', () => {
+      expect(stripMarkdown('---')).toBe('---');
+    });
+
+    it('should normalize asterisk HR', () => {
+      expect(stripMarkdown('***')).toBe('---');
+    });
+
+    it('should normalize underscore HR', () => {
+      expect(stripMarkdown('___')).toBe('---');
+    });
+
+    it('should not swallow the blank line after a horizontal rule', () => {
+      // A trailing `\s*` in the HR regex would greedily eat the newline after
+      // `---`, collapsing the intended blank-line separator.
+      expect(stripMarkdown('before\n\n---\n\nafter')).toBe('before\n\n---\n\nafter');
+    });
+  });
+
+  describe('mixed inline formatting', () => {
+    it('should handle bold wrapping italic', () => {
+      expect(stripMarkdown('**bold _italic_**')).toBe('bold italic');
+    });
+
+    it('should handle multiple emphases on one line', () => {
+      expect(stripMarkdown('A **bold** and *italic* and `code` here')).toBe(
+        'A bold and italic and code here',
+      );
+    });
+
+    it('should not treat underscores inside identifiers as italic', () => {
+      // `some_snake_case` has underscores flanked by word chars, not whitespace —
+      // the italic rule must leave these alone or we'd mangle code/variable names.
+      expect(stripMarkdown('some_snake_case variable')).toBe('some_snake_case variable');
+    });
+  });
+
+  describe('structure preservation', () => {
+    it('should preserve blank lines between paragraphs', () => {
+      const input = 'First paragraph.\n\nSecond paragraph.';
+      expect(stripMarkdown(input)).toBe(input);
+    });
+
+    it('should pass through emoji and Chinese unchanged', () => {
+      expect(stripMarkdown('你好 世界 🎉 **重点**')).toBe('你好 世界 🎉 重点');
+    });
+
+    it('should handle a realistic LLM response with table + code + list', () => {
+      const input = [
+        '# 功能对比 🎯',
+        '',
+        '这里是 **三款工具** 的对比:',
+        '',
+        '| 工具 | 价格 |',
+        '|------|------|',
+        '| A 工具 | 免费 |',
+        '| B 工具 | $10 |',
+        '',
+        '## 推荐',
+        '',
+        '1. 先试试 *免费* 的 A',
+        '2. 详见 [文档](https://example.com)',
+        '',
+        '```bash',
+        'npm install a-tool',
+        '```',
+      ].join('\n');
+
+      const result = stripMarkdown(input);
+      expect(result).not.toContain('**');
+      expect(result).not.toContain('```');
+      expect(result).not.toMatch(/^#\s/m);
+      expect(result).toContain('功能对比 🎯');
+      expect(result).toContain('三款工具');
+      expect(result).toContain('- 工具: A 工具, 价格: 免费');
+      expect(result).toContain('- 工具: B 工具, 价格: $10');
+      expect(result).toContain('1. 先试试 免费 的 A');
+      expect(result).toContain('2. 详见 文档 (https://example.com)');
+      expect(result).toContain('npm install a-tool');
+    });
  });

  it('should handle a complex mixed markdown document', () => {
@@ -6,39 +6,128 @@
 * - Remove syntactic noise (**, `, #, []() etc.)
 * - Keep code block content intact (just remove the fences)
 * - Convert links to "text (url)" format so URLs are still accessible
- * - Convert tables to aligned plain-text representation
+ * - Convert tables to a readable plain-text layout tuned for mobile chat
+ *   (notably WeChat, which has no Markdown/HTML rendering and uses a
+ *   proportional font that makes column-aligned ASCII tables unreliable).
 */
+
+const CODE_BLOCK_PLACEHOLDER = '\u0000CODEBLOCK_'; // NUL-prefixed token stem; stripMarkdown appends the block index and a closing NUL
+
+/**
+ * Split a Markdown table row on unescaped `|`.
+ * Handles cells that legitimately contain `\|` (escaped pipe).
+ *
+ * The row is trimmed and one leading / one trailing `|` is dropped before
+ * splitting, so `| a | b |` yields two cells rather than four. A `\|` pair is
+ * emitted as a literal `|` inside the current cell; any other backslash is
+ * kept as-is.
+ *
+ * NOTE(review): the trailing-`|` strip does not check for a preceding
+ * backslash, so a row that ends in `\|` loses the pipe and keeps the `\`.
+ *
+ * @param row - One table line, with or without its outer pipes.
+ * @returns The cell texts in order, each trimmed; always at least one entry.
+ */
+function splitTableRow(row: string): string[] {
+  let s = row.trim();
+  if (s.startsWith('|')) s = s.slice(1); // drop the opening delimiter
+  if (s.endsWith('|')) s = s.slice(0, -1); // drop the closing delimiter
+
+  const cells: string[] = [];
+  let buf = ''; // text of the cell currently being read
+  for (let i = 0; i < s.length; i++) {
+    const ch = s[i];
+    if (ch === '\\' && s[i + 1] === '|') { // escaped pipe: keep `|`, swallow the backslash
+      buf += '|';
+      i++; // skip the `|` that was just consumed
+      continue;
+    }
+    if (ch === '|') { // unescaped pipe closes the current cell
+      cells.push(buf);
+      buf = '';
+      continue;
+    }
+    buf += ch;
+  }
+  cells.push(buf); // flush the last cell
+  return cells.map((c) => c.trim());
+}
+
+/**
+ * Render a parsed table as plain text, picking a layout based on column count.
+ *
+ * - 1 column → plain bullet list of values
+ * - 2–3 columns → single-line "- header: value, header: value" records
+ * - 4+ columns → multi-line record blocks prefixed with 【N】, one field per line
+ *
+ * WeChat wraps long single-line messages awkwardly on mobile; splitting wide
+ * tables into field-per-line blocks keeps each row scannable.
+ *
+ * Column count is taken from `headers`, not from the rows (zero headers falls
+ * into the 1-column layout). Empty cells are omitted; a cell whose header is
+ * empty (or any cell when every header is empty) is emitted as a bare value.
+ * In the 1-column and 2–3-column layouts a row with nothing left to print is
+ * dropped; in the 4+ layout it still emits its 【N】 marker. The 1-column
+ * layout uses only the first cell of each row and never prints the header.
+ *
+ * @param headers - Header cells; their count selects the layout.
+ * @param rows - Body rows, each a list of cell strings.
+ * @returns The rendered records joined with `\n` (blank-line separated in the
+ *   4+ layout), or `''` when `rows` is empty.
+ */
+function formatTableAsText(headers: string[], rows: string[][]): string {
+  if (rows.length === 0) return '';
+
+  const colCount = headers.length;
+  const hasMeaningfulHeaders = headers.some((h) => h.length > 0);
+
+  const joinPairs = (cells: string[]): string[] =>
+    cells
+      .map((cell, i) => {
+        const header = headers[i] ?? ''; // rows may be longer than the header row
+        const value = cell ?? '';
+        if (!hasMeaningfulHeaders || !header) return value; // no label available: bare value
+        if (!value) return ''; // avoid a dangling "header: " fragment
+        return `${header}: ${value}`;
+      })
+      .filter((s) => s.length > 0);
+
+  if (colCount <= 1) {
+    return rows
+      .map((cells) => `- ${(cells[0] ?? '').trim()}`)
+      .filter((line) => line !== '- ') // drop rows whose first cell is empty
+      .join('\n');
+  }
+
+  if (colCount >= 4) {
+    return rows
+      .map((cells, idx) => {
+        const fields = joinPairs(cells);
+        return [`【${idx + 1}】`, ...fields].join('\n'); // 1-based record marker, then one field per line
+      })
+      .join('\n\n');
+  }
+
+  return rows
+    .map((cells) => {
+      const pairs = joinPairs(cells);
+      return pairs.length > 0 ? `- ${pairs.join(', ')}` : '';
+    })
+    .filter((line) => line.length > 0) // drop rows that had no printable cells
+    .join('\n');
+}
+
 export function stripMarkdown(md: string): string {
  let text = md;

-  // --- Block-level transforms (order matters) ---
+  // --- Step 1: protect fenced code blocks ---
+  // Keep their content intact by swapping in placeholders before any other
+  // transform runs. Pipes or `#` inside code would otherwise be mangled by
+  // the table/heading rules below.
+  // We strip the captured content's trailing `\n` (present for any non-empty
+  // block, because the closing fence sits on its own line); leaving it in would
+  // stack with the newline(s) after the closing fence and produce extra blank lines.
+  const codeBlocks: string[] = [];
+  text = text.replaceAll(/^```[\w-]*\n([\s\S]*?)^```/gm, (_match, content: string) => {
+    const idx = codeBlocks.push(content.replace(/\n$/, '')) - 1;
+    return `${CODE_BLOCK_PLACEHOLDER}${idx}\u0000`;
+  });

-  // Fenced code blocks: remove fences, keep content
-  text = text.replaceAll(/^```[\w-]*\n([\s\S]*?)^```/gm, '$1');
+  // --- Step 2: block-level transforms ---

-  // Tables: convert to bullet-style rows
+  // Tables → readable plain text. The separator row must contain at least one
+  // `-` per our regex, which prevents accidental matches on stray `|`-only lines.
+  // The body group `(?:\|.+\|\n?)*` greedily consumes the trailing `\n` after
+  // the last row when present; we re-emit it so a blank line that originally
+  // separated the table from the following content survives the rewrite.
  text = text.replaceAll(
-    /^(\|.+\|)\n\|[-\s|:]+\|\n((?:\|.+\|\n?)*)/gm,
-    (_match, headerRow: string, bodyRows: string) => {
-      const parseRow = (row: string) =>
-        row
-          .split('|')
-          .slice(1, -1)
-          .map((c: string) => c.trim());
-
-      const headers = parseRow(headerRow);
+    /^(\|.+\|)\n\|[\s:|]*-[-\s:|]*\|\n((?:\|.+\|\n?)*)/gm,
+    (match, headerRow: string, bodyRows: string) => {
+      const headers = splitTableRow(headerRow);
      const rows = bodyRows
        .trim()
        .split('\n')
        .filter(Boolean)
-        .map((r: string) => parseRow(r));
-
-      return rows
-        .map((cells: string[]) =>
-          cells.map((cell: string, i: number) => `${headers[i]}: ${cell}`).join(', '),
-        )
-        .map((line: string) => `- ${line}`)
-        .join('\n');
+        .map((r: string) => splitTableRow(r));
+      const trailingNewline = match.endsWith('\n') ? '\n' : '';
+      return formatTableAsText(headers, rows) + trailingNewline;
    },
  );

@@ -48,10 +137,12 @@ export function stripMarkdown(md: string): string {
  // Blockquotes: replace > with vertical bar
  text = text.replaceAll(/^>\s?/gm, '| ');

-  // Horizontal rules
-  text = text.replaceAll(/^[-*_]{3,}\s*$/gm, '---');
+  // Horizontal rules. Use `[ \t]*$` rather than `\s*$` because `\s` matches
+  // `\n`, and in /m mode a trailing `\s*$` would greedily consume the line's
+  // terminating newline, swallowing a blank-line separator from the next block.
+  text = text.replaceAll(/^[-*_]{3,}[ \t]*$/gm, '---');

-  // --- Inline transforms ---
+  // --- Step 3: inline transforms ---

  // Images: ![alt](url) → alt
  text = text.replaceAll(/!\[([^\]]*)\]\([^)]+\)/g, '$1');
@@ -77,5 +168,11 @@ export function stripMarkdown(md: string): string {
  // Inline code: `text`
  text = text.replaceAll(/`([^`]+)`/g, '$1');

+  // --- Step 4: restore protected code blocks ---
+  text = text.replaceAll(
+    new RegExp(`${CODE_BLOCK_PLACEHOLDER}(\\d+)\\0`, 'g'),
+    (_match, idx: string) => codeBlocks[Number(idx)] ?? '',
+  );
+
  return text;
 }