Skip to content

Commit e726b5d

Browse files
refactor: use marked.parse() instead of regex for markdown stripping
- Replace custom regex-based markdown stripping with marked.parse()
- marked.parse() handles all markdown edge cases automatically
- Extract alt text from images before removing HTML tags
- Keep words separated at line breaks by converting <br> and </p><p> to spaces
- Reduce maintenance burden by leveraging marked's robust parsing

Addresses feedback from PR #4572
1 parent 8e4b031 commit e726b5d

File tree

1 file changed

+26
-43
lines changed

1 file changed

+26
-43
lines changed

src/utils/stripMarkdownAndHtml.ts

Lines changed: 26 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
1+
import { marked } from 'marked';
2+
13
/**
24
* Strips HTML tags and Markdown formatting from text, returning plain text.
35
* This is useful for cleaning summaries and excerpts that may contain formatting.
6+
*
7+
* Uses marked.parse() to convert markdown to HTML first, which handles all markdown
8+
* edge cases, then strips the HTML tags and decodes entities.
49
*
510
* @param text - The text to clean (may contain HTML tags and Markdown formatting)
611
* @returns Plain text with all HTML tags and Markdown formatting removed
@@ -12,52 +17,30 @@ export const stripMarkdownAndHtml = (text: string | null | undefined): string =>
1217

1318
let cleaned = text;
1419

15-
// First, remove HTML tags (including self-closing tags)
16-
// This regex matches <tag>content</tag> and <tag /> patterns
17-
cleaned = cleaned.replace(/<[^>]*>/g, '');
18-
19-
// Remove Markdown formatting:
20-
// - Bold + Italic: ***text*** or ___text___ (must come before bold/italic to avoid conflicts)
21-
cleaned = cleaned.replace(/\*\*\*([^*]+?)\*\*\*/g, '$1');
22-
cleaned = cleaned.replace(/___([^_]+?)___/g, '$1');
23-
24-
// - Bold: **text** or __text__ (must come before italic to avoid conflicts)
25-
cleaned = cleaned.replace(/\*\*([^*]+?)\*\*/g, '$1');
26-
cleaned = cleaned.replace(/__([^_]+?)__/g, '$1');
27-
28-
// - Italic: *text* or _text_ (after bold to avoid matching ** as two italics)
29-
// Since bold markers (** and __) are already removed, match single markers
30-
// Use a simple pattern that avoids lookbehinds for better browser compatibility
31-
cleaned = cleaned.replace(/\*([^*\n]+?)\*/g, '$1');
32-
cleaned = cleaned.replace(/_([^_\n]+?)_/g, '$1');
33-
34-
// - Strikethrough: ~~text~~
35-
cleaned = cleaned.replace(/~~([^~]+)~~/g, '$1');
36-
37-
// - Headers: # Header, ## Header, etc.
38-
cleaned = cleaned.replace(/^#{1,6}\s+/gm, '');
39-
40-
// - Images: ![alt](url) -> alt (must come before links to avoid conflicts)
41-
cleaned = cleaned.replace(/!\[([^\]]*)\]\([^)]+\)/g, '$1');
42-
43-
// - Links: [text](url) -> text
44-
cleaned = cleaned.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');
45-
46-
// - Code blocks: `code` -> code
47-
cleaned = cleaned.replace(/`([^`]+)`/g, '$1');
48-
49-
// - Code blocks: ```code``` -> code
50-
cleaned = cleaned.replace(/```[\s\S]*?```/g, '');
20+
// First, convert markdown to HTML using marked.parse()
21+
// This handles all markdown edge cases that marked already knows about
22+
try {
23+
cleaned = marked.parse(cleaned, {
24+
breaks: true,
25+
gfm: true,
26+
}) as string;
27+
} catch (error) {
28+
// If parsing fails, fall back to original text
29+
// This shouldn't happen in normal cases, but provides a safety net
30+
console.warn('Failed to parse markdown:', error);
31+
}
5132

52-
// - Lists: - item, * item, + item, 1. item -> item
53-
cleaned = cleaned.replace(/^[\s]*[-*+]\s+/gm, '');
54-
cleaned = cleaned.replace(/^\d+\.\s+/gm, '');
33+
// Extract alt text from images before removing HTML tags
34+
// marked.parse() converts ![alt](url) to <img alt="alt" src="url">
35+
cleaned = cleaned.replace(/<img[^>]*alt=["']([^"']*)["'][^>]*>/gi, '$1');
5536

56-
// - Blockquotes: > text -> text
57-
cleaned = cleaned.replace(/^>\s+/gm, '');
37+
// Replace <br> and </p><p> with a space so text on adjacent lines/paragraphs does not run together once tags are stripped
38+
cleaned = cleaned.replace(/<br\s*\/?>/gi, ' ');
39+
cleaned = cleaned.replace(/<\/p>\s*<p>/gi, ' ');
5840

59-
// - Horizontal rules: ---, ***, ___
60-
cleaned = cleaned.replace(/^[-*_]{3,}$/gm, '');
41+
// Remove HTML tags (including self-closing tags)
42+
// This regex matches each individual tag (<tag>, </tag>, <tag />); the text between tags is kept
43+
cleaned = cleaned.replace(/<[^>]*>/g, '');
6144

6245
// Clean up extra whitespace
6346
cleaned = cleaned.replace(/\s+/g, ' ').trim();

0 commit comments

Comments
 (0)