refactor: use marked.parse() instead of regex for markdown stripping

KostasKoukianakis · KostasKoukianakis · commit e726b5d65297 · 2026-01-16T06:02:02.000+02:00
- Replace custom regex-based markdown stripping with marked.parse()
- Let marked.parse() handle the markdown edge cases it already knows about
- Extract alt text from images before removing HTML tags
- Convert <br> and </p><p> to spaces so text on adjacent lines stays separated
- Reduce maintenance burden by leveraging marked's robust parsing

Addresses feedback from PR #4572
diff --git a/src/utils/stripMarkdownAndHtml.ts b/src/utils/stripMarkdownAndHtml.ts
@@ -1,6 +1,11 @@
+import { marked } from 'marked';
+
 /**
  * Strips HTML tags and Markdown formatting from text, returning plain text.
  * This is useful for cleaning summaries and excerpts that may contain formatting.
+ * 
+ * Uses marked.parse() to convert markdown to HTML first, which handles all markdown
+ * edge cases, then strips the HTML tags and decodes entities.
  *
  * @param text - The text to clean (may contain HTML tags and Markdown formatting)
  * @returns Plain text with all HTML tags and Markdown formatting removed
@@ -12,52 +17,30 @@ export const stripMarkdownAndHtml = (text: string | null | undefined): string =>
 
   let cleaned = text;
 
-  // First, remove HTML tags (including self-closing tags)
-  // This regex matches <tag>content</tag> and <tag /> patterns
-  cleaned = cleaned.replace(/<[^>]*>/g, '');
-
-  // Remove Markdown formatting:
-  // - Bold + Italic: ***text*** or ___text___ (must come before bold/italic to avoid conflicts)
-  cleaned = cleaned.replace(/\*\*\*([^*]+?)\*\*\*/g, '$1');
-  cleaned = cleaned.replace(/___([^_]+?)___/g, '$1');
-
-  // - Bold: **text** or __text__ (must come before italic to avoid conflicts)
-  cleaned = cleaned.replace(/\*\*([^*]+?)\*\*/g, '$1');
-  cleaned = cleaned.replace(/__([^_]+?)__/g, '$1');
-
-  // - Italic: *text* or _text_ (after bold to avoid matching ** as two italics)
-  // Since bold markers (** and __) are already removed, match single markers
-  // Use a simple pattern that avoids lookbehinds for better browser compatibility
-  cleaned = cleaned.replace(/\*([^*\n]+?)\*/g, '$1');
-  cleaned = cleaned.replace(/_([^_\n]+?)_/g, '$1');
-
-  // - Strikethrough: ~~text~~
-  cleaned = cleaned.replace(/~~([^~]+)~~/g, '$1');
-
-  // - Headers: # Header, ## Header, etc.
-  cleaned = cleaned.replace(/^#{1,6}\s+/gm, '');
-
-  // - Images: ![alt](url) -> alt (must come before links to avoid conflicts)
-  cleaned = cleaned.replace(/!\[([^\]]*)\]\([^)]+\)/g, '$1');
-
-  // - Links: [text](url) -> text
-  cleaned = cleaned.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');
-
-  // - Code blocks: `code` -> code
-  cleaned = cleaned.replace(/`([^`]+)`/g, '$1');
-
-  // - Code blocks: ```code``` -> code
-  cleaned = cleaned.replace(/```[\s\S]*?```/g, '');
+  // First, convert markdown to HTML using marked.parse()
+  // This handles all markdown edge cases that marked already knows about
+  try {
+    cleaned = marked.parse(cleaned, {
+      breaks: true,
+      gfm: true,
+    }) as string;
+  } catch (error) {
+    // If parsing fails, fall back to original text
+    // This shouldn't happen in normal cases, but provides a safety net
+    console.warn('Failed to parse markdown:', error);
+  }
 
-  // - Lists: - item, * item, + item, 1. item -> item
-  cleaned = cleaned.replace(/^[\s]*[-*+]\s+/gm, '');
-  cleaned = cleaned.replace(/^\d+\.\s+/gm, '');
+  // Extract alt text from images before removing HTML tags
+  // marked.parse() converts ![alt](url) to <img src="url" alt="alt">
+  cleaned = cleaned.replace(/<img[^>]*alt=["']([^"']*)["'][^>]*>/gi, '$1');
 
-  // - Blockquotes: > text -> text
-  cleaned = cleaned.replace(/^>\s+/gm, '');
+  // Replace <br> and </p><p> with spaces to preserve line breaks as spaces
+  cleaned = cleaned.replace(/<br\s*\/?>/gi, ' ');
+  cleaned = cleaned.replace(/<\/p>\s*<p>/gi, ' ');
 
-  // - Horizontal rules: ---, ***, ___
-  cleaned = cleaned.replace(/^[-*_]{3,}$/gm, '');
+  // Remove HTML tags (including self-closing tags)
+  // The regex matches each individual tag (<tag>, </tag>, <tag />); the text between tags is kept
+  cleaned = cleaned.replace(/<[^>]*>/g, '');
 
   // Clean up extra whitespace
   cleaned = cleaned.replace(/\s+/g, ' ').trim();