1+ import { marked } from 'marked' ;
2+
13/**
24 * Strips HTML tags and Markdown formatting from text, returning plain text.
35 * This is useful for cleaning summaries and excerpts that may contain formatting.
6+ *
7+ * Uses marked.parse() to convert markdown to HTML first, which handles all markdown
8+ * edge cases, then strips the HTML tags and decodes entities.
49 *
510 * @param text - The text to clean (may contain HTML tags and Markdown formatting)
611 * @returns Plain text with all HTML tags and Markdown formatting removed
@@ -12,52 +17,30 @@ export const stripMarkdownAndHtml = (text: string | null | undefined): string =>
1217
1318 let cleaned = text ;
1419
15- // First, remove HTML tags (including self-closing tags)
16- // This regex matches <tag>content</tag> and <tag /> patterns
17- cleaned = cleaned . replace ( / < [ ^ > ] * > / g, '' ) ;
18-
19- // Remove Markdown formatting:
20- // - Bold + Italic: ***text*** or ___text___ (must come before bold/italic to avoid conflicts)
21- cleaned = cleaned . replace ( / \* \* \* ( [ ^ * ] + ?) \* \* \* / g, '$1' ) ;
22- cleaned = cleaned . replace ( / _ _ _ ( [ ^ _ ] + ?) _ _ _ / g, '$1' ) ;
23-
24- // - Bold: **text** or __text__ (must come before italic to avoid conflicts)
25- cleaned = cleaned . replace ( / \* \* ( [ ^ * ] + ?) \* \* / g, '$1' ) ;
26- cleaned = cleaned . replace ( / _ _ ( [ ^ _ ] + ?) _ _ / g, '$1' ) ;
27-
28- // - Italic: *text* or _text_ (after bold to avoid matching ** as two italics)
29- // Since bold markers (** and __) are already removed, match single markers
30- // Use a simple pattern that avoids lookbehinds for better browser compatibility
31- cleaned = cleaned . replace ( / \* ( [ ^ * \n ] + ?) \* / g, '$1' ) ;
32- cleaned = cleaned . replace ( / _ ( [ ^ _ \n ] + ?) _ / g, '$1' ) ;
33-
34- // - Strikethrough: ~~text~~
35- cleaned = cleaned . replace ( / ~ ~ ( [ ^ ~ ] + ) ~ ~ / g, '$1' ) ;
36-
37- // - Headers: # Header, ## Header, etc.
38- cleaned = cleaned . replace ( / ^ # { 1 , 6 } \s + / gm, '' ) ;
39-
40- // - Images: ![alt](url) -> alt (must come before links to avoid conflicts)
41- cleaned = cleaned . replace ( / ! \[ ( [ ^ \] ] * ) \] \( [ ^ ) ] + \) / g, '$1' ) ;
42-
43- // - Links: [text](url) -> text
44- cleaned = cleaned . replace ( / \[ ( [ ^ \] ] + ) \] \( [ ^ ) ] + \) / g, '$1' ) ;
45-
46- // - Code blocks: `code` -> code
47- cleaned = cleaned . replace ( / ` ( [ ^ ` ] + ) ` / g, '$1' ) ;
48-
49- // - Code blocks: ```code``` -> code
50- cleaned = cleaned . replace ( / ` ` ` [ \s \S ] * ?` ` ` / g, '' ) ;
20+ // First, convert markdown to HTML using marked.parse()
21+ // This handles all markdown edge cases that marked already knows about
22+ try {
23+ cleaned = marked . parse ( cleaned , {
24+ breaks : true ,
25+ gfm : true ,
26+ } ) as string ;
27+ } catch ( error ) {
28+ // If parsing fails, fall back to original text
29+ // This shouldn't happen in normal cases, but provides a safety net
30+ console . warn ( 'Failed to parse markdown:' , error ) ;
31+ }
5132
52- // - Lists: - item, * item, + item, 1. item -> item
53- cleaned = cleaned . replace ( / ^ [ \s ] * [ - * + ] \s + / gm , '' ) ;
54- cleaned = cleaned . replace ( / ^ \d + \. \s + / gm , '' ) ;
33+ // Extract alt text from images before removing HTML tags
34+ // marked.parse() converts ![alt](url) to <img alt="alt" src="url">
35+ cleaned = cleaned . replace ( / < i m g [ ^ > ] * a l t = [ " ' ] ( [ ^ " ' ] * ) [ " ' ] [ ^ > ] * > / gi , '$1 ' ) ;
5536
56- // - Blockquotes: > text -> text
57- cleaned = cleaned . replace ( / ^ > \s + / gm, '' ) ;
37+ // Replace <br> and </p><p> with spaces to preserve line breaks as spaces
38+ cleaned = cleaned . replace ( / < b r \s * \/ ? > / gi, ' ' ) ;
39+ cleaned = cleaned . replace ( / < \/ p > \s * < p > / gi, ' ' ) ;
5840
59- // - Horizontal rules: ---, ***, ___
60- cleaned = cleaned . replace ( / ^ [ - * _ ] { 3 , } $ / gm, '' ) ;
41+ // Remove HTML tags (including self-closing tags)
42+ // This regex matches each individual tag (<tag>, </tag>, <tag />); the text between tags is kept
43+ cleaned = cleaned . replace ( / < [ ^ > ] * > / g, '' ) ;
6144
6245 // Clean up extra whitespace
6346 cleaned = cleaned . replace ( / \s + / g, ' ' ) . trim ( ) ;
0 commit comments