From 4ef52ec6950fb331900ba95f2bbf02fd7290fda1 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 28 Dec 2025 15:16:59 +0800 Subject: [PATCH] Change DOCX extraction to use HTML tags for whitespace - Replace tabs with HTML em spaces - Convert all newlines to break tags --- lightrag/api/routers/document_routes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index d906aa5cf9..a6360aaa30 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1015,10 +1015,10 @@ def escape_cell(cell_value: str | None) -> str: # CRITICAL: Escape backslash first to avoid double-escaping return ( text.replace("\\", "\\\\") # Must be first: \ -> \\ - .replace("\t", "\\t") # Tab -> \t (visible) - .replace("\r\n", "\\n") # Windows newline -> \n - .replace("\r", "\\n") # Mac newline -> \n - .replace("\n", "\\n") # Unix newline -> \n + .replace("\t", "  ") # Tab -> \t (visible) + .replace("\r\n", "<br>") # Windows newline -> \n + .replace("\r", "<br>") # Mac newline -> \n + .replace("\n", "<br>") # Unix newline -> \n ) content_parts = []