11"""Load Data from a Confluence Space"""
22import logging
3+ from enum import Enum
34from io import BytesIO
45from typing import Any , Callable , Dict , List , Optional , Union
56
1617logger = logging .getLogger (__name__ )
1718
1819
class ContentFormat(str, Enum):
    """Representation of a page body to request from the Confluence API.

    Each member's value is the string handed to the API as the ``expand``
    argument when pages are fetched, so the requested body representation is
    present in the returned page dict.

    STORAGE selects the raw storage (XML) representation; VIEW selects the
    rendered HTML representation.
    """

    STORAGE = "body.storage"
    VIEW = "body.view"

    def get_content(self, page: dict) -> str:
        """Return the body text of ``page`` for this content format.

        :param page: Page dict that was fetched with ``expand`` set to this
            member's value, i.e. it holds ``page["body"][<format>]["value"]``.
        :type page: dict
        :raises KeyError: if ``page`` does not contain the body
            representation for this format.
        :raises ValueError: if the member is not one of the known formats.
        :return: The page body in the selected representation.
        :rtype: str
        """
        if self == ContentFormat.STORAGE:
            return page["body"]["storage"]["value"]
        if self == ContentFormat.VIEW:
            return page["body"]["view"]["value"]

        # Only reachable if a new member is added without a matching branch.
        raise ValueError("unknown content format")
31+
32+
1933class ConfluenceLoader (BaseLoader ):
2034 """
2135 Load Confluence pages. Port of https://llamahub.ai/l/confluence
@@ -31,6 +45,12 @@ class ConfluenceLoader(BaseLoader):
3145 Document object. Currently supported attachment types are: PDF, PNG, JPEG/JPG,
3246 SVG, Word and Excel.
3347
48+ The Confluence API supports different formats of page content. The storage format is
49+ the raw XML representation for storage. The view format is the HTML representation
50+ for viewing, with macros rendered as though the page is viewed by users. You can pass
51+ an enum `content_format` argument to `load()` to specify the content format; this is
52+ set to `ContentFormat.STORAGE` by default.
53+
3454 Hint: space_key and page_id can both be found in the URL of a page in Confluence
3555 - https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>
3656
@@ -178,6 +198,7 @@ def load(
178198 include_archived_content : bool = False ,
179199 include_attachments : bool = False ,
180200 include_comments : bool = False ,
201+ content_format : ContentFormat = ContentFormat .STORAGE ,
181202 limit : Optional [int ] = 50 ,
182203 max_pages : Optional [int ] = 1000 ,
183204 ocr_languages : Optional [str ] = None ,
@@ -200,6 +221,8 @@ def load(
200221 :type include_attachments: bool, optional
201222 :param include_comments: defaults to False
202223 :type include_comments: bool, optional
224+ :param content_format: Specify content format, defaults to ContentFormat.STORAGE
225+ :type content_format: ContentFormat
203226 :param limit: Maximum number of pages to retrieve per request, defaults to 50
204227 :type limit: int, optional
205228 :param max_pages: Maximum number of pages to retrieve in total, defaults to 1000
@@ -228,13 +251,14 @@ def load(
228251 limit = limit ,
229252 max_pages = max_pages ,
230253 status = "any" if include_archived_content else "current" ,
231- expand = "body.storage. value" ,
254+ expand = content_format . value ,
232255 )
233256 docs += self .process_pages (
234257 pages ,
235258 include_restricted_content ,
236259 include_attachments ,
237260 include_comments ,
261+ content_format ,
238262 ocr_languages ,
239263 )
240264
@@ -258,13 +282,14 @@ def load(
258282 limit = limit ,
259283 max_pages = max_pages ,
260284 include_archived_spaces = include_archived_content ,
261- expand = "body.storage. value" ,
285+ expand = content_format . value ,
262286 )
263287 docs += self .process_pages (
264288 pages ,
265289 include_restricted_content ,
266290 include_attachments ,
267291 include_comments ,
292+ content_format ,
268293 ocr_languages ,
269294 )
270295
@@ -282,11 +307,15 @@ def load(
282307 ),
283308 before_sleep = before_sleep_log (logger , logging .WARNING ),
284309 )(self .confluence .get_page_by_id )
285- page = get_page (page_id = page_id , expand = "body.storage. value" )
310+ page = get_page (page_id = page_id , expand = content_format . value )
286311 if not include_restricted_content and not self .is_public_page (page ):
287312 continue
288313 doc = self .process_page (
289- page , include_attachments , include_comments , ocr_languages
314+ page ,
315+ include_attachments ,
316+ include_comments ,
317+ content_format ,
318+ ocr_languages ,
290319 )
291320 docs .append (doc )
292321
@@ -363,6 +392,7 @@ def process_pages(
363392 include_restricted_content : bool ,
364393 include_attachments : bool ,
365394 include_comments : bool ,
395+ content_format : ContentFormat ,
366396 ocr_languages : Optional [str ] = None ,
367397 ) -> List [Document ]:
368398 """Process a list of pages into a list of documents."""
@@ -371,7 +401,11 @@ def process_pages(
371401 if not include_restricted_content and not self .is_public_page (page ):
372402 continue
373403 doc = self .process_page (
374- page , include_attachments , include_comments , ocr_languages
404+ page ,
405+ include_attachments ,
406+ include_comments ,
407+ content_format ,
408+ ocr_languages ,
375409 )
376410 docs .append (doc )
377411
@@ -382,6 +416,7 @@ def process_page(
382416 page : dict ,
383417 include_attachments : bool ,
384418 include_comments : bool ,
419+ content_format : ContentFormat ,
385420 ocr_languages : Optional [str ] = None ,
386421 ) -> Document :
387422 try :
@@ -396,9 +431,11 @@ def process_page(
396431 attachment_texts = self .process_attachment (page ["id" ], ocr_languages )
397432 else :
398433 attachment_texts = []
399- text = BeautifulSoup (page ["body" ]["storage" ]["value" ], "lxml" ).get_text (
400- " " , strip = True
401- ) + "" .join (attachment_texts )
434+
435+ content = content_format .get_content (page )
436+ text = BeautifulSoup (content , "lxml" ).get_text (" " , strip = True ) + "" .join (
437+ attachment_texts
438+ )
402439 if include_comments :
403440 comments = self .confluence .get_page_comments (
404441 page ["id" ], expand = "body.view.value" , depth = "all"
0 commit comments