Skip to content

Commit 7ad13cd

Browse files
haoqixu and hwchase17 authored
feat: add content_format param to ConfluenceLoader.load() (#5922)
The Confluence API supports different formats of page content. The storage format is the raw XML representation for storage. The view format is the HTML representation for viewing, with macros rendered as though it is viewed by users. Add the `content_format` parameter to `ConfluenceLoader.load()` to specify the content format; it is set to `ContentFormat.STORAGE` by default. #### Who can review? Tag maintainers/contributors who might be interested: @eyurtsev --------- Co-authored-by: Harrison Chase <[email protected]>
1 parent c5a46e7 commit 7ad13cd

File tree

1 file changed

+45
-8
lines changed

1 file changed

+45
-8
lines changed

langchain/document_loaders/confluence.py

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Load Data from a Confluence Space"""
22
import logging
3+
from enum import Enum
34
from io import BytesIO
45
from typing import Any, Callable, Dict, List, Optional, Union
56

@@ -16,6 +17,19 @@
1617
logger = logging.getLogger(__name__)
1718

1819

20+
class ContentFormat(str, Enum):
    """Page body representations that can be requested from Confluence.

    Each member's value is the dotted ``expand`` path sent to the API (for
    example ``"body.storage"``). The content is read back from the page dict
    under the same keys, followed by ``"value"``.
    """

    # Raw XML representation used for storage.
    STORAGE = "body.storage"
    # HTML representation for viewing, with macros rendered.
    VIEW = "body.view"

    def get_content(self, page: dict) -> str:
        """Return the body of ``page`` in this content format.

        :param page: Page dict fetched with ``expand`` set to this member's
            value.
        :type page: dict
        :return: The string stored at ``page["body"][<format>]["value"]``.
        :rtype: str
        :raises KeyError: If ``page`` does not contain the body for this format.
        """
        # Walk the member's own expand path rather than branching per member,
        # so a newly added format needs no change here and there is no
        # unreachable "unknown format" fallback to maintain.
        node = page
        for key in self.value.split("."):
            node = node[key]
        return node["value"]
31+
32+
1933
class ConfluenceLoader(BaseLoader):
2034
"""
2135
Load Confluence pages. Port of https://llamahub.ai/l/confluence
@@ -31,6 +45,12 @@ class ConfluenceLoader(BaseLoader):
3145
Document object. Currently supported attachment types are: PDF, PNG, JPEG/JPG,
3246
SVG, Word and Excel.
3347
48+
Confluence API supports different formats of page content. The storage format is the
49+
raw XML representation for storage. The view format is the HTML representation for
50+
viewing, with macros rendered as though it is viewed by users. You can pass
51+
an enum `content_format` argument to `load()` to specify the content format; this is
52+
set to `ContentFormat.STORAGE` by default.
53+
3454
Hint: space_key and page_id can both be found in the URL of a page in Confluence
3555
- https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>
3656
@@ -178,6 +198,7 @@ def load(
178198
include_archived_content: bool = False,
179199
include_attachments: bool = False,
180200
include_comments: bool = False,
201+
content_format: ContentFormat = ContentFormat.STORAGE,
181202
limit: Optional[int] = 50,
182203
max_pages: Optional[int] = 1000,
183204
ocr_languages: Optional[str] = None,
@@ -200,6 +221,8 @@ def load(
200221
:type include_attachments: bool, optional
201222
:param include_comments: defaults to False
202223
:type include_comments: bool, optional
224+
:param content_format: Specify content format, defaults to ContentFormat.STORAGE
225+
:type content_format: ContentFormat
203226
:param limit: Maximum number of pages to retrieve per request, defaults to 50
204227
:type limit: int, optional
205228
:param max_pages: Maximum number of pages to retrieve in total, defaults to 1000
@@ -228,13 +251,14 @@ def load(
228251
limit=limit,
229252
max_pages=max_pages,
230253
status="any" if include_archived_content else "current",
231-
expand="body.storage.value",
254+
expand=content_format.value,
232255
)
233256
docs += self.process_pages(
234257
pages,
235258
include_restricted_content,
236259
include_attachments,
237260
include_comments,
261+
content_format,
238262
ocr_languages,
239263
)
240264

@@ -258,13 +282,14 @@ def load(
258282
limit=limit,
259283
max_pages=max_pages,
260284
include_archived_spaces=include_archived_content,
261-
expand="body.storage.value",
285+
expand=content_format.value,
262286
)
263287
docs += self.process_pages(
264288
pages,
265289
include_restricted_content,
266290
include_attachments,
267291
include_comments,
292+
content_format,
268293
ocr_languages,
269294
)
270295

@@ -282,11 +307,15 @@ def load(
282307
),
283308
before_sleep=before_sleep_log(logger, logging.WARNING),
284309
)(self.confluence.get_page_by_id)
285-
page = get_page(page_id=page_id, expand="body.storage.value")
310+
page = get_page(page_id=page_id, expand=content_format.value)
286311
if not include_restricted_content and not self.is_public_page(page):
287312
continue
288313
doc = self.process_page(
289-
page, include_attachments, include_comments, ocr_languages
314+
page,
315+
include_attachments,
316+
include_comments,
317+
content_format,
318+
ocr_languages,
290319
)
291320
docs.append(doc)
292321

@@ -363,6 +392,7 @@ def process_pages(
363392
include_restricted_content: bool,
364393
include_attachments: bool,
365394
include_comments: bool,
395+
content_format: ContentFormat,
366396
ocr_languages: Optional[str] = None,
367397
) -> List[Document]:
368398
"""Process a list of pages into a list of documents."""
@@ -371,7 +401,11 @@ def process_pages(
371401
if not include_restricted_content and not self.is_public_page(page):
372402
continue
373403
doc = self.process_page(
374-
page, include_attachments, include_comments, ocr_languages
404+
page,
405+
include_attachments,
406+
include_comments,
407+
content_format,
408+
ocr_languages,
375409
)
376410
docs.append(doc)
377411

@@ -382,6 +416,7 @@ def process_page(
382416
page: dict,
383417
include_attachments: bool,
384418
include_comments: bool,
419+
content_format: ContentFormat,
385420
ocr_languages: Optional[str] = None,
386421
) -> Document:
387422
try:
@@ -396,9 +431,11 @@ def process_page(
396431
attachment_texts = self.process_attachment(page["id"], ocr_languages)
397432
else:
398433
attachment_texts = []
399-
text = BeautifulSoup(page["body"]["storage"]["value"], "lxml").get_text(
400-
" ", strip=True
401-
) + "".join(attachment_texts)
434+
435+
content = content_format.get_content(page)
436+
text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join(
437+
attachment_texts
438+
)
402439
if include_comments:
403440
comments = self.confluence.get_page_comments(
404441
page["id"], expand="body.view.value", depth="all"

0 commit comments

Comments
 (0)