Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 45 additions & 8 deletions langchain/document_loaders/confluence.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Load Data from a Confluence Space"""
import logging
from enum import Enum
from io import BytesIO
from typing import Any, Callable, List, Optional, Union

Expand All @@ -16,6 +17,19 @@
logger = logging.getLogger(__name__)


class ContentFormat(str, Enum):
    """Page body representations that can be requested for a Confluence page.

    Each member's value is the string the loader passes as the ``expand``
    argument when fetching pages, so the response carries the matching
    entry under ``page["body"]``.
    """

    STORAGE = "body.storage"
    VIEW = "body.view"

    def get_content(self, page: dict) -> str:
        """Extract the body text of ``page`` for this content format.

        :param page: Page dictionary containing a ``"body"`` mapping with an
            entry for this format.
        :type page: dict
        :raises ValueError: If this member has no known body entry.
        :return: The ``"value"`` string stored under the format's body entry.
        :rtype: str
        """
        # Name of the sub-dictionary under page["body"] for each format.
        body_key_by_format = {
            ContentFormat.STORAGE: "storage",
            ContentFormat.VIEW: "view",
        }
        body_key = body_key_by_format.get(self)
        if body_key is None:
            raise ValueError("unknown content format")
        return page["body"][body_key]["value"]


class ConfluenceLoader(BaseLoader):
"""
Load Confluence pages. Port of https://llamahub.ai/l/confluence
Expand All @@ -31,6 +45,12 @@ class ConfluenceLoader(BaseLoader):
Document object. Currently supported attachment types are: PDF, PNG, JPEG/JPG,
SVG, Word and Excel.

The Confluence API supports different formats of page content. The storage
format is the raw XML representation used for storage. The view format is the
HTML representation for viewing, with macros rendered as though the page is
viewed by users. You can pass an enum `content_format` argument to `load()` to
specify the content format; this is set to `ContentFormat.STORAGE` by default.

Hint: space_key and page_id can both be found in the URL of a page in Confluence
- https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>

Expand Down Expand Up @@ -178,6 +198,7 @@ def load(
include_archived_content: bool = False,
include_attachments: bool = False,
include_comments: bool = False,
content_format: ContentFormat = ContentFormat.STORAGE,
limit: Optional[int] = 50,
max_pages: Optional[int] = 1000,
ocr_languages: Optional[str] = None,
Expand All @@ -200,6 +221,8 @@ def load(
:type include_attachments: bool, optional
:param include_comments: defaults to False
:type include_comments: bool, optional
:param content_format: Specify content format, defaults to ContentFormat.STORAGE
:type content_format: ContentFormat
:param limit: Maximum number of pages to retrieve per request, defaults to 50
:type limit: int, optional
:param max_pages: Maximum number of pages to retrieve in total, defaults 1000
Expand Down Expand Up @@ -228,13 +251,14 @@ def load(
limit=limit,
max_pages=max_pages,
status="any" if include_archived_content else "current",
expand="body.storage.value",
expand=content_format.value,
)
docs += self.process_pages(
pages,
include_restricted_content,
include_attachments,
include_comments,
content_format,
ocr_languages,
)

Expand All @@ -258,13 +282,14 @@ def load(
limit=limit,
max_pages=max_pages,
include_archived_spaces=include_archived_content,
expand="body.storage.value",
expand=content_format.value,
)
docs += self.process_pages(
pages,
include_restricted_content,
include_attachments,
include_comments,
content_format,
ocr_languages,
)

Expand All @@ -282,11 +307,15 @@ def load(
),
before_sleep=before_sleep_log(logger, logging.WARNING),
)(self.confluence.get_page_by_id)
page = get_page(page_id=page_id, expand="body.storage.value")
page = get_page(page_id=page_id, expand=content_format.value)
if not include_restricted_content and not self.is_public_page(page):
continue
doc = self.process_page(
page, include_attachments, include_comments, ocr_languages
page,
include_attachments,
include_comments,
content_format,
ocr_languages,
)
docs.append(doc)

Expand Down Expand Up @@ -350,6 +379,7 @@ def process_pages(
include_restricted_content: bool,
include_attachments: bool,
include_comments: bool,
content_format: ContentFormat,
ocr_languages: Optional[str] = None,
) -> List[Document]:
"""Process a list of pages into a list of documents."""
Expand All @@ -358,7 +388,11 @@ def process_pages(
if not include_restricted_content and not self.is_public_page(page):
continue
doc = self.process_page(
page, include_attachments, include_comments, ocr_languages
page,
include_attachments,
include_comments,
content_format,
ocr_languages,
)
docs.append(doc)

Expand All @@ -369,6 +403,7 @@ def process_page(
page: dict,
include_attachments: bool,
include_comments: bool,
content_format: ContentFormat,
ocr_languages: Optional[str] = None,
) -> Document:
try:
Expand All @@ -383,9 +418,11 @@ def process_page(
attachment_texts = self.process_attachment(page["id"], ocr_languages)
else:
attachment_texts = []
text = BeautifulSoup(page["body"]["storage"]["value"], "lxml").get_text(
" ", strip=True
) + "".join(attachment_texts)

content = content_format.get_content(page)
text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join(
attachment_texts
)
if include_comments:
comments = self.confluence.get_page_comments(
page["id"], expand="body.view.value", depth="all"
Expand Down