Feature/add acreom loader (#5780)

inferense · rlancemartin · web-flow · commit 1913320cbecd · 2023-06-15T11:53:00.000-07:00
Adds a new loader for [acreom](https://acreom.com) vaults. It is based on the Obsidian loader, with additional text processing for acreom-specific markdown elements. @eyurtsev please take a look! --------- Co-authored-by: rlm <pexpresss31@gmail.com>
diff --git a/docs/modules/indexes/document_loaders/examples/acreom.ipynb b/docs/modules/indexes/document_loaders/examples/acreom.ipynb
@@ -0,0 +1,75 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "e310c8dc-acd0-48d2-801c-f37ce99acd2d",
+   "metadata": {},
+   "source": [
+    "# acreom"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "04a2c95d-4114-431e-904a-32d79005c28b",
+   "metadata": {},
+   "source": [
+    "[acreom](https://acreom.com) is a dev-first knowledge base with tasks running on local markdown files.\n",
+    "\n",
+    "Below is an example of how to load a local acreom vault into LangChain. Because a local acreom vault is a folder of plain-text .md files, the loader requires the path to that directory.\n",
+    "\n",
+    "Vault files may contain metadata stored as a YAML header. These values are added to the document’s metadata if `collect_metadata` is set to `True`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0169bee5-aa7a-4ec7-b7e7-b3bb2e58f3bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import AcreomLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c1b49ab3-616b-4149-bef5-7559d65d3d2b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = AcreomLoader('<path-to-acreom-vault>', collect_metadata=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3127a018-9c1c-4886-8321-f5666d970a95",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
@@ -1,5 +1,6 @@
 """All different types of document loaders."""
 
+from langchain.document_loaders.acreom import AcreomLoader
 from langchain.document_loaders.airbyte_json import AirbyteJSONLoader
 from langchain.document_loaders.airtable import AirtableLoader
 from langchain.document_loaders.apify_dataset import ApifyDatasetLoader
@@ -136,6 +137,7 @@
 TelegramChatLoader = TelegramChatFileLoader
 
 __all__ = [
+    "AcreomLoader",
     "AZLyricsLoader",
     "AirbyteJSONLoader",
     "AirtableLoader",
diff --git a/langchain/document_loaders/acreom.py b/langchain/document_loaders/acreom.py
@@ -0,0 +1,73 @@
+"""Loader that loads acreom vault from a directory."""
+import re
+from pathlib import Path
+from typing import Iterator, List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class AcreomLoader(BaseLoader):
+    """Load the markdown files of a local acreom vault as documents."""
+
+    # NOTE(review): with MULTILINE, "^" also matches "---" pairs mid-file - confirm.
+    FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
+
+    def __init__(
+        self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
+    ):
+        """Initialize the loader.
+
+        Args:
+            path: Vault directory; searched recursively for ``*.md`` files.
+            encoding: Text encoding used to read each file.
+            collect_metadata: Whether front matter is parsed into the document
+                metadata and stripped from the page content.
+        """
+        self.file_path = path
+        self.encoding = encoding
+        self.collect_metadata = collect_metadata
+
+    def _parse_front_matter(self, content: str) -> dict:
+        """Return the ``key: value`` front-matter lines of ``content`` as a dict."""
+        if not self.collect_metadata:
+            return {}
+        match = self.FRONT_MATTER_REGEX.search(content)
+        if match is None:
+            return {}
+        front_matter = {}
+        for line in match.group(1).split("\n"):
+            # Split on the first colon only; lines without a colon are skipped.
+            key, sep, value = line.partition(":")
+            if sep:
+                front_matter[key.strip()] = value.strip()
+        return front_matter
+
+    def _remove_front_matter(self, content: str) -> str:
+        """Strip front matter from ``content``; no-op unless collecting metadata."""
+        if not self.collect_metadata:
+            return content
+        return self.FRONT_MATTER_REGEX.sub("", content)
+
+    def _process_acreom_content(self, content: str) -> str:
+        """Drop acreom tasks, ``#`` characters and ``[[doclinks]]`` from content."""
+        # Raw strings: "\s" and "\[" are invalid escapes in plain string literals.
+        content = re.sub(r"\s*-\s\[\s\]\s.*|\s*\[\s\]\s.*", "", content)  # rm tasks
+        content = re.sub("#", "", content)  # rm hashtags
+        content = re.sub(r"\[\[.*?\]\]", "", content)  # rm doclinks
+        return content
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Yield one Document per ``*.md`` file found under the vault path."""
+        for p in Path(self.file_path).glob("**/*.md"):
+            with open(p, encoding=self.encoding) as f:
+                text = f.read()
+            front_matter = self._parse_front_matter(text)
+            text = self._process_acreom_content(self._remove_front_matter(text))
+            # Front-matter keys named "source" or "path" override the defaults.
+            metadata = {"source": str(p.name), "path": str(p), **front_matter}
+            yield Document(page_content=text, metadata=metadata)
+
+    def load(self) -> List[Document]:
+        """Eagerly load every document in the vault into a list."""
+        return list(self.lazy_load())