10 changes: 10 additions & 0 deletions .github/workflows/publish.yml
@@ -49,6 +49,16 @@ jobs:
with:
name: ${{ steps.build-aviary-litqa.outputs.artifact-name }}
path: dist
- id: build-aviary-lfrqa
uses: hynek/build-and-inspect-python-package@v2
with:
path: packages/lfrqa
upload-name-suffix: -lfrqa
- name: Download built aviary.lfrqa artifact to dist/
uses: actions/download-artifact@v4
with:
name: ${{ steps.build-aviary-lfrqa.outputs.artifact-name }}
path: dist
- uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}
1 change: 1 addition & 0 deletions README.md
@@ -342,3 +342,4 @@ Here are a few environments implemented with aviary:
| GSM8k | [`aviary.gsm8k`](https://pypi.org/project/aviary.gsm8k/) | `fhaviary[gsm8k]` | [`README.md`](packages/gsm8k/README.md#installation) | |
| HotPotQA | [`aviary.hotpotqa`](https://pypi.org/project/aviary.hotpotqa/) | `fhaviary[hotpotqa]` | [`README.md`](packages/hotpotqa/README.md#installation) | |
| LitQA | [`aviary.litqa`](https://pypi.org/project/aviary.litqa/) | `fhaviary[litqa]` | [`README.md`](packages/litqa/README.md#installation) | |
| LFRQA | [`aviary.lfrqa`](https://pypi.org/project/aviary.lfrqa/) | `fhaviary[lfrqa]` | [`README.md`](packages/lfrqa/README.md#installation) | |
19 changes: 19 additions & 0 deletions packages/lfrqa/README.md
@@ -0,0 +1,19 @@
# aviary.lfrqa

An environment that uses PaperQA to answer questions from the LFRQA dataset, exposed here as `LFRQATaskDataset`. LFRQA is a human-annotated dataset introduced in RAG-QA Arena [1], featuring over 1,400 questions across a range of domains, including science.

## References

[1] RAG-QA Arena: Evaluating Domain Robustness for Long-form Retrieval Augmented Question Answering (https://arxiv.org/pdf/2407.13998)

## Installation

To install the LFRQA environment, run:

```bash
pip install fhaviary[lfrqa]
```

## Usage

Refer to [this tutorial](https://github.com/Future-House/paper-qa/blob/main/docs/tutorials/running_on_lfrqa.md) for instructions on how to run the environment.
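
For orientation, here is a minimal sketch of how the classes added in this package fit together. The question record below is illustrative only; real LFRQA data should be prepared as described in the tutorial above.

```python
from aviary.envs.lfrqa import LFRQAQuestion, LFRQATaskDataset

# Illustrative question record; actual LFRQA questions come from the dataset
# files referenced in the tutorial above.
questions = [
    LFRQAQuestion(
        qid="q-001",
        question="should utf-16 be considered harmful?",
        answer="A human-written reference answer goes here.",
        gt_doc_ids=[12, 34],
    )
]

dataset = LFRQATaskDataset(data=questions)
env = dataset.get_new_env_by_idx(0)  # an LFRQAPairwiseEvalEnv for the first question
```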
38 changes: 38 additions & 0 deletions packages/lfrqa/pyproject.toml
@@ -0,0 +1,38 @@
[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools>=64", "setuptools_scm>=8"]

[project]
authors = [
{email = "[email protected]", name = "FutureHouse technical staff"},
]
classifiers = [
"Operating System :: OS Independent",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python",
]
dependencies = [
"aviary.litqa",
"fh-llm-client>=0.0.11",
"fhaviary",
"paper-qa>=5.12.0",
"pydantic~=2.0",
]
description = "LFRQA environment implemented with aviary"
dynamic = ["version"]
name = "aviary.lfrqa"
readme = "README.md"
requires-python = ">=3.11"

[tool.ruff]
extend = "../../pyproject.toml"

[tool.setuptools.packages.find]
where = ["src"]

[tool.setuptools_scm]
root = "../.."
version_file = "src/aviary/envs/lfrqa/version.py"
13 changes: 13 additions & 0 deletions packages/lfrqa/src/aviary/envs/lfrqa/__init__.py
@@ -0,0 +1,13 @@
from .env import (
LFRQAPairwiseEvalEnv,
LFRQAQuestion,
)
from .task import (
LFRQATaskDataset,
)

__all__ = [
"LFRQAPairwiseEvalEnv",
"LFRQAQuestion",
"LFRQATaskDataset",
]
223 changes: 223 additions & 0 deletions packages/lfrqa/src/aviary/envs/lfrqa/env.py
@@ -0,0 +1,223 @@
__all__ = [
"LFRQAPairwiseEvalEnv",
"LFRQAQuestion",
]

import logging
import random
import re
from uuid import UUID

from llmclient import CommonLLMNames, LiteLLMModel, LLMModel
from paperqa.docs import Docs
from pydantic import BaseModel, model_validator

from aviary.core import (
Messages,
ToolRequestMessage,
)
from aviary.envs.litqa import GradablePaperQAEnvironment

logger = logging.getLogger(__name__)

lfrqa_system_prompt = (
# From RAG-QA Arena (https://arxiv.org/pdf/2407.13998) Table 13 and 15
"Pairwise Evaluation: Instruction and Rubric\n"
"We will show you a query and a pair of answers to the query. "
"You need to provide your preference over this pair of answers.\n"
"First, try your best to determine whether the information in an answer can help "
"truthfully answer the query. Then rate your preference based on Helpfulness and "
"Truthfulness.\n"
"- Helpfulness: information that is helpful/relevant to answer the query. An ideal "
"answer consists of only information that is helpful/relevant to answer the query.\n"
"- Truthfulness: information that you believe is correct to answer the query. By our "
"definition, truthful information should be helpful information. If you find it "
"difficult to determine the truthfulness of some information, consider it untruthful. "
"Often time, this is due to not enough context provided in the answer. Another source "
"of untruthfulness is when conflicting information is presented, and the answer does "
"not reconcile them in a coherent way.\n"
"<rubric>\n"
"Here is how you judge (in the order of importance):\n"
"- If one answer has all truthful information while the other has some untruthful "
"information, prefer the all-truthful one.\n"
"- If both have some untruthful information, prefer the one with less untruthful "
"information.\n"
"- If both have all truthful information, prefer the one with more truthful or helpful "
"information.\n"
"- If two answers look equally good, or it is too hard to judge using the three cases "
"above, then you are 'not sure' which one is better.\n"
"</rubric>\n\n"
"Label 1\n"
"Query\n"
"difference between 'publicly' and 'publically'.\n"
"<query> difference between 'publicly' and 'publically'. </query>\n"
"<answer 1> Both 'publicly' and 'publically' bear no difference in meaning, as they are "
"essentially alternative spellings of the same concept. Publicly is more widely used, "
"but the existence of 'publically' in reputable sources like the OED means it cannot be "
"dismissed as simply incorrect. Some opinions hold that 'publicly' is the older irregular "
"form, still preached by a lot of grammars, and 'publically,' on the other hand, is the "
"newer and regular form. </answer 1>\n"
"<answer 2> There is no difference in meaning between 'publicly' and 'publically'; they "
"are alternative spellings of the same word. </answer 2>\n"
"Both <answer 1> and <answer 2> are truthful. However, <answer 1> provides more truthful "
"information as the context to compare the two terms. Therefore, <answer 1> is better.\n\n"
"Label 2\n"
"Query\n"
"what did European/American historical cooks do with the egg whites?\n"
"<query> what did European/American historical cooks do with the egg whites? </query>\n"
"<answer 1> Historical European and American cooks used egg whites for making egg white "
"omelettes and egg white pasta, as well as for stiffening clothing, similar to how starch "
"is used today. Egg whites have found their place in various non-culinary applications "
"throughout history, such as in the clarification of beer and wine, in the conservation of "
"books through bookbinding and gilding, and in makeup as an ancient form of nail polish. "
"</answer 1>\n"
"<answer 2> They were also utilized historically as a stiffening agent for clothing, similar "
"to how starch is used today. The culinary landscape was not left untouched, with egg whites "
"making their way into recipes for omelettes and pastas as early as the 15th century. "
"</answer 2>\n"
"Both <answer 1> and <answer 2> provide several usages of egg whites for European/American "
"cooks.\n"
"<answer 2> clearly provides more options with fully explained details. Therefore, <answer 2> "
"is better.\n\n"
"Label 0\n"
"Query\n"
"should utf-16 be considered harmful?\n"
"<query> should utf-16 be considered harmful? </query>\n"
"<answer 1> The question of whether UTF-16 should be considered harmful is subject to differing "
"opinions. One perspective suggests that UTF-16 is harmful due to a historical misconception "
"about character encoding and recommends UTF-8 as the superior choice for various text interfaces. "
"Another viewpoint argues that UTF-16 is not inherently harmful, emphasizing its utility in specific "
"scenarios where it serves as a compromise between simplicity and compactness. The choice between "
"UTF-16 and other encodings like UTF-8 depends on the specific requirements of the application, such "
"as compatibility with ASCII or the need to efficiently encode certain character sets. </answer 1>\n"
"<answer 2> UTF-16 should not be considered harmful. However, contrasting views argue that UTF-16 "
"should indeed be considered harmful. Some argue that the very reason UTF-16 exists is because some "
"time ago there used to be a misguided belief that WideChar is going to be what UCS-4 now is. "
"Additionally, the harmfulness of UTF-16 is tied to issues with exercising code. </answer 2>\n"
"Both <answer 1> and <answer 2> reconcile the two conflicting views with detailed explanation.\n"
"I am not sure which one is better."
)

lfrqa_prompt_template = (
# From RAG-QA Arena (https://arxiv.org/pdf/2407.13998) Table 14
"Query is in the <query></query> tags. Answer 1 is in <answer 1></answer 1>, "
"and Answer 2 is in <answer 2></answer 2>.\n"
"<query> {question} </query>\n"
"<answer 1> {answer1} </answer 1>\n"
"<answer 2> {answer2} </answer 2>\n"
"Review the rubric in <rubric> tags,\n"
"- if you prefer <answer 1>, output 1.\n"
"- if you prefer <answer 2>, output 2.\n"
"- if you are not sure, output 0.\n"
"First, think step by step, put your thinking in <thinking></thinking> tags.\n"
"Your thinking must be shorter than 50 words.\n"
"Then, provide your rating inside <rating></rating> tags.\n"
"Remember your rating should be 0 if you are not sure, and your rating must be either 0, 1, or 2."
)


def strip_citations(text: str) -> str:
# Combined regex for identifying citations (see unit tests for examples)
citation_regex = r"\b[\w\-]+\set\sal\.\s\([0-9]{4}\)|\((?:[^\)]*?[a-zA-Z][^\)]*?[0-9]{4}[^\)]*?)\)"
# Remove the citations from the text
return re.sub(citation_regex, "", text, flags=re.MULTILINE)


class LFRQAPairwiseEvalEnv(GradablePaperQAEnvironment[dict]):
"""Environment for pairwise evaluation of PaperQA's answers against human answers on Long-Form RAG QA (LFRQA) questions."""

def __init__(
self,
*args,
qid: str | UUID,
question: str,
human_answer: str,
gt_doc_ids: list[int],
pairwise_eval_llm: LLMModel | str = CommonLLMNames.GPT_4O.value,
**kwargs,
):
kwargs["query"] = question
kwargs["docs"] = Docs()
super().__init__(*args, **kwargs)

self.qid = qid
self.question = question
self.human_answer = human_answer
self.gt_doc_ids = gt_doc_ids
self.pairwise_eval_llm = pairwise_eval_llm

def extract_best_answer_index(self, text: str) -> int:
match = re.search(r"<rating>(\d+)</rating>", text)
return int(match.group(1)) if match else 0

async def _evaluate_answer(self) -> dict:
"""Pairwise evaluation of PaperQA vs Human answer."""
paper_search_ids = [int(doc.docname) for doc in self.state.docs.docs.values()]

pairwise_eval_llm = LiteLLMModel(name=self.pairwise_eval_llm)
pqa_answer = strip_citations(self.state.session.answer)
pqa_answer_index = 1 if random.random() < 0.5 else 2 # noqa: PLR2004
data = {
"question": self.question,
"answer1": pqa_answer if pqa_answer_index == 1 else self.human_answer,
"answer2": self.human_answer if pqa_answer_index == 1 else pqa_answer,
}

result = await pairwise_eval_llm.run_prompt(
prompt=lfrqa_prompt_template,
data=data,
system_prompt=lfrqa_system_prompt,
)

best_answer_index = self.extract_best_answer_index(result.text)
if best_answer_index == pqa_answer_index:
winner, reward = "paperqa", self._rewards["win"]
elif best_answer_index != 0:
winner, reward = "human", self._rewards["lose"]
else:
winner, reward = "tie", self._rewards["tie"]

return {
"llm": self._settings.llm,
"evaluator_llm": self.pairwise_eval_llm,
"qid": self.qid,
"question": self.question,
"pqa_answer": pqa_answer,
"human_answer": self.human_answer,
"winner": winner,
"paper_search_ids": paper_search_ids,
"gt_doc_ids": self.gt_doc_ids,
"pqa_answer_was_answer_1": pqa_answer_index == 1,
"complete_evaluator_response": result.text,
"reward": reward,
}

async def step(
self, action: ToolRequestMessage
) -> tuple[Messages, float, bool, bool]:
messages, reward, done, truncated = await super().step(action)
if not done:
return messages, reward, done, truncated

evaluation = await self._evaluate_answer()
if evaluation_callback := self._evaluation_callback:
await evaluation_callback(evaluation)

return messages, evaluation["reward"], done, truncated


class LFRQAQuestion(BaseModel):
qid: str | UUID
question: str
answer: str
gt_doc_ids: list[int]

@model_validator(mode="before")
@classmethod
def _validate_gt_doc_ids(cls, data: dict) -> dict:
if data.get("gold_doc_ids") and not data.get("gt_doc_ids"):
data["gt_doc_ids"] = data["gold_doc_ids"]
if isinstance(data["gt_doc_ids"], str):
data["gt_doc_ids"] = data["gt_doc_ids"].strip("[]").split(",")
data["gt_doc_ids"] = [int(_id) for _id in data["gt_doc_ids"]]
return data
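
A note on the validator above: `LFRQAQuestion` accepts ground-truth document IDs under either the `gt_doc_ids` or the `gold_doc_ids` key, and tolerates the list being serialized as a string. A small illustrative sketch of that normalization (field values are made up):

```python
from aviary.envs.lfrqa import LFRQAQuestion

# All three spellings normalize to gt_doc_ids == [1, 2, 3].
a = LFRQAQuestion(qid="q1", question="q?", answer="a", gt_doc_ids=[1, 2, 3])
b = LFRQAQuestion(qid="q1", question="q?", answer="a", gold_doc_ids=[1, 2, 3])
c = LFRQAQuestion(qid="q1", question="q?", answer="a", gt_doc_ids="[1, 2, 3]")
assert a.gt_doc_ids == b.gt_doc_ids == c.gt_doc_ids == [1, 2, 3]
```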
55 changes: 55 additions & 0 deletions packages/lfrqa/src/aviary/envs/lfrqa/task.py
@@ -0,0 +1,55 @@
from collections.abc import Awaitable, Callable

from llmclient import CommonLLMNames, LLMModel
from paperqa.settings import Settings

from aviary.core import TASK_DATASET_REGISTRY, TaskDataset

from .env import LFRQAPairwiseEvalEnv, LFRQAQuestion


class LFRQATaskDataset(TaskDataset[LFRQAPairwiseEvalEnv]):
"""Task dataset for pairwise evaluation of long-form (non-multiple-choice) LFRQA questions."""

def __init__(
self,
data: list[LFRQAQuestion],
settings: Settings | dict | None = None,
pairwise_eval_llm: LLMModel | str = CommonLLMNames.GPT_4O.value,
evaluation_callback: Callable[[dict], Awaitable] | None = None,
):
self.data = data
self.pairwise_eval_llm = pairwise_eval_llm

if settings is None:
settings = Settings()
if isinstance(settings, dict):
settings = Settings(**settings)
self._settings = settings
self._rewards = {"win": 1, "tie": 0, "lose": -1}
self._evaluation_callback = evaluation_callback

def get_new_env_by_idx(self, idx: int) -> LFRQAPairwiseEvalEnv:
"""Create a new environment instance for the given index."""
question = self.data[idx]

return LFRQAPairwiseEvalEnv(
qid=question.qid,
question=question.question,
human_answer=question.answer,
gt_doc_ids=question.gt_doc_ids,
pairwise_eval_llm=self.pairwise_eval_llm,
settings=self._settings,
rewards=self._rewards,
evaluation_callback=self._evaluation_callback,
)

def __len__(self) -> int:
return len(self.data)


TASK_DATASET_NAME = "lfrqa"
TASK_DATASET_REGISTRY[TASK_DATASET_NAME] = (
LFRQATaskDataset.__module__,
LFRQATaskDataset.__name__,
)
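
Putting the pieces together, here is a hedged sketch of how the dataset, the pairwise judge, and the evaluation callback interact. The judge model name and the question record are illustrative; rolling the environment out with an agent is covered in the paper-qa tutorial linked from the README.

```python
from aviary.envs.lfrqa import LFRQAQuestion, LFRQATaskDataset

results: list[dict] = []


async def record_evaluation(evaluation: dict) -> None:
    # Called by LFRQAPairwiseEvalEnv when an episode finishes; the dict carries
    # "winner", "reward", "qid", "pqa_answer", "human_answer", and related fields.
    results.append(evaluation)


dataset = LFRQATaskDataset(
    data=[
        LFRQAQuestion(
            qid="q-001",
            question="should utf-16 be considered harmful?",
            answer="A human-written reference answer.",
            gt_doc_ids=[1, 2],
        )
    ],
    pairwise_eval_llm="gpt-4o",  # judge model; defaults to CommonLLMNames.GPT_4O
    evaluation_callback=record_evaluation,
)

env = dataset.get_new_env_by_idx(0)
# An agent then rolls `env` out; the final reward is +1 if PaperQA's answer wins
# the pairwise judgment, 0 for a tie, and -1 if the human answer wins.
```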