SakanaAI · fuodorov · Jul 4, 2025 · Jul 4, 2025 · Jul 4, 2025 · Jul 4, 2025
diff --git a/README.md b/README.md
@@ -80,12 +80,17 @@ Next, configure valid [AWS Credentials](https://docs.aws.amazon.com/cli/v1/userg
 
 Our code can optionally use a Semantic Scholar API Key (`S2_API_KEY`) for higher throughput during literature search [if you have one](https://www.semanticscholar.org/product/api). This is used during both the ideation and paper writing stages. The system should work without it, though you might encounter rate limits or reduced novelty checking during ideation. If you experience issues with Semantic Scholar, you can skip the citation phase during paper generation.
 
+#### OpenAlex API (Literature Search)
+
+As an alternative to Semantic Scholar, you can use OpenAlex for literature search. OpenAlex provides free access to academic papers and does not require an API key. However, we recommend setting the `OPENALEX_MAIL_ADDRESS` environment variable to your email address for better access to the OpenAlex API. To use OpenAlex instead of Semantic Scholar, pass `--engine openalex` when running the ideation script; this requires the `pyalex` package, which is listed in `requirements.txt`.
+
 #### Setting API Keys
 
 Ensure you provide the necessary API keys as environment variables for the models you intend to use. For example:
 ```bash
 export OPENAI_API_KEY="YOUR_OPENAI_KEY_HERE"
 export S2_API_KEY="YOUR_S2_KEY_HERE"
+export OPENALEX_MAIL_ADDRESS="your@email.com"  # Optional, for better OpenAlex access
 # Set AWS credentials if using Bedrock
 # export AWS_ACCESS_KEY_ID="YOUR_AWS_ACCESS_KEY_ID"
 # export AWS_SECRET_ACCESS_KEY="YOUR_AWS_SECRET_KEY"
@@ -107,7 +112,19 @@ Before running the full AI Scientist-v2 experiment pipeline, you first use the `
      --max-num-generations 20 \
      --num-reflections 5
     ```
+
+    To use OpenAlex instead of Semantic Scholar for literature search:
+    ```bash
+    python ai_scientist/perform_ideation_temp_free.py \
+     --workshop-file "ai_scientist/ideas/my_research_topic.md" \
+     --model gpt-4o-2024-05-13 \
+     --max-num-generations 20 \
+     --num-reflections 5 \
+     --engine openalex
+    ```
+
     *   `--workshop-file`: Path to your topic description Markdown file.
+    *   `--engine`: Literature search backend to use, either `semantic_scholar` (default) or `openalex`.
     *   `--model`: The LLM to use for generating ideas (ensure you have the corresponding API key set).
     *   `--max-num-generations`: How many distinct research ideas to attempt generating.
     *   `--num-reflections`: How many refinement steps the LLM should perform for each idea.

diff --git a/ai_scientist/perform_ideation_temp_free.py b/ai_scientist/perform_ideation_temp_free.py
@@ -4,7 +4,7 @@
 import re
 import traceback
 from typing import Any, Dict, List
-
+import os 
 import sys
 
 sys.path.append(osp.join(osp.dirname(__file__), ".."))
@@ -15,50 +15,61 @@
 )
 
 from ai_scientist.tools.semantic_scholar import SemanticScholarSearchTool
+from ai_scientist.tools.openalex import OpenAlexSearchTool
 from ai_scientist.tools.base_tool import BaseTool
 
-# Create tool instances
-semantic_scholar_tool = SemanticScholarSearchTool()
-
 # Define tools at the top of the file
-tools = [
-    semantic_scholar_tool,
-    {
-        "name": "FinalizeIdea",
-        "description": """Finalize your idea by providing the idea details.
-
-The IDEA JSON should include the following fields:
-- "Name": A short descriptor of the idea. Lowercase, no spaces, underscores allowed.
-- "Title": A catchy and informative title for the proposal.
-- "Short Hypothesis": A concise statement of the main hypothesis or research question. Clarify the need for this specific direction, ensure this is the best setting to investigate this idea, and there are not obvious other simpler ways to answer the question.
-- "Related Work": A brief discussion of the most relevant related work and how the proposal clearly distinguishes from it, and is not a trivial extension.
-- "Abstract": An abstract that summarizes the proposal in conference format (approximately 250 words).
-- "Experiments": A list of experiments that would be conducted to validate the proposal. Ensure these are simple and feasible. Be specific in exactly how you would test the hypothesis, and detail precise algorithmic changes. Include the evaluation metrics you would use.
-- "Risk Factors and Limitations": A list of potential risks and limitations of the proposal.""",
-    },
-]
+def get_search_tool(engine: str = "semantic_scholar"):
+    """Return a literature-search tool instance for ``engine``.
+
+    Args:
+        engine: Either ``"semantic_scholar"`` (default) or ``"openalex"``,
+            the same values the ``--engine`` command-line option accepts.
+
+    Returns:
+        A new ``SemanticScholarSearchTool`` or ``OpenAlexSearchTool``.
+
+    Raises:
+        ValueError: If ``engine`` is not one of the supported names. Failing
+            loudly avoids silently searching a different backend than the
+            one the caller asked for (e.g. after a typo).
+    """
+    if engine == "openalex":
+        return OpenAlexSearchTool()
+    if engine == "semantic_scholar":
+        return SemanticScholarSearchTool()
+    raise ValueError(
+        f"Unknown search engine {engine!r}; expected 'semantic_scholar' or 'openalex'."
+    )
+
+def get_tools(engine: str = "semantic_scholar"):
+    """Build the list of tools offered to the LLM for the given search engine.
+
+    Args:
+        engine: Name of the literature-search backend; forwarded unchanged to
+            ``get_search_tool``.
+
+    Returns:
+        A two-element list: the search tool instance followed by a plain dict
+        holding the "name" and "description" of the ``FinalizeIdea`` action.
+    """
+    search_tool = get_search_tool(engine)
+    return [
+        search_tool,
+        {
+            "name": "FinalizeIdea",
+            # The description text is used verbatim when the tool descriptions
+            # are rendered, so its continuation lines are kept flush-left:
+            # indenting them to match the code would put that leading
+            # whitespace into the rendered text.
+            "description": """Finalize your idea by providing the idea details.
+
+The IDEA JSON should include the following fields:
+- "Name": A short descriptor of the idea. Lowercase, no spaces, underscores allowed.
+- "Title": A catchy and informative title for the proposal.
+- "Short Hypothesis": A concise statement of the main hypothesis or research question. Clarify the need for this specific direction, ensure this is the best setting to investigate this idea, and there are not obvious other simpler ways to answer the question.
+- "Related Work": A brief discussion of the most relevant related work and how the proposal clearly distinguishes from it, and is not a trivial extension.
+- "Abstract": An abstract that summarizes the proposal in conference format (approximately 250 words).
+- "Experiments": A list of experiments that would be conducted to validate the proposal. Ensure these are simple and feasible. Be specific in exactly how you would test the hypothesis, and detail precise algorithmic changes. Include the evaluation metrics you would use.
+- "Risk Factors and Limitations": A list of potential risks and limitations of the proposal.""",
+        },
+    ]
 
 # Create a tools dictionary for easy lookup
-tools_dict = {tool.name: tool for tool in tools if isinstance(tool, BaseTool)}
-
-# Create a string with the tool descriptions
-tool_descriptions = "\n\n".join(
-    (
-        f"- **{tool.name}**: {tool.description}"
-        if isinstance(tool, BaseTool)
-        else f"- **{tool['name']}**: {tool['description']}"
+def create_tools_dict_and_descriptions(tools):
+    """Derive the tool lookup table and the prompt fragments for a tool list.
+
+    Args:
+        tools: A mix of ``BaseTool`` instances and plain dicts that carry
+            "name" and "description" keys. It is iterated three times, so it
+            must be re-iterable (e.g. a list), not a one-shot generator.
+
+    Returns:
+        A ``(tools_dict, tool_descriptions, tool_names_str)`` tuple:
+        ``tools_dict`` maps name to tool for the ``BaseTool`` instances only
+        (dict entries are left out); ``tool_descriptions`` is one markdown
+        bullet per tool, separated by blank lines; ``tool_names_str`` is the
+        double-quoted tool names joined with ", ".
+    """
+    tools_dict = {tool.name: tool for tool in tools if isinstance(tool, BaseTool)}
+
+    # Create a string with the tool descriptions
+    tool_descriptions = "\n\n".join(
+        (
+            f"- **{tool.name}**: {tool.description}"
+            if isinstance(tool, BaseTool)
+            else f"- **{tool['name']}**: {tool['description']}"
+        )
+        for tool in tools
     )
-    for tool in tools
-)
-
-# Extract tool names for the prompt
-tool_names = [
-    f'"{tool.name}"' if isinstance(tool, BaseTool) else f'"{tool["name"]}"'
-    for tool in tools
-]
-tool_names_str = ", ".join(tool_names)
-
-system_prompt = f"""You are an experienced AI researcher who aims to propose high-impact research ideas resembling exciting grant proposals. Feel free to propose any novel ideas or experiments; make sure they are novel. Be very creative and think out of the box. Each proposal should stem from a simple and elegant question, observation, or hypothesis about the topic. For example, they could involve very interesting and simple interventions or investigations that explore new possibilities or challenge existing assumptions. Clearly clarify how the proposal distinguishes from the existing literature.
+
+    # Extract tool names for the prompt
+    tool_names = [
+        f'"{tool.name}"' if isinstance(tool, BaseTool) else f'"{tool["name"]}"'
+        for tool in tools
+    ]
+    tool_names_str = ", ".join(tool_names)
+
+    return tools_dict, tool_descriptions, tool_names_str
+
+def create_system_prompt(tool_descriptions: str, tool_names_str: str) -> str:
+    return f"""You are an experienced AI researcher who aims to propose high-impact research ideas resembling exciting grant proposals. Feel free to propose any novel ideas or experiments; make sure they are novel. Be very creative and think out of the box. Each proposal should stem from a simple and elegant question, observation, or hypothesis about the topic. For example, they could involve very interesting and simple interventions or investigations that explore new possibilities or challenge existing assumptions. Clearly clarify how the proposal distinguishes from the existing literature.
 
 Ensure that the proposal does not require resources beyond what an academic lab could afford. These proposals should lead to papers that are publishable at top ML conferences.
 
@@ -72,7 +83,7 @@
 <The action to take, exactly one of {tool_names_str}>
 
 ARGUMENTS:
-<If ACTION is "SearchSemanticScholar", provide the search query as {{"query": "your search query"}}. If ACTION is "FinalizeIdea", provide the idea details as {{"idea": {{ ... }}}} with the IDEA JSON specified below.>
+<If ACTION is a search tool, provide the search query as {{"query": "your search query"}}. If ACTION is "FinalizeIdea", provide the idea details as {{"idea": {{ ... }}}} with the IDEA JSON specified below.>
 
 If you choose to finalize your idea, provide the IDEA JSON in the arguments:
 
@@ -133,8 +144,18 @@ def generate_temp_free_idea(
     max_num_generations: int = 20,
     num_reflections: int = 5,
     reload_ideas: bool = True,
+    engine: str = "semantic_scholar",
 ) -> List[Dict]:
     idea_str_archive = []
+
+    # Get tools based on engine
+    tools = get_tools(engine)
+    tools_dict, tool_descriptions, tool_names_str = create_tools_dict_and_descriptions(tools)
+    system_prompt = create_system_prompt(tool_descriptions, tool_names_str)
+
+    print(f"Using search engine: {engine}")
+    print(f"Available tools: {tool_names_str}")
+
     # load ideas from file
     if reload_ideas and osp.exists(idea_fname):
         with open(idea_fname, "r") as f:
@@ -295,6 +316,13 @@ def generate_temp_free_idea(
         default=5,
         help="Number of reflection rounds per proposal.",
     )
+    parser.add_argument(
+        "--engine",
+        type=str,
+        default="semantic_scholar",
+        choices=["semantic_scholar", "openalex"],
+        help="Search engine to use for literature search. Choose 'semantic_scholar' or 'openalex'.",
+    )
     args = parser.parse_args()
 
     # Create the LLM client
@@ -315,5 +343,6 @@ def generate_temp_free_idea(
         workshop_description=workshop_description,
         max_num_generations=args.max_num_generations,
         num_reflections=args.num_reflections,
+        engine=args.engine,
     )
     print(f"{args.workshop_file} generated {len(ideas)} ideas.")
diff --git a/ai_scientist/tools/openalex.py b/ai_scientist/tools/openalex.py
@@ -0,0 +1,167 @@
+import os
+import time
+import warnings
+from typing import Dict, List, Optional
+
+import backoff
+
+from ai_scientist.tools.base_tool import BaseTool
+
+
+def on_backoff(details: Dict) -> None:
+    """Print a one-line notice each time ``backoff`` schedules a retry.
+
+    Args:
+        details: The mapping handed to ``on_backoff`` handlers; this function
+            reads its "wait", "tries" and "target" entries.
+    """
+    wait_seconds = details["wait"]
+    attempts = details["tries"]
+    target_name = details["target"].__name__
+    timestamp = time.strftime("%X")
+    message = (
+        f"Backing off {wait_seconds:0.1f} seconds after {attempts} tries "
+        f"calling function {target_name} at {timestamp}"
+    )
+    print(message)
+
+
+class OpenAlexSearchTool(BaseTool):
+    """Literature-search tool that queries OpenAlex through the ``pyalex`` client.
+
+    ``pyalex`` is imported lazily in ``__init__``. When it is not installed the
+    tool can still be constructed; ``use_tool`` then returns an explanatory
+    message instead of raising.
+    """
+
+    def __init__(
+        self,
+        name: str = "SearchOpenAlex",
+        description: str = (
+            "Search for relevant literature using OpenAlex. "
+            "Provide a search query to find relevant papers."
+        ),
+        max_results: int = 10,
+    ):
+        """Register the tool and configure ``pyalex`` if it is available.
+
+        Args:
+            name: Tool name passed to ``BaseTool``.
+            description: Tool description passed to ``BaseTool``.
+            max_results: Maximum number of papers returned per query.
+        """
+        parameters = [
+            {
+                "name": "query",
+                "type": "str",
+                "description": "The search query to find relevant papers.",
+            }
+        ]
+        super().__init__(name, description, parameters)
+        self.max_results = max_results
+        self.mail = os.getenv("OPENALEX_MAIL_ADDRESS", None)
+        if self.mail is None:
+            print("[WARNING] Please set OPENALEX_MAIL_ADDRESS for better access to OpenAlex API!")
+        try:
+            import pyalex
+            self.pyalex = pyalex
+            if self.mail:
+                pyalex.config.email = self.mail
+        except ImportError:
+            # Keep the tool constructible; use_tool() reports the problem.
+            self.pyalex = None
+            print("[ERROR] pyalex is not installed. Please install pyalex to use OpenAlexSearchTool.")
+
+    def use_tool(self, query: str) -> Optional[str]:
+        """Search OpenAlex for ``query`` and return the results as text.
+
+        Returns:
+            The formatted paper list, ``"No papers found."`` when the search
+            yields nothing, or an installation hint when ``pyalex`` is missing.
+        """
+        if self.pyalex is None:
+            return "pyalex is not installed. Please install pyalex to use this tool."
+        papers = self.search_for_papers(query)
+        if papers:
+            return self.format_papers(papers)
+        else:
+            return "No papers found."
+
+    def search_for_papers(self, query: str) -> Optional[List[Dict]]:
+        """Return up to ``max_results`` papers for ``query``, most cited first.
+
+        The request itself is made by ``_fetch_works``, which is retried with
+        exponential backoff. This method never raises.
+
+        Returns:
+            A list of paper dicts (see ``extract_info_from_work``), or ``None``
+            when the query is empty, ``pyalex`` is unavailable, nothing
+            matches, or the request still fails after the retries.
+        """
+        if not query or self.pyalex is None:
+            return None
+        try:
+            search_results = self._fetch_works(query)
+            if not search_results:
+                return None
+
+            # "cited_by_count" can be present but null, so coerce it before
+            # comparing; sorted() is stable, like the in-place list sort.
+            ranked = sorted(
+                search_results,
+                key=lambda work: work.get("cited_by_count") or 0,
+                reverse=True,
+            )
+
+            papers = []
+            for work in ranked[:self.max_results]:
+                paper = self.extract_info_from_work(work)
+                print(paper)
+                papers.append(paper)
+
+            return papers if papers else None
+        except Exception as e:
+            # Retries are exhausted here; keep the non-raising contract.
+            print(f"[ERROR] Failed to search OpenAlex: {e}")
+            return None
+
+    @backoff.on_exception(
+        backoff.expo,
+        (Exception,),
+        on_backoff=on_backoff,
+        max_tries=3,
+    )
+    def _fetch_works(self, query: str) -> List[Dict]:
+        """Run the OpenAlex works query for ``query`` and return the raw results.
+
+        Exceptions are deliberately not caught here so that the ``backoff``
+        decorator actually sees them and retries the request.
+        """
+        # Request specific fields including abstract_inverted_index
+        return (
+            self.pyalex.Works()
+            .search(query)
+            .filter(is_paratext=False)
+            .select([
+                "id", "title", "authorships", "publication_year", "cited_by_count",
+                "locations", "abstract_inverted_index"
+            ])
+            .get()
+        )
+
+    def extract_info_from_work(self, work, max_abstract_length: int = 1000) -> dict:
+        """Flatten one OpenAlex work record into the paper dict used for display.
+
+        Fields that are missing *or* null in ``work`` fall back to placeholder
+        values individually, so one null field does not discard the rest of
+        the record.
+
+        Args:
+            work: Mapping-like work record supporting ``.get``.
+            max_abstract_length: Abstracts longer than this many characters
+                are truncated (a warning is printed).
+
+        Returns:
+            A dict with the keys "title", "authors", "venue", "year",
+            "abstract" and "citationCount".
+        """
+        try:
+            # Venue: display name of the first location that has a named source.
+            venue = "Unknown"
+            for location in work.get("locations") or []:
+                source = (location or {}).get("source") or {}
+                display_name = source.get("display_name")
+                if display_name:
+                    venue = display_name
+                    break
+            title = work.get("title") or "Unknown Title"
+
+            # Extract abstract from different possible fields
+            abstract = work.get("abstract") or ""
+            if not abstract and work.get("abstract_inverted_index"):
+                # Reconstruct abstract from inverted index
+                abstract = self._reconstruct_abstract_from_inverted_index(work.get("abstract_inverted_index"))
+
+            if not abstract:
+                abstract = "No abstract available."
+
+            if len(abstract) > max_abstract_length:
+                print(f"[WARNING] {title=}: {len(abstract)=} is too long! Use first {max_abstract_length} chars.")
+                abstract = abstract[:max_abstract_length]
+
+            authors_list = [
+                ((authorship or {}).get("author") or {}).get("display_name") or "Unknown"
+                for authorship in work.get("authorships") or []
+            ]
+            if not authors_list:
+                authors = "Unknown Authors"
+            elif len(authors_list) < 20:
+                authors = " and ".join(authors_list)
+            else:
+                # Very long author lists are abbreviated to the first author.
+                authors = f"{authors_list[0]} et al."
+            paper = dict(
+                title=title,
+                authors=authors,
+                venue=venue,
+                year=work.get("publication_year") or "Unknown Year",
+                abstract=abstract,
+                citationCount=work.get("cited_by_count") or 0,
+            )
+            return paper
+        except Exception as e:
+            # Last-resort fallback for records that are not mapping-like at all.
+            print(f"[ERROR] Failed to extract info from work: {e}")
+            return dict(
+                title="Unknown Title",
+                authors="Unknown Authors",
+                venue="Unknown Venue",
+                year="Unknown Year",
+                abstract="No abstract available.",
+                citationCount=0,
+            )
+
+    def _reconstruct_abstract_from_inverted_index(self, inverted_index: dict) -> str:
+        """Reconstruct abstract text from OpenAlex inverted index format.
+
+        Args:
+            inverted_index: Mapping of word to the list of positions at which
+                that word occurs in the abstract.
+
+        Returns:
+            The words joined by single spaces in position order, or ``""``
+            when the index is empty or malformed.
+        """
+        try:
+            if not inverted_index:
+                return ""
+
+            # Expand to (position, word) pairs, then order by position.
+            word_positions = [
+                (pos, word)
+                for word, positions in inverted_index.items()
+                for pos in positions
+            ]
+            word_positions.sort(key=lambda x: x[0])
+
+            return " ".join(word for _, word in word_positions)
+        except Exception as e:
+            print(f"[ERROR] Failed to reconstruct abstract from inverted index: {e}")
+            return ""
+
+    def format_papers(self, papers: List[Dict]) -> str:
+        """Render paper dicts as a numbered, blank-line-separated text listing."""
+        paper_strings = []
+        for i, paper in enumerate(papers):
+            paper_strings.append(
+                f"""{i + 1}: {paper.get('title', 'Unknown Title')}. {paper.get('authors', 'Unknown Authors')}. {paper.get('venue', 'Unknown Venue')}, {paper.get('year', 'Unknown Year')}.
+Number of citations: {paper.get('citationCount', 'N/A')}
+Abstract: {paper.get('abstract', 'No abstract available.')}"""
+            )
+        return "\n\n".join(paper_strings)
diff --git a/requirements.txt b/requirements.txt
@@ -27,3 +27,4 @@ jsonschema
 omegaconf
 botocore
 boto3
+pyalex
-Original file line number
+Diff line change
@@ Expand Up / @@ -27,3 +27,4 @@ jsonschema @@
     omegaconf
     botocore
     boto3
+    pyalex