From 6203c49dcf20eff19da90b853050e2eb598a1ec5 Mon Sep 17 00:00:00 2001
From: Claude
Date: Tue, 27 Jan 2026 22:35:53 +0000
Subject: [PATCH] Add BinaryAutoResolver for auto-resolving binary questions

Implements an auto-resolver that takes a MetaculusQuestion (specifically
BinaryQuestion) and determines whether it should resolve to "yes" or "no"
based on research.

Features:
- Uses SmartSearcher, Perplexity, and optionally Hyperbrowser for research
- Agent-based decision making with OpenAI Agents SDK
- Configurable research and decision models
- Returns ResolutionReport with confidence scores and evidence

Includes:
- BinaryAutoResolver class with resolve() method
- ResolutionReport and ResolutionDecision data models
- Agent tool for use with OpenAI Agents SDK
- Unit tests for data models and resolver

https://claude.ai/code/session_013SWrAV6WxgZNzN7Znq1uyE
---
 .../test_auto_resolver.py                    | 202 +++++++
 .../agents_and_tools/auto_resolver.py        | 535 ++++++++++++++++++
 2 files changed, 737 insertions(+)
 create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_auto_resolver.py
 create mode 100644 forecasting_tools/agents_and_tools/auto_resolver.py

diff --git a/code_tests/unit_tests/test_agents_and_tools/test_auto_resolver.py b/code_tests/unit_tests/test_agents_and_tools/test_auto_resolver.py
new file mode 100644
index 00000000..d7978c64
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_auto_resolver.py
@@ -0,0 +1,202 @@
+import logging
+from datetime import datetime
+
+import pytest
+
+from forecasting_tools.agents_and_tools.auto_resolver import (
+    BinaryAutoResolver,
+    ResearchEvidence,
+    ResearchResult,
+    ResolutionDecision,
+    ResolutionReport,
+)
+from forecasting_tools.data_models.questions import (
+    BinaryQuestion,
+    MetaculusQuestion,
+    NumericQuestion,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class TestResolutionDecision:
+    def test_resolution_decision_values(self) -> None:
+        assert ResolutionDecision.YES.value == "yes"
+        assert ResolutionDecision.NO.value == "no"
+        assert ResolutionDecision.AMBIGUOUS.value == "ambiguous"
+        assert ResolutionDecision.ANNULLED.value == "annulled"
+
+    def test_resolution_decision_from_string(self) -> None:
+        assert ResolutionDecision("yes") == ResolutionDecision.YES
+        assert ResolutionDecision("no") == ResolutionDecision.NO
+        assert ResolutionDecision("ambiguous") == ResolutionDecision.AMBIGUOUS
+        assert ResolutionDecision("annulled") == ResolutionDecision.ANNULLED
+
+
+class TestResolutionReport:
+    def test_resolution_report_creation(self) -> None:
+        report = ResolutionReport(
+            question_text="Will X happen by Y date?",
+            question_id=12345,
+            resolution=ResolutionDecision.YES,
+            confidence=0.85,
+            reasoning="Based on evidence, X has happened.",
+            evidence_summary="Event X occurred on date Z.",
+            sources_consulted=["source1", "source2"],
+        )
+
+        assert report.question_text == "Will X happen by Y date?"
+        assert report.question_id == 12345
+        assert report.resolution == ResolutionDecision.YES
+        assert report.confidence == 0.85
+        assert len(report.sources_consulted) == 2
+
+    def test_resolution_report_confidence_validation(self) -> None:
+        # Valid confidence
+        report = ResolutionReport(
+            question_text="Test",
+            question_id=None,
+            resolution=ResolutionDecision.NO,
+            confidence=0.5,
+            reasoning="Test reasoning",
+            evidence_summary="Test evidence",
+        )
+        assert report.confidence == 0.5
+
+        # Invalid confidence (too high)
+        with pytest.raises(ValueError):
+            ResolutionReport(
+                question_text="Test",
+                question_id=None,
+                resolution=ResolutionDecision.NO,
+                confidence=1.5,
+                reasoning="Test reasoning",
+                evidence_summary="Test evidence",
+            )
+
+        # Invalid confidence (negative)
+        with pytest.raises(ValueError):
+            ResolutionReport(
+                question_text="Test",
+                question_id=None,
+                resolution=ResolutionDecision.NO,
+                confidence=-0.1,
+                reasoning="Test reasoning",
+                evidence_summary="Test evidence",
+            )
+
+
+class TestResearchResult:
+    def test_research_result_creation(self) -> None:
+        result = ResearchResult(
+            query="test query",
+            content="test content",
+            sources=["source1", "source2"],
+            source_type="smart_search",
+        )
+
+        assert result.query == "test query"
+        assert result.content == "test content"
+        assert len(result.sources) == 2
+        assert result.source_type == "smart_search"
+
+    def test_research_result_source_types(self) -> None:
+        valid_types = ["smart_search", "perplexity", "computer_use", "other"]
+        for source_type in valid_types:
+            result = ResearchResult(
+                query="test",
+                content="test",
+                sources=[],
+                source_type=source_type,
+            )
+            assert result.source_type == source_type
+
+
+class TestResearchEvidence:
+    def test_research_evidence_creation(self) -> None:
+        evidence = ResearchEvidence(
+            raw_evidence="Combined evidence from multiple sources",
+            sources=["source1", "source2", "source3"],
+            num_searches_completed=3,
+        )
+
+        assert "Combined evidence" in evidence.raw_evidence
+        assert len(evidence.sources) == 3
+        assert evidence.num_searches_completed == 3
+
+
+class TestBinaryAutoResolver:
+    def test_resolver_initialization_default(self) -> None:
+        resolver = BinaryAutoResolver()
+
+        assert resolver.use_computer_use is False
+        assert resolver.num_searches == 3
+
+    def test_resolver_initialization_custom(self) -> None:
+        resolver = BinaryAutoResolver(
+            research_model="gpt-4o",
+            decision_model="gpt-4o",
+            use_computer_use=True,
+            num_searches=5,
+        )
+
+        assert resolver.use_computer_use is True
+        assert resolver.num_searches == 5
+
+    def test_resolver_rejects_non_binary_questions(self) -> None:
+        resolver = BinaryAutoResolver()
+
+        # Create a non-binary question (NumericQuestion)
+        numeric_question = NumericQuestion(
+            question_text="What will be the temperature?",
+            upper_bound=100.0,
+            lower_bound=0.0,
+            open_upper_bound=True,
+            open_lower_bound=True,
+        )
+
+        with pytest.raises(TypeError) as exc_info:
+            import asyncio
+
+            asyncio.run(resolver.resolve(numeric_question))
+
+        assert "Expected BinaryQuestion" in str(exc_info.value)
+        assert "NumericQuestion" in str(exc_info.value)
+
+    def test_create_binary_question_for_testing(self) -> None:
+        """Test helper to verify we can create valid BinaryQuestion objects."""
+        question = BinaryQuestion(
+            question_text="Will SpaceX launch Starship in 2024?",
+            id_of_post=12345,
+            resolution_criteria="Resolves YES if SpaceX successfully launches Starship.",
+            fine_print="Must be a full stack launch.",
+            background_info="SpaceX has been developing Starship for years.",
+        )
+
+        assert question.question_type == "binary"
+        assert question.id_of_post == 12345
+        assert "SpaceX" in question.question_text
+
+
+class TestAutoResolverTools:
+    def test_resolve_binary_question_tool_exists(self) -> None:
+        from forecasting_tools.agents_and_tools.auto_resolver import (
+            resolve_binary_question,
+        )
+
+        # Verify the tool is callable
+        assert callable(resolve_binary_question)
+
+    def test_create_auto_resolver_agent_function_exists(self) -> None:
+        from forecasting_tools.agents_and_tools.auto_resolver import (
+            create_auto_resolver_agent,
+        )
+
+        assert callable(create_auto_resolver_agent)
+
+    def test_run_auto_resolver_agent_function_exists(self) -> None:
+        from forecasting_tools.agents_and_tools.auto_resolver import (
+            run_auto_resolver_agent,
+        )
+
+        assert callable(run_auto_resolver_agent)
diff --git a/forecasting_tools/agents_and_tools/auto_resolver.py b/forecasting_tools/agents_and_tools/auto_resolver.py
new file mode 100644
index 00000000..badb603e
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/auto_resolver.py
@@ -0,0 +1,535 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+from enum import Enum
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+from forecasting_tools.agents_and_tools.research.computer_use import ComputerUse
+from forecasting_tools.agents_and_tools.research.smart_searcher import SmartSearcher
+from forecasting_tools.ai_models.agent_wrappers import (
+    AgentRunner,
+    AgentSdkLlm,
+    AgentTool,
+    AiAgent,
+    agent_tool,
+    general_trace_or_span,
+)
+from forecasting_tools.ai_models.general_llm import GeneralLlm
+from forecasting_tools.ai_models.resource_managers.monetary_cost_manager import (
+    MonetaryCostManager,
+)
+from forecasting_tools.data_models.questions import BinaryQuestion, MetaculusQuestion
+from forecasting_tools.util.misc import clean_indents
+
+logger = logging.getLogger(__name__)
+
+
+class ResolutionDecision(str, Enum):
+    YES = "yes"
+    NO = "no"
+    AMBIGUOUS = "ambiguous"
+    ANNULLED = "annulled"
+
+
+class ResolutionReport(BaseModel):
+    question_text: str
+    question_id: int | None
+    resolution: ResolutionDecision
+    confidence: float = Field(
+        ge=0.0, le=1.0, description="Confidence in the resolution decision"
+    )
+    reasoning: str
+    evidence_summary: str
+    sources_consulted: list[str] = Field(default_factory=list)
+
+
+class BinaryAutoResolver:
+    """
+    Auto-resolver for binary (yes/no) questions.
+
+    Takes a BinaryQuestion as input and uses research tools to determine
+    whether the question should resolve to "yes" or "no".
+    """
+
+    def __init__(
+        self,
+        research_model: str | GeneralLlm = "openrouter/openai/o4-mini",
+        decision_model: str | GeneralLlm = "openrouter/openai/o4-mini",
+        use_computer_use: bool = False,
+        num_searches: int = 3,
+    ) -> None:
+        """
+        Initialize the auto-resolver.
+
+        Args:
+            research_model: Model to use for research tasks
+            decision_model: Model to use for final decision making
+            use_computer_use: Whether to enable browser-based research (slower but more thorough)
+            num_searches: Number of search queries to run
+        """
+        self.research_model = GeneralLlm.to_llm(research_model)
+        self.decision_model = GeneralLlm.to_llm(decision_model)
+        self.use_computer_use = use_computer_use
+        self.num_searches = num_searches
+
+    async def resolve(self, question: MetaculusQuestion) -> ResolutionReport:
+        """
+        Resolve a binary question by researching and determining the outcome.
+
+        Args:
+            question: The Metaculus question to resolve
+
+        Returns:
+            ResolutionReport containing the resolution decision and supporting evidence
+        """
+        if not isinstance(question, BinaryQuestion):
+            raise TypeError(
+                f"Expected BinaryQuestion, got {type(question).__name__}. "
+                "This resolver currently only supports binary questions."
+            )
+
+        with general_trace_or_span(
+            "BinaryAutoResolver.resolve",
+            data={"question_id": question.id_of_post},
+        ):
+            with MonetaryCostManager() as cost_manager:
+                report = await self._resolve_binary_question(question)
+                logger.info(
+                    f"Resolution complete. Cost: ${cost_manager.current_usage:.4f}"
+                )
+                return report
+
+    async def _resolve_binary_question(
+        self, question: BinaryQuestion
+    ) -> ResolutionReport:
+        """Internal method to resolve binary questions."""
+        logger.info(f"Starting resolution for question: {question.question_text[:100]}...")
+
+        # Step 1: Gather evidence through research
+        evidence = await self._gather_evidence(question)
+
+        # Step 2: Make resolution decision
+        report = await self._make_resolution_decision(question, evidence)
+
+        return report
+
+    async def _gather_evidence(self, question: BinaryQuestion) -> ResearchEvidence:
+        """Gather evidence about the question's resolution."""
+        question_details = question.give_question_details_as_markdown()
+
+        # Generate research queries
+        research_queries = await self._generate_research_queries(question_details)
+
+        # Run searches in parallel
+        search_tasks = []
+
+        # SmartSearcher for detailed research
+        for query in research_queries[: self.num_searches]:
+            search_tasks.append(self._run_smart_search(query))
+
+        # Perplexity for quick factual lookups
+        search_tasks.append(self._run_perplexity_search(question))
+
+        # Optionally use computer use for complex research
+        if self.use_computer_use:
+            search_tasks.append(self._run_computer_use_research(question))
+
+        # Gather all results
+        results = await asyncio.gather(*search_tasks, return_exceptions=True)
+
+        # Process results
+        search_results = []
+        sources = []
+        for result in results:
+            if isinstance(result, Exception):
+                logger.warning(f"Search failed: {result}")
+            elif isinstance(result, ResearchResult):
+                search_results.append(result)
+                sources.extend(result.sources)
+
+        combined_evidence = "\n\n---\n\n".join(
+            [r.content for r in search_results if r.content]
+        )
+
+        return ResearchEvidence(
+            raw_evidence=combined_evidence,
+            sources=list(set(sources)),
+            num_searches_completed=len(search_results),
+        )
+
+    async def _generate_research_queries(
+        self, question_details: str
+    ) -> list[str]:
+        """Generate research queries for the question."""
+        prompt = clean_indents(
+            f"""
+            You are a research assistant helping to determine if a prediction question has resolved.
+
+            Given the following question details, generate {self.num_searches + 2} search queries
+            that would help determine if this question has resolved and what the resolution is.
+
+            Focus on:
+            1. Finding official announcements or confirmations
+            2. Finding news articles about the outcome
+            3. Finding data sources that would show the resolution
+            4. Finding any official statements from relevant parties
+
+            Question Details:
+            {question_details}
+
+            Return the queries as a JSON list of strings. Return only the JSON list.
+            Example: ["query 1", "query 2", "query 3"]
+            """
+        )
+
+        queries = await self.research_model.invoke_and_return_verified_type(
+            prompt, list[str]
+        )
+        logger.info(f"Generated {len(queries)} research queries")
+        return queries
+
+    async def _run_smart_search(self, query: str) -> ResearchResult:
+        """Run a SmartSearcher query."""
+        try:
+            searcher = SmartSearcher(
+                include_works_cited_list=True,
+                use_brackets_around_citations=True,
+                num_searches_to_run=2,
+                num_sites_per_search=5,
+                model=self.research_model,
+            )
+            result = await searcher.invoke(query)
+            return ResearchResult(
+                query=query,
+                content=result,
+                sources=[query],  # SmartSearcher includes sources in the result
+                source_type="smart_search",
+            )
+        except Exception as e:
+            logger.warning(f"SmartSearcher failed for query '{query}': {e}")
+            return ResearchResult(
+                query=query,
+                content="",
+                sources=[],
+                source_type="smart_search",
+            )
+
+    async def _run_perplexity_search(
+        self, question: BinaryQuestion
+    ) -> ResearchResult:
+        """Run a Perplexity search for quick facts."""
+        try:
+            llm = GeneralLlm(
+                model="openrouter/perplexity/sonar-reasoning-pro",
+                reasoning_effort="high",
+                web_search_options={"search_context_size": "high"},
+                populate_citations=True,
+            )
+
+            prompt = clean_indents(
+                f"""
+                I need to determine if the following prediction question has resolved and what the outcome is.
+
+                Question: {question.question_text}
+
+                Resolution Criteria:
+                {question.resolution_criteria}
+
+                Please search for the most recent and authoritative information about:
+                1. Has this event/outcome occurred?
+                2. What official sources confirm the outcome?
+                3. What is the current status?
+
+                Provide specific dates, sources, and evidence.
+                """
+            )
+
+            result = await llm.invoke(prompt)
+            return ResearchResult(
+                query="perplexity_resolution_search",
+                content=result,
+                sources=["Perplexity AI Search"],
+                source_type="perplexity",
+            )
+        except Exception as e:
+            logger.warning(f"Perplexity search failed: {e}")
+            return ResearchResult(
+                query="perplexity_resolution_search",
+                content="",
+                sources=[],
+                source_type="perplexity",
+            )
+
+    async def _run_computer_use_research(
+        self, question: BinaryQuestion
+    ) -> ResearchResult:
+        """Use browser automation for complex research tasks."""
+        try:
+            computer = ComputerUse()
+            prompt = clean_indents(
+                f"""
+                Research the following prediction question to determine if it has resolved:
+
+                Question: {question.question_text}
+
+                Resolution Criteria:
+                {question.resolution_criteria}
+
+                Please:
+                1. Search for official sources and announcements
+                2. Check relevant websites mentioned in the question
+                3. Look for news articles about the outcome
+                4. Document what you find with specific dates and sources
+
+                {f"Start by checking: {question.page_url}" if question.page_url else ""}
+                """
+            )
+
+            result = await computer.answer_prompt(prompt)
+            return ResearchResult(
+                query="computer_use_research",
+                content=result.as_string,
+                sources=[result.recording_url or "Browser research session"],
+                source_type="computer_use",
+            )
+        except Exception as e:
+            logger.warning(f"Computer use research failed: {e}")
+            return ResearchResult(
+                query="computer_use_research",
+                content="",
+                sources=[],
+                source_type="computer_use",
+            )
+
+    async def _make_resolution_decision(
+        self,
+        question: BinaryQuestion,
+        evidence: ResearchEvidence,
+    ) -> ResolutionReport:
+        """Make the final resolution decision based on gathered evidence."""
+        question_details = question.give_question_details_as_markdown()
+
+        prompt = clean_indents(
+            f"""
+            You are a resolution analyst for Metaculus, a prediction platform.
+            Your task is to determine if a binary question should resolve to "yes" or "no".
+
+            ## Question Details
+            {question_details}
+
+            ## Research Evidence
+            The following evidence was gathered from multiple sources:
+
+            {evidence.raw_evidence}
+
+            ## Your Task
+            Based on the resolution criteria and the evidence gathered, determine:
+            1. Has the question resolved? If the outcome is unclear or hasn't happened yet, indicate "ambiguous"
+            2. If resolved, should it resolve to "yes" or "no"?
+            3. What is your confidence level (0.0 to 1.0)?
+            4. Provide reasoning for your decision
+            5. Summarize the key evidence that supports your decision
+
+            Important:
+            - Follow the resolution criteria exactly as written
+            - Only resolve "yes" or "no" if there is clear evidence
+            - If the evidence is inconclusive or the event hasn't occurred, use "ambiguous"
+            - If the question should be annulled (e.g., the resolution criteria can never be met), use "annulled"
+
+            Return your answer as a JSON object with the following structure:
+            {{
+                "resolution": "yes" or "no" or "ambiguous" or "annulled",
+                "confidence": 0.0 to 1.0,
+                "reasoning": "Your detailed reasoning here",
+                "evidence_summary": "Summary of key evidence supporting the decision"
+            }}
+
+            Return only the JSON object.
+            """
+        )
+
+        result = await self.decision_model.invoke_and_return_verified_type(
+            prompt, dict
+        )
+
+        resolution_str = result["resolution"].lower()
+        try:
+            resolution = ResolutionDecision(resolution_str)
+        except ValueError:
+            logger.warning(
+                f"Invalid resolution value '{resolution_str}', defaulting to AMBIGUOUS"
+            )
+            resolution = ResolutionDecision.AMBIGUOUS
+
+        return ResolutionReport(
+            question_text=question.question_text,
+            question_id=question.id_of_post,
+            resolution=resolution,
+            confidence=float(result.get("confidence", 0.5)),
+            reasoning=result.get("reasoning", "No reasoning provided"),
+            evidence_summary=result.get("evidence_summary", "No evidence summary"),
+            sources_consulted=evidence.sources,
+        )
+
+
+class ResearchResult(BaseModel):
+    """Result from a single research source."""
+
+    query: str
+    content: str
+    sources: list[str]
+    source_type: Literal["smart_search", "perplexity", "computer_use", "other"]
+
+
+class ResearchEvidence(BaseModel):
+    """Combined evidence from all research sources."""
+
+    raw_evidence: str
+    sources: list[str]
+    num_searches_completed: int
+
+
+# Agent tools for use with the OpenAI Agents SDK
+
+
+@agent_tool
+async def resolve_binary_question(question_url_or_id: str | int) -> str:
+    """
+    Automatically resolve a binary Metaculus question by researching its outcome.
+
+    Takes a Metaculus question URL or ID and returns the resolution ("yes", "no",
+    "ambiguous", or "annulled") along with supporting evidence.
+
+    Use this when you need to determine if a binary prediction question has resolved.
+    """
+    from forecasting_tools.helpers.metaculus_api import MetaculusApi
+
+    if isinstance(question_url_or_id, str):
+        try:
+            question_url_or_id = int(question_url_or_id)
+        except ValueError:
+            pass
+
+    if isinstance(question_url_or_id, int):
+        question = MetaculusApi.get_question_by_post_id(question_url_or_id)
+    else:
+        question = MetaculusApi.get_question_by_url(question_url_or_id)
+
+    if not isinstance(question, BinaryQuestion):
+        return f"Error: Question is not a binary question. Type: {type(question).__name__}"
+
+    resolver = BinaryAutoResolver()
+    report = await resolver.resolve(question)
+
+    return clean_indents(
+        f"""
+        ## Resolution Report
+
+        **Question**: {report.question_text}
+        **Resolution**: {report.resolution.value}
+        **Confidence**: {report.confidence:.0%}
+
+        ### Reasoning
+        {report.reasoning}
+
+        ### Evidence Summary
+        {report.evidence_summary}
+
+        ### Sources Consulted
+        {chr(10).join(f"- {source}" for source in report.sources_consulted[:10])}
+        """
+    )
+
+
+def create_auto_resolver_agent(
+    model: str = "openrouter/openai/o4-mini",
+) -> AiAgent:
+    """
+    Create an agent specialized for auto-resolving prediction questions.
+
+    Returns an AiAgent configured with research tools for question resolution.
+    """
+    from forecasting_tools.agents_and_tools.minor_tools import (
+        grab_question_details_from_metaculus,
+        perplexity_reasoning_pro_search,
+        smart_searcher_search,
+    )
+    from forecasting_tools.agents_and_tools.research.computer_use import ComputerUse
+
+    tools: list[AgentTool] = [
+        resolve_binary_question,
+        grab_question_details_from_metaculus,
+        perplexity_reasoning_pro_search,
+        smart_searcher_search,
+        ComputerUse.computer_use_tool,
+    ]
+
+    agent = AiAgent(
+        name="AutoResolverAgent",
+        instructions=clean_indents(
+            """
+            You are an expert at resolving prediction questions on Metaculus.
+
+            Your task is to determine if binary questions have resolved and what
+            the resolution should be based on the resolution criteria.
+
+            When given a question to resolve:
+            1. First grab the question details to understand the resolution criteria
+            2. Use the research tools to find evidence about the outcome
+            3. Use the resolve_binary_question tool to make the final determination
+            4. Report your findings with confidence level and supporting evidence
+
+            Always follow the resolution criteria exactly as written.
+            Only resolve "yes" or "no" if there is clear evidence.
+            If uncertain, indicate that the question cannot be resolved yet.
+            """
+        ),
+        model=AgentSdkLlm(model=model),
+        tools=tools,
+    )
+
+    return agent
+
+
+async def run_auto_resolver_agent(
+    question_url_or_id: str | int,
+    model: str = "openrouter/openai/o4-mini",
+) -> str:
+    """
+    Run the auto-resolver agent on a question.
+
+    Args:
+        question_url_or_id: Metaculus question URL or post ID
+        model: Model to use for the agent
+
+    Returns:
+        The agent's resolution report as a string
+    """
+    agent = create_auto_resolver_agent(model=model)
+
+    prompt = f"Please resolve this Metaculus question: {question_url_or_id}"
+
+    result = await AgentRunner.run(agent, prompt)
+
+    return result.final_output
+
+
+if __name__ == "__main__":
+    # Example usage
+    import sys
+
+    logging.basicConfig(level=logging.INFO)
+
+    if len(sys.argv) > 1:
+        question_id = sys.argv[1]
+    else:
+        question_id = "31866"  # Example question ID
+
+    async def main():
+        result = await run_auto_resolver_agent(question_id)
+        print(result)
+
+    asyncio.run(main())
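
Usage note: the patch's __main__ block exercises the agent wrapper; the resolver can also be driven programmatically through BinaryAutoResolver.resolve(). The sketch below is illustrative rather than part of the commit, and it assumes Metaculus and OpenRouter credentials are already configured in the environment and that the post ID points at a binary question (31866 is the example ID used in the __main__ block above):

    # Hypothetical usage sketch, not part of the patch above.
    # Assumes API credentials are set in the environment and that
    # MetaculusApi returns a BinaryQuestion for the given post ID.
    import asyncio

    from forecasting_tools.agents_and_tools.auto_resolver import BinaryAutoResolver
    from forecasting_tools.helpers.metaculus_api import MetaculusApi


    async def resolve_example() -> None:
        question = MetaculusApi.get_question_by_post_id(31866)
        resolver = BinaryAutoResolver(use_computer_use=False, num_searches=3)
        report = await resolver.resolve(question)
        print(f"Resolution: {report.resolution.value} ({report.confidence:.0%})")
        print(report.reasoning)
        print(report.evidence_summary)


    asyncio.run(resolve_example())

Leaving use_computer_use=False keeps the run to SmartSearcher and Perplexity only; enabling it adds a browser-based ComputerUse session, which the constructor docstring describes as slower but more thorough.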