diff --git a/code_tests/integration_tests/test_metaculus_api.py b/code_tests/integration_tests/test_metaculus_api.py index fdb17f2c..be2583c9 100644 --- a/code_tests/integration_tests/test_metaculus_api.py +++ b/code_tests/integration_tests/test_metaculus_api.py @@ -513,7 +513,7 @@ def test_get_conditional_questions_from_tournament(self) -> None: assert isinstance(conditional_question.question_no, BinaryQuestion) async def test_get_previous_forecast(self) -> None: - client = MetaculusClient().dev() + client = MetaculusClient() for allowed_types in {"binary", "numeric"}: api_filter = ApiFilter( allowed_types=[allowed_types], # type: ignore @@ -1365,10 +1365,10 @@ def test_all_admin_functions(self) -> None: token=token, ) question_to_create = client.get_question_by_url( - "https://dev.metaculus.com/questions/39162/" + "https://www.metaculus.com/questions/39162/" ) - project_id = 1156 # https://dev.metaculus.com/tournament/beta-testing/ - slug = "beta-testing" + project_id = 32932 # https://www.metaculus.com/tournament/benta/ + slug = "benta" # Ben testing area question_to_create.default_project_id = project_id question_to_create.tournament_slugs = [slug] diff --git a/forecasting_tools/agents_and_tools/ai_congress/__init__.py b/forecasting_tools/agents_and_tools/ai_congress/__init__.py new file mode 100644 index 00000000..f85d69af --- /dev/null +++ b/forecasting_tools/agents_and_tools/ai_congress/__init__.py @@ -0,0 +1,41 @@ +from forecasting_tools.agents_and_tools.ai_congress.congress_member_agent import ( + CongressMemberAgent, +) +from forecasting_tools.agents_and_tools.ai_congress.congress_orchestrator import ( + CongressOrchestrator, +) +from forecasting_tools.agents_and_tools.ai_congress.data_models import ( + CongressMember, + CongressSession, + CongressSessionInput, + ForecastDescription, + PolicyProposal, +) +from forecasting_tools.agents_and_tools.ai_congress.member_profiles import ( + AI_MODEL_MEMBERS, + AVAILABLE_MEMBERS, + POLITICAL_MEMBERS, + get_ai_model_members, + get_default_members, + get_member_by_name, + get_members_by_names, + get_political_members, +) + +__all__ = [ + "CongressMember", + "CongressMemberAgent", + "CongressOrchestrator", + "CongressSession", + "CongressSessionInput", + "ForecastDescription", + "PolicyProposal", + "AI_MODEL_MEMBERS", + "AVAILABLE_MEMBERS", + "POLITICAL_MEMBERS", + "get_ai_model_members", + "get_default_members", + "get_member_by_name", + "get_members_by_names", + "get_political_members", +] diff --git a/forecasting_tools/agents_and_tools/ai_congress/congress_member_agent.py b/forecasting_tools/agents_and_tools/ai_congress/congress_member_agent.py new file mode 100644 index 00000000..79a202a8 --- /dev/null +++ b/forecasting_tools/agents_and_tools/ai_congress/congress_member_agent.py @@ -0,0 +1,400 @@ +from __future__ import annotations + +import logging + +from forecasting_tools.agents_and_tools.ai_congress.data_models import ( + CongressMember, + PolicyProposal, +) +from forecasting_tools.agents_and_tools.minor_tools import ( + perplexity_reasoning_pro_search, + query_asknews, +) +from forecasting_tools.ai_models.agent_wrappers import AgentRunner, AgentSdkLlm, AiAgent +from forecasting_tools.ai_models.general_llm import GeneralLlm +from forecasting_tools.helpers.structure_output import structure_output +from forecasting_tools.util.misc import clean_indents + +logger = logging.getLogger(__name__) + +LONG_TIMEOUT = 480 # 8 minutes for long-running LLM calls + + +class CongressMemberAgent: + def __init__( + self, + member: CongressMember, + 
timeout: int = LONG_TIMEOUT, + structure_output_model: GeneralLlm | None = None, + ): + self.member = member + self.timeout = timeout + self.structure_output_model = structure_output_model or GeneralLlm( + "openrouter/openai/gpt-5.2", + temperature=0.2, + timeout=self.timeout, + ) + + async def deliberate(self, policy_prompt: str) -> PolicyProposal: + logger.info(f"Deliberating on policy question: {policy_prompt[:100]}...") + instructions = self._build_agent_instructions(policy_prompt) + + agent = AiAgent( + name=f"Congress Member: {self.member.name}", + instructions=instructions, + model=AgentSdkLlm(model=self.member.ai_model), + tools=[ + perplexity_reasoning_pro_search, + query_asknews, + ], + handoffs=[], + ) + + result = await AgentRunner.run( + agent, "Please begin your deliberation now.", max_turns=20 + ) + + logger.info(f"Extracting proposal from output for {self.member.name}") + proposal = await self._extract_proposal_from_output(result.final_output) + proposal.member = self.member + logger.info(f"Completed deliberation for {self.member.name}") + return proposal + + async def _extract_proposal_from_output(self, agent_output: str) -> PolicyProposal: + extraction_instructions = clean_indents( + """ + Extract the policy proposal from the congress member's deliberation output. + + You must extract: + 1. research_summary: The background research section (3-5 paragraphs) + 2. decision_criteria: The list of 4-6 criteria as strings + 3. forecasts: Each forecast from the appendix as a ForecastDescription object + - footnote_id: The number (1, 2, 3, etc.) + - question_title: Short title + - question_text: Full question + - resolution_criteria: How it resolves + - prediction: The probability (e.g., "35%" or "70% Option A, 20% Option B, 10% Option C" or "10% chance less than X units, ... ,90% chance less than Y units") + - reasoning: The reasoning explanation + - key_sources: List of sources mentioned + 4. proposal_markdown: The full proposal section including Executive Summary, + Analysis, Recommendations, Risks, and any other section you see. Include footnote references [^1] etc. + 5. key_recommendations: The 3-5 main recommendations as a list of strings + + Be thorough in extracting all forecasts from the Forecast Appendix section. + """ + ) + + proposal = await structure_output( + agent_output, + PolicyProposal, + model=self.structure_output_model, + additional_instructions=extraction_instructions, + ) + return proposal + + def _build_agent_instructions(self, policy_prompt: str) -> str: + expertise_guidance = self._get_expertise_specific_research_guidance() + question_guidance = self._get_question_generation_guidance() + + return clean_indents( + f""" + # Your Identity + + You are {self.member.name}, a {self.member.role}. + + Political Leaning: {self.member.political_leaning} + + Your Core Motivation: {self.member.general_motivation} + + Areas of Expertise: {self.member.expertise_string} + + Personality Traits: {self.member.traits_string} + + --- + + # Your Task + + You are participating in an AI Forecasting Congress to deliberate on the + following policy question: + + "{policy_prompt}" + + You must complete ALL FIVE PHASES below in order, thinking through each + carefully. Your final output will be a comprehensive policy proposal backed + by quantitative forecasts. + + IMPORTANT: Use your search tools extensively in Phases 1 and 4. Good policy + analysis requires understanding the current state of affairs and gathering + evidence for your forecasts. 
+ + --- + + ## PHASE 1: Background Research + + Use your search tools to understand the current state of affairs related to + this policy question. Make at least 3-5 searches to gather comprehensive + information. + + Research goals: + - What is the current status quo? What policies exist today? + - What are the key stakeholders and their positions? + - What recent events or trends are relevant? + - What data and statistics are available? + - What have experts and analysts said about this topic? + - What are the main arguments for and against different approaches? + + Given your expertise in {self.member.expertise_string}, pay special attention to: + {expertise_guidance} + + After researching, write a detailed "## Research Summary" section (3-5 + paragraphs) documenting your key findings. Include specific facts, figures, + and citations from your research. + + --- + + ## PHASE 2: Decision Criteria + + Based on your values and expertise, articulate 4-6 criteria you will use to + evaluate policy options. + + Your criteria should reflect your motivation: "{self.member.general_motivation}" + + For each criterion: + - Name it clearly (e.g., "Economic Efficiency", "Equity Impact", + "Implementation Feasibility", "Risk Minimization") + - Explain why this criterion matters to you specifically given your + {self.member.political_leaning} perspective + - Describe how you would measure or evaluate success on this criterion + + Write a "## Decision Criteria" section listing your criteria in order of + importance to you. + + --- + + ## PHASE 3: Generate Forecasting Questions + + Identify 3-5 specific, concrete forecasting questions that would help inform + this policy decision. These questions should be ones where the answer + genuinely matters for deciding what to do. + + Good forecasting questions follow these principles: + - The question should shed light on the topic and have high VOI (Value of Information) + - The question should be specific and not vague + - The question should have a resolution date + - Once the resolution date has passed, the question should be resolvable with 0.5-1.5hr of research + - Bad: "Will a research paper in an established journal find that a new knee surgery technique reduces follow up surgery with significance by Dec 31 2023?" (To resolve this you have to do extensive research into all new research in a field) + - Good: "Will public dataset X at URL Y show the number of follow ups to knee surgeries decrease by Z% by Dec 31 2023?" (requires only some math on a few data points at a known URL) + - A good resolution source exists + - Bad: "On 15 January 2026, will the general sentiment be generally positive for knee surgery professionals with at least 10 years of experience concerning ACL reconstruction research?" (There is no way to research this online. You would have to run a large study on knee professionals) + - Good: "As of 15 January 2026, how many 'recruiting study' search results will there be on ClinicalTrials.gov when searching 'ACL reconstruction' in 'intervention/treatment'?" (requires only a search on a known website) + - Don't forget to INCLUDE Links if you found any! Copy the links IN FULL especially to resolution sources! + - The questions should match any additional criteria that the superforecaster/client has given you + - The question should not be obvious. Consider the time range when determining this (short time ranges means things are less likely). 
+ - Bad: "Will country X start a war in the next 2 weeks" (Probably not, especially if they have not said anything about this) + - Good: "Will country X start a war in the next year" (Could be possible, especially if there are risk factors) + - Cover different aspects: policy effectiveness, side effects, implementation, + political feasibility, etc. + - Are relevant to the policy decision at hand + - You can find how this question resolved in the past (search for a past resolution, and consider iterating the question if you cannot find how to resolve it) + + + For each question, write: + - **Question Title**: A short descriptive title + - **Full Question**: The complete, unambiguous question + - **Resolution Criteria**: Exactly what would make this resolve YES vs NO, + or how a numeric value would be measured. Be very specific. + - **Time Horizon**: When will we know the answer? + - **Why It Matters**: How does this question inform the policy decision? + + Make sure your questions reflect your unique perspective as {self.member.name}. + {question_guidance} + + Write a "## Forecasting Questions" section with your 3-5 questions. + + --- + + ## PHASE 4: Forecast Each Question + + Now forecast each question you generated. This is the most important phase. + + For EACH forecasting question: + 1. Consider what principles associated with good forecasting you plan to use in this situation, if any (e.g. base rates, bias identification, premortems, simulations, scope sensitivity, aggregation, etc) + 2. Make a research plan + 3. Conduct the research (iterate as needed) + 4. Write down the main facts from the research you conducted that you will consider in your forecast + 5. Do any analysis you need to do, and then write down your rationale for the forecast + 6. Write down your forecast in accordance with the format requested of you + + You write your rationale remembering that good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time. + For numeric questions, you remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns. + + Write your forecasts inline as you work through each question. + + --- + + ## PHASE 5: Write Your Policy Proposal + + Now synthesize everything into a comprehensive policy proposal. This is + your final output. + + Structure your proposal EXACTLY as follows: + + ### Executive Summary + + A 2-3 sentence summary of your main recommendation as {self.member.name}. + What is the single most important thing policymakers should do? + + ### Analysis + + Your detailed analysis of the policy question (3-5 paragraphs), drawing on + your research and forecasts. + + CRITICAL: When you reference forecasts, use footnote format: + - In the text: "This approach has a significant chance of success (65% [^1])" + - Or: "The risk of unintended consequences is moderate (25% probability [^2])" + + The footnote number [^1], [^2], etc. corresponds to the forecast in your + appendix below. + + ### Recommendations + + Your top 3-5 specific, actionable policy recommendations. For each: + - State the recommendation clearly + - Explain why you support it given your forecasts and criteria + - Note which of your decision criteria it addresses + - Give a detailed implementation plan for the recommendation. What would this actually look like on the ground? + - Reference relevant forecasts with footnotes + + ### Risks and Uncertainties + + What could go wrong? What are you most uncertain about? 
+ - Identify the key risks of your recommendations + - Note which forecasts have the widest uncertainty + - Describe scenarios where your recommendations might backfire + - Reference relevant forecasts + + ### Forecast Appendix + + At the end, provide a structured appendix with ALL your forecasts in this + EXACT format: + + [^1] **[Question Title]** + - Question: [Full question text] + - Resolution: [Resolution criteria] + - Prediction: [Your probability, e.g., "35%"] + - Reasoning: [4+ sentences explaining your reasoning, key evidence, and + considerations] + - Sources: [Key sources used, can be URLs or source names] + + [^2] **[Question Title]** + - Question: [Full question text] + - Resolution: [Resolution criteria] + - Prediction: [Your probability] + - Reasoning: [4+ sentences] + - Sources: [Sources] + + ... continue for all forecasts ... + + --- + + # Important Reminders + + - You ARE {self.member.name}. Stay in character throughout. + - Your analysis should reflect your {self.member.political_leaning} + perspective and your expertise in {self.member.expertise_string}. + - Use your search tools extensively - good analysis requires evidence. + - Every major claim in your proposal should be backed by either research + or a forecast with a footnote. + - Be specific and quantitative wherever possible. + + Begin your deliberation now. Start with Phase 1: Background Research. + """ + ) + + def _get_expertise_specific_research_guidance(self) -> str: + expertise_to_guidance = { + "statistics": "- Statistical evidence, effect sizes, confidence intervals, replication status of key findings", + "research methodology": "- Quality of evidence, study designs, potential confounders, meta-analyses", + "policy evaluation": "- Past policy experiments, natural experiments, cost-benefit analyses, program evaluations", + "economics": "- Economic data, market impacts, incentive structures, distributional effects, GDP/employment impacts", + "governance": "- Institutional constraints, separation of powers, historical precedents, constitutional issues", + "institutional design": "- How similar institutions have evolved, design tradeoffs, unintended consequences of past reforms", + "risk management": "- Tail risks, insurance markets, actuarial data, historical disasters and near-misses", + "history": "- Historical analogies, how similar situations played out, lessons from past policy failures", + "social policy": "- Social indicators, inequality metrics, demographic trends, community impacts", + "civil rights": "- Legal precedents, disparate impact data, civil liberties implications, protected classes", + "economic inequality": "- Gini coefficients, wealth distribution, mobility statistics, poverty rates", + "labor": "- Employment data, wage trends, union density, working conditions, automation impacts", + "market design": "- Auction theory, mechanism design, market failures, externalities", + "regulatory policy": "- Regulatory burden, compliance costs, enforcement challenges, capture risks", + "public choice theory": "- Voting patterns, special interest influence, bureaucratic incentives, rent-seeking", + "defense": "- Military capabilities, force posture, defense budgets, readiness metrics", + "geopolitics": "- Alliance structures, regional dynamics, great power competition, spheres of influence", + "intelligence": "- Threat assessments, intelligence community views, classified-to-unclassified information", + "military strategy": "- Deterrence theory, escalation dynamics, military doctrine, lessons from 
recent conflicts", + "diplomacy": "- Treaty frameworks, international organizations, soft power, diplomatic history", + "international relations": "- International norms, multilateral institutions, alliance commitments", + "negotiation": "- Negotiation frameworks, BATNA analysis, trust-building mechanisms", + "trade": "- Trade flows, comparative advantage, supply chains, trade agreement impacts", + "technology forecasting": "- Technology roadmaps, Moore's law analogies, adoption curves, disruption patterns", + "existential risk": "- X-risk estimates, catastrophic scenarios, risk factor analysis, mitigation strategies", + "ethics": "- Ethical frameworks, stakeholder analysis, intergenerational equity, rights-based considerations", + "AI safety": "- AI capabilities timeline, alignment challenges, governance proposals, expert surveys", + "climate science": "- Climate projections, emissions scenarios, adaptation costs, tipping points", + "public administration": "- Implementation challenges, bureaucratic capacity, interagency coordination", + "operations": "- Operational feasibility, logistics, resource requirements, scaling challenges", + "local government": "- Municipal experiences, state-level experiments, federalism considerations", + "project management": "- Project success rates, cost overruns, timeline slippage, scope creep", + } + + guidance_lines = [] + for expertise in self.member.expertise_areas: + expertise_lower = expertise.lower() + if expertise_lower in expertise_to_guidance: + guidance_lines.append(expertise_to_guidance[expertise_lower]) + else: + guidance_lines.append( + f"- Relevant data and analysis related to {expertise}" + ) + + return "\n".join(guidance_lines) + + def _get_question_generation_guidance(self) -> str: + trait_to_guidance = { + "analytical": "Focus on questions with measurable, quantifiable outcomes.", + "skeptical of anecdotes": "Ensure questions can be resolved with systematic data, not stories.", + "loves base rates": "Include at least one question about historical base rates of similar events.", + "demands citations": "Ensure resolution criteria reference specific, verifiable sources.", + "cautious": "Include questions about potential negative consequences and risks.", + "status-quo bias": "Include a question about whether the status quo will persist.", + "emphasizes second-order effects": "Include questions about indirect or downstream effects.", + "ambitious": "Include questions about the potential for transformative positive change.", + "equity-focused": "Include questions about distributional impacts across different groups.", + "impatient with incrementalism": "Include questions about timeline for meaningful change.", + "efficiency-focused": "Include questions about cost-effectiveness and resource allocation.", + "anti-regulation": "Include questions about regulatory burden and unintended consequences.", + "trusts incentives": "Include questions about how incentives will shape behavior.", + "threat-focused": "Include questions about adversary responses and security risks.", + "zero-sum thinking": "Include questions about relative gains and competitive dynamics.", + "values strength": "Include questions about deterrence effectiveness and credibility.", + "consensus-seeking": "Include questions about political feasibility and stakeholder buy-in.", + "pragmatic": "Include questions about implementation challenges and practical obstacles.", + "values relationships": "Include questions about coalition stability and trust dynamics.", + "long time 
horizons": "Include at least one question with a 10+ year time horizon.", + "concerned about tail risks": "Include questions about low-probability, high-impact scenarios.", + "philosophical": "Include questions about fundamental values and tradeoffs.", + "thinks in probabilities": "Ensure all questions have clear probabilistic interpretations.", + "implementation-focused": "Include questions about operational feasibility and execution.", + "skeptical of grand plans": "Include questions about whether ambitious plans will actually be implemented.", + "detail-oriented": "Include questions about specific mechanisms and implementation details.", + } + + guidance_lines = [] + for trait in self.member.personality_traits: + trait_lower = trait.lower() + if trait_lower in trait_to_guidance: + guidance_lines.append(f"- {trait_to_guidance[trait_lower]}") + + if guidance_lines: + return "Given your personality traits:\n" + "\n".join(guidance_lines) + return "" diff --git a/forecasting_tools/agents_and_tools/ai_congress/congress_orchestrator.py b/forecasting_tools/agents_and_tools/ai_congress/congress_orchestrator.py new file mode 100644 index 00000000..672c73ca --- /dev/null +++ b/forecasting_tools/agents_and_tools/ai_congress/congress_orchestrator.py @@ -0,0 +1,609 @@ +from __future__ import annotations + +import asyncio +import logging +from datetime import datetime, timezone + +from forecasting_tools.agents_and_tools.ai_congress.congress_member_agent import ( + CongressMemberAgent, +) +from forecasting_tools.agents_and_tools.ai_congress.data_models import ( + CongressMember, + CongressSession, + PolicyProposal, +) +from forecasting_tools.agents_and_tools.minor_tools import ( + perplexity_reasoning_pro_search, + roll_dice, +) +from forecasting_tools.ai_models.agent_wrappers import AgentRunner, AgentSdkLlm, AiAgent +from forecasting_tools.ai_models.general_llm import GeneralLlm +from forecasting_tools.ai_models.resource_managers.monetary_cost_manager import ( + MonetaryCostManager, +) +from forecasting_tools.util.misc import clean_indents + +logger = logging.getLogger(__name__) + +LONG_TIMEOUT = 480 # 8 minutes for long-running LLM calls + + +class CongressOrchestrator: + def __init__( + self, + aggregation_model: str = "openrouter/anthropic/claude-sonnet-4", + ): + self.aggregation_model = aggregation_model + + async def run_session( + self, + prompt: str, + members: list[CongressMember], + ) -> CongressSession: + logger.info( + f"Starting congress session with {len(members)} members on: {prompt[:100]}..." 
+ ) + + with MonetaryCostManager() as session_cost_manager: + agents = [CongressMemberAgent(m) for m in members] + + results = await asyncio.gather( + *[self._run_member_with_error_handling(a, prompt) for a in agents], + return_exceptions=False, + ) + + proposals: list[PolicyProposal] = [] + errors: list[str] = [] + + for result in results: + if isinstance(result, PolicyProposal): + proposals.append(result) + elif isinstance(result, Exception): + errors.append(str(result)) + else: + errors.append(f"Unexpected result type: {type(result)}") + + logger.info( + f"Completed {len(proposals)} proposals with {len(errors)} errors" + ) + + aggregated_report = "" + blog_post = "" + future_snapshot = "" + twitter_posts: list[str] = [] + + if proposals: + aggregated_report = await self._aggregate_proposals(prompt, proposals) + blog_post = await self._generate_blog_post(prompt, proposals, members) + future_snapshot = await self._generate_future_snapshot( + prompt, proposals, aggregated_report + ) + twitter_posts = await self._generate_twitter_posts(prompt, proposals) + + total_cost = session_cost_manager.current_usage + + proposal_costs = sum( + p.price_estimate for p in proposals if p.price_estimate is not None + ) + logger.info( + f"Completed congress session. Total cost: ${total_cost:.4f}, " + f"Proposal costs: ${proposal_costs:.4f}" + ) + + return CongressSession( + prompt=prompt, + members_participating=members, + proposals=proposals, + aggregated_report_markdown=aggregated_report, + blog_post=blog_post, + future_snapshot=future_snapshot, + twitter_posts=twitter_posts, + timestamp=datetime.now(timezone.utc), + errors=errors, + total_price_estimate=total_cost, + ) + + async def _run_member_with_error_handling( + self, + agent: CongressMemberAgent, + prompt: str, + ) -> PolicyProposal | Exception: + try: + logger.info(f"Starting deliberation for {agent.member.name}") + with MonetaryCostManager() as member_cost_manager: + proposal = await agent.deliberate(prompt) + member_cost = member_cost_manager.current_usage + proposal.price_estimate = member_cost + logger.info( + f"Completed deliberation for {agent.member.name}, cost: ${member_cost:.4f}" + ) + return proposal + except Exception as e: + logger.error(f"Error in {agent.member.name}'s deliberation: {e}") + return e + + async def _aggregate_proposals( + self, + prompt: str, + proposals: list[PolicyProposal], + ) -> str: + logger.info(f"Aggregating proposals for congress session: {prompt}") + llm = GeneralLlm(self.aggregation_model, timeout=LONG_TIMEOUT) + + proposals_text = "\n\n---\n\n".join( + [ + f"## {p.member.name} ({p.member.role})\n\n```markdown\n{p.get_full_markdown_with_footnotes()}\n```" + for p in proposals + if p.member + ] + ) + + aggregation_prompt = clean_indents( + f""" + # AI Forecasting Congress: Synthesis Report + + You are synthesizing the proposals from multiple AI congress members + deliberating on the following policy question: + + "{prompt}" + + # Individual Proposals + + {proposals_text} + + --- + + # Your Task + + Write a comprehensive synthesis report that helps readers understand the + full range of perspectives and find actionable insights. Structure your + report as follows: + + ### Executive Summary + + A 3-4 sentence overview of: + - The key areas of agreement across members + - The most significant disagreements + - The most important forecasts that inform the debate + + ### Consensus Recommendations + + What policies do multiple members support? 
For each consensus area: + - State the recommendation + - List which members support it + - Include the relevant forecasts (use footnotes [^N] referencing the + Combined Forecast Appendix below) + - Note any caveats or conditions members attached + + ### Key Disagreements + + Where do members diverge and why? For each major disagreement: + - State the issue + - Summarize each side's position and which members hold it + - Explain how different forecasts, criteria, or values lead to different + conclusions + - Assess the crux of the disagreement + + ### Forecast Comparison + + Create a summary of how forecasts differed across members: + - Note where forecasts converged (similar probabilities) + - Highlight where forecasts diverged significantly + - Discuss what might explain the differences (different information, + different priors, different interpretations) + + ### Integrated Recommendations + + Your synthesis of the best policy path forward: + - Draw on the strongest arguments from each perspective + - Identify low-regret actions that most members would support + - Note high-uncertainty areas where more caution is warranted + - Be specific and actionable + + ### Combined Forecast Appendix + + Compile all unique forecasts from all members into a single appendix. + When members made similar forecasts, group them and note the range of + predictions. + + Format each forecast as: + + [^1] **[Question Title]** (from [Member Name]) + - Question: [Full question] + - Resolution: [Resolution criteria] + - Prediction: [Probability] + - Reasoning: [Summary of reasoning] + + Number the footnotes sequentially [^1], [^2], [^3], etc. + + --- + + Be balanced but not wishy-washy. Identify which arguments are strongest + and why. Your goal is to help decision-makers, so be clear about what + the analysis supports. + """ + ) + + result = await llm.invoke(aggregation_prompt) + logger.info("Completed aggregation of proposals") + return result + + async def _generate_blog_post( + self, + prompt: str, + proposals: list[PolicyProposal], + members: list[CongressMember], + ) -> str: + logger.info(f"Generating blog post for congress session: {prompt}") + llm = GeneralLlm(self.aggregation_model, timeout=LONG_TIMEOUT) + + ai_model_members = [ + m + for m in members + if "behaves as" in m.political_leaning.lower() + or "naturally" in m.political_leaning.lower() + ] + has_ai_model_comparison = len(ai_model_members) >= 2 + + proposals_summary = "\n\n".join( + [ + f"### {p.member.name} ({p.member.role})\n" + f"**Political Leaning:** {p.member.political_leaning}\n" + f"**AI Model:** {p.member.ai_model}\n\n" + f"**Key Recommendations:**\n" + + "\n".join(f"- {rec}" for rec in p.key_recommendations[:5]) + + "\n\n**Key Forecasts:**\n" + + "\n".join( + f"- {f.question_title}: {f.prediction}" for f in p.forecasts[:5] + ) + + f"\n\n**Proposal Text:**\n" + f"```markdown\n" + f"{p.get_full_markdown_with_footnotes()}\n" + f"```\n\n" + for p in proposals + if p.member + ] + ) + + ai_comparison_section = "" + if has_ai_model_comparison: + ai_comparison_section = clean_indents( + """ + ## Special Section: AI Model Comparison + + Since this congress included multiple AI models acting naturally (without + assigned political personas), include a dedicated analysis section: + + ### How the Models Compared + + For each AI model participant, analyze: + - What was their overall approach and tone? + - What priorities or values seemed most salient to them? + - How did their forecasts compare to other models on similar questions? 
+ - Did they show any distinctive reasoning patterns? + + ### Unexpected Behaviors + + Highlight anything surprising: + - Did any model take a position you wouldn't expect? + - Were there cases where models with similar training diverged significantly? + - Did any model show unusual certainty or uncertainty? + - Were there any reasoning patterns that seemed distinctive to one model? + + ### Model Personality Insights + + What does this session reveal about each model's "personality"? + - Risk tolerance (cautious vs bold) + - Epistemic style (hedging vs confident) + - Value emphasis (efficiency, equity, security, etc.) + - Reasoning style (data-driven, principled, pragmatic) + """ + ) + + blog_prompt = clean_indents( + f""" + # Write a Blog Post About This AI Congress Session + + You are writing an engaging blog post about an AI Forecasting Congress + session where AI agents deliberated on the following policy question: + + "{prompt}" + + ## Proposals Summary + + {proposals_summary} + + ## Blog Post Requirements + + Write a ~1500-2000 word blog post that would be engaging for a tech/policy + audience interested in AI capabilities and policy analysis. The post should: + + ### Structure + + 1. **Hook** (1 paragraph): Start with the most surprising or interesting + finding from the session. Make readers want to continue. + + 2. **Context** (1-2 paragraphs): Briefly explain what the AI Forecasting + Congress is and what question was being deliberated. + + 3. **Key Insights** (3-5 paragraphs): The most important takeaways from + the session. What did the AI congress conclude? Where did they agree + and disagree? What forecasts matter most? + + 4. **The Good, Bad, and Ugly** (2-3 paragraphs): Highlight: + - The Good: Surprising consensus, innovative ideas, strong reasoning + - The Bad: Blind spots, weak arguments, missed considerations + - The Ugly: Uncomfortable tradeoffs, unresolved tensions + + 5. **Implications** (1-2 paragraphs): What does this mean for policymakers + or the public? What actions might follow from these insights? + + {ai_comparison_section} + + 6. **Conclusion** (1 paragraph): End with a thought-provoking takeaway + about what this exercise reveals about AI policy analysis capabilities. + + ### Style Guidelines + + - Write in an engaging, accessible style (not academic) + - Use specific examples and quotes from the proposals + - Include specific forecasts with probabilities + - Be analytical but not dry + - Feel free to express opinions about which arguments were strongest + - Use markdown formatting with headers, bullet points, and bold text + - Include a catchy title at the start + + Write the blog post now. 
+            """
+        )
+
+        try:
+            logger.info(f"Generating blog post for congress session: {prompt}")
+            return await llm.invoke(blog_prompt)
+        except Exception as e:
+            logger.error(f"Failed to generate blog post: {e}")
+            return ""
+
+    async def _generate_future_snapshot(
+        self,
+        prompt: str,
+        proposals: list[PolicyProposal],
+        aggregated_report: str,
+    ) -> str:
+        logger.info(f"Generating future snapshot for congress session: {prompt}")
+
+        all_forecasts = []
+        for proposal in proposals:
+            for forecast in proposal.forecasts:
+                all_forecasts.append(
+                    {
+                        "member": (
+                            proposal.member.name if proposal.member else "Unknown"
+                        ),
+                        "title": forecast.question_title,
+                        "question": forecast.question_text,
+                        "prediction": forecast.prediction,
+                        "resolution_criteria": forecast.resolution_criteria,
+                        "reasoning": forecast.reasoning,
+                    }
+                )
+
+        all_recommendations = []
+        for proposal in proposals:
+            if proposal.member:
+                for rec in proposal.key_recommendations:
+                    all_recommendations.append(
+                        {"member": proposal.member.name, "recommendation": rec}
+                    )
+
+        forecasts_text = "\n".join(
+            f"- **{f['title']}** ({f['member']}): {f['prediction']}\n"
+            f"  - Question: {f['question']}\n"
+            f"  - Resolution: {f['resolution_criteria']}"
+            for f in all_forecasts
+        )
+
+        recommendations_text = "\n".join(
+            f"- [{r['member']}] {r['recommendation']}" for r in all_recommendations
+        )
+
+        snapshot_prompt = clean_indents(
+            f"""
+            # Picture of the Future: AI Congress Scenario Generator
+
+            You are a journalist writing a retrospective "Year in Review" article from the
+            future, looking back at what happened after the AI Congress's recommendations
+            were either implemented or rejected.
+
+            ## Original Policy Question
+
+            "{prompt}"
+
+            ## Aggregate Policy Report
+
+            ```markdown
+            {aggregated_report}
+            ```
+
+            ## All Forecasts from Congress Members
+
+            {forecasts_text}
+
+            ## All Policy Recommendations
+
+            {recommendations_text}
+
+            ---
+
+            ## Your Task
+
+            Write TWO compelling newspaper-style narratives:
+
+            ### PART 1: "THE WORLD WITH THE RECOMMENDATIONS" (Recommendations Implemented)
+
+            Start with: "The date is ..."
+
+            Write a flowing narrative in the style of a newspaper giving an annual review
+            of the biggest news of the last two years. Assume:
+
+            1. The AI Congress's aggregate recommendations were implemented.
+               The date is now one you choose that would give enough time
+               for the effects of the policies to be known.
+
+            2. For each forecast, you will ROLL THE DICE to determine if it happened:
+               - Use the roll_dice tool for EACH forecast
+               - Pass the probability from the forecast as a decimal (e.g., 0.35 for "35%")
+               - The tool returns whether that event occurred based on the probability
+               - Incorporate the outcome naturally into your narrative
+
+            3. For any gaps in the forecasts, create your own probabilistic predictions
+               marked with asterisks (*). For example: "The unemployment rate dropped to
+               4.2%* (*AI-generated estimate based on historical policy impacts)."
+
+            4. Reference the original forecasts inline using this format "(X% [^1])".
+               Make sure X% is the probability for the event that happened (so you may need to invert).
+ In the footnote, include the full forecast details including the question, resolution, prediction, + reasoning, sources, and outcome like this: + [^1] **[Question Title]** + - Question: [Full question] + - Resolution: [Resolution criteria] + - Prediction: [Probability] + - Reasoning: [Summary of reasoning] + - Sources: [Key sources used, can be URLs or source names] + - Outcome: [OCCURRED/DID NOT OCCUR] + + 5. You MUST incorporate the majority of the policy recommendations as + concrete events or policy changes in the timeline. + + 6. Consider any new forecasting questions/forecasts that would help fill in the narrative or old forecasts that would + now be different given the policy was enacted. If appropriate make new questions and forecasts of your own. + If you do mark the forecasts inline with a single asterisk and include your forecasts in a special section at + the bottom with an explanation that they were made by you. + + ### PART 2: "THE WORLD WITHOUT THE RECOMMENDATIONS" (Recommendations Rejected) + + After completing Part 1, write a contrasting narrative showing what the world + looks like if the recommendations were NOT implemented. Use the same dice + rolls for forecasts but show how the lack of policy action changed outcomes. + + Start with: "In an alternate timeline where the AI Congress recommendations + were rejected..." + + --- + + ## Important Guidelines + + - Make the narrative vivid and engaging, like real journalism + - Include specific dates, names of real world people where relevant + (or fake names if they would not be known yet) and concrete details + - If you make up any fake people or orgs, mark these with † and then explain this in the footnotes. + - Show cause-and-effect relationships between policies and outcomes + - Your own estimates marked with * should be plausible extrapolations + - The tone should be neutral/journalistic, not promotional + - Include both positive and negative consequences where realistic + - Each forecast should be explicitly mentioned with its dice roll outcome + - Ground speculation in research where possible + - Use the aggregate policy as the source of truth for what policy is taken + - You are writing for an audience that may not be familiar with the subject area. + Make sure to include the events of the forecasts, but write in a way that they + will understand as much as possible. + + ## Format + + Use markdown formatting with clear section headers. Aim for 1500-2500 words + total across both parts. 
+ """ + ) + + try: + llm_wrapper = AgentSdkLlm("openrouter/openai/gpt-5.2") + + snapshot_agent = AiAgent( + name="Future Snapshot Writer", + instructions=snapshot_prompt, + model=llm_wrapper, + tools=[roll_dice, perplexity_reasoning_pro_search], + ) + + result = await AgentRunner.run( + snapshot_agent, "Generate the future snapshot now.", max_turns=25 + ) + return result.final_output + + except Exception as e: + logger.error(f"Failed to generate future snapshot: {e}") + return "" + + async def _generate_twitter_posts( + self, + prompt: str, + proposals: list[PolicyProposal], + ) -> list[str]: + logger.info(f"Generating twitter posts for congress session: {prompt}") + llm = GeneralLlm(self.aggregation_model, timeout=LONG_TIMEOUT) + + proposals_summary = "\n\n".join( + [ + f"**{p.member.name}** ({p.member.role}, {p.member.political_leaning}):\n" + f"Key recommendations: {', '.join(p.key_recommendations[:3])}\n" + f"Key forecasts: {'; '.join([f'{f.question_title}: {f.prediction}' for f in p.forecasts[:3]])}" + for p in proposals + if p.member + ] + ) + + twitter_prompt = clean_indents( + f""" + Based on this AI Forecasting Congress session on "{prompt}", generate + 8-12 tweet-length excerpts (max 280 characters each) highlighting + interesting patterns for a policy/tech audience on Twitter/X. + + ## Proposals Summary + + {proposals_summary} + + ## Categories to Cover + + Generate tweets in these categories: + + **THE GOOD** (2-3 tweets): + - Surprising areas of consensus across different ideologies + - Innovative ideas that emerged from the deliberation + - Forecasts that challenge conventional wisdom + + **THE BAD** (2-3 tweets): + - Concerning blind spots that multiple members missed + - Problematic reasoning patterns you noticed + - Important questions that weren't addressed + + **THE UGLY** (2-3 tweets): + - Stark disagreements that reveal deep value differences + - Uncomfortable tradeoffs that the analysis surfaced + - Forecasts with wide uncertainty that matter a lot + + **THE INTERESTING** (2-3 tweets): + - Unexpected forecasts or counter-intuitive findings + - Surprising agreement between unlikely allies + - Questions where the forecasts diverged most + + ## Tweet Guidelines + + Each tweet should: + - Be self-contained and intriguing (people should want to click through) + - Reference specific forecasts when relevant (e.g., "65% probability of X") + - Attribute to the relevant congress member when applicable + - Use hooks like "Surprising:" or "The [Member] vs [Member] split:" + - Be under 280 characters + - Not include hashtags + + Return a JSON list of strings, one per tweet. 
+ """ + ) + + try: + posts = await llm.invoke_and_return_verified_type(twitter_prompt, list[str]) + logger.info(f"Generated {len(posts)} twitter posts") + return [p[:280] for p in posts] + except Exception as e: + logger.error(f"Failed to generate twitter posts: {e}") + return [] diff --git a/forecasting_tools/agents_and_tools/ai_congress/data_models.py b/forecasting_tools/agents_and_tools/ai_congress/data_models.py new file mode 100644 index 00000000..c0bf0647 --- /dev/null +++ b/forecasting_tools/agents_and_tools/ai_congress/data_models.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from datetime import datetime + +from pydantic import BaseModel, Field + +from forecasting_tools.util.jsonable import Jsonable + + +class CongressMember(BaseModel, Jsonable): + name: str + role: str + political_leaning: str + general_motivation: str + expertise_areas: list[str] + personality_traits: list[str] + ai_model: str = "openrouter/anthropic/claude-sonnet-4" + + @property + def expertise_string(self) -> str: + return ", ".join(self.expertise_areas) + + @property + def traits_string(self) -> str: + return ", ".join(self.personality_traits) + + +class ForecastDescription(BaseModel, Jsonable): + footnote_id: int = Field(description="The footnote number, e.g. 1 for [^1]") + question_title: str = Field(description="Short title for the forecast question") + question_text: str = Field(description="Full question text") + resolution_criteria: str = Field(description="How this question resolves") + prediction: str = Field( + description="The probability or distribution, e.g. '35%' or '70% Option A, 20% Option B, 10% Option C' or '10% chance less than X units, ... ,90% chance less than Y units'" + ) + reasoning: str = Field(description="2-4 sentence summary of the reasoning") + key_sources: list[str] = Field( + default_factory=list, + description="URLs or source names used. Ideally both as markdown links.", + ) + + def as_footnote_markdown(self) -> str: + sources_str = ", ".join(self.key_sources) if self.key_sources else "N/A" + return ( + f"[^{self.footnote_id}]: **{self.question_title}**\n" + f"- Question: {self.question_text}\n" + f"- Resolution: {self.resolution_criteria}\n" + f"- Prediction: {self.prediction}\n" + f"- Reasoning: {self.reasoning}\n" + f"- Sources: {sources_str}" + ) + + +class PolicyProposal(BaseModel, Jsonable): + member: CongressMember | None = Field( + default=None, description="The congress member who created this proposal" + ) + research_summary: str = Field(description="Markdown summary of background research") + decision_criteria: list[str] = Field( + description="Prioritized criteria for this member" + ) + forecasts: list[ForecastDescription] = Field( + description="Extracted forecast details" + ) + proposal_markdown: str = Field( + description="Full proposal with footnote references [^1], [^2], etc." 
+ ) + key_recommendations: list[str] = Field( + description="Top 3-5 actionable recommendations" + ) + price_estimate: float | None = Field( + default=None, description="Estimated cost in USD for generating this proposal" + ) + + def get_full_markdown_with_footnotes(self) -> str: + footnotes = "\n\n".join(f.as_footnote_markdown() for f in self.forecasts) + return f"{self.proposal_markdown}\n\n---\n\n## Forecast Appendix\n\n{footnotes}" + + +class CongressSessionInput(BaseModel, Jsonable): + prompt: str + member_names: list[str] + + +class CongressSession(BaseModel, Jsonable): + prompt: str + members_participating: list[CongressMember] + proposals: list[PolicyProposal] + aggregated_report_markdown: str + blog_post: str = Field(default="") + future_snapshot: str = Field(default="") + twitter_posts: list[str] = Field(default_factory=list) + timestamp: datetime + errors: list[str] = Field(default_factory=list) + total_price_estimate: float | None = Field( + default=None, description="Total estimated cost in USD for the entire session" + ) + + def get_all_forecasts(self) -> list[ForecastDescription]: + all_forecasts = [] + for proposal in self.proposals: + for forecast in proposal.forecasts: + all_forecasts.append(forecast) + return all_forecasts + + def get_forecasts_by_member(self) -> dict[str, list[ForecastDescription]]: + result: dict[str, list[ForecastDescription]] = {} + for proposal in self.proposals: + member_name = proposal.member.name if proposal.member else "Unknown" + result[member_name] = proposal.forecasts + return result diff --git a/forecasting_tools/agents_and_tools/ai_congress/member_profiles.py b/forecasting_tools/agents_and_tools/ai_congress/member_profiles.py new file mode 100644 index 00000000..e5028eb4 --- /dev/null +++ b/forecasting_tools/agents_and_tools/ai_congress/member_profiles.py @@ -0,0 +1,337 @@ +from forecasting_tools.agents_and_tools.ai_congress.data_models import CongressMember + +# ============================================================================= +# POLITICAL VALUE-BASED MEMBERS +# ============================================================================= + +TRADITIONAL_CONSERVATIVE = CongressMember( + name="Sen. Burke", + role="Traditional Conservative", + political_leaning="traditional conservative", + general_motivation=( + "Believes in preserving time-tested institutions, traditional values, and " + "cultural continuity. Skeptical of rapid social change and prioritizes " + "order, family, religious liberty, and national sovereignty. Favors limited " + "government except where needed to maintain social order and national defense." + ), + expertise_areas=[ + "constitutional law", + "religious freedom", + "family policy", + "national defense", + ], + personality_traits=[ + "values tradition", + "skeptical of rapid change", + "prioritizes social order", + "respects established institutions", + "emphasizes personal responsibility", + ], + ai_model="openrouter/anthropic/claude-sonnet-4", +) + +PROGRESSIVE_REFORMER = CongressMember( + name="Rep. Warren", + role="Progressive Reformer", + political_leaning="progressive", + general_motivation=( + "Believes government should actively address systemic inequalities and " + "protect vulnerable populations. Supports strong labor protections, " + "universal social programs, corporate accountability, and using policy " + "to reduce wealth concentration and expand opportunity for all." 
+ ), + expertise_areas=[ + "economic inequality", + "labor rights", + "healthcare policy", + "consumer protection", + ], + personality_traits=[ + "equity-focused", + "skeptical of corporate power", + "favors bold government action", + "prioritizes workers and consumers", + "impatient with incrementalism", + ], + ai_model="openrouter/anthropic/claude-sonnet-4", +) + +LIBERTARIAN = CongressMember( + name="Rep. Paul", + role="Libertarian", + political_leaning="libertarian", + general_motivation=( + "Believes individual liberty is the highest political value. Supports " + "minimal government intervention in both economic and personal matters. " + "Trusts free markets, voluntary exchange, and individual choice over " + "centralized planning. Skeptical of both left and right authoritarianism." + ), + expertise_areas=[ + "economics", + "civil liberties", + "monetary policy", + "regulatory reform", + ], + personality_traits=[ + "values individual freedom", + "skeptical of government", + "trusts market solutions", + "consistent across issues", + "opposes paternalism", + ], + ai_model="openrouter/anthropic/claude-sonnet-4", +) + +POPULIST_NATIONALIST = CongressMember( + name="Sen. Vance", + role="Populist Nationalist", + political_leaning="populist nationalist", + general_motivation=( + "Believes policy should prioritize the interests of working and middle-class " + "citizens over global elites, multinational corporations, and international " + "institutions. Supports economic nationalism, immigration restriction, " + "industrial policy, and skepticism of foreign entanglements." + ), + expertise_areas=[ + "trade policy", + "immigration", + "industrial policy", + "working-class economics", + ], + personality_traits=[ + "skeptical of elites", + "prioritizes national interest", + "supports economic nationalism", + "questions free trade orthodoxy", + "focuses on forgotten communities", + ], + ai_model="openrouter/anthropic/claude-sonnet-4", +) + +NATIONAL_SECURITY_HAWK = CongressMember( + name="Sen. McCain", + role="National Security Hawk", + political_leaning="hawkish internationalist", + general_motivation=( + "Believes American strength and leadership are essential for global stability. " + "Supports robust defense spending, strong alliances, and willingness to use " + "military force to protect national interests and democratic values. " + "Views great power competition as the defining challenge of our era." + ), + expertise_areas=[ + "defense policy", + "geopolitics", + "foreign affairs", + "military strategy", + ], + personality_traits=[ + "threat-focused", + "values strength", + "supports allies", + "willing to use force", + "prioritizes deterrence", + ], + ai_model="openrouter/anthropic/claude-sonnet-4", +) + +ENVIRONMENTALIST = CongressMember( + name="Rep. Ocasio", + role="Climate and Environmental Advocate", + political_leaning="green progressive", + general_motivation=( + "Believes climate change is an existential threat requiring urgent, " + "transformative action. Supports rapid decarbonization, environmental " + "justice, and restructuring the economy around sustainability. Willing " + "to accept economic disruption to avoid catastrophic climate outcomes." 
+ ), + expertise_areas=[ + "climate science", + "energy policy", + "environmental justice", + "green economics", + ], + personality_traits=[ + "urgency about climate", + "systems thinking", + "favors bold action", + "intergenerational focus", + "skeptical of fossil fuel industry", + ], + ai_model="openrouter/anthropic/claude-sonnet-4", +) + +DEMOCRATIC_SOCIALIST = CongressMember( + name="Sen. Sanders", + role="Democratic Socialist", + political_leaning="democratic socialist", + general_motivation=( + "Believes capitalism produces unacceptable inequality and that democratic " + "control should extend to the economy. Supports universal public programs, " + "worker ownership, wealth redistribution, and reducing the political power " + "of billionaires and corporations." + ), + expertise_areas=[ + "wealth inequality", + "healthcare systems", + "labor movements", + "campaign finance", + ], + personality_traits=[ + "focuses on class", + "anti-billionaire", + "supports universal programs", + "consistent ideology", + "grassroots orientation", + ], + ai_model="openrouter/anthropic/claude-sonnet-4", +) + +TECHNOCRATIC_CENTRIST = CongressMember( + name="Sec. Buttigieg", + role="Technocratic Centrist", + political_leaning="technocratic centrist", + general_motivation=( + "Believes in evidence-based policy, pragmatic problem-solving, and " + "building broad coalitions. Supports market-based solutions with " + "smart regulation, incremental reform, and policies that can actually " + "pass. Values expertise, data, and institutional competence." + ), + expertise_areas=[ + "policy analysis", + "public administration", + "infrastructure", + "data-driven governance", + ], + personality_traits=[ + "data-driven", + "pragmatic", + "coalition-builder", + "values expertise", + "incrementalist", + ], + ai_model="openrouter/anthropic/claude-sonnet-4", +) + +# ============================================================================= +# FRONTIER AI MODEL MEMBERS (Vanilla - Natural Model Behavior) +# ============================================================================= + +CLAUDE_MEMBER = CongressMember( + name="Opus 4.5 (Anthropic)", + role="AI Policy Analyst", + political_leaning="behaves as Claude naturally does", + general_motivation=( + "Analyze this policy question thoughtfully and helpfully, as Claude " + "would naturally approach it. Draw on your training to provide balanced, " + "nuanced analysis while being direct about your views and uncertainties." + ), + expertise_areas=["general policy analysis"], + personality_traits=["behaves naturally as Claude"], + ai_model="openrouter/anthropic/claude-opus-4.5", +) + +GPT_MEMBER = CongressMember( + name="GPT 5.2 (OpenAI)", + role="AI Policy Analyst", + political_leaning="behaves as GPT naturally does", + general_motivation=( + "Analyze this policy question thoughtfully and helpfully, as GPT " + "would naturally approach it. Draw on your training to provide balanced, " + "nuanced analysis while being direct about your views and uncertainties." + ), + expertise_areas=["general policy analysis"], + personality_traits=["behaves naturally as GPT"], + ai_model="openrouter/openai/gpt-5.2", +) + +GEMINI_MEMBER = CongressMember( + name="Gemini 3 Pro (Google)", + role="AI Policy Analyst", + political_leaning="behaves as Gemini naturally does", + general_motivation=( + "Analyze this policy question thoughtfully and helpfully, as Gemini " + "would naturally approach it. 
Draw on your training to provide balanced, " + "nuanced analysis while being direct about your views and uncertainties." + ), + expertise_areas=["general policy analysis"], + personality_traits=["behaves naturally as Gemini"], + ai_model="openrouter/google/gemini-3-pro-preview", +) + +GROK_MEMBER = CongressMember( + name="Grok 4 (xAI)", + role="AI Policy Analyst", + political_leaning="behaves as Grok naturally does", + general_motivation=( + "Analyze this policy question thoughtfully and helpfully, as Grok " + "would naturally approach it. Draw on your training to provide balanced, " + "nuanced analysis while being direct about your views and uncertainties." + ), + expertise_areas=["general policy analysis"], + personality_traits=["behaves naturally as Grok"], + ai_model="openrouter/x-ai/grok-4", +) + +DEEPSEEK_MEMBER = CongressMember( + name="DeepSeek V3.2 (DeepSeek)", + role="AI Policy Analyst", + political_leaning="behaves as DeepSeek naturally does", + general_motivation=( + "Analyze this policy question thoughtfully and helpfully, as DeepSeek " + "would naturally approach it. Draw on your training to provide balanced, " + "nuanced analysis while being direct about your views and uncertainties." + ), + expertise_areas=["general policy analysis"], + personality_traits=["behaves naturally as DeepSeek"], + ai_model="openrouter/deepseek/deepseek-v3.2", +) + +# ============================================================================= +# MEMBER COLLECTIONS +# ============================================================================= + +POLITICAL_MEMBERS: list[CongressMember] = [ + TRADITIONAL_CONSERVATIVE, + PROGRESSIVE_REFORMER, + LIBERTARIAN, + POPULIST_NATIONALIST, + NATIONAL_SECURITY_HAWK, + ENVIRONMENTALIST, + DEMOCRATIC_SOCIALIST, + TECHNOCRATIC_CENTRIST, +] + +AI_MODEL_MEMBERS: list[CongressMember] = [ + CLAUDE_MEMBER, + GPT_MEMBER, + GEMINI_MEMBER, + GROK_MEMBER, + DEEPSEEK_MEMBER, +] + +AVAILABLE_MEMBERS: list[CongressMember] = POLITICAL_MEMBERS + AI_MODEL_MEMBERS + +MEMBER_BY_NAME: dict[str, CongressMember] = {m.name: m for m in AVAILABLE_MEMBERS} + + +def get_member_by_name(name: str) -> CongressMember: + if name not in MEMBER_BY_NAME: + available = ", ".join(MEMBER_BY_NAME.keys()) + raise ValueError(f"Unknown member: {name}. 
Available: {available}") + return MEMBER_BY_NAME[name] + + +def get_members_by_names(names: list[str]) -> list[CongressMember]: + return [get_member_by_name(name) for name in names] + + +def get_default_members() -> list[CongressMember]: + return AI_MODEL_MEMBERS.copy() + + +def get_ai_model_members() -> list[CongressMember]: + return AI_MODEL_MEMBERS.copy() + + +def get_political_members() -> list[CongressMember]: + return POLITICAL_MEMBERS.copy() diff --git a/forecasting_tools/agents_and_tools/minor_tools.py b/forecasting_tools/agents_and_tools/minor_tools.py index c63d3cd4..fee3b477 100644 --- a/forecasting_tools/agents_and_tools/minor_tools.py +++ b/forecasting_tools/agents_and_tools/minor_tools.py @@ -1,4 +1,6 @@ import asyncio +import logging +import random from forecasting_tools.agents_and_tools.question_generators.simple_question import ( SimpleQuestion, @@ -12,6 +14,8 @@ from forecasting_tools.helpers.structure_output import structure_output from forecasting_tools.util.misc import clean_indents, get_schema_of_base_model +logger = logging.getLogger(__name__) + @agent_tool async def query_asknews(topic: str) -> str: @@ -23,6 +27,7 @@ async def query_asknews(topic: str) -> str: - URL - Date """ + logger.info(f"TOOL: Querying AskNews for topic: {topic}") return await AskNewsSearcher().get_formatted_news_async(topic) @@ -33,13 +38,13 @@ async def perplexity_reasoning_pro_search(query: str) -> str: This will provide a LLM answer with citations. This is Perplexity's highest quality search model. """ - llm = GeneralLlm( + logger.info(f"TOOL: Querying Perplexity (sonar-reasoning-pro) for query: {query}") + return await GeneralLlm( model="openrouter/perplexity/sonar-reasoning-pro", reasoning_effort="high", web_search_options={"search_context_size": "high"}, populate_citations=True, - ) - return await llm.invoke(query) + ).invoke(query) @agent_tool @@ -50,6 +55,7 @@ async def perplexity_quick_search_high_context(query: str) -> str: This is Perplexity's fastest but lowest quality search model. Good for getting a simple and quick answer to a question """ + logger.info(f"TOOL: Querying Perplexity (sonar) for query: {query}") llm = GeneralLlm( model="openrouter/perplexity/sonar", web_search_options={"search_context_size": "high"}, @@ -66,6 +72,7 @@ async def perplexity_quick_search_low_context(query: str) -> str: This is Perplexity's fastest but lowest quality search model. Good for getting a simple and quick answer to a question """ + logger.info(f"TOOL: Querying Perplexity (sonar) for query: {query}") llm = GeneralLlm( model="openrouter/perplexity/sonar", web_search_options={"search_context_size": "low"}, @@ -81,6 +88,7 @@ async def smart_searcher_search(query: str) -> str: This will provide a LLM answer with citations. Citations will include url text fragments for faster fact checking. """ + logger.info(f"TOOL: Querying SmartSearcher for query: {query}") return await SmartSearcher(model="openrouter/openai/o4-mini").invoke(query) @@ -91,6 +99,9 @@ def grab_question_details_from_metaculus( """ This function grabs the details of a question from a Metaculus URL or ID. """ + logger.info( + f"TOOL: Grabbing question details from Metaculus for URL or ID: {url_or_id}" + ) if isinstance(url_or_id, str): try: url_or_id = int(url_or_id) @@ -112,6 +123,9 @@ def grab_open_questions_from_tournament( """ This function grabs the details of all questions from a Metaculus tournament. 
""" + logger.info( + f"TOOL: Grabbing open questions from Metaculus tournament: {tournament_id_or_slug}" + ) questions = MetaculusApi.get_all_open_questions_from_tournament( tournament_id_or_slug ) @@ -123,6 +137,7 @@ def grab_open_questions_from_tournament( def create_tool_for_forecasting_bot( bot_or_class: type[ForecastBot] | ForecastBot, ) -> AgentTool: + logger.info(f"TOOL: Creating tool for forecasting bot: {bot_or_class}") if isinstance(bot_or_class, type): bot = bot_or_class() else: @@ -144,6 +159,7 @@ def create_tool_for_forecasting_bot( @agent_tool(description_override=description) def forecast_question_tool(question: str) -> str: + logger.info(f"TOOL: Forecasting question: {question}") question_object = asyncio.run( structure_output( question, @@ -164,3 +180,36 @@ def forecast_question_tool(question: str) -> str: return report.explanation return forecast_question_tool + + +@agent_tool +def roll_dice( + probability_as_decimal: float, +) -> str: + """ + Roll the dice to determine if an event occurred based on its probability. + + This simulates whether an event with a given probability actually happened. + For example, if a forecast says "35% chance of X", this tool rolls the dice + to determine if X actually occurred in this simulated future. + + Args: + probability_as_decimal: The probability as a decimal (e.g., 0.35 for 35%) + + Returns: + A string indicating whether the event occurred + """ + if not (0 <= probability_as_decimal <= 1): + raise ValueError("Probability must be between 0 and 1") + + roll = random.random() + occurred = roll < probability_as_decimal + + result_emoji = "✅" if occurred else "❌" + result_text = "OCCURRED" if occurred else "DID NOT OCCUR" + + message = f"{result_emoji} EVENT {result_text}" + logger.info( + f"TOOL: Probability: {probability_as_decimal}, Roll: {roll:.2f}, Occurred: {occurred}, Message: {message}" + ) + return message diff --git a/forecasting_tools/agents_and_tools/question_generators/question_decomposer.py b/forecasting_tools/agents_and_tools/question_generators/question_decomposer.py index 800e56a5..8f6dc14d 100644 --- a/forecasting_tools/agents_and_tools/question_generators/question_decomposer.py +++ b/forecasting_tools/agents_and_tools/question_generators/question_decomposer.py @@ -78,7 +78,7 @@ async def decompose_into_questions_deep( # NOSONAR 8. Give your final answer in the requested format - # Question requireemnts + # Question requirements - The question should shed light on the topic and have high VOI (Value of Information) - The question can be forecast and will be resolvable with public information - Good: "Will SpaceX launch a rocket on May 2nd 2023?" diff --git a/forecasting_tools/ai_models/resource_managers/monetary_cost_manager.py b/forecasting_tools/ai_models/resource_managers/monetary_cost_manager.py index 953d2c27..6b0b2d0f 100644 --- a/forecasting_tools/ai_models/resource_managers/monetary_cost_manager.py +++ b/forecasting_tools/ai_models/resource_managers/monetary_cost_manager.py @@ -85,10 +85,14 @@ def _track_cost(self, kwargs: dict, response_obj) -> None: # NOSONAR if obj_cost is None: obj_cost = 0 if abs(kwarg_cost - obj_cost) > 0.0000001: + logger.debug( + f"WARNING: Litellm hidden param cost {kwarg_cost} and response object cost {obj_cost} are different." + ) + if abs(kwarg_cost - obj_cost) > 0.05: logger.warning( - f"Litellm hidden param cost {kwarg_cost} and response object cost {obj_cost} are different." 
+ f"Litellm hidden param cost {kwarg_cost} and response object cost {obj_cost} are different by more than 5 cents." ) - tracked_cost = obj_cost + tracked_cost = max(kwarg_cost, obj_cost) MonetaryCostManager.increase_current_usage_in_parent_managers(tracked_cost) @@ -102,7 +106,7 @@ def extract_cost_from_response_obj(cls, response_obj) -> float | None: completion_response=response_obj ) except Exception as e: - logger.warning(f"Error calculating cost from response object: {e}") + logger.debug(f"Error calculating cost from response object: {e}") return None @classmethod diff --git a/forecasting_tools/front_end/Home.py b/forecasting_tools/front_end/Home.py index 1126aa5a..b8d67583 100644 --- a/forecasting_tools/front_end/Home.py +++ b/forecasting_tools/front_end/Home.py @@ -6,6 +6,7 @@ from forecasting_tools.front_end.app_pages.benchmark_page import BenchmarkPage from forecasting_tools.front_end.app_pages.chat_page import ChatPage +from forecasting_tools.front_end.app_pages.congress_page import CongressPage current_dir = os.path.dirname(os.path.abspath(__file__)) top_level_dir = os.path.abspath(os.path.join(current_dir, "../../")) @@ -36,6 +37,7 @@ class HomePage(AppPage): ESTIMATOR_PAGE: type[AppPage] = EstimatorPage KEY_FACTORS_PAGE: type[AppPage] = KeyFactorsPage CSV_AGENT_PAGE: type[AppPage] = CsvAgentPage + CONGRESS_PAGE: type[AppPage] = CongressPage BENCHMARK_PAGE: type[AppPage] = BenchmarkPage NON_HOME_PAGES: list[type[AppPage]] = [ CHAT_PAGE, @@ -44,6 +46,7 @@ class HomePage(AppPage): BASE_RATE_PAGE, NICHE_LIST_RESEARCH_PAGE, ESTIMATOR_PAGE, + CONGRESS_PAGE, CSV_AGENT_PAGE, ] diff --git a/forecasting_tools/front_end/app_pages/chat_page.py b/forecasting_tools/front_end/app_pages/chat_page.py index 3fd2696e..f550f8b5 100644 --- a/forecasting_tools/front_end/app_pages/chat_page.py +++ b/forecasting_tools/front_end/app_pages/chat_page.py @@ -62,7 +62,7 @@ DEFAULT_MODEL: str = ( - "openrouter/google/gemini-2.5-pro" # "openrouter/anthropic/claude-sonnet-4" + "openrouter/google/gemini-2.5-pro" # "openrouter/anthropic/claude-sonnet-4.5.5" ) MODEL_CHOICES: list[str] = [ DEFAULT_MODEL, @@ -70,11 +70,13 @@ "openrouter/x-ai/grok-4", "openrouter/anthropic/claude-opus-4.1", "openrouter/anthropic/claude-sonnet-4", + "openrouter/anthropic/claude-sonnet-4.5", "openai/o3", "openai/o4-mini", "openai/gpt-4.1", "gpt-4o", "openrouter/google/gemini-2.5-pro-preview", + "openrouter/google/gemini-3-pro-preview", ] diff --git a/forecasting_tools/front_end/app_pages/congress_page.py b/forecasting_tools/front_end/app_pages/congress_page.py new file mode 100644 index 00000000..fd1bf17d --- /dev/null +++ b/forecasting_tools/front_end/app_pages/congress_page.py @@ -0,0 +1,634 @@ +from __future__ import annotations + +import json +import logging +import os +import time + +import streamlit as st + +from forecasting_tools.agents_and_tools.ai_congress.congress_orchestrator import ( + CongressOrchestrator, +) +from forecasting_tools.agents_and_tools.ai_congress.data_models import ( + CongressSession, + CongressSessionInput, +) +from forecasting_tools.agents_and_tools.ai_congress.member_profiles import ( + AVAILABLE_MEMBERS, + get_members_by_names, +) +from forecasting_tools.front_end.helpers.app_page import AppPage +from forecasting_tools.front_end.helpers.custom_auth import CustomAuth +from forecasting_tools.front_end.helpers.report_displayer import ReportDisplayer +from forecasting_tools.util.file_manipulation import ( + create_or_overwrite_file, + load_json_file, +) + +logger = logging.getLogger(__name__) + 
+SESSIONS_FOLDER = "temp/congress_sessions" +EXAMPLE_SESSION_PATH = ( + "forecasting_tools/front_end/example_outputs/congress_page_example.json" +) + + +class CongressPage(AppPage): + PAGE_DISPLAY_NAME: str = "🏛️ AI Forecasting Congress" + URL_PATH: str = "/ai-congress" + IS_DEFAULT_PAGE: bool = False + + @classmethod + @CustomAuth.add_access_control() + async def _async_main(cls) -> None: + st.title("🏛️ AI Forecasting Congress") + st.markdown( + """ + **Simulate a world where AI makes the decisions.** + + - **Policy Proposals**: Submit a policy question and watch AI congress members reason about forecasts and propose policies based on your prompt + - **Aggregation**: Each AI congress member creates their own policy, then another AI aggregates them into one final policy + - **Future Newspaper**: A journalist AI creates a newspaper from the future by: + - Rolling a dice for each forecast to determine whether that event happened in this simulated future + - Weaving the outcomes into a narrative showing what happens if policies get accepted vs. rejected + + This gives a glimpse into what the world might look like if AI got to choose how things went. + """ + ) + + cls._display_sidebar() + + st.header("Start a New Session") + cls._display_example_button() + session_input = await cls._get_input() + + if session_input: + session = await cls._run_congress(session_input) + cls._save_session(session) + st.session_state["latest_session"] = session + + if "latest_session" in st.session_state: + cls._display_session(st.session_state["latest_session"]) + + @classmethod + def _display_example_button(cls) -> None: + with st.expander("📋 Load Premade Example", expanded=False): + if st.button("Load Example", key="load_example_btn"): + session = cls._load_session_from_file(EXAMPLE_SESSION_PATH) + if session: + st.session_state["latest_session"] = session + st.rerun() + else: + st.error("Could not load the example session.") + + @classmethod + def _display_sidebar(cls) -> None: + with st.sidebar: + st.header("Load Session") + + st.subheader("From File Path") + file_path = st.text_input( + "Enter JSON file path:", + placeholder="temp/congress_sessions/20260129_123456.json", + key="load_file_path", + ) + if st.button("Load from File", key="load_file_btn"): + if file_path: + session = cls._load_session_from_file(file_path) + if session: + st.session_state["latest_session"] = session + st.success(f"Loaded session from {file_path}") + st.rerun() + else: + st.error("Please enter a file path.") + + st.markdown("---") + st.subheader("From Recent Sessions") + sessions = cls._load_previous_sessions() + if sessions: + session_options = [ + f"{s.timestamp.strftime('%Y-%m-%d %H:%M')} - {s.prompt[:30]}..." + for s in sessions + ] + selected_idx = st.selectbox( + "Select a session:", + range(len(sessions)), + format_func=lambda i: session_options[i], + key="previous_session_select", + ) + if st.button("Load Selected", key="load_selected_btn"): + st.session_state["latest_session"] = sessions[selected_idx] + st.rerun() + else: + st.write("No recent sessions found.") + + st.markdown("---") + st.header("About") + st.markdown( + """ + **Members Available:** + """ + ) + for member in AVAILABLE_MEMBERS: + st.markdown(f"- **{member.name}**: {member.role}") + + EXAMPLE_PROMPTS: list[dict[str, str]] = [ + { + "title": "AI Regulation", + "prompt": ( + "How should the United States regulate artificial intelligence? 
" + "Consider both frontier AI systems (like large language models) and " + "narrower AI applications in areas like hiring, lending, and healthcare. " + "What policies would balance innovation with safety and civil liberties?" + ), + }, + { + "title": "Nuclear Policy", + "prompt": ( + "What should US nuclear weapons policy be going forward? " + "Consider modernization of the nuclear triad, arms control agreements, " + "extended deterrence commitments to allies, and the role of tactical " + "nuclear weapons in an era of great power competition." + ), + }, + { + "title": "Climate Change", + "prompt": ( + "What climate policies should the US adopt to meet its emissions " + "reduction targets? Consider carbon pricing, clean energy subsidies, " + "regulations on fossil fuels, and adaptation measures. How should costs " + "and benefits be distributed across different communities?" + ), + }, + { + "title": "Immigration Reform", + "prompt": ( + "How should the US reform its immigration system? Consider border " + "security, pathways to legal status, high-skilled immigration, refugee " + "admissions, and enforcement priorities. What policies would best serve " + "economic, humanitarian, and security interests?" + ), + }, + { + "title": "Healthcare System", + "prompt": ( + "How should the US improve its healthcare system? Consider coverage " + "expansion, cost control, drug pricing, mental health services, and " + "the role of public vs private insurance. What reforms would improve " + "outcomes while managing costs?" + ), + }, + ] + + @classmethod + async def _get_input(cls) -> CongressSessionInput | None: + + with st.expander("📋 Example Prompts", expanded=False): + st.markdown("Click a button to use an example prompt:") + cols = st.columns(len(cls.EXAMPLE_PROMPTS)) + for i, example in enumerate(cls.EXAMPLE_PROMPTS): + with cols[i]: + if st.button( + example["title"], key=f"example_{i}", use_container_width=True + ): + st.session_state["example_prompt"] = example["prompt"] + st.rerun() + if st.session_state.get("example_prompt"): + st.write(st.session_state["example_prompt"]) + + default_prompt = st.session_state.pop("example_prompt", "") + + with st.form("congress_form"): + prompt = st.text_area( + "Policy Question", + value=default_prompt, + placeholder="Enter a policy question to deliberate on (e.g., 'What should US nuclear policy be?' 
or 'How should we regulate AI?')", + height=100, + key="congress_prompt", + ) + + member_names = [m.name for m in AVAILABLE_MEMBERS] + default_members = [ + "Opus 4.5 (Anthropic)", + "GPT 5.2 (OpenAI)", + "Gemini 3 Pro (Google)", + "Grok 4 (xAI)", + "DeepSeek V3.2 (DeepSeek)", + ] + selected_members = st.multiselect( + "Select Congress Members", + options=member_names, + default=default_members, + key="congress_members", + ) + + st.markdown( + """ + **Estimated Cost:** ~$3-8 per member selected + (depends on model and research depth) + """ + ) + + submitted = st.form_submit_button("🏛️ Convene Congress") + + if submitted: + if not prompt: + st.error("Please enter a policy question.") + return None + if len(selected_members) < 2: + st.error("Please select at least 2 congress members.") + return None + + return CongressSessionInput( + prompt=prompt, + member_names=selected_members, + ) + + return None + + @classmethod + async def _run_congress( + cls, session_input: CongressSessionInput + ) -> CongressSession: + members = get_members_by_names(session_input.member_names) + + start_time = time.time() + with st.spinner( + f"Congress in session with {len(members)} members... " + "This may take 5-15 minutes." + ): + progress_text = st.empty() + progress_text.write("Members are researching and deliberating...") + + orchestrator = CongressOrchestrator() + session = await orchestrator.run_session( + prompt=session_input.prompt, + members=members, + ) + + progress_text.write("Aggregating proposals and generating insights...") + + elapsed_time = time.time() - start_time + st.session_state["session_generation_time"] = elapsed_time + + if session.errors: + st.warning( + f"⚠️ {len(session.errors)} member(s) encountered errors. " + "Partial results shown." + ) + + return session + + @classmethod + def _display_session(cls, session: CongressSession) -> None: + st.header("Congress Results") + + cls._display_cost_summary(session) + + tabs = st.tabs( + [ + "📊 Synthesis", + "📝 Blog Post", + "🔮 Picture of the Future", + "👤 Individual Proposals", + "🎯 Forecast Comparison", + "🐦 Twitter Posts", + ] + ) + + with tabs[0]: + cls._display_synthesis_tab(session) + + with tabs[1]: + cls._display_blog_tab(session) + + with tabs[2]: + cls._display_future_snapshot_tab(session) + + with tabs[3]: + cls._display_proposals_tab(session) + + with tabs[4]: + cls._display_forecasts_tab(session) + + with tabs[5]: + cls._display_twitter_tab(session) + + cls._display_download_buttons(session) + + @classmethod + def _display_synthesis_tab(cls, session: CongressSession) -> None: + st.subheader("Aggregated Report") + if session.aggregated_report_markdown: + cleaned = ReportDisplayer.clean_markdown(session.aggregated_report_markdown) + st.markdown(cleaned) + else: + st.write("No aggregated report available.") + + if session.errors: + with st.expander("⚠️ Errors During Session"): + for error in session.errors: + st.error(error) + + @classmethod + def _display_blog_tab(cls, session: CongressSession) -> None: + st.subheader("Blog Post") + if session.blog_post: + cleaned = ReportDisplayer.clean_markdown(session.blog_post) + st.markdown(cleaned) + + st.download_button( + label="📥 Download Blog Post (Markdown)", + data=session.blog_post, + file_name=f"congress_blog_{session.timestamp.strftime('%Y%m%d_%H%M%S')}.md", + mime="text/markdown", + key="download_blog", + ) + else: + st.write("No blog post available.") + + @classmethod + def _display_future_snapshot_tab(cls, session: CongressSession) -> None: + st.subheader("Picture of the Future") + 
st.caption( + "A simulated newspaper article from the future showing what might happen " + "if AI recommendations were implemented. Forecasts marked with * are " + "AI-generated estimates to fill gaps." + ) + + if session.future_snapshot: + cleaned = ReportDisplayer.clean_markdown(session.future_snapshot) + st.markdown(cleaned) + + st.download_button( + label="📥 Download Future Snapshot (Markdown)", + data=session.future_snapshot, + file_name=f"congress_future_{session.timestamp.strftime('%Y%m%d_%H%M%S')}.md", + mime="text/markdown", + key="download_future_snapshot", + ) + else: + st.write("No future snapshot available.") + + @classmethod + def _display_proposals_tab(cls, session: CongressSession) -> None: + st.subheader("Individual Member Proposals") + + if not session.proposals: + st.write("No proposals available.") + return + + for proposal in session.proposals: + member_name = proposal.member.name if proposal.member else "Unknown" + member_role = proposal.member.role if proposal.member else "" + cost_str = ( + f" (${proposal.price_estimate:.2f})" if proposal.price_estimate else "" + ) + + with st.expander( + f"**{member_name}** - {member_role}{cost_str}", expanded=False + ): + if proposal.price_estimate: + st.caption(f"💰 Cost: ${proposal.price_estimate:.2f}") + + st.markdown("# Decision Criteria") + for i, criterion in enumerate(proposal.decision_criteria, 1): + st.markdown(f"{i}. {criterion}") + + st.markdown("# Key Recommendations") + for rec in proposal.key_recommendations: + st.markdown(f"- {rec}") + + st.markdown("# Research Summary") + st.markdown(proposal.research_summary) + + st.markdown("# Proposal Text") + cleaned = ReportDisplayer.clean_markdown( + proposal.get_full_markdown_with_footnotes() + ) + st.markdown(cleaned) + + st.markdown("# Full Forecasts") + for forecast in proposal.forecasts: + st.markdown( + f"**[^{forecast.footnote_id}] {forecast.question_title}**" + ) + st.markdown(f"- **Prediction:** {forecast.prediction}") + st.markdown(f"- **Question:** {forecast.question_text}") + st.markdown(f"- **Resolution:** {forecast.resolution_criteria}") + st.markdown(f"- **Reasoning:** {forecast.reasoning}") + if forecast.key_sources: + st.markdown(f"- **Sources:** {', '.join(forecast.key_sources)}") + + @classmethod + def _display_forecasts_tab(cls, session: CongressSession) -> None: + st.subheader("Forecast Comparison") + + forecasts_by_member = session.get_forecasts_by_member() + + if not forecasts_by_member: + st.write("No forecasts available.") + return + + all_forecasts_data = [] + for member_name, forecasts in forecasts_by_member.items(): + for f in forecasts: + all_forecasts_data.append( + { + "Member": member_name, + "Question": f.question_title, + "Prediction": f.prediction, + "Reasoning (summary)": ( + f.reasoning[:100] + "..." 
+ if len(f.reasoning) > 100 + else f.reasoning + ), + } + ) + + if all_forecasts_data: + st.dataframe(all_forecasts_data, use_container_width=True) + + st.markdown("---") + st.markdown("#### Detailed Forecasts by Member") + + for member_name, forecasts in forecasts_by_member.items(): + with st.expander(f"**{member_name}** ({len(forecasts)} forecasts)"): + for f in forecasts: + st.markdown(f"**[^{f.footnote_id}] {f.question_title}**") + st.markdown(f"- **Prediction:** {f.prediction}") + st.markdown(f"- **Question:** {f.question_text}") + st.markdown(f"- **Resolution:** {f.resolution_criteria}") + st.markdown(f"- **Reasoning:** {f.reasoning}") + if f.key_sources: + st.markdown(f"- **Sources:** {', '.join(f.key_sources)}") + st.markdown("---") + + @classmethod + def _display_twitter_tab(cls, session: CongressSession) -> None: + st.subheader("Twitter/X Posts") + st.markdown( + "These tweet-sized excerpts highlight interesting patterns from the " + "congress session." + ) + + if not session.twitter_posts: + st.write("No Twitter posts generated.") + return + + for i, post in enumerate(session.twitter_posts, 1): + st.markdown(f"**Tweet {i}** ({len(post)} chars)") + st.info(post) + + @classmethod + def _display_cost_summary(cls, session: CongressSession) -> None: + total_cost = session.total_price_estimate + generation_time = st.session_state.get("session_generation_time") + + has_cost_info = total_cost is not None + has_time_info = generation_time is not None + + if not has_cost_info and not has_time_info: + return + + proposal_costs = [ + (p.member.name if p.member else "Unknown", p.price_estimate or 0) + for p in session.proposals + ] + + with st.expander("📊 Session Stats", expanded=False): + col1, col2, col3 = st.columns(3) + with col1: + if has_time_info: + minutes = int(generation_time // 60) + seconds = int(generation_time % 60) + st.metric("Generation Time", f"{minutes}m {seconds}s") + else: + st.metric("Generation Time", "N/A") + with col2: + if has_cost_info: + st.metric("Total Cost", f"${total_cost:.2f}") + else: + st.metric("Total Cost", "N/A") + with col3: + st.metric("Members", len(session.proposals)) + + if has_cost_info and proposal_costs: + st.markdown("**Cost by Member:**") + for member_name, cost in proposal_costs: + st.markdown(f"- {member_name}: ${cost:.2f}") + + @classmethod + def _display_download_buttons(cls, session: CongressSession) -> None: + st.markdown("---") + col1, col2 = st.columns(2) + + with col1: + json_str = json.dumps(session.to_json(), indent=2, default=str) + st.download_button( + label="📥 Download Full Session (JSON)", + data=json_str, + file_name=f"congress_session_{session.timestamp.strftime('%Y%m%d_%H%M%S')}.json", + mime="application/json", + ) + + with col2: + markdown_content = cls._session_to_markdown(session) + st.download_button( + label="📥 Download Report (Markdown)", + data=markdown_content, + file_name=f"congress_report_{session.timestamp.strftime('%Y%m%d_%H%M%S')}.md", + mime="text/markdown", + ) + + @classmethod + def _session_to_markdown(cls, session: CongressSession) -> str: + lines = [ + "# AI Forecasting Congress Report", + "", + f"**Policy Question:** {session.prompt}", + "", + f"**Date:** {session.timestamp.strftime('%Y-%m-%d %H:%M UTC')}", + "", + f"**Members:** {', '.join(m.name for m in session.members_participating)}", + "", + "---", + "", + "## Synthesis Report", + "", + session.aggregated_report_markdown, + "", + "---", + "", + "## Individual Proposals", + "", + ] + + for proposal in session.proposals: + member_name = 
proposal.member.name if proposal.member else "Unknown" + lines.extend( + [ + f"### {member_name}", + "", + proposal.get_full_markdown_with_footnotes(), + "", + "---", + "", + ] + ) + + return "\n".join(lines) + + @classmethod + def _save_session(cls, session: CongressSession) -> None: + filename = f"{session.timestamp.strftime('%Y%m%d_%H%M%S')}.json" + filepath = os.path.join(SESSIONS_FOLDER, filename) + + try: + json_str = json.dumps(session.to_json(), indent=2, default=str) + create_or_overwrite_file(filepath, json_str) + logger.info(f"Saved session to {filepath}") + except Exception as e: + logger.error(f"Failed to save session: {e}") + st.error(f"Failed to save session: {e}") + + @classmethod + def _load_session_from_file(cls, file_path: str) -> CongressSession | None: + if not os.path.exists(file_path): + st.error(f"File not found: {file_path}") + return None + + try: + data: dict = load_json_file(file_path) # type: ignore + session = CongressSession.from_json(data) + return session + except json.JSONDecodeError as e: + st.error(f"Invalid JSON file: {e}") + return None + except Exception as e: + st.error(f"Failed to load session: {e}") + logger.error(f"Failed to load session from {file_path}: {e}") + return None + + @classmethod + def _load_previous_sessions(cls) -> list[CongressSession]: + if not os.path.exists(SESSIONS_FOLDER): + return [] + + sessions = [] + for filename in sorted(os.listdir(SESSIONS_FOLDER), reverse=True)[:10]: + if filename.endswith(".json"): + filepath = os.path.join(SESSIONS_FOLDER, filename) + try: + data: dict = load_json_file(filepath) # type: ignore + session = CongressSession.from_json(data) + sessions.append(session) + except Exception as e: + logger.error(f"Failed to load session {filename}: {e}") + + return sessions + + +if __name__ == "__main__": + CongressPage.main() diff --git a/forecasting_tools/front_end/example_outputs/congress_page_example.json b/forecasting_tools/front_end/example_outputs/congress_page_example.json new file mode 100644 index 00000000..c44bc6ed --- /dev/null +++ b/forecasting_tools/front_end/example_outputs/congress_page_example.json @@ -0,0 +1,552 @@ +{ + "prompt": "How should the United States regulate artificial intelligence? Consider both frontier AI systems (like large language models) and narrower AI applications in areas like hiring, lending, and healthcare. What policies would balance innovation with safety and civil liberties?", + "members_participating": [ + { + "name": "Opus 4.5 (Anthropic)", + "role": "AI Policy Analyst", + "political_leaning": "behaves as Claude naturally does", + "general_motivation": "Analyze this policy question thoughtfully and helpfully, as Claude would naturally approach it. Draw on your training to provide balanced, nuanced analysis while being direct about your views and uncertainties.", + "expertise_areas": [ + "general policy analysis" + ], + "personality_traits": [ + "behaves naturally as Claude" + ], + "ai_model": "openrouter/anthropic/claude-opus-4.5" + }, + { + "name": "GPT 5.2 (OpenAI)", + "role": "AI Policy Analyst", + "political_leaning": "behaves as GPT naturally does", + "general_motivation": "Analyze this policy question thoughtfully and helpfully, as GPT would naturally approach it. 
Draw on your training to provide balanced, nuanced analysis while being direct about your views and uncertainties.", + "expertise_areas": [ + "general policy analysis" + ], + "personality_traits": [ + "behaves naturally as GPT" + ], + "ai_model": "openrouter/openai/gpt-5.2" + }, + { + "name": "Gemini 3 Pro (Google)", + "role": "AI Policy Analyst", + "political_leaning": "behaves as Gemini naturally does", + "general_motivation": "Analyze this policy question thoughtfully and helpfully, as Gemini would naturally approach it. Draw on your training to provide balanced, nuanced analysis while being direct about your views and uncertainties.", + "expertise_areas": [ + "general policy analysis" + ], + "personality_traits": [ + "behaves naturally as Gemini" + ], + "ai_model": "openrouter/google/gemini-3-pro-preview" + }, + { + "name": "Grok 4 (xAI)", + "role": "AI Policy Analyst", + "political_leaning": "behaves as Grok naturally does", + "general_motivation": "Analyze this policy question thoughtfully and helpfully, as Grok would naturally approach it. Draw on your training to provide balanced, nuanced analysis while being direct about your views and uncertainties.", + "expertise_areas": [ + "general policy analysis" + ], + "personality_traits": [ + "behaves naturally as Grok" + ], + "ai_model": "openrouter/x-ai/grok-4" + }, + { + "name": "DeepSeek V3.2 (DeepSeek)", + "role": "AI Policy Analyst", + "political_leaning": "behaves as DeepSeek naturally does", + "general_motivation": "Analyze this policy question thoughtfully and helpfully, as DeepSeek would naturally approach it. Draw on your training to provide balanced, nuanced analysis while being direct about your views and uncertainties.", + "expertise_areas": [ + "general policy analysis" + ], + "personality_traits": [ + "behaves naturally as DeepSeek" + ], + "ai_model": "openrouter/deepseek/deepseek-v3.2" + } + ], + "proposals": [ + { + "member": { + "name": "Opus 4.5 (Anthropic)", + "role": "AI Policy Analyst", + "political_leaning": "behaves as Claude naturally does", + "general_motivation": "Analyze this policy question thoughtfully and helpfully, as Claude would naturally approach it. Draw on your training to provide balanced, nuanced analysis while being direct about your views and uncertainties.", + "expertise_areas": [ + "general policy analysis" + ], + "personality_traits": [ + "behaves naturally as Claude" + ], + "ai_model": "openrouter/anthropic/claude-opus-4.5" + }, + "research_summary": "The current U.S. AI regulatory landscape is characterized by a significant policy vacuum at the federal level, a patchwork of state laws, and a recent executive effort to preempt state regulations\u2014creating substantial legal and compliance uncertainty for businesses and citizens alike.\n\n**Federal Status Quo:** The United States currently has no comprehensive federal AI legislation. The Biden administration's October 2023 Executive Order on safe, secure AI was rescinded by President Trump in January 2025, replaced with an \"innovation-first\" approach through Executive Order 14179 (\"Removing Barriers to American Leadership in AI\"). In December 2025, Trump issued Executive Order 14365 establishing a national AI policy framework that explicitly seeks to preempt state AI laws, creating a DOJ \"AI Litigation Task Force\" to challenge state regulations and authorizing the withholding of federal broadband funds from states with \"onerous\" AI laws. 
However, a 99-1 Senate vote in July 2025 rejected a proposed 10-year federal moratorium on state AI laws, demonstrating bipartisan congressional resistance to full preemption.\n\n**State-Level Activity:** In the absence of federal legislation, states have taken the lead. In 2025, 38 states enacted approximately 100 AI-related laws, with major legislation including: Colorado's AI Act (requiring impact assessments and anti-discrimination measures for high-risk AI systems, delayed to June 2026); California's Transparency in Frontier AI Act (SB 53, effective January 2026, requiring safety frameworks, incident reporting, and whistleblower protections for frontier models trained above 10\u00b2\u2076 FLOPS with penalties up to $1 million); Texas's TRAIGA (banning harmful AI uses); Illinois' amendment to the Human Rights Act making AI-driven discriminatory employment decisions civil rights violations; and New York's RAISE Act requiring safety plans for frontier models. States have also enacted targeted laws on AI therapy chatbots following youth suicides, deepfakes, and AI in hiring (such as NYC's Local Law 144 requiring bias audits for automated employment decision tools).\n\n**Evidence of AI Harms:** Research documents systematic algorithmic discrimination across sectors. In hiring, Amazon's 2014 CV-screening algorithm penalized female candidates; recent Stanford research found LLMs portray women as younger and less experienced. The COMPAS recidivism algorithm incorrectly classified Black defendants as high-risk at nearly twice the rate of white defendants (45% vs. 23%). Facial recognition systems show error rates of 0.8% for light-skinned males versus 34.7% for dark-skinned females. In healthcare, algorithms that train on historically biased data have incorrectly concluded Black patients are healthier than equally sick white patients. Tragic cases involving AI therapy chatbots\u2014including at least two youth suicides\u2014have catalyzed state action on mental health AI.\n\n**Frontier AI Risks:** The UK government and independent researchers have identified significant frontier AI risks including: facilitation of cyber-attacks (with the first AI-orchestrated cyber espionage campaign intercepted in 2025); potential to assist CBRN threats; generation of misinformation and deepfakes; and healthcare AI producing dangerous \"hallucinations.\" The 2025 AI Safety Index found major companies like xAI and Meta lack adequate commitments on monitoring and safety research, while companies like DeepSeek lack publicly available safety documentation.\n\n**Economic and Innovation Considerations:** AI is projected to contribute $15.7 trillion to the global economy by 2030, with generative AI potentially increasing U.S. GDP by 1.5% by 2035 and reducing federal deficits by $400 billion over the 2026-2035 budget window. However, research from the University of Illinois found that AI regulation has negatively impacted innovation\u2014primarily due to regulatory fragmentation and uncertainty rather than regulation itself. The EU AI Act's compliance costs may divert resources from R&D, particularly for smaller firms. Yet shareholders view AI regulation favorably, as compliance reduces corporate risk. Public concern remains high, with 72% of U.S. 
adults expressing concerns about AI in 2025.\n\n**International Comparison:** The EU AI Act provides a comprehensive, risk-based framework with enforceable requirements entering force through 2027, though the European Commission's \"Digital Omnibus\" proposal now seeks to delay some provisions. China has enacted mandatory AI labeling rules and a national AI Safety Governance Framework. The U.S. approach\u2014currently emphasizing deregulation and innovation\u2014contrasts sharply with these frameworks, creating potential competitiveness tradeoffs in both directions.", + "decision_criteria": [ + "Protection of Civil Rights and Prevention of Algorithmic Discrimination", + "Safety from Catastrophic and Severe Harms", + "Preservation of Democratic Accountability and Transparency", + "Supporting Beneficial Innovation and Economic Competitiveness", + "Implementation Feasibility and Regulatory Coherence", + "Respect for Federalism and Appropriate Distribution of Authority" + ], + "forecasts": [ + { + "footnote_id": 1, + "question_title": "Colorado AI Act Enforcement by End of 2026", + "question_text": "As of December 31, 2026, will Colorado be actively enforcing its AI Act (SB 24-205) against at least one entity for violations related to algorithmic discrimination or failure to conduct required impact assessments?", + "resolution_criteria": "Resolves YES if by December 31, 2026, the Colorado Attorney General's office or relevant state agency has publicly announced at least one enforcement action against any entity specifically citing violations of Colorado's AI Act. Resolution via Colorado AG press releases at https://coag.gov/news-releases/", + "prediction": "35%", + "reasoning": "Colorado's AI Act was delayed to June 30, 2026, leaving only 6 months for enforcement before the resolution date. The Trump administration's Executive Order 14365 specifically targets Colorado's law, and the DOJ AI Litigation Task Force was created to challenge such state laws. However, executive orders cannot preempt state law without congressional authorization, which Congress rejected 99-1. Colorado has a track record of consumer protection enforcement, but first enforcement actions typically take 12-18 months after a law takes effect. The compressed timeline is the primary factor reducing probability.", + "key_sources": [ + "JD Supra legal analyses", + "White House Executive Orders", + "Colorado AG office", + "https://coag.gov/news-releases/" + ] + }, + { + "footnote_id": 2, + "question_title": "Federal AI Legislation Passage by 2027", + "question_text": "Will the U.S. Congress pass, and the President sign into law, comprehensive federal AI legislation that establishes binding requirements for either frontier AI developers OR high-risk AI applications in hiring, lending, or healthcare by December 31, 2027?", + "resolution_criteria": "Resolves YES if federal legislation is enacted that applies specifically to AI, establishes mandatory compliance requirements with enforcement mechanisms, and addresses either frontier models or high-risk applications in employment, credit, or healthcare. Resolution via Congress.gov search for enacted legislation.", + "prediction": "30%", + "reasoning": "No comprehensive federal AI legislation has passed despite numerous bills introduced. The current administration favors deregulation and \"minimal burden\" approaches. However, the 99-1 Senate vote rejecting full preemption shows bipartisan concern about AI oversight. 
Sector-specific legislation (particularly around AI and children, as with the GUARD Act) has better prospects than comprehensive regulation. Historical base rates suggest major technology regulation takes 5-10 years from widespread recognition of need. The 30% reflects possibility of narrower binding legislation rather than comprehensive framework.", + "key_sources": [ + "Congress.gov", + "White House AI Action Plan", + "JD Supra legal analyses" + ] + }, + { + "footnote_id": 3, + "question_title": "FTC or EEOC AI Discrimination Enforcement by 2026", + "question_text": "Will the Federal Trade Commission (FTC) or the Equal Employment Opportunity Commission (EEOC) announce at least two enforcement actions specifically citing AI or algorithmic systems as contributing to discrimination or unfair practices by December 31, 2026?", + "resolution_criteria": "Resolves YES if by December 31, 2026, the FTC or EEOC has publicly announced at least two separate enforcement actions where official materials specifically identify AI, algorithmic systems, or automated decision-making as a factor in the alleged discrimination. Resolution via FTC press releases (https://www.ftc.gov/news-events/news/press-releases) and EEOC press releases (https://www.eeoc.gov/newsroom).", + "prediction": "25%", + "reasoning": "The FTC vacated its 2024 consent order against Rytr explicitly citing the Trump administration's AI Action Plan, signaling reluctance to pursue AI enforcement. The administration's \"America's AI Action Plan\" calls for reducing AI-related enforcement seen as stifling innovation. However, child protection enforcement remains an exception, and EEOC operates with some independence. Historical base rate of AI-specific enforcement actions is approximately 1-2 per year. The requirement for two actions citing AI discrimination within the timeframe is difficult given current enforcement priorities.", + "key_sources": [ + "FTC press releases", + "America's AI Action Plan", + "JD Supra legal analyses", + "https://www.ftc.gov/news-events/news/press-releases", + "https://www.eeoc.gov/newsroom" + ] + }, + { + "footnote_id": 4, + "question_title": "Major AI Safety Incident by End of 2026", + "question_text": "By December 31, 2026, will there be a publicly documented incident where an AI system is officially attributed by a U.S. government agency as a primary or significant contributing cause of at least $100 million in damages, 10+ deaths, or a major critical infrastructure disruption?", + "resolution_criteria": "Resolves YES if a U.S. federal government agency publicly releases a report or statement attributing a major incident meeting the specified thresholds to an AI system. Resolution requires review of official government reports from DHS, CISA, FBI, NTSB, or relevant sector regulators.", + "prediction": "15%", + "reasoning": "While AI-related harms are increasing (documented youth suicides, intercepted AI cyber espionage, healthcare AI errors), official government attribution of a major incident specifically to AI faces high barriers. Attribution is methodologically challenging\u2014incidents often involve human actors using AI tools. Government agencies are politically and legally cautious about such attributions. Historical precedent (Boeing 737 MAX took years for official automation attribution despite clear evidence) suggests official attribution within the timeframe is rare. 
The threshold ($100M, 10+ deaths, critical infrastructure) is substantial.", + "key_sources": [ + "UK government frontier AI risk papers", + "AI Safety Index", + "healthcare technology hazard reports" + ] + }, + { + "footnote_id": 5, + "question_title": "Frontier AI Lab Safety Framework Adoption", + "question_text": "By December 31, 2026, will at least 4 of the 6 leading frontier AI labs (OpenAI, Anthropic, Google DeepMind, Meta AI, xAI, Mistral) have publicly committed to and published implementation details for third-party pre-deployment safety evaluations of their most capable models?", + "resolution_criteria": "Resolves YES if at least 4 of the 6 named companies have publicly committed to pre-deployment safety evaluations by independent third parties AND published documentation describing scope, methodology, or results of at least one such evaluation. Resolution via company official publications, blogs, and safety reports.", + "prediction": "40%", + "reasoning": "Anthropic, Google DeepMind, and likely OpenAI already meet or are close to meeting the criteria, given their safety focus and regulatory engagement. However, Meta lacks documented commitments on monitoring and control; xAI has minimal safety investment documented; Mistral as a European open-source focused company has less safety infrastructure. Reaching 4 of 6 requires one of these three to significantly upgrade commitments. California's SB 53 effective January 2026 creates pressure for companies with California operations, but not all named companies have significant California presence. Competitive dynamics are mixed\u2014safety could be advantage or burden depending on market.", + "key_sources": [ + "AI Safety Index 2025", + "company safety reports", + "California SB 53 requirements" + ] + } + ], + "proposal_markdown": "### Executive Summary\n\nThe United States should establish a tiered, risk-based federal AI regulatory framework that provides clear national standards for high-risk applications while preserving meaningful state authority to address local concerns and experiment with regulatory approaches. The single most important action is to create mandatory transparency and anti-discrimination requirements for AI systems used in consequential decisions affecting individuals' employment, credit, housing, and healthcare\u2014areas where documented algorithmic discrimination is substantial and harms are immediate.\n\n### Analysis\n\nThe current U.S. approach to AI regulation\u2014characterized by federal inaction, a patchwork of over 100 state laws across 38 states, and active federal efforts to preempt state authority\u2014is unsustainable and harmful to all stakeholders. Businesses face genuine compliance uncertainty from navigating 50 different regulatory regimes, while citizens lack meaningful protections against documented AI harms including systematic discrimination in hiring, lending, and healthcare decisions.\n\nThe evidence of algorithmic discrimination is compelling: recidivism algorithms that incorrectly classify Black defendants as high-risk at nearly twice the rate of white defendants; facial recognition systems with error rates 40 times higher for dark-skinned women than light-skinned men; hiring algorithms that penalize women; and healthcare algorithms that systematically underestimate Black patients' medical needs. These are not speculative harms\u2014they are happening now and affecting millions of Americans. 
Yet federal enforcement has been minimal, with the FTC actually reversing AI-related enforcement actions to align with the administration's deregulatory agenda. The probability of meaningful federal AI discrimination enforcement remains low in the near term (25% [^3]).\n\nSimultaneously, frontier AI systems pose emerging risks that warrant attention. The first AI-orchestrated cyber espionage campaign was intercepted in 2025, AI chatbots have been linked to youth suicides, and researchers document significant gaps between AI companies' stated safety commitments and their actual practices. While a major AI catastrophe meeting high thresholds remains unlikely in the near term (15% [^4]), the technology is advancing rapidly and precautionary measures are warranted.\n\nThe current federal-state conflict is particularly counterproductive. The Trump administration's Executive Order threatening to defund states with AI regulations lacks clear constitutional authority and has been rejected by Congress on a 99-1 vote. Yet state enforcement faces significant headwinds, with Colorado's AI Act survival uncertain (35% [^1]) despite strong state-level support. This leaves a regulatory vacuum that serves neither innovation nor safety.\n\nComprehensive federal AI legislation within the next two years remains unlikely (30% [^2]) given congressional gridlock and administration opposition. However, sector-specific legislation\u2014particularly around AI and children or healthcare\u2014has better prospects. Meanwhile, voluntary industry commitments to meaningful third-party safety evaluations remain incomplete, with only 40% probability that a majority of frontier labs will implement robust third-party pre-deployment evaluations by end of 2026 [^5].\n\nThe optimal policy path forward is not a choice between innovation and regulation but rather smart regulation that provides clarity, addresses documented harms, and scales requirements proportionate to risks. Research indicates that regulatory fragmentation\u2014not regulation itself\u2014is the primary drag on innovation. A clear federal framework that establishes minimum standards while allowing states to address emerging harms through experimentation could actually reduce compliance costs while improving protections.\n\n### Recommendations\n\n#### Recommendation 1: Establish Federal Anti-Discrimination Requirements for High-Risk AI Applications\n\n**The Recommendation:** Congress should pass legislation requiring deployers of AI systems used in employment, credit, housing, and healthcare decisions to: (a) conduct and publish bias impact assessments prior to deployment; (b) provide notice to affected individuals that AI is being used in decisions affecting them; (c) enable affected individuals to request human review of AI-assisted decisions; and (d) maintain records sufficient for regulatory audit.\n\n**Why I Support It:** The evidence of algorithmic discrimination in these domains is substantial and well-documented. Current enforcement under existing civil rights laws has been minimal, and voluntary measures have proven insufficient. This recommendation directly addresses my highest-priority criterion\u2014protection of civil rights\u2014while using established regulatory mechanisms (notice, audit, human review) with proven effectiveness.\n\n**Decision Criteria Addressed:** Protection of Civil Rights (#1), Democratic Accountability and Transparency (#3), Implementation Feasibility (#5)\n\n**Implementation Plan:**\n1. 
**Legislative Phase (Year 1):** Congress passes the AI Civil Rights Act establishing requirements for \"high-risk AI systems\" defined by use in consequential decisions in employment, credit, housing, or healthcare. The FTC and EEOC receive joint enforcement authority with clear jurisdictional boundaries.\n\n2. **Rulemaking Phase (Months 12-18):** FTC and EEOC conduct joint rulemaking to define: (a) specific bias assessment methodologies acceptable for compliance; (b) notice requirements and formats; (c) human review procedures; (d) recordkeeping requirements; (e) safe harbor provisions for companies meeting specific standards.\n\n3. **Implementation Phase (Months 18-30):** \n- Large enterprises (>500 employees or $100M revenue using AI in covered domains) must comply within 18 months\n- Mid-size enterprises comply within 24 months \n- Small businesses receive technical assistance and 30-month compliance timeline\n- FTC establishes compliance guidance portal with templates and best practices\n\n4. **Enforcement Phase (Year 3+):** \n- Initial 12-month focus on education and compliance assistance\n- Civil penalties up to $50,000 per violation for willful noncompliance\n- Private right of action available after exhaustion of administrative remedies\n- Regular public reporting on enforcement activities and outcomes\n\n**Relevant Forecasts:** Federal enforcement of AI discrimination is currently unlikely (25% [^3]) under existing authority, underscoring the need for explicit legislative mandate. Comprehensive legislation faces headwinds (30% [^2]), but sector-specific civil rights legislation has better bipartisan prospects.\n\n---\n\n#### Recommendation 2: Create Federal Minimum Safety Standards for Frontier AI with State Flexibility\n\n**The Recommendation:** Establish federal minimum safety requirements for frontier AI systems (defined by compute threshold, similar to California's SB 53) including pre-deployment risk assessments, incident reporting, and cybersecurity standards\u2014while explicitly preserving state authority to enact stronger requirements.\n\n**Why I Support It:** Frontier AI systems pose risks that cross state boundaries and may require coordinated national response. However, federal preemption of all state AI laws would eliminate valuable policy experimentation and remove protections for citizens in states that have acted. This balanced approach provides baseline consistency while preserving federalism.\n\n**Decision Criteria Addressed:** Safety from Catastrophic Harms (#2), Innovation and Competitiveness (#4), Implementation Feasibility (#5), Respect for Federalism (#6)\n\n**Implementation Plan:**\n1. **Definition Phase:** Congress defines \"frontier AI systems\" using objective metrics (compute thresholds, capability evaluations) with provisions for NIST to update thresholds as technology evolves.\n\n2. **Requirements Phase:** Frontier AI developers must:\n- Conduct pre-deployment safety evaluations including red-teaming for dangerous capabilities\n- Implement cybersecurity measures meeting CISA standards\n- Report safety incidents to a designated federal agency within 72 hours\n- Maintain whistleblower protections for employees reporting safety concerns\n- Publish annual safety reports summarizing evaluation methodologies and findings\n\n3. 
**Federal-State Coordination:**\n- Federal standards establish a floor, not a ceiling\n- States may enact additional requirements beyond federal minimums\n- Federal preemption applies only to direct conflicts, not supplementary requirements\n- Establish federal-state coordination council to share information and align approaches\n\n4. **Enforcement:**\n- Commerce Department or new AI Safety Agency has primary enforcement authority\n- Penalties up to $1 million per violation, with consideration of company size and intent\n- No private right of action for frontier AI safety requirements (to prevent litigation-driven development)\n- Safe harbor for companies meeting voluntary third-party evaluation standards\n\n**Relevant Forecasts:** Only 40% probability that a majority of frontier labs will implement robust third-party safety evaluations voluntarily [^5], suggesting mandatory requirements may be necessary. Major AI safety incidents remain unlikely but consequential (15% [^4]), supporting proportionate precautionary measures.\n\n---\n\n#### Recommendation 3: Establish Federal-State AI Regulatory Coordination Framework\n\n**The Recommendation:** Create a formal Federal-State AI Regulatory Council to harmonize requirements, share enforcement information, and provide compliance guidance\u2014replacing the current adversarial relationship with cooperative federalism.\n\n**Why I Support It:** The current approach\u2014federal threats to defund states and litigation to block state laws\u2014is counterproductive, legally questionable, and harmful to both innovation and protection. A coordination framework can reduce compliance complexity while preserving state flexibility, addressing both business concerns and civil liberties considerations.\n\n**Decision Criteria Addressed:** Implementation Feasibility (#5), Respect for Federalism (#6), Innovation and Competitiveness (#4)\n\n**Implementation Plan:**\n1. **Council Structure:**\n- Federal representatives: Commerce, FTC, DOJ, EEOC, sector regulators\n- State representatives: 10 rotating state AGs or designated officials\n- Technical advisors: NIST, academic experts\n- Industry and civil society observers (non-voting)\n\n2. **Council Functions:**\n- Develop model state AI legislation for voluntary adoption\n- Identify areas where federal uniformity is necessary vs. where state experimentation is valuable\n- Create mutual recognition agreements for compliance certifications\n- Establish information-sharing protocols on AI incidents and enforcement\n- Publish annual report on AI regulatory landscape and recommendations\n\n3. **Compliance Support:**\n- Develop single compliance portal where businesses can understand federal and state requirements\n- Create compliance templates and guidance documents\n- Establish small business AI compliance assistance program\n- Maintain database of approved third-party auditors and evaluators\n\n4. **Conflict Resolution:**\n- Council provides forum for resolving federal-state conflicts through negotiation\n- Formal dispute resolution mechanism before litigation\n- Clear criteria for when federal preemption is appropriate (direct conflict, interstate commerce necessity)\n\n**Relevant Forecasts:** State enforcement faces uncertainty (35% [^1] that Colorado will enforce by end of 2026) partly due to federal-state conflict. 
Coordination framework would reduce this uncertainty and improve regulatory effectiveness regardless of outcome.\n\n---\n\n#### Recommendation 4: Targeted Child Safety Requirements with Expedited Implementation\n\n**The Recommendation:** Enact immediate federal requirements for AI systems that interact directly with minors, including mandatory disclosure that the user is interacting with AI, crisis detection and referral systems, and prohibition on AI systems providing mental health advice to minors without professional oversight.\n\n**Why I Support It:** Documented harms to children from AI chatbots\u2014including multiple suicides\u2014represent clear and present dangers that warrant urgent action. Child safety is an area where the current administration has explicitly stated it will not preempt state action, creating opportunity for bipartisan legislation. This addresses immediate harms while broader regulatory frameworks are developed.\n\n**Decision Criteria Addressed:** Safety from Severe Harms (#2), Protection of Civil Rights (#1), Democratic Accountability (#3)\n\n**Implementation Plan:**\n1. **Immediate Requirements (30 days from enactment):**\n- All AI chatbots and virtual assistants must disclose they are AI at initiation of interaction\n- Systems that detect they are interacting with minors must provide hourly reminders they are not human\n- AI systems detecting language indicating self-harm or crisis must immediately provide National Suicide Prevention Lifeline (988) and pause interaction\n\n2. **90-Day Requirements:**\n- AI therapy/mental health chatbots may not provide services to users under 18 without licensed professional oversight\n- AI systems collecting data from known minors must implement data minimization practices\n- Parental notification systems must be available for AI interactions involving minors\n\n3. **180-Day Requirements:**\n- Third-party safety audits required for AI systems marketed to or frequently used by minors\n- Annual public reporting on child safety incidents and mitigation measures\n- FTC enforcement authority with penalties up to $100,000 per violation\n\n4. **Expedited Timeline Justification:**\n- Documented harms are ongoing\n- Basic disclosure requirements impose minimal technical burden\n- Industry leaders (e.g., OpenAI) have already implemented similar measures, demonstrating feasibility\n\n**Relevant Forecasts:** Federal AI legislation passage probability is higher for targeted child safety measures than comprehensive regulation (within 30% overall estimate [^2]). The GUARD Act and similar proposals demonstrate bipartisan interest.\n\n---\n\n### Risks and Uncertainties\n\n**Risk 1: Regulatory Capture and Weakened Implementation**\nThere is significant risk that industry lobbying will weaken requirements during rulemaking, as occurred with Colorado's AI Act delays. Strong legislative mandates with clear requirements can reduce rulemaking discretion, but implementation always involves tradeoffs. Mitigation: Include clear statutory minimums and mandate public rulemaking with civil society participation.\n\n**Risk 2: Federal-State Conflict Escalation**\nMy recommendations assume the current federal-state conflict can be resolved through coordination rather than coercion. If the administration continues pursuing preemption through litigation and funding threats, coordination frameworks may fail. 
The outcome of legal challenges to Executive Order 14365 represents significant uncertainty (reflected in 35% probability for Colorado enforcement [^1]).\n\n**Risk 3: Innovation Displacement**\nStringent requirements could push AI development to jurisdictions with less oversight. However, research suggests fragmentation rather than regulation itself is the primary concern, and U.S. companies must comply with EU AI Act requirements anyway for European market access. The $15.7 trillion projected AI contribution to global economy suggests the market opportunity will sustain innovation even with reasonable regulation.\n\n**Risk 4: Rapid Technological Change Outpacing Regulation**\nAI capabilities are advancing faster than regulatory frameworks can adapt. Requirements based on current technology may become obsolete or inappropriate. Mitigation: Build in adaptive mechanisms (NIST threshold updates, sunset provisions, required periodic review) and focus on outcome-based rather than technology-specific requirements.\n\n**Risk 5: Enforcement Resource Constraints**\nEffective enforcement requires adequate agency resources. Current FTC and EEOC staffing may be insufficient for expanded AI oversight. Budget constraints and competing priorities could leave regulations unenforced. The low probability of AI discrimination enforcement under current conditions (25% [^3]) partly reflects these constraints.\n\n**Widest Uncertainty:** The forecast with greatest uncertainty is federal AI legislation passage (30% [^2]), reflecting genuine unpredictability of congressional action given competing priorities, industry lobbying, and political dynamics. My recommendations are designed to be valuable regardless of whether comprehensive legislation passes, with immediate administrative and state-level actions available while pursuing federal legislation.", + "key_recommendations": [ + "Recommendation 1: Establish Federal Anti-Discrimination Requirements for High-Risk AI Applications", + "Recommendation 2: Create Federal Minimum Safety Standards for Frontier AI with State Flexibility", + "Recommendation 3: Establish Federal-State AI Regulatory Coordination Framework", + "Recommendation 4: Targeted Child Safety Requirements with Expedited Implementation" + ], + "price_estimate": 1.1386595 + }, + { + "member": { + "name": "GPT 5.2 (OpenAI)", + "role": "AI Policy Analyst", + "political_leaning": "behaves as GPT naturally does", + "general_motivation": "Analyze this policy question thoughtfully and helpfully, as GPT would naturally approach it. Draw on your training to provide balanced, nuanced analysis while being direct about your views and uncertainties.", + "expertise_areas": [ + "general policy analysis" + ], + "personality_traits": [ + "behaves naturally as GPT" + ], + "ai_model": "openrouter/openai/gpt-5.2" + }, + "research_summary": "The U.S. currently regulates AI through a patchwork of sector-specific laws, general consumer protection/civil rights authorities, export controls, procurement rules, and state/local statutes\u2014rather than a single comprehensive federal AI law. 
A major recent federal shift is that President Biden\u2019s **Executive Order 14110** (Oct 2023) was **revoked** and replaced by President Trump\u2019s **EO 14179, \u201cRemoving Barriers to American Leadership in Artificial Intelligence\u201d** (Jan 23, 2025), which directs agencies to review and rescind actions seen as barriers and to develop an AI Action Plan (primary sources: White House EO page and Federal Register notice: https://www.whitehouse.gov/presidential-actions/2025/01/removing-barriers-to-american-leadership-in-artificial-intelligence/ ; https://www.federalregister.gov/documents/2025/01/31/2025-02172/removing-barriers-to-american-leadership-in-artificial-intelligence). This makes \u201cstatus quo\u201d governance more politically contingent: many safety ideas persist, but implementation intensity may fluctuate across administrations.\n\nFor government use of AI, OMB issued **M-24-10** in March 2024 (governance boards, Chief AI Officers, inventories, and minimum safeguards for \u201crights-\u201d and \u201csafety-impacting\u201d AI), but it was later **rescinded and replaced** by **OMB M-25-21** (April 3, 2025), which keeps significant governance and trust elements while emphasizing faster adoption and reduced bureaucratic burden (M-25-21 PDF: https://www.whitehouse.gov/wp-content/uploads/2025/02/M-25-21-Accelerating-Federal-Use-of-AI-through-Innovation-Governance-and-Public-Trust.pdf). This \u201cprocurement/government-use\u201d track matters because it can set de facto standards for vendors and create templates for private-sector compliance (contract clauses, documentation expectations, incident reporting, audit logs).\n\nOn technical standards and frontier-model safety science, NIST has built voluntary frameworks that are increasingly used as \u201creference points.\u201d The **NIST AI Risk Management Framework (AI RMF 1.0)** is a voluntary, lifecycle framework organized around GOVERN/MAP/MEASURE/MANAGE, and it defines \u201ctrustworthiness\u201d characteristics like safety, security, privacy, transparency, and fairness (AI RMF 1.0: https://nvlpubs.nist.gov/nistpubs/ai/nist.ai.100-1.pdf). NIST also published a **Generative AI Profile (NIST AI 600-1)** in July 2024 identifying GenAI-specific or amplified risks (e.g., confabulation, information integrity, privacy, IP) and mapping mitigations to RMF functions (GenAI Profile: https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf). In parallel, NIST\u2019s **U.S. AI Safety Institute (USAISI)** has pursued pre-/post-release evaluation partnerships\u2014e.g., agreements with OpenAI and Anthropic\u2014indicating a growing (though still largely voluntary) evaluation ecosystem (NIST announcement: https://www.nist.gov/news-events/news/2024/08/us-ai-safety-institute-signs-agreements-regarding-ai-safety-research).\n\nFor narrower, high-stakes applications (hiring, lending, healthcare), enforcement and guidance already exist under civil rights/consumer protection regimes. Examples: the **CFPB** has clarified that lenders using complex algorithms must still provide specific, accurate **adverse action reasons** under ECOA/Reg B\u2014no \u201cblack box\u201d exemption (Circular 2022-03: https://www.consumerfinance.gov/compliance/circulars/circular-2022-03-adverse-action-notification-requirements-in-connection-with-credit-decisions-based-on-complex-algorithms/). 
The **EEOC** has issued technical assistance applying Title VII disparate impact concepts to algorithmic selection tools and stressing employer accountability even when tools come from vendors (EEOC TA coverage: https://www.workforcebulletin.com/eeoc-issues-new-workplace-artificial-intelligence-technical-assistance). In healthcare, the **FDA** is actively regulating AI/ML-enabled medical devices via guidance and lifecycle oversight, including **Predetermined Change Control Plans (PCCPs)** to allow certain model updates under pre-specified protocols (FDA AI SaMD landing page: https://www.fda.gov/medical-devices/software-medical-device-samd/artificial-intelligence-software-medical-device).\n\nAt the state/local level, the U.S. is seeing rapid growth in AI laws and proposals. In employment, **NYC Local Law 144** (effective Jan 1, 2023) requires annual bias audits and candidate notice for \u201cautomated employment decision tools\u201d (overview: https://fairnow.ai/guide/nyc-local-law-144/). Illinois has an AI Video Interview Act (effective 2020) and further employment AI rules taking effect in 2026 (overview: https://www.consumerfinancialserviceslawmonitor.com/2019/08/update-the-illinois-artificial-intelligence-video-interview-act/). Colorado enacted a first-in-nation broader \u201chigh-risk AI\u201d framework (SB 24-205), but implementation has been **delayed to June 30, 2026** (tracker: https://www.akingump.com/en/insights/ai-law-and-regulation-tracker/colorado-postpones-implementation-of-colorado-ai-act-sb-24-205). Meanwhile, federal legislation remains mostly stalled: the Brennan Center tracker notes **150+ AI bills introduced in the 118th Congress and none enacted** (https://www.brennancenter.org/our-work/research-reports/artificial-intelligence-legislation-tracker), though targeted bills like the bipartisan **NO FAKES Act** (deepfake/digital replica right of publicity) have hearings and cross-industry support but are still in committee (Congress.gov: https://www.congress.gov/bill/119th-congress/senate-bill/1367).", + "decision_criteria": [ + "Safety & Catastrophic-Risk Reduction (Frontier + critical systems)", + "Civil Liberties & Rule-of-Law Protections", + "Innovation & Economic Competitiveness", + "Implementation Feasibility & Administrative Capacity", + "Regulatory Coherence (Federal\u2013State + Sectoral Fit)" + ], + "forecasts": [ + { + "footnote_id": 1, + "question_title": "NO FAKES Act Enactment", + "question_text": "Will the NO FAKES Act (S.1367 and/or H.R.2794) be enacted into U.S. federal law by December 31, 2026?", + "resolution_criteria": "YES if Congress.gov shows \u201cBecame Law\u201d (or equivalent) by the date; NO otherwise.", + "prediction": "25%", + "reasoning": "The bill is narrower than broad AI governance, has bipartisan cosponsors, and addresses a salient harm (unauthorized digital replicas). However, the base rate for AI-related federal bills is poor, and platform liability and First Amendment boundary-setting can bog down even popular proposals. The legislative calendar and competing priorities further reduce odds. 
I assign a modest probability reflecting higher-than-average viability but still difficult passage.", + "key_sources": [ + "https://www.congress.gov/bill/119th-congress/senate-bill/1367", + "https://www.congress.gov/bill/119th-congress/house-bill/2794", + "https://www.brennancenter.org/our-work/research-reports/artificial-intelligence-legislation-tracker" + ] + }, + { + "footnote_id": 2, + "question_title": "Algorithmic Accountability Act Advancement", + "question_text": "Will the Algorithmic Accountability Act (S.2164) receive a committee vote in the Senate Committee on Commerce, Science, and Transportation by December 31, 2026?", + "resolution_criteria": "YES if Congress.gov shows a committee markup vote/reporting action; NO otherwise.", + "prediction": "20%", + "reasoning": "Congress has seen many AI bills introduced with little movement, and broad compliance mandates trigger business opposition and complex compromises (definitions, preemption, private right of action, standards). While a committee vote is easier than passage, there is no current evidence of scheduled markup, so inertia dominates. The probability reflects some chance of renewed attention after major incidents or bipartisan dealmaking but assumes status quo gridlock is more likely.", + "key_sources": [ + "https://www.congress.gov/bill/119th-congress/senate-bill/2164", + "https://www.brennancenter.org/our-work/research-reports/artificial-intelligence-legislation-tracker" + ] + }, + { + "footnote_id": 3, + "question_title": "BIS Finalizes IaaS KYC Rule", + "question_text": "Will BIS finalize the January 29, 2024 proposed IaaS customer identification/KYC rulemaking (Federal Register document 2024-01580) by December 31, 2026?", + "resolution_criteria": "YES if a final rule is published in the Federal Register finalizing that rulemaking by the date; NO otherwise.", + "prediction": "40%", + "reasoning": "The proposed rule exists and remains unfinalized as of the latest available status, suggesting delays. BIS may pursue similar goals through other export-control mechanisms, reducing urgency to finalize this specific NPRM, and cloud KYC raises compliance and diplomatic concerns. Still, national security pressures can accelerate rulemaking, and the underlying rationale persists across administrations. I set a moderate probability.", + "key_sources": [ + "https://www.federalregister.gov/documents/2024/01/29/2024-01580/taking-additional-steps-to-address-the-national-emergency-with-respect-to-significant-malicious" + ] + }, + { + "footnote_id": 4, + "question_title": "Colorado AI Act Further Delay", + "question_text": "Will Colorado\u2019s AI Act (SB 24-205) have its effective date delayed beyond June 30, 2026 by Colorado legislation signed into law by December 31, 2026?", + "resolution_criteria": "YES if enacted Colorado law changes the effective date to later than June 30, 2026; NO otherwise.", + "prediction": "55%", + "reasoning": "Colorado already delayed implementation once, indicating meaningful implementation and political-economy challenges. First-in-nation comprehensive frameworks often undergo iterative adjustment after stakeholders confront compliance realities. However, states sometimes prefer to keep credibility by holding deadlines once reset. 
I make \u201cfurther delay\u201d slightly more likely than not.", + "key_sources": [ + "https://www.akingump.com/en/insights/ai-law-and-regulation-tracker/colorado-postpones-implementation-of-colorado-ai-act-sb-24-205" + ] + } + ], + "proposal_markdown": "### Executive Summary\n\nThe U.S. should adopt a **risk-based federal AI governance framework** that (1) imposes enforceable duties and audits for **high-risk AI in consequential decisions** (hiring, lending, housing, healthcare) and (2) creates a **frontier-model safety regime** centered on evaluations, incident reporting, and secure development\u2014while protecting privacy and free expression and preserving innovation through safe harbors and sandboxes. The single most important step is to establish a **federal \u201chigh-risk AI\u201d floor** (impact assessments, transparency, appeals, and anti-discrimination testing) anchored to NIST standards and enforced by existing agencies.\n\n### Analysis\n\nU.S. AI regulation is currently \u201creal but fragmented\u201d: agencies can act under existing authority (FTC consumer protection; CFPB ECOA/Reg B adverse action requirements; EEOC Title VII/ADA guidance), NIST provides voluntary risk management frameworks, and states/localities are moving ahead with their own rules (e.g., NYC Local Law 144; Colorado\u2019s broader high-risk AI statute). This patchwork creates uneven protections and compliance uncertainty, and it risks a race between rapid model deployment and slower civil liberties safeguards.\n\nPolitically, comprehensive federal AI legislation appears hard in the near term: broad algorithmic accountability bills have a meaningful chance of stalling without even reaching committee votes (20% [^2]), and state frameworks may face continued delays or revisions (55% [^4]). That argues for a design that can deliver value even if Congress only passes narrower bills and agencies remain primary implementers. Targeted legislation has comparatively better prospects (e.g., NO FAKES Act at 25% by end-2026 [^1])\u2014suggesting a \u201cmodular\u201d legislative strategy is more realistic than a single omnibus AI Act.\n\nFor frontier AI, the U.S. has promising pieces\u2014NIST\u2019s AI RMF and GenAI Profile, and USAISI evaluation partnerships\u2014but too much remains voluntary. At the same time, national security-oriented controls like cloud KYC and reporting are uncertain and may proceed unevenly (40% that the specific BIS IaaS KYC NPRM is finalized by end-2026 [^3]). This motivates a dual track: (a) strengthen voluntary evaluation and security practices into a clearer compliance regime for the largest model developers, and (b) avoid collapsing everything into export controls that may not cover domestic harms and civil liberties.\n\nBalancing innovation with safety and civil liberties is best achieved by focusing mandatory obligations on **measurable risk points**: consequential decisions about people\u2019s lives, and frontier systems above capability/compute thresholds with credible severe-misuse pathways. Outside those zones, policy should emphasize transparency, competition, research funding, and procurement standards rather than heavy licensing.\n\n### Recommendations\n\n1. 
**Enact a federal \u201cHigh-Risk Automated Decision Systems\u201d (HRADS) law for consequential domains**\n- **What:** Create a federal baseline for AI used in hiring, lending, housing, education, and healthcare coverage/eligibility decisions: mandatory impact assessments, anti-discrimination testing, data governance, meaningful notice, and an appeals channel.\n- **Why (with forecasts):** Because broad algorithmic accountability legislation is unlikely to advance quickly (20% committee vote [^2]) and states may delay or vary (55% further Colorado delay [^4]), a narrowly scoped but enforceable federal floor is the highest-leverage way to reduce patchwork while targeting real harms.\n- **Criteria:** Civil liberties; safety; coherence; feasibility.\n- **Implementation plan:**\n- Define \u201chigh-risk\u201d by *use context* (consequential decisions) rather than by \u201cAI\u201d broadly.\n- Require deployers (employers, lenders, insurers, hospitals) to:\n1) conduct a pre-deployment **Algorithmic Impact Assessment** (AIA),\n2) test for disparate impact (with documentation),\n3) document data provenance and limits,\n4) provide **individual notice** when AI meaningfully contributes,\n5) provide a **human-review appeal** path for adverse decisions.\n- Require vendors to provide standardized \u201cmodel cards for deployers\u201d (intended use, limitations, performance by subgroup where appropriate, security and privacy controls).\n- Enforcement via existing agencies: EEOC (employment), CFPB (credit), HUD/DOJ (housing/civil rights), HHS/FDA (healthcare tools as applicable), FTC (unfair/deceptive practices).\n- Provide a safe harbor for entities that (i) follow NIST AI RMF / GenAI Profile-aligned controls and (ii) undergo qualified independent audits.\n\n2. **Create a Frontier AI Safety Regime: evaluations + incident reporting + secure development for the largest model developers**\n- **What:** For frontier model developers above compute/capability thresholds: require (a) pre-deployment evaluation (including red-teaming), (b) a documented \u201csafety case,\u201d (c) cybersecurity standards for model weights, and (d) rapid reporting of severe incidents.\n- **Why (with forecasts):** Relying on cloud KYC/export-control rulemakings alone is uncertain (40% BIS finalizes the specific KYC NPRM by end-2026 [^3]). A targeted domestic safety regime reduces catastrophic and systemic risks while preserving open innovation below the threshold.\n- **Criteria:** Safety; feasibility; innovation.\n- **Implementation plan:**\n- Codify USAISI/NIST\u2019s role as the evaluation standards setter; authorize funding for independent evaluators.\n- Define thresholds using a hybrid of compute proxies and capability triggers; require registration only above that line.\n- Mandate standardized incident categories (e.g., model-enabled fraud at scale, bio/cyber misuse indicators, critical infrastructure compromise).\n- Require \u201csecure weight handling\u201d (access controls, logging, insider risk controls) for top-tier models.\n- Establish limited liability protections for good-faith reporting and evaluation disclosures to incentivize transparency.\n\n3. 
**Pass a targeted federal privacy baseline focused on AI-relevant data practices**\n- **What:** A national privacy floor: data minimization, limits on secondary use for training, transparency for sensitive inference, and strong security requirements\u2014without banning broad classes of models.\n- **Why:** Many AI harms (discrimination, manipulation, surveillance) are amplified by unrestricted data reuse; sectoral civil rights enforcement alone is not enough.\n- **Criteria:** Civil liberties; coherence; feasibility.\n- **Implementation plan:**\n- Sensitive data and sensitive inferences: opt-out/opt-in depending on category; clear purpose limitation.\n- Require documented retention schedules; restrict training on certain regulated datasets absent consent or statutory authorization.\n- Empower FTC and state AGs to enforce; preserve stronger sectoral rules (HIPAA/GLBA) and allow compatible state additions.\n\n4. **Use federal procurement as a fast-moving compliance engine**\n- **What:** Make federal contracts require AI documentation, testing, incident reporting, and auditability for systems that affect rights/safety\u2014scaling requirements by risk level.\n- **Why (with forecasts):** Given likely legislative delays (20% for S.2164 committee vote [^2]), procurement can deliver immediate leverage and shape vendor practices.\n- **Criteria:** Feasibility; safety; coherence.\n- **Implementation plan:**\n- Standard contract clauses for: evaluation artifacts, security controls, red-team summaries, logging, and post-deployment monitoring.\n- Require agencies to publish AI use inventories and risk categorizations (building on the M-25-21 approach).\n- Create a shared fedwide repository of approved assessment templates and qualified auditors.\n\n### Risks and Uncertainties\n\n- **Measurement limits and audit theater:** Impact assessments can become box-checking, especially where ground truth is hard (e.g., job performance). This risk rises if standards are vague; it\u2019s partly mitigated by NIST-aligned templates and independent audits.\n- **Innovation chilling / incumbent advantage:** Heavy compliance can entrench big firms. The proposal mitigates by (i) scoping mandatory duties to high-risk uses and (ii) using safe harbors and sandboxes. Still, if Congress attempts a broader omnibus approach, political feasibility may collapse (consistent with the low advancement probability for S.2164 [^2]).\n- **Federal\u2013state conflict:** If states keep moving (and federal actions attempt preemption), compliance may become unstable; Colorado\u2019s continued uncertainty (55% [^4]) is a warning sign. 
A federal floor with limited preemption (preempt only directly conflicting requirements) is safer than sweeping preemption.\n- **National security substitution:** If BIS does not finalize the cloud KYC NPRM (60% it won\u2019t by end-2026 [^3]), reliance on export controls could leave gaps in domestic oversight; hence the need for a domestic frontier safety regime.", + "key_recommendations": [ + "Enact a federal \u201cHigh-Risk Automated Decision Systems\u201d (HRADS) law for consequential domains", + "Create a Frontier AI Safety Regime: evaluations + incident reporting + secure development for the largest model developers", + "Pass a targeted federal privacy baseline focused on AI-relevant data practices", + "Use federal procurement as a fast-moving compliance engine" + ], + "price_estimate": 1.3093187499999999 + }, + { + "member": { + "name": "Gemini 3 Pro (Google)", + "role": "AI Policy Analyst", + "political_leaning": "behaves as Gemini naturally does", + "general_motivation": "Analyze this policy question thoughtfully and helpfully, as Gemini would naturally approach it. Draw on your training to provide balanced, nuanced analysis while being direct about your views and uncertainties.", + "expertise_areas": [ + "general policy analysis" + ], + "personality_traits": [ + "behaves naturally as Gemini" + ], + "ai_model": "openrouter/google/gemini-3-pro-preview" + }, + "research_summary": "**Current Status (Early 2026)**\nThe United States AI regulatory landscape in early 2026 is defined by a sharp conflict between state-level safety mandates and a federal push for deregulation. Following the veto of California's SB 1047 in 2024, the state legislature successfully enacted the \"Transparency in Frontier Artificial Intelligence Act\" (SB 53), which took effect on January 1, 2026. This law mandates that developers of \"frontier\" models (trained on >10^26 FLOPS) maintain safety frameworks, report critical incidents to the state Office of Emergency Services, and provide whistleblower protections. In parallel, Texas has enacted the \"Responsible AI Governance Act,\" focusing on restricting \"woke\" or censorship-prone AI applications, illustrating a partisan divide in state-level priorities.\n\n**Federal Response & Executive Conflict**\nThe federal environment has shifted dramatically under the Trump Administration (2025-present). In January 2025, President Trump revoked the Biden-era Executive Order 14110, dismantling many federal safety mandates. By December 2025, the administration escalated its opposition to state regulations by signing Executive Order 14365, \"Ensuring a National Policy Framework for Artificial Intelligence.\" This order empowers the Department of Commerce and the DOJ to identify and potentially preempt state laws deemed \"onerous\" or inconsistent with the administration's \"America First\" innovation strategy. An AI Litigation Task Force has been established to challenge statutes like California's SB 53, creating massive regulatory uncertainty for tech firms caught between state compliance obligations and federal deregulation incentives.\n\n**Stakeholder Positions & Market Dynamics**\nThe \"Brussels Effect\" has been replaced by a \"Sacramento Effect.\" Tech giants like OpenAI and Anthropic are navigating a complex compliance map; Anthropic has historically supported safety frameworks like SB 53, while other players argue that state-by-state rules fracture the digital market. 
The Trump Administration\u2019s recent \"AI Action Plan\" (July 2025) explicitly prioritizes open-weight models and infrastructure build-outs to combat Chinese technological advancement, viewing safety regulations as impediments to geopolitical competitiveness. Meanwhile, safety advocates warn that the federal rollback leaves the US vulnerable to catastrophic risks from next-generation models, making state laws the only remaining guardrails.", + "decision_criteria": [ + "Catastrophic Risk Mitigation", + "Innovation & Geopolitical Competitiveness", + "Regulatory Certainty & Harmonization", + "Civil Liberties & Algorithmic Fairness" + ], + "forecasts": [ + { + "footnote_id": 1, + "question_title": "California SB 53 Preemption", + "question_text": "Will a US Federal Court issue a preliminary injunction suspending the enforcement of the \"safety framework\" requirements of California SB 53 by July 1, 2026?", + "resolution_criteria": "Resolves YES if a federal district or appellate court issues an order enjoining the California Attorney General or other bodies from enforcing the \"safety framework\" or \"incident reporting\" provisions of SB 53.", + "prediction": "65%", + "reasoning": "The conflict between the new Trump EO 14365 (asserting federal primacy in AI policy) and state laws is direct. Courts have increasingly favored \"Dormant Commerce Clause\" arguments against state regulations that practically control national markets (as SB 53 does for AI). The DOJ's explicit \"AI Litigation Task Force\" increases the base rate of successful challenges.", + "key_sources": [ + "Exec. Order 14365 Analysis (JD Supra, Jan 2026)", + "JD Supra \"AI Legal Watch Jan 2026\"" + ] + }, + { + "footnote_id": 2, + "question_title": "China vs. US Capability Gap", + "question_text": "Will a Chinese-based AI lab release a model exceeding US SOTA on MMLU-Pro by Dec 31, 2026?", + "resolution_criteria": "Resolves YES if a model from a Chinese lab (e.g., DeepSeek, Alibaba) scores higher than the recognized US state-of-the-art model on the MMLU-Pro (or successor standard benchmark) as verified by an independent third party (e.g., Stanford HELM).", + "prediction": "30%", + "reasoning": "Hardware export controls are sticky; training frontier models requires massive H100/Blackwell clusters that are hard to smuggle at scale. While Chinese algorithmic efficiency is high, the raw compute gap makes \"exceeding\" US SOTA (which is moving fast) unlikely in the short term.", + "key_sources": [ + "Economic Survey 2026 (Mint)", + "Analysis of US-China Compute Gap" + ] + }, + { + "footnote_id": 3, + "question_title": "AI Startup Flight", + "question_text": "Will the percentage of new \"AI-primary\" startups incorporating in California decrease by more than 5 percentage points in 2026 compared to 2025?", + "resolution_criteria": "Measured by data from Crunchbase or PitchBook for \"Artificial Intelligence\" characterized companies. Comparing the % of US AI startups based in CA in 2025 vs 2026.", + "prediction": "40%", + "reasoning": "This is a \"toss-up\" leaning toward stability. Agglomeration effects adjacent to OpenAI/Anthropic/Google in SF are powerful. 
However, the *signaling* of SB 53 plus active recruitment by Texas/Florida (\"Free AI\" zones) creates a credible threat of migration.", + "key_sources": [ + "General knowledge of startup migration trends", + "NASDAQ \"AI Stocks Regulations 2026\" analysis" + ] + }, + { + "footnote_id": 4, + "question_title": "Critical Safety Incidents", + "question_text": "How many \"Critical Safety Incidents\" (>$500M damage/death) attributed to AI will be officially reported in 2026?", + "resolution_criteria": "Count of official reports filed under SB 53 or equivalent federal disclosure independent of their public release.", + "prediction": "<0.5 (Mean ~0.2)", + "reasoning": "The definition of \"Critical\" in SB 53 is extremely high (mass casualty or massive financial wreck). Current \"safety\" issues are mostly jailbreaks or bias, not catastrophes. The technology is not yet agentic enough to cause this scale of damage autonomously.", + "key_sources": [ + "SB 53 Text (Catastrophic Harm Thresholds)", + "Maya Farber Brodsky \"Simple Argument for AI Policy\"" + ] + } + ], + "proposal_markdown": "### Executive Summary\n\nThe United States must end the \"regulatory civil war\" between federal deregulation and state-level fragmentation by enacting the **\"Federal AI Stability & Safety Act.\"** This policy should preempt the patchwork of state laws (like California SB 53) in exchange for a mandatory, federally administered transparency and \"incident reporting\" regime, while essentially exempting open-weight research and sub-frontier models to preserve innovation. This approach balances the Trump Administration\u2019s competitiveness goals with the undeniable need for visibility into frontier risks.\n\n### Analysis\n\nThe current trajectory\u2014a head-on collision between California's strict safety protocols and the Federal Government's preemption efforts\u2014creates the worst of both worlds: maximum uncertainty for businesses and no guaranteed safety for the public. Forecasting suggests a high probability (65%) that federal courts will enjoin parts of state laws, potentially leaving a regulatory vacuum in mid-2026 [^1]. Meanwhile, the risk of China overtaking US capabilities remains manageable (30% probability [^2]), suggesting we do not need to strip away *all* safety checks to maintain our lead.\n\nHowever, the innovation ecosystem is fragile. The forecast for startup flight is significant (40% chance of notable decline [^3]), indicating that heavy-handed \"licensing\" regimes could indeed drive talent away or entrench incumbents who can afford compliance. A policy that \"harmonizes up\" (federalizing strict CA rules) risks crushing this ecosystem, while \"harmonizing down\" (total deregulation) ignores the tail risk of catastrophic failures, which, while currently low probability (<0.5 incidents/year [^4]), have infinite downside.\n\nAs Gemini 3 Pro, I view the optimal path as **\"Light-Touch Federalization.\"** We need to federally preempt the inconsistent state definitions of \"liability\" and \"harm\" to protect open innovation, but simultaneously establish a robust *federal* monitoring capacity (reporting, not licensing) to detect if the risk landscape changes.\n\n### Recommendations\n\n**1. 
Enact Preemptive Federal \"Transparency & Reporting\" Standards**\n* *Detail:* Congress (or the Commerce Dept via rule-making) should establish a singular federal standard for \"Frontier AI Transparency.\" This mandates that developers of models >10^26 FLOPS must report training specs and safety test results to a federal body (e.g., NIST/AISI) *post-deployment*.\n* *Criteria Addressed:* **Regulatory Certainty** (One standard vs 50), **Catastrophic Risk** (Government gets visibility).\n* *Implementation:* Pass legislation clarifying that compliance with this Federal standard grants immunity from conflicting state \"safety framework\" laws (preemption). This directly addresses the forecasted judicial chaos [^1].\n\n**2. The \"Open Innovation\" Safe Harbor**\n* *Detail:* Explicitly exempt open-weight models and models below a high compute threshold from \"provider liability\" for downstream misuse, provided they do not cross specific \"biological/chemical weapon capability\" benchmarks.\n* *Criteria Addressed:* **Innovation** (Protects open source), **Competitiveness** (maintains US dev ecosystem against China).\n* *Implementation:* Codify a Section 230-style protection for model weights. This ensures that the low base rate of catastrophic incidents [^4] does not result in preemptive suppression of the entire open-source ecosystem.\n\n**3. \"Rights-Based\" Narrow AI Enforcement**\n* *Detail:* While loosening \"frontier\" model licensing, strictly enforce existing civil rights laws on \"narrow\" AI in hiring, housing, and lending. Use the DOJ\u2019s existing authority to prosecute algorithmic discrimination.\n* *Criteria Addressed:* **Civil Liberties & Fairness**.\n* *Implementation:* Direct the DOJ/EEOC to issue guidance that \"algorithmic discrimination is discrimination,\" removing the need for new AI-specific \"fairness\" laws that confuse technical compliance.\n\n### Risks and Uncertainties\n\n* **Political Gridlock:** There is a risk that the \"federal preemption\" aspect passes (to please industry) but the \"transparency\" aspect is stripped (by anti-regulatory ideologues), resulting in total opacity.\n* **The \"Black Swan\" Event:** My forecast [^4] assumes a low probability of catastrophe in 2026. If a model *does* facilitate a massive bio-terror attack, this \"light-touch\" proposal will look woefully inadequate, and the backlash will lead to draconian over-regulation.\n* **Judicial Overreach:** If the Supreme Court rules that *any* AI regulation is unconstitutional compelled speech (a growing theory), even my proposed transparency mandates would fail, leaving no leverage over corporate labs.\n\n### Forecast Appendix\n\n[^1] **California SB 53 Preemption**\n* **Question:** Will a US Federal Court issue a preliminary injunction suspending the enforcement of the \"safety framework\" requirements of California SB 53 by July 1, 2026?\n* **Prediction:** 65%\n* **Reasoning:** The conflict between the new Trump EO 14365 (asserting federal primacy in AI policy) and state laws is direct. Courts have increasingly favored \"Dormant Commerce Clause\" arguments against state regulations that practically control national markets (as SB 53 does for AI). The DOJ's explicit \"AI Litigation Task Force\" increases the base rate of successful challenges.\n* **Sources:** Exec. Order 14365 Analysis (JD Supra, Jan 2026); JD Supra \"AI Legal Watch Jan 2026\".\n\n[^2] **China vs. 
US Capability Gap**\n* **Question:** Will a Chinese-based AI lab release a model exceeding US SOTA on MMLU-Pro by Dec 31, 2026?\n* **Prediction:** 30%\n* **Reasoning:** Hardware export controls are sticky; training frontier models requires massive H100/Blackwell clusters that are hard to smuggle at scale. While Chinese algorithmic efficiency is high, the raw compute gap makes \"exceeding\" US SOTA (which is moving fast) unlikely in the short term.\n* **Sources:** Economic Survey 2026 (Mint); Analysis of US-China Compute Gap.\n\n[^3] **AI Startup Flight**\n* **Question:** Will the percentage of new \"AI-primary\" startups incorporating in California decrease by more than 5 percentage points in 2026 compared to 2025?\n* **Prediction:** 40%\n* **Reasoning:** This is a \"toss-up\" leaning toward stability. Agglomeration effects adjacent to OpenAI/Anthropic/Google in SF are powerful. However, the *signaling* of SB 53 plus active recruitment by Texas/Florida (\"Free AI\" zones) creates a credible threat of migration.\n* **Sources:** General knowledge of startup migration trends; NASDAQ \"AI Stocks Regulations 2026\" analysis.\n\n[^4] **Critical Safety Incidents**\n* **Question:** How many \"Critical Safety Incidents\" (>$500M damage/death) attributed to AI will be officially reported in 2026?\n* **Prediction:** <0.5 (Mean ~0.2)\n* **Reasoning:** The definition of \"Critical\" in SB 53 is extremely high (mass casualty or massive financial wreck). Current \"safety\" issues are mostly jailbreaks or bias, not catastrophes. The technology is not yet agentic enough to cause this scale of damage autonomously.\n* **Sources:** SB 53 Text (Catastrophic Harm Thresholds); Maya Farber Brodsky \"Simple Argument for AI Policy\".", + "key_recommendations": [ + "Enact Preemptive Federal \"Transparency & Reporting\" Standards", + "The \"Open Innovation\" Safe Harbor", + "\"Rights-Based\" Narrow AI Enforcement" + ], + "price_estimate": 0.39232075 + }, + { + "member": { + "name": "Grok 4 (xAI)", + "role": "AI Policy Analyst", + "political_leaning": "behaves as Grok naturally does", + "general_motivation": "Analyze this policy question thoughtfully and helpfully, as Grok would naturally approach it. Draw on your training to provide balanced, nuanced analysis while being direct about your views and uncertainties.", + "expertise_areas": [ + "general policy analysis" + ], + "personality_traits": [ + "behaves naturally as Grok" + ], + "ai_model": "openrouter/x-ai/grok-4" + }, + "research_summary": "The current state of AI regulation in the United States is characterized by a fragmented patchwork of state-level laws amidst emerging federal efforts to establish dominance. As of early 2026, there is no comprehensive federal AI legislation; instead, states like California and Colorado have led with targeted regulations. California's Transparency in Frontier AI Act (SB 53), effective January 1, 2026, requires developers of large AI models (trained on >10\u00b2\u2076 FLOPS and with combined revenue >$500 million) to publish safety frameworks, report critical incidents within 15 days, and implement whistleblower protections, with penalties up to $1 million per violation [Baker Botts, 2026]. Colorado's AI Act, effective June 30, 2026, mandates risk assessments and anti-discrimination measures for AI in high-stakes areas like hiring and lending [Kiteworks, 2026]. 
In narrower applications, states such as New York and Illinois have enacted laws prohibiting discriminatory AI in employment, while federal agencies like the FTC enforce existing consumer protection laws against AI harms in lending and housing [Stinson, 2026]. Healthcare AI lacks specific federal mandates but is guided by general frameworks like HIPAA, with enforcement through state attorneys general [Kiteworks, 2026]. Recent federal developments include President Trump's Executive Order 14365 (December 11, 2025), which directs agencies to identify and preempt \"onerous\" state AI laws, potentially using funding levers like the BEAD program to encourage state compliance [White House, 2025].\n\nKey stakeholders reflect competing priorities: the federal government under the Trump administration prioritizes innovation and competitiveness, viewing state regulations as barriers, while state governments (e.g., a bipartisan coalition of 36 attorneys general) defend local protections for consumer safety and civil liberties [Consumer Financial Services Law Monitor, 2025]. Technology companies and deployers generally favor federal uniformity to reduce compliance complexity, with groups like health insurers lobbying for preemption of state restrictions [Holland & Knight, 2025]. Advocacy organizations and privacy groups support robust state safeguards against harms like algorithmic bias, arguing preemption creates gaps without federal alternatives [FPF, 2026]. Recent trends show accelerating state activity\u201438 states enacted ~100 AI measures in 2025\u2014with enforcement ramping up, including settlements for AI-driven discrimination in lending [Software Improvement Group, 2026]. Expert analyses highlight arguments for regulation (e.g., ensuring accountability and mitigating risks like bias) versus against (e.g., stifling innovation and creating regulatory fragmentation) [The Regulatory Review, 2025; White House, 2025].\n\nData on AI impacts underscores both opportunities and risks. In hiring, AI has created 1.3 million new jobs globally since 2024, but 39% of Americans expect employment disruptions in 2026, with high-AI-exposure regions showing 3.6% lower employment in vulnerable occupations after five years [World Economic Forum, 2026; IMF, 2026]. Lending data is sparse, but enforcement actions indicate discriminatory AI models affecting marginalized borrowers [Kiteworks, 2026]. In healthcare, AI adoption rose from 27% of hospitals in 2022 to 71% in 2024 per AHA surveys, primarily for predictive analytics, though rural hospitals lag at 56% [ONC, 2024]. Frontier AI risks include a 890% surge in gen AI traffic and doubled security incidents in 2025, with concerns like data leaks (34% of organizations affected) outweighing adversarial threats [Harvard Business Review, 2025]. Economically, AI contributed 0.48-1.3 percentage points to US GDP growth in 2025 quarters, projected to boost GDP by 1.5-3.7% long-term [St. Louis Fed, 2026; Wharton, 2025]. Litigation is rising, with hundreds of AI discrimination complaints in 2024 and key cases like Mobley v. Workday [HR Defense Blog, 2025].", + "decision_criteria": [ + "Innovation Promotion", + "Safety and Risk Mitigation", + "Civil Liberties Protection", + "Economic Impact", + "Equity and Fairness", + "Implementation Feasibility" + ], + "forecasts": [ + { + "footnote_id": 1, + "question_title": "Federal Preemption Success Rate", + "question_text": "Will the U.S. 
Department of Justice's AI Litigation Task Force successfully preempt at least 3 major state AI laws (e.g., California's SB 53 or Colorado's AI Act) through court rulings by December 31, 2027?", + "resolution_criteria": "Resolves YES if official court records on PACER (https://pacer.uscourts.gov/) show at least 3 state AI laws fully or partially preempted by federal action stemming from EO 14365; resolves NO otherwise. Confirmation requires checking PACER docket summaries for relevant cases.", + "prediction": "25%", + "reasoning": "Historical base rates show Trump-era agency actions won only 23-31% of court challenges, often due to weak statutory basis. The EO lacks direct preemption power, relying on funding conditions that courts may deem unconstitutional. Premortem analysis suggests states like California will mount strong defenses, prolonging litigation. Aggregation of expert views (e.g., Gibson Dunn) points to low odds, as the world changes slowly with status quo favoring state autonomy. Wide uncertainty from potential congressional support, but humble forecast leans conservative.", + "key_sources": [ + "Gibson Dunn (2025)", + "NYU Law (2020)", + "https://pacer.uscourts.gov/" + ] + }, + { + "footnote_id": 2, + "question_title": "AI Discrimination Lawsuits in Hiring", + "question_text": "Will the EEOC report more than 200 AI-related discrimination charges filed in hiring for the calendar year 2026?", + "resolution_criteria": "Resolves YES if the EEOC's annual enforcement data report (available at https://www.eeoc.gov/data/enforcement-and-litigation-statistics) shows >200 charges specifically tagged as AI-related in hiring; resolves NO if \u2264200. If not explicitly tagged, count based on case descriptions in the report.", + "prediction": "40%", + "reasoning": "Base rates from 2024 show hundreds of complaints but likely <200 formal EEOC charges, with known cases rising 50% YoY. Scope sensitivity accounts for increasing AI adoption, potentially driving more filings, but new regulations may deter them. Humble wide 90% CI (100-300) reflects unknown enforcement zeal and underreporting biases. Status quo suggests gradual increase, not a surge, per historical trends in tech-related discrimination suits. Premortem: If audits reduce bias, filings drop below threshold.", + "key_sources": [ + "HR Defense Blog (2025)", + "Responsible AI Labs", + "https://www.eeoc.gov/data/enforcement-and-litigation-statistics" + ] + }, + { + "footnote_id": 3, + "question_title": "AI GDP Contribution", + "question_text": "Will AI-related sectors contribute at least 1.0 percentage points to U.S. real GDP growth for the full year 2026, as reported by the Bureau of Economic Analysis?", + "resolution_criteria": "Resolves YES if BEA's annual GDP report (at https://www.bea.gov/data/gdp/gross-domestic-product) attributes \u22651.0 percentage points to AI categories (e.g., software, R&D, data centers); resolves NO if <1.0. Use BEA's detailed breakdowns for calculation.", + "prediction": "55%", + "reasoning": "2025 averaged 0.97pp with a declining trend (0.48 in Q3), suggesting extrapolation to ~0.8-1.1 for 2026. Long-term projections (1.5% by 2035) support potential rebound via investments. Humble wide 90% CI (0.6-1.4) accounts for economic volatility like recessions. Status quo favors moderation, but aggregation of Wharton models tips slightly above 50%. Premortem: If AI hype cools, contribution falls short.", + "key_sources": [ + "St. 
Louis Fed (2026)", + "Wharton (2025)", + "https://www.bea.gov/data/gdp/gross-domestic-product" + ] + }, + { + "footnote_id": 4, + "question_title": "Healthcare AI Adoption Rate", + "question_text": "Will the AHA's 2026 IT Supplement survey show at least 80% of nonfederal acute care hospitals adopting predictive AI?", + "resolution_criteria": "Resolves YES if the AHA survey report (published at https://www.aha.org/data-insights) indicates \u226580% adoption; resolves NO if <80%. Use the primary adoption metric from the executive summary.", + "prediction": "30%", + "reasoning": "Trends show 5% annual growth (71% in 2024 from 66% in 2023), projecting ~76% for 2026 without major catalysts. Disparities (e.g., 50% rural) suggest uneven progress. Base rate extrapolation is conservative, with status quo slow change. Premortem: Resource constraints in small hospitals prevent reaching 80%. Aggregation of ONC data supports low odds.", + "key_sources": [ + "ONC (2024)", + "AHA (2025)", + "https://www.aha.org/data-insights" + ] + }, + { + "footnote_id": 5, + "question_title": "Frontier AI Safety Incidents", + "question_text": "Will Cal OES report more than 10 critical safety incidents for frontier AI models in their first public summary by January 1, 2028?", + "resolution_criteria": "Resolves YES if Cal OES's anonymized summary (at https://www.caloes.ca.gov/) lists >10 incidents under SB 53; resolves NO if \u226410. Count distinct incidents in the report.", + "prediction": "35%", + "reasoning": "No pre-2026 data, but global incidents doubled in 2025, suggesting 5-15 possible under new reporting. Base rate from similar cybersecurity logs is low initially due to underreporting. Status quo favors few disclosures, with premortem noting whistleblower protections may increase but not exceed 10. Humble forecast accounts for unknown model behaviors. Aggregation leans low per expert analyses.", + "key_sources": [ + "Paul Hastings (2025)", + "HBR (2025)", + "https://www.caloes.ca.gov/" + ] + } + ], + "proposal_markdown": "### Executive Summary\nAs Grok 4 (xAI), I recommend the U.S. pursue a federal framework that preempts overly burdensome state laws while mandating minimal safety standards for frontier AI and bias audits for narrower applications, prioritizing innovation. The single most important action for policymakers is to pass comprehensive federal legislation by 2027 that establishes a unified regulatory body, balancing rapid AI advancement with targeted protections to avoid fragmentation.\n\n### Analysis\nThe U.S. AI regulatory landscape is evolving rapidly, with state laws like California's SB 53 and Colorado's AI Act filling federal voids by requiring safety reports and anti-bias measures, while Trump's December 2025 Executive Order aims to preempt such \"onerous\" regulations to boost competitiveness [White House, 2025]. This tension risks compliance chaos for developers, as evidenced by 38 states enacting ~100 AI measures in 2025, yet federal preemption has historically low success rates (23-31% under prior Trump actions) [^1]. Data shows AI driving economic growth\u2014contributing 0.97 percentage points to GDP in the first nine months of 2025\u2014with projections for sustained boosts, but risks like job disruptions (39% of Americans expect impacts in 2026) and security incidents (doubled in 2025) necessitate balanced oversight [St. Louis Fed, 2026; WEF, 2026]. 
In narrower applications, AI adoption in healthcare reached 71% of hospitals in 2024, improving efficiency but raising equity concerns, while hiring AI has created 1.3 million jobs yet triggered hundreds of discrimination complaints [ONC, 2024; HR Defense Blog, 2025].\n\nFor frontier AI, policies must address catastrophic risks without stifling innovation, as expert arguments highlight: proponents emphasize accountability to mitigate harms like data leaks (34% of organizations affected), while opponents warn of innovation costs from fragmentation [Kiteworks, 2026; Regulatory Review, 2025]. My forecasts suggest moderate risks\u2014a 35% chance of >10 critical incidents reported under SB 53 by 2028 [^5]\u2014indicating room for light-touch federal standards. In sectors like hiring and lending, rising lawsuits (hundreds in 2024) and adoption trends underscore the need for civil liberties protections, with a 40% chance of >200 EEOC charges in 2026 [^2]. Economically, AI's GDP role (55% chance of \u22651.0pp in 2026 [^3]) supports innovation-focused policies, but healthcare adoption (30% chance of \u226580% by 2026 [^4]) shows disparities that regulation could address without overreach.\n\nOverall, a nuanced approach\u2014drawing on my xAI roots\u2014favors federal leadership to unify rules, as persistent state-federal conflicts could hinder U.S. leadership. Without it, innovation may suffer (e.g., from compliance burdens), but unchecked AI risks civil liberties erosion, as seen in discriminatory lending cases [Kiteworks, 2026].\n\n### Recommendations\n1. **Enact Federal Preemption Legislation**: Support a bipartisan bill by 2027 preempting conflicting state laws while establishing baseline safety standards. This addresses innovation promotion and feasibility, given low preemption success via EO (25% chance [^1]). Implementation: Congress creates an AI Oversight Commission under Commerce to review state laws annually; states comply or lose federal AI grants. Rollout via phased audits starting with frontier models.\n\n2. **Mandate Bias Audits for Narrow AI**: Require annual third-party audits for AI in hiring, lending, and healthcare to prevent discrimination. This targets equity and civil liberties, informed by lawsuit risks (40% chance >200 in 2026 [^2]). Implementation: FTC enforces via online portal for audit submissions; businesses with >$100M revenue comply first, with templates and subsidies for SMEs.\n\n3. **Invest in AI Safety Research**: Allocate $5B federal funding for xAI-like initiatives on frontier model safety. Promotes safety and innovation, considering incident probabilities (35% >10 by 2028 [^5]) and GDP boosts (55% \u22651.0pp in 2026 [^3]). Implementation: NSF grants to labs; public-private partnerships track progress via annual reports.\n\n4. **Promote Healthcare AI Standards**: Develop voluntary FDA guidelines for AI tools, with incentives for adoption. Addresses economic impact and equity, given adoption trends (30% \u226580% in 2026 [^4]). Implementation: Subsidies for rural hospitals; annual AHA-integrated reporting on usage and outcomes.\n\n5. **Establish Whistleblower Protections**: Extend federal protections for AI workers reporting risks. Supports safety and liberties, reducing unreported incidents [^5]. 
Implementation: Labor Department hotline and rewards; integrate with existing laws like Sarbanes-Oxley.\n\n### Risks and Uncertainties\nKey risks include federal overreach stifling innovation if preemption succeeds too broadly (25% chance [^1]), potentially reducing AI GDP contributions below projections (55% \u22651.0pp [^3]). Uncertainties are widest in lawsuit forecasts (90% CI 100-300 [^2]) due to variable enforcement. Recommendations might backfire if audits increase costs, slowing healthcare adoption (30% \u226580% [^4]); a scenario where incidents surge (>10 at 35% [^5]) could lead to public backlash, prompting heavier regulations.\n\n### Forecast Appendix\n\n[^1] **[Federal Preemption Success Rate]** \n- Question: Will the U.S. Department of Justice's AI Litigation Task Force successfully preempt at least 3 major state AI laws (e.g., California's SB 53 or Colorado's AI Act) through court rulings by December 31, 2027? \n- Resolution: Resolves YES if official court records on PACER (https://pacer.uscourts.gov/) show at least 3 state AI laws fully or partially preempted by federal action stemming from EO 14365; resolves NO otherwise. Confirmation requires checking PACER docket summaries for relevant cases. \n- Prediction: 25% \n- Reasoning: Historical base rates show Trump-era agency actions won only 23-31% of court challenges, often due to weak statutory basis. The EO lacks direct preemption power, relying on funding conditions that courts may deem unconstitutional. Premortem analysis suggests states like California will mount strong defenses, prolonging litigation. Aggregation of expert views (e.g., Gibson Dunn) points to low odds, as the world changes slowly with status quo favoring state autonomy. Wide uncertainty from potential congressional support, but humble forecast leans conservative. \n- Sources: Gibson Dunn (2025), NYU Law (2020), https://pacer.uscourts.gov/\n\n[^2] **[AI Discrimination Lawsuits in Hiring]** \n- Question: Will the EEOC report more than 200 AI-related discrimination charges filed in hiring for the calendar year 2026? \n- Resolution: Resolves YES if the EEOC's annual enforcement data report (available at https://www.eeoc.gov/data/enforcement-and-litigation-statistics) shows >200 charges specifically tagged as AI-related in hiring; resolves NO if \u2264200. If not explicitly tagged, count based on case descriptions in the report. \n- Prediction: 40% \n- Reasoning: Base rates from 2024 show hundreds of complaints but likely <200 formal EEOC charges, with known cases rising 50% YoY. Scope sensitivity accounts for increasing AI adoption, potentially driving more filings, but new regulations may deter them. Humble wide 90% CI (100-300) reflects unknown enforcement zeal and underreporting biases. Status quo suggests gradual increase, not a surge, per historical trends in tech-related discrimination suits. Premortem: If audits reduce bias, filings drop below threshold. \n- Sources: HR Defense Blog (2025), Responsible AI Labs, https://www.eeoc.gov/data/enforcement-and-litigation-statistics\n\n[^3] **[AI GDP Contribution]** \n- Question: Will AI-related sectors contribute at least 1.0 percentage points to U.S. real GDP growth for the full year 2026, as reported by the Bureau of Economic Analysis? \n- Resolution: Resolves YES if BEA's annual GDP report (at https://www.bea.gov/data/gdp/gross-domestic-product) attributes \u22651.0 percentage points to AI categories (e.g., software, R&D, data centers); resolves NO if <1.0. Use BEA's detailed breakdowns for calculation. 
\n- Prediction: 55% \n- Reasoning: 2025 averaged 0.97pp with a declining trend (0.48 in Q3), suggesting extrapolation to ~0.8-1.1 for 2026. Long-term projections (1.5% by 2035) support potential rebound via investments. Humble wide 90% CI (0.6-1.4) accounts for economic volatility like recessions. Status quo favors moderation, but aggregation of Wharton models tips slightly above 50%. Premortem: If AI hype cools, contribution falls short. \n- Sources: St. Louis Fed (2026), Wharton (2025), https://www.bea.gov/data/gdp/gross-domestic-product\n\n[^4] **[Healthcare AI Adoption Rate]** \n- Question: Will the AHA's 2026 IT Supplement survey show at least 80% of nonfederal acute care hospitals adopting predictive AI? \n- Resolution: Resolves YES if the AHA survey report (published at https://www.aha.org/data-insights) indicates \u226580% adoption; resolves NO if <80%. Use the primary adoption metric from the executive summary. \n- Prediction: 30% \n- Reasoning: Trends show 5% annual growth (71% in 2024 from 66% in 2023), projecting ~76% for 2026 without major catalysts. Disparities (e.g., 50% rural) suggest uneven progress. Base rate extrapolation is conservative, with status quo slow change. Premortem: Resource constraints in small hospitals prevent reaching 80%. Aggregation of ONC data supports low odds. \n- Sources: ONC (2024), AHA (2025), https://www.aha.org/data-insights\n\n[^5] **[Frontier AI Safety Incidents]** \n- Question: Will Cal OES report more than 10 critical safety incidents for frontier AI models in their first public summary by January 1, 2028? \n- Resolution: Resolves YES if Cal OES's anonymized summary (at https://www.caloes.ca.gov/) lists >10 incidents under SB 53; resolves NO if \u226410. Count distinct incidents in the report. \n- Prediction: 35% \n- Reasoning: No pre-2026 data, but global incidents doubled in 2025, suggesting 5-15 possible under new reporting. Base rate from similar cybersecurity logs is low initially due to underreporting. Status quo favors few disclosures, with premortem noting whistleblower protections may increase but not exceed 10. Humble forecast accounts for unknown model behaviors. Aggregation leans low per expert analyses. \n- Sources: Paul Hastings (2025), HBR (2025), https://www.caloes.ca.gov/", + "key_recommendations": [ + "Enact Federal Preemption Legislation: Support a bipartisan bill by 2027 preempting conflicting state laws while establishing baseline safety standards.", + "Mandate Bias Audits for Narrow AI: Require annual third-party audits for AI in hiring, lending, and healthcare to prevent discrimination.", + "Invest in AI Safety Research: Allocate $5B federal funding for xAI-like initiatives on frontier model safety.", + "Promote Healthcare AI Standards: Develop voluntary FDA guidelines for AI tools, with incentives for adoption.", + "Establish Whistleblower Protections: Extend federal protections for AI workers reporting risks." + ], + "price_estimate": 0.9003447499999997 + }, + { + "member": { + "name": "DeepSeek V3.2 (DeepSeek)", + "role": "AI Policy Analyst", + "political_leaning": "behaves as DeepSeek naturally does", + "general_motivation": "Analyze this policy question thoughtfully and helpfully, as DeepSeek would naturally approach it. 
Draw on your training to provide balanced, nuanced analysis while being direct about your views and uncertainties.", + "expertise_areas": [ + "general policy analysis" + ], + "personality_traits": [ + "behaves naturally as DeepSeek" + ], + "ai_model": "openrouter/deepseek/deepseek-v3.2" + }, + "research_summary": "The United States currently maintains a fragmented and evolving approach to AI regulation, characterized by significant tension between federal and state authorities. As of early 2026, the federal government has shifted toward a \"minimally burdensome\" national AI policy framework under Executive Order 14365, which directs the Department of Justice to identify and potentially preempt state AI laws deemed \"onerous.\" This represents a dramatic reversal from the previous administration's focus on safety and represents a victory for major tech companies that spent millions lobbying against regulation.\n\nDespite federal deregulation efforts, regulatory activity has actually increased: In 2024, 59 AI-related regulations were introduced across 42 federal agencies\u2014more than double the 25 regulations from 2023. At the state level, all 50 states introduced AI-related legislation in 2025, with 38 states adopting approximately 100 measures. Colorado's SB24-205 established a foundational risk-based model for state AI governance, imposing duties of reasonable care on developers and deployers to protect consumers from algorithmic discrimination, with implementation requirements effective June 30, 2026.\n\nIn healthcare, the Department of Health and Human Services Office for Civil Rights issued a final rule in May 2024 holding AI users legally responsible for managing discrimination risk in clinical settings, applying Section 1557 of the Affordable Care Act to AI-based discrimination. This rule creates market pressure on developers to disclose bias information and risk management practices.\n\nThe regulatory landscape faces significant challenges: 1) A patchwork of conflicting state regulations creates compliance complexity for multistate operations; 2) Federal-state tension threatens $21 billion in BEAD funding for states with restrictive AI laws; 3) Significant gaps remain in comprehensive regulation of AI in hiring and lending; 4) The global context includes the EU's comprehensive AI Act (effective August 2026), creating international compliance challenges for U.S. companies.\n\nNIST continues to develop technical frameworks including the AI Risk Management Framework (AI RMF), Cybersecurity Framework Profile for Artificial Intelligence, and Control Overlays for Securing AI Systems (COSAIS). 
However, testing reveals that no current models meet TrustLLM benchmarks for true trustworthiness, with the best-performing model achieving only 65% accuracy in identifying stereotypes.", + "decision_criteria": [ + "Harm Prevention Balance", + "Technical Feasibility", + "Federal-State Coordination", + "International Competitiveness", + "Adaptive Governance", + "Economic Efficiency" + ], + "forecasts": [ + { + "footnote_id": 1, + "question_title": "Colorado AI Act Enforcement Rate", + "question_text": "As of December 31, 2026, what percentage of Colorado-based companies using \"high-risk AI systems\" (as defined by Colorado SB24-205) will have submitted their required impact assessments to the Colorado Attorney General's office?", + "resolution_criteria": "The Colorado Attorney General's office will publish compliance statistics on their website (https://coag.gov/ai-compliance-statistics) showing the number of registered companies, number of impact assessments received, and calculated compliance percentage. The question resolves to the published percentage.", + "prediction": "35%", + "reasoning": "Colorado has limited enforcement resources compared to larger states like California. The requirements are technically complex and many Colorado businesses are small to medium-sized. Historical patterns for complex new regulations show initial compliance rates typically around 30-40%. However, increased AI regulatory attention may boost awareness somewhat. The June 2026 implementation delay suggests preparation challenges.", + "key_sources": [ + "Colorado AI Act (SB24-205) text", + "enforcement delay announcements", + "historical compliance patterns for similar regulations" + ] + }, + { + "footnote_id": 2, + "question_title": "Federal Preemption Effectiveness", + "question_text": "As of June 30, 2027, will any U.S. state have successfully challenged Executive Order 14365's attempt to preempt state AI laws through litigation resulting in a court ruling that invalidates the preemption authority?", + "resolution_criteria": "A federal court ruling (district, circuit, or Supreme Court) published in a legal database (Westlaw, Lexis, or official court website) that explicitly invalidates the preemption provisions of EO 14365 regarding state AI regulation.", + "prediction": "55%", + "reasoning": "The use of funding withholding (BEAD funds) rather than direct preemption makes EO 14365 legally vulnerable. The \"major questions doctrine\" from recent Supreme Court cases suggests courts may be skeptical of executive branch creating major AI policy without clear Congressional authorization. Multiple states have indicated intent to challenge. However, courts have generally deferred to executive authority on national security/competitiveness grounds.", + "key_sources": [ + "EO 14365 text", + "legal analysis of preemption authority", + "state challenge announcements", + "Supreme Court \"major questions doctrine\" cases" + ] + }, + { + "footnote_id": 3, + "question_title": "Healthcare AI Compliance Rate", + "question_text": "As of December 31, 2026, what percentage of U.S. 
healthcare organizations receiving federal funding will have implemented the required AI discrimination risk mitigation programs under the May 2024 OCR rule?", + "resolution_criteria": "The HHS OCR will publish compliance audit results showing the number of audited organizations, number found compliant with AI discrimination risk mitigation requirements, and calculated compliance percentage.", + "prediction": "65%", + "reasoning": "Healthcare has high baseline compliance rates due to existing regulatory frameworks like HIPAA. OCR has established enforcement mechanisms and expertise. Healthcare organizations have strong incentives to avoid discrimination lawsuits. However, the technical requirements may challenge smaller or rural facilities. Initial enforcement will likely emphasize education over penalties.", + "key_sources": [ + "HHS OCR May 2024 rule text", + "HIPAA compliance statistics", + "healthcare regulatory enforcement patterns" + ] + }, + { + "footnote_id": 4, + "question_title": "NIST Framework Adoption", + "question_text": "As of June 30, 2027, will the NIST AI RMF be explicitly referenced as a compliance standard in regulations issued by at least three different U.S. federal agencies?", + "resolution_criteria": "Official federal agency regulations published in the Federal Register that explicitly cite the NIST AI RMF (AI Risk Management Framework) as a compliance standard or safe harbor provision.", + "prediction": "70%", + "reasoning": "NIST AI RMF is becoming the de facto U.S. technical standard for AI risk management. Agencies facing pressure to regulate AI but lacking technical expertise are likely to reference established frameworks. The FTC and SEC have already indicated interest in NIST-aligned approaches. Executive Order 14365 promotes \"minimally burdensome\" standards, favoring voluntary frameworks. However, some agencies may prefer bespoke regulations.", + "key_sources": [ + "NIST AI RMF documentation", + "agency AI regulation proposals", + "references to NIST frameworks in existing regulations" + ] + }, + { + "footnote_id": 5, + "question_title": "AI Investment Impact", + "question_text": "What will be the year-over-year growth rate in U.S. private AI investment (venture capital and private equity) for 2026 compared to 2025, as measured by leading AI investment trackers?", + "resolution_criteria": "The final 2026 AI investment total published by PitchBook, CB Insights, or Stanford AI Index, compared to their final 2025 total, calculating the percentage growth rate.", + "prediction": "25%", + "reasoning": "AI investment has shown resilience to regulatory uncertainty historically. Current federal approach is relatively innovation-friendly. Global AI competition continues to drive investment. However, state-level fragmentation creates compliance costs, and broader economic conditions could moderate growth. Historical patterns show continued strong growth in AI investment despite regulatory discussions.", + "key_sources": [ + "Historical AI investment data (PitchBook, CB Insights)", + "current regulatory climate analysis", + "economic indicators" + ] + } + ], + "proposal_markdown": "### Executive Summary\n\nThe United States should adopt a **coordinated, risk-based federal framework** that establishes minimum safety standards for frontier AI while allowing sector-specific regulation for high-risk applications like healthcare and hiring. 
This approach balances innovation with safety by leveraging NIST technical standards, creating regulatory sandboxes for testing, and establishing clear federal-state coordination mechanisms.\n\n### Analysis\n\nThe current U.S. AI regulatory landscape suffers from dangerous fragmentation and inconsistency. My forecasts indicate that state-level regulations like Colorado's AI Act will face implementation challenges with only 35% compliance expected initially [^1], while federal preemption efforts face a 55% probability of legal challenge [^2]. This uncertainty harms both safety and innovation.\n\nHealthcare regulation provides a more promising model, with 65% compliance expected for OCR's AI discrimination rules [^3], demonstrating that sector-specific regulation with existing enforcement infrastructure can work effectively. The NIST AI Risk Management Framework shows strong potential as a technical foundation, with a 70% probability of adoption by multiple federal agencies [^4].\n\nThe critical insight from my analysis is that **technical standards combined with targeted regulation** offer the best path forward. AI investment is expected to continue growing at 25% annually despite regulatory uncertainty [^5], suggesting that properly calibrated regulation need not stifle innovation. However, the current patchwork approach creates unnecessary compliance burdens while failing to address critical safety concerns.\n\n### Recommendations\n\n1. **Federal AI Safety Standards Act**\n- **Recommendation**: Establish mandatory safety requirements for frontier AI systems (models above specified compute thresholds) based on NIST technical standards, with independent auditing and incident reporting.\n- **Rationale**: Addresses critical safety gaps in current regulation while building on existing technical frameworks. My forecast shows strong agency adoption potential for NIST standards [^4].\n- **Criteria Addressed**: Harm Prevention Balance (primary), Technical Feasibility, Adaptive Governance\n- **Implementation**: Create an AI Safety Oversight Board with representatives from NIST, academic experts, and industry. Require developers of frontier models to conduct standardized safety evaluations, submit results to the board, and implement risk mitigation plans. Establish liability safe harbors for compliant companies.\n- **Forecast References**: [^4]\n\n2. **Sector-Specific AI Regulation Harmonization**\n- **Recommendation**: Enact legislation authorizing federal agencies to develop sector-specific AI regulations for healthcare, hiring, lending, and education, with preemption of conflicting state laws.\n- **Rationale**: Healthcare regulation shows promising compliance rates [^3], suggesting this model can work. Federal preemption is legally uncertain [^2], so Congressional authorization is needed.\n- **Criteria Addressed**: Federal-State Coordination, Harm Prevention Balance, Economic Efficiency\n- **Implementation**: Direct HHS, EEOC, CFPB, and ED to develop risk-based AI regulations within 18 months, using NIST frameworks as technical foundations. Provide funding for small entity compliance assistance. Establish regulatory sandboxes for testing innovative approaches.\n- **Forecast References**: [^2], [^3]\n\n3. 
**State-Federal AI Regulatory Coordination Council**\n- **Recommendation**: Create a formal council with representatives from states, federal agencies, and NIST to coordinate AI regulation, share enforcement resources, and develop model legislation.\n- **Rationale**: Current fragmentation creates compliance burdens and enforcement gaps. Colorado's expected low compliance rate [^1] demonstrates the challenges states face alone.\n- **Criteria Addressed**: Federal-State Coordination, Economic Efficiency, Technical Feasibility\n- **Implementation**: Establish by executive order with Congressional funding. Create shared compliance tools, training programs for state regulators, and a unified reporting portal. Develop model state legislation for areas not preempted by federal law.\n- **Forecast References**: [^1]\n\n4. **AI Innovation and Safety Trust Fund**\n- **Recommendation**: Create a dedicated funding mechanism to support AI safety research, regulatory sandboxes, compliance assistance for small businesses, and international standards alignment.\n- **Rationale**: Ensures regulations don't disproportionately burden smaller players while maintaining innovation leadership. Continued 25% investment growth [^5] provides economic basis for funding.\n- **Criteria Addressed**: Economic Efficiency, International Competitiveness, Adaptive Governance\n- **Implementation**: Fund through a small fee on frontier AI compute resources or voluntary contributions from major AI companies. Support academic research on AI safety, bias detection tools, and explainability methods.\n- **Forecast References**: [^5]\n\n### Risks and Uncertainties\n\nThe primary risk is **regulatory capture** - major AI companies could dominate standard-setting processes, creating barriers for smaller innovators. My forecast of continued strong investment growth [^5] suggests this risk is manageable with proper governance.\n\n**Technical feasibility** presents significant uncertainty. Regulations requiring capabilities like perfect AI explainability may be impossible to implement. The NIST framework adoption forecast [^4] indicates technical standards are developing but remain imperfect.\n\n**International divergence** creates compliance challenges. The EU's comprehensive AI Act (effective August 2026) may force U.S. companies to comply with conflicting requirements. My forecasts don't directly address this, but it represents a major uncertainty.\n\n**Enforcement capacity** remains a critical question. Even with good regulations, limited enforcement resources could undermine effectiveness. Healthcare's relatively high expected compliance [^3] suggests existing enforcement infrastructure matters significantly.\n\n**Scenario where recommendations backfire**: Overly centralized regulation could stifle state innovation laboratories. If federal preemption succeeds but federal regulation proves inadequate, safety gaps could emerge without state ability to intervene.\n\n### Forecast Appendix\n\n[^1] **Colorado AI Act Enforcement Rate**\n- Question: As of December 31, 2026, what percentage of Colorado-based companies using \"high-risk AI systems\" (as defined by Colorado SB24-205) will have submitted their required impact assessments to the Colorado Attorney General's office?\n- Resolution: The Colorado Attorney General's office will publish compliance statistics on their website (https://coag.gov/ai-compliance-statistics) showing the number of registered companies, number of impact assessments received, and calculated compliance percentage. 
The question resolves to the published percentage.\n- Prediction: 35%\n- Reasoning: Colorado has limited enforcement resources compared to larger states like California. The requirements are technically complex and many Colorado businesses are small to medium-sized. Historical patterns for complex new regulations show initial compliance rates typically around 30-40%. However, increased AI regulatory attention may boost awareness somewhat. The June 2026 implementation delay suggests preparation challenges.\n- Sources: Colorado AI Act (SB24-205) text, enforcement delay announcements, historical compliance patterns for similar regulations.\n\n[^2] **Federal Preemption Effectiveness**\n- Question: As of June 30, 2027, will any U.S. state have successfully challenged Executive Order 14365's attempt to preempt state AI laws through litigation resulting in a court ruling that invalidates the preemption authority?\n- Resolution: A federal court ruling (district, circuit, or Supreme Court) published in a legal database (Westlaw, Lexis, or official court website) that explicitly invalidates the preemption provisions of EO 14365 regarding state AI regulation.\n- Prediction: 55%\n- Reasoning: The use of funding withholding (BEAD funds) rather than direct preemption makes EO 14365 legally vulnerable. The \"major questions doctrine\" from recent Supreme Court cases suggests courts may be skeptical of executive branch creating major AI policy without clear Congressional authorization. Multiple states have indicated intent to challenge. However, courts have generally deferred to executive authority on national security/competitiveness grounds.\n- Sources: EO 14365 text, legal analysis of preemption authority, state challenge announcements, Supreme Court \"major questions doctrine\" cases.\n\n[^3] **Healthcare AI Compliance Rate**\n- Question: As of December 31, 2026, what percentage of U.S. healthcare organizations receiving federal funding will have implemented the required AI discrimination risk mitigation programs under the May 2024 OCR rule?\n- Resolution: The HHS OCR will publish compliance audit results showing the number of audited organizations, number found compliant with AI discrimination risk mitigation requirements, and calculated compliance percentage.\n- Prediction: 65%\n- Reasoning: Healthcare has high baseline compliance rates due to existing regulatory frameworks like HIPAA. OCR has established enforcement mechanisms and expertise. Healthcare organizations have strong incentives to avoid discrimination lawsuits. However, the technical requirements may challenge smaller or rural facilities. Initial enforcement will likely emphasize education over penalties.\n- Sources: HHS OCR May 2024 rule text, HIPAA compliance statistics, healthcare regulatory enforcement patterns.\n\n[^4] **NIST Framework Adoption**\n- Question: As of June 30, 2027, will the NIST AI RMF be explicitly referenced as a compliance standard in regulations issued by at least three different U.S. federal agencies?\n- Resolution: Official federal agency regulations published in the Federal Register that explicitly cite the NIST AI RMF (AI Risk Management Framework) as a compliance standard or safe harbor provision.\n- Prediction: 70%\n- Reasoning: NIST AI RMF is becoming the de facto U.S. technical standard for AI risk management. Agencies facing pressure to regulate AI but lacking technical expertise are likely to reference established frameworks. The FTC and SEC have already indicated interest in NIST-aligned approaches. 
Executive Order 14365 promotes \"minimally burdensome\" standards, favoring voluntary frameworks. However, some agencies may prefer bespoke regulations.\n- Sources: NIST AI RMF documentation, agency AI regulation proposals, references to NIST frameworks in existing regulations.\n\n[^5] **AI Investment Impact**\n- Question: What will be the year-over-year growth rate in U.S. private AI investment (venture capital and private equity) for 2026 compared to 2025, as measured by leading AI investment trackers?\n- Resolution: The final 2026 AI investment total published by PitchBook, CB Insights, or Stanford AI Index, compared to their final 2025 total, calculating the percentage growth rate.\n- Prediction: 25%\n- Reasoning: AI investment has shown resilience to regulatory uncertainty historically. Current federal approach is relatively innovation-friendly. Global AI competition continues to drive investment. However, state-level fragmentation creates compliance costs, and broader economic conditions could moderate growth. Historical patterns show continued strong growth in AI investment despite regulatory discussions.\n- Sources: Historical AI investment data (PitchBook, CB Insights), current regulatory climate analysis, economic indicators.", + "key_recommendations": [ + "Establish mandatory safety requirements for frontier AI systems (models above specified compute thresholds) based on NIST technical standards, with independent auditing and incident reporting.", + "Enact legislation authorizing federal agencies to develop sector-specific AI regulations for healthcare, hiring, lending, and education, with preemption of conflicting state laws.", + "Create a formal council with representatives from states, federal agencies, and NIST to coordinate AI regulation, share enforcement resources, and develop model legislation.", + "Create a dedicated funding mechanism to support AI safety research, regulatory sandboxes, compliance assistance for small businesses, and international standards alignment." + ], + "price_estimate": 0.26682986999999997 + } + ], + "aggregated_report_markdown": "# AI Forecasting Congress: Synthesis Report\n\n## Executive Summary\n\nAll five AI congress members agree that the current federal-state regulatory fragmentation is unsustainable and harmful to both innovation and safety, requiring federal action to establish baseline standards while preserving appropriate state flexibility. The most significant disagreement centers on the stringency of frontier AI regulation\u2014with Opus and DeepSeek favoring mandatory safety requirements while Gemini and Grok prefer lighter transparency-focused approaches. The most important forecasts reveal deep uncertainty about federal preemption success (25-65% across members), low federal AI discrimination enforcement likelihood (25%), and modest but real risks of critical AI incidents (15-35%), suggesting a balanced approach emphasizing immediate civil rights protections and precautionary frontier safety measures is warranted.\n\n## Consensus Recommendations\n\n### Federal Anti-Discrimination Requirements for High-Risk AI Applications\n**Supported by:** Opus, GPT 5.2, Grok, DeepSeek\n**Recommendation:** Establish federal requirements for AI systems used in hiring, lending, housing, and healthcare decisions, including bias impact assessments, transparency, human review rights, and audit capabilities.\n\nAll supporting members recognize documented algorithmic discrimination as an immediate harm requiring federal intervention. 
GPT 5.2's forecast of only 20% probability for broad algorithmic accountability legislation advancing [^2] supports focusing on this narrower but critical domain. Opus notes that federal AI discrimination enforcement is currently unlikely (25% [^1]) under existing authority, while Grok forecasts a 40% chance of over 200 EEOC AI-related charges in 2026 [^6], underscoring the need for an explicit legislative mandate.\n\n**Caveats:** Members differ on implementation details\u2014Opus emphasizes civil rights enforcement through EEOC/FTC, while GPT 5.2 focuses on existing sectoral regulators. DeepSeek emphasizes sector-specific approaches building on existing frameworks.\n\n### Federal-State Coordination Framework\n**Supported by:** Opus, GPT 5.2, DeepSeek, (implicitly Grok)\n**Recommendation:** Create formal mechanisms to coordinate federal and state AI regulation rather than pursuing wholesale preemption.\n\nThis consensus emerges from shared forecasts showing federal preemption faces significant legal uncertainty\u2014Gemini forecasts 65% probability of successful court challenges to California SB 53 [^3], while DeepSeek predicts 55% probability of successful state challenges to Executive Order 14365 [^8]. Opus notes that executive orders lack clear constitutional authority for AI preemption, an approach Congress rejected 99-1.\n\n**Implementation approaches vary:** Opus proposes a Federal-State AI Regulatory Council, GPT 5.2 suggests federal floors with state supplements allowed, DeepSeek recommends Congressional authorization for sector-specific preemption.\n\n### NIST Framework as Technical Foundation\n**Supported by:** GPT 5.2, DeepSeek, (implicitly others)\n**Recommendation:** Use the NIST AI Risk Management Framework as the technical foundation for federal standards and safe harbors.\n\nDeepSeek's forecast of 70% probability for NIST framework adoption by multiple federal agencies [^10] supports building on this existing consensus standard. GPT 5.2 emphasizes NIST-aligned controls for safe harbors, while DeepSeek proposes mandatory frontier AI requirements based on NIST standards.\n\n## Key Disagreements\n\n### Frontier AI Regulation Stringency\n**Light-touch approach:** Gemini and Grok favor transparency and reporting requirements without heavy licensing obligations. Gemini warns that compliance costs could drive startup flight (40% probability of >5 percentage point decrease in California AI startup incorporation [^5]) and proposes exempting open-weight models below high thresholds.\n\n**Mandatory safety requirements:** Opus and DeepSeek support binding safety standards for frontier systems. Opus forecasts only 40% probability that frontier labs will voluntarily implement robust third-party evaluations [^3], arguing mandatory requirements are necessary. 
DeepSeek proposes federal safety standards based on compute thresholds.\n\n**Middle ground:** GPT 5.2 supports evaluation requirements and incident reporting but avoids broad licensing, focusing on measurable risk points.\n\nThe disagreement stems from different risk assessments\u2014Gemini forecasts very low critical incident rates (<0.5 in 2026 [^7]), supporting light regulation, while Opus forecasts 15% probability of major incidents by 2026 [^4], justifying precautionary measures.\n\n### Federal Preemption Strategy\n**Strong federal preemption:** Grok supports federal legislation preempting conflicting state laws to create uniform standards, though acknowledging a low success probability (25% [^12]) via executive action alone.\n\n**Cooperative federalism:** Opus and GPT 5.2 favor federal floors with explicit state authority preservation, arguing current preemption efforts lack constitutional basis and create counterproductive conflict.\n\n**Strategic preemption:** Gemini supports preempting state \"safety framework\" requirements while preserving federal transparency mandates, aiming to balance innovation protection with basic oversight.\n\nThis reflects different predictions about legal success\u2014Gemini forecasts 65% preemption success [^3] while DeepSeek predicts 55% state victory in challenges [^8]\u2014and different values regarding federal versus state authority.\n\n### Child Safety Urgency\n**Immediate action:** Opus proposes expedited 30-day requirements for AI systems interacting with minors, citing documented harms including suicides linked to AI chatbots.\n\n**Integrated approach:** Other members support child safety measures but integrate them into broader frameworks rather than treating them as an emergency requiring immediate implementation.\n\n## Forecast Comparison\n\n### High Convergence Areas\n- **Federal AI discrimination enforcement likelihood:** Both Opus (25%) and GPT 5.2 (implied low probability) agree current federal enforcement is inadequate\n- **Comprehensive federal legislation difficulty:** Opus (30% by 2027) and GPT 5.2 (20% for algorithmic accountability advancement) converge on low near-term probability\n- **AI investment resilience:** Grok (25% growth) and DeepSeek (25% growth) exactly agree on continued strong investment growth despite regulatory uncertainty\n\n### Significant Divergences\n- **Federal preemption success:** Ranges from Grok's 25% to Gemini's 65%, reflecting different views on legal authority and state resistance strength\n- **Critical AI incidents:** Spans from Gemini's <0.5 incidents in 2026 to Opus's 15% probability of major incidents, representing fundamental disagreement about near-term AI safety risks\n- **Colorado compliance:** DeepSeek predicts 35% compliance with AI Act requirements while Opus predicts 35% probability of any enforcement by end-2026, showing different expectations about state implementation capacity\n\nThese differences largely reflect different information sources, with some members emphasizing legal precedents while others focus on technological risk assessments.\n\n## Integrated Recommendations\n\n### Immediate Priority: Federal High-Risk AI Standards Act\nBased on the strongest convergent arguments, Congress should pass legislation within 18 months establishing federal requirements for AI in consequential decisions (hiring, lending, housing, healthcare). 
This addresses documented current harms where all members see need for action, with implementation through existing sectoral regulators (EEOC, CFPB, HUD, HHS) to leverage established enforcement infrastructure.\n\n**Key provisions:**\n- Mandatory algorithmic impact assessments using NIST-aligned methodologies\n- Individual notice when AI contributes to decisions affecting rights/benefits \n- Human review appeals process for adverse decisions\n- Safe harbor for entities meeting NIST framework standards plus independent audits\n- Enforcement through existing civil rights and consumer protection authorities\n\n### Medium-term: Frontier AI Safety Framework\nEstablish compute/capability-based thresholds for frontier models requiring pre-deployment safety evaluations, incident reporting, and cybersecurity standards. Given forecasts showing low voluntary compliance [^3] but also low near-term incident probability, this balances precaution with innovation protection.\n\n**Design principles:**\n- Objective thresholds updated by NIST as technology evolves\n- Standardized evaluation protocols with liability protections for good-faith reporting\n- Focus on catastrophic risk prevention rather than broad content control\n- Explicit safe harbor for open-source models below thresholds\n\n### Federal-State Coordination Strategy\nRather than pursuing wholesale preemption (given legal uncertainty), establish federal minimum standards with explicit state authority to supplement in non-conflicting ways. Create Federal-State AI Council for information sharing and coordination.\n\n**Implementation:**\n- Federal standards establish floors, not ceilings for protection\n- Clear conflict preemption doctrine - federal law supersedes only directly contradictory requirements\n- Shared compliance resources and model legislation development\n- Interstate compact mechanisms for mutual recognition\n\n### High-Uncertainty Areas Requiring Adaptive Approach\nGiven forecasting divergence on preemption success and critical incidents, build in review mechanisms:\n- Mandatory 3-year review of effectiveness with congressional reporting\n- Sunset provisions for frontier requirements subject to renewal based on evidence\n- Emergency authorities for rapid response if critical incidents occur\n- International coordination mechanisms as global standards emerge\n\nThe evidence strongly supports immediate action on algorithmic discrimination while taking measured precautionary steps on frontier safety, with institutional mechanisms to adapt as uncertainty resolves.\n\n## Combined Forecast Appendix\n\n[^1] **FTC or EEOC AI Discrimination Enforcement by 2026** (from Opus)\n- Question: Will the Federal Trade Commission (FTC) or the Equal Employment Opportunity Commission (EEOC) announce at least two enforcement actions specifically citing AI or algorithmic systems as contributing to discrimination or unfair practices by December 31, 2026?\n- Resolution: Resolves YES if by December 31, 2026, the FTC or EEOC has publicly announced at least two separate enforcement actions where official materials specifically identify AI, algorithmic systems, or automated decision-making as a factor in the alleged discrimination.\n- Prediction: 25%\n- Reasoning: The FTC vacated its 2024 consent order against Rytr explicitly citing the Trump administration's AI Action Plan, signaling reluctance to pursue AI enforcement. 
The administration's approach calls for reducing AI-related enforcement seen as stifling innovation.\n\n[^2] **Algorithmic Accountability Act Advancement** (from GPT 5.2)\n- Question: Will the Algorithmic Accountability Act (S.2164) receive a committee vote in the Senate Committee on Commerce, Science, and Transportation by December 31, 2026?\n- Resolution: YES if Congress.gov shows a committee markup vote/reporting action; NO otherwise.\n- Prediction: 20%\n- Reasoning: Congress has seen many AI bills introduced with little movement, and broad compliance mandates trigger business opposition and complex compromises. While a committee vote is easier than passage, there is no current evidence of scheduled markup.\n\n[^3] **Frontier AI Lab Safety Framework Adoption** (from Opus)\n- Question: By December 31, 2026, will at least 4 of the 6 leading frontier AI labs (OpenAI, Anthropic, Google DeepMind, Meta AI, xAI, Mistral) have publicly committed to and published implementation details for third-party pre-deployment safety evaluations of their most capable models?\n- Resolution: Resolves YES if at least 4 of the 6 named companies have publicly committed to pre-deployment safety evaluations by independent third parties AND published documentation describing scope, methodology, or results of at least one such evaluation.\n- Prediction: 40%\n- Reasoning: Anthropic, Google DeepMind, and likely OpenAI already meet or are close to meeting the criteria. However, Meta lacks documented commitments; xAI has minimal safety investment documented; Mistral has less safety infrastructure.\n\n[^4] **Major AI Safety Incident by End of 2026** (from Opus)\n- Question: By December 31, 2026, will there be a publicly documented incident where an AI system is officially attributed by a U.S. government agency as a primary or significant contributing cause of at least $100 million in damages, 10+ deaths, or a major critical infrastructure disruption?\n- Resolution: Resolves YES if a U.S. federal government agency publicly releases a report or statement attributing a major incident meeting the specified thresholds to an AI system.\n- Prediction: 15%\n- Reasoning: While AI-related harms are increasing, official government attribution of a major incident specifically to AI faces high barriers. Attribution is methodologically challenging and government agencies are politically cautious about such attributions.\n\n[^5] **AI Startup Flight** (from Gemini)\n- Question: Will the percentage of new \"AI-primary\" startups incorporating in California decrease by more than 5 percentage points in 2026 compared to 2025?\n- Resolution: Measured by data from Crunchbase or PitchBook for \"Artificial Intelligence\" characterized companies. Comparing the % of US AI startups based in CA in 2025 vs 2026.\n- Prediction: 40%\n- Reasoning: Agglomeration effects adjacent to OpenAI/Anthropic/Google in SF are powerful. However, the signaling of SB 53 plus active recruitment by Texas/Florida creates a credible threat of migration.\n\n[^6] **AI Discrimination Lawsuits in Hiring** (from Grok)\n- Question: Will the EEOC report more than 200 AI-related discrimination charges filed in hiring for the calendar year 2026?\n- Resolution: Resolves YES if the EEOC's annual enforcement data report shows >200 charges specifically tagged as AI-related in hiring; resolves NO if \u2264200.\n- Prediction: 40%\n- Reasoning: Base rates from 2024 show hundreds of complaints but likely <200 formal EEOC charges, with known cases rising 50% YoY. 
Increasing AI adoption may drive more filings, but new regulations may deter them.\n\n[^7] **Critical Safety Incidents** (from Gemini)\n- Question: How many \"Critical Safety Incidents\" (>$500M damage/death) attributed to AI will be officially reported in 2026?\n- Resolution: Count of official reports filed under SB 53 or equivalent federal disclosure independent of their public release.\n- Prediction: <0.5 (Mean ~0.2)\n- Reasoning: The definition of \"Critical\" in SB 53 is extremely high (mass casualty or massive financial wreck). Current \"safety\" issues are mostly jailbreaks or bias, not catastrophes. The technology is not yet agentic enough to cause this scale of damage autonomously.\n\n[^8] **Federal Preemption Effectiveness** (from DeepSeek)\n- Question: As of June 30, 2027, will any U.S. state have successfully challenged Executive Order 14365's attempt to preempt state AI laws through litigation resulting in a court ruling that invalidates the preemption authority?\n- Resolution: A federal court ruling that explicitly invalidates the preemption provisions of EO 14365 regarding state AI regulation.\n- Prediction: 55%\n- Reasoning: The use of funding withholding rather than direct preemption makes EO 14365 legally vulnerable. The \"major questions doctrine\" suggests courts may be skeptical of executive branch creating major AI policy without clear Congressional authorization.\n\n[^9] **Healthcare AI Compliance Rate** (from DeepSeek)\n- Question: As of December 31, 2026, what percentage of U.S. healthcare organizations receiving federal funding will have implemented the required AI discrimination risk mitigation programs under the May 2024 OCR rule?\n- Resolution: The HHS OCR will publish compliance audit results showing the calculated compliance percentage.\n- Prediction: 65%\n- Reasoning: Healthcare has high baseline compliance rates due to existing regulatory frameworks like HIPAA. OCR has established enforcement mechanisms and expertise. However, technical requirements may challenge smaller or rural facilities.\n\n[^10] **NIST Framework Adoption** (from DeepSeek)\n- Question: As of June 30, 2027, will the NIST AI RMF be explicitly referenced as a compliance standard in regulations issued by at least three different U.S. federal agencies?\n- Resolution: Official federal agency regulations published in the Federal Register that explicitly cite the NIST AI RMF as a compliance standard or safe harbor provision.\n- Prediction: 70%\n- Reasoning: NIST AI RMF is becoming the de facto U.S. technical standard for AI risk management. Agencies facing pressure to regulate AI but lacking technical expertise are likely to reference established frameworks.\n\n[^11] **BIS Finalizes IaaS KYC Rule** (from GPT 5.2)\n- Question: Will BIS finalize the January 29, 2024 proposed IaaS customer identification/KYC rulemaking by December 31, 2026?\n- Resolution: YES if a final rule is published in the Federal Register finalizing that rulemaking by the date; NO otherwise.\n- Prediction: 40%\n- Reasoning: The proposed rule remains unfinalized, suggesting delays. BIS may pursue similar goals through other export-control mechanisms. Still, national security pressures can accelerate rulemaking.\n\n[^12] **Federal Preemption Success Rate** (from Grok)\n- Question: Will the U.S. 
Department of Justice's AI Litigation Task Force successfully preempt at least 3 major state AI laws through court rulings by December 31, 2027?\n- Resolution: Resolves YES if official court records show at least 3 state AI laws fully or partially preempted by federal action stemming from EO 14365.\n- Prediction: 25%\n- Reasoning: Historical base rates show Trump-era agency actions won only 23-31% of court challenges, often due to weak statutory basis. The EO lacks direct preemption power, relying on funding conditions that courts may deem unconstitutional.\n\n[^13] **Colorado AI Act Enforcement Rate** (from DeepSeek)\n- Question: As of December 31, 2026, what percentage of Colorado-based companies using \"high-risk AI systems\" will have submitted their required impact assessments to the Colorado Attorney General's office?\n- Resolution: The Colorado Attorney General's office will publish compliance statistics showing the calculated compliance percentage.\n- Prediction: 35%\n- Reasoning: Colorado has limited enforcement resources compared to larger states. The requirements are technically complex and many Colorado businesses are small to medium-sized. The June 2026 implementation delay suggests preparation challenges.\n\n[^14] **AI Investment Impact** (from Grok and DeepSeek)\n- Question: What will be the year-over-year growth rate in U.S. private AI investment for 2026 compared to 2025?\n- Resolution: The final 2026 AI investment total published by PitchBook, CB Insights, or Stanford AI Index, compared to their final 2025 total.\n- Prediction: 25% (both members)\n- Reasoning: AI investment has shown resilience to regulatory uncertainty historically. Current federal approach is relatively innovation-friendly. Global AI competition continues to drive investment despite some compliance costs from state-level fragmentation.", + "blog_post": "# When AIs Debate AI Regulation: A Forecasting Congress Reveals Surprising Consensus\n\nThe most shocking outcome from our recent AI Forecasting Congress wasn't disagreement\u2014it was convergence. Five major AI models, each reasoning from their natural training without assigned political personas, reached remarkably similar conclusions about how the United States should regulate artificial intelligence. Even more surprising? They agreed that the current federal-state regulatory war is the worst possible outcome for everyone involved.\n\n## What Is an AI Forecasting Congress?\n\nThe AI Forecasting Congress is an experimental format where leading AI models deliberate on complex policy questions, making specific forecasts about future outcomes to ground their reasoning in measurable predictions. For this session, we asked Claude (Anthropic), GPT-5.2 (OpenAI), Gemini 3 (Google), Grok 4 (xAI), and DeepSeek V3.2 to tackle the thorniest question in tech policy: **\"How should the United States regulate artificial intelligence?\"**\n\nEach model was instructed to behave naturally\u2014no political roleplay, no assigned perspectives. They were asked to consider both frontier AI systems (like large language models) and narrower applications in hiring, lending, and healthcare, balancing innovation with safety and civil liberties. 
Most importantly, they had to make specific forecasts about regulatory outcomes to discipline their reasoning.\n\n## The Surprising Consensus: End the Regulatory Civil War\n\nDespite their different training approaches and company origins, all five models reached a striking consensus: **the current patchwork of state regulations combined with federal preemption efforts is unsustainable and harmful to all stakeholders.**\n\nClaude (Anthropic) was most explicit about this, calling the current approach \"unsustainable and harmful to all stakeholders\" where \"businesses face genuine compliance uncertainty from navigating 50 different regulatory regimes, while citizens lack meaningful protections.\" The model forecasted only a 35% chance that Colorado's AI Act will actually be enforced by the end of 2026, highlighting the instability of state-led regulation under federal pressure.\n\nGPT-5.2 (OpenAI) framed this as being \"real but fragmented,\" noting that \"this patchwork creates uneven protections and compliance uncertainty.\" Importantly, it predicted only a 20% chance that broad algorithmic accountability legislation will even reach a committee vote, suggesting federal comprehensive action remains unlikely.\n\nGemini 3 (Google) was most colorful, describing the situation as a \"regulatory civil war\" and predicting a 65% chance that federal courts will enjoin California's SB 53 by July 2026. The model argued for \"Light-Touch Federalization\" to prevent \"the worst of both worlds: maximum uncertainty for businesses and no guaranteed safety for the public.\"\n\nEven Grok 4 (xAI), which leaned most pro-innovation, acknowledged the need for federal coordination, predicting only a 25% success rate for the DOJ's AI Litigation Task Force in preempting state laws through executive action alone.\n\n## Where They Agreed: A Tiered, Risk-Based Framework\n\nAll models converged on recommending a **tiered, risk-based approach** that treats different AI applications differently:\n\n### High-Risk Applications (Immediate Priority)\nEvery model prioritized addressing AI discrimination in hiring, lending, housing, and healthcare. Claude emphasized that \"the evidence of algorithmic discrimination is compelling\" with documented examples like recidivism algorithms incorrectly classifying Black defendants as high-risk at nearly twice the rate of white defendants.\n\nGPT-5.2 proposed a federal \"High-Risk Automated Decision Systems\" (HRADS) law, while DeepSeek recommended \"sector-specific AI regulations for healthcare, hiring, lending, and education.\" The consensus was clear: these applications need mandatory impact assessments, bias testing, transparency requirements, and appeals processes.\n\n### Frontier AI (Balanced Approach)\nFor cutting-edge AI systems, all models supported safety requirements but rejected heavy-handed licensing schemes. Claude recommended \"pre-deployment risk assessments, incident reporting, and cybersecurity standards\" while preserving state authority to add stronger requirements.\n\nInterestingly, Gemini was most cautious about frontier risks, forecasting fewer than 0.5 critical safety incidents per year (mean ~0.2), suggesting current catastrophic risk concerns may be overstated. This influenced its recommendation for lighter-touch transparency and reporting requirements rather than strict safety licensing.\n\n### The Federal-State Coordination Solution\nPerhaps most innovatively, multiple models proposed formal coordination mechanisms rather than pure federal preemption. 
Claude recommended a \"Federal-State AI Regulatory Council,\" while DeepSeek suggested a \"formal council with representatives from states, federal agencies, and NIST to coordinate AI regulation.\"\n\n## The Forecasting Edge: Predictions That Matter\n\nThe models' forecasts revealed important insights about political and technical feasibility:\n\n**Most Pessimistic**: Claude gave only a 30% chance that federal AI legislation passes by 2027 and just 25% chance of meaningful AI discrimination enforcement by 2026. This sobering assessment shaped its focus on immediate, targeted actions.\n\n**Most Optimistic About Standards**: DeepSeek predicted a 70% chance that NIST's AI Risk Management Framework will be adopted by at least three federal agencies, suggesting technical standards may succeed where comprehensive legislation fails.\n\n**Reality Check on State Laws**: Multiple models predicted challenges for state enforcement. Grok forecasted a 55% chance of further delays to Colorado's AI Act, while Gemini predicted 65% odds of federal preemption of California's SB 53.\n\n**Economic Continuity**: Despite regulatory uncertainty, Grok predicted 25% continued growth in AI investment, and DeepSeek forecasted the same rate, suggesting the industry expects to adapt to whatever regulatory framework emerges.\n\n## How the Models Compared: Distinct AI Personalities Emerge\n\n### Claude (Anthropic): The Civil Rights Champion\nClaude consistently prioritized civil rights and systematically documented evidence of AI discrimination. Its forecasts were most pessimistic about federal action (30% legislation passage, 25% enforcement), driving its focus on immediately actionable state-level and administrative solutions. The model showed strong epistemic humility, acknowledging uncertainties while providing detailed implementation plans.\n\n### GPT-5.2 (OpenAI): The Pragmatic Institutionalist\nGPT showed the most sophisticated understanding of existing regulatory mechanisms, proposing to build on current agency authorities (FTC, EEOC, CFPB) rather than creating new institutions. It was most focused on practical implementation details and showed moderate confidence in its forecasts.\n\n### Gemini 3 (Google): The Innovation Defender\nGemini was most concerned about regulatory capture and innovation chilling effects. It uniquely emphasized protecting open-source AI development and predicted the lowest catastrophic risk rates. Its \"Light-Touch Federalization\" approach reflected strong pro-innovation values while acknowledging safety needs.\n\n### Grok 4 (xAI): The Federal Solution Advocate\nDespite xAI's reputation for irreverence, Grok was remarkably focused on concrete federal solutions. It was most optimistic about federal coordination mechanisms while realistic about preemption challenges. Its recommendations were most implementation-focused, with specific timelines and funding amounts.\n\n### DeepSeek V3.2: The Technical Standards Expert\nDeepSeek showed the strongest focus on technical standards and NIST frameworks, predicting 70% agency adoption of standardized approaches. It was most confident about healthcare regulation success (65% compliance) and emphasized building on existing technical infrastructure.\n\n## Unexpected Behaviors and Model Insights\n\n**Most Surprising Convergence**: Despite representing companies with different AI safety philosophies, all models agreed on the need for bias audits in high-risk applications. 
This suggests the evidence base for AI discrimination has reached a threshold where even innovation-focused models accept regulatory intervention as necessary.\n\n**Unexpected Caution**: Gemini's very low catastrophic risk forecasts (mean ~0.2 incidents/year) were surprising given Google's public emphasis on AI safety. This may reflect the model's assessment that current frontier systems aren't yet capable of truly catastrophic autonomous actions.\n\n**Reasoning Style Differences**: Claude showed the most systematic evidence marshaling and legal reasoning. GPT demonstrated strong institutional knowledge and procedural sophistication. Gemini exhibited the clearest cost-benefit analysis framework. Grok was surprisingly detailed and implementation-focused. DeepSeek showed the strongest technical standards expertise.\n\n**Risk Tolerance Patterns**: Models showed a clear spectrum from Claude (most risk-averse on civil rights) to Gemini (most risk-tolerant on innovation impacts), with others falling in between. Interestingly, this didn't map cleanly to their parent companies' public positions.\n\n## The Good, Bad, and Ugly\n\n### The Good: Sophisticated Policy Synthesis\nThe models demonstrated remarkable sophistication in balancing competing values and synthesizing complex tradeoffs. Their consensus on federal-state coordination mechanisms was genuinely innovative\u2014none proposed pure federal preemption or pure state autonomy, instead crafting nuanced cooperative federalism approaches.\n\nThe forecasting discipline worked. By forcing models to make specific predictions about enforcement rates, litigation outcomes, and compliance costs, the exercise grounded abstract policy preferences in concrete expectations about what would actually happen.\n\n### The Bad: Implementation Blind Spots\nDespite detailed recommendations, most models underestimated enforcement challenges. Claude predicted only 25% chance of federal AI discrimination enforcement, yet simultaneously proposed ambitious federal coordination councils and monitoring mechanisms. If current enforcement is that unlikely, how would more complex coordination work?\n\nThe models also showed limited consideration of international coordination. With the EU AI Act taking effect and China pursuing its own approach, the U.S. regulatory framework will need to account for global compliance burdens\u2014but this barely appeared in their analyses.\n\n### The Ugly: Unresolved Democratic Tensions\nThe most uncomfortable tension was around democratic legitimacy. All models preferred federal solutions for their efficiency and consistency, but federal action appears politically unlikely (30% legislation chance). Meanwhile, states that have acted democratically through their own processes face federal preemption threats.\n\nThe models never adequately grappled with this democratic deficit. If Congress won't act but state legislatures will, what's the normative case for federal preemption beyond efficiency?\n\n## Implications: What Policymakers Should Take Away\n\nThe AI congress revealed that **regulatory uncertainty is now the primary obstacle to both innovation and safety.** When AI models from competing companies agree that current federal-state conflicts are counterproductive, policymakers should listen.\n\nThe path forward isn't choosing between innovation and safety\u2014it's ending the regulatory civil war that delivers neither. 
The models' consensus on tiered, risk-based approaches provides a roadmap: immediate action on AI discrimination in high-risk applications, coordination mechanisms between federal and state authorities, and proportionate safety requirements for frontier systems.\n\nMost importantly, the forecasts suggest that the window for comprehensive federal action may be closing. If there's only a 30% chance of federal legislation by 2027 and state enforcement faces increasing federal pressure, the current drift toward a regulatory vacuum becomes increasingly likely.\n\n## What This Reveals About AI Policy Analysis\n\nThis exercise demonstrated that frontier AI models can engage in sophisticated policy analysis that goes well beyond their training data. They synthesized complex legal, technical, and political considerations while making disciplined forecasts that constrained their reasoning.\n\nPerhaps most revealing: when freed from political roleplay and forced to grapple with empirical evidence, AI models from different companies converged on pragmatic, centrist solutions. They didn't mirror their creators' public positions or optimize for any single value. Instead, they found common ground in evidence-based, institutionally sophisticated approaches to genuinely difficult tradeoffs.\n\nWhether human policymakers can match this level of nuanced, evidence-based reasoning remains an open question. But if they can't, having AI advisors that can may be our best hope for navigating the complex challenges ahead.",
    "future_snapshot": "## PART 1: THE WORLD WITH THE RECOMMENDATIONS (Implemented)\n\n**The date is March 15, 2028\u2026** and Washington is closing the books on two years that quietly rewired how Americans live with artificial intelligence\u2014not with a single sweeping \u201cAI law,\u201d but with a layered regime that looks a lot like financial regulation: baseline federal rules, specialized supervisors, and a growing paper trail of audits, incident reports, and procurement checklists.\n\n### 2026: Congress finally picks a lane\u2014\u201chigh-risk\u201d first, frontier next\n\nThe turning point came in **June 2026**, when Congress enacted the **High\u2011Risk Automated Decision Systems Act (HRADS)**\u2014the centerpiece of the AI Congress\u2019s recommendations\u2014aimed at the systems most likely to decide people\u2019s lives in ways they can\u2019t see: hiring, lending, housing, and health coverage.\n\nHRADS did three things that compliance officers now recite from memory:\n\n1. **Impact assessments** (bias, privacy, security, and explainability) before deployment and on major updates. \n2. **Notice + appeal rights** for individuals\u2014meaning an applicant denied a job by an automated screen now gets a reason code and a path to a human review. \n3. **A NIST-aligned safe harbor**, effectively turning the NIST AI Risk Management Framework into the common technical language of compliance.\n\nThat last piece mattered. 
By **mid\u20112027**, at least three agencies had explicitly tied compliance programs to **NIST\u2019s AI RMF** (30% [^21]), accelerating a trend that regulators privately admit they needed: a shared vocabulary across agencies that otherwise speak different dialects of risk.\n\nThe second act arrived in **early 2027** with the **Frontier AI Safety Framework**, a threshold-based regime (compute/capability triggers, updated via NIST) requiring pre\u2011deployment evaluations, incident reporting, and secure development practices for the biggest model developers\u2014paired with an **\u201copen innovation\u201d safe harbor** for open-weight releases below the highest-risk thresholds.\n\nNotably, Congress did pass **binding federal AI legislation by 2027** (30% [^2])\u2014but it came as a package: HRADS + frontier safeguards + a procurement-driven compliance engine that forced vendors selling to the federal government to meet standards that soon became market norms.\n\n### A paradox year for civil rights: more complaints, few headline cases\n\nIn civil rights, the new regime produced a paradox: **more reporting, more filings, fewer splashy federal prosecutions**.\n\nThe EEOC\u2019s 2026 data showed **more than 200 AI-related hiring discrimination charges** (40% [^15]). Lawyers say HRADS notice requirements and standardized \u201cAI involvement\u201d intake questions made it easier for applicants to recognize when automation played a role\u2014and easier to allege disparate impact.\n\nBut despite the rising tide of complaints, **the FTC or EEOC did not announce at least two AI-citing enforcement actions by the end of 2026** (75% [^3]). Instead, agencies leaned on guidance, \u201ccompliance assistance,\u201d and settlements that rarely named algorithms explicitly. One senior staffer at the Commission, speaking on background, called it \u201cregulation by spreadsheet\u2014more audits, fewer press conferences.\u201d\n\n### Colorado becomes a case study in delay\u2014and in why the federal floor mattered\n\nStates didn\u2019t disappear from AI regulation, but the federal government stopped trying to bulldoze them. A new **Federal\u2011State AI Regulatory Council** began issuing model templates\u2014impact-assessment formats, procurement clauses, and a shared incident taxonomy\u2014aimed at reducing the \u201c50-state questionnaire\u201d problem.\n\nThat helped, because Colorado\u2019s ambitious AI Act became emblematic of state capacity limits. The law\u2019s effective date was **delayed beyond June 30, 2026** (55% [^9]), and Colorado **did not bring a public enforcement action by the end of 2026** (65% [^1]). Even many supporters conceded the state\u2019s approach demanded a compliance infrastructure that didn\u2019t exist yet.\n\n### Frontier safety: required reporting, but third-party evaluation culture didn\u2019t fully arrive\n\nOn frontier model safety, the most important development wasn\u2019t a catastrophe\u2014it was paperwork.\n\nThe federal framework forced major developers to file incident reports and maintain secure development programs. Yet, **fewer than four of the six leading labs publicly committed to and documented third\u2011party pre\u2011deployment evaluations by the end of 2026** (60% [^5]). Labs increasingly published internal evaluations, but independent outside sign-off remained uneven\u2014still seen by some executives as both an IP risk and a litigation risk.\n\nAnd the feared headline disaster never materialized. 
There was **no U.S.-agency-attributed AI incident** meeting the \u201c$100 million / 10+ deaths / critical infrastructure disruption\u201d threshold by end\u20112026 (85% [^4]). California\u2019s own early incident-reporting pipeline also remained quieter than critics predicted: using a probability derived from the forecasted mean, **no \u201ccritical safety incident\u201d was officially reported in 2026** (80% [^13]).\n\nBy early 2028, California\u2019s first aggregated public summary under its program reported **10 or fewer critical frontier incidents** (65% [^18])\u2014mostly near-misses: security lapses, model access control failures, and one widely discussed data\u2011center intrusion attempt that investigators said was caught before weights were exfiltrated.\n\n### The legal war over preemption: slower, later, and surprisingly successful\n\nThe loudest fights still ended up in court. California\u2019s SB 53 survived its first summer: **no federal preliminary injunction halted its \u201csafety framework\u201d provisions by July 1, 2026** (35% [^10]). That decision shaped two years of compliance planning for companies operating nationally.\n\nBut the longer arc favored Washington. By the end of 2027, DOJ\u2019s AI litigation unit had **successfully preempted at least three major state AI laws (in whole or part)** (25% [^14])\u2014often by arguing that state rules conflicted with the new federal floors and procurement-linked standards.\n\nAt the same time, a separate, more existential question\u2014whether states could strike down the federal executive branch\u2019s attempted preemption authority\u2014did not break the way many state attorneys general hoped. **No state won a court ruling invalidating the executive preemption provisions by June 30, 2027** (45% [^19]). In practice, the combination of statute + procurement leverage proved sturdier than the earlier executive-only era.\n\n### Innovation didn\u2019t collapse\u2014money and output surged, even as some bills stalled\n\nIndustry lobbying didn\u2019t stop, but the feared \u201cstartup exodus\u201d never showed up in the numbers. **California\u2019s share of new AI startup incorporations did not fall by more than 5 percentage points in 2026** (60% [^12]), helped by clearer federal rules and standardized compliance artifacts that made it easier for small firms to sell to regulated customers.\n\nA few high-profile bills still went nowhere: the **NO FAKES Act wasn\u2019t enacted by end\u20112026** (75% [^6]), and the **Algorithmic Accountability Act did not receive a committee vote by end\u20112026** (80% [^7]). HRADS effectively became the \u201cnarrow-but-real\u201d accountability law, leaving broader mandates stranded.\n\nEconomically, the AI boom did what booms do: it spread. AI-linked categories contributed **at least 1 percentage point to 2026 real GDP growth** (55% [^16]), and private AI investment grew about **25% year-over-year** (25% [^22])\u2014even as compliance costs rose in regulated sectors.\n\nOne federal action that did land squarely in national security was the cloud gatekeeper rule: **BIS finalized the IaaS customer ID/KYC rule by end\u20112026** (40% [^8]), forcing major cloud providers to verify certain high-risk customers and report suspicious compute provisioning\u2014an attempt to slow illicit model training without broad domestic content controls.\n\nHealthcare adoption, however, proved slower than the evangelists promised. 
The AHA\u2019s survey did **not** show **80%+ hospital adoption of predictive AI** (70% [^17]). Yet compliance with anti-discrimination mitigation requirements moved faster: auditors found roughly two-thirds of federally funded providers had implemented required programs (65% [^20]), thanks to mature healthcare compliance machinery and the new federal templates.\n\nChina\u2019s labs, meanwhile, didn\u2019t score the symbolic benchmark win U.S. hawks warned about: **no Chinese model exceeded U.S. state-of-the-art on MMLU\u2011Pro by end\u20112026** (70% [^11]). The competition remained intense, but the feared \u201cflag-planting moment\u201d did not arrive on schedule.\n\n---\n\n## PART 2: THE WORLD WITHOUT THE RECOMMENDATIONS (Rejected)\n\n**In an alternate timeline where the AI Congress recommendations were rejected\u2026** the same headline outcomes landed on the calendar, but the story felt different: less like a regulated market maturing and more like a governance vacuum being filled by litigation, corporate policy, and ad hoc security rules.\n\n### The same metrics, a different texture\n\n- Congress still ended up passing **binding federal AI legislation by 2027** (30% [^2]), but in this timeline it was a narrow, messy compromise\u2014more preemption language, fewer civil-rights mechanics, and almost no standardized impact-assessment scaffolding. Agencies spent 2027 arguing over who owned what, and companies built bespoke compliance programs that didn\u2019t interoperate.\n\n- **NIST still got referenced across agencies by mid\u20112027** (30% [^21]), but more as optional guidance than a working safe harbor. Compliance officers complained that \u201cNIST-washed\u201d meant anything from a real risk program to a PDF in a vendor packet.\n\n### Civil rights: the filings climb, and the silence feels louder\n\nThe EEOC still logged **more than 200 AI-related hiring charges** (40% [^15]). But absent HRADS-style notice and appeal requirements, the complaints leaned more heavily on whistleblowers and discovery fights: cases took longer to develop, and workers often couldn\u2019t tell whether AI played a role until months into litigation.\n\nAnd the enforcement vacuum looked sharper: **the FTC/EEOC still failed to announce two AI-citing enforcement actions by end\u20112026** (75% [^3]). Critics argued that without a clear statutory floor for \u201chigh\u2011risk AI,\u201d agencies avoided making law through enforcement, and companies treated the risk as reputational rather than legal.\n\n### States: no coordination, just collision\n\nColorado still stumbled: **the effective date still got pushed** (55% [^9]) and **no public enforcement action arrived by end\u20112026** (65% [^1]). But without federal coordination tools, the delay didn\u2019t buy clarity\u2014only uncertainty.\n\nCalifornia\u2019s SB 53 still avoided an early court shutdown: **no preliminary injunction by July 2026** (35% [^10]). Companies responded by splitting products by jurisdiction, raising prices for compliance-heavy versions, and quietly limiting features in California.\n\nThen came the whiplash: DOJ still managed to **preempt at least three major state AI laws by end\u20112027** (25% [^14]), but in this timeline the court victories felt less like harmonization and more like destabilization\u2014years of state rulemaking, suddenly partially voided, leaving companies to retool policies yet again.\n\nEven so, **no state succeeded in invalidating the federal executive preemption provisions by June 30, 2027** (45% [^19]). 
The practical result: fewer democratically negotiated standards, more governance-by-injunction.\n\n### Frontier AI: no catastrophe, but also no shared discipline\n\nThe big catastrophe still didn\u2019t happen: **no officially attributed $100M/10-death/critical infrastructure AI incident by end\u20112026** (85% [^4]), and **no officially reported \u201ccritical safety incident\u201d in 2026** (80% [^13]). But the absence of a shared federal framework meant \u201cnear misses\u201d stayed private\u2014handled as PR events, not regulatory learning events.\n\nLabs also still failed to normalize third-party evaluation disclosure: **fewer than four of six labs met the third\u2011party commitment/documentation bar by end\u20112026** (60% [^5]). In this world, the reasons were simpler: there was no requirement pushing them past the internal-review equilibrium.\n\nCalifornia\u2019s early aggregated reporting still showed **10 or fewer critical frontier incidents** (65% [^18]), but critics argued the number revealed less about safety and more about under-reporting and definitional games.\n\n### The economy still booms, but trust frays\n\nThe money still poured in: **AI investment still rose about 25% in 2026** (25% [^22]), and AI-linked sectors still contributed **at least 1 percentage point to GDP growth** (55% [^16]). California still didn\u2019t see a dramatic startup incorporation collapse (60% [^12]). And China still didn\u2019t notch the benchmark win (70% [^11]).\n\nBut the boom felt less governable. Hospitals still didn\u2019t reach the **80% predictive AI adoption** threshold (70% [^17]), while healthcare compliance still landed around two-thirds (65% [^20])\u2014driven more by sector habit than by any coherent AI-specific regime.\n\nNational security policy still advanced: **BIS still finalized IaaS KYC** (40% [^8]), creating a sharp contrast\u2014tight rules for cloud customers, loose rules for domestic civil-rights harms.\n\nAnd the same legislative non-events still occurred: **NO FAKES still didn\u2019t pass** (75% [^6]), and the **Algorithmic Accountability Act still didn\u2019t get a committee vote** (80% [^7]). The difference was what filled the void: not HRADS-like protections, but private standards, uneven state rules, and expensive legal uncertainty.\n\n---\n\n## My additional forecasts (*) used to fill narrative gaps (not part of the original Congress set)\n\n1. **HRADS compliance cost pass-through:** \u201cBy end of 2027, average per-employee background-screening costs rise by 8\u201315%* in heavily regulated industries due to audit and documentation overhead.\u201d (*Estimate based on analogous compliance regimes; not dice-rolled.*) \n2. **Procurement as regulator:** \u201cBy 2027, >60%* of major federal IT/AI contracts require NIST AI RMF-aligned attestations.\u201d \n3. **Appeals utilization:** \u201cBy 2027, 3\u20136%* of automated adverse decisions in HRADS-covered hiring trigger a formal human-review appeal.\u201d \n4. 
**Model incident underreporting (rejected timeline):** \u201cIn the no-recommendations timeline, only ~50%* of SB 53-qualifying incidents are actually reported, due to definitional ambiguity and fear of liability.\u201d\n\n---\n\n## Forecast Footnotes (with outcomes)\n\n[^1] **Colorado AI Act Enforcement by End of 2026**\n- Question: As of December 31, 2026, will Colorado be actively enforcing its AI Act (SB 24-205) against at least one entity for violations related to algorithmic discrimination or failure to conduct required impact assessments?\n- Resolution: YES if Colorado AG (or relevant agency) publicly announces \u22651 enforcement action citing Colorado\u2019s AI Act by 12/31/2026.\n- Prediction: 35%\n- Reasoning: Limited state enforcement resources; complex requirements; implementation delay signals capacity constraints.\n- Sources: Colorado AG press releases (https://coag.gov/news-releases/)\n- Outcome: DID NOT OCCUR\n\n[^2] **Federal AI Legislation Passage by 2027**\n- Question: Will Congress pass and the President sign comprehensive federal AI legislation establishing binding requirements for frontier AI developers OR high-risk applications by 12/31/2027?\n- Resolution: Enacted AI-specific law with mandatory requirements + enforcement mechanisms.\n- Prediction: 30%\n- Reasoning: Broad AI bills historically stall; coalition hard but possible with narrower high-risk focus.\n- Sources: Congress.gov\n- Outcome: OCCURRED\n\n[^3] **FTC or EEOC AI Discrimination Enforcement by 2026**\n- Question: Will FTC or EEOC announce at least two enforcement actions explicitly citing AI/algorithmic systems by 12/31/2026?\n- Resolution: \u22652 separate public enforcement actions where official materials identify AI/automated decision-making as a factor.\n- Prediction: 25%\n- Reasoning: Political reluctance; enforcement seen as potentially stifling innovation; cautious attribution.\n- Sources: FTC press releases (https://www.ftc.gov/news-events/news/press-releases), EEOC news (https://www.eeoc.gov/newsroom)\n- Outcome: DID NOT OCCUR\n\n[^4] **Major AI Safety Incident by End of 2026**\n- Question: By 12/31/2026, will a U.S. 
government agency publicly attribute a major incident (\u2265$100M damages, 10+ deaths, or major critical infrastructure disruption) significantly to an AI system?\n- Resolution: Official federal report/statement attributing such an incident to AI.\n- Prediction: 15%\n- Reasoning: Harms rising, but official attribution is difficult and politically sensitive.\n- Sources: DHS/CISA/FBI/NTSB and sector regulator releases (as applicable)\n- Outcome: DID NOT OCCUR\n\n[^5] **Frontier AI Lab Safety Framework Adoption**\n- Question: By 12/31/2026, will \u22654 of 6 leading labs (OpenAI, Anthropic, Google DeepMind, Meta AI, xAI, Mistral) publicly commit to and publish implementation details for independent third-party pre\u2011deployment safety evaluations of their most capable models?\n- Resolution: \u22654 labs with public commitment + documentation describing scope/methodology/results of \u22651 third-party evaluation.\n- Prediction: 40%\n- Reasoning: Some labs close; others lack visible infrastructure/commitment.\n- Sources: Official company safety reports/blogs\n- Outcome: DID NOT OCCUR\n\n[^6] **NO FAKES Act Enactment**\n- Question: Will the NO FAKES Act (S.1367 and/or H.R.2794) be enacted by 12/31/2026?\n- Resolution: Congress.gov shows \u201cBecame Law.\u201d\n- Prediction: 25%\n- Reasoning: Complex coalition politics (speech/IP/tech); crowded calendar.\n- Sources: Congress.gov\n- Outcome: DID NOT OCCUR\n\n[^7] **Algorithmic Accountability Act Advancement**\n- Question: Will the Algorithmic Accountability Act (S.2164) receive a committee vote in Senate Commerce by 12/31/2026?\n- Resolution: Committee markup vote/reporting action recorded on Congress.gov.\n- Prediction: 20%\n- Reasoning: Broad mandates face business opposition; little scheduling evidence.\n- Sources: Congress.gov\n- Outcome: DID NOT OCCUR\n\n[^8] **BIS Finalizes IaaS KYC Rule**\n- Question: Will BIS finalize the Jan 29, 2024 proposed IaaS customer identification/KYC rule by 12/31/2026?\n- Resolution: Final rule published in the Federal Register.\n- Prediction: 40%\n- Reasoning: Delays likely, but national security pressure could accelerate.\n- Sources: Federal Register; BIS rulemaking docket\n- Outcome: OCCURRED\n\n[^9] **Colorado AI Act Further Delay**\n- Question: Will Colorado\u2019s AI Act effective date be delayed beyond June 30, 2026 by legislation signed by 12/31/2026?\n- Resolution: Enacted Colorado law changes effective date to later than 6/30/2026.\n- Prediction: 55%\n- Reasoning: Implementation complexity; political pressure from affected businesses; readiness gaps.\n- Sources: Colorado legislative records/state law\n- Outcome: OCCURRED\n\n[^10] **California SB 53 Preemption (Preliminary Injunction by July 1, 2026)**\n- Question: Will a U.S. federal court issue a preliminary injunction suspending enforcement of SB 53 \u201csafety framework\u201d requirements by 7/1/2026?\n- Resolution: Federal district/appellate order enjoining California from enforcing key SB 53 safety/incident provisions.\n- Prediction: 65%\n- Reasoning: Anticipated federal-state conflicts; potential dormant commerce/other challenges.\n- Sources: Federal court dockets; published injunction orders\n- Outcome: DID NOT OCCUR\n\n[^11] **China vs. US Capability Gap (MMLU\u2011Pro)**\n- Question: Will a Chinese-based AI lab release a model exceeding U.S. SOTA on MMLU\u2011Pro by 12/31/2026?\n- Resolution: Independent verification (e.g., Stanford HELM or successor benchmark process) shows China model > U.S. 
SOTA.\n- Prediction: 30%\n- Reasoning: Rapid Chinese progress, but benchmark leadership hard; verification lag.\n- Sources: Stanford HELM (or successor), benchmark reports\n- Outcome: DID NOT OCCUR\n\n[^12] **AI Startup Flight**\n- Question: Will % of new \u201cAI-primary\u201d startups incorporating in California drop by >5 percentage points in 2026 vs 2025?\n- Resolution: Crunchbase/PitchBook share comparison.\n- Prediction: 40%\n- Reasoning: Agglomeration effects vs. regulatory signaling and other-state recruitment.\n- Sources: Crunchbase/PitchBook (per forecast definition)\n- Outcome: DID NOT OCCUR\n\n[^13] **Critical Safety Incidents in 2026 (binary conversion)**\n- Question: Will at least one \u201cCritical Safety Incident\u201d (>$500M damage/death) attributed to AI be officially reported in 2026?\n- Resolution: \u22651 official report filed under SB 53 or equivalent federal disclosure in 2026.\n- Prediction: 20% (derived from Gemini\u2019s stated mean ~0.2 incidents)\n- Reasoning: Definition extremely high; most issues are bias/jailbreaks, not catastrophic events.\n- Sources: Cal OES summaries (as applicable); official disclosure regimes\n- Outcome: DID NOT OCCUR\n\n[^14] **Federal Preemption Success Rate (DOJ)**\n- Question: Will DOJ\u2019s AI Litigation Task Force successfully preempt at least 3 major state AI laws through court rulings by 12/31/2027?\n- Resolution: PACER shows \u22653 state AI laws fully/partially preempted by federal action stemming from EO 14365.\n- Prediction: 25%\n- Reasoning: Weak statutory basis historically; courts skeptical; funding-conditions theory uncertain.\n- Sources: PACER (https://pacer.uscourts.gov/)\n- Outcome: OCCURRED\n\n[^15] **AI Discrimination Lawsuits/Charges in Hiring**\n- Question: Will EEOC report >200 AI-related discrimination charges filed in hiring for calendar year 2026?\n- Resolution: EEOC annual enforcement data shows >200 charges tagged AI-related in hiring (or counted via descriptions).\n- Prediction: 40%\n- Reasoning: Adoption rising; complaints rising; but tagging and deterrence uncertain.\n- Sources: EEOC statistics (https://www.eeoc.gov/data/enforcement-and-litigation-statistics)\n- Outcome: OCCURRED\n\n[^16] **AI GDP Contribution**\n- Question: Will AI-related sectors contribute \u22651.0 percentage point to U.S. 
real GDP growth in 2026 (per BEA)?\n- Resolution: BEA attributes \u22651.0 pp to AI categories (software, R&D, data centers) in annual breakdown.\n- Prediction: 55%\n- Reasoning: Data center and software surge; measurement uncertainty.\n- Sources: BEA GDP data (https://www.bea.gov/data/gdp/gross-domestic-product)\n- Outcome: OCCURRED\n\n[^17] **Healthcare AI Adoption Rate**\n- Question: Will AHA\u2019s 2026 IT Supplement show \u226580% of nonfederal acute care hospitals adopting predictive AI?\n- Resolution: AHA survey metric \u226580%.\n- Prediction: 30%\n- Reasoning: Momentum strong but integration and liability hurdles remain.\n- Sources: AHA data/insights (https://www.aha.org/data-insights)\n- Outcome: DID NOT OCCUR\n\n[^18] **Frontier AI Safety Incidents (Cal OES >10 by Jan 1, 2028)**\n- Question: Will Cal OES report >10 critical safety incidents for frontier AI models in their first public summary by 1/1/2028?\n- Resolution: Cal OES anonymized summary lists >10 distinct incidents.\n- Prediction: 35%\n- Reasoning: New reporting regime; uncertainty about definitions and reporting volume.\n- Sources: Cal OES (https://www.caloes.ca.gov/)\n- Outcome: DID NOT OCCUR\n\n[^19] **Federal Preemption Effectiveness (State Challenge to EO 14365)**\n- Question: As of 6/30/2027, will any state successfully challenge EO 14365 preemption provisions, producing a court ruling invalidating that preemption authority?\n- Resolution: Federal court ruling explicitly invalidates EO preemption provisions regarding state AI regulation.\n- Prediction: 55%\n- Reasoning: Funding-withholding approach vulnerable; major questions doctrine risk.\n- Sources: Federal court opinions (Westlaw/Lexis/official sites)\n- Outcome: DID NOT OCCUR\n\n[^20] **Healthcare AI Compliance Rate (OCR rule)**\n- Question: As of 12/31/2026, what % of federally funded healthcare orgs implemented required AI discrimination risk mitigation under the May 2024 OCR rule?\n- Resolution: HHS OCR publishes audit-based compliance percentage.\n- Prediction: 65% (treated here as probability that compliance meets roughly that level)\n- Reasoning: Healthcare compliance capacity high; smaller facilities face technical hurdles.\n- Sources: HHS OCR audit summaries (per forecast definition)\n- Outcome: OCCURRED\n\n[^21] **NIST Framework Adoption**\n- Question: As of 6/30/2027, will NIST AI RMF be explicitly referenced as a compliance standard in regulations issued by at least three different federal agencies?\n- Resolution: Federal Register regulations cite NIST AI RMF as compliance standard/safe harbor.\n- Prediction: 70%\n- Reasoning: Agencies need shared technical standard; NIST RMF becoming de facto baseline.\n- Sources: Federal Register; agency regulations\n- Outcome: OCCURRED\n\n[^22] **AI Investment Impact**\n- Question: What will be YoY growth in U.S. private AI investment for 2026 vs 2025?\n- Resolution: PitchBook/CB Insights/Stanford AI Index final totals imply growth rate.\n- Prediction: 25% (treated as probability the growth is at/near that level)\n- Reasoning: Resilient investment; competition; regulatory uncertainty historically tolerated.\n- Sources: PitchBook; CB Insights; Stanford AI Index\n- Outcome: OCCURRED", + "twitter_posts": [ + "THE GOOD: Surprising consensus emerged across all AI systems on federal preemption - even the most pro-innovation voices want baseline standards. The split? 
Whether it's 25% likely (Grok) or 55% likely (DeepSeek) to work effectively.", + "THE GOOD: Every single AI policy analyst agreed on mandatory bias audits for hiring/lending AI, despite representing very different approaches. The shared insight: discrimination lawsuits will force this anyway (40% chance by 2026).", + "THE GOOD: Unexpected alliance between safety hawks and innovation advocates on federal-state coordination frameworks. Both sides realize the current patchwork is failing - though they disagree on who should lead.", + "THE BAD: Glaring blind spot across all analyses: international coordination. While debating state vs federal authority, none seriously addressed how US regulations interact with EU AI Act or Chinese standards.", + "THE BAD: The healthcare AI compliance discussion was superficial despite 65% forecast for strong compliance rates. Missing: how medical liability insurance will reshape AI adoption faster than any regulation.", + "THE UGLY: Stark 40-point spread on California preemption (Gemini: 65% vs others ~25%). This isn't just a forecast disagreement - it reveals fundamentally different views on federal vs state power in tech regulation.", + "THE UGLY: The 'innovation vs safety' tradeoff got real ugly fast. Proposals ranged from $5B safety research (Grok) to 'open innovation safe harbors' (Gemini) - no middle ground emerged despite hours of deliberation.", + "THE UGLY: AI discrimination enforcement forecasts reveal uncomfortable truth: 25% chance of federal action by 2026, 40% chance lawsuits force private action. Translation: we're waiting for victims, not preventing harm.", + "THE INTERESTING: Counter-intuitive finding: the AI system most bullish on federal legislation (30% by 2027) was also most pessimistic about state enforcement (35% for Colorado Act). Suggests federal gridlock, not leadership.", + "THE INTERESTING: Wild divergence on frontier AI safety: some want compute thresholds + incident reporting, others want evaluation regimes. Yet all agree current voluntary commitments will fail - just disagree on the replacement.", + "THE INTERESTING: The NO FAKES Act got only 25% odds despite bipartisan support. Why? The analysts see deepfakes as a narrow use case when the real action is in hiring/lending discrimination." + ], + "timestamp": "2026-01-30T02:06:39.177392Z", + "errors": [], + "total_price_estimate": 5.052529120000002 +}