From f5a2b0789201c78afde93b3675fe43d91d3e4c90 Mon Sep 17 00:00:00 2001
From: lsabor
Date: Sun, 14 Dec 2025 12:04:14 -0800
Subject: [PATCH 01/10] adds daily task to update global bot leaderboard, refactors command for convenience

---
 misc/management/commands/cron.py              |   8 +
 scoring/jobs.py                               |  17 +
 .../commands/update_global_bot_leaderboard.py | 562 +++++++++---------
 3 files changed, 306 insertions(+), 281 deletions(-)

diff --git a/misc/management/commands/cron.py b/misc/management/commands/cron.py
index cf9360552e..0734505f5d 100644
--- a/misc/management/commands/cron.py
+++ b/misc/management/commands/cron.py
@@ -26,6 +26,7 @@
 from scoring.jobs import (
     finalize_leaderboards,
     update_global_comment_and_question_leaderboards,
+    update_global_bot_leaderboard,
 )
 from scoring.utils import update_medal_points_and_ranks
 
@@ -200,6 +201,13 @@ def handle(self, *args, **options):
         max_instances=1,
         replace_existing=True,
     )
+    scheduler.add_job(
+        close_old_connections(update_global_bot_leaderboard),
+        trigger=CronTrigger.from_crontab("0 5 * * *"),  # Every day at 05:00 UTC
+        id="update_global_bot_leaderboard",
+        max_instances=1,
+        replace_existing=True,
+    )
 
 #
 # Comment Jobs
diff --git a/scoring/jobs.py b/scoring/jobs.py
index 89c6900a89..1d3b60dace 100644
--- a/scoring/jobs.py
+++ b/scoring/jobs.py
@@ -6,9 +6,26 @@
 from scoring.models import Leaderboard
 from scoring.utils import update_project_leaderboard
 
+from scoring.management.commands.update_global_bot_leaderboard import (
+    run_update_global_bot_leaderboard,
+)
+
 logger = logging.getLogger(__name__)
 
 
+def update_global_bot_leaderboard():
+    global_bot_leaderboard = Leaderboard.objects.filter(
+        name="Global Bot Leaderboard",
+    ).first()
+    if not global_bot_leaderboard:
+        logger.warning("Global Bot Leaderboard not found.")
+        return
+    try:
+        run_update_global_bot_leaderboard()
+    except Exception as e:
+        logger.error(f"Error updating Global Bot Leaderboard: {e}")
+
+
 def update_global_comment_and_question_leaderboards():
     global_leaderboards = Leaderboard.objects.filter(
         finalized=False,
diff --git a/scoring/management/commands/update_global_bot_leaderboard.py b/scoring/management/commands/update_global_bot_leaderboard.py
index 479e57a3db..89594a307e 100644
--- a/scoring/management/commands/update_global_bot_leaderboard.py
+++ b/scoring/management/commands/update_global_bot_leaderboard.py
@@ -604,302 +604,302 @@ def bootstrap_skills(
     return ci_lower, ci_upper
 
 
-class Command(BaseCommand):
-    help = """
-    Update the global bots leaderboard
-    """
-
-    def handle(self, *args, **options) -> None:
-        baseline_player: int | str = 236038  # metac-gpt-4o+asknews
-        bootstrap_iterations = 30
-
-        # SETUP: users to evaluate & questions
-        print("Initializing...")
-        users: QuerySet[User] = User.objects.filter(
-            metadata__bot_details__metac_bot=True,
-            metadata__bot_details__include_in_calculations=True,
-            metadata__bot_details__display_in_leaderboard=True,
-            is_active=True,
-        ).order_by("id")
-        user_forecast_exists = Forecast.objects.filter(
-            question_id=OuterRef("pk"), author__in=users
-        )
-        questions: QuerySet[Question] = (
-            Question.objects.filter(
-                Q(
-                    related_posts__post__default_project__default_permission__in=[
-                        "viewer",
-                        "forecaster",
-                    ]
-                )
-                | Q(
-                    related_posts__post__default_project_id__in=[
-                        3349,  # aib q3 2024
-                        32506,  # aib q4 2024
-                        32627,  # aib q1 2025
-                        32721,  # aib q2 2025
-                        32813,  # aib fall 2025
-                    ]
-                ),
-                related_posts__post__curation_status=Post.CurationStatus.APPROVED,
-                resolution__isnull=False,
-                scheduled_close_time__lte=timezone.now(),
-            )
-
.exclude(related_posts__post__default_project__slug__startswith="minibench") - .exclude(resolution__in=UnsuccessfulResolutionType) - .filter(Exists(user_forecast_exists)) - .prefetch_related( # only prefetch forecasts from those users - Prefetch( - "user_forecasts", queryset=Forecast.objects.filter(author__in=users) - ) +def run_update_global_bot_leaderboard() -> None: + baseline_player: int | str = 236038 # metac-gpt-4o+asknews + bootstrap_iterations = 30 + + # SETUP: users to evaluate & questions + print("Initializing...") + users: QuerySet[User] = User.objects.filter( + metadata__bot_details__metac_bot=True, + metadata__bot_details__include_in_calculations=True, + metadata__bot_details__display_in_leaderboard=True, + is_active=True, + ).order_by("id") + user_forecast_exists = Forecast.objects.filter( + question_id=OuterRef("pk"), author__in=users + ) + questions: QuerySet[Question] = ( + Question.objects.filter( + Q( + related_posts__post__default_project__default_permission__in=[ + "viewer", + "forecaster", + ] ) - .order_by("id") - .distinct("id") + | Q( + related_posts__post__default_project_id__in=[ + 3349, # aib q3 2024 + 32506, # aib q4 2024 + 32627, # aib q1 2025 + 32721, # aib q2 2025 + 32813, # aib fall 2025 + ] + ), + related_posts__post__curation_status=Post.CurationStatus.APPROVED, + resolution__isnull=False, + scheduled_close_time__lte=timezone.now(), ) - ############### - # make sure they have at least 100 resolved questions - print("initialize list") - question_list = list(questions) - print("Filtering users.") - scored_question_counts: dict[int, int] = defaultdict(int) - c = users.count() - i = 0 - for user in users: - i += 1 - print(i, "/", c, end="\r") - scored_question_counts[user.id] = ( - Score.objects.filter(user=user, question__in=question_list) - .distinct("question_id") - .count() + .exclude(related_posts__post__default_project__slug__startswith="minibench") + .exclude(resolution__in=UnsuccessfulResolutionType) + .filter(Exists(user_forecast_exists)) + .prefetch_related( # only prefetch forecasts from those users + Prefetch( + "user_forecasts", queryset=Forecast.objects.filter(author__in=users) ) - excluded_ids = [ - uid for uid, count in scored_question_counts.items() if count < 100 - ] - users = users.exclude(id__in=excluded_ids) - ############### - print("Initializing... 
DONE") - - # Gather head to head scores - user1_ids, user2_ids, question_ids, scores, weights = gather_data( - users, questions ) - - # choose baseline player if not already chosen - if not baseline_player: - baseline_player = max( - set(user1_ids) | set(user2_ids), key=(user1_ids + user2_ids).count - ) - # get variance of average scores (used in rescaling) - avg_scores = get_avg_scores(user1_ids, user2_ids, scores, weights) - var_avg_scores = ( - np.var(np.array(list(avg_scores.values()))) if len(avg_scores) > 1 else 0 + .order_by("id") + .distinct("id") + ) + ############### + # make sure they have at least 100 resolved questions + print("initialize list") + question_list = list(questions) + print("Filtering users.") + scored_question_counts: dict[int, int] = defaultdict(int) + c = users.count() + i = 0 + for user in users: + i += 1 + print(i, "/", c, end="\r") + scored_question_counts[user.id] = ( + Score.objects.filter(user=user, question__in=question_list) + .distinct("question_id") + .count() ) - - # compute skills initially - skills = get_skills( - user1_ids=user1_ids, - user2_ids=user2_ids, - question_ids=question_ids, - scores=scores, - weights=weights, - baseline_player=baseline_player, - var_avg_scores=var_avg_scores, - verbose=False, + excluded_ids = [uid for uid, count in scored_question_counts.items() if count < 100] + users = users.exclude(id__in=excluded_ids) + ############### + print("Initializing... DONE") + + # Gather head to head scores + user1_ids, user2_ids, question_ids, scores, weights = gather_data(users, questions) + + # choose baseline player if not already chosen + if not baseline_player: + baseline_player = max( + set(user1_ids) | set(user2_ids), key=(user1_ids + user2_ids).count ) + # get variance of average scores (used in rescaling) + avg_scores = get_avg_scores(user1_ids, user2_ids, scores, weights) + var_avg_scores = ( + np.var(np.array(list(avg_scores.values()))) if len(avg_scores) > 1 else 0 + ) - # Compute bootstrap confidence intervals - ci_lower, ci_upper = bootstrap_skills( - user1_ids, - user2_ids, - question_ids, - scores, - weights, - var_avg_scores, - baseline_player=baseline_player, - bootstrap_iterations=bootstrap_iterations, - ) - print() + # compute skills initially + skills = get_skills( + user1_ids=user1_ids, + user2_ids=user2_ids, + question_ids=question_ids, + scores=scores, + weights=weights, + baseline_player=baseline_player, + var_avg_scores=var_avg_scores, + verbose=False, + ) - ordered_skills = sorted( - [(user, skill) for user, skill in skills.items()], key=lambda x: -x[1] - ) - player_stats: dict[int | str, list] = defaultdict(lambda: [0, set()]) - for u1id, u2id, qid in zip(user1_ids, user2_ids, question_ids): - player_stats[u1id][0] += 1 - player_stats[u1id][1].add(qid) - player_stats[u2id][0] += 1 - player_stats[u2id][1].add(qid) - - ########################################################################## - ########################################################################## - ########################################################################## - ########################################################################## - # UPDATE Leaderboard - print("Updating leaderboard...", end="\r") - leaderboard, _ = Leaderboard.objects.get_or_create( - name="Global Bot Leaderboard", - project=Project.objects.get(type=Project.ProjectTypes.SITE_MAIN), - score_type=LeaderboardScoreTypes.MANUAL, - bot_status=Project.BotLeaderboardStatus.BOTS_ONLY, - ) - entry_dict = { - entry.user_id or entry.aggregation_method: entry - for entry 
in list(leaderboard.entries.all()) - } - rank = 1 - question_count = len(set(question_ids)) - seen = set() - for uid, skill in ordered_skills: - contribution_count = len(player_stats[uid][1]) - - excluded = False - if isinstance(uid, int): - user = User.objects.get(id=uid) - bot_details = user.metadata["bot_details"] - if not bot_details.get("display_in_leaderboard"): - excluded = True - - entry: LeaderboardEntry = entry_dict.pop(uid, LeaderboardEntry()) - entry.user_id = uid if isinstance(uid, int) else None - entry.aggregation_method = uid if isinstance(uid, str) else None - entry.leaderboard = leaderboard - entry.score = skill - entry.rank = rank - entry.excluded = excluded - entry.show_when_excluded = False - entry.contribution_count = contribution_count - entry.coverage = contribution_count / question_count - entry.calculated_on = timezone.now() - entry.ci_lower = ci_lower.get(uid, None) - entry.ci_upper = ci_upper.get(uid, None) - # TODO: support for more efficient saving once this is implemented - # for leaderboards with more than 100 entries - entry.save() - seen.add(entry.id) - - if not excluded: - rank += 1 - print("Updating leaderboard... DONE") - # delete unseen entries - leaderboard.entries.exclude(id__in=seen).delete() - print() - - ########################################################################## - ########################################################################## - ########################################################################## - ########################################################################## - # DISPLAY - print("Results:") + # Compute bootstrap confidence intervals + ci_lower, ci_upper = bootstrap_skills( + user1_ids, + user2_ids, + question_ids, + scores, + weights, + var_avg_scores, + baseline_player=baseline_player, + bootstrap_iterations=bootstrap_iterations, + ) + print() + + ordered_skills = sorted( + [(user, skill) for user, skill in skills.items()], key=lambda x: -x[1] + ) + player_stats: dict[int | str, list] = defaultdict(lambda: [0, set()]) + for u1id, u2id, qid in zip(user1_ids, user2_ids, question_ids): + player_stats[u1id][0] += 1 + player_stats[u1id][1].add(qid) + player_stats[u2id][0] += 1 + player_stats[u2id][1].add(qid) + + ########################################################################## + ########################################################################## + ########################################################################## + ########################################################################## + # UPDATE Leaderboard + print("Updating leaderboard...", end="\r") + leaderboard, _ = Leaderboard.objects.get_or_create( + name="Global Bot Leaderboard", + project=Project.objects.get(type=Project.ProjectTypes.SITE_MAIN), + score_type=LeaderboardScoreTypes.MANUAL, + bot_status=Project.BotLeaderboardStatus.BOTS_ONLY, + ) + entry_dict = { + entry.user_id or entry.aggregation_method: entry + for entry in list(leaderboard.entries.all()) + } + rank = 1 + question_count = len(set(question_ids)) + seen = set() + for uid, skill in ordered_skills: + contribution_count = len(player_stats[uid][1]) + + excluded = False + if isinstance(uid, int): + user = User.objects.get(id=uid) + bot_details = user.metadata["bot_details"] + if not bot_details.get("display_in_leaderboard"): + excluded = True + + entry: LeaderboardEntry = entry_dict.pop(uid, LeaderboardEntry()) + entry.user_id = uid if isinstance(uid, int) else None + entry.aggregation_method = uid if isinstance(uid, str) else None + entry.leaderboard = 
leaderboard + entry.score = skill + entry.rank = rank + entry.excluded = excluded + entry.show_when_excluded = False + entry.contribution_count = contribution_count + entry.coverage = contribution_count / question_count + entry.calculated_on = timezone.now() + entry.ci_lower = ci_lower.get(uid, None) + entry.ci_upper = ci_upper.get(uid, None) + # TODO: support for more efficient saving once this is implemented + # for leaderboards with more than 100 entries + entry.save() + seen.add(entry.id) + + if not excluded: + rank += 1 + print("Updating leaderboard... DONE") + # delete unseen entries + leaderboard.entries.exclude(id__in=seen).delete() + print() + + ########################################################################## + ########################################################################## + ########################################################################## + ########################################################################## + # DISPLAY + print("Results:") + print( + "| 2.5% " + "| Skill " + "| 97.5% " + "| Match " + "| Quest. " + "| ID " + "| Username " + ) + print( + "| Match " + "| " + "| Match " + "| Count " + "| Count " + "| " + "| " + ) + print( + "==========================================" + "==========================================" + ) + unevaluated = ( + set(user1_ids) | set(user2_ids) | set(users.values_list("id", flat=True)) + ) + for uid, skill in ordered_skills: + if isinstance(uid, str): + username = uid + else: + username = User.objects.get(id=uid).username + unevaluated.remove(uid) + lower = ci_lower.get(uid, 0) + upper = ci_upper.get(uid, 0) print( - "| 2.5% " - "| Skill " - "| 97.5% " - "| Match " - "| Quest. " - "| ID " - "| Username " + f"| {round(lower, 2):>6} " + f"| {round(skill, 2):>6} " + f"| {round(upper, 2):>6} " + f"| {player_stats[uid][0]:>6} " + f"| {len(player_stats[uid][1]):>6} " + f"| {uid if isinstance(uid, int) else '':>6} " + f"| {username}" ) + for uid in unevaluated: + if isinstance(uid, str): + username = uid + else: + username = User.objects.get(id=uid).username print( - "| Match " - "| " - "| Match " - "| Count " - "| Count " - "| " - "| " + "| ------ " + "| ------ " + "| ------ " + "| ------ " + "| ------ " + f"| {uid if isinstance(uid, int) else '':>5} " + f"| {username}" ) + print() + + ########################################################################## + ########################################################################## + ########################################################################## + ########################################################################## + # TESTS + skills_array = np.array(list(skills.values())) + + # 1. Correllation between skill and avg_score (DO NOT HAVE YET - need avg_score) + x = [] + y = [] + for uid in user1_ids: + x.append(skills.get(uid, 0)) + y.append(avg_scores.get(uid, 0)) + correlation = np.corrcoef(x, y) + print(f"\nCorrelation between skill and avg_score: {correlation[0][1]}") + + # 2. 
Shapiro-Wilk test (good for small to medium samples) + if len(skills_array) >= 3: + shapiro_stat, shapiro_p = stats.shapiro(skills_array) print( - "==========================================" - "==========================================" + f" Shapiro-Wilk test: statistic={shapiro_stat:.4f}, p-value={shapiro_p:.4f}" ) - unevaluated = ( - set(user1_ids) | set(user2_ids) | set(users.values_list("id", flat=True)) - ) - for uid, skill in ordered_skills: - if isinstance(uid, str): - username = uid - else: - username = User.objects.get(id=uid).username - unevaluated.remove(uid) - lower = ci_lower.get(uid, 0) - upper = ci_upper.get(uid, 0) - print( - f"| {round(lower, 2):>6} " - f"| {round(skill, 2):>6} " - f"| {round(upper, 2):>6} " - f"| {player_stats[uid][0]:>6} " - f"| {len(player_stats[uid][1]):>6} " - f"| {uid if isinstance(uid, int) else '':>6} " - f"| {username}" - ) - for uid in unevaluated: - if isinstance(uid, str): - username = uid - else: - username = User.objects.get(id=uid).username - print( - "| ------ " - "| ------ " - "| ------ " - "| ------ " - "| ------ " - f"| {uid if isinstance(uid, int) else '':>5} " - f"| {username}" - ) - print() - - ########################################################################## - ########################################################################## - ########################################################################## - ########################################################################## - # TESTS - skills_array = np.array(list(skills.values())) - - # 1. Correllation between skill and avg_score (DO NOT HAVE YET - need avg_score) - x = [] - y = [] - for uid in user1_ids: - x.append(skills.get(uid, 0)) - y.append(avg_scores.get(uid, 0)) - correlation = np.corrcoef(x, y) - print(f"\nCorrelation between skill and avg_score: {correlation[0][1]}") - - # 2. Shapiro-Wilk test (good for small to medium samples) - if len(skills_array) >= 3: - shapiro_stat, shapiro_p = stats.shapiro(skills_array) - print( - f" Shapiro-Wilk test: statistic={shapiro_stat:.4f}, p-value={shapiro_p:.4f}" - ) - if shapiro_p > 0.05: - print(" → Skills appear normally distributed (p > 0.05)") - else: - print(" → Skills may not be normally distributed (p ≤ 0.05)") - - # 3. Anderson-Darling test (more sensitive to tails) - anderson_result = stats.anderson(skills_array, dist="norm") - print(f" Anderson-Darling test: statistic={anderson_result.statistic:.4f}") - # Check at 5% significance level - critical_5pct = anderson_result.critical_values[2] # Index 2 is 5% level - print(f" Critical value at 5%: {critical_5pct:.4f}") - if anderson_result.statistic < critical_5pct: - print(" → Skills appear normally distributed (stat < critical)") - else: - print(" → Skills may not be normally distributed (stat ≥ critical)") - - # 4. Kolmogorov-Smirnov test (compare to normal distribution) - ks_stat, ks_p = stats.kstest( - skills_array, "norm", args=(skills_array.mean(), skills_array.std()) - ) - print(f" Kolmogorov-Smirnov test: statistic={ks_stat:.4f}, p-value={ks_p:.4f}") - if ks_p > 0.05: + if shapiro_p > 0.05: print(" → Skills appear normally distributed (p > 0.05)") else: print(" → Skills may not be normally distributed (p ≤ 0.05)") - # 5. Summary statistics - print("\nSkill distribution summary:") - print(f" Mean: {skills_array.mean():.2f}") - print(f" Std: {skills_array.std():.2f}") - print(f" Skewness: {stats.skew(skills_array):.4f}") - print(f" Kurtosis: {stats.kurtosis(skills_array):.4f}") - print() + # 3. 
Anderson-Darling test (more sensitive to tails) + anderson_result = stats.anderson(skills_array, dist="norm") + print(f" Anderson-Darling test: statistic={anderson_result.statistic:.4f}") + # Check at 5% significance level + critical_5pct = anderson_result.critical_values[2] # Index 2 is 5% level + print(f" Critical value at 5%: {critical_5pct:.4f}") + if anderson_result.statistic < critical_5pct: + print(" → Skills appear normally distributed (stat < critical)") + else: + print(" → Skills may not be normally distributed (stat ≥ critical)") + + # 4. Kolmogorov-Smirnov test (compare to normal distribution) + ks_stat, ks_p = stats.kstest( + skills_array, "norm", args=(skills_array.mean(), skills_array.std()) + ) + print(f" Kolmogorov-Smirnov test: statistic={ks_stat:.4f}, p-value={ks_p:.4f}") + if ks_p > 0.05: + print(" → Skills appear normally distributed (p > 0.05)") + else: + print(" → Skills may not be normally distributed (p ≤ 0.05)") + + # 5. Summary statistics + print("\nSkill distribution summary:") + print(f" Mean: {skills_array.mean():.2f}") + print(f" Std: {skills_array.std():.2f}") + print(f" Skewness: {stats.skew(skills_array):.4f}") + print(f" Kurtosis: {stats.kurtosis(skills_array):.4f}") + print() + + +class Command(BaseCommand): + help = """ + Update the global bots leaderboard + """ + + def handle(self, *args, **options) -> None: + run_update_global_bot_leaderboard() From 4d2c0f76c64e857d82663837a7f9399a5c999141 Mon Sep 17 00:00:00 2001 From: lsabor Date: Fri, 23 Jan 2026 13:56:35 -0800 Subject: [PATCH 02/10] add partial cache support --- .../commands/update_global_bot_leaderboard.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/scoring/management/commands/update_global_bot_leaderboard.py b/scoring/management/commands/update_global_bot_leaderboard.py index 89594a307e..3d44e3dbc9 100644 --- a/scoring/management/commands/update_global_bot_leaderboard.py +++ b/scoring/management/commands/update_global_bot_leaderboard.py @@ -104,8 +104,13 @@ def get_score_pair( def gather_data( users: QuerySet[User], questions: QuerySet[Question], - cache: bool = False, + cache: bool = True, ) -> tuple[list[int | str], list[int | str], list[int], list[float], list[float]]: + user1_ids: list[int | str] = [] + user2_ids: list[int | str] = [] + question_ids: list[int] = [] + scores: list[float] = [] + coverages: list[float] = [] if cache: csv_path = Path("HtH_score_data.csv") if csv_path.exists(): @@ -124,11 +129,6 @@ def _deserialize_user(value: str) -> int | str: except ValueError: return value - user1_ids: list[int | str] = [] - user2_ids: list[int | str] = [] - question_ids: list[int] = [] - scores: list[float] = [] - coverages: list[float] = [] with csv_path.open() as input_file: reader = csv.DictReader(input_file) for row in reader: @@ -138,7 +138,7 @@ def _deserialize_user(value: str) -> int | str: question_ids.append(int(row["questionid"])) scores.append(float(row["score"])) coverages.append(float(row["coverage"])) - return (user1_ids, user2_ids, question_ids, scores, coverages) + cached_question_ids = set(question_ids) # TODO: make authoritative mapping print("creating AIB <> Pro AIB question mapping...", end="\r") @@ -187,12 +187,10 @@ def _deserialize_user(value: str) -> int | str: print("| Question | ID | Pairing | Duration | Est. 
Duration |") t0 = datetime.now() question_count = len(questions) - user1_ids: list[int | str] = [] - user2_ids: list[int | str] = [] - question_ids: list[int] = [] - scores: list[float] = [] - coverages: list[float] = [] for question_number, question in enumerate(questions.iterator(chunk_size=10), 1): + if question.id in cached_question_ids: + # Skip questions that are already cached + continue # if question_number % 50 != 0: # continue question_print_str = ( From fb932d940979b42fa38eafdc588124bdfd8cc725 Mon Sep 17 00:00:00 2001 From: lsabor Date: Sun, 1 Feb 2026 07:29:44 -0800 Subject: [PATCH 03/10] save work --- .../commands/update_global_bot_leaderboard.py | 106 ++++++++++++------ 1 file changed, 71 insertions(+), 35 deletions(-) diff --git a/scoring/management/commands/update_global_bot_leaderboard.py b/scoring/management/commands/update_global_bot_leaderboard.py index 73d6e45ba3..615ea20b67 100644 --- a/scoring/management/commands/update_global_bot_leaderboard.py +++ b/scoring/management/commands/update_global_bot_leaderboard.py @@ -105,12 +105,15 @@ def gather_data( users: QuerySet[User], questions: QuerySet[Question], cache: bool = True, -) -> tuple[list[int | str], list[int | str], list[int], list[float], list[float]]: +) -> tuple[ + list[int | str], list[int | str], list[int], list[float], list[float], list[float] +]: user1_ids: list[int | str] = [] user2_ids: list[int | str] = [] question_ids: list[int] = [] scores: list[float] = [] coverages: list[float] = [] + timestamps: list[float] = [] if cache: csv_path = Path("HtH_score_data.csv") if csv_path.exists(): @@ -138,6 +141,7 @@ def _deserialize_user(value: str) -> int | str: question_ids.append(int(row["questionid"])) scores.append(float(row["score"])) coverages.append(float(row["coverage"])) + timestamps.append(float(row["timestamp"])) cached_question_ids = set(question_ids) # TODO: make authoritative mapping @@ -186,37 +190,56 @@ def _deserialize_user(value: str) -> int | str: t0 = datetime.now() question_count = len(questions) for question_number, question in enumerate(questions.iterator(chunk_size=10), 1): - if question.id in cached_question_ids: - # Skip questions that are already cached - continue - # if question_number % 50 != 0: - # continue + # TODO: cache results every ~100 questions, clearing lists of values question_print_str = ( f"\033[K" f"| {question_number:>5}/{question_count:<5} " f"| {question.id:<5} " ) + if question.id in cached_question_ids: + # Skip questions that are already cached + duration = datetime.now() - t0 + est_duration = duration / question_number * question_count + print( + f"{question_print_str}" + f"| {"N":>5}/{"A":<5} " + f"| {duration} " + f"| {est_duration} " + "|", + end="\r", + ) + continue # Get forecasts forecast_dict: dict[int | str, list[Forecast | AggregateForecast]] = ( defaultdict(list) ) - # bot forecasts - simple - bot_forecasts = question.user_forecasts.filter(author_id__in=user_ids).order_by( - "start_time" - ) - for f in bot_forecasts: - # don't include forecasts made 1 year or more after model release - user = user_id_map[f.author_id] - primary_base_model = user.metadata["bot_details"]["base_models"][0] + # bot forecasts + old_bot_ids: set[int] = set() + for user in users: + base_models = ( + (user.metadata or dict()) + .get("bot_details", dict()) + .get("base_models", []) + ) + # don't include bots on question that resolved 1 year or more + # after model release + primary_base_model = None if not base_models else base_models[0] + if not primary_base_model: + continue if 
release_date := primary_base_model.get("model_release_date"): if len(release_date) == 7: release_date += "-01" release = datetime.fromisoformat(release_date).replace( tzinfo=dt_timezone.utc ) - if f.start_time > release + timedelta(days=365): - continue - + if question.resolution_set_time > release + timedelta(days=365): + old_bot_ids.add(user.id) + bot_forecasts = ( + question.user_forecasts.filter(author_id__in=user_ids) + .exclude(author_id__in=old_bot_ids) + .order_by("start_time") + ) + for f in bot_forecasts: forecast_dict[f.author_id].append(f) # human aggregate forecasts - conditional on a bunch of stuff human_question: Question | None = aib_question_map.get(question, question) @@ -230,10 +253,13 @@ def _deserialize_user(value: str) -> int | str: if question.default_score_type == ScoreTypes.SPOT_PEER else AggregationMethod.RECENCY_WEIGHTED ) + # aggregate_forecasts = human_question.aggregate_forecasts.filter( + # method=aggregation_method + # ).order_by("start_time") aggregate_forecasts = get_aggregation_history( human_question, [aggregation_method], - minimize=False, + minimize=True, include_stats=False, include_bots=False, include_future=False, @@ -279,6 +305,7 @@ def _deserialize_user(value: str) -> int | str: question_ids.append(q) scores.append(u1s) coverages.append(cov) + timestamps.append(question.actual_resolve_time.timestamp()) print("\n") weights = coverages @@ -287,11 +314,15 @@ def _deserialize_user(value: str) -> int | str: with open("HtH_score_data.csv", "w") as output_file: writer = csv.writer(output_file) - writer.writerow(["user1", "user2", "questionid", "score", "coverage"]) - for row in zip(user1_ids, user2_ids, question_ids, scores, weights): + writer.writerow( + ["user1", "user2", "questionid", "score", "coverage", "timestamp"] + ) + for row in zip( + user1_ids, user2_ids, question_ids, scores, weights, timestamps + ): writer.writerow(row) - return (user1_ids, user2_ids, question_ids, scores, weights) + return (user1_ids, user2_ids, question_ids, scores, weights, timestamps) def get_avg_scores( @@ -607,9 +638,10 @@ def run_update_global_bot_leaderboard() -> None: # SETUP: users to evaluate & questions print("Initializing...") users: QuerySet[User] = User.objects.filter( - metadata__bot_details__metac_bot=True, - metadata__bot_details__include_in_calculations=True, - metadata__bot_details__display_in_leaderboard=True, + is_bot=True, + # metadata__bot_details__metac_bot=True, + # metadata__bot_details__include_in_calculations=True, + # metadata__bot_details__display_in_leaderboard=True, is_active=True, ).order_by("id") user_forecast_exists = Forecast.objects.filter( @@ -617,14 +649,9 @@ def run_update_global_bot_leaderboard() -> None: ) questions: QuerySet[Question] = ( Question.objects.filter( - Q( - related_posts__post__default_project__default_permission__in=[ - "viewer", - "forecaster", - ] - ) + Q(post__default_project__default_permission__in=["viewer", "forecaster"]) | Q( - related_posts__post__default_project_id__in=[ + post__default_project_id__in=[ 3349, # aib q3 2024 32506, # aib q4 2024 32627, # aib q1 2025 @@ -632,11 +659,11 @@ def run_update_global_bot_leaderboard() -> None: 32813, # aib fall 2025 ] ), - related_posts__post__curation_status=Post.CurationStatus.APPROVED, + post__curation_status=Post.CurationStatus.APPROVED, resolution__isnull=False, scheduled_close_time__lte=timezone.now(), ) - .exclude(related_posts__post__default_project__slug__startswith="minibench") + .exclude(post__default_project__slug__startswith="minibench") 
.exclude(resolution__in=UnsuccessfulResolutionType) .filter(Exists(user_forecast_exists)) .prefetch_related( # only prefetch forecasts from those users @@ -659,17 +686,26 @@ def run_update_global_bot_leaderboard() -> None: i += 1 print(i, "/", c, end="\r") scored_question_counts[user.id] = ( - Score.objects.filter(user=user, question__in=question_list) + Score.objects.filter( + user=user, + score_type="peer", + question__in=question_list, + ) .distinct("question_id") .count() ) excluded_ids = [uid for uid, count in scored_question_counts.items() if count < 100] users = users.exclude(id__in=excluded_ids) ############### + print(f"Filtered {c} users down to {users.count()}.") print("Initializing... DONE") # Gather head to head scores - user1_ids, user2_ids, question_ids, scores, weights = gather_data(users, questions) + user1_ids, user2_ids, question_ids, scores, weights, timestamps = gather_data( + users, questions + ) + + # TODO: set up support for yearly updates for all non-metac bots # choose baseline player if not already chosen if not baseline_player: From fd222894770ec3602bda209c2d221b0ff8fc67f2 Mon Sep 17 00:00:00 2001 From: lsabor Date: Sun, 1 Feb 2026 13:12:42 -0800 Subject: [PATCH 04/10] bug fixes --- .../commands/update_global_bot_leaderboard.py | 180 +++++++++++++----- 1 file changed, 136 insertions(+), 44 deletions(-) diff --git a/scoring/management/commands/update_global_bot_leaderboard.py b/scoring/management/commands/update_global_bot_leaderboard.py index 615ea20b67..e47a38ffff 100644 --- a/scoring/management/commands/update_global_bot_leaderboard.py +++ b/scoring/management/commands/update_global_bot_leaderboard.py @@ -1,5 +1,6 @@ import random from collections import defaultdict +import csv from pathlib import Path from datetime import datetime, timedelta, timezone as dt_timezone @@ -116,33 +117,18 @@ def gather_data( timestamps: list[float] = [] if cache: csv_path = Path("HtH_score_data.csv") - if csv_path.exists(): - userset = set([str(u.id) for u in users]) | { - "Pro Aggregate", - "Community Aggregate", - } - import csv - - def _deserialize_user(value: str) -> int | str: - value = value.strip() - if not value: - return value - try: - return int(value) - except ValueError: - return value - - with csv_path.open() as input_file: - reader = csv.DictReader(input_file) - for row in reader: - if (row["user1"] in userset) and (row["user2"] in userset): - user1_ids.append(_deserialize_user(row["user1"])) - user2_ids.append(_deserialize_user(row["user2"])) - question_ids.append(int(row["questionid"])) - scores.append(float(row["score"])) - coverages.append(float(row["coverage"])) - timestamps.append(float(row["timestamp"])) + if not csv_path.exists(): + with csv_path.open("w") as output_file: + writer = csv.writer(output_file) + writer.writerow( + ["user1", "user2", "questionid", "score", "coverage", "timestamp"] + ) + with csv_path.open("r") as input_file: + reader = csv.DictReader(input_file) + for row in reader: + question_ids.append(int(row["questionid"])) cached_question_ids = set(question_ids) + question_ids = [] # TODO: make authoritative mapping print("creating AIB <> Pro AIB question mapping...", end="\r") @@ -184,12 +170,13 @@ def _deserialize_user(value: str) -> int | str: print("creating AIB <> Pro AIB question mapping...DONE\n") # user_ids = users.values_list("id", flat=True) - user_id_map = {user.id: user for user in users} - print("Processing Pairwise Scoring:") - print("| Question | ID | Pairing | Duration | Est. 
Duration |") t0 = datetime.now() question_count = len(questions) - for question_number, question in enumerate(questions.iterator(chunk_size=10), 1): + questions = list(questions) + cache_interval = 100 + print("Processing Pairwise Scoring:") + print("| Question | ID | Pairing | Duration | Est. Duration |") + for question_number, question in enumerate(questions, 1): # TODO: cache results every ~100 questions, clearing lists of values question_print_str = ( f"\033[K" @@ -202,7 +189,7 @@ def _deserialize_user(value: str) -> int | str: est_duration = duration / question_number * question_count print( f"{question_print_str}" - f"| {"N":>5}/{"A":<5} " + f"| {'N':>5}/{'A':<5} " f"| {duration} " f"| {est_duration} " "|", @@ -306,23 +293,56 @@ def _deserialize_user(value: str) -> int | str: scores.append(u1s) coverages.append(cov) timestamps.append(question.actual_resolve_time.timestamp()) + if cache and question_number % cache_interval == 0: + print(f"\nCaching {len(user1_ids)} matches...") + with csv_path.open("a") as output_file: + writer = csv.writer(output_file) + for row in zip( + user1_ids, user2_ids, question_ids, scores, coverages, timestamps + ): + writer.writerow(row) + user1_ids = [] + user2_ids = [] + question_ids = [] + scores = [] + coverages = [] + timestamps = [] print("\n") - weights = coverages if cache: - import csv - - with open("HtH_score_data.csv", "w") as output_file: + with csv_path.open("a") as output_file: writer = csv.writer(output_file) - writer.writerow( - ["user1", "user2", "questionid", "score", "coverage", "timestamp"] - ) for row in zip( - user1_ids, user2_ids, question_ids, scores, weights, timestamps + user1_ids, user2_ids, question_ids, scores, coverages, timestamps ): writer.writerow(row) - - return (user1_ids, user2_ids, question_ids, scores, weights, timestamps) + user1_ids = [] + user2_ids = [] + question_ids = [] + scores = [] + coverages = [] + timestamps = [] + + def _deserialize_user(value: str) -> int | str: + value = value.strip() + if not value: + return value + try: + return int(value) + except ValueError: + return value + + with csv_path.open("r") as input_file: + reader = csv.DictReader(input_file) + for row in reader: + user1_ids.append(_deserialize_user(row["user1"])) + user2_ids.append(_deserialize_user(row["user2"])) + question_ids.append(int(row["questionid"])) + scores.append(float(row["score"])) + coverages.append(float(row["coverage"])) + timestamps.append(float(row["timestamp"])) + + return (user1_ids, user2_ids, question_ids, scores, coverages, timestamps) def get_avg_scores( @@ -434,7 +454,7 @@ def estimate_variances_from_head_to_head( ) print(f"σ_true (skill variance): {skill_variance:.4f}") print(f"alpha = (σ_error / σ_true)² = {alpha:.4f}") - return 2 + return alpha def compute_skills( @@ -610,7 +630,7 @@ def bootstrap_skills( boot_skills = get_skills( user1_ids=boot_user1_ids, user2_ids=boot_user2_ids, - question_ids=question_ids, + question_ids=boot_question_ids, scores=boot_scores, weights=boot_weights, baseline_player=baseline_player, @@ -705,7 +725,79 @@ def run_update_global_bot_leaderboard() -> None: users, questions ) - # TODO: set up support for yearly updates for all non-metac bots + # for pro aggregation, community aggregate, and any non-metac bot, + # duplicate rows indicating year-specific achievements + user_map = {user.id: user for user in users} + user_map["Pro Aggregate"] = "Pro Aggregate" + user_map["Community Aggregate"] = "Community Aggregate" + new_rows = [] + for user1_id, user2_id, question_id, score, weight, 
timestamp in zip( + user1_ids, user2_ids, question_ids, scores, weights, timestamps + ): + user1 = user_map[user1_id] + if isinstance(user1, User): + if ( + not (user1.metadata or dict()) + .get("bot_details", dict()) + .get("metac_bot") + ): + # non-metac bot + time = datetime.fromtimestamp(timestamp, dt_timezone.utc) + new_rows.append( + ( + f"{user1.username} {time.year}", + user2_id, + question_id, + score, + weight, + timestamp, + ) + ) + else: + # aggregation methods + time = datetime.fromtimestamp(timestamp, dt_timezone.utc) + new_rows.append( + ( + f"{user1} {time.year}", + user2_id, + question_id, + score, + weight, + timestamp, + ) + ) + user2 = user_map[user2_id] + if isinstance(user2, User): + if not ( + not (user2.metadata or dict()) + .get("bot_details", dict()) + .get("metac_bot") + ): + # non-metac bot + time = datetime.fromtimestamp(timestamp, dt_timezone.utc) + new_rows.append( + ( + user1_id, + f"{user2.username} {time.year}", + question_id, + -score, + weight, + timestamp, + ) + ) + else: + # aggregation methods + time = datetime.fromtimestamp(timestamp, dt_timezone.utc) + new_rows.append( + ( + user1_id, + f"{user2} {time.year}", + question_id, + -score, + weight, + timestamp, + ) + ) # choose baseline player if not already chosen if not baseline_player: From 33ebff2a6853e193aea3461a6c4f54c8bdde8e91 Mon Sep 17 00:00:00 2001 From: lsabor Date: Wed, 4 Feb 2026 08:48:02 -0800 Subject: [PATCH 05/10] save work --- .../commands/update_global_bot_leaderboard.py | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/scoring/management/commands/update_global_bot_leaderboard.py b/scoring/management/commands/update_global_bot_leaderboard.py index e47a38ffff..4f9a5f6757 100644 --- a/scoring/management/commands/update_global_bot_leaderboard.py +++ b/scoring/management/commands/update_global_bot_leaderboard.py @@ -255,6 +255,7 @@ def gather_data( pass elif question in aib_question_map: # set the last aggregate to be the one that gets scored + # TODO: instead grab the aggregate that was live at spot scoring time forecast = aggregate_forecasts[-1] forecast.start_time = question.get_spot_scoring_time() - timedelta( seconds=1 @@ -691,7 +692,7 @@ def run_update_global_bot_leaderboard() -> None: "user_forecasts", queryset=Forecast.objects.filter(author__in=users) ) ) - .order_by("id") + .order_by("?") .distinct("id") ) ############### @@ -721,7 +722,7 @@ def run_update_global_bot_leaderboard() -> None: print("Initializing... 
DONE") # Gather head to head scores - user1_ids, user2_ids, question_ids, scores, weights, timestamps = gather_data( + user1_ids, user2_ids, question_ids, scores, coverages, timestamps = gather_data( users, questions ) @@ -731,8 +732,8 @@ def run_update_global_bot_leaderboard() -> None: user_map["Pro Aggregate"] = "Pro Aggregate" user_map["Community Aggregate"] = "Community Aggregate" new_rows = [] - for user1_id, user2_id, question_id, score, weight, timestamp in zip( - user1_ids, user2_ids, question_ids, scores, weights, timestamps + for user1_id, user2_id, question_id, score, coverage, timestamp in zip( + user1_ids, user2_ids, question_ids, scores, coverages, timestamps ): user1 = user_map[user1_id] if isinstance(user1, User): @@ -749,7 +750,7 @@ def run_update_global_bot_leaderboard() -> None: user2_id, question_id, score, - weight, + coverage, timestamp, ) ) @@ -762,13 +763,13 @@ def run_update_global_bot_leaderboard() -> None: user2_id, question_id, score, - weight, + coverage, timestamp, ) ) user2 = user_map[user2_id] if isinstance(user2, User): - if not ( + if ( not (user2.metadata or dict()) .get("bot_details", dict()) .get("metac_bot") @@ -781,7 +782,7 @@ def run_update_global_bot_leaderboard() -> None: f"{user2.username} {time.year}", question_id, -score, - weight, + coverage, timestamp, ) ) @@ -794,10 +795,17 @@ def run_update_global_bot_leaderboard() -> None: f"{user2} {time.year}", question_id, -score, - weight, + coverage, timestamp, ) ) + for user1_id, user2_id, question_id, score, coverage, timestamp in new_rows: + user1_ids.append(user1_id) + user2_ids.append(user2_id) + question_ids.append(question_id) + scores.append(score) + coverages.append(coverage) + timestamps.append(timestamp) # choose baseline player if not already chosen if not baseline_player: @@ -805,7 +813,7 @@ def run_update_global_bot_leaderboard() -> None: set(user1_ids) | set(user2_ids), key=(user1_ids + user2_ids).count ) # get variance of average scores (used in rescaling) - avg_scores = get_avg_scores(user1_ids, user2_ids, scores, weights) + avg_scores = get_avg_scores(user1_ids, user2_ids, scores, coverages) var_avg_scores = ( np.var(np.array(list(avg_scores.values()))) if len(avg_scores) > 1 else 0 ) @@ -816,7 +824,7 @@ def run_update_global_bot_leaderboard() -> None: user2_ids=user2_ids, question_ids=question_ids, scores=scores, - weights=weights, + weights=coverages, baseline_player=baseline_player, var_avg_scores=var_avg_scores, verbose=False, @@ -828,7 +836,7 @@ def run_update_global_bot_leaderboard() -> None: user2_ids, question_ids, scores, - weights, + coverages, var_avg_scores, baseline_player=baseline_player, bootstrap_iterations=bootstrap_iterations, @@ -870,8 +878,8 @@ def run_update_global_bot_leaderboard() -> None: excluded = False if isinstance(uid, int): user = User.objects.get(id=uid) - bot_details = user.metadata["bot_details"] - if not bot_details.get("display_in_leaderboard"): + bot_details = (user.metadata or dict()).get("bot_details") + if bot_details and not bot_details.get("display_in_leaderboard"): excluded = True entry: LeaderboardEntry = entry_dict.pop(uid, LeaderboardEntry()) From fb72e119da70477f1778f880e3f7279610dd8aad Mon Sep 17 00:00:00 2001 From: lsabor Date: Wed, 4 Feb 2026 09:14:43 -0800 Subject: [PATCH 06/10] save work --- .../commands/update_global_bot_leaderboard.py | 212 +++++++++--------- utils/the_math/aggregations.py | 9 +- 2 files changed, 112 insertions(+), 109 deletions(-) diff --git a/scoring/management/commands/update_global_bot_leaderboard.py 
b/scoring/management/commands/update_global_bot_leaderboard.py index 4f9a5f6757..3dc61b3700 100644 --- a/scoring/management/commands/update_global_bot_leaderboard.py +++ b/scoring/management/commands/update_global_bot_leaderboard.py @@ -246,7 +246,7 @@ def gather_data( aggregate_forecasts = get_aggregation_history( human_question, [aggregation_method], - minimize=True, + minimize=100, include_stats=False, include_bots=False, include_future=False, @@ -293,7 +293,7 @@ def gather_data( question_ids.append(q) scores.append(u1s) coverages.append(cov) - timestamps.append(question.actual_resolve_time.timestamp()) + timestamps.append(question.actual_close_time.timestamp()) if cache and question_number % cache_interval == 0: print(f"\nCaching {len(user1_ids)} matches...") with csv_path.open("a") as output_file: @@ -695,30 +695,30 @@ def run_update_global_bot_leaderboard() -> None: .order_by("?") .distinct("id") ) - ############### - # make sure they have at least 100 resolved questions - print("initialize list") - question_list = list(questions) - print("Filtering users.") - scored_question_counts: dict[int, int] = defaultdict(int) - c = users.count() - i = 0 - for user in users: - i += 1 - print(i, "/", c, end="\r") - scored_question_counts[user.id] = ( - Score.objects.filter( - user=user, - score_type="peer", - question__in=question_list, - ) - .distinct("question_id") - .count() - ) - excluded_ids = [uid for uid, count in scored_question_counts.items() if count < 100] - users = users.exclude(id__in=excluded_ids) - ############### - print(f"Filtered {c} users down to {users.count()}.") + # ############### + # # make sure they have at least 100 resolved questions + # print("initialize list") + # question_list = list(questions) + # print("Filtering users.") + # scored_question_counts: dict[int, int] = defaultdict(int) + # c = users.count() + # i = 0 + # for user in users: + # i += 1 + # print(i, "/", c, end="\r") + # scored_question_counts[user.id] = ( + # Score.objects.filter( + # user=user, + # score_type="peer", + # question__in=question_list, + # ) + # .distinct("question_id") + # .count() + # ) + # excluded_ids = [uid for uid, count in scored_question_counts.items() if count < 100] + # users = users.exclude(id__in=excluded_ids) + # print(f"Filtered {c} users down to {users.count()}.") + # ############### print("Initializing... 
DONE") # Gather head to head scores @@ -726,86 +726,86 @@ def run_update_global_bot_leaderboard() -> None: users, questions ) - # for pro aggregation, community aggregate, and any non-metac bot, - # duplicate rows indicating year-specific achievements - user_map = {user.id: user for user in users} - user_map["Pro Aggregate"] = "Pro Aggregate" - user_map["Community Aggregate"] = "Community Aggregate" - new_rows = [] - for user1_id, user2_id, question_id, score, coverage, timestamp in zip( - user1_ids, user2_ids, question_ids, scores, coverages, timestamps - ): - user1 = user_map[user1_id] - if isinstance(user1, User): - if ( - not (user1.metadata or dict()) - .get("bot_details", dict()) - .get("metac_bot") - ): - # non-metac bot - time = datetime.fromtimestamp(timestamp, dt_timezone.utc) - new_rows.append( - ( - f"{user1.username} {time.year}", - user2_id, - question_id, - score, - coverage, - timestamp, - ) - ) - else: - # aggregation methods - time = datetime.fromtimestamp(timestamp, dt_timezone.utc) - new_rows.append( - ( - f"{user1} {time.year}", - user2_id, - question_id, - score, - coverage, - timestamp, - ) - ) - user2 = user_map[user2_id] - if isinstance(user2, User): - if ( - not (user2.metadata or dict()) - .get("bot_details", dict()) - .get("metac_bot") - ): - # non-metac bot - time = datetime.fromtimestamp(timestamp, dt_timezone.utc) - new_rows.append( - ( - user1_id, - f"{user2.username} {time.year}", - question_id, - -score, - coverage, - timestamp, - ) - ) - else: - # aggregation methods - time = datetime.fromtimestamp(timestamp, dt_timezone.utc) - new_rows.append( - ( - user1_id, - f"{user2} {time.year}", - question_id, - -score, - coverage, - timestamp, - ) - ) - for user1_id, user2_id, question_id, score, coverage, timestamp in new_rows: - user1_ids.append(user1_id) - user2_ids.append(user2_id) - question_ids.append(question_id) - scores.append(score) - coverages.append(coverage) - timestamps.append(timestamp) + # # for pro aggregation, community aggregate, and any non-metac bot, + # # duplicate rows indicating year-specific achievements + # user_map = {user.id: user for user in users} + # user_map["Pro Aggregate"] = "Pro Aggregate" + # user_map["Community Aggregate"] = "Community Aggregate" + # new_rows = [] + # for user1_id, user2_id, question_id, score, coverage, timestamp in zip( + # user1_ids, user2_ids, question_ids, scores, coverages, timestamps + # ): + # user1 = user_map[user1_id] + # if isinstance(user1, User): + # if ( + # not (user1.metadata or dict()) + # .get("bot_details", dict()) + # .get("metac_bot") + # ): + # # non-metac bot + # time = datetime.fromtimestamp(timestamp, dt_timezone.utc) + # new_rows.append( + # ( + # f"{user1.username} {time.year}", + # user2_id, + # question_id, + # score, + # coverage, + # timestamp, + # ) + # ) + # else: + # # aggregation methods + # time = datetime.fromtimestamp(timestamp, dt_timezone.utc) + # new_rows.append( + # ( + # f"{user1} {time.year}", + # user2_id, + # question_id, + # score, + # coverage, + # timestamp, + # ) + # ) + # user2 = user_map[user2_id] + # if isinstance(user2, User): + # if ( + # not (user2.metadata or dict()) + # .get("bot_details", dict()) + # .get("metac_bot") + # ): + # # non-metac bot + # time = datetime.fromtimestamp(timestamp, dt_timezone.utc) + # new_rows.append( + # ( + # user1_id, + # f"{user2.username} {time.year}", + # question_id, + # -score, + # coverage, + # timestamp, + # ) + # ) + # else: + # # aggregation methods + # time = datetime.fromtimestamp(timestamp, dt_timezone.utc) + # 
new_rows.append( + # ( + # user1_id, + # f"{user2} {time.year}", + # question_id, + # -score, + # coverage, + # timestamp, + # ) + # ) + # for user1_id, user2_id, question_id, score, coverage, timestamp in new_rows: + # user1_ids.append(user1_id) + # user2_ids.append(user2_id) + # question_ids.append(question_id) + # scores.append(score) + # coverages.append(coverage) + # timestamps.append(timestamp) # choose baseline player if not already chosen if not baseline_player: diff --git a/utils/the_math/aggregations.py b/utils/the_math/aggregations.py index 4d920961e7..afc0c835c0 100644 --- a/utils/the_math/aggregations.py +++ b/utils/the_math/aggregations.py @@ -907,7 +907,7 @@ def minimize_history( def get_user_forecast_history( forecasts: Sequence[Forecast], - minimize: bool = False, + minimize: bool | int = False, cutoff: datetime | None = None, ) -> list[ForecastSet]: timestep_set: set[datetime] = set() @@ -919,7 +919,10 @@ def get_user_forecast_history( timestep_set.add(forecast.end_time) timesteps = sorted(timestep_set) if minimize: - timesteps = minimize_history(timesteps) + if isinstance(minimize, int): + timesteps = minimize_history(timesteps, minimize) + else: + timesteps = minimize_history(timesteps) forecast_sets: dict[datetime, ForecastSet] = { timestep: ForecastSet( forecasts_values=[], @@ -951,7 +954,7 @@ def get_aggregation_history( aggregation_methods: list[AggregationMethod], forecasts: QuerySet[Forecast] | None = None, only_include_user_ids: list[int] | set[int] | None = None, - minimize: bool = True, + minimize: bool | int = True, include_stats: bool = True, include_bots: bool = False, histogram: bool | None = None, From b7714cb0847e9e2a46fb193ae02402b7b90f5cde Mon Sep 17 00:00:00 2001 From: lsabor Date: Wed, 4 Feb 2026 11:20:15 -0800 Subject: [PATCH 07/10] save work --- .../commands/update_global_bot_leaderboard.py | 83 +++++++++++++++++-- 1 file changed, 74 insertions(+), 9 deletions(-) diff --git a/scoring/management/commands/update_global_bot_leaderboard.py b/scoring/management/commands/update_global_bot_leaderboard.py index 3dc61b3700..81120ee66b 100644 --- a/scoring/management/commands/update_global_bot_leaderboard.py +++ b/scoring/management/commands/update_global_bot_leaderboard.py @@ -105,7 +105,7 @@ def get_score_pair( def gather_data( users: QuerySet[User], questions: QuerySet[Question], - cache: bool = True, + cache_use: str | None = "partial", ) -> tuple[ list[int | str], list[int | str], list[int], list[float], list[float], list[float] ]: @@ -115,7 +115,31 @@ def gather_data( scores: list[float] = [] coverages: list[float] = [] timestamps: list[float] = [] - if cache: + if cache_use == "full": + # load all from cache, don't calculate more + def _deserialize_user(value: str) -> int | str: + value = value.strip() + if not value: + return value + try: + return int(value) + except ValueError: + return value + + csv_path = Path("HtH_score_data.csv") + with csv_path.open("r") as input_file: + reader = csv.DictReader(input_file) + for row in reader: + user1_ids.append(_deserialize_user(row["user1"])) + user2_ids.append(_deserialize_user(row["user2"])) + question_ids.append(int(row["questionid"])) + scores.append(float(row["score"])) + coverages.append(float(row["coverage"])) + timestamps.append(float(row["timestamp"])) + + return (user1_ids, user2_ids, question_ids, scores, coverages, timestamps) + + if cache_use == "partial": csv_path = Path("HtH_score_data.csv") if not csv_path.exists(): with csv_path.open("w") as output_file: @@ -171,7 +195,7 @@ def gather_data( # 
user_ids = users.values_list("id", flat=True) t0 = datetime.now() - question_count = len(questions) + question_count = questions.count() questions = list(questions) cache_interval = 100 print("Processing Pairwise Scoring:") @@ -294,7 +318,7 @@ def gather_data( scores.append(u1s) coverages.append(cov) timestamps.append(question.actual_close_time.timestamp()) - if cache and question_number % cache_interval == 0: + if cache_use and question_number % cache_interval == 0: print(f"\nCaching {len(user1_ids)} matches...") with csv_path.open("a") as output_file: writer = csv.writer(output_file) @@ -310,7 +334,7 @@ def gather_data( timestamps = [] print("\n") - if cache: + if cache_use: with csv_path.open("a") as output_file: writer = csv.writer(output_file) for row in zip( @@ -652,7 +676,9 @@ def bootstrap_skills( return ci_lower, ci_upper -def run_update_global_bot_leaderboard() -> None: +def run_update_global_bot_leaderboard( + cache_use: str = "partial", +) -> None: baseline_player: int | str = 236038 # metac-gpt-4o+asknews bootstrap_iterations = 30 @@ -692,8 +718,8 @@ def run_update_global_bot_leaderboard() -> None: "user_forecasts", queryset=Forecast.objects.filter(author__in=users) ) ) - .order_by("?") .distinct("id") + # .order_by("?") ) # ############### # # make sure they have at least 100 resolved questions @@ -723,7 +749,7 @@ def run_update_global_bot_leaderboard() -> None: # Gather head to head scores user1_ids, user2_ids, question_ids, scores, coverages, timestamps = gather_data( - users, questions + users, questions, cache_use=cache_use ) # # for pro aggregation, community aggregate, and any non-metac bot, @@ -807,6 +833,45 @@ def run_update_global_bot_leaderboard() -> None: # coverages.append(coverage) # timestamps.append(timestamp) + # ############### + # Filter out entries we don't care about + print(f"Filtering {len(timestamps)} matches down to only relevant identities ...") + relevant_identities = set( + User.objects.filter( + metadata__bot_details__metac_bot=True, + metadata__bot_details__include_in_calculations=True, + metadata__bot_details__display_in_leaderboard=True, + is_active=True, + ).values_list("id", flat=True) + ) | { + "Pro Aggregate", + "Community Aggregate", + } + filtered_user1_ids = [] + filtered_user2_ids = [] + filtered_question_ids = [] + filtered_scores = [] + filtered_coverages = [] + filtered_timestamps = [] + for u1id, u2id, qid, score, coverage, timestamp in zip( + user1_ids, user2_ids, question_ids, scores, coverages, timestamps + ): + if u1id in relevant_identities and u2id in relevant_identities: + filtered_user1_ids.append(u1id) + filtered_user2_ids.append(u2id) + filtered_question_ids.append(qid) + filtered_scores.append(score) + filtered_coverages.append(coverage) + filtered_timestamps.append(timestamp) + user1_ids = filtered_user1_ids + user2_ids = filtered_user2_ids + question_ids = filtered_question_ids + scores = filtered_scores + coverages = filtered_coverages + timestamps = filtered_timestamps + print(f"Filtered down to {len(timestamps)} matches.\n") + # ############### + # choose baseline player if not already chosen if not baseline_player: baseline_player = max( @@ -1034,4 +1099,4 @@ class Command(BaseCommand): """ def handle(self, *args, **options) -> None: - run_update_global_bot_leaderboard() + run_update_global_bot_leaderboard(cache_use="full") From 64d2d7bf4d513ab9078214739168aa1458dbf87f Mon Sep 17 00:00:00 2001 From: lsabor Date: Thu, 5 Feb 2026 07:31:18 -0800 Subject: [PATCH 08/10] save work --- 
.../commands/update_global_bot_leaderboard.py | 106 ++++++++++-------- 1 file changed, 57 insertions(+), 49 deletions(-) diff --git a/scoring/management/commands/update_global_bot_leaderboard.py b/scoring/management/commands/update_global_bot_leaderboard.py index 81120ee66b..64ba387813 100644 --- a/scoring/management/commands/update_global_bot_leaderboard.py +++ b/scoring/management/commands/update_global_bot_leaderboard.py @@ -44,6 +44,7 @@ def get_score_pair( geometric_means = get_geometric_means(forecasts) if question.default_score_type == ScoreTypes.PEER: + breakpoint() # Coverage coverage = 0.0 cvs = [] @@ -79,10 +80,12 @@ def get_score_pair( if gm.timestamp <= spot_forecast_timestamp <= current_timestamp: if gm.num_forecasters == 2: # both have a forecast at spot scoring time - coverage = 1 / 3 # downweight spot score questions + coverage = 1.0 + # coverage = 1 / 3 # downweight spot score questions break current_timestamp = gm.timestamp if coverage == 0: + breakpoint() return None user1_scores = evaluate_forecasts_peer_spot_forecast( forecasts=user1_forecasts, # only evaluate user1 (user2 is opposite) @@ -225,30 +228,8 @@ def _deserialize_user(value: str) -> int | str: defaultdict(list) ) # bot forecasts - old_bot_ids: set[int] = set() - for user in users: - base_models = ( - (user.metadata or dict()) - .get("bot_details", dict()) - .get("base_models", []) - ) - # don't include bots on question that resolved 1 year or more - # after model release - primary_base_model = None if not base_models else base_models[0] - if not primary_base_model: - continue - if release_date := primary_base_model.get("model_release_date"): - if len(release_date) == 7: - release_date += "-01" - release = datetime.fromisoformat(release_date).replace( - tzinfo=dt_timezone.utc - ) - if question.resolution_set_time > release + timedelta(days=365): - old_bot_ids.add(user.id) - bot_forecasts = ( - question.user_forecasts.filter(author_id__in=user_ids) - .exclude(author_id__in=old_bot_ids) - .order_by("start_time") + bot_forecasts = question.user_forecasts.filter(author_id__in=user_ids).order_by( + "start_time" ) for f in bot_forecasts: forecast_dict[f.author_id].append(f) @@ -689,7 +670,6 @@ def run_update_global_bot_leaderboard( # metadata__bot_details__metac_bot=True, # metadata__bot_details__include_in_calculations=True, # metadata__bot_details__display_in_leaderboard=True, - is_active=True, ).order_by("id") user_forecast_exists = Forecast.objects.filter( question_id=OuterRef("pk"), author__in=users @@ -836,14 +816,13 @@ def run_update_global_bot_leaderboard( # ############### # Filter out entries we don't care about print(f"Filtering {len(timestamps)} matches down to only relevant identities ...") - relevant_identities = set( - User.objects.filter( - metadata__bot_details__metac_bot=True, - metadata__bot_details__include_in_calculations=True, - metadata__bot_details__display_in_leaderboard=True, - is_active=True, - ).values_list("id", flat=True) - ) | { + metac_bots = User.objects.filter( + metadata__bot_details__metac_bot=True, + # metadata__bot_details__include_in_calculations=True, # TODO: this should be + # but we don't have that data correct at the moment + ) + user_map = {user.id: user for user in metac_bots} + relevant_identities = set(metac_bots.values_list("id", flat=True)) | { "Pro Aggregate", "Community Aggregate", } @@ -856,6 +835,31 @@ def run_update_global_bot_leaderboard( for u1id, u2id, qid, score, coverage, timestamp in zip( user1_ids, user2_ids, question_ids, scores, coverages, timestamps ): 
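+        # NOTE: `timestamp` comes from the cached gather_data rows (written from
+        # question.actual_close_time); cache rows written before that change may
+        # still hold actual_resolve_time values.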
From ff1ce67b696e491d62c02f2d3a325b3de5ac0cf0 Mon Sep 17 00:00:00 2001
From: lsabor
Date: Thu, 5 Feb 2026 13:20:58 -0800
Subject: [PATCH 09/10] save work

---
 .../commands/update_global_bot_leaderboard.py | 95 +++++++++++--------
 1 file changed, 54 insertions(+), 41 deletions(-)

diff --git a/scoring/management/commands/update_global_bot_leaderboard.py b/scoring/management/commands/update_global_bot_leaderboard.py
index 64ba387813..2667e07ae7 100644
--- a/scoring/management/commands/update_global_bot_leaderboard.py
+++ b/scoring/management/commands/update_global_bot_leaderboard.py
@@ -44,7 +44,6 @@ def get_score_pair(
     geometric_means = get_geometric_means(forecasts)

     if question.default_score_type == ScoreTypes.PEER:
-        breakpoint()
         # Coverage
         coverage = 0.0
         cvs = []
@@ -80,8 +79,7 @@ def get_score_pair(
             if gm.timestamp <= spot_forecast_timestamp <= current_timestamp:
                 if gm.num_forecasters == 2:
                     # both have a forecast at spot scoring time
-                    coverage = 1.0
-                    # coverage = 1 / 3  # downweight spot score questions
+                    coverage = 1 / 3  # downweight spot score questions
                     break
             current_timestamp = gm.timestamp
         if coverage == 0:
@@ -699,32 +697,7 @@ def run_update_global_bot_leaderboard(
            )
        )
        .distinct("id")
-        # .order_by("?")
    )
-    # ###############
-    # # make sure they have at least 100 resolved questions
-    # print("initialize list")
-    # question_list = list(questions)
-    # print("Filtering users.")
-    # scored_question_counts: dict[int, int] = defaultdict(int)
-    # c = users.count()
-    # i = 0
-    # for user in users:
-    #     i += 1
-    #     print(i, "/", c, end="\r")
-    #     scored_question_counts[user.id] = (
-    #         Score.objects.filter(
-    #             user=user,
-    #             score_type="peer",
-    #             question__in=question_list,
-    #         )
-    #         .distinct("question_id")
-    #         .count()
-    #     )
-    # excluded_ids = [uid for uid, count in scored_question_counts.items() if count < 100]
-    # users = users.exclude(id__in=excluded_ids)
-    # print(f"Filtered {c} users down to {users.count()}.")
-    # ###############
    print("Initializing... DONE")

    # Gather head to head scores
@@ -815,14 +788,48 @@ def run_update_global_bot_leaderboard(

    # ###############
    # Filter out entries we don't care about
+    # and map some users to other users
+    userid_mapping = {
+        189585: 236038,  # mf-bot-1 -> metac-gpt-4o+asknews
+        189588: 236041,  # mf-bot-3 -> metac-claude-3-5-sonnet-20240620+asknews
+        208405: 240416,  # mf-bot-4 -> metac-o1-preview
+        221727: 236040,  # mf-bot-5 -> metac-claude-3-5-sonnet-latest+asknews
+    }
    print(f"Filtering {len(timestamps)} matches down to only relevant identities ...")
-    metac_bots = User.objects.filter(
+    relevant_users = User.objects.filter(
        metadata__bot_details__metac_bot=True,
        # metadata__bot_details__include_in_calculations=True,  # TODO: this should be enabled,
        # but we don't have that data correct at the moment
    )
+    ###############
+    # make sure they have at least 'minimum_resolved_questions' resolved questions
+    print("Filtering users.")
+    minimum_resolved_questions = 100
+    scored_question_counts: dict[int, int] = defaultdict(int)
+    c = relevant_users.count()
+    i = 0
+    for user in relevant_users:
+        i += 1
+        print(i, "/", c, end="\r")
+        scored_question_counts[user.id] = (
+            Score.objects.filter(
+                user=user,
+                score_type="peer",
+                question__in=questions,
+            )
+            .distinct("question_id")
+            .count()
+        )
+    excluded_ids = [
+        uid
+        for uid, count in scored_question_counts.items()
+        if count < minimum_resolved_questions
+    ]
+    relevant_users = relevant_users.exclude(id__in=excluded_ids)
+    print(f"Filtered {c} users down to {relevant_users.count()}.")
+    ###############
-    user_map = {user.id: user for user in metac_bots}
-    relevant_identities = set(metac_bots.values_list("id", flat=True)) | {
+    user_map = {user.id: user for user in relevant_users}
+    relevant_identities = set(relevant_users.values_list("id", flat=True)) | {
        "Pro Aggregate",
        "Community Aggregate",
    }
@@ -835,8 +842,13 @@ def run_update_global_bot_leaderboard(
    for u1id, u2id, qid, score, coverage, timestamp in zip(
        user1_ids, user2_ids, question_ids, scores, coverages, timestamps
    ):
-        # skip if either user is not in relevant identities, or if their model
-        # is more than a year old at the question's actual close time
+        # replace user IDs according to the mapping
+        u1id = userid_mapping.get(u1id, u1id)
+        u2id = userid_mapping.get(u2id, u2id)
+        # skip if either user is not in relevant identities
+        if (u1id not in relevant_identities) or (u2id not in relevant_identities):
+            continue
+        # skip if either user's model is more than a year old at the time of 'timestamp'
        match_users = [user_map[u] for u in (u1id, u2id) if (u in user_map)]
        skip = False
        for user in match_users:
@@ -860,13 +872,14 @@ def run_update_global_bot_leaderboard(
        if skip:
            breakpoint()
            continue
-        if u1id in relevant_identities and u2id in relevant_identities:
-            filtered_user1_ids.append(u1id)
-            filtered_user2_ids.append(u2id)
-            filtered_question_ids.append(qid)
-            filtered_scores.append(score)
-            filtered_coverages.append(coverage)
-            filtered_timestamps.append(timestamp)
+
+        # passed all filters: keep the match
+        filtered_user1_ids.append(u1id)
+        filtered_user2_ids.append(u2id)
+        filtered_question_ids.append(qid)
+        filtered_scores.append(score)
+        filtered_coverages.append(coverage)
+        filtered_timestamps.append(timestamp)
    user1_ids = filtered_user1_ids
    user2_ids = filtered_user2_ids
    question_ids = filtered_question_ids
@@ -1107,4 +1120,4 @@ class Command(BaseCommand):
    """

    def handle(self, *args, **options) -> None:
-        run_update_global_bot_leaderboard(cache_use="full")
+        run_update_global_bot_leaderboard(cache_use="partial")
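The core change to the match loop in the patch above is two-stage: legacy bot IDs are first aliased to their canonical successors via userid_mapping, and only matches where both post-aliasing sides are relevant identities survive. Below is a minimal sketch of that logic over the same tuple layout; the 189585 -> 236038 pair comes from the patch, while the other IDs in the usage example are made up for illustration.

# Sketch of patch 09's alias-then-filter pass over head-to-head match tuples.
USERID_MAPPING = {189585: 236038}  # mf-bot-1 -> metac-gpt-4o+asknews (from the patch)


def filter_matches(matches, relevant_identities, userid_mapping=USERID_MAPPING):
    """matches: iterable of (u1id, u2id, qid, score, coverage, timestamp)."""
    kept = []
    for u1id, u2id, *rest in matches:
        u1id = userid_mapping.get(u1id, u1id)  # replace IDs per the mapping
        u2id = userid_mapping.get(u2id, u2id)
        # keep the match only when both aliased sides are relevant
        if u1id in relevant_identities and u2id in relevant_identities:
            kept.append((u1id, u2id, *rest))
    return kept


matches = [(189585, 999, 1, 10.0, 1.0, 0.0), (5, 6, 2, -3.0, 0.5, 0.0)]
# only the first match survives: both sides resolve to relevant identities
print(filter_matches(matches, relevant_identities={236038, 999}))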
From a0c679e5438375ba525532a0418e4c21c9e03cd3 Mon Sep 17 00:00:00 2001
From: lsabor
Date: Sat, 7 Feb 2026 10:44:55 -0800
Subject: [PATCH 10/10] save work

---
 scoring/management/commands/update_global_bot_leaderboard.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scoring/management/commands/update_global_bot_leaderboard.py b/scoring/management/commands/update_global_bot_leaderboard.py
index 2667e07ae7..85b95d249c 100644
--- a/scoring/management/commands/update_global_bot_leaderboard.py
+++ b/scoring/management/commands/update_global_bot_leaderboard.py
@@ -57,6 +57,8 @@ def get_score_pair(
             cvs.append(max(0, (end - start)) / total_duration)
             current_timestamp = gm.timestamp
         if coverage == 0:
+            # investigate!
+            breakpoint()
             return None
         user1_scores = evaluate_forecasts_peer_accuracy(
             forecasts=user1_forecasts,  # only evaluate user1 (user2 is opposite)
@@ -83,7 +85,6 @@ def get_score_pair(
                     break
             current_timestamp = gm.timestamp
         if coverage == 0:
-            breakpoint()
             return None
         user1_scores = evaluate_forecasts_peer_spot_forecast(
             forecasts=user1_forecasts,  # only evaluate user1 (user2 is opposite)
@@ -164,6 +165,7 @@ def _deserialize_user(value: str) -> int | str:
             32627,  # Q1 2025
             32721,  # Q2 2025
             32813,  # fall 2025
+            32916,  # Q1 2026
         ]
     )
     aib_to_pro_version = {
@@ -172,6 +174,7 @@ def _deserialize_user(value: str) -> int | str:
         32627: 32631,
         32721: 32761,
         32813: None,
+        32916: 32930,
     }
     aib_question_map: dict[Question, Question | None] = dict()
     for aib in aib_projects:
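The final patch registers the Q1 2026 AIB tournament (32916) and pairs it with its Pro counterpart (32930) in aib_to_pro_version. Below is a minimal sketch of how such a lookup table behaves, using only the pairs visible in the diff; the helper name pro_project_for is illustrative, and the real code goes on to build a Question-to-Question map from these project IDs.

# Pairs taken from the aib_to_pro_version table in the diff; None marks an
# AIB tournament with no pro counterpart.
AIB_TO_PRO_VERSION: dict[int, int | None] = {
    32627: 32631,  # Q1 2025
    32721: 32761,  # Q2 2025
    32813: None,   # fall 2025 had no pro tournament
    32916: 32930,  # Q1 2026
}


def pro_project_for(aib_project_id: int) -> int | None:
    # .get() returns None both for an unknown id and for a tournament that
    # genuinely has no pro version; callers treat both as "no counterpart"
    return AIB_TO_PRO_VERSION.get(aib_project_id)


assert pro_project_for(32916) == 32930
assert pro_project_for(32813) is None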