File size: 30,035 Bytes
ac15cf4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1971175
5ef407f
ee1b999
7b52df4
b077021
7b52df4
b077021
7b52df4
 
b077021
7b52df4
 
b077021
7b52df4
 
b077021
7b52df4
b077021
7b52df4
ee1b999
 
7b52df4
ee1b999
1971175
 
 
 
9181029
95d1ab7
 
 
 
9181029
 
 
95d1ab7
 
 
 
 
9181029
 
 
95d1ab7
 
9181029
95d1ab7
1971175
9181029
95d1ab7
 
 
9181029
5ef407f
21516b7
 
 
 
 
b47a3c3
21516b7
 
 
5ef407f
ac15cf4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ef407f
 
 
 
 
 
 
 
 
 
 
0aeab65
 
 
 
 
 
 
 
 
 
 
5ef407f
 
 
 
 
 
 
 
 
 
 
 
ee1b999
 
 
 
5ef407f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee1b999
cd5338b
ee1b999
ac15cf4
 
 
 
 
 
 
 
 
21516b7
3317293
 
 
 
 
 
 
21516b7
 
 
 
 
1971175
 
a0bf2a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee1b999
a0bf2a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd5281a
 
 
11de2f8
dd5281a
1971175
 
 
ee1b999
 
1971175
 
 
ee1b999
 
 
11de2f8
 
 
ee1b999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1971175
 
 
ee1b999
 
 
 
 
 
 
02a4349
ee1b999
1971175
 
 
ee1b999
 
 
1971175
 
 
 
 
 
 
 
 
37bf680
 
 
1971175
 
 
 
 
2972be9
 
1971175
 
 
 
 
 
 
b0bd18e
 
 
 
 
a0bf2a2
b0bd18e
a0bf2a2
1971175
011f79c
 
 
 
b0bd18e
 
 
 
 
1971175
011f79c
b0bd18e
 
 
1971175
011f79c
b0bd18e
1971175
11de2f8
 
 
 
 
 
2972be9
 
 
 
 
 
 
7b52df4
2972be9
 
 
 
 
 
 
 
 
 
 
 
 
 
9832707
ac9171f
 
 
 
 
 
 
 
 
ae05bbd
ac9171f
9832707
ac9171f
 
 
 
 
ae05bbd
ac9171f
 
50fb0f5
ae05bbd
 
50fb0f5
 
 
 
 
 
ac9171f
 
 
 
0d80b6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b48ebfb
 
 
 
 
cd5338b
 
 
37bf680
cd5338b
 
07dc6d3
cd5338b
 
 
37bf680
cd5338b
 
 
37bf680
 
cd5338b
37bf680
cd5338b
 
 
 
 
 
 
9a90879
37bf680
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a90879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac15cf4
 
 
 
49b6cdd
ac15cf4
 
cdccabc
 
ac15cf4
 
 
9c18850
011f79c
ac15cf4
011f79c
9c18850
50fb0f5
b392800
 
21516b7
 
 
 
b392800
 
21516b7
 
 
 
 
 
b392800
 
21516b7
 
 
0aeab65
 
 
 
 
 
 
 
 
 
 
 
 
 
21516b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20c57a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50fb0f5
 
20c57a4
50fb0f5
 
 
 
20c57a4
 
 
 
 
 
50fb0f5
 
 
20c57a4
 
 
 
 
 
50fb0f5
 
 
 
 
 
20c57a4
50fb0f5
 
 
afb9b4c
c917078
 
afb9b4c
 
 
 
 
 
 
 
 
 
 
c917078
afb9b4c
 
 
 
c934393
 
 
 
 
 
1e64d2b
 
 
 
 
 
 
 
268d785
1e64d2b
 
 
 
 
 
 
268d785
1e64d2b
 
 
 
 
 
 
 
 
268d785
1e64d2b
 
268d785
1e64d2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
07dc6d3
28bc4e5
 
 
 
 
 
 
 
268d785
1e64d2b
abb9f0a
07dc6d3
 
 
 
 
 
 
 
 
482c591
abb9f0a
482c591
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c22c48e
 
 
 
7b52df4
 
 
17162c9
ac15cf4
 
 
 
b077021
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17162c9
 
eeb1750
17162c9
 
5015459
 
17162c9
 
 
 
5015459
17162c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eeb1750
17162c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e31d809
 
 
 
 
 
ffbe9ce
 
 
 
 
 
ee1b999
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
import re

def create_gradio_anchor_id(text: str, validation) -> str:
    """
    Build the in-page anchor id for a benchmark's leaderboard heading.

    The heading text is lower-cased, every run of whitespace becomes a single
    hyphen, and any character that is neither a word character nor a hyphen is
    dropped. The result is wrapped as ``h-<slug>-leaderboard``; when
    ``validation`` is truthy a trailing ``-1`` is appended.

    Intended to mirror the ids produced by gr.Markdown(header_links=True).
    NOTE(review): the ``-1`` suffix presumably matches Gradio's de-duplication
    of a repeated heading on the validation view -- confirm against the page.

    Example: "Paper Finder Validation" -> "h-paper-finder-validation-leaderboard"
    """
    slug = re.sub(r'\s+', '-', text.lower())  # whitespace runs -> single hyphen
    slug = re.sub(r'[^\w-]', '', slug)  # keep only word characters and hyphens
    suffix = "-leaderboard-1" if validation else "-leaderboard"
    return f"h-{slug}{suffix}"


TITLE = """<h1 align="left" id="space-title">AstaBench Leaderboard</h1>"""

INTRO_PARAGRAPH = """
<p>
    <strong>AstaBench</strong> provides an aggregated view of agent performance and efficiency across all benchmarks in all four categories. We report:
</p>

<ul class="info-list">
    <li>
        <strong>Overall score:</strong> A macro-average of the four category-level average scores. Each category contributes equally, regardless of how many benchmarks it includes. This ensures fair comparisons across agents with different domain strengths.
    </li>
    <li>
        <strong>Overall cost:</strong> A macro-average of the agent’s cost per problem across all categories, in USD. Each category contributes equally.
    </li>
</ul>

<p>
    This view is designed for quick comparison of general-purpose scientific agents. For more details on how we calculate scores and cost, please see the <a href="/about" style="color: #0FCB8C; text-decoration: underline;">About</a> Page.
</p>
"""
SCATTER_DISCLAIMER = """
**Note:** Agents without cost data are displayed to the right of the vertical divider line.
"""
# User-facing note explaining what a green agent name means.
PARETO_DISCLAIMER = """
Agent names that are green are Pareto optimal, meaning they achieve the best performance for their cost.
"""
LIT_DESCRIPTION = """
The **Literature Understanding** category evaluates how well agents comprehend and interact with scientific literature—testing their ability to find research papers, assess citation quality, extract information from text, and more.
<br><br>
The scores shown below reflect performance aggregated across five distinct benchmarks, each targeting a different aspect of literature-based reasoning. 
<br><br>
For detailed results, use the links above to explore individual benchmarks.
<br>
"""
CODE_EXECUTION_DESCRIPTION = """
The **Code & Execution** category in AstaBench includes tasks that evaluate an agent’s ability to write, modify, and run code in realistic research scenarios. Unlike literature tasks—which only require read-only tools and can sometimes even be solved by a language model alone—these problems often require the agent to manipulate a machine environment with tools: reading input files, executing code, and writing outputs to specific files in the required format.
<br><br>
The scores in this category are aggregated from three distinct benchmarks, each targeting different facets of scientific coding and execution. Together, these benchmarks evaluate whether an agent can function as a hands-on scientific assistant—not just by reasoning about code, but by running it in real-world contexts.
<br><br>
For detailed results, use the links above to explore individual benchmark pages.
<br>
"""
DATA_ANALYSIS_DESCRIPTION = """
The **Data Analysis** category evaluates agents on their ability to analyze structured datasets and generate meaningful scientific hypotheses. It currently includes a single benchmark, DiscoveryBench, so the category-level scores are the same as the benchmark-level results.
<br><br>
As additional benchmarks are added in the future, this category will expand to cover a broader range of data-driven reasoning tasks across scientific domains.
<br>
"""
DISCOVERY_DESCRIPTION = """
The **End-to-End Discovery** category tests whether agents can carry out a complete scientific workflow, from task description to experiment design, code execution, results  analysis, and report writing. These tasks require agents to integrate multiple capabilities, producing not just answers but full research artifacts.
<br><br>
Scores in this category are aggregated from two benchmarks, providing the first standardized way to evaluate automated scientific discovery (ASD) agents across all stages of the research process. Use the links above to explore individual benchmark pages.
<br>
"""
SUBMISSION_CONFIRMATION = """
**Your agent has been submitted to AstaBench for evaluation.**
<br><br>
🙏 Thanks for contributing!
<br><br>
You'll receive a confirmation email from our team within 2 business days with next steps. We will reach out to you directly if further information is needed.
<br><br>
We appreciate your support in advancing scientific AI.
"""

# External URLs for benchmark descriptions.
# All of these are Semantic Scholar paper pages except DS1000_URL, which is an
# arXiv abstract page. In this file the Semantic Scholar URLs are passed to
# external_link(..., is_s2_url=True) so a utm_source parameter is appended;
# DS1000_URL is linked as-is.
SCHOLAR_QA_CS_URL = "https://www.semanticscholar.org/paper/OpenScholar%3A-Synthesizing-Scientific-Literature-LMs-Asai-He/b40df4b273f255b3cb5639e220c8ab7b1bdb313e"
LITQA2_URL = "https://www.semanticscholar.org/paper/Language-agents-achieve-superhuman-synthesis-of-Skarlinski-Cox/fa5f9aa1cb6f97654ca8e6d279ceee1427a87e68"
ARXIV_DIGESTABLES_URL = "https://www.semanticscholar.org/paper/ArxivDIGESTables%3A-Synthesizing-Scientific-into-Newman-Lee/c7face35e84f2cb04fb1600d54298799aa0ed189"
SUPER_URL = "https://www.semanticscholar.org/paper/SUPER%3A-Evaluating-Agents-on-Setting-Up-and-Tasks-Bogin-Yang/053ef8299988680d47df36224bfccffc817472f1"
CORE_BENCH_URL = "https://www.semanticscholar.org/paper/CORE-Bench%3A-Fostering-the-Credibility-of-Published-Siegel-Kapoor/4c913d59d150fe7581386b87dfd9f90448a9adee"
DS1000_URL = "https://arxiv.org/abs/2211.11501"  # arXiv, not Semantic Scholar
DISCOVERY_BENCH_URL = "https://www.semanticscholar.org/paper/DiscoveryBench%3A-Towards-Data-Driven-Discovery-with-Majumder-Surana/48c83799530dc523ee01e6c1c40ad577d5c10a16"

# Helper function to create external links
def external_link(url, text, is_s2_url=False):
    """
    Return an HTML anchor that opens ``url`` in a new tab.

    Args:
        url: Target address, inserted verbatim into the ``href`` attribute.
        text: Link label, inserted verbatim (no HTML escaping is applied).
        is_s2_url: When true, a ``utm_source=asta_leaderboard`` tracking
            parameter is appended to ``url``.

    Returns:
        The ``<a ...>...</a>`` markup as a string.
    """
    if is_s2_url:
        # Use '&' when the URL already carries a query string; a second '?'
        # would make the tracking parameter part of the previous value.
        joiner = "&" if "?" in url else "?"
        url = f"{url}{joiner}utm_source=asta_leaderboard"
    return f"<a href='{url}' target='_blank' rel='noopener noreferrer'>{text}</a>"

def internal_leaderboard_link(text, validation):
    """
    Return an HTML anchor that jumps to the leaderboard heading for ``text``.

    The fragment id is computed by create_gradio_anchor_id(text, validation);
    ``text`` itself is used, unescaped, as the visible label.
    """
    return "<a href='#{}'>{}</a>".format(create_gradio_anchor_id(text, validation), text)

# Function to get benchmark descriptions with validation flag
def get_benchmark_description(benchmark_name, validation) -> str:
    """
    Return the HTML description text for a benchmark.

    Args:
        benchmark_name: Key identifying the benchmark (e.g. 'DS-1000').
        validation: Forwarded to internal_leaderboard_link so that in-page
            links use the anchor-id variant built for that flag.

    Returns:
        The description string for ``benchmark_name``, or an empty string when
        the name is not one of the keys below.
    """
    # The mapping is rebuilt on every call because some entries embed links
    # whose anchor ids depend on ``validation``.
    descriptions = {
    'PaperFindingBench': (
        "PaperFindingBench assesses an agent's ability to locate sets of papers based on a natural language "
        "description that may involve both the papers' content and metadata, such as the author or publication year."
    ),
    'LitQA2-FullText-Search': (
        f"A version of {internal_leaderboard_link('LitQA2-FullText', validation)} that isolates the retrieval aspect of the task. "
        f"This benchmark features the same multi-choice questions as {internal_leaderboard_link('LitQA2-FullText', validation)}, but the agent is not evaluated on answering the actual question "
        "but rather on providing a ranked list of papers in which the answer is likely to be found."
    ),
    'ScholarQA-CS2': (
        "ScholarQA-CS2 assesses long-form model responses to literature review questions in the domain of computer science. "
        "Answers are expected to be comprehensive reports, such as those produced by deep research systems. "
        f"This benchmark advances on the previously released {external_link(SCHOLAR_QA_CS_URL, 'ScholarQA-CS', is_s2_url=True)} "
        "by using queries from real-world usage, and introducing new evaluation methods for coverage and precision "
        "of both the report text and its citations."
    ),
    'LitQA2-FullText': (
        f"{external_link(LITQA2_URL, 'LitQA2', is_s2_url=True)}, a benchmark introduced by FutureHouse, gauges a model's ability to answer questions that require document retrieval from the scientific literature. "
        "It consists of multiple-choice questions that necessitate finding a unique paper and analyzing its detailed full text to spot precise information; these questions cannot be answered from a paper’s abstract. "
        "While the original version of the benchmark provided for each question the title of the paper in which the answer can be found, it did not specify the overall collection to search over. In our version, "
        "we search over the index we provide as part of the Asta standard toolset. The “-FullText” suffix indicates we consider only the subset of LitQA2 questions for which "
        "the full-text version of the answering paper is open source and available in our index."
    ),
    'ArxivDIGESTables-Clean': (
        f"{external_link(ARXIV_DIGESTABLES_URL, 'ArxivDIGESTables', is_s2_url=True)} assesses the ability of models to construct literature review tables, i.e., tables whose rows are papers and whose columns constitute a set of "
        "aspects used to compare and contrast the papers. The goal is to construct such tables given a set of related papers and a table caption describing the user's goal. Generated tables are evaluated by "
        "comparing them to actual tables published in ArXiv papers. The “-Clean” suffix indicates a curated subset of ArxivDIGESTables which drops tables that are either trivial or impossible to reconstruct from full-texts."
    ),
    'SUPER-Expert': (
        "SUPER-Expert evaluates the capability of models in setting up and executing tasks from low-resource "
        "research repositories—centralized databases containing research data and related materials. "
        f"The \"-Expert\" split indicates the name of the most challenging split in the {external_link(SUPER_URL, 'original SUPER benchmark', is_s2_url=True)} "
        "that involves solving reproduction tasks from scratch and without any intermediate hints or details "
        "about the important landmarks involved in each task."
    ),
    'CORE-Bench-Hard': (
        "Core-Bench-Hard tests computational reproducibility, a task involving reproducing the results of a study "
        "using provided code and data. It consists of both language-only and vision-language challenges across "
        "multiple difficulty levels. "
        f"The \"-Hard\" split refers to the name of the most challenging split in the original {external_link(CORE_BENCH_URL, 'Core-bench benchmark', is_s2_url=True)} "
        "where only a README file is provided with no instructions or an auxiliary Dockerfile."
    ),
    'DS-1000': (
        "DS-1000 is an established code generation benchmark containing Python data science coding questions "
        "originally sourced from StackOverflow. It's designed to reflect an array of diverse, realistic, and "
        "practical use cases and directly involves many of the Python libraries commonly used in data science "
        f"and machine learning research. We split the original {external_link(DS1000_URL, 'dataset')} "
        "into 100 validation and 900 test problems."
    ),
    'DiscoveryBench': (
        "DiscoveryBench is the first comprehensive benchmark to formalize the multi-step process of data-driven "
        "analysis and discovery (i.e., data loading, transformation, statistical analysis, and modeling). "
        f"Originally introduced {external_link(DISCOVERY_BENCH_URL, 'here', is_s2_url=True)}, it is designed to systematically "
        "evaluate how well current LLMs can replicate or reproduce published scientific findings across diverse "
        "domains, including social science, biology, history, and more."
    ),
    'E2E-Bench': (
        "E2E-Bench is the \"decathlon\" of AI-assisted research. It measures whether a system can run the entire "
        "research pipeline, starting with an initial task description, to designing and performing (software) "
        "experiments, to analyzing and writing up the results."
    ),
    'E2E-Bench-Hard': (
        f"E2E-Bench-Hard is a more challenging variant of {internal_leaderboard_link('E2E-Bench', validation)}. Tasks are generated using the HypER system, "
        "which identifies research trends and proposes new, underexplored problems. Unlike the regular version, "
        "these tasks are not simplified or curated for accessibility; they are reviewed only for feasibility. "
        "This version is intended to test whether systems can handle more complex and less-structured research "
        f"scenarios, following the same end-to-end process as {internal_leaderboard_link('E2E-Bench', validation)}."
    )
    }
    
    # Unknown benchmark names yield an empty description rather than an error.
    return descriptions.get(benchmark_name, "")

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@article{asta-bench,
    title={AstaBench},
    author={AstaBench folks},
    year={2025},
    eprint={TBD.TBD},
    archivePrefix={arXiv},
    primaryClass={cs.AI},
    secondaryClass={cs.CL}
}"""

LEGAL_DISCLAIMER_TEXT = """
<h2>Terms and Conditions</h2>
<p>
    The Allen Institute for Artificial Intelligence (Ai2) maintains this repository for agent evaluation submissions to AstaBench. To keep AstaBench fair and auditable, all evaluation logs and associated submission files will be made publicly available. This includes your benchmark inputs, model output responses, and other data and information related to your submission as needed to verify the results.
</p>
<br>
<p>
    Your submissions to AstaBench will be posted, scored, and ranked on the leaderboard at <a href="https://huggingface.co/spaces/allenai/asta-bench-leaderboard" target="_blank" rel="noopener noreferrer">https://huggingface.co/spaces/allenai/asta-bench-leaderboard</a>. You agree you have the rights to the materials you submit and that you will not share any personal, sensitive, proprietary, or confidential information.
</p>
"""

def _styled_message(color, msg):
    """Wrap ``msg`` in a centered, 20px paragraph drawn in the CSS ``color``.

    ``msg`` is interpolated verbatim; no HTML escaping is performed.
    """
    return f"<p style='color: {color}; font-size: 20px; text-align: center;'>{msg}</p>"


def format_error(msg):
    """Return ``msg`` as a red status paragraph (HTML string)."""
    return _styled_message("red", msg)


def format_warning(msg):
    """Return ``msg`` as an orange status paragraph (HTML string)."""
    return _styled_message("orange", msg)


def format_log(msg):
    """Return ``msg`` as a green status paragraph (HTML string)."""
    return _styled_message("green", msg)


def hyperlink(link_url: str, text: str = "🔗") -> str:
    """
    Return an HTML anchor for ``link_url`` that opens in a new tab.

    Args:
        link_url: Target address. Anything that is not a non-empty string is
            treated as "no link".
        text: Link label; defaults to a link emoji.

    Returns:
        ``<a target="_blank" href="...">text</a>`` when a usable URL is given,
        otherwise just ``str(text)``.
    """
    # Type check first: evaluating truthiness of an arbitrary object (e.g. a
    # missing-value sentinel or an array) can itself raise, and such values
    # should simply fall back to the plain label.
    if not isinstance(link_url, str) or not link_url:
        return str(text)
    return f'<a target="_blank" href="{link_url}">{text}</a>'


def hf_uri_to_web_url(uri: str) -> str:
    """
    Translate a Hugging Face dataset URI into its browsable web address.

    Input form:
        hf://datasets/{namespace}/{repo}/{path...}
    Output form:
        https://huggingface.co/datasets/{namespace}/{repo}/tree/main/{path...}

    Raises:
        ValueError: if ``uri`` lacks the ``hf://datasets/`` prefix, or if
            fewer than three ``/``-separated segments follow it.
    """
    scheme = "hf://datasets/"
    if not uri.startswith(scheme):
        raise ValueError("URI must start with 'hf://datasets/'")

    # maxsplit=2 keeps any further slashes inside the final (path) segment.
    segments = uri[len(scheme):].split("/", 2)
    if len(segments) < 3:
        raise ValueError("Expected format: hf://datasets/{namespace}/{repo}/{path...}")

    return "https://huggingface.co/datasets/{}/{}/tree/main/{}".format(*segments)


css = """
/* CSS Color Variables using Gradio theme */
:root {
    --color-primary-green: var(--primary-900); /* #0FCB8C */
    --color-primary-pink: var(--secondary-900); /* #f0529c */
    --color-neutral-light: var(--neutral-200); /* #C9C9C3 */
    --color-background-light: var(--neutral-50); /* #FAF2E9 */
    --color-background-dark: var(--neutral-900); /* #032629 */
    --color-text-light: var(--neutral-50); /* #FAF2E9 */
}

/* This makes space for the huggingface header bar which must shown on HF spaces. */
/* FIXME Media queries don't seem to survive rendering. */
/* @media (min-width: 768px) { ... } */
gradio-app {
    padding-top: 65px;
}

/* Global Styles */
h2 {
    overflow: hidden;
}

#intro-paragraph {
    font-size: 18px;
    max-width: 90%;
    padding-left: 35px;
    margin-top: 20px;
}

#intro-paragraph p,
#intro-paragraph li {
    font-size: 16px; 
    line-height: 1.8; 
}

#intro-paragraph ul {
    margin-top: 20px;
    margin-bottom: 20px;
}

#diagram-image {
    height: 100%;
}

#diagram-image img {
    width: 100%;
    height: 100%;
    object-fit: cover; 
}
#intro-category-paragraph {
    font-size: 18px;
    max-width: 90%;
    margin-top: 20px;
}

#intro-category-paragraph p,
#intro-category-paragraph li {
    font-size: 16px; 
    line-height: 1.8; 
}

#intro-category-paragraph ul {
    margin-top: 20px;
    margin-bottom: 20px;
}

#about-content {
    font-size: 18px;
    max-width: 60%;
    padding-left: 25px;
}
#category-intro {
    font-size: 18px;
    max-width: 60%;
}
#logo-image { 
    margin: 0;
    margin-bottom: 30px; 
    justify-content: flex-start;        
    max-width: 250px;       
    height: auto;           
}
#page-content-wrapper{
    padding-left: 25px;
}
.table-component{
    height: auto !important;
    max-height: none !important;
}
.table-wrap {
    max-height: none !important;
    height: auto !important;
    overflow-y: visible !important;
}
/* --- New Rules for Table Density --- */
table.gr-table th, table.gr-table td {
    padding: 4px 4px !important; 
    width: 1%;
    white-space: nowrap;
}
table.svelte-1e98i6s td {
    vertical-align: top !important;
}
table.gr-table {
    font-size: 14px !important;
}
.html-container {
    padding-top: 0 !important;
}
#scatter-disclaimer {
        overflow: visible !important;
}
#pareto-disclaimer {
    color: #f0529c !important;
}
thead.svelte-1e98i6s th {
    background: white !important;
}
.dark thead.svelte-1e98i6s th {
    background: #091a1a !important;
}
.cell-wrap.svelte-v1pjjd {
    font-family: 'Manrope';
    }
nav.svelte-ti537g.svelte-ti537g {
    justify-content: flex-start;
}
.nav-holder {
    padding-left: 20px !important;
}
#legend-markdown span {
    margin-right: 15px !important; 
}
#leaderboard-accordion .label-wrap {
    font-size: 1.4rem !important; 
    z-index: 10 !important;
    position: relative !important;
}
.dark #leaderboard-accordion .label-wrap {
    color: #0FCB8C !important; 
}
.dark block.svelte-1svsvh2 {
    background: #032629 !important;
}
.padding.svelte-phx28p {
    padding: 0 !important;
}
.sub-nav-bar-container {
    display: flex !important;
    flex-wrap: wrap !important; 
    align-items: center !important; 
    gap: 10px !important;
}
.dark .primary-link-button {
    color: var(--color-primary-green);
}
.primary-link-button {
    background: none;
    border: none;
    padding: 0;
    margin: 0;
    font-family: inherit;
    font-size: 16px;
    color: var(--color-primary-pink);
    text-decoration: none;
    cursor: pointer;
    white-space: nowrap;
}
.primary-link-button:hover {
    text-decoration: underline;
}
.sub-nav-label {
    font-weight: bold;
    font-size: 16px;
    display: flex;
    align-items: center;
}
.wrap-header-df th span{
    white-space: normal !important;
    word-break: normal !important;
    overflow-wrap: break-word !important;
    line-height: 1.2 !important;
    vertical-align: top !important;
    font-size: 12px !important;
    font-family: 'Manrope';
}
.wrap-header-df th {
    height: auto !important;
}
.wrap-header-df .cell-wrap img {
    width: 16px;
    height: 16px;
    vertical-align: middle;
}
#legend-markdown img {
    width: 16px;
    height: 16px;
    vertical-align: middle;
}
/*------ Global tooltip styles ------*/
.tooltip-icon {
    display: inline-block;
    cursor: help;
    position: relative;
}
.tooltip-icon::after {
    content: attr(data-tooltip);
    position: absolute;
    bottom: 125%;
    background-color: #105257;
    color: #fff;
    padding: 10px;
    border-radius: 4px;
    font-size: 12px;
    opacity: 0;
    transition: opacity 0.2s;
    white-space: pre-line;
    width: max-content;
    text-align: left;
    pointer-events: none;
    max-width: 300px;
    left: 50%;
    transform: translateX(-50%);
    z-index: 1000;
}
@media (max-width: 768px) {
    .tooltip-icon::after {
        max-width: 250px;
    }
}
.tooltip-icon:hover::after {
    opacity: 1;
}
/*------ Openness label tooltip styles ------*/
.styler,
#openness-label-html,
#agent-tooling-label-html {
    overflow: visible !important;
}
/*------ Table cell tooltip styles ------*/
.wrap.default.full,
span.wrap[tabindex="0"][role="button"][data-editable="false"] {
  overflow: visible !important;
}

.cell-tooltip-icon::after {
    height: fit-content;
    top: 125%;
}
/*------ Table column description tooltip styles ------*/
#legend-markdown,
#leaderboard-accordion {
    overflow: visible !important;
}

/* --- inside table tooltips --- */
.native-tooltip-icon {
    cursor: help;
    text-decoration: underline dotted 1px;
}
/* Main Nav bar styling */
.nav-holder nav {
    display: grid !important;
    grid-template-columns: auto auto auto auto auto 1fr auto auto !important;
    gap: 10px 20px !important; /* Vertical and horizontal spacing */
    width: 100% !important;
    align-items: center;
}
.nav-holder nav a[href*="about"] {
    grid-row: 1 !important;
    grid-column: 7 !important;
}
.nav-holder nav a[href*="submit"] {
    grid-row: 1 !important;
    grid-column: 8 !important;
    white-space: nowrap !important;
}
/* Divider line between header and category nav */
.nav-holder nav::after {
    content: ''; /* Required for pseudo-elements to appear */
    background-color: #C9C9C3;
    height: 1px; 
    grid-row: 2 !important;
    grid-column: 1 / -1 !important;
}

/* Horizontal scrolling for navigation */
.nav-holder nav {
    overflow-x: auto;
    scrollbar-width: none;
    -ms-overflow-style: none;
}
.nav-holder nav::-webkit-scrollbar {
    display: none;
}

/* Category navigation buttons in row 3 */
.nav-holder nav a[href*="literature-understanding"],
.nav-holder nav a[href*="code-execution"],
.nav-holder nav a[href*="data-analysis"],
.nav-holder nav a[href*="discovery"] {
    grid-row: 3 !important;
    justify-self: center !important;
    width: fit-content !important;
    white-space: nowrap;
    flex-shrink: 0;
}

.nav-holder nav a[href*="literature-understanding"] { grid-column: 1 !important; }
.nav-holder nav a[href*="code-execution"] { grid-column: 2 !important; }
.nav-holder nav a[href*="data-analysis"] { grid-column: 3 !important; }
.nav-holder nav a[href*="discovery"] { grid-column: 4 !important; }

/* Navigation hover styles */
.nav-holder nav a[href*="about"]:hover,
.nav-holder nav a[href*="submit"]:hover,
.nav-holder nav a[href*="literature-understanding"]:hover,
.nav-holder nav a[href*="code-execution"]:hover,
.nav-holder nav a[href*="data-analysis"]:hover,
.nav-holder nav a[href*="discovery"]:hover {
    background-color: #FDF9F4;
}

.dark .nav-holder nav a[href*="about"]:hover,
.dark .nav-holder nav a[href*="submit"]:hover,
.dark .nav-holder nav a[href*="literature-understanding"]:hover,
.dark .nav-holder nav a[href*="code-execution"]:hover,
.dark .nav-holder nav a[href*="data-analysis"]:hover,
.dark .nav-holder nav a[href*="discovery"]:hover {
    background-color: #1C3A3C;
}
.benchmark-main-subtitle{
    color: var(--color-primary-green);
    overflow: hidden;
    padding-top: 120px;
}
.benchmark-title{
    color: var(--color-primary-pink);
    margin-top: 50px;
        font-size: 20px;
}
.dark .benchmark-title{
    color: var(--color-primary-green);
}
.benchmark-description {
    margin: 20px 0;
    max-width: 800px;
}
/*------ Submission Page CSS ------*/
#submission-modal .modal-container,
#success-modal .modal-container {
    height: auto;
    max-width: 600px;
}

#submission-modal-content,
#success-modal .submission-modal-content { 
    padding: 20px;
    background-color: inherit;
    border-radius: 8px;
    text-align: center;
}

#submission-modal-content p,
#success-modal .submission-modal-content p {
    font-size: 16px;
}

#legal-modal-content {
    padding: 30px;
    background-color: inherit;
    border-radius: 8px;
    text-align: left;
    font-size: 14px;
}

#legal-modal-content h2 {
    text-align: center;
}
#legal-modal-content button {
    width: fit-content;
}
.spinner-container {
    display: flex;
    flex-direction: column;
    align-items: center;
    justify-content: center;
    padding: 30px;
}
    
.spinner {
    width: 50px;
    height: 50px;
    border: 5px solid #dee2e6;
    border-top: 5px solid #007bff;
    border-radius: 50%;
    animation: spin 1s linear infinite;
    margin-bottom: 20px;
}
    
@keyframes spin {
    0% { transform: rotate(0deg); }
    100% { transform: rotate(360deg); }
}
    
#submission-page-container {
    max-width: 800px;
    margin: 0 auto;
}

#submission-file-label {
    padding: 10px;
}

#submission-button {
    max-width: fit-content;
    font-size: 14px;
}

.custom-form-group {
    border: 1px solid #000 !important; 
    border-radius: 4px !important;
    padding: 24px !important;
    overflow: visible !important;    
}

#openness-label-html,
#agent-tooling-label-html,
#agent-info-label-html,
#submitter-info-label-html,
#username-label-html,
#email-label-html,
#role-label-html  {
    padding-left: 12px;
}

.form-label {
    margin: 4px 0px 0px 6px;
}

.form-label-fieldset {
    padding-top: 10px !important;
}

#agent-tooling-label-html {
    padding-top: 6px;
}

.custom-form-group,
.styler {
    background: none;
}

/* Feedback button: borderless, rounded, white text on a teal background. */
#feedback-button {
    display: inline-block;
    padding: 15px 20px;
    border: none;
    border-radius: 4px;
    background-color: #345d60;
    color: white;
    font-size: 16px;
    text-decoration: none;
    cursor: pointer;
    /* Animates every property that changes on hover (background, shadow, transform). */
    transition: all 0.3s ease;
}

/* Hover: lighter background, 2px upward shift, and a drop shadow. */
#feedback-button:hover {
    background-color: #5d888b;
    box-shadow: 0 6px 12px rgba(0, 0, 0, 0.3);
    transform: translateY(-2px);
}
/* Default heading color inside #main-header. */
#main-header h2 {
    color: #f0529c;
}
/* Override under a .dark ancestor. The extra class selector gives this rule
   higher specificity, so it wins regardless of source order. */
.dark #main-header h2 {
    color: #0fcb8c;
}

/* --- HTML-based tooltip styles --- */

/* Element that triggers a tooltip: shows the help cursor. */
.tooltip-icon-legend {
    position: relative;
    cursor: help;
    display: inline-block;
}

/* Pop-up tooltip card. Hidden by default; adding the `visible` class reveals it. */
.tooltip-card {
    /* Hiding mechanism */
    opacity: 0;
    visibility: hidden;
    /* Transition visibility together with opacity. With opacity alone, removing
       `visible` flips visibility to hidden immediately and the fade-out never shows.
       While transitioning, visibility stays `visible` until the transition ends,
       so the card now fades both in and out. */
    transition: opacity 0.2s, visibility 0.2s;
    /* The card never intercepts mouse events. */
    pointer-events: none;
    /* Card appearance */
    position: fixed; /* coordinates are presumably set by script - confirm */
    z-index: 1000;
    background-color: #083c40;
    color: #e5e7eb;
    border-radius: 12px;
    padding: 15px;
    width: max-content;
    max-width: 400px;
    text-align: left;
}
/* Shown state. */
.tooltip-card.visible {
    opacity: 1;
    visibility: visible;
}
/* Card title. */
.tooltip-card h3 {
    margin-top: 0;
    margin-bottom: 12px;
    font-size: 18px;
    color: #fff;
}
/* Description text under the title. */
.tooltip-card .tooltip-description {
    line-height: 1.3;
    margin-bottom: 20px;
}
/* Vertical stack of legend items. */
.tooltip-card .tooltip-items-container {
    display: flex;
    flex-direction: column;
    gap: 10px;
}
/* A single legend row; children are aligned to the top of the row. */
.tooltip-card .tooltip-legend-item {
    display: flex;
    align-items: flex-start;
    gap: 10px;
}
/* 20px image inside a legend row, shifted down 2px. */
.tooltip-card .tooltip-legend-item img {
    margin-top: 2px;
    width: 20px;
    height: 20px;
}
/* Text column inside a legend row. */
.tooltip-card .tooltip-legend-item div {
    display: flex;
    flex-direction: column;
}
.tooltip-card .tooltip-legend-item strong {
    color: #fff;
    font-weight: 600;
}
.tooltip-card .tooltip-legend-item span {
    line-height: 1.3;
    font-size: 13px;
}
/* Nested list with a custom bullet marker. */
.tooltip-sub-list {
    display: flex;
    flex-direction: column;
    padding-left: 18px;
    list-style-type: '• ';
    line-height: 1.3;
    font-size: 13px;
}
/* Legend entry for tables: the text itself does not break (nowrap), but the
   flex children may wrap onto a new line. */
.table-legend-item {
    margin-top: 8px;
    display: flex;
    flex-wrap: wrap;
    align-items: center;
    white-space: nowrap;
}

/* --- About page --- */

/* Centered column capped at 800px, with 40px between its sections. */
#about-page-content-wrapper {
    /* top | left and right | bottom */
    margin: 40px auto 60px;
    padding: 0 24px;
    max-width: 800px;
    display: flex;
    flex-direction: column;
    gap: 40px;
    opacity: 85%;
}
/* Row of link buttons. */
.link-buttons-container {
    margin-top: 16px;
    display: flex;
    flex-wrap: wrap; /* lets buttons drop onto a new line when the row is too narrow */
    gap: 16px;
}
/* Individual link button: children are pushed to opposite ends of the button. */
.link-button {
    flex-grow: 1;
    display: flex;
    align-items: center;
    justify-content: space-between;
    padding: 16px 20px;
    border-radius: 12px;
    background-color: #083c40;
    font-weight: 600;
    text-decoration: none;
    transition: background-color 0.2s ease-in-out;
}
.link-button:hover {
    background-color: #0a4c52;
}
/* Icon glyph with a 12px gap on its left. */
.external-link-icon {
    margin-left: 12px;
    font-size: 20px;
    line-height: 1;
}

/* Tables inside the accordion size to their content and stay left-aligned:
   the auto right margin absorbs the leftover horizontal space. */
#leaderboard-accordion table {
    margin-right: auto !important;
    width: auto !important;
}

/* Indent for informational lists. */
.info-list {
    padding-left: 20px;
}

/* Smooth scrolling for the entire page (in-page anchor jumps and scripted scrolls). */
html {
    scroll-behavior: smooth;
}
/* Accessibility: users who ask the system for reduced motion get instant jumps
   instead of an animated scroll. */
@media (prefers-reduced-motion: reduce) {
    html {
        scroll-behavior: auto;
    }
}
/* --- Home page --- */

/* Placeholder box for a diagram: fills its parent, is at least 250px tall,
   and centers its text on both axes. */
.diagram-placeholder {
    width: 100%;
    height: 100%;
    min-height: 250px;
    display: flex;
    justify-content: center;
    align-items: center;
    text-align: center;
    font-size: 14px;
    color: #F0529C;
    background-color: #FAF2E9;
    border-radius: 8px;
}

/* Responsive behavior: at viewport widths of 900px or less, the intro row
   switches its flex direction to a column. */
@media (max-width: 900px) {
    #intro-row {
        flex-direction: column;
    }
}
/* --- Plot legend --- */

/* Legend panel: white bordered box, at least 572px tall. */
.plot-legend-container {
    padding: 24px 32px;
    min-height: 572px;
    border: 1px solid black;
    border-radius: 4px;
    background-color: #fff;
}

/* Variant under a .dark ancestor: translucent background and a lighter border color. */
.dark .plot-legend-container {
    border-color: rgb(159, 234, 209);
    background: rgba(250, 242, 233, 0.1);
}

#plot-legend-logo {
    margin-bottom: 24px;
}

#plot-legend-logo img {
    height: 19px;
}

/* Heading above each group of legend entries. */
.plot-legend-category-heading {
    font-weight: 700;
    font-size: 16px;
}

/* One legend entry; its children are laid out in a row. */
.plot-legend-item {
    margin-top: 8px;
    display: flex;
}

/* Smaller grey description text inside a legend entry. */
.plot-legend-item-text .description {
    font-size: 12px;
    color: #888;
}

/* Default 14px marker, nudged down 3px, with an 8px gap on its right. */
.plot-legend-item-svg {
    margin-top: 3px;
    margin-right: 8px;
    width: 14px;
    height: 14px;
}

/* 16px marker variant. */
.plot-legend-tooling-svg {
    margin-top: 2px;
    width: 16px;
    height: 16px;
}

/* 18px marker with a 2px right margin. */
#plot-legend-item-pareto-svg {
    margin-right: 2px;
    width: 18px;
    height: 18px;
}
/* Small icon link inside an h3: top-aligned with the text, not underlined. */
h3 .header-link-icon {
    margin-left: 6px;
    vertical-align: text-top;
    font-size: 12px;
    text-decoration: none;
}

/* Bold the 6th and 7th cell of every row in the main leaderboard. Per the
   original note these are the "overall stats" columns of each category, so
   the indices must be updated if the column order changes. */
#main-leaderboard td:nth-child(6) .prose,
#main-leaderboard td:nth-child(7) .prose {
    font-weight: 700 !important;
}
"""