style: code blacked
- app.py +4 -2
- src/common/data.py +3 -1
- src/eval/cli.py +30 -12
- src/eval/matchers.py +3 -1
- src/eval/metrics.py +1 -1
- src/generate/cli.py +27 -12
- src/generate/generators.py +2 -6
app.py
CHANGED
@@ -52,7 +52,9 @@ with gr.Blocks(
     ),
 ) as application:
     gr.Markdown("# 🥇 ROMB - Russian Olympiad Math Benchmark")
-    gr.Markdown(f"See ROMB-1.0 dataset there - [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}).")
+    gr.Markdown(
+        f"See ROMB-1.0 dataset there - [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME})."
+    )
     with gr.Tabs():
         with gr.Tab("Leaderboard"):
             gr.Markdown("In progress...")
@@ -77,7 +79,7 @@
             [22],
             [40],
             [230],
-        ]
+        ],
     )
src/common/data.py
CHANGED
@@ -13,5 +13,7 @@ def load_dataset() -> pd.DataFrame:
     ds = datasets.load_dataset(DATASET_NAME, split="test")
     df = pd.DataFrame(ds)

-    df[DatasetSchema.correct_answer] = df[DatasetSchema.correct_answer].apply(json.loads)
+    df[DatasetSchema.correct_answer] = df[DatasetSchema.correct_answer].apply(
+        json.loads
+    )
     return df
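For context, the reformatted load_dataset body parses the correct_answer column element-wise with json.loads, which assumes every cell holds a JSON-encoded string. A minimal sketch of the same pattern on toy data (the values below are made up, not taken from ROMB):

import json

import pandas as pd

# Toy frame standing in for the loaded "test" split.
df = pd.DataFrame({"correct_answer": ['[1, 2]', '{"x": 3}']})

# Same pattern as load_dataset(): decode each JSON string into a Python object.
df["correct_answer"] = df["correct_answer"].apply(json.loads)

print(df["correct_answer"].tolist())  # [[1, 2], {'x': 3}]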
src/eval/cli.py
CHANGED
@@ -41,7 +41,9 @@ def _evaluate_single_answer(
         )
     except Exception as e:
         print(e)
-        print(f"Error evaluating row with {row[DatasetSchema.check_type]} {row[DatasetSchema.id_]}: {y_true} vs {y_pred}")
+        print(
+            f"Error evaluating row with {row[DatasetSchema.check_type]} {row[DatasetSchema.id_]}: {y_true} vs {y_pred}"
+        )
         exit(1)
     return result

@@ -53,7 +55,9 @@ def _evaluate(
 ) -> pd.DataFrame:
     tqdm.pandas()

-    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[GeneratedDatasetSchema.generated_answer].apply(
+    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(
         lambda x: GenerationAnswer.model_validate(deepcopy(x)) if x else None,
     )
     dataset_df = load_dataset()
@@ -67,10 +71,14 @@ def _evaluate(
         axis=1,
     )

-    predictions_df[DatasetEvalSchema.predicted_answer] = predictions_df[GeneratedDatasetSchema.generated_answer].apply(
+    predictions_df[DatasetEvalSchema.predicted_answer] = predictions_df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(
         lambda x: x.answer if not pd.isna(x) else None,
     )
-    predictions_df[DatasetEvalSchema.context] = predictions_df[GeneratedDatasetSchema.generated_answer].apply(
+    predictions_df[DatasetEvalSchema.context] = predictions_df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(
         lambda x: x.context if not pd.isna(x) else None,
     )
     predictions_df = predictions_df[list(DatasetEvalSchema._collect_fields().keys())]
@@ -91,7 +99,9 @@ def evaluate(

     df = pd.read_json(file, lines=True)
     evaluated_df = _evaluate(df)
-    evaluated_df.to_json(file.with_suffix(".eval.jsonl"), orient="records", lines=True, force_ascii=False)
+    evaluated_df.to_json(
+        file.with_suffix(".eval.jsonl"), orient="records", lines=True, force_ascii=False
+    )


 @pa.check_input(DatasetEvalSchema)
@@ -101,16 +111,24 @@ def _metrics(
     model_name: str,
     model_size: float,
     model_url: str,
-    model_config: str
+    model_config: str,
 ) -> pd.DataFrame:
     pass1 = df[DatasetEvalSchema.is_correct].mean()

     w = df[DatasetEvalSchema.grade].apply(grade_to_weight)
-    weighted_accuracy = (df[DatasetEvalSchema.is_correct].astype(int) * w).sum() / w.sum()
-
-    arith_pass1 = df[df[DatasetEvalSchema.task_type] == "arith"][DatasetEvalSchema.is_correct].mean()
-    geometry_pass1 = df[df[DatasetEvalSchema.task_type] == "geometry"][DatasetEvalSchema.is_correct].mean()
-    logic_pass1 = df[df[DatasetEvalSchema.task_type] == "logic"][DatasetEvalSchema.is_correct].mean()
+    weighted_accuracy = (
+        df[DatasetEvalSchema.is_correct].astype(int) * w
+    ).sum() / w.sum()
+
+    arith_pass1 = df[df[DatasetEvalSchema.task_type] == "arith"][
+        DatasetEvalSchema.is_correct
+    ].mean()
+    geometry_pass1 = df[df[DatasetEvalSchema.task_type] == "geometry"][
+        DatasetEvalSchema.is_correct
+    ].mean()
+    logic_pass1 = df[df[DatasetEvalSchema.task_type] == "logic"][
+        DatasetEvalSchema.is_correct
+    ].mean()

     result = {
         LeaderBoardSchema.model_name: model_name,
@@ -174,7 +192,7 @@ def metrics(
         model_name=model_name,
         model_size=model_size,
         model_url=model_url,
-        model_config=model_config or ""
+        model_config=model_config or "",
     )
     metrics = metrics_df.to_dict(orient="records")[0]
     print(f"Metrics for {model_name}:")
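The weighted accuracy reformatted above is a weight-normalized mean: each row contributes its grade weight w = grade_to_weight(grade), and weighted_accuracy = sum(is_correct * w) / sum(w). A small self-contained sketch of the same computation with made-up rows and weights (not ROMB data):

import pandas as pd

# Three toy rows; the weights stand in for df[grade].apply(grade_to_weight).
df = pd.DataFrame({"is_correct": [True, False, True]})
w = pd.Series([6.0, 9.0, 3.5])

pass1 = df["is_correct"].mean()                                          # 2/3 ≈ 0.667
weighted_accuracy = (df["is_correct"].astype(int) * w).sum() / w.sum()   # 9.5 / 18.5 ≈ 0.514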
src/eval/matchers.py
CHANGED
@@ -32,7 +32,9 @@ def um(y_true: list, y_pred: list) -> bool:
         return False
     if len(y_true) == 0:
         return True
-    if (len(y_true) > 0 and type(y_true[0]) is dict) or (len(y_true) == 0 and type(y_pred[0]) is dict):
+    if (len(y_true) > 0 and type(y_true[0]) is dict) or (
+        len(y_true) == 0 and type(y_pred[0]) is dict
+    ):
         y_true = [_dict_to_tuple(item) for item in y_true]
         y_pred = [_dict_to_tuple(item) for item in y_pred]
     if type(y_true) != type(y_pred):
src/eval/metrics.py
CHANGED
@@ -3,5 +3,5 @@ import numpy as np

 def grade_to_weight(g: str) -> float:
     """Convert a grade string to a weight value."""
-    parts = list(map(int, g.split('-')))
+    parts = list(map(int, g.split("-")))
     return np.mean(parts)
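As a quick reference for the function above: grade_to_weight averages the integer endpoints of the grade string, so a hyphenated range maps to its midpoint and a single grade maps to itself. The example inputs below are illustrative, not taken from the dataset:

import numpy as np

def grade_to_weight(g: str) -> float:
    """Convert a grade string to a weight value."""
    parts = list(map(int, g.split("-")))
    return np.mean(parts)

print(grade_to_weight("5-7"))  # 6.0, midpoint of the 5-7 range
print(grade_to_weight("9"))    # 9.0, single grade: split() yields one part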
src/generate/cli.py
CHANGED
@@ -41,7 +41,9 @@ def _generate_single_answer(
 ) -> GenerationAnswer:
     if temp_path and (temp_path / f"{row[DatasetSchema.id_]}.json").exists():
         return GenerationAnswer.model_validate(
-            json.load(open(temp_path / f"{row[DatasetSchema.id_]}.json", "r"))[GeneratedDatasetSchema.generated_answer]
+            json.load(open(temp_path / f"{row[DatasetSchema.id_]}.json", "r"))[
+                GeneratedDatasetSchema.generated_answer
+            ]
         )
     answer_type = make_root_model(row[DatasetSchema.answer_type])
     chain = build_chain(answer_type)
@@ -131,14 +133,16 @@ def generate(
         build_chain_function,
         llm_class=config.llm_class,
         structured_output_method=config.structured_output_method,
-        **config.kwargs
+        **config.kwargs,
     )

-    df = _generate_answers(df, build_chain_function, use_tqdm=use_tqdm, temp_path=temp_path)
-
-    df[GeneratedDatasetSchema.generated_answer] = df[GeneratedDatasetSchema.generated_answer].apply(
-        lambda x: x.model_dump()
+    df = _generate_answers(
+        df, build_chain_function, use_tqdm=use_tqdm, temp_path=temp_path
     )
+
+    df[GeneratedDatasetSchema.generated_answer] = df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(lambda x: x.model_dump())
     df.to_json(
         output_path,
         lines=True,
@@ -151,15 +155,19 @@ def generate(
 def _type_sanitycheck(
     generated_df: pd.DataFrame,
 ) -> tuple[bool, str]:
-    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[GeneratedDatasetSchema.generated_answer].apply(
-        lambda x: GenerationAnswer.model_validate(deepcopy(x)) if not isinstance(x, GenerationAnswer) else x
+    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(
+        lambda x: GenerationAnswer.model_validate(deepcopy(x))
+        if not isinstance(x, GenerationAnswer)
+        else x
     )

     dataset_df = load_dataset()
     predicted_df = dataset_df.join(
         generated_df.set_index(GeneratedDatasetSchema.id_),
         on=DatasetSchema.id_,
-        rsuffix="_generated"
+        rsuffix="_generated",
     ).dropna(subset=[GeneratedDatasetSchema.generated_answer])

     if len(predicted_df) == 0:
@@ -170,13 +178,20 @@ def _type_sanitycheck(
         lambda row: matches_type(
             row[GeneratedDatasetSchema.generated_answer].answer,
             string_to_type(row[DatasetSchema.answer_type]),
-        ), axis=1
+        ),
+        axis=1,
     )

     if not predicted_df[TYPE_MATCH].all():
-        return False, f"Type mismatch found for {predicted_df[~predicted_df[TYPE_MATCH]][DatasetSchema.id_].tolist()}."
+        return (
+            False,
+            f"Type mismatch found for {predicted_df[~predicted_df[TYPE_MATCH]][DatasetSchema.id_].tolist()}.",
+        )

-    return True, f"All matched. Predicted count: {len(predicted_df)} of {len(dataset_df)}"
+    return (
+        True,
+        f"All matched. Predicted count: {len(predicted_df)} of {len(dataset_df)}",
+    )


 @click.command()
src/generate/generators.py
CHANGED
@@ -57,9 +57,7 @@ def build_singleturn_chain(
             context={},
         )
     )
-    chain = chain.with_retry(
-        retry_if_exception_type=(openai.PermissionDeniedError, )
-    )
+    chain = chain.with_retry(retry_if_exception_type=(openai.PermissionDeniedError,))
     return chain


@@ -126,9 +124,7 @@ def build_thinking_chain(
             )
         )
     )
-    chain = chain.with_retry(
-        retry_if_exception_type=(openai.PermissionDeniedError, )
-    )
+    chain = chain.with_retry(retry_if_exception_type=(openai.PermissionDeniedError,))
     return chain
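For reference, with_retry in the two chains above is the LangChain Runnable retry wrapper (the retry_if_exception_type keyword suggests langchain_core), and the collapsed one-liner keeps the original behavior: retry the chain only when openai.PermissionDeniedError is raised. A minimal sketch of the same pattern, with a hypothetical flaky step and a generic exception standing in for the OpenAI error:

from langchain_core.runnables import RunnableLambda

state = {"calls": 0}

def flaky(question: str) -> str:
    # Hypothetical step: fails on the first call, succeeds afterwards.
    state["calls"] += 1
    if state["calls"] == 1:
        raise PermissionError("transient denial")  # stand-in for openai.PermissionDeniedError
    return f"answered: {question}"

# Same pattern as build_singleturn_chain / build_thinking_chain:
# retry only on the listed exception type(s).
chain = RunnableLambda(flaky).with_retry(
    retry_if_exception_type=(PermissionError,),
    stop_after_attempt=2,
)

print(chain.invoke("2 + 2"))  # succeeds on the second attempt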