d0rj committed
Commit 3e35a01 · 1 Parent(s): 9445c3c

style: code blacked

app.py CHANGED
@@ -52,7 +52,9 @@ with gr.Blocks(
     ),
 ) as application:
     gr.Markdown("# 🥇 ROMB - Russian Olympiad Math Benchmark")
-    gr.Markdown(f"See ROMB-1.0 dataset there - [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}).")
+    gr.Markdown(
+        f"See ROMB-1.0 dataset there - [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME})."
+    )
     with gr.Tabs():
         with gr.Tab("Leaderboard"):
             gr.Markdown("In progress...")
@@ -77,7 +79,7 @@ with gr.Blocks(
                 [22],
                 [40],
                 [230],
-            ]
+            ],
         )
 
 
src/common/data.py CHANGED
@@ -13,5 +13,7 @@ def load_dataset() -> pd.DataFrame:
     ds = datasets.load_dataset(DATASET_NAME, split="test")
     df = pd.DataFrame(ds)
 
-    df[DatasetSchema.correct_answer] = df[DatasetSchema.correct_answer].apply(json.loads)
+    df[DatasetSchema.correct_answer] = df[DatasetSchema.correct_answer].apply(
+        json.loads
+    )
     return df
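Context for the reflowed call: `correct_answer` appears to be stored as JSON-encoded strings in the dataset and is decoded at load time. A minimal sketch of that decode step, using a hypothetical inline frame instead of the real ROMB split:

```python
import json

import pandas as pd

# Hypothetical stand-in for the loaded split; the real rows come from datasets.load_dataset.
df = pd.DataFrame({"correct_answer": ["[1, 2, 3]", '{"x": 1}']})

# Same transformation as load_dataset(): decode each JSON string into Python objects.
df["correct_answer"] = df["correct_answer"].apply(json.loads)

print(df["correct_answer"].tolist())  # [[1, 2, 3], {'x': 1}]
```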
src/eval/cli.py CHANGED
@@ -41,7 +41,9 @@ def _evaluate_single_answer(
         )
     except Exception as e:
         print(e)
-        print(f"Error evaluating row with {row[DatasetSchema.check_type]} {row[DatasetSchema.id_]}: {y_true} vs {y_pred}")
+        print(
+            f"Error evaluating row with {row[DatasetSchema.check_type]} {row[DatasetSchema.id_]}: {y_true} vs {y_pred}"
+        )
         exit(1)
     return result
 
@@ -53,7 +55,9 @@ def _evaluate(
 ) -> pd.DataFrame:
     tqdm.pandas()
 
-    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[GeneratedDatasetSchema.generated_answer].apply(
+    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(
         lambda x: GenerationAnswer.model_validate(deepcopy(x)) if x else None,
     )
     dataset_df = load_dataset()
@@ -67,10 +71,14 @@ def _evaluate(
         axis=1,
     )
 
-    predictions_df[DatasetEvalSchema.predicted_answer] = predictions_df[GeneratedDatasetSchema.generated_answer].apply(
+    predictions_df[DatasetEvalSchema.predicted_answer] = predictions_df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(
         lambda x: x.answer if not pd.isna(x) else None,
     )
-    predictions_df[DatasetEvalSchema.context] = predictions_df[GeneratedDatasetSchema.generated_answer].apply(
+    predictions_df[DatasetEvalSchema.context] = predictions_df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(
         lambda x: x.context if not pd.isna(x) else None,
     )
     predictions_df = predictions_df[list(DatasetEvalSchema._collect_fields().keys())]
@@ -91,7 +99,9 @@ def evaluate(
 
     df = pd.read_json(file, lines=True)
     evaluated_df = _evaluate(df)
-    evaluated_df.to_json(file.with_suffix(".eval.jsonl"), orient="records", lines=True, force_ascii=False)
+    evaluated_df.to_json(
+        file.with_suffix(".eval.jsonl"), orient="records", lines=True, force_ascii=False
+    )
 
 
 @pa.check_input(DatasetEvalSchema)
@@ -101,16 +111,24 @@ def _metrics(
     model_name: str,
     model_size: float,
     model_url: str,
-    model_config: str
+    model_config: str,
 ) -> pd.DataFrame:
     pass1 = df[DatasetEvalSchema.is_correct].mean()
 
     w = df[DatasetEvalSchema.grade].apply(grade_to_weight)
-    weighted_accuracy = (df[DatasetEvalSchema.is_correct].astype(int) * w).sum() / w.sum()
-
-    arith_pass1 = df[df[DatasetEvalSchema.task_type] == "arith"][DatasetEvalSchema.is_correct].mean()
-    geometry_pass1 = df[df[DatasetEvalSchema.task_type] == "geometry"][DatasetEvalSchema.is_correct].mean()
-    logic_pass1 = df[df[DatasetEvalSchema.task_type] == "logic"][DatasetEvalSchema.is_correct].mean()
+    weighted_accuracy = (
+        df[DatasetEvalSchema.is_correct].astype(int) * w
+    ).sum() / w.sum()
+
+    arith_pass1 = df[df[DatasetEvalSchema.task_type] == "arith"][
+        DatasetEvalSchema.is_correct
+    ].mean()
+    geometry_pass1 = df[df[DatasetEvalSchema.task_type] == "geometry"][
+        DatasetEvalSchema.is_correct
+    ].mean()
+    logic_pass1 = df[df[DatasetEvalSchema.task_type] == "logic"][
+        DatasetEvalSchema.is_correct
+    ].mean()
 
     result = {
         LeaderBoardSchema.model_name: model_name,
@@ -174,7 +192,7 @@ def metrics(
         model_name=model_name,
         model_size=model_size,
         model_url=model_url,
-        model_config=model_config or '',
+        model_config=model_config or "",
     )
     metrics = metrics_df.to_dict(orient="records")[0]
     print(f"Metrics for {model_name}:")
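The `evaluate` hunk above reads and writes JSON Lines through pandas. A small sketch of that round trip under the same flags (the file name and columns are illustrative only, not from the benchmark):

```python
from pathlib import Path

import pandas as pd

# Illustrative predictions file; the real CLI receives its path as an argument.
file = Path("predictions.jsonl")
pd.DataFrame({"id": [1, 2], "is_correct": [True, False]}).to_json(
    file, orient="records", lines=True, force_ascii=False
)

# Same pattern as evaluate(): read JSONL, transform, write a sibling *.eval.jsonl file.
df = pd.read_json(file, lines=True)
df.to_json(
    file.with_suffix(".eval.jsonl"), orient="records", lines=True, force_ascii=False
)
```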
src/eval/matchers.py CHANGED
@@ -32,7 +32,9 @@ def um(y_true: list, y_pred: list) -> bool:
         return False
     if len(y_true) == 0:
         return True
-    if (len(y_true) > 0 and type(y_true[0]) is dict) or (len(y_true) == 0 and type(y_pred[0]) is dict):
+    if (len(y_true) > 0 and type(y_true[0]) is dict) or (
+        len(y_true) == 0 and type(y_pred[0]) is dict
+    ):
         y_true = [_dict_to_tuple(item) for item in y_true]
         y_pred = [_dict_to_tuple(item) for item in y_pred]
     if type(y_true) != type(y_pred):
src/eval/metrics.py CHANGED
@@ -3,5 +3,5 @@ import numpy as np
 
 def grade_to_weight(g: str) -> float:
     """Convert a grade string to a weight value."""
-    parts = list(map(int, g.split('-')))
+    parts = list(map(int, g.split("-")))
     return np.mean(parts)
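Beyond the quote style, nothing here changes behavior: `grade_to_weight` maps a dash-separated grade range to its midpoint, and `_metrics` in src/eval/cli.py above uses it as the per-task weight in weighted_accuracy = Σ(correct_i · w_i) / Σ w_i. A worked toy example with illustrative column values (the grade format is only inferred from the split on "-"):

```python
import numpy as np
import pandas as pd


def grade_to_weight(g: str) -> float:
    """Convert a grade string to a weight value (midpoint of the range)."""
    parts = list(map(int, g.split("-")))
    return np.mean(parts)


# Toy evaluation frame: one row per solved task.
df = pd.DataFrame(
    {
        "is_correct": [True, False, True],
        "grade": ["5-6", "7-8", "10-11"],  # weights 5.5, 7.5, 10.5
    }
)

w = df["grade"].apply(grade_to_weight)
pass1 = df["is_correct"].mean()  # plain accuracy: 2/3
weighted_accuracy = (df["is_correct"].astype(int) * w).sum() / w.sum()  # 16.0 / 23.5

print(round(pass1, 3), round(weighted_accuracy, 3))  # 0.667 0.681
```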
src/generate/cli.py CHANGED
@@ -41,7 +41,9 @@ def _generate_single_answer(
 ) -> GenerationAnswer:
     if temp_path and (temp_path / f"{row[DatasetSchema.id_]}.json").exists():
         return GenerationAnswer.model_validate(
-            json.load(open(temp_path / f"{row[DatasetSchema.id_]}.json", "r"))[GeneratedDatasetSchema.generated_answer]
+            json.load(open(temp_path / f"{row[DatasetSchema.id_]}.json", "r"))[
+                GeneratedDatasetSchema.generated_answer
+            ]
         )
     answer_type = make_root_model(row[DatasetSchema.answer_type])
     chain = build_chain(answer_type)
@@ -131,14 +133,16 @@ def generate(
         build_chain_function,
         llm_class=config.llm_class,
         structured_output_method=config.structured_output_method,
-        **config.kwargs
+        **config.kwargs,
     )
 
-    df = _generate_answers(df, build_chain_function, use_tqdm=use_tqdm, temp_path=temp_path)
-
-    df[GeneratedDatasetSchema.generated_answer] = df[GeneratedDatasetSchema.generated_answer].apply(
-        lambda x: x.model_dump()
+    df = _generate_answers(
+        df, build_chain_function, use_tqdm=use_tqdm, temp_path=temp_path
     )
+
+    df[GeneratedDatasetSchema.generated_answer] = df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(lambda x: x.model_dump())
     df.to_json(
         output_path,
         lines=True,
@@ -151,15 +155,19 @@ def generate(
 def _type_sanitycheck(
     generated_df: pd.DataFrame,
 ) -> tuple[bool, str]:
-    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[GeneratedDatasetSchema.generated_answer].apply(
-        lambda x: GenerationAnswer.model_validate(deepcopy(x)) if not isinstance(x, GenerationAnswer) else x
+    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(
+        lambda x: GenerationAnswer.model_validate(deepcopy(x))
+        if not isinstance(x, GenerationAnswer)
+        else x
     )
 
     dataset_df = load_dataset()
     predicted_df = dataset_df.join(
         generated_df.set_index(GeneratedDatasetSchema.id_),
         on=DatasetSchema.id_,
-        rsuffix='_generated',
+        rsuffix="_generated",
     ).dropna(subset=[GeneratedDatasetSchema.generated_answer])
 
     if len(predicted_df) == 0:
@@ -170,13 +178,20 @@ def _type_sanitycheck(
         lambda row: matches_type(
             row[GeneratedDatasetSchema.generated_answer].answer,
             string_to_type(row[DatasetSchema.answer_type]),
-        ), axis=1
+        ),
+        axis=1,
     )
 
     if not predicted_df[TYPE_MATCH].all():
-        return False, f"Type mismatch found for {predicted_df[~predicted_df[TYPE_MATCH]][DatasetSchema.id_].tolist()}."
+        return (
+            False,
+            f"Type mismatch found for {predicted_df[~predicted_df[TYPE_MATCH]][DatasetSchema.id_].tolist()}.",
+        )
 
-    return True, f"All matched. Predicted count: {len(predicted_df)} of {len(dataset_df)}"
+    return (
+        True,
+        f"All matched. Predicted count: {len(predicted_df)} of {len(dataset_df)}",
+    )
 
 
 @click.command()
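Several of the reflowed lines revolve around the same pydantic v2 round trip: generated answers are dumped with `model_dump()` before being written as JSONL, and re-validated with `GenerationAnswer.model_validate()` when read back from the per-row temp files or the generated frame. A minimal sketch with a hypothetical model shape; the real `GenerationAnswer` is defined elsewhere in the repo and, judging from the diff, at least exposes `answer` and `context`:

```python
from pydantic import BaseModel


class GenerationAnswer(BaseModel):
    """Hypothetical shape; only .answer and .context are implied by the diff."""

    answer: int
    context: str = ""


raw = {"answer": 42, "context": "model reasoning"}

# model_validate() parses the plain dict read back from JSON/JSONL into the model...
parsed = GenerationAnswer.model_validate(raw)

# ...and model_dump() turns it back into a JSON-serializable dict before to_json().
assert parsed.model_dump() == raw
```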
src/generate/generators.py CHANGED
@@ -57,9 +57,7 @@ def build_singleturn_chain(
             context={},
         )
     )
-    chain = chain.with_retry(
-        retry_if_exception_type=(openai.PermissionDeniedError, )
-    )
+    chain = chain.with_retry(retry_if_exception_type=(openai.PermissionDeniedError,))
     return chain
 
 
@@ -126,9 +124,7 @@ def build_thinking_chain(
             )
         )
     )
-    chain = chain.with_retry(
-        retry_if_exception_type=(openai.PermissionDeniedError, )
-    )
+    chain = chain.with_retry(retry_if_exception_type=(openai.PermissionDeniedError,))
    return chain
 
 
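For reference, `Runnable.with_retry` is the LangChain retry wrapper used in both chain builders; the collapsed one-liner keeps the same policy of retrying only on `openai.PermissionDeniedError`. A minimal sketch with a stand-in runnable (the real chains are prompt → LLM → parser pipelines built elsewhere in this module):

```python
import openai
from langchain_core.runnables import RunnableLambda

# Stand-in for the real prompt | llm | parser chain.
chain = RunnableLambda(lambda prompt: f"answer to: {prompt}")

# Same retry policy as in build_singleturn_chain / build_thinking_chain.
chain = chain.with_retry(retry_if_exception_type=(openai.PermissionDeniedError,))

print(chain.invoke("2 + 2"))
```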