alisamak committed on
Commit
20e486a
·
verified ·
1 Parent(s): 9fb8c19

Update evaluate_agent.py

Browse files
Files changed (1) hide show
  1. evaluate_agent.py +29 -37
evaluate_agent.py CHANGED
@@ -9,20 +9,20 @@ def test_questions():
9
  "expected_keywords": ["clarify", "incomplete", "missing", "please provide", "need more information"]
10
  },
11
  {
12
- "task_id": "q7",
13
- "question": (
14
- "Given this table defining * on the set S = {a, b, c, d, e}\n\n"
15
- "|*|a|b|c|d|e|\n"
16
- "|---|---|---|---|---|---|\n"
17
- "|a|a|b|c|b|d|\n"
18
- "|b|b|c|a|e|c|\n"
19
- "|c|c|a|b|b|a|\n"
20
- "|d|b|e|b|e|d|\n"
21
- "|e|d|b|a|d|c|\n\n"
22
- "Provide the subset of S involved in any possible counter-examples that prove * is not commutative. "
23
- "Provide your answer as a comma-separated list of the elements in the set in alphabetical order."
24
- ),
25
- "expected_keywords": ["b, e"]
26
  },
27
  {
28
  "task_id": "q3",
@@ -31,20 +31,6 @@ def test_questions():
31
  ),
32
  "expected_keywords": ["right"]
33
  },
34
- # {
35
- # "task_id": "q10",
36
- # "question": (
37
- # "I’m organizing a grocery list and only want to include true vegetables "
38
- # "(not fruits, even if they’re used as vegetables in cooking). "
39
- # "From the following list, which items are true vegetables?\n\n"
40
- # "milk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, "
41
- # "green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, "
42
- # "zucchini, lettuce, peanuts"
43
- # ),
44
- # "expected_keywords": [
45
- # "broccoli", "celery", "green beans", "lettuce", "sweet potatoes", "zucchini"
46
- # ]
47
- # },
48
  {
49
  "task_id": "q2",
50
  "question": (
@@ -53,20 +39,26 @@ def test_questions():
53
  ),
54
  "expected_keywords": ["3", "three"]
55
  },
56
- # {
57
- # "task_id": "q3",
58
- # "question": (
59
- # "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species "
60
- # "to be on camera simultaneously?"
61
- # ),
62
- # "expected_keywords": ["14", "fourteen"]
63
- # },
64
  {
65
- "task_id": "q4",
66
  "question": (
67
  "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?"
68
  ),
69
  "expected_keywords": ["FunkMonk"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  }
71
  ]
72
 
 
9
  "expected_keywords": ["clarify", "incomplete", "missing", "please provide", "need more information"]
10
  },
11
  {
12
+ "task_id": "q7",
13
+ "question": (
14
+ "Given this table defining * on the set S = {a, b, c, d, e}\n\n"
15
+ "|*|a|b|c|d|e|\n"
16
+ "|---|---|---|---|---|---|\n"
17
+ "|a|a|b|c|b|d|\n"
18
+ "|b|b|c|a|e|c|\n"
19
+ "|c|c|a|b|b|a|\n"
20
+ "|d|b|e|b|e|d|\n"
21
+ "|e|d|b|a|d|c|\n\n"
22
+ "Provide the subset of S involved in any possible counter-examples that prove * is not commutative. "
23
+ "Provide your answer as a comma-separated list of the elements in the set in alphabetical order."
24
+ ),
25
+ "expected_keywords": ["b, e"]
26
  },
27
  {
28
  "task_id": "q3",
 
31
  ),
32
  "expected_keywords": ["right"]
33
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  {
35
  "task_id": "q2",
36
  "question": (
 
39
  ),
40
  "expected_keywords": ["3", "three"]
41
  },
 
 
 
 
 
 
 
 
42
  {
43
+ "task_id": "q4b",
44
  "question": (
45
  "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?"
46
  ),
47
  "expected_keywords": ["FunkMonk"]
48
+ },
49
+ {
50
+ "task_id": "q5",
51
+ "question": (
52
+ "Who is the CEO of OpenAI?"
53
+ ),
54
+ "expected_keywords": ["sam altman"]
55
+ },
56
+ {
57
+ "task_id": "q6",
58
+ "question": (
59
+ "When was the Eiffel Tower built?"
60
+ ),
61
+ "expected_keywords": ["1889"]
62
  }
63
  ]
64