Dkapsis committed
Commit 6dd5f4b · 1 Parent(s): 9faf105

more detailed search

app_agents/web_agent.py CHANGED
@@ -18,12 +18,31 @@ from smolagents import InferenceClientModel, CodeAgent, DuckDuckGoSearchTool, G
 load_dotenv()
 model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct", provider="together")
 
+from app_tools.web_search_tools import (
+    SearchInformationTool,
+    NavigationalSearchTool,
+    VisitTool,
+    PageUpTool,
+    PageDownTool,
+    FinderTool,
+    FindNextTool,
+    ArchiveSearchTool,
+)
+
 web_agent = CodeAgent(
     model=model,
     tools=[
         # DuckDuckGoSearchTool(),
         GoogleSearchTool("serper"),
-        VisitWebpageTool()
+        VisitWebpageTool(),
+        SearchInformationTool(),
+        NavigationalSearchTool(),
+        VisitTool(),
+        PageUpTool(),
+        PageDownTool(),
+        FinderTool(),
+        FindNextTool(),
+        ArchiveSearchTool(),
     ],
     name="web_agent",
     description="Runs web searches for you. Give it your query as an argument.",
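
For reference, a minimal sketch of how the expanded web_agent can be driven once these tools are wired in. It assumes the standard smolagents entry point agent.run(); the import path follows this repo's layout and the query string is purely illustrative.

from app_agents.web_agent import web_agent  # module changed in this commit

# The agent can now page through results (PageUpTool/PageDownTool), search within a
# fetched page (FinderTool/FindNextTool), and query the Wayback Machine (ArchiveSearchTool).
answer = web_agent.run("Find the earliest archived version of example.org and report its date")  # illustrative query
print(answer)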
app_tools/browser.py ADDED
@@ -0,0 +1,354 @@
1
+ # Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
2
+ # https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
3
+ import json
4
+ import os
5
+ import requests
6
+ import re
7
+ import io
8
+ import uuid
9
+ import mimetypes
10
+ import time
11
+ import pathlib
12
+ import pathvalidate
13
+ from urllib.parse import urljoin, urlparse, unquote, parse_qs
14
+ from urllib.request import url2pathname
15
+ from typing import Any, Dict, List, Optional, Union, Tuple
16
+ from .mdconvert import MarkdownConverter, UnsupportedFormatException, FileConversionException
17
+ from serpapi import GoogleSearch
18
+ from .cookies import COOKIES
19
+
20
+
21
+ class SimpleTextBrowser:
22
+ """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""
23
+
24
+ def __init__(
25
+ self,
26
+ start_page: Optional[str] = None,
27
+ viewport_size: Optional[int] = 1024 * 8,
28
+ downloads_folder: Optional[Union[str, None]] = None,
29
+ serpapi_key: Optional[Union[str, None]] = None,
30
+ request_kwargs: Optional[Union[Dict[str, Any], None]] = None,
31
+ ):
32
+ self.start_page: str = start_page if start_page else "about:blank"
33
+ self.viewport_size = viewport_size # Applies only to the standard uri types
34
+ self.downloads_folder = downloads_folder
35
+ self.history: List[Tuple[str, float]] = list()
36
+ self.page_title: Optional[str] = None
37
+ self.viewport_current_page = 0
38
+ self.viewport_pages: List[Tuple[int, int]] = list()
39
+ self.set_address(self.start_page)
40
+ self.serpapi_key = serpapi_key
41
+ self.request_kwargs = request_kwargs if request_kwargs is not None else {}
42
+ self.request_kwargs["cookies"] = COOKIES
43
+ self._mdconvert = MarkdownConverter()
44
+ self._page_content: str = ""
45
+
46
+ self._find_on_page_query: Union[str, None] = None
47
+ self._find_on_page_last_result: Union[int, None] = None # Location of the last result
48
+
49
+ @property
50
+ def address(self) -> str:
51
+ """Return the address of the current page."""
52
+ return self.history[-1][0]
53
+
54
+ def set_address(self, uri_or_path: str, filter_year: Optional[int] = None) -> None:
55
+ # TODO: Handle anchors
56
+ self.history.append((uri_or_path, time.time()))
57
+
58
+ # Handle special URIs
59
+ if uri_or_path == "about:blank":
60
+ self._set_page_content("")
61
+ elif uri_or_path.startswith("google:"):
62
+ self._serpapi_search(uri_or_path[len("google:"):].strip(), filter_year=filter_year)
63
+ else:
64
+ if (
65
+ not uri_or_path.startswith("http:")
66
+ and not uri_or_path.startswith("https:")
67
+ and not uri_or_path.startswith("file:")
68
+ ):
69
+ if len(self.history) > 1:
70
+ prior_address = self.history[-2][0]
71
+ uri_or_path = urljoin(prior_address, uri_or_path)
72
+ # Update the address with the fully-qualified path
73
+ self.history[-1] = (uri_or_path, self.history[-1][1])
74
+ self._fetch_page(uri_or_path)
75
+
76
+ self.viewport_current_page = 0
77
+ self._find_on_page_query = None
78
+ self._find_on_page_last_result = None
79
+
80
+ @property
81
+ def viewport(self) -> str:
82
+ """Return the content of the current viewport."""
83
+ bounds = self.viewport_pages[self.viewport_current_page]
84
+ return self.page_content[bounds[0] : bounds[1]]
85
+
86
+ @property
87
+ def page_content(self) -> str:
88
+ """Return the full contents of the current page."""
89
+ return self._page_content
90
+
91
+ def _set_page_content(self, content: str) -> None:
92
+ """Sets the text content of the current page."""
93
+ self._page_content = content
94
+ self._split_pages()
95
+ if self.viewport_current_page >= len(self.viewport_pages):
96
+ self.viewport_current_page = len(self.viewport_pages) - 1
97
+
98
+ def page_down(self) -> None:
99
+ self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1)
100
+
101
+ def page_up(self) -> None:
102
+ self.viewport_current_page = max(self.viewport_current_page - 1, 0)
103
+
104
+ def find_on_page(self, query: str) -> Union[str, None]:
105
+ """Searches for the query from the current viewport forward, looping back to the start if necessary."""
106
+
107
+ # Did we get here via a previous find_on_page search with the same query?
108
+ # If so, map to find_next
109
+ if query == self._find_on_page_query and self.viewport_current_page == self._find_on_page_last_result:
110
+ return self.find_next()
111
+
112
+ # OK, it's a new search; start from the current viewport
113
+ self._find_on_page_query = query
114
+ viewport_match = self._find_next_viewport(query, self.viewport_current_page)
115
+ if viewport_match is None:
116
+ self._find_on_page_last_result = None
117
+ return None
118
+ else:
119
+ self.viewport_current_page = viewport_match
120
+ self._find_on_page_last_result = viewport_match
121
+ return self.viewport
122
+
123
+ def find_next(self) -> None:
124
+ """Scroll to the next viewport that matches the query"""
125
+
126
+ if self._find_on_page_query is None:
127
+ return None
128
+
129
+ starting_viewport = self._find_on_page_last_result
130
+ if starting_viewport is None:
131
+ starting_viewport = 0
132
+ else:
133
+ starting_viewport += 1
134
+ if starting_viewport >= len(self.viewport_pages):
135
+ starting_viewport = 0
136
+
137
+ viewport_match = self._find_next_viewport(self._find_on_page_query, starting_viewport)
138
+ if viewport_match is None:
139
+ self._find_on_page_last_result = None
140
+ return None
141
+ else:
142
+ self.viewport_current_page = viewport_match
143
+ self._find_on_page_last_result = viewport_match
144
+ return self.viewport
145
+
146
+ def _find_next_viewport(self, query: str, starting_viewport: int) -> Union[int, None]:
147
+ """Search for matches starting from the given viewport, looping back to the beginning when the end is reached."""
148
+
149
+ if query is None:
150
+ return None
151
+
152
+ # Normalize the query, and convert to a regular expression
153
+ nquery = re.sub(r"\*", "__STAR__", query)
154
+ nquery = " " + (" ".join(re.split(r"\W+", nquery))).strip() + " "
155
+ nquery = nquery.replace(" __STAR__ ", "__STAR__ ") # Merge isolated stars with prior word
156
+ nquery = nquery.replace("__STAR__", ".*").lower()
157
+
158
+ if nquery.strip() == "":
159
+ return None
160
+
161
+ idxs = list()
162
+ idxs.extend(range(starting_viewport, len(self.viewport_pages)))
163
+ idxs.extend(range(0, starting_viewport))
164
+
165
+ for i in idxs:
166
+ bounds = self.viewport_pages[i]
167
+ content = self.page_content[bounds[0] : bounds[1]]
168
+
169
+ # TODO: Remove markdown links and images
170
+ ncontent = " " + (" ".join(re.split(r"\W+", content))).strip().lower() + " "
171
+ if re.search(nquery, ncontent):
172
+ return i
173
+
174
+ return None
175
+
176
+ def visit_page(self, path_or_uri: str, filter_year: Optional[int] = None) -> str:
177
+ """Update the address, visit the page, and return the content of the viewport."""
178
+ self.set_address(path_or_uri, filter_year=filter_year)
179
+ return self.viewport
180
+
181
+ def _split_pages(self) -> None:
182
+ # Do not split search results
183
+ if self.address.startswith("google:"):
184
+ self.viewport_pages = [(0, len(self._page_content))]
185
+ return
186
+
187
+ # Handle empty pages
188
+ if len(self._page_content) == 0:
189
+ self.viewport_pages = [(0, 0)]
190
+ return
191
+
192
+ # Break the viewport into pages
193
+ self.viewport_pages = []
194
+ start_idx = 0
195
+ while start_idx < len(self._page_content):
196
+ end_idx = min(start_idx + self.viewport_size, len(self._page_content)) # type: ignore[operator]
197
+ # Adjust to end on a space
198
+ while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]:
199
+ end_idx += 1
200
+ self.viewport_pages.append((start_idx, end_idx))
201
+ start_idx = end_idx
202
+
203
+
204
+ def _serpapi_search(self, query: str, filter_year: Optional[int] = None) -> None:
205
+ if self.serpapi_key is None:
206
+ raise ValueError("Missing SerpAPI key.")
207
+
208
+ params = {
209
+ "engine": "google",
210
+ "q": query,
211
+ "api_key": self.serpapi_key,
212
+ }
213
+ if filter_year is not None:
214
+ params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"
215
+
216
+ search = GoogleSearch(params)
217
+ results = search.get_dict()
218
+ self.page_title = f"{query} - Search"
219
+ if "organic_results" not in results.keys():
220
+ raise Exception(f"'organic_results' key not found in results: {results}. Use a less restrictive query.")
221
+ if len(results['organic_results']) == 0:
222
+ year_filter_message = f" with filter year={filter_year}" if filter_year is not None else ""
223
+ self._set_page_content(f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter.")
224
+ return
225
+
226
+ def _prev_visit(url):
227
+ for i in range(len(self.history) - 1, -1, -1):
228
+ if self.history[i][0] == url:
229
+ return f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
230
+ return ""
231
+
232
+ web_snippets: List[str] = list()
233
+ idx = 0
234
+ if "organic_results" in results:
235
+ for page in results["organic_results"]:
236
+ idx += 1
237
+ date_published = ""
238
+ if "date" in page:
239
+ date_published = "\nDate published: " + page["date"]
240
+
241
+ source = ""
242
+ if "source" in page:
243
+ source = "\nSource: " + page["source"]
244
+
245
+ snippet = ""
246
+ if "snippet" in page:
247
+ snippet = "\n" + page["snippet"]
248
+
249
+ redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{_prev_visit(page['link'])}{snippet}"
250
+
251
+ redacted_version = redacted_version.replace("Your browser can't play this video.", "")
252
+ web_snippets.append(redacted_version)
253
+
254
+
255
+ content = (
256
+ f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n"
257
+ + "\n\n".join(web_snippets)
258
+ )
259
+
260
+ self._set_page_content(content)
261
+
262
+
263
+ def _fetch_page(self, url: str) -> None:
264
+ download_path = ""
265
+ try:
266
+ if url.startswith("file://"):
267
+ download_path = os.path.normcase(os.path.normpath(unquote(url[7:])))
268
+ res = self._mdconvert.convert_local(download_path)
269
+ self.page_title = res.title
270
+ self._set_page_content(res.text_content)
271
+ else:
272
+ # Prepare the request parameters
273
+ request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
274
+ request_kwargs["stream"] = True
275
+
276
+ # Send an HTTP request to the URL
277
+ response = requests.get(url, **request_kwargs)
278
+ response.raise_for_status()
279
+
280
+ # If the HTTP request was successful
281
+ content_type = response.headers.get("content-type", "")
282
+
283
+ # Text or HTML
284
+ if "text/" in content_type.lower():
285
+ res = self._mdconvert.convert_response(response)
286
+ self.page_title = res.title
287
+ self._set_page_content(res.text_content)
288
+ # A download
289
+ else:
290
+ # Try producing a safe filename
291
+ fname = None
292
+ download_path = None
293
+ try:
294
+ fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
295
+ download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
296
+
297
+ suffix = 0
298
+ while os.path.exists(download_path) and suffix < 1000:
299
+ suffix += 1
300
+ base, ext = os.path.splitext(fname)
301
+ new_fname = f"{base}__{suffix}{ext}"
302
+ download_path = os.path.abspath(os.path.join(self.downloads_folder, new_fname))
303
+
304
+ except NameError:
305
+ pass
306
+
307
+ # No suitable name, so make one
308
+ if fname is None:
309
+ extension = mimetypes.guess_extension(content_type)
310
+ if extension is None:
311
+ extension = ".download"
312
+ fname = str(uuid.uuid4()) + extension
313
+ download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
314
+
315
+ # Open a file for writing
316
+ with open(download_path, "wb") as fh:
317
+ for chunk in response.iter_content(chunk_size=512):
318
+ fh.write(chunk)
319
+
320
+ # Render it
321
+ local_uri = pathlib.Path(download_path).as_uri()
322
+ self.set_address(local_uri)
323
+
324
+
325
+ except UnsupportedFormatException as e:
326
+ print(e)
327
+ self.page_title = "Download complete."
328
+ self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
329
+ except FileConversionException as e:
330
+ print(e)
331
+ self.page_title = "Download complete."
332
+ self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
333
+ except FileNotFoundError:
334
+ self.page_title = "Error 404"
335
+ self._set_page_content(f"## Error 404\n\nFile not found: {download_path}")
336
+ except requests.exceptions.RequestException as request_exception:
337
+ try:
338
+ self.page_title = f"Error {response.status_code}"
339
+
340
+ # If the error was rendered in HTML we might as well render it
341
+ content_type = response.headers.get("content-type", "")
342
+ if content_type is not None and "text/html" in content_type.lower():
343
+ res = self._mdconvert.convert(response)
344
+ self.page_title = f"Error {response.status_code}"
345
+ self._set_page_content(f"## Error {response.status_code}\n\n{res.text_content}")
346
+ else:
347
+ text = ""
348
+ for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
349
+ text += chunk
350
+ self.page_title = f"Error {response.status_code}"
351
+ self._set_page_content(f"## Error {response.status_code}\n\n{text}")
352
+ except NameError:
353
+ self.page_title = "Error"
354
+ self._set_page_content(f"## Error\n\n{str(request_exception)}")
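
As orientation for the file above, here is a minimal usage sketch of SimpleTextBrowser. It assumes the SerpAPI key lives in a SERPAPI_KEY environment variable (an assumption, not something this commit defines) and that a local "downloads" folder exists; the search query is illustrative.

import os
from app_tools.browser import SimpleTextBrowser

browser = SimpleTextBrowser(
    viewport_size=1024 * 8,
    downloads_folder="downloads",
    serpapi_key=os.environ.get("SERPAPI_KEY"),  # assumed env var name
    request_kwargs={"timeout": 30},
)

# "google:" addresses are routed to SerpAPI; anything else is fetched, converted to
# Markdown, and read one viewport at a time.
print(browser.visit_page("google: agentic text-based web browsers"))
browser.page_down()
print(browser.find_on_page("archive"))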
app_tools/cookies.py ADDED
@@ -0,0 +1,713 @@
1
+ COOKIES_LIST = [
2
+ {
3
+ "domain": ".youtube.com",
4
+ "expirationDate": 1718884961,
5
+ "hostOnly": False,
6
+ "httpOnly": False,
7
+ "name": "ST-xuwub9",
8
+ "path": "/",
9
+ "sameSite": None,
10
+ "secure": False,
11
+ "session": False,
12
+ "storeId": None,
13
+ "value": "session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3"
14
+ },
15
+ {
16
+ "domain": ".youtube.com",
17
+ "expirationDate": 1753004444.745411,
18
+ "hostOnly": False,
19
+ "httpOnly": True,
20
+ "name": "__Secure-YEC",
21
+ "path": "/",
22
+ "sameSite": "lax",
23
+ "secure": True,
24
+ "session": False,
25
+ "storeId": None,
26
+ "value": "CgtRVnI5LW1zRHlQVSjbtNCzBjIhCgJGUhIbEhcSFRMLFBUWFwwYGRobHB0eHw4PIBAREiAk"
27
+ },
28
+ {
29
+ "domain": ".youtube.com",
30
+ "expirationDate": 1753434620.050824,
31
+ "hostOnly": False,
32
+ "httpOnly": True,
33
+ "name": "__Secure-3PSID",
34
+ "path": "/",
35
+ "sameSite": "no_restriction",
36
+ "secure": True,
37
+ "session": False,
38
+ "storeId": None,
39
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB4ezJ_bdWu46a7YwObVn44wACgYKAakSARQSFQHGX2MicJcTzecTKH6bHzqU6TMbTxoVAUF8yKqQYK-MoI6Ql3vI2oYTB3E-0076"
40
+ },
41
+ {
42
+ "domain": ".youtube.com",
43
+ "expirationDate": 1750420959.974642,
44
+ "hostOnly": False,
45
+ "httpOnly": False,
46
+ "name": "SIDCC",
47
+ "path": "/",
48
+ "sameSite": None,
49
+ "secure": False,
50
+ "session": False,
51
+ "storeId": None,
52
+ "value": "AKEyXzWQZauHKOo8t87zoEcjaVNIYUX54ohoWXT-tX4aAhEuZzIIptxZAcNkHuG2oDXYL6t-lw"
53
+ },
54
+ {
55
+ "domain": ".youtube.com",
56
+ "expirationDate": 1753434620.050652,
57
+ "hostOnly": False,
58
+ "httpOnly": False,
59
+ "name": "SID",
60
+ "path": "/",
61
+ "sameSite": None,
62
+ "secure": False,
63
+ "session": False,
64
+ "storeId": None,
65
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB6VHrZcC3gBAsFPbCQ0gF5AACgYKAYkSARQSFQHGX2Mi9kt0gHg5CxCYSkLQGHWaeBoVAUF8yKre_V6r3jZVak6JV4o2Q0FL0076"
66
+ },
67
+ {
68
+ "domain": ".youtube.com",
69
+ "expirationDate": 1750420958.397534,
70
+ "hostOnly": False,
71
+ "httpOnly": True,
72
+ "name": "__Secure-1PSIDTS",
73
+ "path": "/",
74
+ "sameSite": None,
75
+ "secure": True,
76
+ "session": False,
77
+ "storeId": None,
78
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA"
79
+ },
80
+ {
81
+ "domain": ".youtube.com",
82
+ "expirationDate": 1753433494.44729,
83
+ "hostOnly": False,
84
+ "httpOnly": False,
85
+ "name": "_ga_M0180HEFCY",
86
+ "path": "/",
87
+ "sameSite": None,
88
+ "secure": False,
89
+ "session": False,
90
+ "storeId": None,
91
+ "value": "GS1.1.1718871908.1.0.1718873494.0.0.0"
92
+ },
93
+ {
94
+ "domain": ".youtube.com",
95
+ "expirationDate": 1753434620.050933,
96
+ "hostOnly": False,
97
+ "httpOnly": False,
98
+ "name": "SAPISID",
99
+ "path": "/",
100
+ "sameSite": None,
101
+ "secure": True,
102
+ "session": False,
103
+ "storeId": None,
104
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6"
105
+ },
106
+ {
107
+ "domain": ".youtube.com",
108
+ "expirationDate": 1750420959.974764,
109
+ "hostOnly": False,
110
+ "httpOnly": True,
111
+ "name": "__Secure-1PSIDCC",
112
+ "path": "/",
113
+ "sameSite": None,
114
+ "secure": True,
115
+ "session": False,
116
+ "storeId": None,
117
+ "value": "AKEyXzWHDSoXGCZpZhPxRrnC7B1s8zGIUjeMVyvgtQfsm1fs92lXPtFEI_td9LBUyqVUe0xK"
118
+ },
119
+ {
120
+ "domain": ".youtube.com",
121
+ "expirationDate": 1753434620.050881,
122
+ "hostOnly": False,
123
+ "httpOnly": True,
124
+ "name": "SSID",
125
+ "path": "/",
126
+ "sameSite": None,
127
+ "secure": True,
128
+ "session": False,
129
+ "storeId": None,
130
+ "value": "AmlwXHnQvOQ10LVd-"
131
+ },
132
+ {
133
+ "domain": ".youtube.com",
134
+ "expirationDate": 1753434620.050959,
135
+ "hostOnly": False,
136
+ "httpOnly": False,
137
+ "name": "__Secure-1PAPISID",
138
+ "path": "/",
139
+ "sameSite": None,
140
+ "secure": True,
141
+ "session": False,
142
+ "storeId": None,
143
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6"
144
+ },
145
+ {
146
+ "domain": ".youtube.com",
147
+ "expirationDate": 1753434620.050795,
148
+ "hostOnly": False,
149
+ "httpOnly": True,
150
+ "name": "__Secure-1PSID",
151
+ "path": "/",
152
+ "sameSite": None,
153
+ "secure": True,
154
+ "session": False,
155
+ "storeId": None,
156
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBBrlk7lRpKQGywAHEon7WGQAACgYKAQsSARQSFQHGX2MirAmnSRdZl6GPG6KLd4hOihoVAUF8yKoV17Tcj1a_OenIOkf2wBjO0076"
157
+ },
158
+ {
159
+ "domain": ".youtube.com",
160
+ "expirationDate": 1753434620.050993,
161
+ "hostOnly": False,
162
+ "httpOnly": False,
163
+ "name": "__Secure-3PAPISID",
164
+ "path": "/",
165
+ "sameSite": "no_restriction",
166
+ "secure": True,
167
+ "session": False,
168
+ "storeId": None,
169
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6"
170
+ },
171
+ {
172
+ "domain": ".youtube.com",
173
+ "expirationDate": 1750420959.974815,
174
+ "hostOnly": False,
175
+ "httpOnly": True,
176
+ "name": "__Secure-3PSIDCC",
177
+ "path": "/",
178
+ "sameSite": "no_restriction",
179
+ "secure": True,
180
+ "session": False,
181
+ "storeId": None,
182
+ "value": "AKEyXzXM5UjKUEXwSHVmRAIo6hGHA4G63adj3EE1VdNriD0f38jZQbsUKiD4LQbA3BValmTFDg"
183
+ },
184
+ {
185
+ "domain": ".youtube.com",
186
+ "expirationDate": 1750420958.397647,
187
+ "hostOnly": False,
188
+ "httpOnly": True,
189
+ "name": "__Secure-3PSIDTS",
190
+ "path": "/",
191
+ "sameSite": "no_restriction",
192
+ "secure": True,
193
+ "session": False,
194
+ "storeId": None,
195
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA"
196
+ },
197
+ {
198
+ "domain": ".youtube.com",
199
+ "expirationDate": 1753434620.050908,
200
+ "hostOnly": False,
201
+ "httpOnly": False,
202
+ "name": "APISID",
203
+ "path": "/",
204
+ "sameSite": None,
205
+ "secure": False,
206
+ "session": False,
207
+ "storeId": None,
208
+ "value": "IlQWLPjdNqziwCrV/ANG7Z4x5FF-IBxbZk"
209
+ },
210
+ {
211
+ "domain": ".youtube.com",
212
+ "expirationDate": 1753434620.050855,
213
+ "hostOnly": False,
214
+ "httpOnly": True,
215
+ "name": "HSID",
216
+ "path": "/",
217
+ "sameSite": None,
218
+ "secure": False,
219
+ "session": False,
220
+ "storeId": None,
221
+ "value": "AasA7hmRuTFv7vjoq"
222
+ },
223
+ {
224
+ "domain": ".youtube.com",
225
+ "expirationDate": 1753435873.577793,
226
+ "hostOnly": False,
227
+ "httpOnly": True,
228
+ "name": "LOGIN_INFO",
229
+ "path": "/",
230
+ "sameSite": "no_restriction",
231
+ "secure": True,
232
+ "session": False,
233
+ "storeId": None,
234
+ "value": "AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0:QUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3"
235
+ },
236
+ {
237
+ "domain": ".youtube.com",
238
+ "expirationDate": 1753444956.555608,
239
+ "hostOnly": False,
240
+ "httpOnly": False,
241
+ "name": "PREF",
242
+ "path": "/",
243
+ "sameSite": None,
244
+ "secure": True,
245
+ "session": False,
246
+ "storeId": None,
247
+ "value": "f4=4000000&f6=40000000&tz=Europe.Paris&f5=30000&f7=100"
248
+ }
249
+ ]
250
+
251
+ COOKIES_LIST += [
252
+ {
253
+ "domain": ".www.researchgate.net",
254
+ "hostOnly": False,
255
+ "httpOnly": True,
256
+ "name": "isInstIp",
257
+ "path": "/",
258
+ "sameSite": None,
259
+ "secure": True,
260
+ "session": True,
261
+ "storeId": None,
262
+ "value": "False"
263
+ },
264
+ {
265
+ "domain": ".researchgate.net",
266
+ "expirationDate": 1734423981,
267
+ "hostOnly": False,
268
+ "httpOnly": False,
269
+ "name": "__eoi",
270
+ "path": "/",
271
+ "sameSite": None,
272
+ "secure": False,
273
+ "session": False,
274
+ "storeId": None,
275
+ "value": "ID=c26f752377373146:T=1718871981:RT=1718884914:S=AA-AfjZw-T_OOX2kW2LLaFzXImgc"
276
+ },
277
+ {
278
+ "domain": ".www.researchgate.net",
279
+ "expirationDate": 1753444909.646103,
280
+ "hostOnly": False,
281
+ "httpOnly": True,
282
+ "name": "ptc",
283
+ "path": "/",
284
+ "sameSite": None,
285
+ "secure": True,
286
+ "session": False,
287
+ "storeId": None,
288
+ "value": "RG1.8947708639250500550.1718872043"
289
+ },
290
+ {
291
+ "domain": ".researchgate.net",
292
+ "expirationDate": 1750507578,
293
+ "hostOnly": False,
294
+ "httpOnly": False,
295
+ "name": "euconsent-v2-didomi",
296
+ "path": "/",
297
+ "sameSite": "lax",
298
+ "secure": True,
299
+ "session": False,
300
+ "storeId": None,
301
+ "value": "CQAgmoAQAgmoAAHABBENA5EsAP_gAEPgAAYgJ2pB5G5UTWlBIG53YMskIAUFhFBoQEAgAACAAwIBSBIAIIwEAGAAIAgAICACAAIAIBIAIABAGAAAAAAAYIAAIAAIAAAQIAAKIAAAAAAAAgBQAAgIAgggEAAAgEBEABAAgAAAEIIAQNgACgAAACCAAAAAAAABAAAAAAAAQAAAAAAAYCQAAAJIAAAAACAIABAIAAAAAAAAAAAAAAAABBAAIJ2wPIAFAAXABQAFQALgAcAA8ACAAEgALwAZAA0ACIAEcAJgAUgAqgBcADEAGgAPQAfgBEACOAE4AMMAZYA0QBsgDkAHOAO4AfsBBwEIAItARwBHQC6gHUAO2Ae0A_4CHQEXgJ2AUOAo8BT4CpQFqALYAXmAwQBkgDLAGXANjAhCBG8CbAE3gJ1gTtAA.f_wACHwAAAAA"
302
+ },
303
+ {
304
+ "domain": ".researchgate.net",
305
+ "expirationDate": 1718885236,
306
+ "hostOnly": False,
307
+ "httpOnly": False,
308
+ "name": "_gat",
309
+ "path": "/",
310
+ "sameSite": None,
311
+ "secure": False,
312
+ "session": False,
313
+ "storeId": None,
314
+ "value": "1"
315
+ },
316
+ {
317
+ "domain": "www.researchgate.net",
318
+ "expirationDate": 1721477183,
319
+ "hostOnly": True,
320
+ "httpOnly": False,
321
+ "name": "_pbjs_userid_consent_data",
322
+ "path": "/",
323
+ "sameSite": "lax",
324
+ "secure": False,
325
+ "session": False,
326
+ "storeId": None,
327
+ "value": "3524755945110770"
328
+ },
329
+ {
330
+ "domain": ".researchgate.net",
331
+ "expirationDate": 1752567981,
332
+ "hostOnly": False,
333
+ "httpOnly": False,
334
+ "name": "__gads",
335
+ "path": "/",
336
+ "sameSite": None,
337
+ "secure": False,
338
+ "session": False,
339
+ "storeId": None,
340
+ "value": "ID=eca2adb88969c830:T=1718871981:RT=1718884914:S=ALNI_MY2qZchynrhWX6hWMlaI87Pcj9riQ"
341
+ },
342
+ {
343
+ "domain": ".researchgate.net",
344
+ "expirationDate": 1718886709.646173,
345
+ "hostOnly": False,
346
+ "httpOnly": True,
347
+ "name": "__cf_bm",
348
+ "path": "/",
349
+ "sameSite": "no_restriction",
350
+ "secure": True,
351
+ "session": False,
352
+ "storeId": None,
353
+ "value": "IkQ_J4ciBzKQduRvjqsfSmQu8UygDWbHeROO5JVccfo-1718884909-1.0.1.1-qvNGEdbfI0HfhFP6kwe7R7mkTqODNhFuKhs72lLly6K2BOPMG3kbahpQFGvPK0U8FUfkznkq65gngd1sWj7sDA"
354
+ },
355
+ {
356
+ "domain": ".researchgate.net",
357
+ "expirationDate": 1752567981,
358
+ "hostOnly": False,
359
+ "httpOnly": False,
360
+ "name": "__gpi",
361
+ "path": "/",
362
+ "sameSite": None,
363
+ "secure": False,
364
+ "session": False,
365
+ "storeId": None,
366
+ "value": "UID=00000e4e9aa2e6f2:T=1718871981:RT=1718884914:S=ALNI_MYFNrgzkKn7K6Bd2y8hC6GJCvDiSg"
367
+ },
368
+ {
369
+ "domain": ".researchgate.net",
370
+ "hostOnly": False,
371
+ "httpOnly": True,
372
+ "name": "_cfuvid",
373
+ "path": "/",
374
+ "sameSite": "no_restriction",
375
+ "secure": True,
376
+ "session": True,
377
+ "storeId": None,
378
+ "value": "_GPmGZkBymiH3UiqTqzakEpi98br3nfFUWC2_u_wqkc-1718884909785-0.0.1.1-604800000"
379
+ },
380
+ {
381
+ "domain": ".researchgate.net",
382
+ "expirationDate": 1753445177.271667,
383
+ "hostOnly": False,
384
+ "httpOnly": False,
385
+ "name": "_ga",
386
+ "path": "/",
387
+ "sameSite": None,
388
+ "secure": False,
389
+ "session": False,
390
+ "storeId": None,
391
+ "value": "GA1.1.1525244793.1718885177"
392
+ },
393
+ {
394
+ "domain": ".researchgate.net",
395
+ "expirationDate": 1753445177.271482,
396
+ "hostOnly": False,
397
+ "httpOnly": False,
398
+ "name": "_ga_4P31SJ70EJ",
399
+ "path": "/",
400
+ "sameSite": None,
401
+ "secure": False,
402
+ "session": False,
403
+ "storeId": None,
404
+ "value": "GS1.1.1718885177.1.0.1718885177.0.0.0"
405
+ },
406
+ {
407
+ "domain": ".researchgate.net",
408
+ "expirationDate": 1718971576,
409
+ "hostOnly": False,
410
+ "httpOnly": False,
411
+ "name": "_gid",
412
+ "path": "/",
413
+ "sameSite": None,
414
+ "secure": False,
415
+ "session": False,
416
+ "storeId": None,
417
+ "value": "GA1.2.854907463.1718885177"
418
+ },
419
+ {
420
+ "domain": ".www.researchgate.net",
421
+ "expirationDate": 1750407982.506505,
422
+ "hostOnly": False,
423
+ "httpOnly": True,
424
+ "name": "did",
425
+ "path": "/",
426
+ "sameSite": None,
427
+ "secure": True,
428
+ "session": False,
429
+ "storeId": None,
430
+ "value": "1dWLO3C6am8l667Q4VUlBo0O1LI49Qi2Vw21SJEXHavBDYT56DI9007W5rYGVFVH"
431
+ },
432
+ {
433
+ "domain": ".researchgate.net",
434
+ "expirationDate": 1750507578,
435
+ "hostOnly": False,
436
+ "httpOnly": False,
437
+ "name": "didomi_token",
438
+ "path": "/",
439
+ "sameSite": "lax",
440
+ "secure": True,
441
+ "session": False,
442
+ "storeId": None,
443
+ "value": "eyJ1c2VyX2lkIjoiMTkwMzU4YTUtNWU2My02Y2UzLWJlNzAtZGFjNzVmYjdiY2ExIiwiY3JlYXRlZCI6IjIwMjQtMDYtMjBUMTI6MDY6MTYuODA2WiIsInVwZGF0ZWQiOiIyMDI0LTA2LTIwVDEyOjA2OjE4Ljc4MVoiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsidHdpdHRlciIsImdvb2dsZSIsImM6bGlua2VkaW4tbWFya2V0aW5nLXNvbHV0aW9ucyIsImM6b3duZXJpcSIsImM6b21uaXR1cmUtYWRvYmUtYW5hbHl0aWNzIiwiYzp0ZWNobm9yYXRpLW1lZGlhIiwiYzppbnRlcmNvbSIsImM6aW50ZW50LWlxIiwiYzppcHJvbSIsImM6bGlua2VkaW4iLCJjOmFtYXpvbmFkdi16Y1hGTEI2WCIsImM6bWVkaWFuZXQtY1V3YUtFNnoiLCJjOmluZGV4ZXhjaC1OWkNRTTY4UCIsImM6emVvdGFwZ21iLWQ3YndtdGp3IiwiYzp0cmlwbGVsaWYtZGRKSDM0clkiLCJjOnJ0YmhvdXNlLWI4Y2RIOHRNIiwiYzptZHByaW1pcy1lYU4yOVdjUCIsImM6bG9vcG1lbGktVGRhWXRCUHEiLCJjOm1hZ25pdGVpbi05d1RZTHFSRCIsImM6Ymlkc3dpdGNoLWQ2N0V3N1c5IiwiYzpvcmFjbGVhZHYtcUhlREptQUwiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiIsImM6bG90YW1lc29sLURIaTdMUmpNIiwiYzpuZXh0bWlsbGUtR0pyZlg4VWMiLCJjOm5yaWNodGVjLXFVVlEyUlFxIiwiYzpicml0ZXBvb2wtQldWeVdHeVUiLCJjOnRhcGFkaW5jLXFxY2tVN1BXIiwiYzppZDV0ZWNobi16Tk1KNGR3ZiIsImM6bWljcm9zb2Z0IiwiYzpwZXJtdXRpdmUtSjdpaHJlTWsiLCJjOm9wZXJhc29mdC1CY1hjRFZKTSIsImM6cG9zdGhvZy1Cakp4RmRGOSJdfSwicHVycG9zZXMiOnsiZW5hYmxlZCI6WyJnZW9sb2NhdGlvbl9kYXRhIiwiZGV2aWNlX2NoYXJhY3RlcmlzdGljcyJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSIsImM6b3BlcmFzb2Z0LUJjWGNEVkpNIl19LCJ2ZXJzaW9uIjoyLCJhYyI6IkRIU0FvQUZrQWNnQTVnSHFnUUhBeGdCNndEMTRJR0FRTkFqMEJJd0NTY0VyQUtCd1YtZ3MxQmgwREc0R09nQUEuREhTQW9BRmtBY2dBNWdIcWdRSEF4Z0I2d0QxNElHQVFOQWowQkl3Q1NjRXJBS0J3Vi1nczFCaDBERzRHT2dBQSJ9"
444
+ },
445
+ {
446
+ "domain": ".www.researchgate.net",
447
+ "hostOnly": False,
448
+ "httpOnly": True,
449
+ "name": "hasPdpNext",
450
+ "path": "/",
451
+ "sameSite": None,
452
+ "secure": True,
453
+ "session": True,
454
+ "storeId": None,
455
+ "value": "False"
456
+ },
457
+ {
458
+ "domain": ".researchgate.net",
459
+ "expirationDate": 1750421183,
460
+ "hostOnly": False,
461
+ "httpOnly": False,
462
+ "name": "ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog",
463
+ "path": "/",
464
+ "sameSite": "lax",
465
+ "secure": True,
466
+ "session": False,
467
+ "storeId": None,
468
+ "value": "%7B%22distinct_id%22%3A%220190358a-56a1-7313-83b0-d13dddeac787%22%2C%22%24sesid%22%3A%5B1718885183223%2C%220190358a-56a1-7313-83b0-d13b2b87778d%22%2C1718885176993%5D%2C%22%24session_is_sampled%22%3Atrue%7D"
469
+ },
470
+ {
471
+ "domain": ".www.researchgate.net",
472
+ "hostOnly": False,
473
+ "httpOnly": True,
474
+ "name": "sid",
475
+ "path": "/",
476
+ "sameSite": None,
477
+ "secure": True,
478
+ "session": True,
479
+ "storeId": None,
480
+ "value": "qmH5Lc4f0CUJ3zeaxORcV0S8I8V1MuCFZtcIQqPYtv1XPejrbSLAQRbT50PL40TqeKQ1XsQDWt9gtYVzuL80bRmPjw6jn3cQ0ikNqW40maHcQ3JL2Vfa8ZZf0j7p35eJ"
481
+ }
482
+ ]
483
+
484
+ COOKIES_LIST += [
485
+ {
486
+ "domain": "github.com",
487
+ "hostOnly": True,
488
+ "httpOnly": True,
489
+ "name": "_gh_sess",
490
+ "path": "/",
491
+ "sameSite": "lax",
492
+ "secure": True,
493
+ "session": True,
494
+ "storeId": None,
495
+ "value": "P%2Fmof1avuqwHaUQUIJR%2FZYn7jqbT7lgGuTGjp1BGAFIG5UpNDusEE3b8dRjz0eATE5xPdPjLYFqMs%2FI9AOalKX4YuYfSEEnxCMawU01099b4o9Xzzcv%2BmecrmO0Q8q%2Bdq1h8SIv6nvPP7HzlFesl8ysafb9b%2F0q6dTArKdSOurasza8UgLSYD08ofA50Pcm0IG7CTzF8ZCizrGgGTMi%2F%2B7L3E17jav5PM1Sf2vQKg15Gbg1QIOppJJHzlufgQoZigqFv%2BWznaws0Tt7Y2lSFCw%3D%3D--CJRhqMXJnwOaJgk4--DhUErlL4GdROikEjKD4O9g%3D%3D"
496
+ },
497
+ {
498
+ "domain": ".github.com",
499
+ "expirationDate": 1750408875.763785,
500
+ "hostOnly": False,
501
+ "httpOnly": False,
502
+ "name": "_octo",
503
+ "path": "/",
504
+ "sameSite": "lax",
505
+ "secure": True,
506
+ "session": False,
507
+ "storeId": None,
508
+ "value": "GH1.1.728652011.1718872875"
509
+ },
510
+ {
511
+ "domain": ".github.com",
512
+ "expirationDate": 1750408875.763926,
513
+ "hostOnly": False,
514
+ "httpOnly": True,
515
+ "name": "logged_in",
516
+ "path": "/",
517
+ "sameSite": "lax",
518
+ "secure": True,
519
+ "session": False,
520
+ "storeId": None,
521
+ "value": "no"
522
+ },
523
+ {
524
+ "domain": ".github.com",
525
+ "hostOnly": False,
526
+ "httpOnly": False,
527
+ "name": "preferred_color_mode",
528
+ "path": "/",
529
+ "sameSite": "lax",
530
+ "secure": True,
531
+ "session": True,
532
+ "storeId": None,
533
+ "value": "dark"
534
+ },
535
+ {
536
+ "domain": ".github.com",
537
+ "hostOnly": False,
538
+ "httpOnly": False,
539
+ "name": "tz",
540
+ "path": "/",
541
+ "sameSite": "lax",
542
+ "secure": True,
543
+ "session": True,
544
+ "storeId": None,
545
+ "value": "Europe%2FParis"
546
+ }
547
+ ]
548
+
549
+ COOKIES_LIST += [
550
+ {
551
+ "domain": ".web.archive.org",
552
+ "expirationDate": 1718886430,
553
+ "hostOnly": False,
554
+ "httpOnly": False,
555
+ "name": "_gat",
556
+ "path": "/web/20201123221659/http://orcid.org/",
557
+ "sameSite": None,
558
+ "secure": False,
559
+ "session": False,
560
+ "storeId": None,
561
+ "value": "1"
562
+ },
563
+ {
564
+ "domain": ".web.archive.org",
565
+ "expirationDate": 1718972770,
566
+ "hostOnly": False,
567
+ "httpOnly": False,
568
+ "name": "_gid",
569
+ "path": "/web/20201123221659/http://orcid.org/",
570
+ "sameSite": None,
571
+ "secure": False,
572
+ "session": False,
573
+ "storeId": None,
574
+ "value": "GA1.2.402246368.1606169825"
575
+ },
576
+ {
577
+ "domain": ".web.archive.org",
578
+ "expirationDate": 1753446370.315621,
579
+ "hostOnly": False,
580
+ "httpOnly": False,
581
+ "name": "_ga",
582
+ "path": "/web/20201123221659/http://orcid.org/",
583
+ "sameSite": None,
584
+ "secure": False,
585
+ "session": False,
586
+ "storeId": None,
587
+ "value": "GA1.2.1301409987.1606169825"
588
+ },
589
+ {
590
+ "domain": ".web.archive.org",
591
+ "expirationDate": 1750422367,
592
+ "hostOnly": False,
593
+ "httpOnly": False,
594
+ "name": "_hjid",
595
+ "path": "/web/20201123221659/http://orcid.org/",
596
+ "sameSite": "lax",
597
+ "secure": False,
598
+ "session": False,
599
+ "storeId": None,
600
+ "value": "07f80263-a631-4bf4-8ffd-8fc8912085e2"
601
+ },
602
+ {
603
+ "domain": ".web.archive.org",
604
+ "expirationDate": 1718888167,
605
+ "hostOnly": False,
606
+ "httpOnly": False,
607
+ "name": "_hjFirstSeen",
608
+ "path": "/web/20201123221659/http://orcid.org/",
609
+ "sameSite": "lax",
610
+ "secure": False,
611
+ "session": False,
612
+ "storeId": None,
613
+ "value": "1"
614
+ }
615
+ ]
616
+ COOKIES_LIST += [
617
+ {
618
+ "domain": "orcid.org",
619
+ "hostOnly": True,
620
+ "httpOnly": False,
621
+ "name": "AWSELBCORS",
622
+ "path": "/",
623
+ "sameSite": "no_restriction",
624
+ "secure": True,
625
+ "session": True,
626
+ "storeId": None,
627
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F"
628
+ },
629
+ {
630
+ "domain": ".orcid.org",
631
+ "expirationDate": 1753452454.637671,
632
+ "hostOnly": False,
633
+ "httpOnly": False,
634
+ "name": "_ga_9R61FWK9H5",
635
+ "path": "/",
636
+ "sameSite": None,
637
+ "secure": False,
638
+ "session": False,
639
+ "storeId": None,
640
+ "value": "GS1.1.1718892454.1.0.1718892454.0.0.0"
641
+ },
642
+ {
643
+ "domain": ".orcid.org",
644
+ "expirationDate": 1753452454.63421,
645
+ "hostOnly": False,
646
+ "httpOnly": False,
647
+ "name": "_ga",
648
+ "path": "/",
649
+ "sameSite": None,
650
+ "secure": False,
651
+ "session": False,
652
+ "storeId": None,
653
+ "value": "GA1.1.2021310691.1718892455"
654
+ },
655
+ {
656
+ "domain": "orcid.org",
657
+ "hostOnly": True,
658
+ "httpOnly": False,
659
+ "name": "AWSELB",
660
+ "path": "/",
661
+ "sameSite": None,
662
+ "secure": False,
663
+ "session": True,
664
+ "storeId": None,
665
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F"
666
+ },
667
+ {
668
+ "domain": ".orcid.org",
669
+ "expirationDate": 1750428454,
670
+ "hostOnly": False,
671
+ "httpOnly": False,
672
+ "name": "OptanonAlertBoxClosed",
673
+ "path": "/",
674
+ "sameSite": "lax",
675
+ "secure": False,
676
+ "session": False,
677
+ "storeId": None,
678
+ "value": "2024-06-20T14:07:34.583Z"
679
+ },
680
+ {
681
+ "domain": ".orcid.org",
682
+ "expirationDate": 1750428454,
683
+ "hostOnly": False,
684
+ "httpOnly": False,
685
+ "name": "OptanonConsent",
686
+ "path": "/",
687
+ "sameSite": "lax",
688
+ "secure": False,
689
+ "session": False,
690
+ "storeId": None,
691
+ "value": "isGpcEnabled=0&datestamp=Thu+Jun+20+2024+16%3A07%3A34+GMT%2B0200+(heure+d%E2%80%99%C3%A9t%C3%A9+d%E2%80%99Europe+centrale)&version=202310.2.0&browserGpcFlag=0&isIABGlobal=False&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1"
692
+ },
693
+ {
694
+ "domain": "orcid.org",
695
+ "hostOnly": True,
696
+ "httpOnly": False,
697
+ "name": "XSRF-TOKEN",
698
+ "path": "/",
699
+ "sameSite": None,
700
+ "secure": True,
701
+ "session": True,
702
+ "storeId": None,
703
+ "value": "6957be7a-bcb4-4d59-a522-ea9b6b210ed9"
704
+ }
705
+ ]
706
+ from requests.cookies import RequestsCookieJar
707
+
708
+ # Create a RequestsCookieJar instance
709
+ COOKIES = RequestsCookieJar()
710
+
711
+ # Add cookies to the jar
712
+ for cookie in COOKIES_LIST:
713
+ COOKIES.set(cookie['name'], cookie['value'], domain=cookie['domain'], path=cookie['path'])
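
These cookies are what SimpleTextBrowser attaches to every request: its constructor copies COOKIES into request_kwargs["cookies"], which is then passed through to requests.get. A minimal sketch of the equivalent direct call, with an illustrative URL:

import requests
from app_tools.cookies import COOKIES

# The same jar the browser uses; useful for one-off fetches of pages that expect these cookies.
response = requests.get("https://www.researchgate.net/search", cookies=COOKIES, timeout=30)
print(response.status_code)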
app_tools/mdconvert.py ADDED
@@ -0,0 +1,659 @@
1
+ # ruff: noqa: E722
2
+ import json
3
+ import os
4
+ import requests
5
+ import re
6
+ import markdownify
7
+ import mimetypes
8
+ import html
9
+ import puremagic
10
+ import tempfile
11
+ import copy
12
+ import mammoth
13
+ import pptx
14
+ import pandas as pd
15
+ import traceback
16
+
17
+ from urllib.parse import urlparse, parse_qs
18
+ from bs4 import BeautifulSoup
19
+ from typing import Any, Dict, List, Optional, Union
20
+ import pdfminer
21
+ import pdfminer.high_level
22
+ from youtube_transcript_api import YouTubeTranscriptApi
23
+
24
+
25
+
26
+ class DocumentConverterResult:
27
+ """The result of converting a document to text."""
28
+
29
+ def __init__(self, title: Union[str, None] = None, text_content: str = ""):
30
+ self.title = title
31
+ self.text_content = text_content
32
+
33
+
34
+ class DocumentConverter:
35
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
36
+ raise NotImplementedError()
37
+
38
+
39
+ class PlainTextConverter(DocumentConverter):
40
+ """Anything with content type text/plain"""
41
+
42
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
43
+ extension = kwargs.get("file_extension", "")
44
+ if extension == "":
45
+ return None
46
+
47
+ content_type, encoding = mimetypes.guess_type("__placeholder" + extension)
48
+
49
+ text_content = ""
50
+ with open(local_path, "rt") as fh:
51
+ text_content = fh.read()
52
+
53
+ return DocumentConverterResult(
54
+ title=None,
55
+ text_content=text_content,
56
+ )
57
+
58
+
59
+ class HtmlConverter(DocumentConverter):
60
+ """Anything with content type text/html"""
61
+
62
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
63
+ # Bail if not html
64
+ extension = kwargs.get("file_extension", "")
65
+ if extension.lower() not in [".html", ".htm"]:
66
+ return None
67
+
68
+ result = None
69
+ with open(local_path, "rt") as fh:
70
+ result = self._convert(fh.read())
71
+
72
+ return result
73
+
74
+ def _convert(self, html_content) -> Union[None, DocumentConverterResult]:
75
+ """Helper function that converts an HTML string."""
76
+
77
+ # Parse the string
78
+ soup = BeautifulSoup(html_content, "html.parser")
79
+
80
+ # Remove javascript and style blocks
81
+ for script in soup(["script", "style"]):
82
+ script.extract()
83
+
84
+ # Print only the main content
85
+ body_elm = soup.find("body")
86
+ webpage_text = ""
87
+ if body_elm:
88
+ webpage_text = markdownify.MarkdownConverter().convert_soup(body_elm)
89
+ else:
90
+ webpage_text = markdownify.MarkdownConverter().convert_soup(soup)
91
+
92
+ return DocumentConverterResult(
93
+ title=None if soup.title is None else soup.title.string,
94
+ text_content=webpage_text,
95
+ )
96
+
97
+
98
+ class WikipediaConverter(DocumentConverter):
99
+ """Handle Wikipedia pages separately, focusing only on the main document content."""
100
+
101
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
102
+ # Bail if not Wikipedia
103
+ extension = kwargs.get("file_extension", "")
104
+ if extension.lower() not in [".html", ".htm"]:
105
+ return None
106
+ url = kwargs.get("url", "")
107
+ if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
108
+ return None
109
+
110
+ # Parse the file
111
+ soup = None
112
+ with open(local_path, "rt") as fh:
113
+ soup = BeautifulSoup(fh.read(), "html.parser")
114
+
115
+ # Remove javascript and style blocks
116
+ for script in soup(["script", "style"]):
117
+ script.extract()
118
+
119
+ # Print only the main content
120
+ body_elm = soup.find("div", {"id": "mw-content-text"})
121
+ title_elm = soup.find("span", {"class": "mw-page-title-main"})
122
+
123
+ webpage_text = ""
124
+ if body_elm:
125
+ # What's the title
126
+ main_title = soup.title.string
127
+ if title_elm and len(title_elm) > 0:
128
+ main_title = title_elm.string
129
+
130
+ # Convert the page
131
+ webpage_text = "# " + main_title + "\n\n" + markdownify.MarkdownConverter().convert_soup(body_elm)
132
+ else:
133
+ webpage_text = markdownify.MarkdownConverter().convert_soup(soup)
134
+
135
+ return DocumentConverterResult(
136
+ title=soup.title.string,
137
+ text_content=webpage_text,
138
+ )
139
+
140
+
141
+ class YouTubeConverter(DocumentConverter):
142
+ """Handle YouTube specially, focusing on the video title, description, and transcript."""
143
+
144
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
145
+ # Bail if not YouTube
146
+ extension = kwargs.get("file_extension", "")
147
+ if extension.lower() not in [".html", ".htm"]:
148
+ return None
149
+ url = kwargs.get("url", "")
150
+ if not url.startswith("https://www.youtube.com/watch?"):
151
+ return None
152
+
153
+ # Parse the file
154
+ soup = None
155
+ with open(local_path, "rt") as fh:
156
+ soup = BeautifulSoup(fh.read(), "html.parser")
157
+
158
+ # Read the meta tags
159
+ metadata = {"title": soup.title.string}
160
+ for meta in soup(["meta"]):
161
+ for a in meta.attrs:
162
+ if a in ["itemprop", "property", "name"]:
163
+ metadata[meta[a]] = meta.get("content", "")
164
+ break
165
+
166
+ # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
167
+ try:
168
+ for script in soup(["script"]):
169
+ content = script.text
170
+ if "ytInitialData" in content:
171
+ lines = re.split(r"\r?\n", content)
172
+ obj_start = lines[0].find("{")
173
+ obj_end = lines[0].rfind("}")
174
+ if obj_start >= 0 and obj_end >= 0:
175
+ data = json.loads(lines[0][obj_start : obj_end + 1])
176
+ attrdesc = self._findKey(data, "attributedDescriptionBodyText")
177
+ if attrdesc:
178
+ metadata["description"] = attrdesc["content"]
179
+ break
180
+ except:
181
+ pass
182
+
183
+ # Start preparing the page
184
+ webpage_text = "# YouTube\n"
185
+
186
+ title = self._get(metadata, ["title", "og:title", "name"])
187
+ if title:
188
+ webpage_text += f"\n## {title}\n"
189
+
190
+ stats = ""
191
+ views = self._get(metadata, ["interactionCount"])
192
+ if views:
193
+ stats += f"- **Views:** {views}\n"
194
+
195
+ keywords = self._get(metadata, ["keywords"])
196
+ if keywords:
197
+ stats += f"- **Keywords:** {keywords}\n"
198
+
199
+ runtime = self._get(metadata, ["duration"])
200
+ if runtime:
201
+ stats += f"- **Runtime:** {runtime}\n"
202
+
203
+ if len(stats) > 0:
204
+ webpage_text += f"\n### Video Metadata\n{stats}\n"
205
+
206
+ description = self._get(metadata, ["description", "og:description"])
207
+ if description:
208
+ webpage_text += f"\n### Description\n{description}\n"
209
+
210
+ transcript_text = ""
211
+ parsed_url = urlparse(url)
212
+ params = parse_qs(parsed_url.query)
213
+
214
+ video_id = params["v"][0]
215
+ # Must be a single transcript.
216
+ print("Video ID:", video_id)
217
+ transcript = YouTubeTranscriptApi.get_transcript(video_id)
218
+ transcript_text = " ".join([part["text"] for part in transcript])
219
+ # Alternative formatting:
220
+ # formatter = TextFormatter()
221
+ # formatter.format_transcript(transcript)
222
+ if transcript_text:
223
+ webpage_text += f"\n### Transcript\n{transcript_text}\n"
224
+
225
+ return DocumentConverterResult(
226
+ title=title if title else soup.title.string,
227
+ text_content=webpage_text,
228
+ )
229
+
230
+ def _get(self, json, keys, default=None):
231
+ for k in keys:
232
+ if k in json:
233
+ return json[k]
234
+ return default
235
+
236
+ def _findKey(self, json, key):
237
+ if isinstance(json, list):
238
+ for elm in json:
239
+ ret = self._findKey(elm, key)
240
+ if ret is not None:
241
+ return ret
242
+ elif isinstance(json, dict):
243
+ for k in json:
244
+ if k == key:
245
+ return json[k]
246
+ else:
247
+ ret = self._findKey(json[k], key)
248
+ if ret is not None:
249
+ return ret
250
+ return None
251
+
252
+
253
+ class PdfConverter(DocumentConverter):
254
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
255
+ # Bail if not a PDF
256
+ extension = kwargs.get("file_extension", "")
257
+ if extension.lower() != ".pdf":
258
+ return None
259
+
260
+ return DocumentConverterResult(
261
+ title=None,
262
+ text_content=pdfminer.high_level.extract_text(local_path),
263
+ )
264
+
265
+ from huggingface_hub import InferenceClient
266
+ class AudioConverter(DocumentConverter):
267
+ def __init__(self):
268
+ super().__init__()
269
+ self.client = InferenceClient("distil-whisper/distil-large-v3")
270
+
271
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
272
+ # Bail if not an audio file
273
+ extension = kwargs.get("file_extension", "")
274
+ if extension.lower() not in [".wav", ".mp3", ".flac", ".m4a"]:
275
+ return None
276
+ try:
277
+ result = self.client.automatic_speech_recognition(audio=local_path).text
278
+ except Exception as e:
279
+ print("Exception in decoding audio:", e)
280
+ from openai import OpenAI
281
+ oai_client = OpenAI()
282
+ from pathlib import Path
283
+ result = oai_client.audio.transcriptions.create(
284
+ model="whisper-1",
285
+ file=Path(local_path)
286
+ ).text
287
+
288
+ return DocumentConverterResult(
289
+ title=None,
290
+ text_content=result,
291
+ )
292
+
293
+
294
+ class DocxConverter(HtmlConverter):
295
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
296
+ # Bail if not a DOCX
297
+ extension = kwargs.get("file_extension", "")
298
+ if extension.lower() != ".docx":
299
+ return None
300
+
301
+ result = None
302
+ with open(local_path, "rb") as docx_file:
303
+ result = mammoth.convert_to_html(docx_file)
304
+ html_content = result.value
305
+ result = self._convert(html_content)
306
+
307
+ return result
308
+
309
+
310
+ class XlsxConverter(HtmlConverter):
311
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
312
+ # Bail if not a XLSX
313
+ extension = kwargs.get("file_extension", "")
314
+
315
+ if extension.lower() not in [".xlsx", ".xls"]:
316
+ return None
317
+
318
+ sheets = pd.read_excel(local_path, sheet_name=None)
319
+ md_content = ""
320
+ for s in sheets:
321
+ md_content += f"## {s}\n"
322
+ html_content = sheets[s].to_html(index=False)
323
+ md_content += self._convert(html_content).text_content.strip() + "\n\n"
324
+
325
+ return DocumentConverterResult(
326
+ title=None,
327
+ text_content=md_content.strip(),
328
+ )
329
+
330
+
331
+ import xml.etree.ElementTree as ET
332
+ class XmlConverter(DocumentConverter):
333
+ def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
334
+ # Parse the XML string
335
+ extension = kwargs.get("file_extension", "")
336
+
337
+ if extension.lower() not in [".xml"]:
338
+ return None
339
+
340
+ xml_string = ""
341
+ with open(local_path, "rt") as fh:
342
+ xml_string = fh.read()
343
+
344
+ def extract_table_from_html_like(xml_root):
345
+ table = xml_root.find('.//table')
346
+ if table is None:
347
+ raise ValueError("No table found in the XML")
348
+
349
+ headers = [th.text for th in table.find('thead').findall('th')]
350
+ rows = [[td.text for td in tr.findall('td')] for tr in table.find('tbody').findall('tr')]
351
+
352
+ # Create markdown table
353
+ markdown = '| ' + ' | '.join(headers) + ' |\n'
354
+ markdown += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
355
+ for row in rows:
356
+ markdown += '| ' + ' | '.join(row) + ' |\n'
+ return markdown
+
358
+ def extract_table_from_wordml(xml_root, namespaces):
359
+ # Parse the XML content
360
+ root = xml_root
361
+ namespace = {'w': 'http://schemas.microsoft.com/office/word/2003/wordml'}
362
+
363
+ # Extract text content
364
+ body = root.find('w:body', namespace)
365
+ paragraphs = body.findall('.//w:p', namespace)
366
+ text_content = []
367
+ for para in paragraphs:
368
+ texts = para.findall('.//w:t', namespace)
369
+ for text in texts:
370
+ text_content.append(text.text)
371
+
372
+ return '\n'.join(text_content)
373
+
374
+ # Parse the XML string
375
+ root = ET.fromstring(xml_string)
376
+ namespaces = {'w': 'http://schemas.microsoft.com/office/word/2003/wordml'}
377
+
378
+ if root.tag.endswith('wordDocument'):
379
+ markdown = extract_table_from_wordml(root, namespaces)
380
+ else:
381
+ markdown = extract_table_from_html_like(root)
382
+
383
+ return DocumentConverterResult(
384
+ title=None,
385
+ text_content=markdown.strip(),
386
+ )
387
+
388
+ class PptxConverter(HtmlConverter):
+     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+         # Bail if not a PPTX
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() != ".pptx":
+             return None
+
+         md_content = ""
+
+         presentation = pptx.Presentation(local_path)
+         slide_num = 0
+         for slide in presentation.slides:
+             slide_num += 1
+
+             md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
+
+             title = slide.shapes.title
+             for shape in slide.shapes:
+                 # Pictures
+                 if self._is_picture(shape):
+                     # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
+                     alt_text = ""
+                     try:
+                         alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
+                     except:
+                         pass
+
+                     # A placeholder name
+                     filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                     # try:
+                     #     filename = shape.image.filename
+                     # except:
+                     #     pass
+
+                     md_content += "\n![" + (alt_text if alt_text else shape.name) + "](" + filename + ")\n"
+
+                 # Tables
+                 if self._is_table(shape):
+                     html_table = "<html><body><table>"
+                     first_row = True
+                     for row in shape.table.rows:
+                         html_table += "<tr>"
+                         for cell in row.cells:
+                             if first_row:
+                                 html_table += "<th>" + html.escape(cell.text) + "</th>"
+                             else:
+                                 html_table += "<td>" + html.escape(cell.text) + "</td>"
+                         html_table += "</tr>"
+                         first_row = False
+                     html_table += "</table></body></html>"
+                     md_content += "\n" + self._convert(html_table).text_content.strip() + "\n"
+
+                 # Text areas
+                 elif shape.has_text_frame:
+                     if shape == title:
+                         md_content += "# " + shape.text.lstrip() + " "
+                     else:
+                         md_content += shape.text + " "
+
+             md_content = md_content.strip()
+
+             if slide.has_notes_slide:
+                 md_content += "\n\n### Notes:\n"
+                 notes_frame = slide.notes_slide.notes_text_frame
+                 if notes_frame is not None:
+                     md_content += notes_frame.text
+                 md_content = md_content.strip()
+
+         return DocumentConverterResult(
+             title=None,
+             text_content=md_content.strip(),
+         )
+
+     def _is_picture(self, shape):
+         if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
+             return True
+         if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
+             if hasattr(shape, "image"):
+                 return True
+         return False
+
+     def _is_table(self, shape):
+         if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
+             return True
+         return False
+
+ class FileConversionException(Exception):
+     pass
+
+ class UnsupportedFormatException(Exception):
+     pass
+
+ class MarkdownConverter:
+     """(In preview) An extremely simple text-based document reader, suitable for LLM use.
+     This reader will convert common file-types or webpages to Markdown."""
+
+     def __init__(
+         self,
+         requests_session: Optional[requests.Session] = None,
+     ):
+         if requests_session is None:
+             self._requests_session = requests.Session()
+         else:
+             self._requests_session = requests_session
+
+         self._page_converters: List[DocumentConverter] = []
+
+         # Register converters for successful browsing operations
+         # Later registrations are tried first / take higher priority than earlier registrations
+         # To this end, the most specific converters should appear below the most generic converters
+         self.register_page_converter(WikipediaConverter())
+         self.register_page_converter(XmlConverter())
+         self.register_page_converter(YouTubeConverter())
+         self.register_page_converter(DocxConverter())
+         self.register_page_converter(XlsxConverter())
+         self.register_page_converter(PptxConverter())
+         # self.register_page_converter(ImageConverter())
+         self.register_page_converter(PdfConverter())
+         self.register_page_converter(AudioConverter())
+         self.register_page_converter(HtmlConverter())
+         self.register_page_converter(PlainTextConverter())
+
+     def convert(self, source, **kwargs):
+         """
+         Args:
+             - source: can be a string representing a path or url, or a requests.response object
+             - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
+         """
+
+         # Local path or url
+         if isinstance(source, str):
+             if source.startswith("http://") or source.startswith("https://") or source.startswith("file://"):
+                 return self.convert_url(source, **kwargs)
+             else:
+                 return self.convert_local(source, **kwargs)
+         # Request response
+         elif isinstance(source, requests.Response):
+             return self.convert_response(source, **kwargs)
+
+     def convert_local(self, path, **kwargs):
+         # Prepare a list of extensions to try (in order of priority)
+         ext = kwargs.get("file_extension")
+         extensions = [ext] if ext is not None else []
+
+         # Get extension alternatives from the path and puremagic
+         base, ext = os.path.splitext(path)
+         self._append_ext(extensions, ext)
+         self._append_ext(extensions, self._guess_ext_magic(path))
+
+         # Convert
+         return self._convert(path, extensions, **kwargs)
+
+     def convert_url(self, url, **kwargs):
+         # Send a HTTP request to the URL
+         user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
+         response = self._requests_session.get(url, stream=True, headers={"User-Agent": user_agent})
+         response.raise_for_status()
+         return self.convert_response(response, **kwargs)
+
+     def convert_response(self, response, **kwargs):
+         # Prepare a list of extensions to try (in order of priority)
+         ext = kwargs.get("file_extension")
+         extensions = [ext] if ext is not None else []
+
+         # Guess from the mimetype
+         content_type = response.headers.get("content-type", "").split(";")[0]
+         self._append_ext(extensions, mimetypes.guess_extension(content_type))
+
+         # Read the content disposition if there is one
+         content_disposition = response.headers.get("content-disposition", "")
+         m = re.search(r"filename=([^;]+)", content_disposition)
+         if m:
+             base, ext = os.path.splitext(m.group(1).strip("\"'"))
+             self._append_ext(extensions, ext)
+
+         # Read the extension from the path
+         base, ext = os.path.splitext(urlparse(response.url).path)
+         self._append_ext(extensions, ext)
+
+         # Save the file locally to a temporary file. It will be deleted before this method exits
+         handle, temp_path = tempfile.mkstemp()
+         fh = os.fdopen(handle, "wb")
+         result = None
+         try:
+             # Download the file
+             for chunk in response.iter_content(chunk_size=512):
+                 fh.write(chunk)
+             fh.close()
+
+             # Use puremagic to check for more extension options
+             self._append_ext(extensions, self._guess_ext_magic(temp_path))
+
+             # Convert
+             result = self._convert(temp_path, extensions, url=response.url)
+         except Exception as e:
+             print(f"Error in converting: {e}")
+
+         # Clean up
+         finally:
+             try:
+                 fh.close()
+             except:
+                 pass
+             os.unlink(temp_path)
+
+         return result
+
+     def _convert(self, local_path, extensions, **kwargs):
+         error_trace = ""
+         for ext in extensions:
+             for converter in self._page_converters:
+                 _kwargs = copy.deepcopy(kwargs)
+                 _kwargs.update({"file_extension": ext})
+                 # If we hit an error log it and keep trying
+                 try:
+                     res = converter.convert(local_path, **_kwargs)
+                     if res is not None:
+                         # Normalize the content
+                         res.text_content = "\n".join([line.rstrip() for line in re.split(r"\r?\n", res.text_content)])
+                         res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
+
+                         # Todo
+                         return res
+                 except Exception as e:
+                     error_trace = ("\n\n" + traceback.format_exc()).strip()
+
+         # If we got this far without success, report any exceptions
+         if len(error_trace) > 0:
+             raise FileConversionException(
+                 f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
+             )
+
+         # Nothing can handle it!
+         # raise UnsupportedFormatException(
+         #     f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
+         # )
+         res = PlainTextConverter().convert(local_path, **kwargs)
+         return res
+
+     def _append_ext(self, extensions, ext):
+         """Append a unique non-None, non-empty extension to a list of extensions."""
+         if ext is None:
+             return
+         ext = ext.strip()
+         if ext == "":
+             return
+         # if ext not in extensions:
+         if True:
+             extensions.append(ext)
+
+     def _guess_ext_magic(self, path):
+         """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
+         # Use puremagic to guess
+         try:
+             guesses = puremagic.magic_file(path)
+             if len(guesses) > 0:
+                 ext = guesses[0].extension.strip()
+                 if len(ext) > 0:
+                     return ext
+         except FileNotFoundError:
+             pass
+         except IsADirectoryError:
+             pass
+         except PermissionError:
+             pass
+         return None
+
+     def register_page_converter(self, converter: DocumentConverter) -> None:
+         """Register a page text converter."""
+         self._page_converters.append(converter)
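
For orientation, here is a minimal usage sketch of the converter above (illustrative only, not part of the commit; the import path app_tools.mdconvert and the file name slides.pptx are assumptions based on the file layout in this commit):

# Sketch: exercising MarkdownConverter directly, assuming the module lands at app_tools/mdconvert.py
from app_tools.mdconvert import MarkdownConverter

converter = MarkdownConverter()

# A hypothetical local deck; the PptxConverter registered above handles the .pptx extension
deck = converter.convert("slides.pptx")
print(deck.text_content[:500])

# URLs are routed through convert_url/convert_response, with the extension guessed
# from the mimetype, the content-disposition header, the URL path, and puremagic
page = converter.convert("https://en.wikipedia.org/wiki/Markdown")
print(page.title, len(page.text_content))
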
app_tools/web_search_tools.py ADDED
@@ -0,0 +1,209 @@
+ # Shamelessly stolen from Microsoft Autogen team and HuggingFace: thanks to them for this great resource!
+ # https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
+ # https://github.com/aymeric-roucher/GAIA/blob/a66aefc857d484a051a5eb66b49575dfaadff266/scripts/tools/web_surfer.py
+
+ import os
+ import re
+ from typing import Tuple, Optional
+ from smolagents import Tool
+ import time
+ from dotenv import load_dotenv
+ import requests
+ from pypdf import PdfReader
+ from markdownify import markdownify as md
+ import mimetypes
+ from .browser import SimpleTextBrowser
+
+ load_dotenv(override=True)
+
+ user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
+
+ browser_config = {
+     "viewport_size": 1024 * 5,
+     "downloads_folder": "coding",
+     "request_kwargs": {
+         "headers": {"User-Agent": user_agent},
+         "timeout": 300,
+     },
+ }
+
+ browser_config["serpapi_key"] = os.environ["SERPER_API_KEY"]
+
+ browser = SimpleTextBrowser(**browser_config)
+
+ # Helper functions
+ def _browser_state() -> Tuple[str, str]:
+     header = f"Address: {browser.address}\n"
+     if browser.page_title is not None:
+         header += f"Title: {browser.page_title}\n"
+
+     current_page = browser.viewport_current_page
+     total_pages = len(browser.viewport_pages)
+
+     address = browser.address
+     for i in range(len(browser.history) - 2, -1, -1):  # Start from the second last
+         if browser.history[i][0] == address:
+             header += f"You previously visited this page {round(time.time() - browser.history[i][1])} seconds ago.\n"
+             break
+
+     header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n"
+     return (header, browser.viewport)
+
+
+ class SearchInformationTool(Tool):
+     name = "informational_web_search"
+     description = "Perform an INFORMATIONAL web search query then return the search results."
+     inputs = {
+         "query": {
+             "type": "string",
+             "description": "The informational web search query to perform."
+         }
+     }
+     inputs["filter_year"] = {
+         "type": "string",
+         "description": "[Optional parameter]: filter the search results to only include pages from a specific year. For example, '2020' will only include pages from 2020. Make sure to use this parameter if you're trying to search for articles from a specific date!",
+         "nullable": True
+     }
+     output_type = "string"
+
+     def forward(self, query: str, filter_year: Optional[int] = None) -> str:
+         browser.visit_page(f"google: {query}", filter_year=filter_year)
+         header, content = _browser_state()
+         return header.strip() + "\n=======================\n" + content
+
+
+ class NavigationalSearchTool(Tool):
+     name = "navigational_web_search"
+     description = "Perform a NAVIGATIONAL web search query then immediately navigate to the top result. Useful, for example, to navigate to a particular Wikipedia article or other known destination. Equivalent to Google's \"I'm Feeling Lucky\" button."
+     inputs = {"query": {"type": "string", "description": "The navigational web search query to perform."}}
+     output_type = "string"
+
+     def forward(self, query: str) -> str:
+         browser.visit_page(f"google: {query}")
+
+         # Extract the first line
+         m = re.search(r"\[.*?\]\((http.*?)\)", browser.page_content)
+         if m:
+             browser.visit_page(m.group(1))
+
+         # Return where we ended up
+         header, content = _browser_state()
+         return header.strip() + "\n=======================\n" + content
+
+
+ class VisitTool(Tool):
+     name = "visit_page"
+     description = "Visit a webpage at a given URL and return its text."
+     inputs = {"url": {"type": "string", "description": "The relative or absolute url of the webpage to visit."}}
+     output_type = "string"
+
+     def forward(self, url: str) -> str:
+         browser.visit_page(url)
+         header, content = _browser_state()
+         return header.strip() + "\n=======================\n" + content
+
+
+ class DownloadTool(Tool):
+     name = "download_file"
+     description = """
+ Download a file at a given URL. The file should be of this format: [".xlsx", ".pptx", ".wav", ".mp3", ".png", ".docx"]
+ After using this tool, for further inspection of this page you should return the download path to your manager via final_answer, and they will be able to inspect it.
+ DO NOT use this tool for .pdf or .txt or .htm files: for these types of files use visit_page with the file url instead."""
+     inputs = {"url": {"type": "string", "description": "The relative or absolute url of the file to be downloaded."}}
+     output_type = "string"
+
+     def forward(self, url: str) -> str:
+         if "arxiv" in url:
+             url = url.replace("abs", "pdf")
+         response = requests.get(url)
+         content_type = response.headers.get("content-type", "")
+         extension = mimetypes.guess_extension(content_type)
+         if extension and isinstance(extension, str):
+             new_path = f"./downloads/file{extension}"
+         else:
+             new_path = "./downloads/file.object"
+
+         with open(new_path, "wb") as f:
+             f.write(response.content)
+
+         if "pdf" in extension or "txt" in extension or "htm" in extension:
+             raise Exception("Do not use this tool for pdf or txt or html files: use visit_page instead.")
+
+         return f"File was downloaded and saved under path {new_path}."
+
+
+ class PageUpTool(Tool):
+     name = "page_up"
+     description = "Scroll the viewport UP one page-length in the current webpage and return the new viewport content."
+     output_type = "string"
+     inputs = {}
+
+     def forward(self) -> str:
+         browser.page_up()
+         header, content = _browser_state()
+         return header.strip() + "\n=======================\n" + content
+
+ class ArchiveSearchTool(Tool):
+     name = "find_archived_url"
+     description = "Given a url, searches the Wayback Machine and returns the archived version of the url that's closest in time to the desired date."
+     inputs = {
+         "url": {"type": "string", "description": "The url you need the archive for."},
+         "date": {"type": "string", "description": "The date that you want to find the archive for. Give this date in the format 'YYYYMMDD', for instance '27 June 2008' is written as '20080627'."}
+     }
+     output_type = "string"
+
+     def forward(self, url, date) -> str:
+         archive_url = f"https://archive.org/wayback/available?url={url}&timestamp={date}"
+         response = requests.get(archive_url).json()
+         try:
+             closest = response["archived_snapshots"]["closest"]
+         except:
+             raise Exception("Your url was not archived on Wayback Machine, try a different url.")
+         target_url = closest["url"]
+         browser.visit_page(target_url)
+         header, content = _browser_state()
+         return f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n" + header.strip() + "\n=======================\n" + content
+
+
+ class PageDownTool(Tool):
+     name = "page_down"
+     description = "Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content."
+     output_type = "string"
+     inputs = {}
+
+     def forward(self) -> str:
+         browser.page_down()
+         header, content = _browser_state()
+         return header.strip() + "\n=======================\n" + content
+
+
+ class FinderTool(Tool):
+     name = "find_on_page_ctrl_f"
+     description = "Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F."
+     inputs = {"search_string": {"type": "string", "description": "The string to search for on the page. This search string supports wildcards like '*'"}}
+     output_type = "string"
+
+     def forward(self, search_string: str) -> str:
+         find_result = browser.find_on_page(search_string)
+         header, content = _browser_state()
+
+         if find_result is None:
+             return header.strip() + f"\n=======================\nThe search string '{search_string}' was not found on this page."
+         else:
+             return header.strip() + "\n=======================\n" + content
+
+
+ class FindNextTool(Tool):
+     name = "find_next"
+     description = "Scroll the viewport to the next occurrence of the search string. This is equivalent to finding the next match in a Ctrl+F search."
+     inputs = {}
+     output_type = "string"
+
+     def forward(self) -> str:
+         find_result = browser.find_next()
+         header, content = _browser_state()
+
+         if find_result is None:
+             return header.strip() + "\n=======================\nThe search string was not found on this page."
+         else:
+             return header.strip() + "\n=======================\n" + content
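
For review context, a sketch of how these tools can be driven directly (not part of the commit; it assumes SERPER_API_KEY is set in the environment, since browser_config reads it at import time):

# Sketch: the tools all share the module-level SimpleTextBrowser, so navigation state carries over between calls
from app_tools.web_search_tools import SearchInformationTool, VisitTool, PageDownTool

search = SearchInformationTool()
print(search.forward(query="GAIA benchmark", filter_year="2024"))

visit = VisitTool()
print(visit.forward(url="https://en.wikipedia.org/wiki/Web_scraping"))

# Scroll the shared browser one viewport down and print the new content
print(PageDownTool().forward())
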
requirements.txt CHANGED
@@ -12,4 +12,12 @@ huggingface_hub
  markdownify
  transformers
  gradio_tools
- langchain
+ langchain
+ pypdf
+ pathvalidate
+ puremagic
+ mammoth
+ python-pptx
+ pdfminer.six
+ youtube-transcript-api
+ google-search-results
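
The added packages back the new converters and tools: pypdf and pdfminer.six for PDFs, mammoth for .docx, python-pptx for .pptx, puremagic for file-type sniffing, pathvalidate for the browser's download paths, youtube-transcript-api for YouTube pages, and google-search-results for SerpAPI. A quick import check after pip install -r requirements.txt, as a sketch, since several distributions import under a different module name:

# Sketch: map each new requirement to the module name it is imported as, and check it resolves
import importlib.util

modules = {
    "pypdf": "pypdf",
    "pathvalidate": "pathvalidate",
    "puremagic": "puremagic",
    "mammoth": "mammoth",
    "python-pptx": "pptx",
    "pdfminer.six": "pdfminer",
    "youtube-transcript-api": "youtube_transcript_api",
    "google-search-results": "serpapi",
}
for requirement, module in modules.items():
    status = "OK" if importlib.util.find_spec(module) else "MISSING"
    print(f"{requirement:25s} -> {module:25s} {status}")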