abidlabs HF Staff commited on
Commit
f5a2c54
·
verified ·
1 Parent(s): aaa3fc6

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. __init__.py +172 -0
  3. __pycache__/__init__.cpython-310.pyc +0 -0
  4. __pycache__/__init__.cpython-311.pyc +0 -0
  5. __pycache__/__init__.cpython-312.pyc +0 -0
  6. __pycache__/cli.cpython-311.pyc +0 -0
  7. __pycache__/cli.cpython-312.pyc +0 -0
  8. __pycache__/commit_scheduler.cpython-311.pyc +0 -0
  9. __pycache__/commit_scheduler.cpython-312.pyc +0 -0
  10. __pycache__/context.cpython-312.pyc +0 -0
  11. __pycache__/context_vars.cpython-311.pyc +0 -0
  12. __pycache__/context_vars.cpython-312.pyc +0 -0
  13. __pycache__/deploy.cpython-310.pyc +0 -0
  14. __pycache__/deploy.cpython-311.pyc +0 -0
  15. __pycache__/deploy.cpython-312.pyc +0 -0
  16. __pycache__/dummy_commit_scheduler.cpython-310.pyc +0 -0
  17. __pycache__/dummy_commit_scheduler.cpython-311.pyc +0 -0
  18. __pycache__/dummy_commit_scheduler.cpython-312.pyc +0 -0
  19. __pycache__/file_storage.cpython-311.pyc +0 -0
  20. __pycache__/file_storage.cpython-312.pyc +0 -0
  21. __pycache__/imports.cpython-311.pyc +0 -0
  22. __pycache__/imports.cpython-312.pyc +0 -0
  23. __pycache__/media.cpython-311.pyc +0 -0
  24. __pycache__/media.cpython-312.pyc +0 -0
  25. __pycache__/run.cpython-310.pyc +0 -0
  26. __pycache__/run.cpython-311.pyc +0 -0
  27. __pycache__/run.cpython-312.pyc +0 -0
  28. __pycache__/sqlite_storage.cpython-310.pyc +0 -0
  29. __pycache__/sqlite_storage.cpython-311.pyc +0 -0
  30. __pycache__/sqlite_storage.cpython-312.pyc +0 -0
  31. __pycache__/storage.cpython-312.pyc +0 -0
  32. __pycache__/typehints.cpython-311.pyc +0 -0
  33. __pycache__/typehints.cpython-312.pyc +0 -0
  34. __pycache__/types.cpython-312.pyc +0 -0
  35. __pycache__/ui.cpython-310.pyc +0 -0
  36. __pycache__/ui.cpython-311.pyc +0 -0
  37. __pycache__/ui.cpython-312.pyc +0 -0
  38. __pycache__/utils.cpython-310.pyc +0 -0
  39. __pycache__/utils.cpython-311.pyc +0 -0
  40. __pycache__/utils.cpython-312.pyc +0 -0
  41. assets/trackio_logo_dark.png +0 -0
  42. assets/trackio_logo_light.png +0 -0
  43. assets/trackio_logo_old.png +3 -0
  44. assets/trackio_logo_type_dark.png +0 -0
  45. assets/trackio_logo_type_dark_transparent.png +0 -0
  46. assets/trackio_logo_type_light.png +0 -0
  47. assets/trackio_logo_type_light_transparent.png +0 -0
  48. cli.py +26 -0
  49. commit_scheduler.py +392 -0
  50. context_vars.py +15 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/trackio_logo_old.png filter=lfs diff=lfs merge=lfs -text
__init__.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import warnings
3
+ import webbrowser
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ from gradio_client import Client
8
+
9
+ from trackio import context_vars, deploy, utils
10
+ from trackio.imports import import_csv, import_tf_events
11
+ from trackio.media import TrackioImage
12
+ from trackio.run import Run
13
+ from trackio.sqlite_storage import SQLiteStorage
14
+ from trackio.ui import demo
15
+ from trackio.utils import TRACKIO_DIR, TRACKIO_LOGO_DIR
16
+
17
# Package version is kept in a sibling version.txt file (single source of truth).
__version__ = Path(__file__).parent.joinpath("version.txt").read_text().strip()

# Public API of the trackio package.
__all__ = ["init", "log", "finish", "show", "import_csv", "import_tf_events", "Image"]

# Alias so callers can use trackio.Image (wandb-style naming) for TrackioImage.
Image = TrackioImage


# Module-level config dict; init() rebinds this to the active run's config.
config = {}
25
+
26
+
27
def init(
    project: str,
    name: str | None = None,
    space_id: str | None = None,
    dataset_id: str | None = None,
    config: dict | None = None,
    resume: str = "never",
    settings: Any = None,
) -> Run:
    """
    Creates a new Trackio project and returns a Run object.

    Args:
        project: The name of the project (can be an existing project to continue tracking or a new project to start tracking from scratch).
        name: The name of the run (if not provided, a default name will be generated).
        space_id: If provided, the project will be logged to a Hugging Face Space instead of a local directory. Should be a complete Space name like "username/reponame" or "orgname/reponame", or just "reponame" in which case the Space will be created in the currently-logged-in Hugging Face user's namespace. If the Space does not exist, it will be created. If the Space already exists, the project will be logged to it.
        dataset_id: If a space_id is provided, a persistent Hugging Face Dataset will be created and the metrics will be synced to it every 5 minutes. Specify a Dataset with name like "username/datasetname" or "orgname/datasetname", or "datasetname" (uses currently-logged-in Hugging Face user's namespace), or None (uses the same name as the Space but with the "_dataset" suffix). If the Dataset does not exist, it will be created. If the Dataset already exists, the project will be appended to it.
        config: A dictionary of configuration options. Provided for compatibility with wandb.init()
        resume: Controls how to handle resuming a run. Can be one of:
            - "must": Must resume the run with the given name, raises error if run doesn't exist
            - "allow": Resume the run if it exists, otherwise create a new run
            - "never": Never resume a run, always create a new one
        settings: Not used. Provided for compatibility with wandb.init()

    Returns:
        A `Run` object, also stored in `context_vars.current_run` for use by `log()`/`finish()`.

    Raises:
        ValueError: if `dataset_id` is given without `space_id`, if `resume` is not one of
            "must"/"allow"/"never", or if `resume="must"` and the named run does not exist.
    """
    if settings is not None:
        warnings.warn(
            "* Warning: settings is not used. Provided for compatibility with wandb.init(). Please create an issue at: https://github.com/gradio-app/trackio/issues if you need a specific feature implemented."
        )

    # A Dataset is only meaningful as a sync target for a Space deployment.
    if space_id is None and dataset_id is not None:
        raise ValueError("Must provide a `space_id` when `dataset_id` is provided.")
    space_id, dataset_id = utils.preprocess_space_and_dataset_ids(space_id, dataset_id)
    url = context_vars.current_server.get()

    # Lazily start (or reuse) the logging backend: a local Gradio server when
    # running locally, or the Space id itself when logging to a Space.
    if url is None:
        if space_id is None:
            _, url, _ = demo.launch(
                show_api=False,
                inline=False,
                quiet=True,
                prevent_thread_lock=True,
                show_error=True,
            )
        else:
            url = space_id
        context_vars.current_server.set(url)

    # Only print setup messages (and create the Space) the first time this
    # project is initialized in the current context.
    if (
        context_vars.current_project.get() is None
        or context_vars.current_project.get() != project
    ):
        print(f"* Trackio project initialized: {project}")

        if dataset_id is not None:
            # Read downstream by the storage layer to know where to sync metrics.
            os.environ["TRACKIO_DATASET_ID"] = dataset_id
            print(
                f"* Trackio metrics will be synced to Hugging Face Dataset: {dataset_id}"
            )
        if space_id is None:
            print(f"* Trackio metrics logged to: {TRACKIO_DIR}")
            utils.print_dashboard_instructions(project)
        else:
            deploy.create_space_if_not_exists(space_id, dataset_id)
            print(
                f"* View dashboard by going to: {deploy.SPACE_URL.format(space_id=space_id)}"
            )
    context_vars.current_project.set(project)

    # A local server gets a gradio_client; Space-backed runs log without one.
    client = None
    if not space_id:
        client = Client(url, verbose=False)

    if resume == "must":
        if name is None:
            raise ValueError("Must provide a run name when resume='must'")
        if name not in SQLiteStorage.get_runs(project):
            raise ValueError(f"Run '{name}' does not exist in project '{project}'")
    elif resume == "allow":
        if name is not None and name in SQLiteStorage.get_runs(project):
            print(f"* Resuming existing run: {name}")
    elif resume == "never":
        # NOTE(review): silently discards the requested name if it already
        # exists (a fresh default name is generated instead) — no warning is
        # emitted; confirm this is the intended UX.
        if name is not None and name in SQLiteStorage.get_runs(project):
            name = None
    else:
        raise ValueError("resume must be one of: 'must', 'allow', or 'never'")

    run = Run(
        url=url,
        project=project,
        client=client,
        name=name,
        config=config,
        space_id=space_id,
    )
    context_vars.current_run.set(run)
    # Rebind the module-level `config` to this run's config (the local `config`
    # parameter shadows the global inside this function, hence globals()).
    globals()["config"] = run.config
    return run
124
+
125
+
126
def log(metrics: dict, step: int | None = None) -> None:
    """
    Logs metrics to the current run.

    Args:
        metrics: A dictionary of metrics to log.
        step: The step number. If not provided, the step will be incremented automatically.

    Raises:
        RuntimeError: if no run is active (i.e. `trackio.init()` was never called).
    """
    active_run = context_vars.current_run.get()
    if active_run is None:
        raise RuntimeError("Call trackio.init() before trackio.log().")
    active_run.log(metrics=metrics, step=step)
141
+
142
+
143
def finish():
    """
    Finishes the current run.

    Raises:
        RuntimeError: if no run is active (i.e. `trackio.init()` was never called).
    """
    active_run = context_vars.current_run.get()
    if active_run is None:
        raise RuntimeError("Call trackio.init() before trackio.finish().")
    active_run.finish()
151
+
152
+
153
def show(project: str | None = None):
    """
    Launches the Trackio dashboard.

    Args:
        project: The name of the project whose runs to show. If not provided, all projects will be shown and the user can select one.
    """
    _, local_url, share_url = demo.launch(
        show_api=False,
        quiet=True,
        inline=False,
        prevent_thread_lock=True,
        favicon_path=TRACKIO_LOGO_DIR / "trackio_logo_light.png",
        allowed_paths=[TRACKIO_LOGO_DIR],
    )
    # Prefer the public share link (with a trailing slash) when one exists.
    if share_url:
        base_url = share_url + "/"
    else:
        base_url = local_url
    # Deep-link straight to the requested project via a query parameter.
    if project:
        dashboard_url = base_url + f"?project={project}"
    else:
        dashboard_url = base_url
    print(f"* Trackio UI launched at: {dashboard_url}")
    webbrowser.open(dashboard_url)
    utils.block_except_in_notebook()
__pycache__/__init__.cpython-310.pyc ADDED
Binary file (4.96 kB). View file
 
__pycache__/__init__.cpython-311.pyc ADDED
Binary file (8.77 kB). View file
 
__pycache__/__init__.cpython-312.pyc ADDED
Binary file (8.12 kB). View file
 
__pycache__/cli.cpython-311.pyc ADDED
Binary file (1.25 kB). View file
 
__pycache__/cli.cpython-312.pyc ADDED
Binary file (1.11 kB). View file
 
__pycache__/commit_scheduler.cpython-311.pyc ADDED
Binary file (20.3 kB). View file
 
__pycache__/commit_scheduler.cpython-312.pyc ADDED
Binary file (18.7 kB). View file
 
__pycache__/context.cpython-312.pyc ADDED
Binary file (440 Bytes). View file
 
__pycache__/context_vars.cpython-311.pyc ADDED
Binary file (840 Bytes). View file
 
__pycache__/context_vars.cpython-312.pyc ADDED
Binary file (763 Bytes). View file
 
__pycache__/deploy.cpython-310.pyc ADDED
Binary file (1.72 kB). View file
 
__pycache__/deploy.cpython-311.pyc ADDED
Binary file (7.44 kB). View file
 
__pycache__/deploy.cpython-312.pyc ADDED
Binary file (6.6 kB). View file
 
__pycache__/dummy_commit_scheduler.cpython-310.pyc ADDED
Binary file (936 Bytes). View file
 
__pycache__/dummy_commit_scheduler.cpython-311.pyc ADDED
Binary file (1.19 kB). View file
 
__pycache__/dummy_commit_scheduler.cpython-312.pyc ADDED
Binary file (1.01 kB). View file
 
__pycache__/file_storage.cpython-311.pyc ADDED
Binary file (3.14 kB). View file
 
__pycache__/file_storage.cpython-312.pyc ADDED
Binary file (2.77 kB). View file
 
__pycache__/imports.cpython-311.pyc ADDED
Binary file (12.9 kB). View file
 
__pycache__/imports.cpython-312.pyc ADDED
Binary file (11.8 kB). View file
 
__pycache__/media.cpython-311.pyc ADDED
Binary file (6.39 kB). View file
 
__pycache__/media.cpython-312.pyc ADDED
Binary file (5.78 kB). View file
 
__pycache__/run.cpython-310.pyc ADDED
Binary file (1.01 kB). View file
 
__pycache__/run.cpython-311.pyc ADDED
Binary file (7.61 kB). View file
 
__pycache__/run.cpython-312.pyc ADDED
Binary file (6.93 kB). View file
 
__pycache__/sqlite_storage.cpython-310.pyc ADDED
Binary file (5.37 kB). View file
 
__pycache__/sqlite_storage.cpython-311.pyc ADDED
Binary file (22.4 kB). View file
 
__pycache__/sqlite_storage.cpython-312.pyc ADDED
Binary file (18.8 kB). View file
 
__pycache__/storage.cpython-312.pyc ADDED
Binary file (4.6 kB). View file
 
__pycache__/typehints.cpython-311.pyc ADDED
Binary file (1.06 kB). View file
 
__pycache__/typehints.cpython-312.pyc ADDED
Binary file (855 Bytes). View file
 
__pycache__/types.cpython-312.pyc ADDED
Binary file (531 Bytes). View file
 
__pycache__/ui.cpython-310.pyc ADDED
Binary file (7.83 kB). View file
 
__pycache__/ui.cpython-311.pyc ADDED
Binary file (29.9 kB). View file
 
__pycache__/ui.cpython-312.pyc ADDED
Binary file (25.6 kB). View file
 
__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.62 kB). View file
 
__pycache__/utils.cpython-311.pyc ADDED
Binary file (16.1 kB). View file
 
__pycache__/utils.cpython-312.pyc ADDED
Binary file (14.6 kB). View file
 
assets/trackio_logo_dark.png ADDED
assets/trackio_logo_light.png ADDED
assets/trackio_logo_old.png ADDED

Git LFS Details

  • SHA256: 3922c4d1e465270ad4d8abb12023f3beed5d9f7f338528a4c0ac21dcf358a1c8
  • Pointer size: 131 Bytes
  • Size of remote file: 487 kB
assets/trackio_logo_type_dark.png ADDED
assets/trackio_logo_type_dark_transparent.png ADDED
assets/trackio_logo_type_light.png ADDED
assets/trackio_logo_type_light_transparent.png ADDED
cli.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ from trackio import show
4
+
5
+
6
def main():
    """Entry point for the Trackio command-line interface.

    Currently supports a single subcommand, ``show``, which launches the
    dashboard UI (optionally scoped to one project via ``--project``).
    Any other invocation prints the help text.
    """
    parser = argparse.ArgumentParser(description="Trackio CLI")
    commands = parser.add_subparsers(dest="command")

    show_cmd = commands.add_parser(
        "show", help="Show the Trackio dashboard UI for a project"
    )
    show_cmd.add_argument(
        "--project", required=False, help="Project name to show in the dashboard"
    )

    parsed = parser.parse_args()

    if parsed.command != "show":
        parser.print_help()
    else:
        show(parsed.project)


if __name__ == "__main__":
    main()
commit_scheduler.py ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Originally copied from https://github.com/huggingface/huggingface_hub/blob/d0a948fc2a32ed6e557042a95ef3e4af97ec4a7c/src/huggingface_hub/_commit_scheduler.py
2
+
3
+ import atexit
4
+ import logging
5
+ import os
6
+ import time
7
+ from concurrent.futures import Future
8
+ from dataclasses import dataclass
9
+ from io import SEEK_END, SEEK_SET, BytesIO
10
+ from pathlib import Path
11
+ from threading import Lock, Thread
12
+ from typing import Callable, Dict, List, Optional, Union
13
+
14
+ from huggingface_hub.hf_api import (
15
+ DEFAULT_IGNORE_PATTERNS,
16
+ CommitInfo,
17
+ CommitOperationAdd,
18
+ HfApi,
19
+ )
20
+ from huggingface_hub.utils import filter_repo_objects
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
@dataclass(frozen=True)
class _FileToUpload:
    """Temporary dataclass to store info about files to upload. Not meant to be used directly."""

    local_path: Path  # absolute path of the file on disk
    path_in_repo: str  # destination path inside the repo (path_in_repo prefix + relative path)
    size_limit: int  # file size in bytes at scan time (cap for partial uploads)
    last_modified: float  # st_mtime at scan time; recorded after a successful commit
+
34
+
35
class CommitScheduler:
    """
    Scheduler to upload a local folder to the Hub at regular intervals (e.g. push to hub every 5 minutes).

    The recommended way to use the scheduler is to use it as a context manager. This ensures that the scheduler is
    properly stopped and the last commit is triggered when the script ends. The scheduler can also be stopped manually
    with the `stop` method. Checkout the [upload guide](https://huggingface.co/docs/huggingface_hub/guides/upload#scheduled-uploads)
    to learn more about how to use it.

    Args:
        repo_id (`str`):
            The id of the repo to commit to.
        folder_path (`str` or `Path`):
            Path to the local folder to upload regularly.
        every (`int` or `float`, *optional*):
            The number of minutes between each commit. Defaults to 5 minutes.
        path_in_repo (`str`, *optional*):
            Relative path of the directory in the repo, for example: `"checkpoints/"`. Defaults to the root folder
            of the repository.
        repo_type (`str`, *optional*):
            The type of the repo to commit to. Defaults to `model`.
        revision (`str`, *optional*):
            The revision of the repo to commit to. Defaults to `main`.
        private (`bool`, *optional*):
            Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists.
        token (`str`, *optional*):
            The token to use to commit to the repo. Defaults to the token saved on the machine.
        allow_patterns (`List[str]` or `str`, *optional*):
            If provided, only files matching at least one pattern are uploaded.
        ignore_patterns (`List[str]` or `str`, *optional*):
            If provided, files matching any of the patterns are not uploaded.
        squash_history (`bool`, *optional*):
            Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
            useful to avoid degraded performances on the repo when it grows too large.
        hf_api (`HfApi`, *optional*):
            The [`HfApi`] client to use to commit to the Hub. Can be set with custom settings (user agent, token,...).
        on_before_commit (`Callable[[], None]`, *optional*):
            If specified, a function that will be called before the CommitScheduler lists files to create a commit.

    Example:
    ```py
    >>> from pathlib import Path
    >>> from huggingface_hub import CommitScheduler

    # Scheduler uploads every 10 minutes
    >>> csv_path = Path("watched_folder/data.csv")
    >>> CommitScheduler(repo_id="test_scheduler", repo_type="dataset", folder_path=csv_path.parent, every=10)

    >>> with csv_path.open("a") as f:
    ...     f.write("first line")

    # Some time later (...)
    >>> with csv_path.open("a") as f:
    ...     f.write("second line")
    ```

    Example using a context manager:
    ```py
    >>> from pathlib import Path
    >>> from huggingface_hub import CommitScheduler

    >>> with CommitScheduler(repo_id="test_scheduler", repo_type="dataset", folder_path="watched_folder", every=10) as scheduler:
    ...     csv_path = Path("watched_folder/data.csv")
    ...     with csv_path.open("a") as f:
    ...         f.write("first line")
    ...     (...)
    ...     with csv_path.open("a") as f:
    ...         f.write("second line")

    # Scheduler is now stopped and last commit have been triggered
    ```
    """

    def __init__(
        self,
        *,
        repo_id: str,
        folder_path: Union[str, Path],
        every: Union[int, float] = 5,
        path_in_repo: Optional[str] = None,
        repo_type: Optional[str] = None,
        revision: Optional[str] = None,
        private: Optional[bool] = None,
        token: Optional[str] = None,
        allow_patterns: Optional[Union[List[str], str]] = None,
        ignore_patterns: Optional[Union[List[str], str]] = None,
        squash_history: bool = False,
        hf_api: Optional["HfApi"] = None,
        on_before_commit: Optional[Callable[[], None]] = None,
    ) -> None:
        self.api = hf_api or HfApi(token=token)
        self.on_before_commit = on_before_commit

        # Folder
        self.folder_path = Path(folder_path).expanduser().resolve()
        self.path_in_repo = path_in_repo or ""
        self.allow_patterns = allow_patterns

        if ignore_patterns is None:
            ignore_patterns = []
        elif isinstance(ignore_patterns, str):
            ignore_patterns = [ignore_patterns]
        self.ignore_patterns = ignore_patterns + DEFAULT_IGNORE_PATTERNS

        if self.folder_path.is_file():
            raise ValueError(
                f"'folder_path' must be a directory, not a file: '{self.folder_path}'."
            )
        self.folder_path.mkdir(parents=True, exist_ok=True)

        # Repository
        repo_url = self.api.create_repo(
            repo_id=repo_id, private=private, repo_type=repo_type, exist_ok=True
        )
        self.repo_id = repo_url.repo_id
        self.repo_type = repo_type
        self.revision = revision
        self.token = token

        # Keep track of already uploaded files (key is local path, value is timestamp)
        self.last_uploaded: Dict[Path, float] = {}

        # Scheduler
        if not every > 0:
            raise ValueError(f"'every' must be a positive integer, not '{every}'.")
        self.lock = Lock()
        self.every = every
        self.squash_history = squash_history

        # Fix: initialize the stop flag *before* starting the background thread
        # and registering the atexit hook. Both `_run_scheduler` and
        # `_push_to_hub` read `self.__stopped`; previously the flag was
        # assigned last, so a commit triggered immediately after thread start
        # could race __init__ and raise AttributeError.
        self.__stopped = False

        logger.info(
            f"Scheduled job to push '{self.folder_path}' to '{self.repo_id}' every {self.every} minutes."
        )
        self._scheduler_thread = Thread(target=self._run_scheduler, daemon=True)
        self._scheduler_thread.start()
        # Push any pending changes one last time when the interpreter exits.
        atexit.register(self._push_to_hub)

    def stop(self) -> None:
        """Stop the scheduler.

        A stopped scheduler cannot be restarted. Mostly for tests purposes.
        """
        self.__stopped = True

    def __enter__(self) -> "CommitScheduler":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Upload last changes before exiting
        self.trigger().result()
        self.stop()
        return

    def _run_scheduler(self) -> None:
        """Dumb thread waiting between each scheduled push to Hub."""
        while True:
            self.last_future = self.trigger()
            time.sleep(self.every * 60)
            if self.__stopped:
                break

    def trigger(self) -> Future:
        """Trigger a `push_to_hub` and return a future.

        This method is automatically called every `every` minutes. You can also call it manually to trigger a commit
        immediately, without waiting for the next scheduled commit.
        """
        # run_as_future serializes commits in a background queue, avoiding
        # concurrent pushes to the same repo.
        return self.api.run_as_future(self._push_to_hub)

    def _push_to_hub(self) -> Optional[CommitInfo]:
        if self.__stopped:  # If stopped, already scheduled commits are ignored
            return None

        logger.info("(Background) scheduled commit triggered.")
        try:
            value = self.push_to_hub()
            if self.squash_history:
                logger.info("(Background) squashing repo history.")
                self.api.super_squash_history(
                    repo_id=self.repo_id, repo_type=self.repo_type, branch=self.revision
                )
            return value
        except Exception as e:
            logger.error(
                f"Error while pushing to Hub: {e}"
            )  # Depending on the setup, error might be silenced
            raise

    def push_to_hub(self) -> Optional[CommitInfo]:
        """
        Push folder to the Hub and return the commit info.

        <Tip warning={true}>

        This method is not meant to be called directly. It is run in the background by the scheduler, respecting a
        queue mechanism to avoid concurrent commits. Making a direct call to the method might lead to concurrency
        issues.

        </Tip>

        The default behavior of `push_to_hub` is to assume an append-only folder. It lists all files in the folder and
        uploads only changed files. If no changes are found, the method returns without committing anything. If you want
        to change this behavior, you can inherit from [`CommitScheduler`] and override this method. This can be useful
        for example to compress data together in a single file before committing. For more details and examples, check
        out our [integration guide](https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#scheduled-uploads).
        """
        # Check files to upload (with lock)
        with self.lock:
            if self.on_before_commit is not None:
                self.on_before_commit()

            logger.debug("Listing files to upload for scheduled commit.")

            # List files from folder (taken from `_prepare_upload_folder_additions`)
            relpath_to_abspath = {
                path.relative_to(self.folder_path).as_posix(): path
                for path in sorted(
                    self.folder_path.glob("**/*")
                )  # sorted to be deterministic
                if path.is_file()
            }
            prefix = f"{self.path_in_repo.strip('/')}/" if self.path_in_repo else ""

            # Filter with pattern + filter out unchanged files + retrieve current file size
            files_to_upload: List[_FileToUpload] = []
            for relpath in filter_repo_objects(
                relpath_to_abspath.keys(),
                allow_patterns=self.allow_patterns,
                ignore_patterns=self.ignore_patterns,
            ):
                local_path = relpath_to_abspath[relpath]
                stat = local_path.stat()
                # A file is re-uploaded whenever its mtime differs from the one
                # recorded at the last successful commit.
                if (
                    self.last_uploaded.get(local_path) is None
                    or self.last_uploaded[local_path] != stat.st_mtime
                ):
                    files_to_upload.append(
                        _FileToUpload(
                            local_path=local_path,
                            path_in_repo=prefix + relpath,
                            size_limit=stat.st_size,
                            last_modified=stat.st_mtime,
                        )
                    )

        # Return if nothing to upload
        if len(files_to_upload) == 0:
            logger.debug("Dropping schedule commit: no changed file to upload.")
            return None

        # Convert `_FileToUpload` as `CommitOperationAdd` (=> compute file shas + limit to file size)
        logger.debug("Removing unchanged files since previous scheduled commit.")
        add_operations = [
            CommitOperationAdd(
                # TODO: Cap the file to its current size, even if the user append data to it while a scheduled commit is happening
                # (requires an upstream fix for XET-535: `hf_xet` should support `BinaryIO` for upload)
                path_or_fileobj=file_to_upload.local_path,
                path_in_repo=file_to_upload.path_in_repo,
            )
            for file_to_upload in files_to_upload
        ]

        # Upload files (append mode expected - no need for lock)
        logger.debug("Uploading files for scheduled commit.")
        commit_info = self.api.create_commit(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            operations=add_operations,
            commit_message="Scheduled Commit",
            revision=self.revision,
        )

        # Successful commit: keep track of the latest "last_modified" for each file
        for file in files_to_upload:
            self.last_uploaded[file.local_path] = file.last_modified
        return commit_info
+
315
+
316
class PartialFileIO(BytesIO):
    """A file-like object that reads only the first part of a file.

    Useful to upload a file to the Hub when the user might still be appending data to it. Only the first part of the
    file is uploaded (i.e. the part that was available when the filesystem was first scanned).

    In practice, only used internally by the CommitScheduler to regularly push a folder to the Hub with minimal
    disturbance for the user. The object is passed to `CommitOperationAdd`.

    Only supports `read`, `tell` and `seek` methods.

    Args:
        file_path (`str` or `Path`):
            Path to the file to read.
        size_limit (`int`):
            The maximum number of bytes to read from the file. If the file is larger than this, only the first part
            will be read (and uploaded).
    """

    def __init__(self, file_path: Union[str, Path], size_limit: int) -> None:
        self._file_path = Path(file_path)
        self._file = self._file_path.open("rb")
        # Never expose more than what is currently on disk, even if the caller
        # passed a larger limit.
        self._size_limit = min(size_limit, os.fstat(self._file.fileno()).st_size)

    def __del__(self) -> None:
        # Close the underlying handle, then defer to IOBase's finalizer.
        self._file.close()
        return super().__del__()

    def __repr__(self) -> str:
        return (
            f"<PartialFileIO file_path={self._file_path} size_limit={self._size_limit}>"
        )

    def __len__(self) -> int:
        # Length reported to consumers is the truncated size, not the file size.
        return self._size_limit

    def __getattribute__(self, name: str):
        # Whitelist: private attributes plus the 3 supported public methods.
        # Anything else (write, readline, ...) is explicitly rejected so that
        # consumers cannot accidentally bypass the truncation.
        if name.startswith("_") or name in (
            "read",
            "tell",
            "seek",
        ):  # only 3 public methods supported
            return super().__getattribute__(name)
        raise NotImplementedError(f"PartialFileIO does not support '{name}'.")

    def tell(self) -> int:
        """Return the current file position."""
        return self._file.tell()

    def seek(self, __offset: int, __whence: int = SEEK_SET) -> int:
        """Change the stream position to the given offset.

        Behavior is the same as a regular file, except that the position is capped to the size limit.
        """
        if __whence == SEEK_END:
            # SEEK_END => set from the truncated end
            __offset = len(self) + __offset
            __whence = SEEK_SET

        pos = self._file.seek(__offset, __whence)
        if pos > self._size_limit:
            # Clamp back to the truncated end.
            return self._file.seek(self._size_limit)
        return pos

    def read(self, __size: Optional[int] = -1) -> bytes:
        """Read at most `__size` bytes from the file.

        Behavior is the same as a regular file, except that it is capped to the size limit.
        """
        current = self._file.tell()
        if __size is None or __size < 0:
            # Read until file limit
            truncated_size = self._size_limit - current
        else:
            # Read until file limit or __size
            truncated_size = min(__size, self._size_limit - current)
        return self._file.read(truncated_size)
context_vars.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import contextvars
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only for type checking to avoid a circular import at runtime.
    from trackio.run import Run

# Run object created by the most recent trackio.init() call (None before init).
current_run: contextvars.ContextVar["Run | None"] = contextvars.ContextVar(
    "current_run", default=None
)
# Name of the project passed to the most recent trackio.init() call.
current_project: contextvars.ContextVar[str | None] = contextvars.ContextVar(
    "current_project", default=None
)
# URL of the local dashboard server, or the Space id when logging to a Space.
current_server: contextvars.ContextVar[str | None] = contextvars.ContextVar(
    "current_server", default=None
)