lfs fix
- .gitattributes +0 -4
- README.md +13 -3
- app.py +158 -3
- dpt.py +240 -3
- examples/demo01.jpg +0 -0
- examples/demo02.jpg +0 -0
- examples/demo03.jpg +0 -0
- examples/demo04.jpg +0 -0
- examples/demo05.jpg +0 -0
- examples/demo06.jpg +0 -0
- examples/demo07.jpg +0 -0
- examples/demo08.jpg +0 -0
- examples/demo09.jpg +0 -0
- examples/demo10.jpg +0 -0
- examples/demo11.jpg +0 -0
- examples/demo12.jpg +0 -0
- examples/demo13.jpg +0 -0
- examples/demo14.jpg +0 -0
- examples/demo15.jpg +0 -0
- examples/demo16.jpg +0 -0
- examples/demo17.jpg +0 -0
- examples/demo18.jpg +0 -0
- examples/demo20.jpg +0 -0
- requirements.txt +7 -3
- transform.py +185 -3
.gitattributes
CHANGED
@@ -33,8 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-*.jpg filter=lfs diff=lfs merge=lfs -text
-*.md filter=lfs diff=lfs merge=lfs -text
-*.py filter=lfs diff=lfs merge=lfs -text
-*.txt filter=lfs diff=lfs merge=lfs -text
 examples/demo19.jpg filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,3 +1,13 @@
---
title: Depth Anything V2 tinygrad
emoji: 🌖
colorFrom: red
colorTo: indigo
sdk: gradio
sdk_version: 4.36.0
app_file: app.py
pinned: false
license: apache-2.0
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -1,3 +1,158 @@
import os
import tempfile
from typing import Literal

import gradio as gr
import matplotlib
import numpy as np
import safetensors as st
import spaces
import torch
from gradio_imageslider import ImageSlider
from huggingface_hub import hf_hub_download
from PIL import Image
from tinygrad import Tensor, nn
from transform import image2tensor

from dpt import DPTv2, DPTv2Config

css = """
#img-display-container {
    max-height: 100vh;
}
#img-display-input {
    max-height: 80vh;
}
#img-display-output {
    max-height: 80vh;
}
#download {
    height: 62px;
}
"""
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def get_config(m_size: Literal["vits", "vitb", "vitl", "vitg"]):
    return DPTv2Config(img_size=518, patch_size=14, in_channels=3, mlp_ratio=4, **model_configs[m_size])


model_configs = {
    "vits": {
        "indermediate_layers": [2, 5, 8, 11],
        "depth": 12,
        "num_heads": 6,
        "embed_dim": 384,
        "features": 64,
        "out_channels": [48, 96, 192, 384],
    },
    "vitb": {
        "indermediate_layers": [2, 5, 8, 11],
        "depth": 12,
        "num_heads": 12,
        "embed_dim": 768,
        "features": 128,
        "out_channels": [96, 192, 384, 768],
    },
    "vitl": {
        "indermediate_layers": [4, 11, 17, 23],
        "depth": 24,
        "num_heads": 16,
        "embed_dim": 1024,
        "features": 256,
        "out_channels": [256, 512, 1024, 1024],
    },
}

encoder2name = {
    "vits": "Small",
    "vitb": "Base",
    "vitl": "Large",
}

encoder = "vits"
filepath = hf_hub_download(
    repo_id=f"depth-anything/Depth-Anything-V2-{encoder2name[encoder]}-hf",
    filename=f"{encoder}.safetensors",
    repo_type="model",
)

config = get_config(encoder)
model = DPTv2(config)

Tensor.no_grad = True
with st.safe_open(filepath, "numpy") as f:
    tensors = {key: Tensor(f.get_tensor(key)) for key in f.keys()}
nn.state.load_state_dict(model, tensors, verbose=False, strict=True, consume=True)


title = "# Depth Anything V2 tinygrad"
description = """Demo for **Depth Anything V2 tinygrad**. Refer to [github](https://github.com/kutipense/Depth-Anything-V2-tinygrad) for source.
Please also refer to the original [paper](https://arxiv.org/abs/2406.09414), [project page](https://depth-anything-v2.github.io), and [github](https://github.com/DepthAnything/Depth-Anything-V2) for more details."""


@spaces.GPU
def predict_depth(image):
    image, (h, w) = image2tensor(image, input_size=config.img_size)
    output = model(image)
    output = output.interpolate((h, w), mode="linear", align_corners=True).realize()
    output = output.numpy()[0, 0]
    return output


with gr.Blocks(css=css) as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown("### Depth Prediction demo")

    with gr.Row():
        input_image = gr.Image(label="Input Image", type="numpy", elem_id="img-display-input")
        depth_image_slider = ImageSlider(
            label="Depth Map with Slider View", elem_id="img-display-output", position=0.5
        )
    submit = gr.Button(value="Compute Depth")
    gray_depth_file = gr.File(
        label="Grayscale depth map",
        elem_id="download",
    )
    raw_file = gr.File(
        label="16-bit raw output (can be considered as disparity)",
        elem_id="download",
    )

    cmap = matplotlib.colormaps.get_cmap("Spectral_r")

    def on_submit(image):
        original_image = image.copy()

        depth = predict_depth(image[:, :, ::-1])

        raw_depth = Image.fromarray(depth.astype("uint16"))
        tmp_raw_depth = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
        raw_depth.save(tmp_raw_depth.name)

        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
        depth = depth.astype(np.uint8)
        colored_depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8)

        gray_depth = Image.fromarray(depth)
        tmp_gray_depth = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
        gray_depth.save(tmp_gray_depth.name)

        return [(original_image, colored_depth), tmp_gray_depth.name, tmp_raw_depth.name]

    submit.click(on_submit, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file])

    example_files = os.listdir("assets/examples")
    example_files.sort()
    example_files = [os.path.join("assets/examples", filename) for filename in example_files]
    examples = gr.Examples(
        examples=example_files,
        inputs=[input_image],
        outputs=[depth_image_slider, gray_depth_file, raw_file],
        fn=on_submit,
    )


if __name__ == "__main__":
    demo.queue().launch(share=True)
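For quick local testing without the Gradio UI, a minimal headless sketch of the same inference path follows. It assumes the modules added in this commit (dpt.py, transform.py) are on the import path, mirrors app.py's checkpoint repo and filename, and the 8-bit PNG export at the end is illustrative post-processing rather than part of app.py:

import cv2
import numpy as np
import safetensors as st
from huggingface_hub import hf_hub_download
from tinygrad import Tensor, nn

from dpt import DPTv2, DPTv2Config
from transform import image2tensor

# "vits" configuration, same values as model_configs["vits"] in app.py
config = DPTv2Config(
    img_size=518, patch_size=14, in_channels=3, mlp_ratio=4,
    embed_dim=384, depth=12, num_heads=6, features=64,
    out_channels=[48, 96, 192, 384], indermediate_layers=[2, 5, 8, 11],
)
model = DPTv2(config)

# download and load the checkpoint, exactly as app.py does
filepath = hf_hub_download(
    repo_id="depth-anything/Depth-Anything-V2-Small-hf",
    filename="vits.safetensors",
    repo_type="model",
)
Tensor.no_grad = True
with st.safe_open(filepath, "numpy") as f:
    tensors = {k: Tensor(f.get_tensor(k)) for k in f.keys()}
nn.state.load_state_dict(model, tensors, verbose=False, strict=True, consume=True)

# run one of the repo's example images and save a normalized 8-bit depth map
raw = cv2.imread("examples/demo01.jpg")  # BGR, as image2tensor expects
image, (h, w) = image2tensor(raw, input_size=config.img_size)
depth = model(image).interpolate((h, w), mode="linear", align_corners=True).numpy()[0, 0]
depth8 = ((depth - depth.min()) / (depth.max() - depth.min()) * 255).astype(np.uint8)
cv2.imwrite("depth_demo01.png", depth8)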
dpt.py
CHANGED
@@ -1,3 +1,240 @@
from dataclasses import dataclass

from tinygrad import Tensor, nn


@dataclass
class DPTv2Config:
    img_size: int
    patch_size: int
    in_channels: int
    embed_dim: int
    depth: int
    mlp_ratio: int
    num_heads: int
    features: int
    out_channels: list[int]
    indermediate_layers: list[int]


class PatchEmbeddings:
    def __init__(self, config: DPTv2Config):
        self.projection = nn.Conv2d(
            config.in_channels, config.embed_dim, kernel_size=config.patch_size, stride=config.patch_size
        )

    def __call__(self, x: Tensor) -> Tensor:
        return self.projection(x).flatten(2).transpose(1, 2)


class Embeddings:
    def __init__(self, config: DPTv2Config, num_tokens=1):
        num_patches = (config.img_size // config.patch_size) ** 2

        self.patch_embeddings = PatchEmbeddings(config)
        self.cls_token = Tensor.zeros(1, 1, config.embed_dim)
        self.mask_token = Tensor.zeros(1, config.embed_dim)  # unused
        self.position_embeddings = Tensor.zeros(1, num_patches + num_tokens, config.embed_dim)

    def __call__(self, x: Tensor) -> Tensor:
        x = self.patch_embeddings(x)
        x = Tensor.cat(self.cls_token.expand(x.shape[0], -1, -1), x, dim=1)
        x = x + self.position_embeddings

        return x


class Attention:
    def __init__(self, config: DPTv2Config):
        self.num_heads = config.num_heads
        self.scale = (config.embed_dim // config.num_heads) ** -0.5

        self.query = nn.Linear(config.embed_dim, config.embed_dim)
        self.key = nn.Linear(config.embed_dim, config.embed_dim)
        self.value = nn.Linear(config.embed_dim, config.embed_dim)

    def __call__(self, x: Tensor) -> Tensor:
        B, N, C = x.shape
        ch = C // self.num_heads
        q = self.query(x).reshape(B, N, self.num_heads, ch).transpose(2, 1)
        k = self.key(x).reshape(B, N, self.num_heads, ch).transpose(2, 1)
        v = self.value(x).reshape(B, N, self.num_heads, ch).transpose(2, 1)

        attn: Tensor = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(axis=-1)
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)

        return x


class MLP:
    def __init__(self, config: DPTv2Config):
        in_features = config.embed_dim
        hidden_features = int(config.embed_dim * config.mlp_ratio)
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, in_features)

    def __call__(self, x: Tensor) -> Tensor:
        return self.fc2(self.fc1(x).gelu())


class Layer:
    def __init__(self, config: DPTv2Config):
        self.attention = Attention(config)
        self.dense = nn.Linear(config.embed_dim, config.embed_dim)
        self.layer_scales = [Tensor.ones(config.embed_dim) * 1e-5 for _ in range(2)]
        self.norms = [nn.LayerNorm(config.embed_dim, eps=1e-6) for _ in range(2)]
        self.mlp = MLP(config)

    def __call__(self, x: Tensor) -> Tensor:
        x = x + self.layer_scales[0] * self.dense(self.attention(self.norms[0](x)))
        x = x + self.layer_scales[1] * self.mlp(self.norms[1](x))
        return x

    def _asdict(self):
        return {
            "attention.attention": self.attention,
            "attention.output.dense": self.dense,
            "layer_scale1.lambda1": self.layer_scales[0],
            "layer_scale2.lambda1": self.layer_scales[1],
            "mlp": self.mlp,
            "norm1": self.norms[0],
            "norm2": self.norms[1],
        }


class Encoder:
    def __init__(self, config: DPTv2Config):
        self.layer = [Layer(config) for _ in range(config.depth)]

    def __call__(self, x: Tensor) -> Tensor:
        outputs = []
        for layer in self.layer:
            x = layer(x)
            outputs.append(x)
        return outputs


class Backbone:
    def __init__(self, config: DPTv2Config):
        self.indermediate_layers = config.indermediate_layers
        self.embeddings = Embeddings(config)
        self.encoder = Encoder(config)
        self.layernorm = nn.LayerNorm(config.embed_dim, eps=1e-6)

    def __call__(self, x: Tensor) -> Tensor:
        x = self.encoder(self.embeddings(x))
        return [self.layernorm(x[ind]) for ind in self.indermediate_layers]


class Head:
    def __init__(self, config: DPTv2Config):
        in_feats, out_feats = config.features, config.features // 2
        self.conv1 = nn.Conv2d(in_feats, out_feats, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(out_feats, 32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(32, 1, kernel_size=1)

        self.patch_h = self.patch_w = config.img_size // config.patch_size
        self.patch_h = self.patch_w = self.patch_h * config.patch_size

    def __call__(self, x: Tensor) -> Tensor:
        x = self.conv1(x).interpolate((self.patch_h, self.patch_w), align_corners=True)
        x = self.conv3(self.conv2(x).relu()).relu()
        return x


class ResidualLayer:
    def __init__(self, config: DPTv2Config):
        in_feats = config.features
        self.convolution1 = nn.Conv2d(in_feats, in_feats, kernel_size=3, padding=1)
        self.convolution2 = nn.Conv2d(in_feats, in_feats, kernel_size=3, padding=1)

    def __call__(self, x: Tensor) -> Tensor:
        return self.convolution2(self.convolution1(x.relu()).relu()) + x


class FusionStage:
    def __init__(self, config: DPTv2Config):
        in_feats = config.features
        self.residual_layer1 = ResidualLayer(config)
        self.residual_layer2 = ResidualLayer(config)
        self.projection = nn.Conv2d(in_feats, in_feats, kernel_size=1)

    def __call__(self, layer0: Tensor, layer1: Tensor = None, size=None) -> Tensor:
        if layer1 is not None:
            layer0 = layer0 + self.residual_layer1(layer1)

        layer0 = self.residual_layer2(layer0)
        size = list(map(lambda x: x * 2, layer0.shape[2:])) if size is None else size
        return self.projection(layer0.interpolate(size, align_corners=True))


class ReassembleStage:
    def __init__(self, config: DPTv2Config):
        ins, outs = config.embed_dim, config.out_channels

        self.projection = [
            nn.Conv2d(in_channels=ins, out_channels=out_channel, kernel_size=1) for out_channel in outs
        ]

        self.resize_layers = [
            nn.ConvTranspose2d(in_channels=outs[0], out_channels=outs[0], kernel_size=4, stride=4),
            nn.ConvTranspose2d(in_channels=outs[1], out_channels=outs[1], kernel_size=2, stride=2),
            lambda x: x,
            nn.Conv2d(in_channels=outs[3], out_channels=outs[3], kernel_size=3, stride=2, padding=1),
        ]

        self.patch_h = self.patch_w = config.img_size // config.patch_size

    def __call__(self, inputs: list[Tensor]) -> list[Tensor]:
        outputs = []
        for i, out in enumerate(inputs):
            x = out[:, 1:]  # remove the cls token
            x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], self.patch_h, self.patch_w))
            x = self.resize_layers[i](self.projection[i](x))
            outputs.append(x)
        return outputs

    def _asdict(self):
        return {
            "layers": [{"projection": p, "resize": r} for p, r in zip(self.projection, self.resize_layers)]
        }


class Neck:
    def __init__(self, config: DPTv2Config):
        self.convs = [
            nn.Conv2d(in_channels=ch, out_channels=config.features, kernel_size=3, padding=1, bias=False)
            for ch in config.out_channels
        ]

        self.reassemble_stage = ReassembleStage(config)
        self.fusion_stage = [FusionStage(config) for _ in range(4)]

    def __call__(self, x: Tensor) -> Tensor:
        outputs = self.reassemble_stage(x)
        outputs = [conv(out) for out, conv in zip(outputs, self.convs)]

        path_4 = self.fusion_stage[0](outputs[3], size=outputs[2].shape[2:])
        path_3 = self.fusion_stage[1](path_4, outputs[2], size=outputs[1].shape[2:])
        path_2 = self.fusion_stage[2](path_3, outputs[1], size=outputs[0].shape[2:])
        path_1 = self.fusion_stage[3](path_2, outputs[0])

        return path_1

    def _asdict(self):
        return {
            "convs": self.convs,
            "fusion_stage.layers": self.fusion_stage,
            "reassemble_stage": self.reassemble_stage,
        }


class DPTv2:
    def __init__(self, config):
        self.backbone = Backbone(config)
        self.head = Head(config)
        self.neck = Neck(config)

    def __call__(self, x: Tensor) -> Tensor:
        return self.head(self.neck(self.backbone(x)))
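To see how the pieces fit together (Backbone picks intermediate ViT features, Neck reassembles and fuses them, Head regresses per-pixel depth), a short shape-check sketch follows. It assumes the "vits" configuration from app.py; weights are random and no checkpoint is loaded, so only the shapes are meaningful:

from tinygrad import Tensor

from dpt import DPTv2, DPTv2Config

# same values as model_configs["vits"] in app.py
config = DPTv2Config(
    img_size=518, patch_size=14, in_channels=3, mlp_ratio=4,
    embed_dim=384, depth=12, num_heads=6, features=64,
    out_channels=[48, 96, 192, 384], indermediate_layers=[2, 5, 8, 11],
)
model = DPTv2(config)

x = Tensor.randn(1, 3, 518, 518)  # one normalized RGB image, 37x37 patches of size 14
y = model(x)                      # Backbone -> Neck -> Head
print(y.shape)                    # (1, 1, 518, 518): one relative-depth value per pixel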
examples/demo01.jpg through examples/demo18.jpg, examples/demo20.jpg
CHANGED (binary JPEG example images tracked with Git LFS; no text diff)
requirements.txt
CHANGED
@@ -1,3 +1,7 @@
gradio_imageslider
gradio==4.36.0
tinygrad
safetensors
opencv-python
matplotlib
huggingface_hub
transform.py
CHANGED
@@ -1,3 +1,185 @@
import cv2
import numpy as np
from tinygrad import Tensor


class Resize(object):
    """Resize sample to given size (width, height)."""

    def __init__(
        self,
        width,
        height,
        resize_target=True,
        keep_aspect_ratio=False,
        ensure_multiple_of=1,
        resize_method="lower_bound",
        image_interpolation_method=cv2.INTER_AREA,
    ):
        """Init.

        Args:
            width (int): desired output width
            height (int): desired output height
            resize_target (bool, optional):
                True: Resize the full sample (image, mask, target).
                False: Resize image only.
                Defaults to True.
            keep_aspect_ratio (bool, optional):
                True: Keep the aspect ratio of the input sample.
                Output sample might not have the given width and height, and
                resize behaviour depends on the parameter 'resize_method'.
                Defaults to False.
            ensure_multiple_of (int, optional):
                Output width and height is constrained to be a multiple of this parameter.
                Defaults to 1.
            resize_method (str, optional):
                "lower_bound": Output will be at least as large as the given size.
                "upper_bound": Output will be at most as large as the given size. (Output size might be smaller than given size.)
                "minimal": Scale as little as possible. (Output size might be smaller than given size.)
                Defaults to "lower_bound".
        """
        self.__width = width
        self.__height = height

        self.__resize_target = resize_target
        self.__keep_aspect_ratio = keep_aspect_ratio
        self.__multiple_of = ensure_multiple_of
        self.__resize_method = resize_method
        self.__image_interpolation_method = image_interpolation_method

    def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
        y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if max_val is not None and y > max_val:
            y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if y < min_val:
            y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)

        return y

    def get_size(self, width, height):
        # determine new height and width
        scale_height = self.__height / height
        scale_width = self.__width / width

        if self.__keep_aspect_ratio:
            if self.__resize_method == "lower_bound":
                # scale such that output size is lower bound
                if scale_width > scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "upper_bound":
                # scale such that output size is upper bound
                if scale_width < scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "minimal":
                # scale as little as possible
                if abs(1 - scale_width) < abs(1 - scale_height):
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            else:
                raise ValueError(f"resize_method {self.__resize_method} not implemented")

        if self.__resize_method == "lower_bound":
            new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height)
            new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width)
        elif self.__resize_method == "upper_bound":
            new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height)
            new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width)
        elif self.__resize_method == "minimal":
            new_height = self.constrain_to_multiple_of(scale_height * height)
            new_width = self.constrain_to_multiple_of(scale_width * width)
        else:
            raise ValueError(f"resize_method {self.__resize_method} not implemented")

        return (new_width, new_height)

    def __call__(self, sample):
        width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0])

        # resize sample
        sample["image"] = cv2.resize(
            sample["image"], (width, height), interpolation=self.__image_interpolation_method
        )

        if self.__resize_target:
            if "depth" in sample:
                sample["depth"] = cv2.resize(
                    sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
                )

            if "mask" in sample:
                sample["mask"] = cv2.resize(
                    sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST
                )

        return sample


class NormalizeImage(object):
    """Normalize image by given mean and std."""

    def __init__(self, mean, std):
        self.__mean = mean
        self.__std = std

    def __call__(self, sample):
        sample["image"] = (sample["image"] - self.__mean) / self.__std

        return sample


class PrepareForNet(object):
    """Prepare sample for usage as network input."""

    def __init__(self):
        pass

    def __call__(self, sample):
        image = np.transpose(sample["image"], (2, 0, 1))
        sample["image"] = np.ascontiguousarray(image).astype(np.float32)

        if "depth" in sample:
            depth = sample["depth"].astype(np.float32)
            sample["depth"] = np.ascontiguousarray(depth)

        if "mask" in sample:
            sample["mask"] = sample["mask"].astype(np.float32)
            sample["mask"] = np.ascontiguousarray(sample["mask"])

        return sample


def image2tensor(raw_image, input_size=518):
    transforms = [
        Resize(
            width=input_size,
            height=input_size,
            resize_target=False,
            keep_aspect_ratio=False,
            ensure_multiple_of=14,
            resize_method="lower_bound",
            image_interpolation_method=cv2.INTER_CUBIC,
        ),
        NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        PrepareForNet(),
    ]

    image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0
    for transform in transforms:
        image = transform({"image": image})["image"]
    image = Tensor(image).unsqueeze(0)

    return image, raw_image.shape[:2]
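image2tensor thus converts BGR to RGB, resizes to the 518x518 network input (aspect ratio is not preserved, since keep_aspect_ratio=False, and sides are rounded to multiples of 14), normalizes with ImageNet mean/std, moves channels first, and returns the original size so the predicted depth can be upsampled back. A small sketch with a hypothetical 720x1280 frame:

import numpy as np

from transform import image2tensor

frame = np.zeros((720, 1280, 3), dtype=np.uint8)  # stand-in for a real BGR photo
tensor, (h, w) = image2tensor(frame, input_size=518)
print(tensor.shape)  # (1, 3, 518, 518): resized, normalized, CHW, batch dim added
print((h, w))        # (720, 1280): original size, used later to upsample the depth map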