lfs fix
- .gitattributes +0 -4
- README.md +13 -3
- app.py +158 -3
- dpt.py +240 -3
- examples/demo01.jpg +0 -0
- examples/demo02.jpg +0 -0
- examples/demo03.jpg +0 -0
- examples/demo04.jpg +0 -0
- examples/demo05.jpg +0 -0
- examples/demo06.jpg +0 -0
- examples/demo07.jpg +0 -0
- examples/demo08.jpg +0 -0
- examples/demo09.jpg +0 -0
- examples/demo10.jpg +0 -0
- examples/demo11.jpg +0 -0
- examples/demo12.jpg +0 -0
- examples/demo13.jpg +0 -0
- examples/demo14.jpg +0 -0
- examples/demo15.jpg +0 -0
- examples/demo16.jpg +0 -0
- examples/demo17.jpg +0 -0
- examples/demo18.jpg +0 -0
- examples/demo20.jpg +0 -0
- requirements.txt +7 -3
- transform.py +185 -3
.gitattributes
CHANGED
@@ -33,8 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-*.jpg filter=lfs diff=lfs merge=lfs -text
-*.md filter=lfs diff=lfs merge=lfs -text
-*.py filter=lfs diff=lfs merge=lfs -text
-*.txt filter=lfs diff=lfs merge=lfs -text
 examples/demo19.jpg filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,3 +1,13 @@
---
title: Depth Anything V2 tinygrad
emoji: 🌖
colorFrom: red
colorTo: indigo
sdk: gradio
sdk_version: 4.36.0
app_file: app.py
pinned: false
license: apache-2.0
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -1,3 +1,158 @@
import os
import tempfile
from typing import Literal

import gradio as gr
import matplotlib
import numpy as np
import safetensors as st
import spaces
import torch
from gradio_imageslider import ImageSlider
from huggingface_hub import hf_hub_download
from PIL import Image
from tinygrad import Tensor, nn
from transform import image2tensor

from dpt import DPTv2, DPTv2Config

css = """
#img-display-container {
    max-height: 100vh;
}
#img-display-input {
    max-height: 80vh;
}
#img-display-output {
    max-height: 80vh;
}
#download {
    height: 62px;
}
"""
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def get_config(m_size: Literal["vits", "vitb", "vitl", "vitg"]):
    return DPTv2Config(img_size=518, patch_size=14, in_channels=3, mlp_ratio=4, **model_configs[m_size])


model_configs = {
    "vits": {
        "indermediate_layers": [2, 5, 8, 11],
        "depth": 12,
        "num_heads": 6,
        "embed_dim": 384,
        "features": 64,
        "out_channels": [48, 96, 192, 384],
    },
    "vitb": {
        "indermediate_layers": [2, 5, 8, 11],
        "depth": 12,
        "num_heads": 12,
        "embed_dim": 768,
        "features": 128,
        "out_channels": [96, 192, 384, 768],
    },
    "vitl": {
        "indermediate_layers": [4, 11, 17, 23],
        "depth": 24,
        "num_heads": 16,
        "embed_dim": 1024,
        "features": 256,
        "out_channels": [256, 512, 1024, 1024],
    },
}

encoder2name = {
    "vits": "Small",
    "vitb": "Base",
    "vitl": "Large",
}

encoder = "vits"
filepath = hf_hub_download(
    repo_id=f"depth-anything/Depth-Anything-V2-{encoder2name[encoder]}-hf",
    filename=f"{encoder}.safetensors",
    repo_type="model",
)

config = get_config(encoder)
model = DPTv2(config)

Tensor.no_grad = True
with st.safe_open(filepath, "numpy") as f:
    tensors = {key: Tensor(f.get_tensor(key)) for key in f.keys()}
nn.state.load_state_dict(model, tensors, verbose=False, strict=True, consume=True)


title = "# Depth Anything V2 tinygrad"
description = """Demo for **Depth Anything V2 tinygrad**. Refer to [github](https://github.com/kutipense/Depth-Anything-V2-tinygrad) for source.
Please also refer to the original [paper](https://arxiv.org/abs/2406.09414), [project page](https://depth-anything-v2.github.io), and [github](https://github.com/DepthAnything/Depth-Anything-V2) for more details."""


@spaces.GPU
def predict_depth(image):
    image, (h, w) = image2tensor(image, input_size=config.img_size)
    output = model(image)
    output = output.interpolate((h, w), mode="linear", align_corners=True).realize()
    output = output.numpy()[0, 0]
    return output


with gr.Blocks(css=css) as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown("### Depth Prediction demo")

    with gr.Row():
        input_image = gr.Image(label="Input Image", type="numpy", elem_id="img-display-input")
        depth_image_slider = ImageSlider(
            label="Depth Map with Slider View", elem_id="img-display-output", position=0.5
        )
    submit = gr.Button(value="Compute Depth")
    gray_depth_file = gr.File(
        label="Grayscale depth map",
        elem_id="download",
    )
    raw_file = gr.File(
        label="16-bit raw output (can be considered as disparity)",
        elem_id="download",
    )

    cmap = matplotlib.colormaps.get_cmap("Spectral_r")

    def on_submit(image):
        original_image = image.copy()

        depth = predict_depth(image[:, :, ::-1])

        raw_depth = Image.fromarray(depth.astype("uint16"))
        tmp_raw_depth = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
        raw_depth.save(tmp_raw_depth.name)

        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
        depth = depth.astype(np.uint8)
        colored_depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8)

        gray_depth = Image.fromarray(depth)
        tmp_gray_depth = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
        gray_depth.save(tmp_gray_depth.name)

        return [(original_image, colored_depth), tmp_gray_depth.name, tmp_raw_depth.name]

    submit.click(on_submit, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file])

    example_files = os.listdir("assets/examples")
    example_files.sort()
    example_files = [os.path.join("assets/examples", filename) for filename in example_files]
    examples = gr.Examples(
        examples=example_files,
        inputs=[input_image],
        outputs=[depth_image_slider, gray_depth_file, raw_file],
        fn=on_submit,
    )


if __name__ == "__main__":
    demo.queue().launch(share=True)
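For quick local testing without the Gradio UI, a minimal headless sketch of the same inference path follows. It assumes the modules added in this commit (dpt.py, transform.py) are on the import path, mirrors app.py's checkpoint repo and filename, and the 8-bit PNG export at the end is illustrative post-processing rather than part of app.py:

import cv2
import numpy as np
import safetensors as st
from huggingface_hub import hf_hub_download
from tinygrad import Tensor, nn

from dpt import DPTv2, DPTv2Config
from transform import image2tensor

# "vits" configuration, same values as model_configs["vits"] in app.py
config = DPTv2Config(
    img_size=518, patch_size=14, in_channels=3, mlp_ratio=4,
    embed_dim=384, depth=12, num_heads=6, features=64,
    out_channels=[48, 96, 192, 384], indermediate_layers=[2, 5, 8, 11],
)
model = DPTv2(config)

# download and load the checkpoint, exactly as app.py does
filepath = hf_hub_download(
    repo_id="depth-anything/Depth-Anything-V2-Small-hf",
    filename="vits.safetensors",
    repo_type="model",
)
Tensor.no_grad = True
with st.safe_open(filepath, "numpy") as f:
    tensors = {k: Tensor(f.get_tensor(k)) for k in f.keys()}
nn.state.load_state_dict(model, tensors, verbose=False, strict=True, consume=True)

# run one of the repo's example images and save a normalized 8-bit depth map
raw = cv2.imread("examples/demo01.jpg")  # BGR, as image2tensor expects
image, (h, w) = image2tensor(raw, input_size=config.img_size)
depth = model(image).interpolate((h, w), mode="linear", align_corners=True).numpy()[0, 0]
depth8 = ((depth - depth.min()) / (depth.max() - depth.min()) * 255).astype(np.uint8)
cv2.imwrite("depth_demo01.png", depth8)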
dpt.py
CHANGED
@@ -1,3 +1,240 @@
from dataclasses import dataclass

from tinygrad import Tensor, nn


@dataclass
class DPTv2Config:
    img_size: int
    patch_size: int
    in_channels: int
    embed_dim: int
    depth: int
    mlp_ratio: int
    num_heads: int
    features: int
    out_channels: list[int]
    indermediate_layers: list[int]


class PatchEmbeddings:
    def __init__(self, config: DPTv2Config):
        self.projection = nn.Conv2d(
            config.in_channels, config.embed_dim, kernel_size=config.patch_size, stride=config.patch_size
        )

    def __call__(self, x: Tensor) -> Tensor:
        return self.projection(x).flatten(2).transpose(1, 2)


class Embeddings:
    def __init__(self, config: DPTv2Config, num_tokens=1):
        num_patches = (config.img_size // config.patch_size) ** 2

        self.patch_embeddings = PatchEmbeddings(config)
        self.cls_token = Tensor.zeros(1, 1, config.embed_dim)
        self.mask_token = Tensor.zeros(1, config.embed_dim)  # unused
        self.position_embeddings = Tensor.zeros(1, num_patches + num_tokens, config.embed_dim)

    def __call__(self, x: Tensor) -> Tensor:
        x = self.patch_embeddings(x)
        x = Tensor.cat(self.cls_token.expand(x.shape[0], -1, -1), x, dim=1)
        x = x + self.position_embeddings

        return x


class Attention:
    def __init__(self, config: DPTv2Config):
        self.num_heads = config.num_heads
        self.scale = (config.embed_dim // config.num_heads) ** -0.5

        self.query = nn.Linear(config.embed_dim, config.embed_dim)
        self.key = nn.Linear(config.embed_dim, config.embed_dim)
        self.value = nn.Linear(config.embed_dim, config.embed_dim)

    def __call__(self, x: Tensor) -> Tensor:
        B, N, C = x.shape
        ch = C // self.num_heads
        q = self.query(x).reshape(B, N, self.num_heads, ch).transpose(2, 1)
        k = self.key(x).reshape(B, N, self.num_heads, ch).transpose(2, 1)
        v = self.value(x).reshape(B, N, self.num_heads, ch).transpose(2, 1)

        attn: Tensor = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(axis=-1)
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)

        return x


class MLP:
    def __init__(self, config: DPTv2Config):
        in_features = config.embed_dim
        hidden_features = int(config.embed_dim * config.mlp_ratio)
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, in_features)

    def __call__(self, x: Tensor) -> Tensor:
        return self.fc2(self.fc1(x).gelu())


class Layer:
    def __init__(self, config: DPTv2Config):
        self.attention = Attention(config)
        self.dense = nn.Linear(config.embed_dim, config.embed_dim)
        self.layer_scales = [Tensor.ones(config.embed_dim) * 1e-5 for _ in range(2)]
        self.norms = [nn.LayerNorm(config.embed_dim, eps=1e-6) for _ in range(2)]
        self.mlp = MLP(config)

    def __call__(self, x: Tensor) -> Tensor:
        x = x + self.layer_scales[0] * self.dense(self.attention(self.norms[0](x)))
        x = x + self.layer_scales[1] * self.mlp(self.norms[1](x))
        return x

    def _asdict(self):
        return {
            "attention.attention": self.attention,
            "attention.output.dense": self.dense,
            "layer_scale1.lambda1": self.layer_scales[0],
            "layer_scale2.lambda1": self.layer_scales[1],
            "mlp": self.mlp,
            "norm1": self.norms[0],
            "norm2": self.norms[1],
        }


class Encoder:
    def __init__(self, config: DPTv2Config):
        self.layer = [Layer(config) for _ in range(config.depth)]

    def __call__(self, x: Tensor) -> Tensor:
        outputs = []
        for layer in self.layer:
            x = layer(x)
            outputs.append(x)
        return outputs


class Backbone:
    def __init__(self, config: DPTv2Config):
        self.indermediate_layers = config.indermediate_layers
        self.embeddings = Embeddings(config)
        self.encoder = Encoder(config)
        self.layernorm = nn.LayerNorm(config.embed_dim, eps=1e-6)

    def __call__(self, x: Tensor) -> Tensor:
        x = self.encoder(self.embeddings(x))
        return [self.layernorm(x[ind]) for ind in self.indermediate_layers]


class Head:
    def __init__(self, config: DPTv2Config):
        in_feats, out_feats = config.features, config.features // 2
        self.conv1 = nn.Conv2d(in_feats, out_feats, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(out_feats, 32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(32, 1, kernel_size=1)

        self.patch_h = self.patch_w = config.img_size // config.patch_size
        self.patch_h = self.patch_w = self.patch_h * config.patch_size

    def __call__(self, x: Tensor) -> Tensor:
        x = self.conv1(x).interpolate((self.patch_h, self.patch_w), align_corners=True)
        x = self.conv3(self.conv2(x).relu()).relu()
        return x


class ResidualLayer:
    def __init__(self, config: DPTv2Config):
        in_feats = config.features
        self.convolution1 = nn.Conv2d(in_feats, in_feats, kernel_size=3, padding=1)
        self.convolution2 = nn.Conv2d(in_feats, in_feats, kernel_size=3, padding=1)

    def __call__(self, x: Tensor) -> Tensor:
        return self.convolution2(self.convolution1(x.relu()).relu()) + x


class FusionStage:
    def __init__(self, config: DPTv2Config):
        in_feats = config.features
        self.residual_layer1 = ResidualLayer(config)
        self.residual_layer2 = ResidualLayer(config)
        self.projection = nn.Conv2d(in_feats, in_feats, kernel_size=1)

    def __call__(self, layer0: Tensor, layer1: Tensor = None, size=None) -> Tensor:
        if layer1 is not None:
            layer0 = layer0 + self.residual_layer1(layer1)

        layer0 = self.residual_layer2(layer0)
        size = list(map(lambda x: x * 2, layer0.shape[2:])) if size is None else size
        return self.projection(layer0.interpolate(size, align_corners=True))


class ReassembleStage:
    def __init__(self, config: DPTv2Config):
        ins, outs = config.embed_dim, config.out_channels

        self.projection = [
            nn.Conv2d(in_channels=ins, out_channels=out_channel, kernel_size=1) for out_channel in outs
        ]

        self.resize_layers = [
            nn.ConvTranspose2d(in_channels=outs[0], out_channels=outs[0], kernel_size=4, stride=4),
            nn.ConvTranspose2d(in_channels=outs[1], out_channels=outs[1], kernel_size=2, stride=2),
            lambda x: x,
            nn.Conv2d(in_channels=outs[3], out_channels=outs[3], kernel_size=3, stride=2, padding=1),
        ]

        self.patch_h = self.patch_w = config.img_size // config.patch_size

    def __call__(self, inputs: list[Tensor]) -> list[Tensor]:
        outputs = []
        for i, out in enumerate(inputs):
            x = out[:, 1:]  # remove the cls token
            x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], self.patch_h, self.patch_w))
            x = self.resize_layers[i](self.projection[i](x))
            outputs.append(x)
        return outputs

    def _asdict(self):
        return {
            "layers": [{"projection": p, "resize": r} for p, r in zip(self.projection, self.resize_layers)]
        }


class Neck:
    def __init__(self, config: DPTv2Config):
        self.convs = [
            nn.Conv2d(in_channels=ch, out_channels=config.features, kernel_size=3, padding=1, bias=False)
            for ch in config.out_channels
        ]

        self.reassemble_stage = ReassembleStage(config)
        self.fusion_stage = [FusionStage(config) for _ in range(4)]

    def __call__(self, x: Tensor) -> Tensor:
        outputs = self.reassemble_stage(x)
        outputs = [conv(out) for out, conv in zip(outputs, self.convs)]

        path_4 = self.fusion_stage[0](outputs[3], size=outputs[2].shape[2:])
        path_3 = self.fusion_stage[1](path_4, outputs[2], size=outputs[1].shape[2:])
        path_2 = self.fusion_stage[2](path_3, outputs[1], size=outputs[0].shape[2:])
        path_1 = self.fusion_stage[3](path_2, outputs[0])

        return path_1

    def _asdict(self):
        return {
            "convs": self.convs,
            "fusion_stage.layers": self.fusion_stage,
            "reassemble_stage": self.reassemble_stage,
        }


class DPTv2:
    def __init__(self, config):
        self.backbone = Backbone(config)
        self.head = Head(config)
        self.neck = Neck(config)

    def __call__(self, x: Tensor) -> Tensor:
        return self.head(self.neck(self.backbone(x)))
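To see how the pieces fit together (Backbone picks intermediate ViT features, Neck reassembles and fuses them, Head regresses per-pixel depth), a short shape-check sketch follows. It assumes the "vits" configuration from app.py; weights are random and no checkpoint is loaded, so only the shapes are meaningful:

from tinygrad import Tensor

from dpt import DPTv2, DPTv2Config

# same values as model_configs["vits"] in app.py
config = DPTv2Config(
    img_size=518, patch_size=14, in_channels=3, mlp_ratio=4,
    embed_dim=384, depth=12, num_heads=6, features=64,
    out_channels=[48, 96, 192, 384], indermediate_layers=[2, 5, 8, 11],
)
model = DPTv2(config)

x = Tensor.randn(1, 3, 518, 518)  # one normalized RGB image, 37x37 patches of size 14
y = model(x)                      # Backbone -> Neck -> Head
print(y.shape)                    # (1, 1, 518, 518): one relative-depth value per pixel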
examples/demo01.jpg through examples/demo18.jpg, examples/demo20.jpg
CHANGED (binary JPEG example images tracked with Git LFS; no text diff)
requirements.txt
CHANGED
@@ -1,3 +1,7 @@
gradio_imageslider
gradio==4.36.0
tinygrad
safetensors
opencv-python
matplotlib
huggingface_hub
transform.py
CHANGED
@@ -1,3 +1,185 @@
import cv2
import numpy as np
from tinygrad import Tensor


class Resize(object):
    """Resize sample to given size (width, height)."""

    def __init__(
        self,
        width,
        height,
        resize_target=True,
        keep_aspect_ratio=False,
        ensure_multiple_of=1,
        resize_method="lower_bound",
        image_interpolation_method=cv2.INTER_AREA,
    ):
        """Init.

        Args:
            width (int): desired output width
            height (int): desired output height
            resize_target (bool, optional):
                True: Resize the full sample (image, mask, target).
                False: Resize image only.
                Defaults to True.
            keep_aspect_ratio (bool, optional):
                True: Keep the aspect ratio of the input sample.
                Output sample might not have the given width and height, and
                resize behaviour depends on the parameter 'resize_method'.
                Defaults to False.
            ensure_multiple_of (int, optional):
                Output width and height is constrained to be a multiple of this parameter.
                Defaults to 1.
            resize_method (str, optional):
                "lower_bound": Output will be at least as large as the given size.
                "upper_bound": Output will be at most as large as the given size. (Output size might be smaller than given size.)
                "minimal": Scale as little as possible. (Output size might be smaller than given size.)
                Defaults to "lower_bound".
        """
        self.__width = width
        self.__height = height

        self.__resize_target = resize_target
        self.__keep_aspect_ratio = keep_aspect_ratio
        self.__multiple_of = ensure_multiple_of
        self.__resize_method = resize_method
        self.__image_interpolation_method = image_interpolation_method

    def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
        y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if max_val is not None and y > max_val:
            y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if y < min_val:
            y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)

        return y

    def get_size(self, width, height):
        # determine new height and width
        scale_height = self.__height / height
        scale_width = self.__width / width

        if self.__keep_aspect_ratio:
            if self.__resize_method == "lower_bound":
                # scale such that output size is lower bound
                if scale_width > scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "upper_bound":
                # scale such that output size is upper bound
                if scale_width < scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "minimal":
                # scale as little as possible
                if abs(1 - scale_width) < abs(1 - scale_height):
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            else:
                raise ValueError(f"resize_method {self.__resize_method} not implemented")

        if self.__resize_method == "lower_bound":
            new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height)
            new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width)
        elif self.__resize_method == "upper_bound":
            new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height)
            new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width)
        elif self.__resize_method == "minimal":
            new_height = self.constrain_to_multiple_of(scale_height * height)
            new_width = self.constrain_to_multiple_of(scale_width * width)
        else:
            raise ValueError(f"resize_method {self.__resize_method} not implemented")

        return (new_width, new_height)

    def __call__(self, sample):
        width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0])

        # resize sample
        sample["image"] = cv2.resize(
            sample["image"], (width, height), interpolation=self.__image_interpolation_method
        )

        if self.__resize_target:
            if "depth" in sample:
                sample["depth"] = cv2.resize(
                    sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
                )

            if "mask" in sample:
                sample["mask"] = cv2.resize(
                    sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST
                )

        return sample


class NormalizeImage(object):
    """Normalize image by given mean and std."""

    def __init__(self, mean, std):
        self.__mean = mean
        self.__std = std

    def __call__(self, sample):
        sample["image"] = (sample["image"] - self.__mean) / self.__std

        return sample


class PrepareForNet(object):
    """Prepare sample for usage as network input."""

    def __init__(self):
        pass

    def __call__(self, sample):
        image = np.transpose(sample["image"], (2, 0, 1))
        sample["image"] = np.ascontiguousarray(image).astype(np.float32)

        if "depth" in sample:
            depth = sample["depth"].astype(np.float32)
            sample["depth"] = np.ascontiguousarray(depth)

        if "mask" in sample:
            sample["mask"] = sample["mask"].astype(np.float32)
            sample["mask"] = np.ascontiguousarray(sample["mask"])

        return sample


def image2tensor(raw_image, input_size=518):
    transforms = [
        Resize(
            width=input_size,
            height=input_size,
            resize_target=False,
            keep_aspect_ratio=False,
            ensure_multiple_of=14,
            resize_method="lower_bound",
            image_interpolation_method=cv2.INTER_CUBIC,
        ),
        NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        PrepareForNet(),
    ]

    image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0
    for transform in transforms:
        image = transform({"image": image})["image"]
    image = Tensor(image).unsqueeze(0)

    return image, raw_image.shape[:2]
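image2tensor thus converts BGR to RGB, resizes to the 518x518 network input (aspect ratio is not preserved, since keep_aspect_ratio=False, and sides are rounded to multiples of 14), normalizes with ImageNet mean/std, moves channels first, and returns the original size so the predicted depth can be upsampled back. A small sketch with a hypothetical 720x1280 frame:

import numpy as np

from transform import image2tensor

frame = np.zeros((720, 1280, 3), dtype=np.uint8)  # stand-in for a real BGR photo
tensor, (h, w) = image2tensor(frame, input_size=518)
print(tensor.shape)  # (1, 3, 518, 518): resized, normalized, CHW, batch dim added
print((h, w))        # (720, 1280): original size, used later to upsample the depth map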