diff --git a/computervision/depths/.gitignore b/computervision/depths/.gitignore new file mode 100644 index 0000000..f5e96db --- /dev/null +++ b/computervision/depths/.gitignore @@ -0,0 +1 @@ +venv \ No newline at end of file diff --git a/computervision/depths/.gitignore:Zone.Identifier b/computervision/depths/.gitignore:Zone.Identifier new file mode 100644 index 0000000..e69de29 diff --git a/computervision/depths/DSC01677.jpg b/computervision/depths/DSC01677.jpg new file mode 100644 index 0000000..0818555 Binary files /dev/null and b/computervision/depths/DSC01677.jpg differ diff --git a/computervision/depths/DSC01677.jpg:Zone.Identifier b/computervision/depths/DSC01677.jpg:Zone.Identifier new file mode 100644 index 0000000..e69de29 diff --git a/computervision/depths/Dybde.png b/computervision/depths/Dybde.png new file mode 100644 index 0000000..9e07859 Binary files /dev/null and b/computervision/depths/Dybde.png differ diff --git a/computervision/depths/Dybde.png:Zone.Identifier b/computervision/depths/Dybde.png:Zone.Identifier new file mode 100644 index 0000000..e69de29 diff --git a/computervision/depths/LICENSE b/computervision/depths/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/computervision/depths/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/computervision/depths/LICENSE:Zone.Identifier b/computervision/depths/LICENSE:Zone.Identifier new file mode 100644 index 0000000..e69de29 diff --git a/computervision/depths/README.md b/computervision/depths/README.md new file mode 100644 index 0000000..6fcc5b4 --- /dev/null +++ b/computervision/depths/README.md @@ -0,0 +1,276 @@ +
+

Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data

+
+[**Lihe Yang**](https://liheyoung.github.io/)1 · [**Bingyi Kang**](https://scholar.google.com/citations?user=NmHgX-wAAAAJ)2† · [**Zilong Huang**](http://speedinghzl.github.io/)2 · [**Xiaogang Xu**](https://xiaogang00.github.io/)3,4 · [**Jiashi Feng**](https://sites.google.com/site/jshfeng/)2 · [**Hengshuang Zhao**](https://hszhao.github.io/)1*
+
+1HKU · 2TikTok · 3CUHK · 4ZJU
+
+†project lead · *corresponding author
+
+**CVPR 2024**
+
+[Paper PDF](https://arxiv.org/abs/2401.10891) · [Project Page](https://depth-anything.github.io)
+
+ +This work presents Depth Anything, a highly practical solution for robust monocular depth estimation by training on a combination of 1.5M labeled images and **62M+ unlabeled images**. + +![teaser](assets/teaser.png) + +
+ Try our latest [**Depth Anything V2**](https://github.com/DepthAnything/Depth-Anything-V2) models!
+
+ +## News + +* **2024-06-14:** [Depth Anything V2](https://github.com/DepthAnything/Depth-Anything-V2) is released. +* **2024-02-27:** Depth Anything is accepted by CVPR 2024. +* **2024-02-05:** [Depth Anything Gallery](./gallery.md) is released. Thank all the users! +* **2024-02-02:** Depth Anything serves as the default depth processor for [InstantID](https://github.com/InstantID/InstantID) and [InvokeAI](https://github.com/invoke-ai/InvokeAI/releases/tag/v3.6.1). +* **2024-01-25:** Support [video depth visualization](./run_video.py). An [online demo for video](https://huggingface.co/spaces/JohanDL/Depth-Anything-Video) is also available. +* **2024-01-23:** The new ControlNet based on Depth Anything is integrated into [ControlNet WebUI](https://github.com/Mikubill/sd-webui-controlnet) and [ComfyUI's ControlNet](https://github.com/Fannovel16/comfyui_controlnet_aux). +* **2024-01-23:** Depth Anything [ONNX](https://github.com/fabio-sim/Depth-Anything-ONNX) and [TensorRT](https://github.com/spacewalk01/depth-anything-tensorrt) versions are supported. +* **2024-01-22:** Paper, project page, code, models, and demo ([HuggingFace](https://huggingface.co/spaces/LiheYoung/Depth-Anything), [OpenXLab](https://openxlab.org.cn/apps/detail/yyfan/depth_anything)) are released. + + +## Features of Depth Anything + +***If you need other features, please first check [existing community supports](#community-support).*** + +- **Relative depth estimation**: + + Our foundation models listed [here](https://huggingface.co/spaces/LiheYoung/Depth-Anything/tree/main/checkpoints) can provide relative depth estimation for any given image robustly. Please refer [here](#running) for details. + +- **Metric depth estimation** + + We fine-tune our Depth Anything model with metric depth information from NYUv2 or KITTI. It offers strong capabilities of both in-domain and zero-shot metric depth estimation. Please refer [here](./metric_depth) for details. + + +- **Better depth-conditioned ControlNet** + + We re-train **a better depth-conditioned ControlNet** based on Depth Anything. It offers more precise synthesis than the previous MiDaS-based ControlNet. Please refer [here](./controlnet/) for details. You can also use our new ControlNet based on Depth Anything in [ControlNet WebUI](https://github.com/Mikubill/sd-webui-controlnet) or [ComfyUI's ControlNet](https://github.com/Fannovel16/comfyui_controlnet_aux). + +- **Downstream high-level scene understanding** + + The Depth Anything encoder can be fine-tuned to downstream high-level perception tasks, *e.g.*, semantic segmentation, 86.2 mIoU on Cityscapes and 59.4 mIoU on ADE20K. Please refer [here](./semseg/) for details. + + +## Performance + +Here we compare our Depth Anything with the previously best MiDaS v3.1 BEiTL-512 model. + +Please note that the latest MiDaS is also trained on KITTI and NYUv2, while we do not. 
+ +| Method | Params | KITTI || NYUv2 || Sintel || DDAD || ETH3D || DIODE || +|-|-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:| +| | | AbsRel | $\delta_1$ | AbsRel | $\delta_1$ | AbsRel | $\delta_1$ | AbsRel | $\delta_1$ | AbsRel | $\delta_1$ | AbsRel | $\delta_1$ | +| MiDaS | 345.0M | 0.127 | 0.850 | 0.048 | *0.980* | 0.587 | 0.699 | 0.251 | 0.766 | 0.139 | 0.867 | 0.075 | 0.942 | +| **Ours-S** | 24.8M | 0.080 | 0.936 | 0.053 | 0.972 | 0.464 | 0.739 | 0.247 | 0.768 | 0.127 | **0.885** | 0.076 | 0.939 | +| **Ours-B** | 97.5M | *0.080* | *0.939* | *0.046* | 0.979 | **0.432** | *0.756* | *0.232* | *0.786* | **0.126** | *0.884* | *0.069* | *0.946* | +| **Ours-L** | 335.3M | **0.076** | **0.947** | **0.043** | **0.981** | *0.458* | **0.760** | **0.230** | **0.789** | *0.127* | 0.882 | **0.066** | **0.952** | + +We highlight the **best** and *second best* results in **bold** and *italic* respectively (**better results**: AbsRel $\downarrow$ , $\delta_1 \uparrow$). + +## Pre-trained models + +We provide three models of varying scales for robust relative depth estimation: + +| Model | Params | Inference Time on V100 (ms) | A100 | RTX4090 ([TensorRT](https://github.com/spacewalk01/depth-anything-tensorrt)) | +|:-|-:|:-:|:-:|:-:| +| Depth-Anything-Small | 24.8M | 12 | 8 | 3 | +| Depth-Anything-Base | 97.5M | 13 | 9 | 6 | +| Depth-Anything-Large | 335.3M | 20 | 13 | 12 | + +Note that the V100 and A100 inference time (*without TensorRT*) is computed by excluding the pre-processing and post-processing stages, whereas the last column RTX4090 (*with TensorRT*) is computed by including these two stages (please refer to [Depth-Anything-TensorRT](https://github.com/spacewalk01/depth-anything-tensorrt)). + +You can easily load our pre-trained models by: +```python +from depth_anything.dpt import DepthAnything + +encoder = 'vits' # can also be 'vitb' or 'vitl' +depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{:}14'.format(encoder)) +``` + +Depth Anything is also supported in [``transformers``](https://github.com/huggingface/transformers). You can use it for depth prediction within [3 lines of code](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)). + +### *No network connection, cannot load these models?* + +
+Click here for solutions + +- First, manually download the three checkpoints: [depth-anything-large](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints/depth_anything_vitl14.pth), [depth-anything-base](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints/depth_anything_vitb14.pth), and [depth-anything-small](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints/depth_anything_vits14.pth). + +- Second, upload the folder containing the checkpoints to your remote server. + +- Lastly, load the model locally: +```python +from depth_anything.dpt import DepthAnything + +model_configs = { + 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, + 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, + 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]} +} + +encoder = 'vitl' # or 'vitb', 'vits' +depth_anything = DepthAnything(model_configs[encoder]) +depth_anything.load_state_dict(torch.load(f'./checkpoints/depth_anything_{encoder}14.pth')) +``` +Note that in this locally loading manner, you also do not have to install the ``huggingface_hub`` package. In this way, please feel free to delete this [line](https://github.com/LiheYoung/Depth-Anything/blob/e7ef4b4b7a0afd8a05ce9564f04c1e5b68268516/depth_anything/dpt.py#L5) and the ``PyTorchModelHubMixin`` in this [line](https://github.com/LiheYoung/Depth-Anything/blob/e7ef4b4b7a0afd8a05ce9564f04c1e5b68268516/depth_anything/dpt.py#L169). +
+
+
+## Usage
+
+### Installation
+
+```bash
+git clone https://github.com/LiheYoung/Depth-Anything
+cd Depth-Anything
+pip install -r requirements.txt
+```
+
+### Running
+
+```bash
+python run.py --encoder <vits | vitb | vitl> --img-path <img-path> --outdir <outdir> [--pred-only] [--grayscale]
+```
+Arguments:
+- ``--img-path``: you can either 1) point it to an image directory containing all the images you are interested in, 2) point it to a single image, or 3) point it to a text file listing the image paths.
+- ``--pred-only``: save only the predicted depth map. Without it, the image and its depth map are visualized side by side by default.
+- ``--grayscale``: save the depth map in grayscale. Without it, a color palette is applied to the depth map by default.
+
+For example:
+```bash
+python run.py --encoder vitl --img-path assets/examples --outdir depth_vis
+```
+
+**If you want to use Depth Anything on videos:**
+```bash
+python run_video.py --encoder vitl --video-path assets/examples_video --outdir video_depth_vis
+```
+
+### Gradio demo
+
+To use our Gradio demo locally:
+
+```bash
+python app.py
+```
+
+You can also try our [online demo](https://huggingface.co/spaces/LiheYoung/Depth-Anything).
+
+### Import Depth Anything into your project
+
+If you want to use Depth Anything in your own project, you can simply follow [``run.py``](run.py) to load our models and define the data pre-processing.
+
+Code snippet (note the difference between our data pre-processing and that of MiDaS) + +```python +from depth_anything.dpt import DepthAnything +from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet + +import cv2 +import torch +from torchvision.transforms import Compose + +encoder = 'vits' # can also be 'vitb' or 'vitl' +depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{:}14'.format(encoder)).eval() + +transform = Compose([ + Resize( + width=518, + height=518, + resize_target=False, + keep_aspect_ratio=True, + ensure_multiple_of=14, + resize_method='lower_bound', + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + PrepareForNet(), +]) + +image = cv2.cvtColor(cv2.imread('your image path'), cv2.COLOR_BGR2RGB) / 255.0 +image = transform({'image': image})['image'] +image = torch.from_numpy(image).unsqueeze(0) + +# depth shape: 1xHxW +depth = depth_anything(image) +``` +
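+If you then want the depth map at the original image resolution and saved to disk, you can post-process the prediction the same way [``run.py``](run.py) does. Below is a minimal sketch of that idea; it assumes the variables from the snippet above (``depth_anything`` and the transformed ``image`` tensor), and the output filenames are just placeholders:
+
+```python
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+raw_image = cv2.imread('your image path')  # same file as in the snippet above
+h, w = raw_image.shape[:2]                 # original resolution
+
+with torch.no_grad():
+    depth = depth_anything(image)          # 1xHxW relative depth
+
+# upsample to the original resolution and rescale to 0-255 for visualization
+depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]
+depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
+depth = depth.cpu().numpy().astype(np.uint8)
+
+cv2.imwrite('depth_gray.png', depth)                                             # grayscale depth map
+cv2.imwrite('depth_color.png', cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO))   # colored depth map
+```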
+ +### Do not want to define image pre-processing or download model definition files? + +Easily use Depth Anything through [``transformers``](https://github.com/huggingface/transformers) within 3 lines of code! Please refer to [these instructions](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)). + +**Note:** If you encounter ``KeyError: 'depth_anything'``, please install the latest [``transformers``](https://github.com/huggingface/transformers) from source: +```bash +pip install git+https://github.com/huggingface/transformers.git +``` +
+Click here for a brief demo: + +```python +from transformers import pipeline +from PIL import Image + +image = Image.open('Your-image-path') +pipe = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf") +depth = pipe(image)["depth"] +``` +
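+For reference, the ``"depth"`` entry returned by the pipeline is a ``PIL.Image`` visualization, while the raw values are exposed as a tensor under ``"predicted_depth"``. A small usage sketch, assuming the snippet above:
+
+```python
+result = pipe(image)
+result["depth"].save("depth.png")   # 8-bit depth visualization
+raw = result["predicted_depth"]     # torch.Tensor with the raw relative depth
+```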
+ +## Community Support + +**We sincerely appreciate all the extensions built on our Depth Anything from the community. Thank you a lot!** + +Here we list the extensions we have found: +- Depth Anything TensorRT: + - https://github.com/spacewalk01/depth-anything-tensorrt + - https://github.com/thinvy/DepthAnythingTensorrtDeploy + - https://github.com/daniel89710/trt-depth-anything +- Depth Anything ONNX: https://github.com/fabio-sim/Depth-Anything-ONNX +- Depth Anything in Transformers.js (3D visualization): https://huggingface.co/spaces/Xenova/depth-anything-web +- Depth Anything for video (online demo): https://huggingface.co/spaces/JohanDL/Depth-Anything-Video +- Depth Anything in ControlNet WebUI: https://github.com/Mikubill/sd-webui-controlnet +- Depth Anything in ComfyUI's ControlNet: https://github.com/Fannovel16/comfyui_controlnet_aux +- Depth Anything in X-AnyLabeling: https://github.com/CVHub520/X-AnyLabeling +- Depth Anything in OpenXLab: https://openxlab.org.cn/apps/detail/yyfan/depth_anything +- Depth Anything in OpenVINO: https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/280-depth-anything +- Depth Anything ROS: + - https://github.com/scepter914/DepthAnything-ROS + - https://github.com/polatztrk/depth_anything_ros +- Depth Anything Android: + - https://github.com/FeiGeChuanShu/ncnn-android-depth_anything + - https://github.com/shubham0204/Depth-Anything-Android +- Depth Anything in TouchDesigner: https://github.com/olegchomp/TDDepthAnything +- LearnOpenCV research article on Depth Anything: https://learnopencv.com/depth-anything +- Learn more about the DPT architecture we used: https://github.com/heyoeyo/muggled_dpt +- Depth Anything in NVIDIA Jetson Orin: https://github.com/ZhuYaoHui1998/jetson-examples/blob/main/reComputer/scripts/depth-anything + + +If you have your amazing projects supporting or improving (*e.g.*, speed) Depth Anything, please feel free to drop an issue. We will add them here. + + +## Acknowledgement + +We would like to express our deepest gratitude to [AK(@_akhaliq)](https://twitter.com/_akhaliq) and the awesome HuggingFace team ([@niels](https://huggingface.co/nielsr), [@hysts](https://huggingface.co/hysts), and [@yuvraj](https://huggingface.co/ysharma)) for helping improve the online demo and build the HF models. + +Besides, we thank the [MagicEdit](https://magic-edit.github.io/) team for providing some video examples for video depth estimation, and [Tiancheng Shen](https://scholar.google.com/citations?user=iRY1YVoAAAAJ) for evaluating the depth maps with MagicEdit. 
+ +## Citation + +If you find this project useful, please consider citing: + +```bibtex +@inproceedings{depthanything, + title={Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data}, + author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang}, + booktitle={CVPR}, + year={2024} +} +``` diff --git a/computervision/depths/README.md:Zone.Identifier b/computervision/depths/README.md:Zone.Identifier new file mode 100644 index 0000000..e69de29 diff --git a/computervision/depths/app.ipynb b/computervision/depths/app.ipynb new file mode 100644 index 0000000..47508e7 --- /dev/null +++ b/computervision/depths/app.ipynb @@ -0,0 +1,55 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "from depth_anything.dpt import DepthAnything\n", + "from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet\n", + "\n", + "import cv2\n", + "import torch\n", + "from torchvision.transforms import Compose\n", + "\n", + "encoder = 'vits' # can also be 'vitb' or 'vitl'\n", + "depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{:}14'.format(encoder)).eval()\n", + "\n", + "transform = Compose([\n", + " Resize(\n", + " width=518,\n", + " height=518,\n", + " resize_target=False,\n", + " keep_aspect_ratio=True,\n", + " ensure_multiple_of=14,\n", + " resize_method='lower_bound',\n", + " image_interpolation_method=cv2.INTER_CUBIC,\n", + " ),\n", + " NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + " PrepareForNet(),\n", + "])\n", + "\n", + "image = cv2.cvtColor(cv2.imread('DSC01677.jpg'), cv2.COLOR_BGR2RGB) / 255.0\n", + "image = transform({'image': image})['image']\n", + "image = torch.from_numpy(image).unsqueeze(0)\n", + "\n", + "# depth shape: 1xHxW\n", + "depth = depth_anything(image)\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/computervision/depths/app.ipynb:Zone.Identifier b/computervision/depths/app.ipynb:Zone.Identifier new file mode 100644 index 0000000..e69de29 diff --git a/computervision/depths/app.py b/computervision/depths/app.py new file mode 100644 index 0000000..35e85e8 --- /dev/null +++ b/computervision/depths/app.py @@ -0,0 +1,95 @@ +import gradio as gr +import cv2 +import numpy as np +import os +from PIL import Image +import torch +import torch.nn.functional as F +from torchvision.transforms import Compose +import tempfile +from gradio_imageslider import ImageSlider + +from depth_anything.dpt import DepthAnything +from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet + +css = """ +#img-display-container { + max-height: 100vh; + } +#img-display-input { + max-height: 80vh; + } +#img-display-output { + max-height: 80vh; + } +""" +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' +model = DepthAnything.from_pretrained('LiheYoung/depth_anything_vitl14').to(DEVICE).eval() + +title = "# Depth Anything" +description = """Official demo for **Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data**. 
+ +Please refer to our [paper](https://arxiv.org/abs/2401.10891), [project page](https://depth-anything.github.io), or [github](https://github.com/LiheYoung/Depth-Anything) for more details.""" + +transform = Compose([ + Resize( + width=518, + height=518, + resize_target=False, + keep_aspect_ratio=True, + ensure_multiple_of=14, + resize_method='lower_bound', + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + PrepareForNet(), +]) + +@torch.no_grad() +def predict_depth(model, image): + return model(image) + +with gr.Blocks(css=css) as demo: + gr.Markdown(title) + gr.Markdown(description) + gr.Markdown("### Depth Prediction demo") + gr.Markdown("You can slide the output to compare the depth prediction with input image") + + with gr.Row(): + input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input') + depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output', position=0.5) + raw_file = gr.File(label="16-bit raw depth (can be considered as disparity)") + submit = gr.Button("Submit") + + def on_submit(image): + original_image = image.copy() + + h, w = image.shape[:2] + + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 + image = transform({'image': image})['image'] + image = torch.from_numpy(image).unsqueeze(0).to(DEVICE) + + depth = predict_depth(model, image) + depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0] + + raw_depth = Image.fromarray(depth.cpu().numpy().astype('uint16')) + tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False) + raw_depth.save(tmp.name) + + depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 + depth = depth.cpu().numpy().astype(np.uint8) + colored_depth = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)[:, :, ::-1] + + return [(original_image, colored_depth), tmp.name] + + submit.click(on_submit, inputs=[input_image], outputs=[depth_image_slider, raw_file]) + + example_files = os.listdir('assets/examples') + example_files.sort() + example_files = [os.path.join('assets/examples', filename) for filename in example_files] + examples = gr.Examples(examples=example_files, inputs=[input_image], outputs=[depth_image_slider, raw_file], fn=on_submit, cache_examples=False) + + +if __name__ == '__main__': + demo.queue().launch() \ No newline at end of file diff --git a/computervision/depths/app.py:Zone.Identifier b/computervision/depths/app.py:Zone.Identifier new file mode 100644 index 0000000..e69de29 diff --git a/computervision/depths/depth_anything/blocks.py b/computervision/depths/depth_anything/blocks.py new file mode 100644 index 0000000..38dbcfe --- /dev/null +++ b/computervision/depths/depth_anything/blocks.py @@ -0,0 +1,153 @@ +import torch.nn as nn + + +def _make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + if len(in_shape) >= 4: + out_shape4 = out_shape + + if expand: + out_shape1 = out_shape + out_shape2 = out_shape*2 + out_shape3 = out_shape*4 + if len(in_shape) >= 4: + out_shape4 = out_shape*8 + + scratch.layer1_rn = nn.Conv2d( + in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + scratch.layer2_rn = nn.Conv2d( + in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + scratch.layer3_rn = nn.Conv2d( + in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, 
groups=groups + ) + if len(in_shape) >= 4: + scratch.layer4_rn = nn.Conv2d( + in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + + return scratch + + +class ResidualConvUnit(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features, activation, bn): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups=1 + + self.conv1 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups + ) + + self.conv2 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups + ) + + if self.bn==True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn==True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn==True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + + +class FeatureFusionBlock(nn.Module): + """Feature fusion block. + """ + + def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True, size=None): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock, self).__init__() + + self.deconv = deconv + self.align_corners = align_corners + + self.groups=1 + + self.expand = expand + out_features = features + if self.expand==True: + out_features = features//2 + + self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) + + self.resConfUnit1 = ResidualConvUnit(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + self.size=size + + def forward(self, *xs, size=None): + """Forward pass. 
+ + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + output = self.skip_add.add(output, res) + + output = self.resConfUnit2(output) + + if (size is None) and (self.size is None): + modifier = {"scale_factor": 2} + elif size is None: + modifier = {"size": self.size} + else: + modifier = {"size": size} + + output = nn.functional.interpolate( + output, **modifier, mode="bilinear", align_corners=self.align_corners + ) + + output = self.out_conv(output) + + return output diff --git a/computervision/depths/depth_anything/blocks.py:Zone.Identifier b/computervision/depths/depth_anything/blocks.py:Zone.Identifier new file mode 100644 index 0000000..e69de29 diff --git a/computervision/depths/depth_anything/dpt.py b/computervision/depths/depth_anything/dpt.py new file mode 100644 index 0000000..56b9545 --- /dev/null +++ b/computervision/depths/depth_anything/dpt.py @@ -0,0 +1,187 @@ +import argparse +import torch +import torch.nn as nn +import torch.nn.functional as F +from huggingface_hub import PyTorchModelHubMixin, hf_hub_download + +from depth_anything.blocks import FeatureFusionBlock, _make_scratch + + +def _make_fusion_block(features, use_bn, size = None): + return FeatureFusionBlock( + features, + nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + size=size, + ) + + +class DPTHead(nn.Module): + def __init__(self, nclass, in_channels, features=256, use_bn=False, out_channels=[256, 512, 1024, 1024], use_clstoken=False): + super(DPTHead, self).__init__() + + self.nclass = nclass + self.use_clstoken = use_clstoken + + self.projects = nn.ModuleList([ + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channel, + kernel_size=1, + stride=1, + padding=0, + ) for out_channel in out_channels + ]) + + self.resize_layers = nn.ModuleList([ + nn.ConvTranspose2d( + in_channels=out_channels[0], + out_channels=out_channels[0], + kernel_size=4, + stride=4, + padding=0), + nn.ConvTranspose2d( + in_channels=out_channels[1], + out_channels=out_channels[1], + kernel_size=2, + stride=2, + padding=0), + nn.Identity(), + nn.Conv2d( + in_channels=out_channels[3], + out_channels=out_channels[3], + kernel_size=3, + stride=2, + padding=1) + ]) + + if use_clstoken: + self.readout_projects = nn.ModuleList() + for _ in range(len(self.projects)): + self.readout_projects.append( + nn.Sequential( + nn.Linear(2 * in_channels, in_channels), + nn.GELU())) + + self.scratch = _make_scratch( + out_channels, + features, + groups=1, + expand=False, + ) + + self.scratch.stem_transpose = None + + self.scratch.refinenet1 = _make_fusion_block(features, use_bn) + self.scratch.refinenet2 = _make_fusion_block(features, use_bn) + self.scratch.refinenet3 = _make_fusion_block(features, use_bn) + self.scratch.refinenet4 = _make_fusion_block(features, use_bn) + + head_features_1 = features + head_features_2 = 32 + + if nclass > 1: + self.scratch.output_conv = nn.Sequential( + nn.Conv2d(head_features_1, head_features_1, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(head_features_1, nclass, kernel_size=1, stride=1, padding=0), + ) + else: + self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1) + + self.scratch.output_conv2 = nn.Sequential( + nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(True), + nn.Identity(), + ) + + def forward(self, 
out_features, patch_h, patch_w): + out = [] + for i, x in enumerate(out_features): + if self.use_clstoken: + x, cls_token = x[0], x[1] + readout = cls_token.unsqueeze(1).expand_as(x) + x = self.readout_projects[i](torch.cat((x, readout), -1)) + else: + x = x[0] + + x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w)) + + x = self.projects[i](x) + x = self.resize_layers[i](x) + + out.append(x) + + layer_1, layer_2, layer_3, layer_4 = out + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:]) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:]) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:]) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + out = self.scratch.output_conv1(path_1) + out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True) + out = self.scratch.output_conv2(out) + + return out + + +class DPT_DINOv2(nn.Module): + def __init__(self, encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024], use_bn=False, use_clstoken=False, localhub=True): + super(DPT_DINOv2, self).__init__() + + assert encoder in ['vits', 'vitb', 'vitl'] + + # in case the Internet connection is not stable, please load the DINOv2 locally + if localhub: + self.pretrained = torch.hub.load('torchhub/facebookresearch_dinov2_main', 'dinov2_{:}14'.format(encoder), source='local', pretrained=False) + else: + self.pretrained = torch.hub.load('facebookresearch/dinov2', 'dinov2_{:}14'.format(encoder)) + + dim = self.pretrained.blocks[0].attn.qkv.in_features + + self.depth_head = DPTHead(1, dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken) + + def forward(self, x): + h, w = x.shape[-2:] + + features = self.pretrained.get_intermediate_layers(x, 4, return_class_token=True) + + patch_h, patch_w = h // 14, w // 14 + + depth = self.depth_head(features, patch_h, patch_w) + depth = F.interpolate(depth, size=(h, w), mode="bilinear", align_corners=True) + depth = F.relu(depth) + + return depth.squeeze(1) + + +class DepthAnything(DPT_DINOv2, PyTorchModelHubMixin): + def __init__(self, config): + super().__init__(**config) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + "--encoder", + default="vits", + type=str, + choices=["vits", "vitb", "vitl"], + ) + args = parser.parse_args() + + model = DepthAnything.from_pretrained("LiheYoung/depth_anything_{:}14".format(args.encoder)) + + print(model) + \ No newline at end of file diff --git a/computervision/depths/depth_anything/dpt.py:Zone.Identifier b/computervision/depths/depth_anything/dpt.py:Zone.Identifier new file mode 100644 index 0000000..e69de29 diff --git a/computervision/depths/depth_anything/util/transform.py b/computervision/depths/depth_anything/util/transform.py new file mode 100644 index 0000000..7beab14 --- /dev/null +++ b/computervision/depths/depth_anything/util/transform.py @@ -0,0 +1,248 @@ +import random +from PIL import Image, ImageOps, ImageFilter +import torch +from torchvision import transforms +import torch.nn.functional as F + +import numpy as np +import cv2 +import math + + +def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA): + """Rezise the sample to ensure the given size. Keeps aspect ratio. 
+ + Args: + sample (dict): sample + size (tuple): image size + + Returns: + tuple: new size + """ + shape = list(sample["disparity"].shape) + + if shape[0] >= size[0] and shape[1] >= size[1]: + return sample + + scale = [0, 0] + scale[0] = size[0] / shape[0] + scale[1] = size[1] / shape[1] + + scale = max(scale) + + shape[0] = math.ceil(scale * shape[0]) + shape[1] = math.ceil(scale * shape[1]) + + # resize + sample["image"] = cv2.resize( + sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method + ) + + sample["disparity"] = cv2.resize( + sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST + ) + sample["mask"] = cv2.resize( + sample["mask"].astype(np.float32), + tuple(shape[::-1]), + interpolation=cv2.INTER_NEAREST, + ) + sample["mask"] = sample["mask"].astype(bool) + + return tuple(shape) + + +class Resize(object): + """Resize sample to given size (width, height). + """ + + def __init__( + self, + width, + height, + resize_target=True, + keep_aspect_ratio=False, + ensure_multiple_of=1, + resize_method="lower_bound", + image_interpolation_method=cv2.INTER_AREA, + ): + """Init. + + Args: + width (int): desired output width + height (int): desired output height + resize_target (bool, optional): + True: Resize the full sample (image, mask, target). + False: Resize image only. + Defaults to True. + keep_aspect_ratio (bool, optional): + True: Keep the aspect ratio of the input sample. + Output sample might not have the given width and height, and + resize behaviour depends on the parameter 'resize_method'. + Defaults to False. + ensure_multiple_of (int, optional): + Output width and height is constrained to be multiple of this parameter. + Defaults to 1. + resize_method (str, optional): + "lower_bound": Output will be at least as large as the given size. + "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) + "minimal": Scale as least as possible. (Output size might be smaller than given size.) + Defaults to "lower_bound". 
+ """ + self.__width = width + self.__height = height + + self.__resize_target = resize_target + self.__keep_aspect_ratio = keep_aspect_ratio + self.__multiple_of = ensure_multiple_of + self.__resize_method = resize_method + self.__image_interpolation_method = image_interpolation_method + + def constrain_to_multiple_of(self, x, min_val=0, max_val=None): + y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if max_val is not None and y > max_val: + y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if y < min_val: + y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) + + return y + + def get_size(self, width, height): + # determine new height and width + scale_height = self.__height / height + scale_width = self.__width / width + + if self.__keep_aspect_ratio: + if self.__resize_method == "lower_bound": + # scale such that output size is lower bound + if scale_width > scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "upper_bound": + # scale such that output size is upper bound + if scale_width < scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "minimal": + # scale as least as possbile + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + else: + raise ValueError( + f"resize_method {self.__resize_method} not implemented" + ) + + if self.__resize_method == "lower_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, min_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, min_val=self.__width + ) + elif self.__resize_method == "upper_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, max_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, max_val=self.__width + ) + elif self.__resize_method == "minimal": + new_height = self.constrain_to_multiple_of(scale_height * height) + new_width = self.constrain_to_multiple_of(scale_width * width) + else: + raise ValueError(f"resize_method {self.__resize_method} not implemented") + + return (new_width, new_height) + + def __call__(self, sample): + width, height = self.get_size( + sample["image"].shape[1], sample["image"].shape[0] + ) + + # resize sample + sample["image"] = cv2.resize( + sample["image"], + (width, height), + interpolation=self.__image_interpolation_method, + ) + + if self.__resize_target: + if "disparity" in sample: + sample["disparity"] = cv2.resize( + sample["disparity"], + (width, height), + interpolation=cv2.INTER_NEAREST, + ) + + if "depth" in sample: + sample["depth"] = cv2.resize( + sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST + ) + + if "semseg_mask" in sample: + # sample["semseg_mask"] = cv2.resize( + # sample["semseg_mask"], (width, height), interpolation=cv2.INTER_NEAREST + # ) + sample["semseg_mask"] = F.interpolate(torch.from_numpy(sample["semseg_mask"]).float()[None, None, ...], (height, width), mode='nearest').numpy()[0, 0] + + if "mask" in sample: + sample["mask"] = cv2.resize( + sample["mask"].astype(np.float32), + (width, height), + interpolation=cv2.INTER_NEAREST, + ) + # sample["mask"] = sample["mask"].astype(bool) + + # print(sample['image'].shape, sample['depth'].shape) + return sample + + +class NormalizeImage(object): + """Normlize 
image by given mean and std. + """ + + def __init__(self, mean, std): + self.__mean = mean + self.__std = std + + def __call__(self, sample): + sample["image"] = (sample["image"] - self.__mean) / self.__std + + return sample + + +class PrepareForNet(object): + """Prepare sample for usage as network input. + """ + + def __init__(self): + pass + + def __call__(self, sample): + image = np.transpose(sample["image"], (2, 0, 1)) + sample["image"] = np.ascontiguousarray(image).astype(np.float32) + + if "mask" in sample: + sample["mask"] = sample["mask"].astype(np.float32) + sample["mask"] = np.ascontiguousarray(sample["mask"]) + + if "depth" in sample: + depth = sample["depth"].astype(np.float32) + sample["depth"] = np.ascontiguousarray(depth) + + if "semseg_mask" in sample: + sample["semseg_mask"] = sample["semseg_mask"].astype(np.float32) + sample["semseg_mask"] = np.ascontiguousarray(sample["semseg_mask"]) + + return sample diff --git a/computervision/depths/depth_anything/util/transform.py:Zone.Identifier b/computervision/depths/depth_anything/util/transform.py:Zone.Identifier new file mode 100644 index 0000000..e69de29 diff --git a/computervision/depths/depth_map.jpg b/computervision/depths/depth_map.jpg new file mode 100644 index 0000000..dde57d2 Binary files /dev/null and b/computervision/depths/depth_map.jpg differ diff --git a/computervision/depths/depth_map.jpg:Zone.Identifier b/computervision/depths/depth_map.jpg:Zone.Identifier new file mode 100644 index 0000000..e69de29 diff --git a/computervision/depths/dybdeimage.png b/computervision/depths/dybdeimage.png new file mode 100644 index 0000000..9e07859 Binary files /dev/null and b/computervision/depths/dybdeimage.png differ diff --git a/computervision/depths/dybdeimage.png:Zone.Identifier b/computervision/depths/dybdeimage.png:Zone.Identifier new file mode 100644 index 0000000..e69de29 diff --git a/computervision/depths/dybdeimage2.png b/computervision/depths/dybdeimage2.png new file mode 100644 index 0000000..9e07859 Binary files /dev/null and b/computervision/depths/dybdeimage2.png differ diff --git a/computervision/depths/dybdeimage2.png:Zone.Identifier b/computervision/depths/dybdeimage2.png:Zone.Identifier new file mode 100644 index 0000000..e69de29 diff --git a/computervision/depths/dybdeimage3.png b/computervision/depths/dybdeimage3.png new file mode 100644 index 0000000..9e07859 Binary files /dev/null and b/computervision/depths/dybdeimage3.png differ diff --git a/computervision/depths/dybdeimage3.png:Zone.Identifier b/computervision/depths/dybdeimage3.png:Zone.Identifier new file mode 100644 index 0000000..e69de29 diff --git a/computervision/depths/gallery.md b/computervision/depths/gallery.md new file mode 100644 index 0000000..43b35b0 --- /dev/null +++ b/computervision/depths/gallery.md @@ -0,0 +1,160 @@ +# $Depth$ $Anything$ ${\color{crimson}G\color{coral}a\color{royalblue}l\color{olive}l\color{teal}e\color{navy}r\color{plum}y}$ + + + +Here we exhibit awesome community showcases of Depth Anything. Thank all the users for sharing them on the Internet (mainly from Twitter). + +We organize these cases into three groups: [**image**](#image), [**video**](#video), and [**3D**](#3d). + + +## Image + +You can click on the titles below to be directed to corresponding source pages. 
+ +### [Monument Valley](https://twitter.com/weebney/status/1749541957108441309) + + + +### [Cyber rabbit monitoring screens](https://twitter.com/hayas1357/status/1749298607260316139) + + + +### [Astronaut cat](https://twitter.com/nanase_ja/status/1749653152406884392) + + + +### [Animation images](https://twitter.com/PlayShingo/status/1750368475867128200) + + + +### [DALL·E bear](https://twitter.com/letalvoj/status/1749341999646347741) + + + +### [Cat](https://twitter.com/sajilobroker/status/1749364184419016846) + + + +### [Surprised bald man](https://twitter.com/mayfer/status/1749712454408679780) + + + +### [Minecraft](https://twitter.com/BarlowTwin/status/1749353070008693224) + + + +### [Robotic knight amidst lightning](https://twitter.com/IterIntellectus/status/1749432836158021738) + + + +### [Football game](https://twitter.com/AB9Mamun/status/1751202608545456235) + + + +### [Classical raft painting](https://twitter.com/acidbjazz/status/1749491155698331774) + + + +### [Diner scene](https://twitter.com/R0b0tSp1der/status/1749301061964435846) + + + +### [Elon Musk](https://twitter.com/ai_for_success/status/1749304903418482954) + + + +### [Painted tunnel](https://twitter.com/NodiMend/status/1750800040304492814) + + + +### [Iron man](https://twitter.com/ai_for_success/status/1749304906664808751) + + + +### [Skull](https://twitter.com/ai_for_success/status/1749304909730906381) + + + +### [Chibi cat-eared character](https://twitter.com/nanase_ja/status/1749484958522204605) + + + +### [Exuberant gamer celebration](https://twitter.com/hmaon/status/1749372352016625748) + + + +### [Ocean](https://twitter.com/jarrahorphin/status/1749878678111309870) + + + +### [Aerial images](https://twitter.com/lTlanual/status/1749641678124892384) + + + +### [Grilled chicken skewers](https://twitter.com/promptlord/status/1752323556409856157) + + + +### [Artistic images](https://twitter.com/ZainHasan6/status/1753553755998416933) + + + +### [Iconic distracted man](https://twitter.com/ZainHasan6/status/1749308193237303620) + + + +### [Eye-stalked](https://twitter.com/RJdoesVR/status/1749494967800590780) + + + +### [Tearful green frog](https://twitter.com/qsdnl/status/1749298425064313080) + + + + +## Video + +For more online showcases, please refer to https://twitter.com/WilliamLamkin/status/1755623301907460582. + +The videos below may be slow to load. Please wait a moment. + +### [Racing game](https://twitter.com/i/status/1750683014152040853) + + + +### [Building](https://twitter.com/WayneINR/status/1750945037863551247) + + + +### [nuScenes](https://github.com/scepter914/DepthAnything-ROS) + + + +### [Indoor moving](https://twitter.com/PINTO03091/status/1750162506453041437) + + + + +## 3D + +The videos below may be slow to load. Please wait a moment. + +### [3D visualization](https://twitter.com/victormustar/status/1753008143469093212) + +

+
+
+### [2D videos to 3D videos](https://twitter.com/stspanho/status/1751709292913143895)
+
+
+
+### Reconstruction
+
+- [case1](https://twitter.com/Artoid_XYZ/status/1751542601772421378)
+
+
+
+- [case2](https://twitter.com/DennisLoevlie/status/1753846358463709489)
+
+
diff --git a/computervision/depths/gallery.md:Zone.Identifier b/computervision/depths/gallery.md:Zone.Identifier
new file mode 100644
index 0000000..e69de29
diff --git a/computervision/depths/newapp.py b/computervision/depths/newapp.py
new file mode 100644
index 0000000..3d4e168
--- /dev/null
+++ b/computervision/depths/newapp.py
@@ -0,0 +1,70 @@
+from depth_anything.dpt import DepthAnything
+from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
+import numpy as np
+import cv2
+import torch
+from torchvision.transforms import Compose
+
+encoder = 'vits' # can also be 'vitb' or 'vitl'
+depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{:}14'.format(encoder)).eval()
+
+transform = Compose([
+    Resize(
+        width=518,
+        height=518,
+        resize_target=False,
+        keep_aspect_ratio=True,
+        ensure_multiple_of=14,
+        resize_method='lower_bound',
+        image_interpolation_method=cv2.INTER_CUBIC,
+    ),
+    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    PrepareForNet(),
+])
+
+image = cv2.cvtColor(cv2.imread('DSC01677.jpg'), cv2.COLOR_BGR2RGB) / 255.0
+image = transform({'image': image})['image']
+image = torch.from_numpy(image).unsqueeze(0)
+
+# depth shape: 1xHxW
+depth = depth_anything(image)
+# The model returns a torch.Tensor of relative depth (not a dict),
+# so it can be inspected directly.
+print(depth)
+
+
+
+def save_tensor_as_image(tensor, filename):
+    # Convert to NumPy: detach from the computation graph and move to the CPU
+    tensor = tensor.detach().cpu().numpy()
+
+    # Remove singleton dimensions
+    tensor = np.squeeze(tensor)
+
+    # Normalize to 0-255 and convert to uint8
+    tensor_norm = (tensor - np.min(tensor)) / (np.max(tensor) - np.min(tensor))
+    tensor_uint8 = (tensor_norm * 255).astype(np.uint8)
+
+    # Save the image
+    cv2.imwrite(filename, tensor_uint8)
+
+# Use the function to save the images
+save_tensor_as_image(depth, "dybdeimage3.png")
+save_tensor_as_image(depth, "depth_map.jpg")
+
+
+
+"""
+tensor = depth.cpu().numpy() # make sure tensor is on cpu
+cv2.imwrite(tensor, "Dybde.png")
+
+depth_map = depth
+# Ensure depth_map is a numpy array (if not already)
+# save depth map
+cv2.imwrite("depth_map.jpg", depth_map)
+
+"""
+
+
diff --git a/computervision/depths/newapp.py:Zone.Identifier b/computervision/depths/newapp.py:Zone.Identifier
new file mode 100644
index 0000000..e69de29
diff --git a/computervision/depths/requirements.txt b/computervision/depths/requirements.txt
new file mode 100644
index 0000000..4044895
--- /dev/null
+++ b/computervision/depths/requirements.txt
@@ -0,0 +1,6 @@
+gradio_imageslider
+gradio==4.14.0
+torch
+torchvision
+opencv-python
+huggingface_hub
\ No newline at end of file
diff --git a/computervision/depths/requirements.txt:Zone.Identifier b/computervision/depths/requirements.txt:Zone.Identifier
new file mode 100644
index 0000000..e69de29
diff --git a/computervision/depths/run.py b/computervision/depths/run.py
new file mode 100644
index 0000000..06c7716
--- /dev/null
+++ b/computervision/depths/run.py
@@ -0,0 +1,113 @@
+import argparse
+import cv2
+import numpy as np
+import os
+import torch
+import torch.nn.functional as F
+from torchvision.transforms import Compose
+from tqdm import tqdm
+
+from depth_anything.dpt import 
+from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--img-path', type=str)
+    parser.add_argument('--outdir', type=str, default='./vis_depth')
+    parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl'])
+
+    parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction')
+    parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette')
+
+    args = parser.parse_args()
+
+    margin_width = 50
+    caption_height = 60
+
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    font_scale = 1
+    font_thickness = 2
+
+    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{}14'.format(args.encoder)).to(DEVICE).eval()
+
+    total_params = sum(param.numel() for param in depth_anything.parameters())
+    print('Total parameters: {:.2f}M'.format(total_params / 1e6))
+
+    transform = Compose([
+        Resize(
+            width=518,
+            height=518,
+            resize_target=False,
+            keep_aspect_ratio=True,
+            ensure_multiple_of=14,
+            resize_method='lower_bound',
+            image_interpolation_method=cv2.INTER_CUBIC,
+        ),
+        NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+        PrepareForNet(),
+    ])
+
+    if os.path.isfile(args.img_path):
+        if args.img_path.endswith('txt'):
+            with open(args.img_path, 'r') as f:
+                filenames = f.read().splitlines()
+        else:
+            filenames = [args.img_path]
+    else:
+        filenames = os.listdir(args.img_path)
+        filenames = [os.path.join(args.img_path, filename) for filename in filenames if not filename.startswith('.')]
+        filenames.sort()
+
+    os.makedirs(args.outdir, exist_ok=True)
+
+    for filename in tqdm(filenames):
+        raw_image = cv2.imread(filename)
+        image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0
+
+        h, w = image.shape[:2]
+
+        image = transform({'image': image})['image']
+        image = torch.from_numpy(image).unsqueeze(0).to(DEVICE)
+
+        with torch.no_grad():
+            depth = depth_anything(image)
+
+        depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]
+        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
+
+        depth = depth.cpu().numpy().astype(np.uint8)
+
+        if args.grayscale:
+            depth = np.repeat(depth[..., np.newaxis], 3, axis=-1)
+        else:
+            depth = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
+
+        filename = os.path.basename(filename)
+
+        if args.pred_only:
+            cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_depth.png'), depth)
+        else:
+            split_region = np.ones((raw_image.shape[0], margin_width, 3), dtype=np.uint8) * 255
+            combined_results = cv2.hconcat([raw_image, split_region, depth])
+
+            caption_space = np.ones((caption_height, combined_results.shape[1], 3), dtype=np.uint8) * 255
+            captions = ['Raw image', 'Depth Anything']
+            segment_width = w + margin_width
+
+            for i, caption in enumerate(captions):
+                # Calculate text size
+                text_size = cv2.getTextSize(caption, font, font_scale, font_thickness)[0]
+
+                # Calculate x-coordinate to center the text
+                text_x = int((segment_width * i) + (w - text_size[0]) / 2)
+
+                # Add text caption
+                cv2.putText(caption_space, caption, (text_x, 40), font, font_scale, (0, 0, 0), font_thickness)
+
+            final_result = cv2.vconcat([caption_space, combined_results])
+
+            cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_img_depth.png'), final_result)
+
\ No newline at end of file
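A note on the post-processing in run.py above: Depth Anything predicts relative depth, so the script min-max normalizes each prediction to 0-255 before either stacking it to grayscale or applying a colormap. Below is a minimal standalone sketch of that step; the random input array and output filenames are placeholders for illustration, not part of the repository.

```python
import cv2
import numpy as np

# Placeholder for a model prediction: an H x W float depth map
depth = np.random.rand(480, 640).astype(np.float32)

# Min-max normalize to 0-255 and convert to uint8, as run.py does
depth_u8 = ((depth - depth.min()) / (depth.max() - depth.min()) * 255.0).astype(np.uint8)

# Either keep grayscale (stacked to 3 channels) or apply the inferno palette,
# mirroring the --grayscale switch in run.py
gray = np.repeat(depth_u8[..., np.newaxis], 3, axis=-1)
colored = cv2.applyColorMap(depth_u8, cv2.COLORMAP_INFERNO)

cv2.imwrite('depth_gray.png', gray)
cv2.imwrite('depth_inferno.png', colored)
```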
diff --git a/computervision/depths/run.py:Zone.Identifier b/computervision/depths/run.py:Zone.Identifier
new file mode 100644
index 0000000..e69de29
diff --git a/computervision/depths/run_video.py b/computervision/depths/run_video.py
new file mode 100644
index 0000000..16edb66
--- /dev/null
+++ b/computervision/depths/run_video.py
@@ -0,0 +1,94 @@
+import argparse
+import cv2
+import numpy as np
+import os
+import torch
+import torch.nn.functional as F
+from torchvision.transforms import Compose
+
+from depth_anything.dpt import DepthAnything
+from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--video-path', type=str)
+    parser.add_argument('--outdir', type=str, default='./vis_video_depth')
+    parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl'])
+
+    args = parser.parse_args()
+
+    margin_width = 50
+
+    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{}14'.format(args.encoder)).to(DEVICE).eval()
+
+    total_params = sum(param.numel() for param in depth_anything.parameters())
+    print('Total parameters: {:.2f}M'.format(total_params / 1e6))
+
+    transform = Compose([
+        Resize(
+            width=518,
+            height=518,
+            resize_target=False,
+            keep_aspect_ratio=True,
+            ensure_multiple_of=14,
+            resize_method='lower_bound',
+            image_interpolation_method=cv2.INTER_CUBIC,
+        ),
+        NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+        PrepareForNet(),
+    ])
+
+    if os.path.isfile(args.video_path):
+        if args.video_path.endswith('txt'):
+            with open(args.video_path, 'r') as f:
+                # One video path per line
+                filenames = f.read().splitlines()
+        else:
+            filenames = [args.video_path]
+    else:
+        filenames = os.listdir(args.video_path)
+        filenames = [os.path.join(args.video_path, filename) for filename in filenames if not filename.startswith('.')]
+        filenames.sort()
+
+    os.makedirs(args.outdir, exist_ok=True)
+
+    for k, filename in enumerate(filenames):
+        print('Progress {:}/{:},'.format(k+1, len(filenames)), 'Processing', filename)
+
+        raw_video = cv2.VideoCapture(filename)
+        frame_width, frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frame_rate = int(raw_video.get(cv2.CAP_PROP_FPS))
+        output_width = frame_width * 2 + margin_width
+
+        filename = os.path.basename(filename)
+        output_path = os.path.join(args.outdir, filename[:filename.rfind('.')] + '_video_depth.mp4')
+        out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (output_width, frame_height))
+
+        while raw_video.isOpened():
+            ret, raw_frame = raw_video.read()
+            if not ret:
+                break
+
+            frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2RGB) / 255.0
+
+            frame = transform({'image': frame})['image']
+            frame = torch.from_numpy(frame).unsqueeze(0).to(DEVICE)
+
+            with torch.no_grad():
+                depth = depth_anything(frame)
+
+            depth = F.interpolate(depth[None], (frame_height, frame_width), mode='bilinear', align_corners=False)[0, 0]
+            depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
+
+            depth = depth.cpu().numpy().astype(np.uint8)
+            depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
+
+            split_region = np.ones((frame_height, margin_width, 3), dtype=np.uint8) * 255
+            combined_frame = cv2.hconcat([raw_frame, split_region, depth_color])
+
+            out.write(combined_frame)
+
+        raw_video.release()
+        out.release()
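run_video.py writes the original frame and the colorized depth side by side, separated by a white margin, which is why the output video width is `2 * frame_width + margin_width`. A minimal sketch of that composition step follows; the zero-filled frames, the 30 fps rate, and the output filename are placeholders, not values taken from the repository.

```python
import cv2
import numpy as np

frame_height, frame_width, margin_width = 480, 640, 50

# Placeholders for a video frame and its colorized depth map (BGR, uint8)
raw_frame = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)
depth_color = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)

# White separator strip, then horizontal concatenation, as in run_video.py
split_region = np.ones((frame_height, margin_width, 3), dtype=np.uint8) * 255
combined_frame = cv2.hconcat([raw_frame, split_region, depth_color])

# The writer must be opened with the combined width
out = cv2.VideoWriter('side_by_side.mp4', cv2.VideoWriter_fourcc(*'mp4v'),
                      30, (frame_width * 2 + margin_width, frame_height))
out.write(combined_frame)
out.release()
```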
diff --git a/computervision/depths/run_video.py:Zone.Identifier b/computervision/depths/run_video.py:Zone.Identifier
new file mode 100644
index 0000000..e69de29
diff --git a/computervision/objectdetection/yolov8.py b/computervision/objectdetection/yolov8.py
new file mode 100644
index 0000000..bb68fa8
--- /dev/null
+++ b/computervision/objectdetection/yolov8.py
@@ -0,0 +1,64 @@
+from ultralytics import YOLO
+import cv2
+import math
+# open the input video (a recorded webcam clip); the size hints below only apply to live cameras
+cap = cv2.VideoCapture("WIN_20250225_22_33_58_Pro.mp4")
+cap.set(3, 640)
+cap.set(4, 480)
+
+# model
+model = YOLO("yolo-Weights/yolov8n.pt")
+
+# object classes (COCO)
+classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
+              "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
+              "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
+              "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
+              "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
+              "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
+              "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
+              "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
+              "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
+              "teddy bear", "hair drier", "toothbrush"
+              ]
+
+
+while True:
+    success, img = cap.read()
+    if not success:
+        # stop when the video ends or the frame cannot be read
+        break
+
+    results = model(img, stream=True)
+
+    # coordinates
+    for r in results:
+        boxes = r.boxes
+
+        for box in boxes:
+            # bounding box
+            x1, y1, x2, y2 = box.xyxy[0]
+            x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)  # convert to int values
+
+            # draw the box on the frame
+            cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 3)
+
+            # confidence
+            confidence = math.ceil((box.conf[0]*100))/100
+            print("Confidence --->", confidence)
+
+            # class name
+            cls = int(box.cls[0])
+            print("Class name -->", classNames[cls])
+
+            # object details
+            org = (x1, y1)
+            font = cv2.FONT_HERSHEY_SIMPLEX
+            fontScale = 1
+            color = (255, 0, 0)
+            thickness = 2
+
+            cv2.putText(img, classNames[cls], org, font, fontScale, color, thickness)
+
+    cv2.imshow('Webcam', img)
+    if cv2.waitKey(1) == ord('q'):
+        break
+
+cap.release()
+cv2.destroyAllWindows()
\ No newline at end of file
diff --git a/computervision/objectdetection/yolov8.py:Zone.Identifier b/computervision/objectdetection/yolov8.py:Zone.Identifier
new file mode 100644
index 0000000..e69de29
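yolov8.py above runs detection on a recorded clip; the same loop works on a live webcam by opening device index 0 instead of a file. The sketch below assumes a working camera at index 0 and, instead of the manual rectangle/text drawing used in yolov8.py, uses the Ultralytics `Results.plot()` helper to render boxes, class names, and confidences; treat it as an illustrative variant, not part of the repository.

```python
from ultralytics import YOLO
import cv2

# Assumption: device index 0 is a webcam; the weights path mirrors yolov8.py above
model = YOLO("yolo-Weights/yolov8n.pt")
cap = cv2.VideoCapture(0)
cap.set(3, 640)   # requested capture width (honored by live cameras)
cap.set(4, 480)   # requested capture height

while True:
    success, frame = cap.read()
    if not success:
        break

    # Run detection and let Ultralytics draw the annotations
    results = model(frame)
    annotated = results[0].plot()

    cv2.imshow('Webcam', annotated)
    if cv2.waitKey(1) == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
```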