Commit a8398bb

build and run success

30 files changed: +52390 −0 lines

.gitignore

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
.vscode
weights
__pycache__
axpi_pro_bsp_sdk
build*
third_party

CMakeLists.txt

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
# set cmake_install_prefix path
if(NOT DEFINED CMAKE_INSTALL_PREFIX)
    set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation Directory")
endif()

message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}")

# set a default build type if none was specified
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build" FORCE)
endif()

# bsp
if(NOT BSP_MSP_DIR)
    set(BSP_MSP_DIR ${CMAKE_SOURCE_DIR}/axpi_pro_bsp_sdk/msp/out)
endif()

message(STATUS "BSP_MSP_DIR = ${BSP_MSP_DIR}")

# set(CMAKE_TOOLCHAIN_FILE ${CMAKE_SOURCE_DIR}/toolchains/aarch64-none-linux-gnu.toolchain.cmake)
cmake_minimum_required(VERSION 3.0)
project(OWLVIT-ONNX-AX650-CPP)

# add FLAGS that check the code for obvious bugs
# include(cmake/overlook.cmake)

add_compile_options(-std=c++17)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# link third-party libraries onnxruntime and opencv4
if(NOT ONNXRUNTIME_DIR)
    set(ONNXRUNTIME_DIR ${CMAKE_SOURCE_DIR}/third_party/onnxruntime)
endif()

message(STATUS "ONNXRUNTIME_DIR Path: ${ONNXRUNTIME_DIR}")

include_directories(${ONNXRUNTIME_DIR}/include)
link_directories("${ONNXRUNTIME_DIR}/lib")
set(ONNXRUNTIME_LIB onnxruntime)

if(NOT OpenCV_DIR)
    set(OpenCV_DIR ${CMAKE_SOURCE_DIR}/third_party/opencv-mobile-4.6.0-ubuntu-2004/lib/cmake/opencv4)
endif()

message(STATUS "OpenCV_DIR Path: ${OpenCV_DIR}")
find_package(OpenCV REQUIRED)

include_directories(${OpenCV_INCLUDE_DIRS})
include_directories(${BSP_MSP_DIR}/include)
link_directories(${BSP_MSP_DIR}/lib)

option(BUILD_WITH_AX650 "build with ax650" OFF)

if(BUILD_WITH_AX650)
    set(AXERA_TARGET_CHIP "AX650")
    add_compile_definitions(BUILD_WITH_AX650)
    add_compile_definitions(AXERA_TARGET_CHIP_AX650)
endif()

message(STATUS "BUILD_WITH_AX650 : ${BUILD_WITH_AX650}")

add_library(owlvit STATIC
    src/Runner/BaseRunner.cpp
    src/Runner/ax_model_runner_ax650.cpp
)

add_executable(main src/main.cpp)

target_link_libraries(main owlvit)

target_link_libraries(main ${OpenCV_LIBS})
target_link_libraries(main gomp ${ONNXRUNTIME_LIB})

if(BUILD_WITH_AX650)
    target_link_libraries(main ax_engine ax_interpreter ax_sys)
endif()

install(TARGETS main DESTINATION bin)

README.md

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
# OWLVIT

https://huggingface.co/docs/transformers/model_doc/owlvit

<img src="ssd_horse.jpg" height="320" /> <img src="result.jpg" height="320" />

## Build
```
mkdir build
cd build
```
For an x86 host with onnxruntime:
```
cmake -DONNXRUNTIME_DIR=${onnxruntime_dir} -DOpenCV_DIR=${opencv_cmake_file_dir} ..
```
For the AX650 target:
```
cmake -DONNXRUNTIME_DIR=${onnxruntime_dir} -DOpenCV_DIR=${opencv_cmake_file_dir} -DBSP_MSP_DIR=${msp_out_dir} -DBUILD_WITH_AX650=ON -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-none-linux-gnu.toolchain.cmake ..
```
```
make -j4
```
aarch64-none-gnu libraries:\
[onnxruntime](https://github.com/ZHEQIUSHUI/SAM-ONNX-AX650-CPP/releases/download/ax_models/onnxruntime-aarch64-none-gnu-1.16.0.zip)\
[opencv](https://github.com/ZHEQIUSHUI/SAM-ONNX-AX650-CPP/releases/download/ax_models/libopencv-4.6-aarch64-none.zip)

### Run
```
/opt/test/owlvit # ./main --ienc owlvit-image.axmodel --tenc owlvit-text.onnx -d owlvit-post.onnx -v vocab.txt -i ssd_horse.jpg -t text.txt
Engine creating handle is done.
Engine creating context is done.
Engine get io info is done.
Engine alloc io is done.
[I][ init][ 280]: BGR MODEL
[I][ load_image_encoder][ 17]: input size 768 768
[I][ load_image_encoder][ 29]: image feature len 442368
[I][ load_image_encoder][ 32]: pred box cnt 576
[I][ load_text_encoder][ 141]: text feature len 512
[I][ main][ 116]: image_src [ssd_horse.jpg]
[I][ main][ 117]: text_src [text.txt]
encode text Inference Cost time : 0.281856s
[I][ decode][ 239]: logits_size: 576
[I][ decode][ 239]: logits_size: 576
[I][ decode][ 239]: logits_size: 576
[I][ decode][ 239]: logits_size: 576
post Inference Cost time : 0.0981112s
a photo of person 268.899292 20.153463 88.163696 235.837906
a photo of person 428.696014 123.745819 19.836823 55.102310
horse 191.756058 55.418949 229.225601 318.581055
a photo of car 0.000000 98.398750 145.470108 92.571877
a photo of dog 145.470108 203.093140 57.306412 156.490570
```
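The prompt file passed with `-t text.txt` is not included in this commit (its exact format is defined by `src/main.cpp`). Judging from the labels printed in the log above, it presumably lists one text query per line, for example:
```
a photo of person
horse
a photo of car
a photo of dog
```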

result.jpg

158 KB

scripts/bpe_simple_vocab_16e6.txt.gz

1.29 MB
Binary file not shown.

scripts/hf_demo.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
import requests
from PIL import Image
import torch

from transformers import OwlViTProcessor, OwlViTForObjectDetection

processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

url = "./test.jpg"
image = Image.open(url)
texts = [["a photo of people"]]
inputs = processor(text=texts, images=image, return_tensors="pt")
outputs = model(**inputs)

# torch.onnx.export(model, (inputs["input_ids"], inputs["pixel_values"], inputs["attention_mask"]), "weights/owlvit.onnx", opset_version=14,
#                   input_names=["input_ids", "pixel_values", "attention_mask"],
#                   output_names=["logits", "pred_boxes", "text_embeds", "image_embeds"])

print(inputs)
# print(processor)

# Target image sizes (height, width) to rescale box predictions [batch_size, 2]
target_sizes = torch.Tensor([image.size[::-1]])
# Convert outputs (bounding boxes and class logits) to COCO API
results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
i = 0  # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
for box, score, label in zip(boxes, scores, labels):
    box = [round(i, 2) for i in box.tolist()]
    print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")

scripts/onnx_edit.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
# optimum-cli export onnx --model google/owlvit-base-patch32 --task zero-shot-object-detection weights/ --opset 16

import onnx

input_path = "weights/owlvit.onnx"
output_path = "weights/model-image.onnx"
input_names = ["pixel_values"]
output_names = ["image_embeds", "pred_boxes"]

onnx.utils.extract_model(input_path, output_path, input_names, output_names)

# output_path = "weights/model-text.onnx"
# input_names = ["input_ids", "attention_mask"]
# output_names = ["/owlvit/Div_output_0"]

# onnx.utils.extract_model(input_path, output_path, input_names, output_names)

output_path = "weights/model-post.onnx"
input_names = ["image_embeds", "input_ids", "/owlvit/Div_output_0"]
output_names = ["logits"]

onnx.utils.extract_model(input_path, output_path, input_names, output_names)
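As an optional sanity check (not part of this commit), each extracted sub-model can be opened with onnxruntime to confirm it exposes the expected inputs and outputs; `weights/model-image.onnx` below is the file produced by the first `extract_model` call above:
```
import onnxruntime

# List the I/O of one extracted sub-model to verify the graph split.
sess = onnxruntime.InferenceSession("weights/model-image.onnx")
for inp in sess.get_inputs():
    print("input :", inp.name, inp.shape, inp.type)
for out in sess.get_outputs():
    print("output:", out.name, out.shape, out.type)
```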

scripts/onnx_run.py

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
import onnxruntime


class onnx_inferencer:

    def __init__(self, model_path) -> None:
        self.onnx_model_sess = onnxruntime.InferenceSession(model_path)
        self.output_names = []
        self.input_names = []
        print(model_path)
        for i in range(len(self.onnx_model_sess.get_inputs())):
            self.input_names.append(self.onnx_model_sess.get_inputs()[i].name)
            print("    input:", i,
                  self.onnx_model_sess.get_inputs()[i].name, self.onnx_model_sess.get_inputs()[i].type,
                  self.onnx_model_sess.get_inputs()[i].shape)

        for i in range(len(self.onnx_model_sess.get_outputs())):
            self.output_names.append(
                self.onnx_model_sess.get_outputs()[i].name)
            print("    output:", i,
                  self.onnx_model_sess.get_outputs()[i].name, self.onnx_model_sess.get_outputs()[i].type,
                  self.onnx_model_sess.get_outputs()[i].shape)
        print("")

    def get_input_count(self):
        return len(self.input_names)

    def get_input_shape(self, idx: int):
        return self.onnx_model_sess.get_inputs()[idx].shape

    def get_input_names(self):
        return self.input_names

    def get_output_count(self):
        return len(self.output_names)

    def get_output_shape(self, idx: int):
        return self.onnx_model_sess.get_outputs()[idx].shape

    def get_output_names(self):
        return self.output_names

    def inference(self, tensor):
        return self.onnx_model_sess.run(
            self.output_names, input_feed={self.input_names[0]: tensor})

    def inference_multi_input(self, tensors: list):
        inputs = dict()
        for idx, tensor in enumerate(tensors):
            inputs[self.input_names[idx]] = tensor
        return self.onnx_model_sess.run(self.output_names, input_feed=inputs)


backbone = onnx_inferencer("weights/owlvit-image.onnx")
bert = onnx_inferencer("weights/owlvit-text.onnx")
transformer = onnx_inferencer("weights/owlvit-post.onnx")

import torchvision.transforms as T
from tokenizer import build_tokenizer
import torch
import cv2
import numpy as np
from PIL import Image


def load_image(image_path: str):
    transform = T.Compose(
        [
            T.Resize([768, 768]),
            T.ToTensor(),
            T.Normalize([0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711]),
        ]
    )
    image_source = Image.open(image_path).convert("RGB")
    image = np.asarray(image_source)
    image_transformed = transform(image_source)
    return image, image_transformed


tokenizer = build_tokenizer()

BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

IMAGE_PATH = "./test.jpg"
TEXT_PROMPT = ["football"]
image_source, image = load_image(IMAGE_PATH)
print(image.shape)
image_embeds, pred_boxes = backbone.inference(image.unsqueeze(0).numpy())

print(image_embeds[0].shape)

input_ids = np.array([tokenizer.encode(t) for t in TEXT_PROMPT]).reshape(-1)
print(input_ids)

input_ids = np.pad([49406, *input_ids, 49407], (0, 16 - len(input_ids) - 2))
print(input_ids)
mask = (input_ids > 0).astype(np.int64)

print(mask)

text_embeds = bert.inference_multi_input([input_ids.reshape(1, 16), mask.reshape(1, 16)])[0].reshape(1, -1)
print(text_embeds)
logits = transformer.inference_multi_input([image_embeds[0].reshape(1, 24, 24, 768), text_embeds, input_ids.reshape(1, 16)])[0]

logits = torch.Tensor(logits).sigmoid().numpy().reshape(-1)
pred_boxes = pred_boxes.reshape(-1, 4)

print(logits.shape)
print(pred_boxes.shape)

# get idx of boxes with confidence > BOX_TRESHOLD
idxs = np.where(logits > BOX_TRESHOLD)[0]
if len(idxs) == 0:
    print("no boxes found")
    exit()
print(idxs)
# print(logits[idx])
# print(pred_boxes[idx][0])
_h, _w, _ = image_source.shape
image_source = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
for idx in idxs:
    print(idx, pred_boxes[idx])
    xc, yc, w, h = pred_boxes[idx]
    xc *= _w
    yc *= _h
    w *= _w
    h *= _h

    cv2.rectangle(image_source, (int(xc - w / 2), int(yc - h / 2)), (int(xc + w / 2), int(yc + h / 2)), (0, 0, 255), 2)
cv2.imwrite("out.jpg", image_source)
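For clarity, the text branch in this script always receives a fixed 16-token sequence: CLIP's start token 49406, the prompt's BPE ids, the end token 49407, then zero padding, with the attention mask marking the non-zero positions. A minimal standalone illustration of that padding scheme (the prompt ids below are made up, not real tokenizer output):
```
import numpy as np

# Made-up BPE ids standing in for a real prompt; only the padding scheme matters here.
prompt_ids = [320, 1125, 539, 2533]
input_ids = np.pad([49406, *prompt_ids, 49407], (0, 16 - len(prompt_ids) - 2))
attention_mask = (input_ids > 0).astype(np.int64)
print(input_ids)       # [49406 320 1125 539 2533 49407 0 0 0 0 0 0 0 0 0 0]
print(attention_mask)  # [1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0]
```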

scripts/out.jpg

557 KB
