import json
import os
import platform
import sys

import numpy as np
import requests
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
# _get_vector_norm is a private helper in transformers' CLIP implementation; it computes
# the L2 norm used to normalize the image/text embeddings.
from transformers.models.clip.modeling_clip import _get_vector_norm

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Example image from the COCO validation set.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# padding="max_length" pads the text to CLIP's fixed 77-token context, which keeps the
# shapes static for the traced and exported models below.
inputs = processor(text="two cats on a pink blanket", images=image, return_tensors="pt", padding="max_length", truncation=True)
np_inputs = {k: v.numpy() for k, v in inputs.data.items()}
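
# Sanity check added here (not part of the original flow): the exports below all assume
# static shapes, so confirm what the processor produced for this checkpoint,
# (1, 3, 224, 224) pixels and (1, 77) token tensors.
assert inputs.data['pixel_values'].shape == (1, 3, 224, 224)
assert inputs.data['input_ids'].shape == (1, 77)
assert inputs.data['attention_mask'].shape == (1, 77)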


class VisionModel(torch.nn.Module):
    """Wraps CLIP's vision tower and projection head so it can be traced as a standalone image encoder."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        vision_outputs = self.model.vision_model(x)
        pooled_output = vision_outputs.pooler_output
        image_features = self.model.visual_projection(pooled_output)
        # L2-normalize so the dot product with the text embedding is a cosine similarity.
        image_features = image_features / _get_vector_norm(image_features)
        return image_features

    def eval(self):
        # Module.eval() already propagates to children; the explicit calls just make the intent obvious.
        self.model.eval()
        self.model.vision_model.eval()
        self.model.visual_projection.eval()
        return super().eval()


class TextModel(torch.nn.Module):
    """Wraps CLIP's text tower and projection head so it can be traced as a standalone text encoder."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        text_outputs = self.model.text_model(input_ids, attention_mask)
        pooled_output = text_outputs.pooler_output
        text_features = self.model.text_projection(pooled_output)
        text_features = text_features / _get_vector_norm(text_features)
        return text_features

    def eval(self):
        self.model.eval()
        self.model.text_model.eval()
        self.model.text_projection.eval()
        return super().eval()


torch.set_grad_enabled(False)
ptmodel = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# Trace both encoders with example inputs; the traced graphs are the common starting
# point for every export below.
with torch.no_grad():
    vision = VisionModel(ptmodel)
    vision.eval()
    traced_vision_model = torch.jit.trace(vision, inputs.data['pixel_values'])

    text = TextModel(ptmodel)
    text.eval()
    traced_text_model = torch.jit.trace(text, (inputs.data['input_ids'], inputs.data['attention_mask']))
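
# Optional check (added, not in the original script): compare the traced encoders against the
# reference get_image_features / get_text_features outputs, normalized the same way the
# wrappers normalize, before exporting anything.
_ref_img = ptmodel.get_image_features(pixel_values=inputs.data['pixel_values'])
_ref_img = _ref_img / _ref_img.norm(p=2, dim=-1, keepdim=True)
_ref_txt = ptmodel.get_text_features(input_ids=inputs.data['input_ids'], attention_mask=inputs.data['attention_mask'])
_ref_txt = _ref_txt / _ref_txt.norm(p=2, dim=-1, keepdim=True)
assert torch.allclose(traced_vision_model(inputs.data['pixel_values']), _ref_img, atol=1e-4)
assert torch.allclose(traced_text_model(inputs.data['input_ids'], inputs.data['attention_mask']), _ref_txt, atol=1e-4)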


def convert_coreml():
    import coremltools as ct

    # Convert the traced vision encoder; the example shape fixes a static input size.
    coreml_model = ct.convert(traced_vision_model, inputs=[ct.TensorType(shape=inputs.data['pixel_values'].shape)])
    coreml_model.save('vision.mlpackage')

    coreml_model = ct.convert(traced_text_model, inputs=[ct.TensorType(shape=inputs.data['input_ids'].shape), ct.TensorType(shape=inputs.data['attention_mask'].shape)])
    coreml_model.save('text.mlpackage')
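
# Added variant (a sketch, assuming coremltools' `outputs=` naming support): the same
# conversion as convert_coreml(), but with explicit tensor names so inference does not
# depend on auto-generated keys like 'var_877'.
def convert_coreml_named():
    import coremltools as ct
    coreml_model = ct.convert(
        traced_vision_model,
        inputs=[ct.TensorType(name="pixel_values", shape=inputs.data['pixel_values'].shape)],
        outputs=[ct.TensorType(name="image_embeds")],
    )
    coreml_model.save('vision_named.mlpackage')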


def infer_coreml():
    import coremltools as ct

    coreml_vision_model = ct.models.MLModel('vision.mlpackage')
    coreml_text_model = ct.models.MLModel('text.mlpackage')

    # The converted models take float32 tensors, hence the cast of the integer token inputs.
    vision_predictions = coreml_vision_model.predict({'x': np_inputs['pixel_values']})
    text_predictions = coreml_text_model.predict({'input_ids_1': np_inputs['input_ids'].astype(np.float32), 'attention_mask_1': np_inputs['attention_mask'].astype(np.float32)})

    # 'var_877' / 'var_1050' are the auto-generated output names from this particular conversion.
    image_embeds = vision_predictions['var_877']
    text_embeds = text_predictions['var_1050']

    # Both embeddings are already L2-normalized, so the dot product is the cosine similarity.
    logits_per_text = text_embeds @ image_embeds.T

    print("similarity:", logits_per_text.item())


def convert_onnx():
    torch.onnx.export(traced_vision_model, inputs.data['pixel_values'], "vision.onnx")
    # The second example input must be the attention mask, not input_ids twice.
    torch.onnx.export(traced_text_model, (inputs.data['input_ids'], inputs.data['attention_mask']), "text.onnx")


def infer_onnx():
    import onnxruntime as ort

    # Pick execution providers by platform; onnxruntime falls back to CPU for anything unavailable.
    providers = []
    if sys.platform == "darwin":
        providers.append("CoreMLExecutionProvider")

    # Note: checking `"win" in sys.platform` would also match "darwin", so test the exact values.
    if sys.platform in ("linux", "win32") and platform.machine() in ("x86_64", "AMD64"):
        providers.append(("CUDAExecutionProvider", {"device_id": 0}))

    providers.append("CPUExecutionProvider")

    vision_session = ort.InferenceSession("vision.onnx", providers=providers)
    text_session = ort.InferenceSession("text.onnx", providers=providers)

    vision_inputs = {vision_session.get_inputs()[0].name: np_inputs['pixel_values']}
    text_inputs = {
        text_session.get_inputs()[0].name: np_inputs['input_ids'],
        text_session.get_inputs()[1].name: np_inputs['attention_mask']
    }

    vision_predictions = vision_session.run(None, vision_inputs)
    text_predictions = text_session.run(None, text_inputs)

    image_embeds = vision_predictions[0]
    text_embeds = text_predictions[0]

    logits_per_text = text_embeds @ image_embeds.T

    print("similarity:", logits_per_text.item())


def convert_openvino():
    import openvino as ov

    # Make sure the target directory exists before saving the IR files.
    os.makedirs("openvino", exist_ok=True)

    ov_vision_model = ov.convert_model(traced_vision_model, example_input=inputs.data['pixel_values'])
    ov.save_model(ov_vision_model, "openvino/vision.xml")

    ov_text_model = ov.convert_model(traced_text_model, example_input=(inputs.data['input_ids'], inputs.data['attention_mask']))
    ov.save_model(ov_text_model, "openvino/text.xml")


def infer_openvino():
    import openvino as ov

    core = ov.Core()
    ov_vision_model = core.read_model("openvino/vision.xml")
    ov_text_model = core.read_model("openvino/text.xml")

    compiled_vision_model = core.compile_model(ov_vision_model, "CPU")
    compiled_text_model = core.compile_model(ov_text_model, "CPU")

    vision_predictions = compiled_vision_model(inputs.data['pixel_values'])
    text_predictions = compiled_text_model((inputs.data['input_ids'], inputs.data['attention_mask']))

    # Index 0 selects the first (and only) output of each model.
    image_embeds = vision_predictions[0]
    text_embeds = text_predictions[0]

    logits_per_text = text_embeds @ image_embeds.T

    print("similarity:", logits_per_text.item())


def export_openvino_int8():
    import openvino as ov
    import nncf

    # Local modules (not shown here) that provide calibration samples for each encoder.
    import text_calibration
    import image_calibration

    core = ov.Core()
    ov_vision_model = core.read_model("openvino/vision.xml")
    ov_text_model = core.read_model("openvino/text.xml")

    vision_calibration_dataset = image_calibration.get_image_calibration_data()
    text_calibration_dataset = text_calibration.get_text_calibration_data()

    vision_dataset = nncf.Dataset(vision_calibration_dataset)
    text_dataset = nncf.Dataset(text_calibration_dataset)

    # model_type=TRANSFORMER makes NNCF preserve accuracy-critical transformer patterns;
    # the MIXED preset quantizes weights symmetrically and activations asymmetrically.
    quantized_vision_model = nncf.quantize(ov_vision_model, vision_dataset, preset=nncf.QuantizationPreset.MIXED, model_type=nncf.ModelType.TRANSFORMER)
    quantized_text_model = nncf.quantize(ov_text_model, text_dataset, preset=nncf.QuantizationPreset.MIXED, model_type=nncf.ModelType.TRANSFORMER)

    ov.save_model(quantized_vision_model, "openvino/vision_int8.xml")
    ov.save_model(quantized_text_model, "openvino/text_int8.xml")


export_openvino_int8()
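
# Added sketch: image_calibration / text_calibration are not included in this script.
# nncf.Dataset only needs an iterable whose items match the model inputs, so a minimal
# vision-side source could look roughly like this (hypothetical helper; calibration_images
# would be a list of PIL images supplied by the caller).
def make_image_calibration_data(calibration_images):
    return [processor(images=img, return_tensors="np")["pixel_values"] for img in calibration_images]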


def infer_openvino_int8():
    import openvino as ov

    core = ov.Core()
    ov_vision_model = core.read_model("openvino/vision_int8.xml")
    ov_text_model = core.read_model("openvino/text_int8.xml")

    compiled_vision_model = core.compile_model(ov_vision_model, "CPU")
    compiled_text_model = core.compile_model(ov_text_model, "CPU")

    vision_predictions = compiled_vision_model(inputs.data['pixel_values'])
    text_predictions = compiled_text_model((inputs.data['input_ids'], inputs.data['attention_mask']))

    image_embeds = vision_predictions[0]
    text_embeds = text_predictions[0]

    logits_per_text = text_embeds @ image_embeds.T

    print("similarity:", logits_per_text.item())


infer_openvino_int8()


def export_ncnn():
    # pnnx converts a TorchScript file into ncnn param/bin files; the pnnx binary must be on PATH.
    traced_vision_model.save("vision.pt")
    input_shape_str = json.dumps(list(inputs.data['pixel_values'].shape)).replace(" ", "")
    os.system(f"pnnx vision.pt 'inputshape={input_shape_str}'")

    traced_text_model.save("text.pt")
    input_shape_str = json.dumps(list(inputs.data['input_ids'].shape)).replace(" ", "")
    input_shape2_str = json.dumps(list(inputs.data['attention_mask'].shape)).replace(" ", "")
    os.system(f"pnnx text.pt 'inputshape={input_shape_str}i64,{input_shape2_str}i64'")


def infer_ncnn():
    import ncnn

    vision_net = ncnn.Net()
    vision_net.load_param("vision.ncnn.param")
    vision_net.load_model("vision.ncnn.bin")

    text_net = ncnn.Net()
    text_net.load_param("text.ncnn.param")
    text_net.load_model("text.ncnn.bin")

    # ncnn works on unbatched tensors, so drop the leading batch dimension. ncnn Mats hold
    # 32-bit elements, so the int64 token tensors cannot be passed as-is; they are fed as float32.
    vision_mat = ncnn.Mat(np_inputs['pixel_values'][0])
    text_input_ids_mat = ncnn.Mat(np_inputs['input_ids'][0].astype(np.float32))
    text_attention_mask_mat = ncnn.Mat(np_inputs['attention_mask'][0].astype(np.float32))

    # Inference goes through an Extractor created from each Net.
    vision_ex = vision_net.create_extractor()
    vision_ex.input(vision_net.input_names()[0], vision_mat)

    text_ex = text_net.create_extractor()
    text_ex.input(text_net.input_names()[0], text_input_ids_mat)
    text_ex.input(text_net.input_names()[1], text_attention_mask_mat)

    # extract() returns (return_code, ncnn.Mat); convert the Mats to numpy for the dot product.
    _, image_out = vision_ex.extract("out0")
    _, text_out = text_ex.extract("out0")

    image_embeds = np.array(image_out).reshape(1, -1)
    text_embeds = np.array(text_out).reshape(1, -1)

    logits_per_text = text_embeds @ image_embeds.T

    print("similarity:", logits_per_text.item())


def infer_torch():
    outputs = ptmodel(**inputs)
    logits_per_image = outputs.logits_per_image
    # With a single text prompt the softmax over dim=1 is trivially 1.0; it only becomes
    # informative when several candidate captions are passed to the processor.
    probs = logits_per_image.softmax(dim=1)
    print(probs)
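
# Added variant (sketch): logits_per_image is the cosine similarity multiplied by the model's
# learned temperature (logit_scale). Dividing the temperature back out reports the same raw
# cosine-similarity number that the exported backends above print.
def infer_torch_cosine():
    outputs = ptmodel(**inputs)
    scale = ptmodel.logit_scale.exp().item()
    print("similarity:", outputs.logits_per_image.item() / scale)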