# clip/export.py
import json
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel
from transformers.models.clip.modeling_clip import _get_vector_norm
import torch
import numpy as np
import platform
import sys
import os
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
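# Note (assumption about intent): padding="max_length" pads the text input to the
# tokenizer's fixed maximum length (77 tokens for this CLIP checkpoint), so the
# traced/exported graphs below see a static sequence length.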
inputs = processor(text="two cats on a pink blanket", images=image, return_tensors="pt", padding="max_length", truncation=True)
np_inputs = {k: v.numpy() for k, v in inputs.data.items()}
class VisionModel(torch.nn.Module):
    def __init__(self, model):
        super(VisionModel, self).__init__()
        self.model = model

    def forward(self, x):
        model = self.model
        vision_outputs = model.vision_model.forward(x)
        pooled_output = vision_outputs.pooler_output
        image_features = self.model.visual_projection(pooled_output)
        image_features = image_features / _get_vector_norm(image_features)
        return image_features

    def eval(self):
        self.model.eval()
        self.model.vision_model.eval()
        self.model.visual_projection.eval()
        return super().eval()
class TextModel(torch.nn.Module):
    def __init__(self, model):
        super(TextModel, self).__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        model = self.model
        text_outputs = model.text_model.forward(input_ids, attention_mask)
        pooled_output = text_outputs.pooler_output
        text_features = self.model.text_projection(pooled_output)
        text_features = text_features / _get_vector_norm(text_features)
        return text_features

    def eval(self):
        self.model.eval()
        self.model.text_model.eval()
        self.model.text_projection.eval()
        return super().eval()
torch.set_grad_enabled(False)
ptmodel = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
with torch.no_grad():
    vision = VisionModel(ptmodel)
    vision.eval()
    traced_vision_model = torch.jit.trace(vision, inputs.data['pixel_values'])
    text = TextModel(ptmodel)
    text.eval()
    traced_text_model = torch.jit.trace(text, (inputs.data['input_ids'], inputs.data['attention_mask']))
def convert_coreml():
    import coremltools as ct
    coreml_model = ct.convert(traced_vision_model, inputs=[ct.TensorType(shape=inputs.data['pixel_values'].shape)])
    coreml_model.save('vision.mlpackage')
    coreml_model = ct.convert(traced_text_model, inputs=[ct.TensorType(shape=inputs.data['input_ids'].shape), ct.TensorType(shape=inputs.data['attention_mask'].shape)])
    coreml_model.save('text.mlpackage')
# convert_coreml()
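# Untested sketch (assumption): .mlpackage (ML Program) conversions default to
# float16 compute precision in coremltools, which can shift the similarity slightly
# versus PyTorch. Precision can be pinned to float32 if needed:
def convert_coreml_fp32():
    import coremltools as ct
    coreml_model = ct.convert(
        traced_vision_model,
        inputs=[ct.TensorType(shape=inputs.data['pixel_values'].shape)],
        convert_to="mlprogram",
        compute_precision=ct.precision.FLOAT32,
    )
    coreml_model.save('vision_fp32.mlpackage')
# convert_coreml_fp32()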
def infer_coreml():
    import coremltools as ct
    coreml_vision_model = ct.models.MLModel('vision.mlpackage')
    coreml_text_model = ct.models.MLModel('text.mlpackage')
    vision_predictions = coreml_vision_model.predict({'x': np_inputs['pixel_values']})
    text_predictions = coreml_text_model.predict({'input_ids_1': np_inputs['input_ids'].astype(np.float32), 'attention_mask_1': np_inputs['attention_mask'].astype(np.float32)})
    image_embeds = vision_predictions['var_877']
    text_embeds = text_predictions['var_1050']
    # Compute logits
    logits_per_text = text_embeds @ image_embeds.T
    print("similarity:", logits_per_text.item())
def convert_onnx():
    torch.onnx.export(traced_vision_model, inputs.data['pixel_values'], "vision.onnx")
    torch.onnx.export(traced_text_model, (inputs.data['input_ids'], inputs.data['attention_mask']), "text.onnx")
# convert_onnx()
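# Untested variant (assumption): the exports above bake the example batch size into
# the graph. torch.onnx.export can mark the batch dimension as dynamic instead, e.g.:
def convert_onnx_dynamic_batch():
    torch.onnx.export(
        traced_vision_model, inputs.data['pixel_values'], "vision_dynamic.onnx",
        input_names=["pixel_values"], output_names=["image_embeds"],
        dynamic_axes={"pixel_values": {0: "batch"}, "image_embeds": {0: "batch"}},
    )
# convert_onnx_dynamic_batch()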
def infer_onnx():
    import onnxruntime as ort
    providers = []
    if sys.platform == "darwin":
        providers.append("CoreMLExecutionProvider")
    # CUDA is only relevant on x86_64 Linux/Windows; check exact platform names
    # ("win" is also a substring of "darwin")
    if sys.platform in ("linux", "win32") and platform.machine() in ("x86_64", "AMD64"):
        providers.append(("CUDAExecutionProvider", {"device_id": 0}))
    providers.append("CPUExecutionProvider")
    vision_session = ort.InferenceSession("vision.onnx", providers=providers)
    text_session = ort.InferenceSession("text.onnx", providers=providers)
    vision_inputs = {vision_session.get_inputs()[0].name: np_inputs['pixel_values']}
    text_inputs = {
        text_session.get_inputs()[0].name: np_inputs['input_ids'],
        text_session.get_inputs()[1].name: np_inputs['attention_mask']
    }
    vision_predictions = vision_session.run(None, vision_inputs)
    text_predictions = text_session.run(None, text_inputs)
    image_embeds = vision_predictions[0]
    text_embeds = text_predictions[0]
    logits_per_text = text_embeds @ image_embeds.T
    print("similarity:", logits_per_text.item())
# infer_onnx()
def convert_openvino():
    import openvino as ov
    os.makedirs("openvino", exist_ok=True)
    ov_vision_model = ov.convert_model(traced_vision_model, example_input=inputs.data['pixel_values'])
    ov.save_model(ov_vision_model, "openvino/vision.xml")
    ov_text_model = ov.convert_model(traced_text_model, example_input=(inputs.data['input_ids'], inputs.data['attention_mask']))
    ov.save_model(ov_text_model, "openvino/text.xml")
# convert_openvino()
def infer_openvino():
    import openvino as ov
    core = ov.Core()
    ov_vision_model = core.read_model("openvino/vision.xml")
    ov_text_model = core.read_model("openvino/text.xml")
    compiled_vision_model = core.compile_model(ov_vision_model, "CPU")
    compiled_text_model = core.compile_model(ov_text_model, "CPU")
    vision_predictions = compiled_vision_model(inputs.data['pixel_values'])
    text_predictions = compiled_text_model((inputs.data['input_ids'], inputs.data['attention_mask']))
    image_embeds = vision_predictions[0]
    text_embeds = text_predictions[0]
    logits_per_text = text_embeds @ image_embeds.T
    print("similarity:", logits_per_text.item())
# infer_openvino()
def export_openvino_int8():
    import openvino as ov
    import text_calibration
    import image_calibration
    import nncf
    core = ov.Core()
    ov_vision_model = core.read_model("openvino/vision.xml")
    ov_text_model = core.read_model("openvino/text.xml")
    vision_calibration_dataset = image_calibration.get_image_calibration_data()
    text_calibration_dataset = text_calibration.get_text_calibration_data()
    vision_dataset = nncf.Dataset(vision_calibration_dataset)
    text_dataset = nncf.Dataset(text_calibration_dataset)
    quantized_vision_model = nncf.quantize(
        ov_vision_model, vision_dataset,
        preset=nncf.QuantizationPreset.MIXED, model_type=nncf.ModelType.TRANSFORMER,
        # advanced_parameters=nncf.AdvancedQuantizationParameters(disable_bias_correction=True)
    )
    quantized_text_model = nncf.quantize(
        ov_text_model, text_dataset,
        preset=nncf.QuantizationPreset.MIXED, model_type=nncf.ModelType.TRANSFORMER,
        # advanced_parameters=nncf.AdvancedQuantizationParameters(disable_bias_correction=True)
    )
    ov.save_model(quantized_vision_model, "openvino/vision_int8.xml")
    ov.save_model(quantized_text_model, "openvino/text_int8.xml")
export_openvino_int8()
def infer_openvino_int8():
    import openvino as ov
    core = ov.Core()
    ov_vision_model = core.read_model("openvino/vision_int8.xml")
    ov_text_model = core.read_model("openvino/text_int8.xml")
    compiled_vision_model = core.compile_model(ov_vision_model, "CPU")
    compiled_text_model = core.compile_model(ov_text_model, "CPU")
    vision_predictions = compiled_vision_model(inputs.data['pixel_values'])
    text_predictions = compiled_text_model((inputs.data['input_ids'], inputs.data['attention_mask']))
    image_embeds = vision_predictions[0]
    text_embeds = text_predictions[0]
    logits_per_text = text_embeds @ image_embeds.T
    print("similarity:", logits_per_text.item())
infer_openvino_int8()
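# Untested sanity check (assumption): run the fp32 and int8 OpenVINO models back to
# back to see how much the similarity drifts after quantization.
def compare_openvino_fp32_int8():
    import openvino as ov
    core = ov.Core()
    for tag, vision_path, text_path in (
        ("fp32", "openvino/vision.xml", "openvino/text.xml"),
        ("int8", "openvino/vision_int8.xml", "openvino/text_int8.xml"),
    ):
        compiled_vision_model = core.compile_model(core.read_model(vision_path), "CPU")
        compiled_text_model = core.compile_model(core.read_model(text_path), "CPU")
        image_embeds = compiled_vision_model(inputs.data['pixel_values'])[0]
        text_embeds = compiled_text_model((inputs.data['input_ids'], inputs.data['attention_mask']))[0]
        print(tag, "similarity:", (text_embeds @ image_embeds.T).item())
# compare_openvino_fp32_int8()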
def export_ncnn():
    traced_vision_model.save("vision.pt")
    input_shape_str = json.dumps(list(inputs.data['pixel_values'].shape)).replace(" ", "")
    os.system(f"pnnx vision.pt 'inputshape={input_shape_str}'")
    traced_text_model.save("text.pt")
    input_shape_str = json.dumps(list(inputs.data['input_ids'].shape)).replace(" ", "")
    input_shape2_str = json.dumps(list(inputs.data['attention_mask'].shape)).replace(" ", "")
    os.system(f"pnnx text.pt 'inputshape={input_shape_str}i64,{input_shape2_str}i64'")
# export_ncnn()
def infer_ncnn():
    import ncnn
    vision_net = ncnn.Net()
    vision_net.load_param("vision.ncnn.param")
    vision_net.load_model("vision.ncnn.bin")
    text_net = ncnn.Net()
    text_net.load_param("text.ncnn.param")
    text_net.load_model("text.ncnn.bin")
    vision_mat = ncnn.Mat(inputs.data['pixel_values'].numpy())
    text_input_ids_mat = ncnn.Mat(inputs.data['input_ids'].numpy())
    text_attention_mask_mat = ncnn.Mat(inputs.data['attention_mask'].numpy())
    # ncnn runs inference through an Extractor created from each Net
    vision_extractor = vision_net.create_extractor()
    vision_extractor.input(vision_net.input_names()[0], vision_mat)
    text_extractor = text_net.create_extractor()
    text_extractor.input(text_net.input_names()[0], text_input_ids_mat)
    text_extractor.input(text_net.input_names()[1], text_attention_mask_mat)
    # extract() returns (ret, ncnn.Mat); convert the Mats to numpy for the matmul
    _, image_out = vision_extractor.extract("out0")
    _, text_out = text_extractor.extract("out0")
    image_embeds = np.array(image_out)
    text_embeds = np.array(text_out)
    logits_per_text = text_embeds @ image_embeds.T
    print("similarity:", logits_per_text[0])
# infer_ncnn()
def infer_torch():
    outputs = ptmodel(**inputs)
    logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
    probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
    print(probs)
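# infer_torch()

# Untested reference check (assumption): the traced PyTorch wrappers produce the same
# normalized embeddings directly, giving a baseline similarity value to compare the
# exported backends against.
def infer_torch_traced():
    image_embeds = traced_vision_model(inputs.data['pixel_values'])
    text_embeds = traced_text_model(inputs.data['input_ids'], inputs.data['attention_mask'])
    logits_per_text = text_embeds @ image_embeds.T
    print("similarity:", logits_per_text.item())
# infer_torch_traced()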