# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
from typing import NamedTuple
from pathlib import Path

import numpy
from PIL import Image
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform

from towhee.operator import Operator


class VisionTransformerEmbeddingOperator(Operator):
    """
    Embedding extractor using ViT.

    Args:
        model_name (`str`):
            Name of the timm ViT model to load.
        framework (`str`):
            Deep learning framework; only 'pytorch' is supported.
        weights_path (`str`):
            Path to local weights; if None, pretrained weights are used.
    """
    def __init__(self, model_name: str = 'vit_large_patch16_224',
                 framework: str = 'pytorch', weights_path: str = None) -> None:
        super().__init__()
        sys.path.append(str(Path(__file__).parent))
        if framework == 'pytorch':
            from vit_embedding.pytorch.model import Model
        else:
            # Fail loudly instead of hitting a NameError on `Model` below.
            raise ValueError('Unsupported framework: %s' % framework)
        self.model = Model(model_name, weights_path)
        # Build the preprocessing pipeline (resize, crop, normalization)
        # matching the model's pretraining configuration.
        config = resolve_data_config({}, model=self.model._model)
        self.tfms = create_transform(**config)

    def __call__(self, img_path: str) -> NamedTuple('Outputs', [('embedding', numpy.ndarray)]):
        Outputs = NamedTuple('Outputs', [('embedding', numpy.ndarray)])
        # Convert to RGB so grayscale/RGBA inputs survive the 3-channel
        # normalization, apply the transforms, and add a batch dimension.
        img = self.tfms(Image.open(img_path).convert('RGB')).unsqueeze(0)
        features = self.model(img)
        # The embedding is returned as a flat numpy array, not a torch.Tensor.
        return Outputs(features.flatten().detach().numpy())
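

# ---------------------------------------------------------------------------
# Minimal usage sketch; illustrative only and not part of the operator.
# Assumptions: timm can fetch pretrained 'vit_large_patch16_224' weights, and
# 'test.jpg' is a hypothetical local image path. The printed embedding size
# depends on how the framework-specific Model wrapper exposes ViT features.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    op = VisionTransformerEmbeddingOperator(model_name='vit_large_patch16_224')
    outputs = op('test.jpg')  # hypothetical image path
    print(outputs.embedding.shape)  # 1-D numpy array of ViT features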