SentenceTransformers可以给句子 和图片进行编码,采用BERT思路。
https://www.sbert.net/index.html
源码:https://github.com/UKPLab/sentence-transformers
pip install -U sentence-transformers
给图片进行编码:
import base64, io, requests
from PIL import Image
def loag_img_url(url):
    """Download an image from *url* and return it as an RGB PIL Image.

    The ``.convert('RGB')`` drops any alpha channel (e.g. RGBA PNGs).
    Without it, CLIP's preprocessing produces a (4, 224, 224) array that
    cannot broadcast against the 3-channel mean/std — exactly the
    ValueError shown in the traceback below.

    NOTE(review): the name keeps the original typo ("loag") so the
    existing call sites keep working.
    """
    res = requests.get(url)
    # Fail fast on HTTP errors instead of handing error pages to PIL.
    res.raise_for_status()
    return Image.open(io.BytesIO(res.content)).convert('RGB')
# The original snippet used SentenceTransformer without importing it,
# which raises NameError before anything else runs.
from sentence_transformers import SentenceTransformer

# Load the CLIP model (downloaded on first use) and encode one image.
image_model = SentenceTransformer('clip-ViT-B-32')
url = 'https://img2022.cnblogs.com/blog/1761971/202202/1761971-20220217211036046-443787327.png'
img = loag_img_url(url)
# encode() accepts a list of PIL images; returns one embedding per image.
img1_vec = image_model.encode([img])
img1_vec.shape
报错:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [19], in <module>
----> 1 img1_vec = image_model.encode([img])
2 img1_vec.shape
File ~/anaconda3/envs/BLIP/lib/python3.8/site-packages/sentence_transformers/SentenceTransformer.py:160, in SentenceTransformer.encode(self, sentences, batch_size, show_progress_bar, output_value, convert_to_numpy, convert_to_tensor, device, normalize_embeddings)
158 for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
159 sentences_batch = sentences_sorted[start_index:start_index+batch_size]
--> 160 features = self.tokenize(sentences_batch)
161 features = batch_to_device(features, device)
163 with torch.no_grad():
File ~/anaconda3/envs/BLIP/lib/python3.8/site-packages/sentence_transformers/SentenceTransformer.py:318, in SentenceTransformer.tokenize(self, texts)
314 def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]]):
315 """
316 Tokenizes the texts
317 """
--> 318 return self._first_module().tokenize(texts)
File ~/anaconda3/envs/BLIP/lib/python3.8/site-packages/sentence_transformers/models/CLIPModel.py:71, in CLIPModel.tokenize(self, texts)
68 if len(images) == 0:
69 images = None
---> 71 inputs = self.processor(text=texts_values, images=images, return_tensors="pt", padding=True)
72 inputs['image_text_info'] = image_text_info
73 return inputs
File ~/anaconda3/envs/BLIP/lib/python3.8/site-packages/transformers/models/clip/processing_clip.py:152, in CLIPProcessor.__call__(self, text, images, return_tensors, **kwargs)
149 encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
151 if images is not None:
--> 152 image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
154 if text is not None and images is not None:
155 encoding["pixel_values"] = image_features.pixel_values
File ~/anaconda3/envs/BLIP/lib/python3.8/site-packages/transformers/models/clip/feature_extraction_clip.py:152, in CLIPFeatureExtractor.__call__(self, images, return_tensors, **kwargs)
150 images = [self.center_crop(image, self.crop_size) for image in images]
151 if self.do_normalize:
--> 152 images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images]
154 # return as BatchFeature
155 data = {"pixel_values": images}
File ~/anaconda3/envs/BLIP/lib/python3.8/site-packages/transformers/models/clip/feature_extraction_clip.py:152, in <listcomp>(.0)
150 images = [self.center_crop(image, self.crop_size) for image in images]
151 if self.do_normalize:
--> 152 images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images]
154 # return as BatchFeature
155 data = {"pixel_values": images}
File ~/anaconda3/envs/BLIP/lib/python3.8/site-packages/transformers/image_utils.py:186, in ImageFeatureExtractionMixin.normalize(self, image, mean, std)
184 return (image - mean[:, None, None]) / std[:, None, None]
185 else:
--> 186 return (image - mean) / std
ValueError: operands could not be broadcast together with shapes (4,224,224) (3,)
原因:
并不是"不支持 png"。这张 PNG 带有 alpha 通道(RGBA),预处理后得到 4 通道数组,形状为 (4,224,224),无法与 CLIP 的 3 通道均值/方差(形状 (3,))进行广播,于是报出上面的 ValueError。
解决办法:编码前先执行 img = img.convert('RGB') 去掉 alpha 通道;或者直接换一张 RGB 图片(例如 jpg 格式):
from sentence_transformers import SentenceTransformer, util
from PIL import Image

# Load the CLIP model once; it embeds images and text into one shared space.
clip = SentenceTransformer('clip-ViT-B-32')

# Embed the image.
image_vec = clip.encode(Image.open('two_dogs_in_snow.jpg'))

# Embed the candidate captions.
captions = ['Two dogs in the snow', 'A cat on a table', 'A picture of London at night']
caption_vecs = clip.encode(captions)

# Cosine similarity between the image and each caption.
cos_scores = util.cos_sim(image_vec, caption_vecs)
print(cos_scores)
tensor([[0.2973, 0.0996, 0.1102]])
参考:https://www.sbert.net/examples/applications/image-search/README.html