import os

# Prevents transformers from unnecessarily loading torchvision
# os.environ.setdefault("TRANSFORMERS_NO_TORCHVISION", "1")

import torch
from transformers import AutoModel, AutoTokenizer

MODEL_NAME = "BAAI/bge-small-en-v1.5"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()


def get_embedding(text: str) -> torch.Tensor:
    """Embed a single text via mean pooling over the model's token embeddings."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling: average the token embeddings, ignoring padding positions
    token_embeddings = outputs.last_hidden_state
    attention_mask = inputs["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * attention_mask, dim=1)
    counts = torch.clamp(attention_mask.sum(dim=1), min=1e-9)
    embedding = summed / counts
    # Optional, but usually sensible for similarity comparisons
    embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)
    return embedding.squeeze(0)


def cosine_similarity(a: torch.Tensor, b: torch.Tensor) -> float:
    return torch.nn.functional.cosine_similarity(a, b, dim=-1).item()


embeddings_love = get_embedding("love")
embeddings_hate = get_embedding("hate")
embeddings_lovely = get_embedding("lovely")
embeddings_hateful = get_embedding("hateful")

print("Dimension of the embedding space:", embeddings_love.shape)
print("Similarity between love and hate:", round(cosine_similarity(embeddings_love, embeddings_hate), 3))
print("Similarity between lovely and hateful:", round(cosine_similarity(embeddings_lovely, embeddings_hateful), 3))
print("Similarity between love and lovely:", round(cosine_similarity(embeddings_love, embeddings_lovely), 3))
print("Similarity between hate and hateful:", round(cosine_similarity(embeddings_hate, embeddings_hateful), 3))