First tests
@@ -0,0 +1,48 @@
import os

# Prevents transformers from unnecessarily loading torchvision
#os.environ.setdefault("TRANSFORMERS_NO_TORCHVISION", "1")

import torch
from transformers import AutoModel, AutoTokenizer

MODEL_NAME = "BAAI/bge-small-en-v1.5"

# Load tokenizer and model once; eval() disables dropout for inference
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()

def get_embedding(text: str) -> torch.Tensor:
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    # Mean pooling: average the token embeddings, ignoring padding positions
    token_embeddings = outputs.last_hidden_state
    attention_mask = inputs["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()

    summed = torch.sum(token_embeddings * attention_mask, dim=1)
    counts = torch.clamp(attention_mask.sum(dim=1), min=1e-9)
    embedding = summed / counts

    # Optional, but usually sensible for similarity comparisons
    embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)

    return embedding.squeeze(0)

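# Note: mean pooling (as in get_embedding above) is one common choice. If I
# read the BGE model card correctly, the authors pool with the [CLS] token
# instead. A minimal sketch of that variant, reusing the model and tokenizer
# loaded above (get_embedding_cls is an illustrative name, not from the
# original code):
def get_embedding_cls(text: str) -> torch.Tensor:
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # [CLS] is the first token of the sequence
    embedding = outputs.last_hidden_state[:, 0]
    embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)
    return embedding.squeeze(0)
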
def cosine_similarity(a: torch.Tensor, b: torch.Tensor) -> float:
    return torch.nn.functional.cosine_similarity(a, b, dim=-1).item()

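# Because get_embedding L2-normalizes its output, the dot product of two
# embeddings already equals their cosine similarity. A sketch that computes
# all pairwise similarities in one matrix product (similarity_matrix is an
# illustrative helper, not part of the original tests):
def similarity_matrix(texts: list[str]) -> torch.Tensor:
    embeddings = torch.stack([get_embedding(t) for t in texts])
    return embeddings @ embeddings.T
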
embeddings_love = get_embedding("love")
embeddings_hate = get_embedding("hate")
embeddings_lovely = get_embedding("lovely")
embeddings_hateful = get_embedding("hateful")

print("Dimension of the embedding space:", embeddings_love.shape)
print("Similarity between love and hate:", round(cosine_similarity(embeddings_love, embeddings_hate), 3))
print("Similarity between lovely and hateful:", round(cosine_similarity(embeddings_lovely, embeddings_hateful), 3))
print("Similarity between love and lovely:", round(cosine_similarity(embeddings_love, embeddings_lovely), 3))
print("Similarity between hate and hateful:", round(cosine_similarity(embeddings_hate, embeddings_hateful), 3))
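
# For comparison: the sentence-transformers library wraps this same checkpoint
# and performs pooling and normalization internally. A minimal sketch, assuming
# sentence-transformers is installed (not a dependency of the script above):
from sentence_transformers import SentenceTransformer

st_model = SentenceTransformer(MODEL_NAME)
st_vectors = st_model.encode(["love", "hate"], normalize_embeddings=True)
print("sentence-transformers similarity love/hate:", round(float(st_vectors[0] @ st_vectors[1]), 3))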
@@ -0,0 +1,7 @@
from transformers import AutoTokenizer

model_name = "Qwen/Qwen3-1.7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

text = "This is a great blog about Tokenizers."
tokens = tokenizer.tokenize(text)
print(tokens)
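
# The token strings above map to integer IDs, which is what the model actually
# consumes. A small round-trip sketch using standard tokenizer methods:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

# encode() additionally inserts any special tokens the model expects;
# decode() inverts the mapping back to text.
encoded = tokenizer.encode(text)
print(encoded)
print(tokenizer.decode(encoded))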