diff --git a/embedding_demo.py b/embedding_demo.py
new file mode 100644
index 0000000..d349584
--- /dev/null
+++ b/embedding_demo.py
@@ -0,0 +1,48 @@
+import os
+
+# Prevents transformers from unnecessarily loading torchvision
+# os.environ.setdefault("TRANSFORMERS_NO_TORCHVISION", "1")
+
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+MODEL_NAME = "BAAI/bge-small-en-v1.5"
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModel.from_pretrained(MODEL_NAME)
+model.eval()
+
+
+def get_embedding(text: str) -> torch.Tensor:
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    token_embeddings = outputs.last_hidden_state
+    attention_mask = inputs["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
+
+    summed = torch.sum(token_embeddings * attention_mask, dim=1)
+    counts = torch.clamp(attention_mask.sum(dim=1), min=1e-9)
+    embedding = summed / counts
+
+    # Optional, but usually sensible for similarity comparisons
+    embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)
+
+    return embedding.squeeze(0)
+
+
+def cosine_similarity(a: torch.Tensor, b: torch.Tensor) -> float:
+    return torch.nn.functional.cosine_similarity(a, b, dim=-1).item()
+
+
+embeddings_love = get_embedding("love")
+embeddings_hate = get_embedding("hate")
+embeddings_lovely = get_embedding("lovely")
+embeddings_hateful = get_embedding("hateful")
+
+print("Dimension of the embedding space:", embeddings_love.shape)
+print("Similarity between love and hate:", round(cosine_similarity(embeddings_love, embeddings_hate), 3))
+print("Similarity between lovely and hateful:", round(cosine_similarity(embeddings_lovely, embeddings_hateful), 3))
+print("Similarity between love and lovely:", round(cosine_similarity(embeddings_love, embeddings_lovely), 3))
+print("Similarity between hate and hateful:", round(cosine_similarity(embeddings_hate, embeddings_hateful), 3))
\ No newline at end of file
diff --git a/tokenizer_demo.py b/tokenizer_demo.py
new file mode 100644
index 0000000..bd021fe
--- /dev/null
+++ b/tokenizer_demo.py
@@ -0,0 +1,7 @@
+from transformers import AutoTokenizer
+
+model_name = "Qwen/Qwen3-1.7B"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+text = "This is a great blog about Tokenizers."
+tokens = tokenizer.tokenize(text)
+print(tokens)
\ No newline at end of file
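
The masked mean pooling in embedding_demo.py extends naturally to a batch of texts, because the attention mask already zeroes out padding tokens. The sketch below is a hypothetical batched variant (it reuses the tokenizer, model, and imports defined in the diff, and is not part of the committed code):

# Hypothetical batched variant of get_embedding; assumes tokenizer, model,
# and torch from embedding_demo.py are already in scope.
def get_embeddings(texts: list[str]) -> torch.Tensor:
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Same masked mean pooling as in get_embedding, applied per row of the batch
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return torch.nn.functional.normalize(summed / counts, p=2, dim=1)

# e.g. get_embeddings(["love", "hate"]) yields one L2-normalized row per input text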
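For tokenizer_demo.py, the token/ID round trip can be illustrated with the standard tokenizer methods convert_tokens_to_ids and decode (a minimal sketch, assuming tokenizer, text, and tokens from the script; the exact tokens printed depend on the Qwen vocabulary):

# Round trip: tokens -> ids -> text
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)                           # integer IDs, one per token
print(tokenizer.decode(ids))         # reconstructs the original string
print(tokenizer(text)["input_ids"])  # one-step version (may add special tokens)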