
Basics of LLM

  • Tokenizer
  • Embedding
  • Similarity

0. File: requirements.txt

tiktoken
sentence-transformers
openai

Set up a Python virtual environment

$ python -m venv ./demo/venv
$ source ./demo/venv/bin/activate

Install the dependencies

$ pip install -r requirements.txt

1. Calculate number of tokens

import tiktoken

def num_tokens_from_string(string: str) -> int:
    # Use the tokenizer that matches the target model (here: gpt-4o)
    encoding = tiktoken.encoding_for_model("gpt-4o")
    # encode() returns a list of token IDs; its length is the token count
    num_tokens = len(encoding.encode(string))
    return num_tokens

print(num_tokens_from_string("สวัสดีประเทศไทย"))
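
Token counts vary between languages: Thai text typically needs more tokens than English text of similar length. To see what the tokenizer actually produces, here is a small sketch reusing the same gpt-4o encoding (the example strings are just illustrations):

import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o")

for text in ["Hello Thailand", "สวัสดีประเทศไทย"]:
    token_ids = encoding.encode(text)
    print(text, "->", len(token_ids), "tokens:", token_ids)
    # decode() reverses encode(), so the round trip returns the original string
    assert encoding.decode(token_ids) == text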

2.1 Sentence Embeddings

from sentence_transformers import SentenceTransformer

def get_sentence_embeddings(sentences):
    # Downloads the model on first use, then loads it from the local cache
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    # Returns a numpy array of shape (number of sentences, 384)
    embeddings = model.encode(sentences, clean_up_tokenization_spaces=True)
    return embeddings

if __name__ == "__main__":
    sentences = ["I love dogs", "I like cats"]
    embeddings = get_sentence_embeddings(sentences)
    print(embeddings)
    print(embeddings.shape)
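
The embeddings above are plain numpy vectors, so you can already measure how related two sentences are with cosine similarity. A minimal sketch (the cosine_similarity helper is written here for illustration; section 3 shows the built-in way):

import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(["I love dogs", "I like cats"])

def cosine_similarity(a, b):
    # Dot product of the two vectors divided by the product of their lengths
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Scores close to 1.0 mean very similar, scores near 0.0 mean unrelated
print(cosine_similarity(embeddings[0], embeddings[1]))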

2.2 OpenAI Embedding

import openai

# The client reads the OPENAI_API_KEY environment variable by default
client = openai.Client()

def get_openai_embeddings(sentences):
    embeddings = []
    for sentence in sentences:
        # One API call per sentence; each response contains one embedding
        response = client.embeddings.create(
            input=sentence,
            model="text-embedding-ada-002"
        )
        embeddings.append(response.data[0].embedding)
    return embeddings

if __name__ == "__main__":
    sentences = ["I love dogs", "I like cats"]
    embeddings = get_openai_embeddings(sentences)
    print(embeddings)
    print("Dimension of embedding:", len(embeddings[0]))


2.3 Ollama

Install the Ollama Python client and pull the bge-m3 embedding model:

$ pip install ollama

$ ollama pull bge-m3

Demo code

from ollama import Client

# Point the client at an Ollama server (here a remote host; use http://localhost:11434 for a local one)
client = Client(
  host='http://159.223.78.26:11434'
)

response = client.embed(
  model='bge-m3',
  input='I love dogs',
)

# bge-m3 produces 1024-dimensional dense embeddings
print(response["embeddings"])
print("Size : " + str(len(response["embeddings"][0])))

3. Similarity

from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences, clean_up_tokenization_spaces=True)
print(embeddings.shape) # (3, 384): 3 sentences, 384 dimensions each

# 3. Calculate the embedding similarities (cosine similarity by default)
similarities = model.similarity(embeddings, embeddings)
print(model.similarity_fn_name)  # "cosine" unless the model config specifies otherwise
print(similarities)
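
The same similarity call turns into a small semantic search as soon as one side is a query: encode the query with the same model, score it against the stored embeddings, and pick the highest-scoring sentence. A sketch built on the code above (the query string is just an example):

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]
embeddings = model.encode(sentences)

# Encode the query with the same model so both vectors live in the same space
query_embedding = model.encode(["How is the weather?"])

# similarity() returns a (1, 3) tensor: one score per stored sentence
scores = model.similarity(query_embedding, embeddings)
best = int(scores.argmax())
print("Best match:", sentences[best])
print("Score:", float(scores[0][best]))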
