Let's explore a practical implementation of tokenization using the Hugging Face Transformers library:
$ vi tokenizer.py
from transformers import AutoModelForCausalLM, AutoTokenizer
# load a pre-trained model and its tokenizer
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
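# (note: GPT-2-family tokenizers, including DialoGPT's, ship without a
# dedicated pad token, hence the eos fallback above)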
# display tokenizer properties
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Model max length: {tokenizer.model_max_length}")
print(f"Special tokens: {tokenizer.special_tokens_map}")
# split the text into token strings (before mapping them to IDs)
tokens = tokenizer.tokenize("Hello Tokenizers!")
# display the tokens
print(f"\nTokens: {tokens}")
# encode text into token IDs and return the attention mask
encoded = tokenizer("Hello Tokenizers!", return_tensors='pt', return_attention_mask=True)
input_ids = encoded["input_ids"]
attention_mask = encoded["attention_mask"]
# display the token IDs
print("\nInput token IDs:", input_ids)
# display the attention mask
print("Attention mask:", attention_mask)
# encode text into token IDs (similar to the above)
# input_ids = tokenizer.encode("Hello Tokenizers!", return_tensors='pt')
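# another sketch (commented out): map IDs back to raw BPE token strings
# print(tokenizer.convert_ids_to_tokens(input_ids[0].tolist()))
# -> ['Hello', 'ĠToken', 'izers', '!'] ('Ġ' marks a leading space in GPT-2's byte-level BPE)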
# display individual tokens by converting IDs to text
print("\nIndividual input tokens:")
for i, token_id in enumerate(input_ids[0]):
    token = tokenizer.decode([token_id])
    print(f"Token {i}: ID {token_id} -> '{token}'")
# generate text from the model based on the input; the attention mask is passed
# explicitly to avoid the warning 'The attention mask is not set and cannot be
# inferred from input because pad token is same as eos token.'
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=10,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)
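# do_sample=False picks the highest-probability token at each step (greedy
# decoding), so repeated runs produce identical output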
# display the output token IDs
print(f"\nOutput token IDs: {output}")
# display individual output tokens
print("\nIndividual output tokens:")
for i, token_id in enumerate(output[0]):
    token = tokenizer.decode([token_id])
    print(f"Token {i}: ID {token_id} -> '{token}'")
# convert token IDs to their corresponding text
generated_text = tokenizer.decode(output[0], skip_special_tokens=False)
print(f"\nGenerated text: {generated_text}")
# convert token IDs to their corresponding text without special tokens
generated_clean_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(f"\nGenerated clean text: {generated_clean_text}")
Run the Python script:
$ python3 tokenizer.py
Output:
Vocab size: 50257
Model max length: 1024
Special tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}
Tokens: ['Hello', 'ĠToken', 'izers', '!']
Input token IDs: tensor([[15496, 29130, 11341, 0]])
Attention mask: tensor([[1, 1, 1, 1]])
Individual input tokens:
Token 0: ID 15496 -> 'Hello'
Token 1: ID 29130 -> ' Token'
Token 2: ID 11341 -> 'izers'
Token 3: ID 0 -> '!'
Output token IDs: tensor([[15496, 29130, 11341, 0, 50256]])
Individual output tokens:
Token 0: ID 15496 -> 'Hello'
Token 1: ID 29130 -> ' Token'
Token 2: ID 11341 -> 'izers'
Token 3: ID 0 -> '!'
Token 4: ID 50256 -> '<|endoftext|>'
Generated text: Hello Tokenizers!<|endoftext|>
Generated clean text: Hello Tokenizers!
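Because the pad token was mapped to <|endoftext|>, the same tokenizer can pad a batch of unequal-length strings, which is where the attention mask earns its keep. A minimal sketch assuming the same checkpoint as above (the example strings are arbitrary):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# pad the shorter sequence so both rows share one length
batch = tokenizer(["Hello Tokenizers!", "Hi"], padding=True, return_tensors='pt')
print(batch["input_ids"])       # padded positions hold the pad/eos ID 50256
print(batch["attention_mask"])  # 0 marks padding the model should ignore
Here padding=True pads to the longest sequence in the batch, and the zeros in the mask tell the attention layers to ignore the filler tokens.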