import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import tiktoken
import torch
import torch.nn as nn
Setup Hyperparameters
Hyperparameters are external configurations for a model that cannot be learned from the data during training. They are set before the training process begins and play a crucial role in controlling the behavior of the training algorithm and the performance of the trained models.
# Hyperparameters
batch_size = 4 # How many sequences per training batch
context_length = 16 # Number of tokens in each sequence of a batch
d_model = 64 # The vector size of the token embeddings
num_layers = 8 # Number of transformer blocks
num_heads = 4 # Number of heads in Multi-head attention; in our code, head_size = d_model / num_heads
learning_rate = 1e-3 # 0.001
dropout = 0.1 # Dropout rate
max_iters = 5000 # Total number of training iterations
eval_interval = 50 # How often to evaluate the model
eval_iters = 20 # How many iterations to average the loss over when evaluating the model
device = 'cuda' if torch.cuda.is_available() else 'cpu' # Use the GPU if it's available, otherwise fall back to the CPU
TORCH_SEED = 1337
torch.manual_seed(TORCH_SEED)
Prepare the Dataset
For this example, we'll use a small dataset: a text file containing a sales textbook. We'll use it to train a language model that can generate sales text.
# download a sample txt file from https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/raw/main/sales_textbook.txt
if not os.path.exists('sales_textbook.txt'):
    url = 'https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/raw/main/sales_textbook.txt'
    with open('sales_textbook.txt', 'w', encoding='utf-8') as f:
        f.write(requests.get(url).text)
with open('sales_textbook.txt', 'r', encoding='utf-8') as f:
text = f.read()
Step 1: Tokenization
We'll use the tiktoken library to tokenize the dataset. tiktoken is a fast, lightweight tokenizer that converts text into integer token IDs.
# Using TikToken to tokenize the source text
encoding = tiktoken.get_encoding("cl100k_base")
tokenized_text = encoding.encode(text) # size of tokenized source text is 77,919
vocab_size = len(set(tokenized_text)) # size of vocabulary is 3,771
max_token_value = max(tokenized_text)
print(f"Tokenized text size: {len(tokenized_text)}")
print(f"Vocabulary size: {vocab_size}")
print(f"The maximum value in the tokenized text is: {max_token_value}")
Printed output:
Tokenized text size: 77919
Vocabulary size: 3771
The maximum value in the tokenized text is: 100069
Step 2: Word Embedding
We'll split the dataset into training and validation sets. The training set will be used to train the model, and the validation set will be used to evaluate the model's performance.
# Split train and validation
split_idx = int(len(tokenized_text) * 0.8)
# Convert to tensors so we can slice and stack them into batches below
train_data = torch.tensor(tokenized_text[:split_idx], dtype=torch.long)
val_data = torch.tensor(tokenized_text[split_idx:], dtype=torch.long)
# Prepare data for training batch
data = train_data
idxs = torch.randint(low=0, high=len(data) - context_length, size=(batch_size,))
x_batch = torch.stack([data[idx:idx + context_length] for idx in idxs])
y_batch = torch.stack([data[idx + 1:idx + context_length + 1] for idx in idxs])
print(x_batch.shape, y_batch.shape)
Printed output (the shape of the training input x and y):
torch.Size([4, 16]) torch.Size([4, 16])
Step 3: Positional Encoding
We'll use a simple embedding layer to convert the input tokens into vectors.
# Define Token Embedding look-up table
token_embedding_lookup_table = nn.Embedding(max_token_value + 1, d_model)  # +1 because token ids range from 0 to max_token_value
# Get x and y embeddings
x = token_embedding_lookup_table(x_batch)  # [4, 16, 64] [batch_size, context_length, d_model]
y = token_embedding_lookup_table(y_batch)
Now, both our input x and y are of shape (batch_size, context_length, d_model).
Apply Positional Embedding
As described in the original paper, we'll use sine and cosine functions to generate a positional encoding table, then add this positional information to the input token embeddings.
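For reference, the sinusoidal formulas from the original paper are PE(pos, 2i) = sin(pos / 10000^(2i / d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model)); the div_term in the code below is just the 1 / 10000^(2i / d_model) factor, computed in log space for numerical stability.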
# Define Position Encoding look-up table
position_encoding_lookup_table = torch.zeros(context_length, d_model) # initialize with zeros, shape (context_length, d_model)
position = torch.arange(0, context_length, dtype=torch.float).unsqueeze(1)
# apply the sine & cosine
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
position_encoding_lookup_table[:, 0::2] = torch.sin(position * div_term)
position_encoding_lookup_table[:, 1::2] = torch.cos(position * div_term)
position_encoding_lookup_table = position_encoding_lookup_table.unsqueeze(0).expand(batch_size, -1, -1)  # add a batch dimension at the front
print("Position Encoding Look-up Table: ", position_encoding_lookup_table.shape)
Printed output:
Position Encoding Look-up Table: torch.Size([4, 16, 64])
Then, add the positional encoding into the input embedding vectors.
# Add positional encoding into the input embedding vector
input_embedding_x = x + position_encoding_lookup_table # [4, 16, 64] [batch_size, context_length, d_model]
input_embedding_y = y + position_encoding_lookup_table
X = input_embedding_x
x_plot = input_embedding_x[0].detach().cpu().numpy()
print("Final Input Embedding of x: \n", pd.DataFrame(x_plot))
Now, we get our final input embedding of X, which is the value to be fed into the transformer block:
Note: the y embedding vector has the same shape as x.
Step 4: Transformer Block
4.1 Multi-head Attention Overview
Let's bring back our Multi-head Attention diagram.
Now that we have our input embedding X, we can start implementing the Multi-head Attention block. There are several steps involved, so let's code them one by one.
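The projection of X into Q, K and V is not shown in the snippets here, so below is a minimal sketch of how it could look, assuming three bias-free nn.Linear layers (the names Wq, Wk, Wv are illustrative, not necessarily the ones used in the full code):
# Minimal sketch (assumed layer names): project X into Q, K, V and split the embedding into heads
head_size = d_model // num_heads  # 64 // 4 = 16
Wq = nn.Linear(d_model, d_model, bias=False)
Wk = nn.Linear(d_model, d_model, bias=False)
Wv = nn.Linear(d_model, d_model, bias=False)
Q = Wq(X)  # [4, 16, 64]
K = Wk(X)  # [4, 16, 64]
V = Wv(X)  # [4, 16, 64]
# Split the last dimension into (num_heads, head_size): [batch_size, context_length, num_heads, head_size]
Q = Q.view(batch_size, context_length, num_heads, head_size)  # [4, 16, 4, 16]
K = K.view(batch_size, context_length, num_heads, head_size)  # [4, 16, 4, 16]
V = V.view(batch_size, context_length, num_heads, head_size)  # [4, 16, 4, 16]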
We then reshape our Q, K, V to [batch_size, num_heads, context_length, head_size] for further computation.
# Transpose q,k,v from [batch_size, context_length, num_heads, head_size] to [batch_size, num_heads, context_length, head_size]
# The reason is to treat each attention head as its own batch-like dimension during the matrix multiplications.
Q = Q.transpose(1, 2) # [4, 4, 16, 16]
K = K.transpose(1, 2) # [4, 4, 16, 16]
V = V.transpose(1, 2) # [4, 4, 16, 16]
4.3 Calculate QK^T Attention
This can be done very easily by using the torch.matmul function.
# Calculate the attention score between Q and K^T
attention_score = torch.matmul(Q, K.transpose(-2, -1))
4.4 Scale
# Scale the attention score by the square root of the head size
attention_score = attention_score / math.sqrt(d_model // num_heads)
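The next steps (causal mask, softmax, and the weighted sum over V) are not shown above; a minimal sketch, assuming a standard causal mask built with torch.triu, could look like this:
# Apply a causal mask so each position can only attend to itself and earlier positions
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1).bool()
attention_score = attention_score.masked_fill(mask, float('-inf'))  # [4, 4, 16, 16]
# Softmax over the last dimension to turn scores into attention weights
attention_weights = torch.softmax(attention_score, dim=-1)  # [4, 4, 16, 16]
# Weighted sum of the value vectors
A = torch.matmul(attention_weights, V)  # [4, 4, 16, 16]
# Merge the heads back together: [batch_size, num_heads, context_length, head_size] -> [batch_size, context_length, d_model]
A = A.transpose(1, 2).reshape(batch_size, context_length, d_model)  # [4, 16, 64]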
Note: the shape now is [4, 16, 64] which is [batch_size, context_length, d_model].
Now we can apply another [64, 64] linear layer Wo (whose weights are learned during training) to get the final output of the Multi-head Attention block:
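A minimal sketch of that projection (the name Wo comes from the text above; the exact layer definition is an assumption):
# Output projection of the Multi-head Attention block (assumed definition)
Wo = nn.Linear(d_model, d_model)  # learned [64, 64] projection
output = Wo(A)  # [4, 16, 64] [batch_size, context_length, d_model]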
What we've finished above is just one transformer block. In practice, multiple transformer blocks are stacked together to form a transformer decoder.
We really should wrap this code in classes and use PyTorch nn.Module to build the transformer decoder, but for demonstration purposes we'll keep it to a single block.
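The logits used below have to come from projecting the block output onto the vocabulary; a minimal sketch of that step, assuming a final layer norm and a linear head (both layer names are illustrative), could be:
# Minimal sketch (assumed layers): map the block output to vocabulary-sized logits
layer_norm = nn.LayerNorm(d_model)
language_model_head = nn.Linear(d_model, max_token_value + 1)
logits = language_model_head(layer_norm(output))  # [4, 16, 100070] [batch_size, context_length, vocab]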
The last step is to softmax the logits to get the probabilities of each token:
# torch.softmax is usually used during inference; during training we use torch.nn.CrossEntropyLoss
# but for illustration purposes, we'll use torch.softmax here
probabilities = torch.softmax(logits, dim=-1)
Note that what we get here is a huge matrix: for each of the 16 positions in a sequence, a probability distribution over the whole vocabulary (max_token_value + 1 entries).
Full Working Code
In practice, multiple transformer blocks are stacked together to form the full decoder. During training, the predicted output tokens are compared with the ground-truth tokens to calculate the loss, and this process is repeated for the max_iters iterations defined in the hyperparameters.
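As a rough illustration of that loss computation (a sketch only, not the training loop from the full code), the logits can be compared against the shifted targets y_batch with cross-entropy:
import torch.nn.functional as F

# Flatten the batch and sequence dimensions and compare predictions with the shifted targets
loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), y_batch.reshape(-1))
print(loss.item())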
You can check out the full working Transformer Decoder code in my GitHub repository.