English-French Neural Machine Translation

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import LambdaLR

from matplotlib import pyplot as plt
%matplotlib inline
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
Out[1]:
device(type='cpu')
In [2]:
src_language = 'eng'
target_language = 'fra'
In [3]:
# Read the file and split into lines
with open('data/%s-%s.txt' % (src_language, target_language), encoding='utf-8') as file:
    text_data = file.read().splitlines()
print(text_data[:5])
['Go.\tVa !', 'Run!\tCours\u202f!', 'Run!\tCourez\u202f!', 'Wow!\tÇa alors\u202f!', 'Fire!\tAu feu !']

The text is in Unicode, so we will take the following preprocessing steps:

  1. Convert Unicode characters to plain ASCII (see the short sketch below)
  2. Lowercase everything
  3. Trim most punctuation
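
As a short illustration of step 1 (a minimal sketch, not one of the notebook's cells): NFD normalization splits an accented character into a base letter plus a combining mark, and dropping characters whose Unicode category is Mn (nonspacing mark) leaves only the ASCII base letter.

import unicodedata
decomposed = unicodedata.normalize('NFD', 'Ça')   # 'C' + U+0327 (combining cedilla) + 'a'
ascii_only = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
print(ascii_only)   # -> 'Ca'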
In [4]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(sent):
    return ''.join(
        c for c in unicodedata.normalize('NFD', sent)
        if unicodedata.category(c) != 'Mn'
    )

def preprocess_string(sent):
    "lowercase, unicode_to_ascii, trim, and remove non-letter characters"
    sent = sent.lower().strip()
    sent = unicode_to_ascii(sent)
    # insert a space before sentence-ending punctuation:
    # \1 is a backreference to the first capturing group ([.!?]),
    # so each matched ".", "!" or "?" is replaced by a space followed by itself
    sent = re.sub(r"([.!?])", r" \1", sent)
    # replace any run of characters outside the set a-zA-Z.!? with a single space
    sent = re.sub(r"[^a-zA-Z.!?]+", r" ", sent)
    return sent.strip()
In [5]:
preprocess_string(text_data[3])
print(text_data[3])
print(text_data[3].split('\t')[::-1])
Wow!	Ça alors !
['Ça alors\u202f!', 'Wow!']
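
Note that the cell above calls preprocess_string but never prints its return value. Printing it explicitly (an illustrative check, not an original cell) shows the normalized form:

print(preprocess_string(text_data[3].split('\t')[1]))   # -> 'ca alors !'
print(preprocess_string(text_data[3].split('\t')[0]))   # -> 'wow !'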
In [6]:
def load_data(file_name, reverse=False):
    print("Reading text file...")

    # Read the file and split into lines
    with open('data/%s' % (file_name), encoding='utf-8') as file:
        lines = file.read().splitlines()

    # Split every line into pairs [src_lang, target_lang] and preprocess
    pairs = [[preprocess_string(s) for s in line.split('\t')] for line in lines]

    if reverse:
        pairs = [p[::-1] for p in pairs]
        
    return pairs
In [7]:
pairs = load_data('eng-fra.txt', reverse=False)
Reading text file...
In [8]:
SOS_token = 0
EOS_token = 1
UNK = 2
PAD = 3
BLOCK_SIZE = 12

class Language:
    def __init__(self, lang_name, src=True):
        self.lang_name = lang_name
        self.word_to_index = {"SOS":0, "EOS":1, "UNK": 2, "PAD":3}
        self.index_to_word = {0: "SOS", 1: "EOS", 2: "UNK", 3:"PAD"}
        self.word_to_count = {}
        self.vocab_size = 4
        self.src = src

    def add_word(self, word):
        if word not in self.word_to_index:
            self.word_to_index[word] = self.vocab_size
            self.index_to_word[self.vocab_size] = word
            self.vocab_size += 1
        self.word_to_count[word] = self.word_to_count.get(word, 0) + 1

    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

    def sentence_to_indexes(self, sentence):
        idxs = [self.word_to_index[word] if word in self.word_to_index else self.word_to_index["UNK"] for word in sentence.split(' ')]
        return idxs
        
    def indexes_to_sentence(self, indexes):
        return ' '.join([self.index_to_word[index] for index in indexes])

    def sentence_to_tensor(self, sentence):
        indexes = self.sentence_to_indexes(sentence)
        indexes = [SOS_token] + indexes + [EOS_token]

        max_len = BLOCK_SIZE if self.src else BLOCK_SIZE + 1
        
        if len(indexes) < max_len:
            indexes += [PAD]*(max_len-len(indexes))
        else:
            indexes = indexes[:max_len]
            
        indexes = torch.tensor(indexes, dtype=torch.long)
        return indexes

    def tensor_to_sentence(self, idx_tensor):
        if len(idx_tensor.shape) > 1:
            idxs = idx_tensor.tolist()[0]
        else:
            idxs = idx_tensor.tolist()
        sentence = self.indexes_to_sentence(idxs)
        return sentence
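
As a quick sanity check of the Language class (an illustrative toy example, not an original cell), a two-word vocabulary maps a sentence to SOS, word indexes and EOS, pads it to BLOCK_SIZE, and maps unseen words to UNK:

toy = Language('toy')
toy.add_sentence('hello world')
print(toy.word_to_index)
# {'SOS': 0, 'EOS': 1, 'UNK': 2, 'PAD': 3, 'hello': 4, 'world': 5}
print(toy.sentence_to_tensor('hello world'))
# tensor([0, 4, 5, 1, 3, 3, 3, 3, 3, 3, 3, 3])
print(toy.sentence_to_tensor('hello there'))   # unseen word -> UNK (2)
# tensor([0, 4, 2, 1, 3, 3, 3, 3, 3, 3, 3, 3])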
In [9]:
# create language instances
src_lang = Language(src_language)
target_lang = Language(target_language)

for src, target in pairs:
    src_lang.add_sentence(src)
    target_lang.add_sentence(target) 

Since there are a lot of example sentences and we want to train something quickly, we trim the data set to relatively short and simple sentences. Here both sentences of a pair must have fewer than MAX_LENGTH = 10 words (including the ending punctuation), and we keep only pairs whose English side starts with forms like "i am" or "he is" (accounting for the apostrophes stripped during preprocessing, e.g. "i'm" becomes "i m").

In [10]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p, reverse=False):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1 if reverse else 0].startswith(eng_prefixes)


def filterPairs(pairs, reverse):
    return [pair for pair in pairs if filterPair(pair, reverse)]
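
For example (an illustrative check, not an original cell), a short pair whose English side starts with one of the prefixes passes the filter, while one that does not is dropped:

print(filterPair(['i am happy .', 'je suis heureux .']))   # True: short and starts with "i am "
print(filterPair(['tom is hungry .', 'tom a faim .']))     # False: no allowed prefix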

The full process for preparing the data is:

  1. Read the text file, split it into lines, and split each line into a pair
  2. Normalize the text, then filter by length and content
  3. Make word lists from the sentences in the pairs

In [11]:
def create_dataset(src_lang, target_lang, reverse=False):
    pairs = load_data('eng-fra.txt', reverse)

    # create language instances
    input_lang = Language(src_lang)
    output_lang = Language(target_lang, src=False)

    print("Read %s sentence pairs" % len(pairs))

    pairs = filterPairs(pairs, reverse)
    print("Trimmed to %s sentence pairs" % len(pairs))

    # train/val/test split
    n_total = len(pairs)
    n_train = int(0.8*n_total)
    n_val = int(0.1*n_total)
    n_test = n_total - n_train - n_val
    print(f"{n_train=}, {n_val=}, {n_test=}")
    pair_split = {}
    pair_split['train'] = pairs[:n_train]
    pair_split['val'] = pairs[n_train:n_train + n_val]
    pair_split['test'] = pairs[n_train + n_val:]
    
    print("Counting words...")
    print("Creating source and target language vocab using pair_split['train']...")
    for src, target in pair_split['train']:
        input_lang.add_sentence(src)
        output_lang.add_sentence(target) 

    print("Counted words:")
    print(input_lang.lang_name, input_lang.vocab_size)
    print(output_lang.lang_name, output_lang.vocab_size)
    return input_lang, output_lang, pair_split
In [12]:
src_lang, target_lang, pair_split = create_dataset('eng', 'fra', reverse=False)
print(random.choice(pair_split["train"]))
Reading text file...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
n_train=8479, n_val=1059, n_test=1061
Counting words...
Creating source and target language vocab using pair_split['train']...
Counted words:
eng 2184
fra 3526
['you re single .', 'vous etes celibataire .']

Tokenization: Words to indexes

A seq2seq task takes an input sequence (the source) and produces an output sequence (the target). In our case the source is an English sentence and the target is the corresponding French translation. These sentences need to be converted into numbers (integer indexes) before they can be fed to a neural network. For this we use word-level tokenization, i.e. a word-to-integer-index mapping.

We also need special tokens to mark the start (SOS) and end (EOS) of a sentence, so the model knows where a sequence begins and ends. In sentence_to_tensor above, both the source and the target sentence are wrapped with SOS at the start and EOS at the end, then padded with PAD to a fixed length: BLOCK_SIZE for the source and BLOCK_SIZE + 1 for the target, since the decoder input and its labels are shifted by one position.
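
To make the shift explicit (an illustrative toy sketch of how the Batch class defined further below consumes the target tensor), the decoder reads the target without its last token and is trained to predict the same sequence shifted one position to the left:

target = torch.tensor([[SOS_token, 10, 11, 12, EOS_token, PAD, PAD]])   # toy target, hypothetical word indexes
decoder_input  = target[:, :-1]   # [SOS, 10, 11, 12, EOS, PAD]
decoder_labels = target[:, 1:]    # [10, 11, 12, EOS, PAD, PAD]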

In [13]:
def sent_pair_to_tensor_pair(pair, src_lang, target_lang):
    input_tensor = src_lang.sentence_to_tensor(pair[0])
    target_tensor = target_lang.sentence_to_tensor(pair[1])
    return input_tensor, target_tensor

def tensor_pair_to_sent_pair(pair, src_lang, target_lang):
    input_tensor = src_lang.tensor_to_sentence(pair[0])
    target_tensor = target_lang.tensor_to_sentence(pair[1])
    return (input_tensor, target_tensor)
In [14]:
pair = random.choices(pair_split["train"], k=1)[0]
print(pair)
print(sent_pair_to_tensor_pair(pair, src_lang, target_lang))
print(sent_pair_to_tensor_pair(pair, src_lang, target_lang))

p1, p2 = sent_pair_to_tensor_pair(pair, src_lang, target_lang)
print(tensor_pair_to_sent_pair((p1,p2), src_lang, target_lang))
['he is sure of success .', 'il est sur de son succes .']
(tensor([   0,   16,   42,   35,  520, 1462,    6,    1,    3,    3,    3,    3]), tensor([   0,   26,   27,   53,  103, 1360,  966,    7,    1,    3,    3,    3,
           3]))
(tensor([   0,   16,   42,   35,  520, 1462,    6,    1,    3,    3,    3,    3]), tensor([   0,   26,   27,   53,  103, 1360,  966,    7,    1,    3,    3,    3,
           3]))
('SOS he is sure of success . EOS PAD PAD PAD PAD', 'SOS il est sur de son succes . EOS PAD PAD PAD PAD')
In [15]:
class Batch:
    def __init__(self, src, target=None, pad_idx=PAD) -> None:
        # src: (B, BLOCK_SIZE); target: (B, BLOCK_SIZE + 1) before shifting
        self.src = src #shape:(B,T)
        # src_mask:(B,1,1,T)
        self.src_mask = (src != pad_idx).unsqueeze(-2).unsqueeze(-2)
        if target is not None:
            # decoder output shifted by one
            self.tgt = target[:,:-1] #shape:(B,T)
            self.tgt_y = target[:,1:]
            # padding mask: (B,T)
            self.tgt_pad_mask = (self.tgt_y != pad_idx)
            self.tgt_mask = self.causal_mask(self.tgt, pad_idx)
            self.n_tokens = self.tgt_pad_mask.sum().item()

    @staticmethod   
    def causal_mask(target, pad_idx):
        # max context length = block_size
        T = target.shape[1]
        # causal attention mask: (1,T,T)
        causal_attn_mask = torch.tril(torch.ones(1, T, T, dtype=torch.bool, device=target.device))
        # padding mask: (B,1,T)
        tgt_pad_mask = (target != pad_idx).unsqueeze(-2)
        target_mask = tgt_pad_mask & causal_attn_mask
        return target_mask.unsqueeze(1) # (B,1,T,T)

        
def build_batch(split, batch_size=4):
    if split == "train":
        pairs = pair_split["train"]
    elif split == "val":
        pairs = pair_split["val"]
    else:
        pairs = pair_split["test"]
    # sample batch_size sentence pairs uniformly at random from the chosen split
    batch_pairs = random.choices(pairs, k=batch_size)
    # input 
    src_batch = []
    target_batch = []
    
    for pair in batch_pairs:
        src_tensor, target_tensor = sent_pair_to_tensor_pair(pair, src_lang, target_lang)
        src_batch.append(src_tensor)
        target_batch.append(target_tensor)
    
    src = torch.stack(src_batch).to(device)
    target = torch.stack(target_batch).to(device)

    batch = Batch(src, target, PAD)
    return batch
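
The decoder mask above combines a lower-triangular causal mask with the target padding mask, so each position can attend only to earlier, non-padding positions. A small shape check (illustrative, not an original cell):

toy_tgt = torch.tensor([[SOS_token, 10, 11, EOS_token, PAD, PAD]])   # (B=1, T=6), toy indexes
mask = Batch.causal_mask(toy_tgt, PAD)
print(mask.shape)    # torch.Size([1, 1, 6, 6])
print(mask[0, 0])    # lower-triangular, with the PAD columns set to False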
In [16]:
b = build_batch("train", batch_size=2)
In [17]:
print(src_lang.tensor_to_sentence(b.src))
print(target_lang.tensor_to_sentence(b.tgt))
print(target_lang.tensor_to_sentence(b.tgt_y))
SOS i m not married yet . EOS PAD PAD PAD PAD
SOS je ne suis pas encore marie . EOS PAD PAD PAD
je ne suis pas encore marie . EOS PAD PAD PAD PAD

Seq2Seq English-French Machine Translation Transformer Model

In [18]:
import sys
sys.path.append("../")
from transformers.transformer import EncDecTransformer
In [19]:
# model hyperparameters
model_hyparam = {
    "src_vocab_size": src_lang.vocab_size,
    "tgt_vocab_size": target_lang.vocab_size,
    "block_size": BLOCK_SIZE,
    "model_dim": 32,
    "n_layer": 2,
    "n_head": 2,
    "cross_attention":False
}
In [20]:
batch = build_batch("train", batch_size=4)
batch
Out[20]:
<__main__.Batch at 0x7facd031a340>
In [21]:
# training hyperparameters
batch_size = 16
n_iters = 10
eval_interval = 1  # 100
base_lr = 1.0
eval_iters = 2
warmup_steps = 5

Vary the learning rate over the course of training, according to the formula:

$lr = d_{model}^{-0.5} \cdot \min\left(step\_num^{-0.5},\ step\_num \cdot warmup\_steps^{-1.5}\right)$
This corresponds to increasing the learning rate linearly for the first $warmup\_steps$ training steps, and decreasing it thereafter proportionally to the inverse square root of the step number.

In [22]:
def learning_rate(step, model_size, factor, warmup):
    # LambdaLR calls this with step = 0; default to 1 to avoid raising zero to a negative power
    if step == 0:
        step = 1
    return factor * (
        model_size ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))
    )
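
To see the warmup-then-decay shape with this notebook's settings (model_dim = 32, warmup_steps = 5), the schedule can be printed for the first few steps (an illustrative check, not an original cell):

for step in range(10):
    print(step, learning_rate(step, model_size=32, factor=1, warmup=5))
# the rate grows linearly up to step 5, then decays proportionally to step ** -0.5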
In [23]:
mt_transformer = EncDecTransformer(**model_hyparam)
optimizer = torch.optim.AdamW(mt_transformer.parameters(), base_lr, betas=(0.9, 0.98), eps=1e-9)
lr_scheduler = LambdaLR(optimizer, lr_lambda=lambda step: learning_rate(step, 
                                                                        model_hyparam["model_dim"], 
                                                                        factor=1, warmup=warmup_steps))

# print the number of parameters in the model
print(sum(p.numel() for p in mt_transformer.parameters())/1e6, 'M parameters')
0.350406 M parameters
In [24]:
train_losses = []
val_losses = []
In [25]:
@torch.no_grad()
def compute_loss():
    mt_transformer.eval()
    out_loss = {}
    for split in ["train", "val"]:
        running_loss = 0.0
        total_tokens = 0
        for _ in range(eval_iters):
            batch = build_batch(split, batch_size)
            _, loss = mt_transformer(batch.src, 
                                batch.src_mask,
                                batch.tgt,
                                batch.tgt_mask,
                                batch.tgt_y,
                                batch.tgt_pad_mask)
            running_loss += loss.item()
            total_tokens += batch.n_tokens
        out_loss[split] = running_loss/total_tokens
    mt_transformer.train()
    return out_loss
compute_loss()
Out[25]:
{'train': 0.07855133252723195, 'val': 0.06314297802043412}
In [27]:
# training loop
for iter in range(n_iters):
    mt_transformer.train()
    batch = build_batch("train", batch_size)
    _, loss = mt_transformer(batch.src, 
                                batch.src_mask,
                                batch.tgt,
                                batch.tgt_mask,
                                batch.tgt_y,
                                batch.tgt_pad_mask)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    
    if iter % eval_interval == 0 or iter == n_iters - 1:
        losses = compute_loss()
        train_losses.append(losses['train'])
        val_losses.append(losses['val'])
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        # save model checkpoint
        torch.save(mt_transformer.state_dict(), "mt_transformer.pt")
    
step 0: train loss 0.0722, val loss 0.0564
step 1: train loss 0.0668, val loss 0.0556
step 2: train loss 0.0537, val loss 0.0492
step 3: train loss 0.0458, val loss 0.0435
step 4: train loss 0.0511, val loss 0.0477
step 5: train loss 0.0507, val loss 0.0540
step 6: train loss 0.0471, val loss 0.0438
step 7: train loss 0.0521, val loss 0.0432
step 8: train loss 0.0461, val loss 0.0460
step 9: train loss 0.0414, val loss 0.0427
In [28]:
train_losses[-1], val_losses[-1]
Out[28]:
(0.041378151286732064, 0.04274626944562514)
In [29]:
plt.plot(train_losses, label="train")
plt.plot(val_losses, label="val")
plt.legend()
Out[29]:
<matplotlib.legend.Legend at 0x7fad04b80df0>
In [30]:
batch = build_batch("test", batch_size=1)
print(f"{src_language}: {src_lang.tensor_to_sentence(batch.src[0])}")
out = mt_transformer.generate(batch.src, batch.src_mask, max_tokens=11)
print(f"{target_language}: {target_lang.tensor_to_sentence(out[0])}")
eng: SOS we re still getting to know each other . EOS PAD
fra: SOS tu viens ne font a si un ainsi fiere ennuis une
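
With only n_iters = 10 training steps the model has barely moved away from its random initialization, so the generated French is not expected to be meaningful yet; training for many more iterations would be needed before the translations become readable.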