Char-level Language Modeling - Part 1¶

In this tutorial, we will be building char-level bi-gram language models:

  • Bi-gram count based language model
  • Bi-gram neural network based language model
  • N-gram MLP based language model, inspired by Bengio et al. 2003

This tutorial is a summary of my learnings from Andrej Karpathy's makemore lecture.

Designing a machine learning system is a multi-step iterative process:

  1. Defining the problem
  2. Data collection and pre-processing
  3. Modelling and Training
  4. Evaluation of the model
  5. Deployment of the model

Without further ado, let's build our first language model.

In [126]:
import torch
import matplotlib.pyplot as plt
%matplotlib inline

Bi-gram count based language model¶

1. Defining the problem¶

We want to build a machine learning system which can generate new plausible English names for a person.

2. Data collection¶

To train a machine learning system, we need an example dataset. In our case, we need a dataset of English names.

We will be using the same [names.txt](https://github.com/karpathy/makemore/blob/master/names.txt) used in Andrej Karpathy's makemore lecture series.

Let's explore the dataset.

In [127]:
# read names.txt file
names = None
with open("names.txt",'r') as file:
    names = file.read().splitlines()  # splitlines() splits the text on the '\n' character and returns a list of strings
In [128]:
n_names = len(names)
print(f"Total no. of names in the dataset: {n_names}")
print(f"List first few names: {names[:10]}")
lengths = [len(w) for w in names]
print(f"Length of longest word: {max(lengths)} ({names[lengths.index(max(lengths))]}) and shortest word: {min(lengths)} ({names[lengths.index(min(lengths))]})")
Total no. of names in the dataset: 32033
List first few names: ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
Length of longest word: 15 (muhammadibrahim) and shortest word: 2 (an)

Since our goal is to generate new words, we need to tell the model where a word starts and ends. A <Start> token marks the start of a word and, similarly, an <End> token marks the end of a word. We will use a single token "." for both the start and end tokens.

Create character vocabulary:

In [129]:
# let's create a character vocabulary for names dataset and also add ".", the start token to the vocab
char_vocab = ['.'] + sorted(set("".join(names)))
vocab_size = len(char_vocab)
print(f"{vocab_size=} and characters in vocab: {''.join(char_vocab)}")

# character to index mapping
chartoi = {}
# index to character mapping
itochar = {}
for i, c in enumerate(char_vocab):
    chartoi[c] = i
    itochar[i] = c
print(f"{chartoi=}")
print(f"{itochar=}")
vocab_size=27 and characters in vocab: .abcdefghijklmnopqrstuvwxyz
chartoi={'.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
itochar={0: '.', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}

Compute all the bi-grams for the names dataset

To compute the bigrams, add "." at the beginning and at the end of each word.

  • A bigram is a contiguous pair of characters; the bigrams of a word are all of its substrings of length 2
  • An N-gram is a contiguous sequence of N characters; the N-grams of a word are all of its substrings of length N (see the sketch after this list)
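The same zip trick used in the next cell generalizes from bigrams to arbitrary N-grams; here is a minimal illustrative sketch (the helper name char_ngrams is hypothetical, not part of the notebook):

def char_ngrams(word, n):
    # wrap the word with the start/end token '.' and zip n shifted
    # views of the character list to get all n-grams
    chars = ['.'] + list(word) + ['.']
    return list(zip(*(chars[i:] for i in range(n))))

print(char_ngrams("emma", 2))  # bigrams:  ('.', 'e'), ('e', 'm'), ('m', 'm'), ('m', 'a'), ('a', '.')
print(char_ngrams("emma", 3))  # trigrams: ('.', 'e', 'm'), ('e', 'm', 'm'), ('m', 'm', 'a'), ('m', 'a', '.')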
In [130]:
# bigram:count dictionary of bigrams
bigrams = {}
total_bigrams = 0
for i, name in enumerate(names):
    # string to list of characters in the given string
    chars = ['.'] + list(name)+ ['.']
    # cool way to generate bigrams of a word in python
    for char1, char2 in zip(chars[:-1], chars[1:]):
        bigrams[(char1,char2)] = bigrams.get((char1,char2), 0) + 1
        total_bigrams += 1
    if i == 0:
        print(f"First word: {name}\n{chars=}\n{bigrams=}")
print(f"Number of unique bigrams: {len(bigrams)}")
print(f"Total number of bigrams in the dataset: {total_bigrams}")
First word: emma
chars=['.', 'e', 'm', 'm', 'a', '.']
bigrams={('.', 'e'): 1, ('e', 'm'): 1, ('m', 'm'): 1, ('m', 'a'): 1, ('a', '.'): 1}
Number of unique bigrams: 627
Total number of bigrams in the dataset: 228146

If the vocabulary has 27 characters, then the number of possible bigrams is 27*27 = 729
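As a quick sanity check (a minimal sketch, assuming the bigrams dictionary and vocab_size defined above), we can count how many of the 729 possible bigrams never appear in the dataset:

n_possible = vocab_size * vocab_size  # 27 * 27 = 729
n_observed = len(bigrams)             # 627 unique bigrams seen in the data
print(f"{n_possible=}, {n_observed=}, never observed: {n_possible - n_observed}")

Since 627 unique bigrams were observed, 102 of the 729 possible bigrams never occur in the data; these missing bigrams are exactly what causes the zero-probability problem addressed in the Label Smoothing section below.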

In [131]:
# sort the bigrams wrt count in descending order
bigrams = sorted(bigrams.items(), key=lambda x:x[1], reverse=True)
# bigrams

What does the bigram count dictionary tell us?
An entry like (x, y): 9 means the bigram "xy" (x followed by y) occurred 9 times in the dataset.

Let's create a count table or matrix for better visualization and understanding of bigram counts

  • vertical axis: first character of bigram (total no. of possible characters is vocab_size)
  • horizontal axis: second character of bigram (total no. of possible characters is vocab_size)

count_table[x, y] = how many times x (previous character) was followed by y (next character), i.e. the number of occurrences of the bigram "xy"

In [132]:
count_table = torch.zeros((vocab_size, vocab_size)) # default dtype=torch.float32

for bigram, count in bigrams:
    ch1, ch2 = bigram
    count_table[chartoi[ch1], chartoi[ch2]] = count
    
plt.imshow(count_table)
Out[132]:
<matplotlib.image.AxesImage at 0x7fc5f96fe3a0>
In [133]:
# this code block is from Andrej's lecture
plt.figure(figsize=(22,22))
plt.imshow(count_table, cmap='Blues')
for i in range(vocab_size):
    for j in range(vocab_size):
        chstr = itochar[i] + itochar[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, count_table[i, j].item(), ha="center", va="top", color='gray')
plt.xlabel("Next character")
plt.ylabel("Previous character")
plt.axis();

3. Modelling and Training¶

Given the current character, predict the next character.
Baseline: A simple baseline model: given a character, all characters have equal probability (1/27 ≈ 0.037) of occurring as the next character.
Proposed model: Learn the probability distribution over all the characters (the entire vocabulary) given a character. There are two ways to use this probability distribution to predict the next character (see the sketch after this list):

  • Predict the character with maximum probability, but this would generate only one word for a fixed starting character.
  • Sampling from the probability distribution solves the above problem; now, for a fixed starting character, we can generate many different words because a different character may be sampled from the distribution each time (with replacement).
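A minimal sketch contrasting the two strategies on a toy next-character distribution (the probability values here are made up purely for illustration):

import torch

# toy distribution over a 5-character vocabulary (made-up numbers)
p = torch.tensor([0.05, 0.40, 0.30, 0.20, 0.05])

# greedy prediction: always picks index 1 (the argmax), so generation is deterministic
print(p.argmax().item())

# sampling: repeated draws can return different indices, so generations vary
g = torch.Generator().manual_seed(0)
print(torch.multinomial(p, num_samples=5, replacement=True, generator=g).tolist())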

How can we use the count_table to get these probabilities?
As we know,
count_table[x, y] = how many times x (previous character) was followed by y (next character)
Row corresponding to character x (count_table[x,:]):
If count_table[x,:] is normalized by the sum of all the counts in that row, then the entry at [x, y] becomes the probability that y (next character) follows x (previous character), i.e. P(y | x).

So, the normalized row [x,:] gives a probability distribution over all the characters as the next character, given character x.

Training: Computing the probability table

In [134]:
probs = count_table / count_table.sum(dim=-1, keepdim=True) # -1 is the last dimension
# check if above tensor operation gives desired results
assert (probs[11] == count_table[11]/count_table[11].sum()).all()
# check if each row sums to 1.0
assert (probs.sum(dim=-1).round() == torch.ones(vocab_size)).all()
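For a quick sanity check of the probability table, we can peek at the most likely first characters of a name, i.e. the row of probs for the start token "." (a small sketch, assuming the probs table and the chartoi/itochar mappings defined above):

# top 5 most probable next characters after the start token '.'
top_p, top_ix = probs[chartoi['.']].topk(5)
print([(itochar[i.item()], round(p.item(), 3)) for i, p in zip(top_ix, top_p)])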

Generation:¶

This probability table can be used to generate new plausible English names.
Each row probs[x,:], the probability distribution over all the characters as the next character given character x, is a categorical distribution (aka multinoulli distribution, a special case of the multinomial distribution) over vocab_size (27) categories (characters).
Generating new plausible English names using the above probs table

In [135]:
# for reproduciblity of results
g = torch.Generator().manual_seed(2147483647)
n = 10 # generate n words
for i in range(n):
    # start token
    curr_char = '.'
    curr_word = ''
    while True:
        # sample index of next character
        next_char_ix = torch.multinomial(probs[chartoi[curr_char]], num_samples=1, replacement=True, generator=g).item()
        curr_char = itochar[next_char_ix]
        curr_word += curr_char
        if curr_char == '.':
            print(curr_word)
            break
mor.
axx.
minaymoryles.
kondlaisah.
anchshizarie.
odaren.
iaddash.
h.
jhinatien.
egushl.

Let's also generate words using our baseline model, i.e. a uniform probability (= 1/27) distribution over all the characters. We can use torch.randint, which gives random integers generated uniformly between low (= 0, inclusive) and high (= 27, exclusive).

In [136]:
# for reproduciblity of results
g = torch.Generator().manual_seed(2147483647)
n = 10 # generate n words
for i in range(n):
    # start token
    curr_char = '.'
    curr_word = ''
    while True:
        # sample index of next character
        next_char_ix = torch.randint(low=0, high=27, size=(1,), generator=g).item()
        curr_char = itochar[next_char_ix]
        curr_word += curr_char
        if curr_char == '.':
            print(curr_word)
            break
.
zpleihsxgtvqpsgip.
czwftvgdanmehww.
nfvaejgjkqpwxumidzawywrtwzwegb.
vexfknlwlnrokczeikudwfwpnilovamwlshqn.
ilvxxlrindhq.
janfhbprmbrxsbwhghiileshtfuuosdilzmzftrskqbwt.
vpsjbgtmsiibxtmvi.
iwcoov.
nchtesrqfenu.

By looking at the generated words, we can say that our count-based bigram language model is able to generate more plausible names than a uniform baseline model.

How can we do a quantitative evaluation of these models?¶

We can use the Maximum Likelihood Estimation (MLE) method to evaluate a model's performance. The goal of MLE is to maximize the likelihood (probability) of the data (D) w.r.t. the model parameters (the occurrence statistics learned from the data).
Maximizing the likelihood is equivalent to:

  • maximizing the log likelihood (because log is a monotonic function)
  • minimizing the negative log likelihood
  • minimizing the average negative log likelihood over the entire dataset

In general, minimizing the average negative log likelihood over the entire dataset is used to evaluate a model's performance.

$\mathrm{P}(D) = \mathrm{P}({W}_{1},{W}_{2},{W}_{3},...,{W}_{N})$
Since all the words occur independently,
$\mathrm{P}(D) =\prod_{i = 1}^{N} P({W}_{i})$ here, D is the entire English names dataset
${W}_{i} = ({c}_{1}{c}_{2}{c}_{3}....{c}_{m})$ each word is a sequence of characters
Using the chain rule,
$P({W}_{i}) = P({c}_{1})P({c}_{2}\mid{c}_{1})P({c}_{3}\mid{c}_{1},{c}_{2})....P({c}_{m}\mid{c}_{1},{c}_{2}....{c}_{m-1})$
Using the Markov assumption, we approximate each factor in the product:

  • Unigram Language Model (LM): each character occurs independently of any other character
    $P({W}_{i}) = P({c}_{1})P({c}_{2})P({c}_{3})....P({c}_{m})$
  • Bi-gram LM: every character is conditioned on previous character
    $P({W}_{i}) = P({c}_{1})P({c}_{2}\mid{c}_{1})P({c}_{3}\mid{c}_{2})....P({c}_{m}\mid{c}_{m-1})$
  • N-gram LM: every character is conditioned on N-1 previous characters
    $P({W}_{i}) = P({c}_{1})P({c}_{2}\mid{c}_{1})P({c}_{3}\mid{c}_{2},{c}_{1})....P({c}_{m}\mid{c}_{m-N+1},{c}_{m-N+2},...{c}_{m-1})$
    These N-1 previous characters are called the context.
    For a unigram LM, there is no context.
    For a bigram LM, the context is one character (the previous character).

For our bigram LM, the likelihood estimate is:
$\mathrm{P}(D) =\prod_{i = 1}^{N} P({c}_{i,1})P({c}_{i,2}\mid{c}_{i,1})P({c}_{i,3}\mid{c}_{i,2})....P({c}_{i,m_i}\mid{c}_{i,m_i-1})$
Log likelihood, $\log(\mathrm{P}(D)) = \sum_{i = 1}^{N} \sum_{j = 1}^{m_i} \log(P({c}_{i,j}\mid{c}_{i,j-1}))$
$\text{Average Log Likelihood} = \frac{1}{T}\sum_{i = 1}^{N} \sum_{j = 1}^{m_i} \log(P({c}_{i,j}\mid{c}_{i,j-1}))$
$\text{Average Negative Log Likelihood (NLL)} = -\frac{1}{T}\sum_{i = 1}^{N} \sum_{j = 1}^{m_i} \log(P({c}_{i,j}\mid{c}_{i,j-1}))$
Here, $m_i$ is the number of bigrams in word $i$ (including the start and end token) and $T = \sum_{i=1}^{N} m_i$ is the total number of bigrams in the dataset, which is the normalization used in the code below.

In short, the log likelihood is the sum of the log probabilities of all the bigrams present in the training data.
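As a concrete illustration of the formulas above, here is a small sketch that scores a single, arbitrarily chosen word under the bigram model (assuming the probs table and chartoi mapping defined above):

import math

# probability of the word "emma" under the bigram LM, via the chain rule with the Markov assumption
word = ['.'] + list('emma') + ['.']
log_p = 0.0
for ch1, ch2 in zip(word[:-1], word[1:]):
    log_p += torch.log(probs[chartoi[ch1], chartoi[ch2]]).item()
print(f"log P('.emma.') = {log_p:.4f}, P('.emma.') = {math.exp(log_p):.6f}")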

Let's compute the negative log likelihood (NLL) for our model. We will compute it in two ways: first from the aggregated bigram counts, then by iterating over every bigram in the dataset.

In [137]:
log_likelihood = 0
total_bigrams = 0
for bigram, count in bigrams:
    ch1, ch2 = bigram
    log_likelihood += count * torch.log(probs[chartoi[ch1], chartoi[ch2]]).item()
    total_bigrams += count
print(f"{log_likelihood=}")
log_likelihood /= total_bigrams
nll = -log_likelihood
print(f"{nll=}")
log_likelihood=-559873.5915061831
nll=2.454014497322693
In [138]:
log_likelihood = 0.0
total_bigrams = 0
for name in names:
    # string to list of characters in the given string
    chars = ['.'] + list(name)+ ['.']
    # sweet way to generate bigrams of a word in python
    for ch1, ch2 in zip(chars[:-1], chars[1:]):
        log_likelihood += torch.log(probs[chartoi[ch1], chartoi[ch2]]).item()
        total_bigrams += 1
print(f"{log_likelihood=}")
log_likelihood /= total_bigrams
nll = -log_likelihood
print(f"{nll=}")
log_likelihood=-559873.5915061831
nll=2.454014497322693
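For comparison, the uniform baseline assigns probability 1/27 to every next character, so its average NLL is simply $-\log(1/27) = \log 27 \approx 3.296$, noticeably worse than the ≈ 2.454 achieved by the bigram model. A one-line sketch (assuming vocab_size = 27 from above):

uniform_nll = -torch.log(torch.tensor(1.0 / vocab_size)).item()
print(f"{uniform_nll=:.4f}")  # ≈ 3.2958, vs ≈ 2.4540 for the bigram model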

Why is log probability used?
To avoid numerical underflow caused by multiplying many small numbers (numbers close to zero). Taking the $\log$ of a probability ($p\in[0,1]$) spreads the values out, because $\log(p)$ for $p \in [0,1]$ lies in the range $[-\infty, 0]$, and by the $\log$ product rule ($\log(ab)=\log(a)+\log(b)$) the product of probabilities is replaced by a sum of log probabilities.
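A tiny sketch of the underflow problem (the per-step probability 1e-4 and the count 100 are made up purely for illustration): multiplying many small probabilities underflows to 0.0 in floating-point arithmetic, while the equivalent sum of logs stays finite.

import math

p = 1e-4         # a made-up per-step probability
n_steps = 100    # number of factors in the product
prod = p ** n_steps              # 1e-400 underflows to 0.0 (smallest positive float64 is ~5e-324)
log_sum = n_steps * math.log(p)  # stays finite: 100 * log(1e-4) ≈ -921.03
print(prod, log_sum)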

Label Smoothing¶

Why do we need label smoothing and what is label smoothing?
Let's compute the log likelihood of the word "Swechhabk"

In [139]:
log_likelihood = 0.0
total_bigrams = 0
word = 'swechhabk'
# string to list of characters in the given string
word = ['.'] + list(word)+ ['.']
# sweet way to generate bigrams of a word in python
for ch1, ch2 in zip(word[:-1], word[1:]):
    log_prob = torch.log(probs[chartoi[ch1], chartoi[ch2]]).item()
    if log_prob == -float('inf'):
        print(f"bigram:{ch1}{ch2}")
    log_likelihood += log_prob
    total_bigrams += 1
print(f"{log_likelihood=}")
log_likelihood /= total_bigrams
nll = -log_likelihood
print(f"{nll=}")
bigram:bk
log_likelihood=-inf
nll=inf

Let's look at the count_table and probs table for bigram 'bk'

In [140]:
count_table[chartoi['b'], chartoi['k']], probs[chartoi['b'], chartoi['k']]
Out[140]:
(tensor(0.), tensor(0.))

The bigram 'bk' never occurs in our names dataset, therefore the probability of 'b' followed by 'k' is zero, and taking the $\log$ of zero results in a negative infinity value.
We can correct the counts to avoid zero probabilities; one simple way is to add 1 to every count in count_table (add-one smoothing).

In [141]:
corrected_count_table = count_table + 1
corrected_probs = corrected_count_table / corrected_count_table.sum(dim=-1, keepdim=True) # -1 is the last dimension
print(f"{corrected_count_table[chartoi['b'], chartoi['k']]=}, {corrected_probs[chartoi['b'], chartoi['k']]=}")
corrected_count_table[chartoi['b'], chartoi['k']]=tensor(1.), corrected_probs[chartoi['b'], chartoi['k']]=tensor(0.0004)

Let's compute the log likelihood of the word "Swechhabk" using corrected probabilities:

In [142]:
log_likelihood = 0.0
total_bigrams = 0
word = 'swechhabk'
# string to list of characters in the given string
word = ['.'] + list(word)+ ['.']
# sweet way to generate bigrams of a word in python
for ch1, ch2 in zip(word[:-1], word[1:]):
    log_prob = torch.log(corrected_probs[chartoi[ch1], chartoi[ch2]]).item()
    if log_prob == -float('inf'):
        print(f"bigram:{ch1}{ch2}")
    log_likelihood += log_prob
    total_bigrams += 1
print(f"{log_likelihood=}")
log_likelihood /= total_bigrams
nll = -log_likelihood
print(f"{nll=}")
log_likelihood=-41.08368647098541
nll=4.108368647098541

Looks like we have fixed the infinite NLL issue.

Any positive number $K$ can be added as a correction to the counts ($K \in \{1, 2, 3, \ldots, 1000, \ldots\}$).

What happens if a very large K is used for correction?
For each row, all the counts will then be almost the same in magnitude; with no significant difference between counts, the resulting probability distribution becomes nearly uniform. So, it's advised to use smaller values of K. A small sketch of this effect is shown below.
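A minimal sketch of the effect of K (assuming count_table and chartoi from above; K = 100000 is an arbitrary, deliberately huge value):

for K in (1, 100000):
    smoothed = count_table + K
    smoothed_probs = smoothed / smoothed.sum(dim=-1, keepdim=True)
    row = smoothed_probs[chartoi['b']]
    # for huge K every entry approaches 1/27 ≈ 0.037, i.e. a near-uniform distribution
    print(f"K={K}: min={row.min().item():.4f}, max={row.max().item():.4f}")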