In [1]:
import nltk
import random
In [2]:
### Step 1. Pick your training corpus
# Flat list of word tokens from NLTK's Brown corpus.
# NOTE(review): presumably requires a prior `nltk.download("brown")` — confirm in a fresh environment.
corpus = nltk.corpus.brown.words()
In [3]:
### Step 2. Generate a list of all of the N-word-long sequences in your corpus
### e.g. if N = 2, "I can do it." -> [("I", "can"), ("can", "do"), ("do", "it"), ("it", ".")]

def generate_ngrams(corpus, N):
    """Return every contiguous N-word sequence in `corpus` as a tuple.

    e.g. N = 2, ["I", "can", "do", "it", "."] ->
         [("I", "can"), ("can", "do"), ("do", "it"), ("it", ".")]

    Returns an empty list when the corpus holds fewer than N words.
    """
    return [tuple(corpus[start:start + N])
            for start in range(len(corpus) - N + 1)]
In [4]:
### Step 3. Create a nested dictionary with counts of each word given (N - 1) previous words
### e.g. {("I",): {"think": 1, "can": 1}, ("can",): {"do": 1}, ("think",): {"I": 1}, ("do",): {"it": 1}, ("it",): {".": 1}}

def generate_freq_dist(ngrams):
    """Count how often each word follows each (N - 1)-word context.

    Parameters
    ----------
    ngrams : list of tuple
        Equal-length word tuples, as produced by generate_ngrams.

    Returns
    -------
    dict
        Maps each (N - 1)-word context tuple to a {next_word: count} dict,
        e.g. for bigrams: {("I",): {"think": 1, "can": 1}, ("can",): {"do": 1}}.
        Returns {} for an empty input (the previous version indexed
        ngrams[0] unconditionally and raised IndexError on an empty list).
    """
    freq_dist = {}

    if not ngrams:
        return freq_dist

    N = len(ngrams[0])

    for ngram in ngrams:
        given = ngram[:N - 1]   # context: the first N - 1 words
        word = ngram[N - 1]     # the word being counted

        # setdefault/get replace the explicit membership checks:
        # create the inner dict on first sight, then bump the count.
        counts = freq_dist.setdefault(given, {})
        counts[word] = counts.get(word, 0) + 1

    return freq_dist
In [5]:
### Step 4: Create a nested dictionary with probabilities of each word given (N - 1) previous words
### e.g. {("I",): {"think": 0.5, "can": 0.5}, ("can",): {"do": 1.0}, ("think",): {"I": 1.0}, ("do",): {"it": 1.0}, ("it",): {".": 1.0}}

def generate_prob_dist(freq_dist):
    """Normalize a nested frequency dict into conditional probabilities.

    For each context, divides every next-word count by the total count
    for that context, so the inner values sum to 1.0.

    e.g. {("I",): {"think": 1, "can": 1}} -> {("I",): {"think": 0.5, "can": 0.5}}
    """
    prob_dist = {}

    for given, counts in freq_dist.items():
        total = sum(counts.values())
        prob_dist[given] = {word: count / total
                            for word, count in counts.items()}

    return prob_dist
In [6]:
# Order of the model: N = 2 -> bigrams (each word conditioned on 1 previous word)
N = 2

# Pipeline: raw corpus -> N-grams -> conditional counts -> conditional probabilities
ngrams = generate_ngrams(corpus, N)
freq_dist = generate_freq_dist(ngrams)
prob_dist = generate_prob_dist(freq_dist)
In [7]:
### Step 5: Given a prompt, randomly sample your probability distribution to pick the next word
### Step 6: Repeat

num_words = 100
prompt = ["I"]

while len(prompt) < num_words:
    sel = random.random()
    total = 0
    
    # Get last N-1 words from prompt
    given = tuple(prompt[-(N - 1):])
    
    if given not in prob_dist:
        # DISCUSS: When would this occur?
        break
    
    # This part is kind of tricky, so I've done it for you
    # DISCUSS: Why does this work?
    
    for word in prob_dist[given]:
        prob = prob_dist[given][word]
        
        if total + prob > sel:
            prompt.append(word)
            break
        else:
            total += prob
In [8]:
" ".join(prompt)
Out[8]:
"I must be sure ground . Haven't the elephantine dimensions to earn an increase in libraries , those of the bartender to give the bulletin for in Cicero , I had large degree , or may easily the most poetry has kept . They were increasingly enjoy conversing with minimum , the thud of the tappet , it was what I respect it beef-fat . It would describe death did not convey the dugout without let us when the East . At 7:30 p.m. , and more instrumental . About 80 . But at Fudomae and development of the traffic"