import nltk
import random
### Step 1. Pick your training corpus
corpus = nltk.corpus.brown.words()
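# NOTE: The Brown corpus has to be downloaded once before this will work,
# e.g. by running nltk.download("brown")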
### Step 2. Generate a list of all of the N-word-long sequences in your corpus
### e.g. if N = 2, "I can do it." -> [("I", "can"), ("can", "do"), ("do", "it"), ("it", ".")]
def generate_ngrams(corpus):
    ngrams = []
    # ~2 lines: Fill `ngrams` with 2-word tuples from `corpus`
    return ngrams
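# If you get stuck, here is one possible sketch for Step 2 (a reference, not the
# only way to do it; `generate_ngrams_sketch` is a made-up name, and it assumes
# bigrams, i.e. N = 2, over a flat list of word strings like `corpus` above):
def generate_ngrams_sketch(corpus):
    ngrams = []
    # Pair each word with the word that immediately follows it
    for i in range(len(corpus) - 1):
        ngrams.append((corpus[i], corpus[i + 1]))
    return ngrams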
### Step 3. Create a nested dictionary with counts of each word given (N - 1) previous words
### e.g. {"I": {"think": 1, "can": 1}, "can": {"do": 1}, "think": {"I": 1}, "do": {"it": 1"}, "it": {".": 1}}
def generate_freq_dist(ngrams):
    freq_dist = {}
    # ~10 lines: Create frequencies dictionary
    # HINT: Loop through `ngrams`, adding 1 to freq_dist[ngram[0]][ngram[1]] each time
    # DISCUSS: What will you need to do if ngram[0] or ngram[1] are not in your `freq_dist`?
    return freq_dist
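# One possible sketch for Step 3 (illustrative only; `generate_freq_dist_sketch`
# is a made-up name, and the nested-dict shape matches the example above):
def generate_freq_dist_sketch(ngrams):
    freq_dist = {}
    for ngram in ngrams:
        first, second = ngram
        # Start an empty inner dictionary the first time we see `first`
        if first not in freq_dist:
            freq_dist[first] = {}
        # Start the count at 0 the first time we see `second` after `first`
        if second not in freq_dist[first]:
            freq_dist[first][second] = 0
        freq_dist[first][second] += 1
    return freq_dist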
### Step 4: Create a nested dictionary with probabilities of each word given (N - 1) previous words
### e.g. {"I": {"think": 0.5, "can": 0.5}, "can": {"do": 1}, "think": {"I": 1}, "do": {"it": 1"}, "it": {".": 1}}
def generate_prob_dist(freq_dist):
    prob_dist = {}
    for word in freq_dist:
        # 3 lines: Calculate the total number of times `word` was used
        # 3 lines: Set prob_dist[word][next_word] = freq_dist[word][next_word] / total for each next_word
    return prob_dist
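# One possible sketch for Step 4 (illustrative; `generate_prob_dist_sketch` is a
# made-up name; it divides each count by the total number of continuations
# observed after `word`, so each inner dictionary sums to 1):
def generate_prob_dist_sketch(freq_dist):
    prob_dist = {}
    for word in freq_dist:
        # Total number of times `word` was followed by anything
        total = sum(freq_dist[word].values())
        prob_dist[word] = {}
        for next_word in freq_dist[word]:
            prob_dist[word][next_word] = freq_dist[word][next_word] / total
    return prob_dist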
# 1-3 lines: Generate probability distribution from corpus
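# For reference, the hypothetical sketch helpers above chain together like this;
# your own Step 2-4 functions should leave the same kind of nested dict in a
# variable called `prob_dist` for the generation loop below:
prob_dist_sketch = generate_prob_dist_sketch(
    generate_freq_dist_sketch(generate_ngrams_sketch(corpus))
)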
### Step 5: Given a prompt, randomly sample your probability distribution to pick the next word
### Step 6: Repeat
num_words = 100
prompt = ["I"]
while len(prompt) < num_words:
    sel = random.random()
    total = 0
    # Get the context for the next word: with bigrams (N = 2) this is just the
    # previous word, which matches the string keys in `prob_dist`
    # (for larger N you would use the tuple of the last N - 1 words as the key)
    given = prompt[-1]
    if given not in prob_dist:
        # DISCUSS: When would this occur?
        break
    # This part is kind of tricky, so I've done it for you
    # DISCUSS: Why does this work?
    for word in prob_dist[given]:
        prob = prob_dist[given][word]
        if total + prob > sel:
            prompt.append(word)
            break
        else:
            total += prob
" ".join(prompt)