Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import random  # standard-library module used for weighted random selection
import nltk  # Natural Language Toolkit: library for working with human language
from nltk.tokenize import word_tokenize  # NLTK function that splits a text into word tokens
from nltk import FreqDist  # NLTK class that counts how often each item occurs in a sequence
# Fetch the 'punkt' tokenizer data that word_tokenize relies on. This call runs
# every time the module is loaded, not only when it is executed as a script.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')
def calc_word_probs(text):
    """Map every distinct token of *text* to its relative frequency.

    The text is split with NLTK's ``word_tokenize`` and the occurrences of
    each token are counted with ``FreqDist``. Each count is then divided by
    the total number of tokens.

    Args:
        text: The string to analyse.

    Returns:
        A dict whose keys are the tokens and whose values are
        ``count / total_tokens``. The dict is empty when tokenization yields
        no tokens, because no division is performed in that case.
    """
    tokens = word_tokenize(text)
    token_total = len(tokens)
    frequencies = FreqDist(tokens)

    relative_frequency = {}
    for token, occurrences in frequencies.items():
        # Share of all tokens in the text that are this particular token.
        relative_frequency[token] = occurrences / token_total
    return relative_frequency
# The dictionary comprehension in calc_word_probs builds a Python dictionary
# in which each distinct word in the text is a key,
# and the corresponding value is the probability of that word occurring in the given text.
# It iterates over every item in the word_counts dictionary.
# Each item is a (word, count) tuple that is unpacked into the variables word and count.
# The word is used as the key, and the value is calculated by dividing
# the count of that word by the total number of words in the text.
def generate_word(probs):
    """Draw one word at random, weighted by its probability.

    Args:
        probs: Mapping of word -> weight, such as the dictionary returned by
            ``calc_word_probs``. The keys form the set of possible outcomes
            (the support) and the values are their selection weights.

    Returns:
        A single key of ``probs``, chosen by ``random.choices`` using the
        mapping's values as weights.

    Raises:
        IndexError: If ``probs`` contains no words.
    """
    possible_outcomes = list(probs)  # random.choices needs an indexable population
    if not possible_outcomes:
        # Without this guard random.choices fails with an opaque
        # "list index out of range"; keep the exception type, fix the message.
        raise IndexError("cannot generate a word from an empty probability table")
    likely_usage = list(probs.values())  # weights, in the same order as the keys
    return random.choices(possible_outcomes, weights=likely_usage)[0]
if __name__ == '__main__':
    # Script entry point: sample one word from the word distribution of
    # republic.txt and show it together with the text it was drawn from.

    # Load the complete corpus into memory as a single string.
    with open('republic.txt', mode='r', encoding='utf-8') as corpus:
        input_text = corpus.read()

    # Build the word -> probability table, then draw one weighted sample.
    word_probs = calc_word_probs(input_text)
    generated_word = generate_word(word_probs)

    # Report the full input text first, followed by the sampled word.
    print("Input text:", input_text)
    print("Generated word:", generated_word)
# The main block above performs these steps in order:
# 1. Read the content of republic.txt and store it in the variable input_text.
# 2. Calculate the word probabilities using the calc_word_probs function.
# 3. Generate a random word based on the computed probabilities using the generate_word function.
# 4. Print both the input text and the generated word.