import random   # built-in library for generating random numbers
import nltk     # library for working with human language 
from nltk.tokenize import word_tokenize     # A function from NLTK for breaking words down
from nltk import FreqDist   # A class from NLTK for representing the frequency distribution of a set.
# Fetch the Punkt tokenizer models that word_tokenize relies on.
# This call sits at module top level, so it runs every time the module is
# imported or executed as a script.
# NOTE(review): recent NLTK releases may look for the 'punkt_tab' resource
# instead of 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')


def calc_word_probs(text):
    """Return the relative frequency of every token found in *text*.

    The text is split into tokens with NLTK's ``word_tokenize`` and the
    tokens are counted with ``FreqDist``; each count is then divided by
    the total number of tokens.

    Args:
        text: The string to analyse.

    Returns:
        A dict mapping each distinct token to ``count / number of tokens``.
        The dict is empty when tokenization yields no tokens.
    """
    tokens = word_tokenize(text)
    n_tokens = len(tokens)
    word_counts = FreqDist(tokens)

    probs = {}
    for word, count in word_counts.items():
        # Relative frequency of this token within the whole text.
        probs[word] = count / n_tokens
    return probs

# The last step of calc_word_probs creates a Python dictionary ({})
# in which each distinct word in the text is a key
# and the corresponding value is the probability of that word occurring in the given text.
# It iterates over each item in the word_counts frequency distribution.
# Each item is a (word, count) tuple, which is unpacked into two variables: word and count.
# The word is used as the key, and the value is calculated by dividing
# the count of that word by the total number of words in the text.

def generate_word(probs):
    """Draw one word at random, weighted by its probability.

    Args:
        probs: Mapping of word -> weight, e.g. the relative frequencies
            of the words in a text.

    Returns:
        A single key of ``probs``, selected by ``random.choices`` using
        the mapping's values as weights.

    Raises:
        IndexError: If ``probs`` contains no words.
    """
    possible_outcomes = list(probs.keys())  # the words to choose from (support)
    likely_usage = list(probs.values())     # their probabilities (weights)

    if not possible_outcomes:
        # random.choices would fail on an empty population with an opaque
        # "list index out of range"; keep the exception type but say why.
        raise IndexError(
            "cannot generate a word from an empty probability table"
        )

    # Ask for exactly one sample and unwrap it from the returned list.
    return random.choices(possible_outcomes, weights=likely_usage, k=1)[0]

if __name__ == '__main__':
    # Load the whole corpus into memory as one string ('r' is open()'s default mode).
    with open('republic.txt', encoding='utf-8') as corpus_file:
        input_text = corpus_file.read()

    # Build the word-probability table and sample a single word from it.
    generated_word = generate_word(calc_word_probs(input_text))

    print("Input text:", input_text)
    print("Generated word:", generated_word)

# What the script entry point above does, step by step:
# 1. Read the content of republic.txt and store it in the variable input_text.
# 2. Calculate the word probabilities using the calc_word_probs function.
# 3. Generate a random word based on the computed probabilities using the generate_word function.
# 4. Print both the input text and the generated word.