# transcripts.py

import csv
import os
import subprocess
import tkinter
import tkinter.filedialog
import webvtt

def get_input_file_path(initial_dir:str):
    """
    Ask the user to choose the input CSV file with a file-open dialog.

    initial_dir is the directory the dialog starts in.  The value returned
    by the dialog is passed back unchanged: the selected path, or an empty
    value if the user cancels the dialog.
    """
    # The dialog needs a Tk root window; keep it hidden so that only the
    # dialog itself is shown.
    root = tkinter.Tk()
    root.withdraw()
    try:
        return tkinter.filedialog.askopenfilename(
            initialdir=initial_dir,
            filetypes=[('CSV', '.csv')],
            title='Select input file',
        )
    finally:
        # Release the hidden root window once the dialog has closed,
        # otherwise it stays alive for the rest of the program.
        root.destroy()


def write_title_tex(title:str):
    """
    Write the title.tex file for a transcript.

    The file is written to the current working directory, replacing any
    existing title.tex.  The title text is inserted into the LaTeX title
    command exactly as given; LaTeX special characters are not escaped.
    """
    # Raw strings keep the LaTeX backslashes literal.  Sequences such as
    # backslash-s and backslash-m are not valid Python escapes and trigger
    # a warning in ordinary string literals.
    tex_lines = [
        r"\setstretch{1.0}",
        r"\title{" + title + "}",
        r"\author{Audio transcript}",
        r"\date{}",
        r"\maketitle",
        r"\setstretch{1.2}",
    ]
    # The context manager closes the file even if the write fails.
    with open("title.tex", "w") as output_file:
        output_file.write("\n".join(tex_lines) + "\n")


def srt_to_txt(srt_file):
    """
    Return the lines of an srt file without the cue numbers and times.

    A line is dropped when it is a timing line (it starts with a digit and
    contains "-->") or when it is a cue number (a digits-only line at the
    start of the file or directly after a blank line).  Every other line,
    including blank lines and spoken text that happens to start with a
    digit, is returned unchanged and in the original order.
    """
    with open(srt_file, "r") as input_file:
        input_lines = input_file.readlines()

    lines = []
    # True at the start of the file and after a blank line, which is where
    # a cue number is expected.
    expecting_index = True
    for line in input_lines:
        # Ignore a leading byte-order mark and surrounding whitespace when
        # classifying the line; the line itself is kept untouched.
        text = line.lstrip("\ufeff").strip()

        # Skip timing lines such as "00:00:01,000 --> 00:00:02,000".
        if "-->" in text and text[0].isdigit():
            expecting_index = False
            continue

        # Skip the cue number that opens each subtitle block.
        if expecting_index and text.isdigit():
            expecting_index = False
            continue

        lines.append(line)
        expecting_index = not text
    return lines


def process_files(doc_dir, csv_file_path):
    """
    Create a PDF for each transcript listed in a CSV file.

    doc_dir is the directory holding transcript.tex; it becomes the current
    working directory and is where title.tex and body.tex are written.
    csv_file_path is a CSV file with the columns file_name and title.  The
    transcript files are looked up in the same directory as the CSV file
    and each PDF is written next to its transcript, replacing any existing
    PDF of the same name.

    Rows with an empty file_name or title are skipped.  Only .srt
    transcripts are converted; other files produce a warning.  Processing
    stops at the first transcript for which pdflatex fails.
    """
    # Resolve the CSV location before changing directory so that a
    # relative path still points at the same file afterwards.
    csv_abs_path = os.path.abspath(csv_file_path)

    # The data files are expected to be in the
    # same directory as the input CSV file.
    data_dir = os.path.dirname(csv_abs_path)

    # The directory where the LaTeX files are.
    os.chdir(doc_dir)

    with open(csv_abs_path, "r", newline='') as input_file:
        dict_reader = csv.DictReader(input_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        # Require both columns to be present.  fieldnames is None for an
        # empty file, which is treated as having no columns.
        field_names = dict_reader.fieldnames or []
        missing_columns = [name for name in ("file_name", "title") if name not in field_names]
        for column in missing_columns:
            print(f"!! The file \"{csv_file_path}\" must contain a column named {column}.")
        if missing_columns:
            return

        # Read all rows up front.  The reader can only be iterated once, so
        # counting the rows directly on it would leave nothing to process.
        rows = list(dict_reader)

    print(len(rows))

    # Process each row in the CSV file.
    for row_dict in rows:
        transcript_file_name = row_dict["file_name"]
        title = row_dict["title"]
        # A short row yields None for the missing cells; treat it as empty.
        if not transcript_file_name or not title:
            continue

        transcript_file_name = os.path.join(data_dir, transcript_file_name)

        # Set the output file name.
        file_prefix, file_extension = os.path.splitext(transcript_file_name)
        pdf_file_name = file_prefix + ".pdf"

        # Attempt to convert the transcript file to text.
        lines = []
        if file_extension.lower() == ".srt":
            try:
                lines += srt_to_txt(transcript_file_name)
            except (OSError, UnicodeDecodeError) as error:
                # One unreadable transcript should not abort the batch.
                print(f"!! Could not read \"{transcript_file_name}\": {error}")

        # If the transcript file cannot be converted.
        if not lines:
            print(f"!! Warning the file \"{transcript_file_name}\" cannot be convert to text.")
            continue

        # Write the tex files.
        write_title_tex(title)
        with open("body.tex", "w") as output_file:
            output_file.writelines(lines)

        # Create the PDF file.  LaTeX is run twice because the number of
        # pages is calculated the first time it is run.  nonstopmode keeps
        # pdflatex from waiting for keyboard input on an error, which would
        # be invisible because its output is discarded.
        ret_code = 0
        for _ in range(2):
            ret_code = subprocess.call(["pdflatex", "-interaction=nonstopmode", "transcript.tex"],
                                       stdout=subprocess.DEVNULL,
                                       stderr=subprocess.STDOUT)
        if ret_code != 0:
            print(f"!! pdflatex failed for {transcript_file_name}")
            break

        # Move the output file into place, replacing the PDF if it exists.
        os.replace("transcript.pdf", pdf_file_name)

        print(f">> Written \"{pdf_file_name}\"")


if __name__ == "__main__":
    # The data and doc directories sit next to the current working
    # directory, under its parent.
    parent_dir = os.path.dirname(os.getcwd())
    data_dir, doc_dir = (os.path.join(parent_dir, name) for name in ("data", "doc"))

    # Let the user pick the CSV file; nothing is selected if the dialog
    # is cancelled, in which case there is nothing to do.
    transcript_csv = get_input_file_path(data_dir)
    if transcript_csv:
        process_files(doc_dir, transcript_csv)