# transcripts.py - convert .srt and .vtt caption files to PDF transcript files.

import csv
import os
import subprocess
import tkinter
import tkinter.filedialog
import webvtt


def get_input_file_path(initial_dir: str) -> str:
    """
    A function to collect the input file name from the user.

    A file-open dialog restricted to CSV files is shown, starting in
    initial_dir.  The value returned by the dialog is passed back
    unchanged (per the tkinter documentation this is an empty string
    when the dialog is cancelled).
    """
    # The dialog needs a Tk root window; it is hidden so that only the
    # dialog itself is visible.
    root = tkinter.Tk()
    try:
        root.withdraw()
        file_path = tkinter.filedialog.askopenfilename(
            initialdir=initial_dir,
            filetypes=[('CSV', '.csv')],
            title='Select input file')
    finally:
        # Release the hidden root window even if the dialog raises;
        # otherwise it lives on for the rest of the process.
        root.destroy()
    return file_path


def write_title_tex(title: str, title_file: str) -> None:
    """
    A function to write a title.tex file.

    The file sets the line stretch to 1.0, emits the title block
    (the given title, the fixed author "Audio transcript" and an
    empty date) and then sets the line stretch to 1.2.  The title is
    inserted verbatim, so it must already be valid LaTeX.
    """
    tex_lines = [
        "\\setstretch{1.0}",
        "\\title{" + title + "}",
        "\\author{Audio transcript}",
        "\\date{}",
        "\\maketitle",
        "\\setstretch{1.2}",
    ]
    # The context manager closes the file even if the write fails.
    with open(title_file, "w") as output_file:
        output_file.write("\n".join(tex_lines) + "\n")


def srt_to_txt(srt_file: str) -> list:
    """
    A function to remove the numbers and times from an srt captions file.

    Only the cue index (an all-digit line at the start of a cue block)
    and the timing line (a line starting with a digit and containing
    "-->") are removed.  Caption text that merely begins with a digit
    is kept.  Blank lines are kept so that the cues stay separated.
    The remaining lines are returned with their line endings.
    """
    with open(srt_file, "r") as input_file:
        input_lines = input_file.readlines()

    lines = []
    # True while the next non-blank line may be a cue index, i.e. at
    # the start of the file and after every blank separator line.
    at_block_start = True
    for line in input_lines:
        # A byte order mark may precede the first cue index.
        stripped = line.strip().lstrip("\ufeff")

        if not stripped:
            at_block_start = True
            lines.append(line)
            continue

        # Skip the timing line, e.g. 00:00:01,000 --> 00:00:04,000.
        if stripped[0].isdigit() and "-->" in stripped:
            at_block_start = False
            continue

        # Skip the cue index, but only where an index is expected so
        # that numeric caption text is not lost.
        if at_block_start and stripped.isdigit():
            at_block_start = False
            continue

        at_block_start = False
        lines.append(line)
    return lines


def vtt_to_txt(vtt_file: str) -> list:
    """
    A function to extract the caption text from a vtt captions file.

    The file is read with webvtt and the text of each caption is
    returned in order, each followed by a blank line.
    """
    captions = webvtt.read(vtt_file)
    return [caption.text + "\n\n" for caption in captions]


def process_files(doc_dir: str, csv_file_path: str) -> None:
    """
    A function to process all caption files and produce
    PDF files.

    doc_dir is the directory that holds the LaTeX files; the
    function changes the current working directory to it and runs
    pdflatex on transcript.tex there.  csv_file_path is a CSV file
    with the columns file_name and title.  The caption files named
    in the file_name column are looked up in the directory of the
    CSV file, and each PDF is written next to its caption file.

    Nothing is processed if a required column is missing.  Rows with
    an empty file name or title are skipped, as are caption files
    that cannot be read or converted.  Processing stops at the first
    file for which pdflatex fails.
    """

    # The data files are expected to be in the
    # same directory as the input CSV file.
    data_dir = os.path.dirname(csv_file_path)

    # The directory where the LaTeX files are.
    os.chdir(doc_dir)

    title_file = "title.tex"
    body_file = "body.tex"
    latex_command = ["pdflatex", "-interaction=nonstopmode", "transcript.tex"]

    # The context manager closes the CSV file on every exit path.
    with open(csv_file_path, "r", newline='') as input_file:
        dict_reader = csv.DictReader(input_file, delimiter=',',
                                     quotechar='"', quoting=csv.QUOTE_MINIMAL)

        # Require both columns to be present.  fieldnames is None for
        # an empty file, which is treated as having no columns.
        fieldnames = dict_reader.fieldnames or []
        missing_columns = [column for column in ("file_name", "title")
                           if column not in fieldnames]
        for column in missing_columns:
            print(f"!! The file \"{csv_file_path}\" must contain a column named {column}.")
        if missing_columns:
            # Reading the rows would fail on the missing key.
            return

        # Process each row in the CSV file.
        for row_dict in dict_reader:
            transcript_file_name = row_dict["file_name"]
            title = row_dict["title"]
            # A cell is None when the row has fewer fields than the
            # header; treat that like an empty cell.
            if not transcript_file_name or not title:
                continue

            transcript_file_name = os.path.join(data_dir, transcript_file_name)

            # Set the output file name.
            file_prefix, file_extension = os.path.splitext(transcript_file_name)
            pdf_file_name = file_prefix + ".pdf"

            # Attempt to convert the transcript file to text.  The
            # extension is compared case-insensitively so that names
            # such as CLIP.SRT are accepted.
            file_extension = file_extension.lower()
            lines = []
            try:
                if file_extension == ".srt":
                    lines += srt_to_txt(transcript_file_name)
                elif file_extension == ".vtt":
                    lines += vtt_to_txt(transcript_file_name)
            except OSError as error:
                # A missing or unreadable file should not stop the
                # remaining rows from being processed.
                print(f"!! Warning the file \"{transcript_file_name}\" cannot be read: {error}")
                continue

            # If the transcript file cannot be converted.
            if len(lines) == 0:
                print(f"!! Warning the file \"{transcript_file_name}\" cannot be convert to text.")
                continue

            # Write the tex files.
            # NOTE(review): the title and caption text are written
            # verbatim, so LaTeX special characters in them (for
            # example % or &) are not escaped - confirm this is intended.
            write_title_tex(title, title_file)
            with open(body_file, "w") as output_file:
                output_file.writelines(lines)

            # Create the PDF file.  The number of pages is calculated
            # the first time LaTeX is run, so it is run twice and the
            # result of the second run decides success.
            ret_code = 0
            for _ in range(2):
                ret_code = subprocess.call(latex_command,
                                           stdout=subprocess.DEVNULL,
                                           stderr=subprocess.STDOUT)
            if ret_code != 0:
                print(f"!! pdflatex failed for {transcript_file_name}")
                # title.tex and body.tex are left in place here
                # because the clean up below is not reached.
                break

            # Move the output file into place, replacing any
            # existing PDF file of the same name.
            os.replace("transcript.pdf", pdf_file_name)

            # Clean up the doc directory.
            if os.path.isfile(title_file):
                os.unlink(title_file)
            if os.path.isfile(body_file):
                os.unlink(body_file)

            print(f">> Written \"{pdf_file_name}\"")


if __name__ == "__main__":
    # A program to convert .srt and .vtt caption files to
    # PDF transcript files.
    #
    # The data and doc directories are looked up in the parent of
    # the current working directory.
    base_dir = os.path.dirname(os.getcwd())
    doc_dir = os.path.join(base_dir, "doc")

    # Ask the user for the CSV file, starting in the data directory.
    selected_csv = get_input_file_path(os.path.join(base_dir, "data"))

    # Nothing is selected when the dialog is cancelled.
    if selected_csv:
        # The file path is normalised for Windows before processing.
        process_files(doc_dir, os.path.normpath(selected_csv))