# transcripts.py
# Convert .srt and .vtt caption files to PDF transcript files.
import csv
import os
import subprocess
import tkinter
import tkinter.filedialog
import webvtt

William Bell's avatar
William Bell committed

def get_input_file_path(initial_dir: str) -> str:
    """
    A function to collect the input file name from the user.

    A file-selection dialog is opened in ``initial_dir`` and restricted
    to files with a .csv extension.  Whatever the dialog returns is
    handed back unchanged; the caller in this file treats a zero-length
    result as "nothing was selected".
    """
    # A Tk root window must exist to host the dialog.  It is hidden so
    # that only the dialog itself appears on screen.
    root = tkinter.Tk()
    root.withdraw()
    try:
        file_path = tkinter.filedialog.askopenfilename(
            initialdir=initial_dir,
            filetypes=[('CSV', '.csv')],
            title='Select input file')
    finally:
        # Dispose of the hidden root window; otherwise it stays alive
        # (invisible) for the remainder of the process.
        root.destroy()
    return file_path


William Bell's avatar
William Bell committed
def write_title_tex(title: str, title_file: str) -> None:
    """
    A function to write a title.tex file.

    The file contains the LaTeX commands that set the line spacing,
    declare the title, author ("Audio transcript") and an empty date,
    emit the title block, and then set the line spacing used after it.

    title: the text placed inside the LaTeX title command.  It is
        inserted verbatim (no escaping is performed here).
    title_file: the path of the file to write; an existing file is
        overwritten.
    """
    tex_lines = [
        "\\setstretch{1.0}",
        "\\title{" + title + "}",
        "\\author{Audio transcript}",
        "\\date{}",
        "\\maketitle",
        "\\setstretch{1.2}",
    ]
    # Every command is terminated by a newline, including the last one.
    s = "".join(tex_line + "\n" for tex_line in tex_lines)

    # The context manager closes the file even if the write fails.
    with open(title_file, "w") as output_file:
        output_file.write(s)


William Bell's avatar
William Bell committed
def _is_srt_timing_line(line: str) -> bool:
    """
    Return True if the line looks like an srt timing line, for example
    "00:00:01,000 --> 00:00:04,000".
    """
    # A byte-order mark may precede the very first line of the file.
    head, arrow, _ = line.lstrip("\ufeff").strip().partition("-->")
    return bool(arrow) and head[:1].isdigit() and ":" in head


def srt_to_txt(srt_file: str) -> list:
    """
    A function to remove the numbers and times from an srt captions file.

    Two kinds of lines are removed: timing lines (start --> end), and a
    purely numeric line that is immediately followed by a timing line
    (the cue number).  Every other line, including blank lines and
    caption text that happens to begin with a digit, is returned
    unchanged and in the original order, with its line ending intact.

    srt_file: the path of the srt file to read.
    Returns a list of the remaining lines.
    """
    with open(srt_file, "r") as input_file:
        input_lines = input_file.readlines()

    lines = []
    for index, line in enumerate(input_lines):
        if _is_srt_timing_line(line):
            continue

        # A numeric line is only a cue number when the timing line
        # follows it directly; otherwise it is caption text (e.g. "42").
        if line.lstrip("\ufeff").strip().isdigit():
            following = input_lines[index + 1:index + 2]
            if following and _is_srt_timing_line(following[0]):
                continue

        lines.append(line)
    return lines


William Bell's avatar
William Bell committed
def vtt_to_txt(vtt_file: str) -> list:
    """
    A function to extract the caption text from a vtt captions file.

    The file is read with webvtt and the text of each item it yields
    is returned, in order, followed by a blank line.
    """
    captions = webvtt.read(vtt_file)
    return [caption.text + "\n\n" for caption in captions]


def process_files(doc_dir: str, csv_file_path: str) -> None:
    """
    A function to process all caption files and produce
    PDF files.

    doc_dir: the directory in which pdflatex is run on "transcript.tex".
        The process working directory is changed to this directory and
        is not restored.
    csv_file_path: the CSV file listing the transcripts.  It must have
        the columns "file_name" and "title".  The caption files are
        looked up in the same directory as this CSV file, and each PDF
        file is written next to its caption file.

    Rows with an empty file name or title are skipped.  Files that are
    neither .srt nor .vtt, or that yield no text, are reported and
    skipped.  Processing stops at the first pdflatex failure.
    """

    # Resolve the CSV path before changing directory, so that a relative
    # path still refers to the file the caller meant.
    csv_file_path = os.path.abspath(csv_file_path)

    # The data files are expected to be in the
    # same directory as the input CSV file.
    data_dir = os.path.dirname(csv_file_path)

    # The directory where the LaTeX files are.
    os.chdir(doc_dir)

    # The context manager closes the CSV file on every exit path.
    with open(csv_file_path, "r", newline='') as input_file:
        dict_reader = csv.DictReader(input_file, delimiter=',',
                                     quotechar='"', quoting=csv.QUOTE_MINIMAL)

        # Require both columns to be present.  fieldnames is None when
        # the file has no header row at all.
        field_names = dict_reader.fieldnames or []
        columns_missing = False
        for column in ("file_name", "title"):
            if column not in field_names:
                print(f"!! The file \"{csv_file_path}\" must contain a column named {column}.")
                columns_missing = True
        if columns_missing:
            # Nothing can be processed without both columns.
            return

        # Process each row in the CSV file.
        for row_dict in dict_reader:
            transcript_file_name = row_dict["file_name"]
            title = row_dict["title"]

            # Skip incomplete rows.  A row with too few fields yields
            # None rather than an empty string.
            if not transcript_file_name or not title:
                continue

            transcript_file_name = os.path.join(data_dir, transcript_file_name)

            # Set the output file name.
            file_prefix, file_extension = os.path.splitext(transcript_file_name)
            pdf_file_name = file_prefix + ".pdf"

            # Attempt to convert the transcript file to text.  The
            # extension is matched without regard to case.
            extension = file_extension.lower()
            lines = []
            if extension == ".srt":
                lines += srt_to_txt(transcript_file_name)
            elif extension == ".vtt":
                lines += vtt_to_txt(transcript_file_name)

            # If the transcript file cannot be converted.
            if not lines:
                print(f"!! Warning the file \"{transcript_file_name}\" cannot be convert to text.")
                continue

            # Write the tex files.
            title_file = "title.tex"
            body_file = "body.tex"
            write_title_tex(title, title_file)
            with open(body_file, "w") as output_file:
                output_file.writelines(lines)

            # Create the PDF file.  The number of pages is calculated
            # the first time LaTeX is run, so LaTeX is run twice and the
            # result of the second run decides success.
            for _ in range(2):
                ret_code = subprocess.call(["pdflatex",
                                            "-interaction=nonstopmode",
                                            "transcript.tex"],
                                           stdout=subprocess.DEVNULL,
                                           stderr=subprocess.STDOUT)
            if ret_code != 0:
                print(f"!! pdflatex failed for {transcript_file_name}")
                break

            # Move the output file into place, overwriting any PDF file
            # left over from a previous run.
            os.replace("transcript.pdf", pdf_file_name)

            # Clean up the doc directory.
            if os.path.isfile(title_file):
                os.unlink(title_file)
            if os.path.isfile(body_file):
                os.unlink(body_file)

            print(f">> Written \"{pdf_file_name}\"")


if __name__ == "__main__":
    # A program to convert .srt and .vtt caption files to
    # PDF transcript files.
    #
    # The "data" and "doc" directories are taken to be siblings of the
    # current working directory.
    parent_dir = os.path.dirname(os.getcwd())
    doc_dir = os.path.join(parent_dir, "doc")
    data_dir = os.path.join(parent_dir, "data")

    # Ask the user for the CSV file; nothing is done if none is chosen.
    transcript_csv = get_input_file_path(data_dir)
    if transcript_csv:
        # Normalise the file path for Windows, then process the files.
        transcript_csv = os.path.normpath(transcript_csv)
        process_files(doc_dir, transcript_csv)