# transcripts.py (last commit by William Bell)
import csv
import os
import subprocess
import tkinter
import tkinter.filedialog
import webvtt

def get_input_file_path(initial_dir: str):
    """
    A function to collect the input file name from the user.

    Shows an "open file" dialog titled 'Select input file' that starts in
    initial_dir and lists CSV files.  The dialog's result is returned
    unchanged; the caller in this file treats a zero-length result as
    "nothing selected".
    """
    # The dialog needs a Tk root window; keep it hidden so that only the
    # dialog itself is shown.
    root = tkinter.Tk()
    root.withdraw()
    try:
        return tkinter.filedialog.askopenfilename(initialdir=initial_dir,
                                                  filetypes=[('CSV', '.csv')],
                                                  title='Select input file')
    finally:
        # Release the hidden root window once the dialog has closed, even if
        # the dialog raised.
        root.destroy()


def write_title_tex(title: str):
    """
    A function to write a title.tex file.

    The file is written to the current working directory, replacing any
    existing title.tex.  The title is inserted verbatim into the \\title{}
    command; no LaTeX escaping is applied to it.
    """
    # Raw strings keep the LaTeX backslashes literal without relying on
    # Python leaving unrecognised escapes such as "\s" and "\m" untouched.
    tex_lines = [
        r"\setstretch{1.0}",
        r"\title{" + title + "}",
        r"\author{Audio transcript}",
        r"\date{}",
        r"\maketitle",
        r"\setstretch{1.2}",
    ]
    # The context manager closes the file even if the write fails.
    with open("title.tex", "w") as output_file:
        output_file.write("\n".join(tex_lines) + "\n")


def srt_to_txt(srt_file):
    """
    A function to remove the numbers and times from an srt file.

    Returns the remaining lines as a list of strings, each with its original
    line ending.  Blank lines are kept.

    Two kinds of line are removed:
      * timing lines: lines that start with a digit and contain "-->";
      * cue numbers: lines made up only of digits that are immediately
        followed by a timing line.

    Any other line is kept, including transcript text that happens to start
    with a digit.
    """
    with open(srt_file, "r") as input_file:
        input_lines = input_file.readlines()

    stripped = [line.strip() for line in input_lines]
    # A non-empty test is implied: "-->" can only be found in non-empty text.
    is_time = ["-->" in text and text[0].isdigit() for text in stripped]

    lines = []
    for index, line in enumerate(input_lines):
        if is_time[index]:
            continue
        # Only treat a bare number as a cue number when the timing line comes
        # next; a bare number inside the spoken text is kept.
        followed_by_time = index + 1 < len(input_lines) and is_time[index + 1]
        if followed_by_time and stripped[index].isdigit():
            continue
        lines.append(line)
    return lines


def process_files(doc_dir, csv_file_path):
    """
    A function to create a PDF file for each transcript listed in a CSV file.

    doc_dir is the directory containing the LaTeX files; the process changes
    its working directory to it and stays there.  csv_file_path is the CSV
    file to read.  It must contain the columns "file_name" and "title"; if
    either is missing a message is printed for each missing column and
    nothing is processed.

    For each row, the transcript named in file_name (looked up in the
    directory of the CSV file) is converted to text, title.tex and body.tex
    are written in doc_dir, pdflatex is run twice on transcript.tex and the
    resulting transcript.pdf is moved next to the transcript with a .pdf
    extension.  Rows with an empty file_name or title are skipped silently;
    transcripts that yield no text are skipped with a warning.  Only .srt
    transcripts are converted.  Processing stops at the first row for which
    pdflatex reports failure.
    """
    # Resolve the CSV location before changing directory so that a relative
    # csv_file_path still refers to the same file afterwards.
    csv_abs_path = os.path.abspath(csv_file_path)

    # The data files are expected to be in the
    # same directory as the input CSV file.
    data_dir = os.path.dirname(csv_abs_path)

    # The directory where the LaTeX files are.
    os.chdir(doc_dir)

    # Read every row up front: a DictReader can only be iterated once, so
    # counting the rows on the reader itself would leave nothing to process.
    with open(csv_abs_path, "r", newline='') as input_file:
        dict_reader = csv.DictReader(input_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        # fieldnames is None for a completely empty file.
        field_names = dict_reader.fieldnames or []
        rows = list(dict_reader)

    # Require both columns to be present.
    columns_ok = True
    if "file_name" not in field_names:
        print(f"!! The file \"{csv_file_path}\" must contain a column named file_name.")
        columns_ok = False
    if "title" not in field_names:
        print(f"!! The file \"{csv_file_path}\" must contain a column named title.")
        columns_ok = False
    if not columns_ok:
        return

    print(len(rows))

    # Process each row in the CSV file.
    for row_dict in rows:
        # A row with fewer cells than the header yields None for the missing
        # ones; treat that the same as an empty cell.
        transcript_file_name = row_dict["file_name"] or ""
        title = row_dict["title"] or ""
        if not transcript_file_name or not title:
            continue

        transcript_file_name = os.path.join(data_dir, transcript_file_name)

        # Set the output file name.
        file_prefix, file_extension = os.path.splitext(transcript_file_name)
        pdf_file_name = file_prefix + ".pdf"

        # Attempt to convert the transcript file to text.
        lines = []
        if file_extension == ".srt":
            lines += srt_to_txt(transcript_file_name)

        # If the transcript file cannot be converted.
        if not lines:
            print(f"!! Warning the file \"{transcript_file_name}\" cannot be convert to text.")
            continue

        # Write the tex files.
        write_title_tex(title)
        with open("body.tex", "w") as output_file:
            output_file.writelines(lines)

        # Create the PDF file.  The number of pages is calculated
        # the first time LaTeX is run, so it is run twice; only the
        # result of the final run decides success.
        ret_code = 0
        for _ in range(2):
            ret_code = subprocess.call(["pdflatex", "transcript.tex"],
                                       stdout=subprocess.DEVNULL,
                                       stderr=subprocess.STDOUT)
        if ret_code != 0:
            print(f"!! pdflatex failed for {transcript_file_name}")
            break

        # Move the output file into place, overwriting any existing PDF.
        os.replace("transcript.pdf", pdf_file_name)

        print(f">> Written \"{pdf_file_name}\"")


if __name__ == "__main__":
    # The "data" and "doc" directories are looked up in the parent of the
    # current working directory.
    parent_dir = os.path.dirname(os.getcwd())
    doc_dir = os.path.join(parent_dir, "doc")
    data_dir = os.path.join(parent_dir, "data")

    # Ask the user for the CSV file, starting in the data directory.
    transcript_csv = get_input_file_path(data_dir)

    # Nothing to do when no file was chosen.
    if len(transcript_csv) != 0:
        process_files(doc_dir, transcript_csv)