"""Convert .srt and .vtt caption files to PDF transcript files.

The user picks a CSV file with ``file_name`` and ``title`` columns. Each
listed caption file (expected next to the CSV) is reduced to plain text,
written to ``body.tex`` with a matching ``title.tex``, and typeset by
running ``pdflatex`` on ``transcript.tex`` inside the LaTeX directory.
"""

import csv
import os
import subprocess
import tkinter
import tkinter.filedialog

import webvtt


def get_input_file_path(initial_dir: str) -> str:
    """
    Ask the user to choose the input CSV file.

    :param initial_dir: Directory the file dialog opens in.
    :return: The value returned by ``askopenfilename`` for the user's
        choice. The caller treats an empty value as "nothing selected".
    """
    root = tkinter.Tk()
    # Hide the empty root window; only the dialog should be visible.
    root.withdraw()
    try:
        return tkinter.filedialog.askopenfilename(
            initialdir=initial_dir,
            filetypes=[('CSV', '.csv')],
            title='Select input file',
        )
    finally:
        # Release the hidden root window once the dialog has closed.
        root.destroy()


def write_title_tex(title: str, title_file: str) -> None:
    """
    Write the LaTeX title block to ``title_file``.

    :param title: Document title. It is inserted verbatim into ``\\title{}``
        (no LaTeX escaping is applied here).
    :param title_file: Path of the .tex file to (over)write.
    """
    tex_lines = [
        "\\setstretch{1.0}",
        "\\title{" + title + "}",
        "\\author{Audio transcript}",
        "\\date{}",
        "\\maketitle",
        "\\setstretch{1.2}",
    ]
    with open(title_file, "w") as output_file:
        output_file.write("\n".join(tex_lines) + "\n")


def _is_srt_metadata(line: str) -> bool:
    """Return True for an srt cue-index line or a cue-timing line."""
    stripped = line.strip()
    # A cue index is a line holding nothing but digits; a timing line
    # always contains the "-->" separator.
    return stripped.isdigit() or "-->" in stripped


def srt_to_txt(srt_file: str) -> list:
    """
    Remove the cue numbers and cue times from an srt captions file.

    Only lines consisting solely of digits (cue indices) and lines that
    contain ``-->`` (cue timings) are dropped, so caption text that merely
    begins with a digit is kept. A caption line that is nothing but a
    number cannot be told apart from a cue index and is still dropped.

    :param srt_file: Path of the .srt file to read.
    :return: The remaining lines, each still ending with its newline.
    """
    with open(srt_file, "r") as input_file:
        return [line for line in input_file if not _is_srt_metadata(line)]


def vtt_to_txt(vtt_file: str) -> list:
    """
    Remove non-text content from a vtt captions file.

    :param vtt_file: Path of the .vtt file to read.
    :return: The text of each caption followed by a blank line.
    """
    return [caption.text + "\n\n" for caption in webvtt.read(vtt_file)]


def _caption_lines(transcript_file_name: str, file_extension: str) -> list:
    """Return the caption text lines, or an empty list if unsupported/unreadable."""
    extension = file_extension.lower()
    try:
        if extension == ".srt":
            return srt_to_txt(transcript_file_name)
        if extension == ".vtt":
            return vtt_to_txt(transcript_file_name)
    except OSError:
        # A missing or unreadable caption file is reported by the caller
        # like any other unconvertible file instead of aborting the batch.
        pass
    return []


def process_files(doc_dir: str, csv_file_path: str) -> None:
    """
    Process all caption files listed in a CSV file and produce PDF files.

    The CSV must have ``file_name`` and ``title`` columns. Rows where
    either value is empty or absent are skipped. Each PDF is written next
    to its caption file, with the extension replaced by ``.pdf``.

    Side effects: changes the process working directory to ``doc_dir``,
    and writes/removes ``title.tex`` and ``body.tex`` there.

    :param doc_dir: Directory that contains ``transcript.tex``.
    :param csv_file_path: Path of the CSV file; the caption files are
        expected to be in the same directory.
    """
    # Resolve the CSV path before changing directory so that a relative
    # path still points at the same file afterwards.
    csv_file_path = os.path.abspath(csv_file_path)

    # The data files are expected to be in the
    # same directory as the input CSV file.
    data_dir = os.path.dirname(csv_file_path)

    # The directory where the LaTeX files are.
    os.chdir(doc_dir)

    title_file = "title.tex"
    body_file = "body.tex"
    pdflatex_command = ["pdflatex", "-interaction=nonstopmode", "transcript.tex"]

    with open(csv_file_path, "r", newline='') as input_file:
        dict_reader = csv.DictReader(input_file, delimiter=',', quotechar='"',
                                     quoting=csv.QUOTE_MINIMAL)

        # Require both columns to be present. fieldnames is None for an
        # empty file, which is treated as "no columns".
        fieldnames = dict_reader.fieldnames or []
        columns_missing = False
        for column in ("file_name", "title"):
            if column not in fieldnames:
                print(f"!! The file \"{csv_file_path}\" must contain a column named {column}.")
                columns_missing = True
        if columns_missing:
            # Without both columns no row can be processed.
            return

        # Process each row in the CSV file.
        for row_dict in dict_reader:
            transcript_file_name = row_dict["file_name"]
            title = row_dict["title"]
            # Cells are None when a row has fewer fields than the header.
            if not transcript_file_name or not title:
                continue

            transcript_file_name = os.path.join(data_dir, transcript_file_name)

            # Set the output file name.
            file_prefix, file_extension = os.path.splitext(transcript_file_name)
            pdf_file_name = file_prefix + ".pdf"

            # Attempt to convert the transcript file to text.
            lines = _caption_lines(transcript_file_name, file_extension)

            # If the transcript file cannot be converted.
            if len(lines) == 0:
                print(f"!! Warning the file \"{transcript_file_name}\" cannot be convert to text.")
                continue

            # Write the tex files.
            write_title_tex(title, title_file)
            with open(body_file, "w") as output_file:
                output_file.writelines(lines)

            # Create the PDF file. The number of pages is calculated
            # the first time LaTeX is run, so pdflatex is run twice and
            # only the status of the second run is checked.
            subprocess.call(pdflatex_command,
                            stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
            ret_code = subprocess.call(pdflatex_command,
                                       stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
            if ret_code != 0:
                print(f"!! pdflatex failed for {transcript_file_name}")
                # Stop the batch; the .tex files are left in place.
                break

            # Move the output into place, overwriting any existing PDF.
            os.replace("transcript.pdf", pdf_file_name)

            # Clean up the doc directory.
            for tex_file in (title_file, body_file):
                if os.path.isfile(tex_file):
                    os.unlink(tex_file)

            print(f">> Written \"{pdf_file_name}\"")


if __name__ == "__main__":
    # The data and doc directories are siblings of the current working
    # directory.
    parent_dir = os.path.dirname(os.getcwd())
    data_dir = os.path.join(parent_dir, "data")
    doc_dir = os.path.join(parent_dir, "doc")

    transcript_csv = get_input_file_path(data_dir)

    if len(transcript_csv) > 0:
        # Normalise the file path for Windows.
        transcript_csv = os.path.normpath(transcript_csv)

        # Process the files.
        process_files(doc_dir, transcript_csv)