import csv
import os
import subprocess
import tkinter
import tkinter.filedialog

import webvtt


def get_input_file_path(initial_dir: str) -> str:
    """
    A function to collect the input file name from the user.

    Opens a file-selection dialog limited to CSV files, starting in
    initial_dir, and returns whatever the dialog returns. The caller
    treats an empty value as "nothing selected".
    """
    # The dialog needs a root window; keep it hidden.
    root = tkinter.Tk()
    root.withdraw()
    try:
        file_path = tkinter.filedialog.askopenfilename(
            initialdir=initial_dir,
            filetypes=[('CSV', '.csv')],
            title='Select input file')
    finally:
        # Release the hidden root window once the dialog has closed.
        root.destroy()
    return file_path


def write_title_tex(title: str) -> None:
    """
    A function to write a title.tex file.

    The file is written to the current working directory. The title is
    inserted as given; LaTeX special characters are not escaped.
    """
    # Raw strings keep the LaTeX backslashes literal without relying on
    # Python passing unknown escape sequences through unchanged.
    tex_lines = [
        r"\setstretch{1.0}",
        r"\title{" + title + "}",
        r"\author{Audio transcript}",
        r"\date{}",
        r"\maketitle",
        r"\setstretch{1.2}",
    ]
    with open("title.tex", "w") as output_file:
        output_file.write("\n".join(tex_lines) + "\n")


def srt_to_txt(srt_file) -> list:
    """
    A function to remove the numbers and times from an srt file.

    Returns the remaining lines of srt_file (line endings included).
    Only lines made up entirely of digits (cue numbers) and lines
    holding a "-->" time range are dropped, so spoken text that merely
    begins with a digit is kept.
    """
    with open(srt_file, "r") as input_file:
        input_lines = input_file.readlines()

    lines = []
    for line in input_lines:
        stripped = line.strip()
        # Skip cue numbers and time-code lines.
        if stripped.isdigit() or "-->" in stripped:
            continue
        lines.append(line)
    return lines


def process_files(doc_dir, csv_file_path):
    """
    Create a PDF file for each transcript listed in a CSV file.

    The CSV file must contain the columns file_name and title. Each
    file_name is looked up in the directory of the CSV file; rows with
    an empty file_name or title are skipped. For every ".srt"
    transcript, title.tex and body.tex are written into doc_dir,
    pdflatex is run on transcript.tex there, and the resulting PDF is
    moved next to the transcript with a ".pdf" extension.

    Note that the current working directory is changed to doc_dir.
    Processing stops at the first transcript for which pdflatex fails.
    """
    # Resolve the path before changing directory so that a relative
    # path still refers to the same file afterwards.
    csv_path = os.path.abspath(csv_file_path)

    # The data files are expected to be in the
    # same directory as the input CSV file.
    data_dir = os.path.dirname(csv_path)

    # The directory where the LaTeX files are.
    os.chdir(doc_dir)

    # Read every row up front: a DictReader can only be iterated once,
    # so counting the rows on the reader itself would leave nothing to
    # process afterwards.
    with open(csv_path, "r", newline='') as input_file:
        dict_reader = csv.DictReader(input_file, delimiter=',', quotechar='"',
                                     quoting=csv.QUOTE_MINIMAL)
        # fieldnames is None for an empty file.
        field_names = dict_reader.fieldnames or []
        rows = list(dict_reader)

    # Require both columns to be present.
    missing_columns = [column for column in ("file_name", "title")
                       if column not in field_names]
    for column in missing_columns:
        print(f"!! The file \"{csv_file_path}\" must contain a column named {column}.")
    if missing_columns:
        return

    print(len(rows))

    # Process each row in the CSV file.
    for row_dict in rows:
        # Short rows yield None for the missing columns.
        transcript_file_name = row_dict.get("file_name") or ""
        title = row_dict.get("title") or ""
        if not transcript_file_name or not title:
            continue

        transcript_file_name = os.path.join(data_dir, transcript_file_name)

        # Set the output file name.
        file_prefix, file_extension = os.path.splitext(transcript_file_name)
        pdf_file_name = file_prefix + ".pdf"

        # Attempt to convert the transcript file to text.
        lines = []
        if file_extension.lower() == ".srt":
            try:
                lines += srt_to_txt(transcript_file_name)
            except (OSError, UnicodeDecodeError) as error:
                # One unreadable transcript should not abort the batch.
                print(f"!! Could not read \"{transcript_file_name}\": {error}")

        # If the transcript file cannot be converted.
        if not lines:
            print(f"!! Warning the file \"{transcript_file_name}\" cannot be convert to text.")
            continue

        # Write the tex files.
        write_title_tex(title)
        with open("body.tex", "w") as output_file:
            output_file.writelines(lines)

        # Create the PDF file. The number of pages is calculated
        # the first time LaTeX is run, so it is run twice and only
        # the result of the final run is checked.
        ret_code = 0
        for _ in range(2):
            ret_code = subprocess.call(["pdflatex", "transcript.tex"],
                                       stdout=subprocess.DEVNULL,
                                       stderr=subprocess.STDOUT)
        if ret_code != 0:
            print(f"!! pdflatex failed for {transcript_file_name}")
            break

        # Move the output file into place, replacing any existing PDF.
        os.replace("transcript.pdf", pdf_file_name)
        print(f">> Written \"{pdf_file_name}\"")


if __name__ == "__main__":
    parent_dir = os.path.dirname(os.getcwd())
    data_dir = os.path.join(parent_dir, "data")
    doc_dir = os.path.join(parent_dir, "doc")
    transcript_csv = get_input_file_path(data_dir)
    # Nothing is processed when no file was selected.
    if transcript_csv:
        process_files(doc_dir, transcript_csv)