Newer
Older
import csv
import os
import subprocess
import tkinter
import tkinter.filedialog
import webvtt
"""
A function to collect the input file name from the user.
"""
root = tkinter.Tk()
root.withdraw()
file_path = tkinter.filedialog.askopenfilename(initialdir=initial_dir,
s += "\\title{" + title + "}" + "\n"
s += "\\author{Audio transcript}" + "\n"
s += "\\date{}" + "\n"
s += "\\maketitle" + "\n"
s += "\\setstretch{1.2}" + "\n"
output_file = open(title_file, "w")
A function to remove the numbers and times from an srt captions file.
"""
lines = []
input_file = open(srt_file, "r")
input_lines = input_file.readlines()
input_file.close()
for line in input_lines:
# Skip lines that start with a number.
if len(line) > 0:
if line[0].isdigit():
continue
lines.append(line)
return lines
def vtt_to_txt(vtt_file: str) -> list:
    """
    Strip everything except the caption text from a vtt captions file.

    Returns a list with one entry per caption read by webvtt; each
    entry is the caption's text with two newline characters appended.
    """
    return [caption.text + "\n\n" for caption in webvtt.read(vtt_file)]
def process_files(doc_dir: str, csv_file_path: str) -> None:
    """
    A function to process all caption files and produce
    PDF files.

    The CSV file must contain the columns "file_name" and "title".
    Each file_name is resolved relative to the directory that holds
    the CSV file, and the PDF is written next to the caption file
    using the same base name. Rows with an empty file name or title
    are skipped, as are files that are neither .srt nor .vtt.

    Args:
        doc_dir: The directory where the LaTeX files are. It becomes
            the current working directory.
        csv_file_path: The path of the CSV file listing the caption
            files to convert.
    """
    required_columns = ("file_name", "title")
    latex_command = ["pdflatex", "-interaction=nonstopmode", "transcript.tex"]
    title_file = "title.tex"
    body_file = "body.tex"

    # The data files are expected to be in the
    # same directory as the input CSV file.
    data_dir = os.path.dirname(csv_file_path)
    # The directory where the LaTeX files are.
    os.chdir(doc_dir)

    with open(csv_file_path, "r", newline='') as input_file:
        dict_reader = csv.DictReader(input_file, delimiter=',',
                                     quotechar='"', quoting=csv.QUOTE_MINIMAL)
        # Require both columns to be present. fieldnames is None for
        # an empty file, so treat that as "no columns".
        fieldnames = dict_reader.fieldnames or []
        missing_columns = [name for name in required_columns
                           if name not in fieldnames]
        for name in missing_columns:
            print(f"!! The file \"{csv_file_path}\" must contain a column named {name}.")
        if missing_columns:
            # Without both columns no row can be processed.
            return

        # Process each row in the CSV file.
        for row_dict in dict_reader:
            # A short row yields None for the missing fields.
            transcript_file_name = row_dict.get("file_name") or ""
            title = row_dict.get("title") or ""
            if not transcript_file_name or not title:
                continue
            transcript_file_name = os.path.join(data_dir, transcript_file_name)

            # Set the output file name.
            file_prefix, file_extension = os.path.splitext(transcript_file_name)
            pdf_file_name = file_prefix + ".pdf"

            # Attempt to convert the transcript file to text. The
            # extension is matched case-insensitively (.SRT, .Vtt, ...).
            file_extension = file_extension.lower()
            lines = []
            if file_extension == ".srt":
                lines = srt_to_txt(transcript_file_name)
            elif file_extension == ".vtt":
                lines = vtt_to_txt(transcript_file_name)

            # If the transcript file cannot be converted.
            if not lines:
                print(f"!! Warning the file \"{transcript_file_name}\" cannot be convert to text.")
                continue

            # Write the tex files.
            write_title_tex(title, title_file)
            with open(body_file, "w") as output_file:
                output_file.writelines(lines)

            # Create the PDF file. The number of pages is calculated
            # the first time LaTeX is run, so run it twice; only the
            # result of the final pass decides success.
            ret_code = 0
            for _ in range(2):
                ret_code = subprocess.call(latex_command,
                                           stdout=subprocess.DEVNULL,
                                           stderr=subprocess.STDOUT)
            if ret_code != 0:
                # Stop at the first LaTeX failure; the generated tex
                # files are left in place.
                print(f"!! pdflatex failed for {transcript_file_name}")
                break

            # Move the output file into place, overwriting any
            # existing PDF file of the same name.
            os.replace("transcript.pdf", pdf_file_name)

            # Clean up the doc directory.
            for temp_file in (title_file, body_file):
                if os.path.isfile(temp_file):
                    os.unlink(temp_file)
            print(f">> Written \"{pdf_file_name}\"")
if __name__ == "__main__":
    # A program to convert .srt and .vtt caption files to
    # PDF transcript files.
    #
    # The "data" and "doc" directories are looked up in the parent
    # of the current working directory.
    parent_dir = os.path.dirname(os.getcwd())
    doc_dir = os.path.join(parent_dir, "doc")
    data_dir = os.path.join(parent_dir, "data")

    # Ask for the CSV file; an empty result means nothing to process.
    transcript_csv = get_input_file_path(data_dir)
    if transcript_csv:
        # Normalise the file path for Windows, then process the files.
        transcript_csv = os.path.normpath(transcript_csv)
        process_files(doc_dir, transcript_csv)