Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import csv
import os
import subprocess
import tkinter
import tkinter.filedialog
import webvtt
def get_input_file_path(initial_dir:str):
    """
    Ask the user to choose the input CSV file with a file-open dialog.

    Parameters:
        initial_dir: Directory the dialog starts in.

    Returns:
        The value returned by the dialog. The caller in this file treats an
        empty result as "nothing was selected".
    """
    # A Tk root is required for the dialog; hide it so that only the
    # dialog itself is shown.
    root = tkinter.Tk()
    root.withdraw()
    try:
        file_path = tkinter.filedialog.askopenfilename(initialdir=initial_dir,
                                                       filetypes=[('CSV', '.csv')],
                                                       title='Select input file')
    finally:
        # Release the hidden root window; otherwise it (and its Tcl
        # interpreter) stays alive for the rest of the process.
        root.destroy()
    return file_path
def write_title_tex(title:str):
    """
    Write the LaTeX title block to "title.tex" in the current directory.

    The file sets the line stretch to 1.0, declares the title, a fixed
    author ("Audio transcript") and an empty date, emits the title, and
    then sets the line stretch to 1.2.

    Parameters:
        title: Text placed inside the title command. It is inserted
            verbatim, so LaTeX special characters are not escaped.
    """
    # Raw strings keep the LaTeX backslashes literal. Sequences such as
    # backslash-s and backslash-m are not valid Python escapes and trigger
    # a warning when written in a normal string literal.
    tex_lines = [
        r"\setstretch{1.0}",
        r"\title{" + title + "}",
        r"\author{Audio transcript}",
        r"\date{}",
        r"\maketitle",
        r"\setstretch{1.2}",
    ]
    # The context manager closes the file even if the write fails.
    with open("title.tex", "w") as output_file:
        output_file.write("\n".join(tex_lines) + "\n")
def srt_to_txt(srt_file):
    """
    Read an srt file and return its lines without the cue numbers and times.

    A line is dropped when it is a timestamp line (it starts with a digit
    and contains "-->"), or when it consists only of digits and is directly
    followed by a timestamp line (the cue index). Every other line,
    including blank lines and caption text that happens to start with a
    digit, is returned unchanged.

    Parameters:
        srt_file: Path of the srt file to read.

    Returns:
        A list of the remaining lines, each still ending with its original
        line terminator.
    """
    def is_timestamp(text):
        # Timestamp lines look like "00:00:01,000 --> 00:00:02,000".
        stripped = text.strip()
        return "-->" in stripped and stripped[:1].isdigit()

    with open(srt_file, "r") as input_file:
        input_lines = input_file.readlines()

    lines = []
    for index, line in enumerate(input_lines):
        # A byte-order mark may precede the very first cue index.
        probe = line.lstrip("\ufeff")
        if is_timestamp(probe):
            continue
        if index + 1 < len(input_lines):
            next_line = input_lines[index + 1]
        else:
            next_line = ""
        # Only treat a number as a cue index when a timestamp follows it,
        # so caption text that starts with a digit is kept.
        if probe.strip().isdigit() and is_timestamp(next_line):
            continue
        lines.append(line)
    return lines
def process_files(doc_dir, csv_file_path):
    """
    Create one PDF per transcript listed in a CSV file.

    The CSV file must have the columns "file_name" and "title". For each
    row with both values present, the transcript (only ".srt" files are
    converted) is written to "body.tex", the title to "title.tex", and
    pdflatex is run twice on "transcript.tex". The resulting
    "transcript.pdf" is moved next to the transcript with a ".pdf"
    extension. Processing stops at the first row for which pdflatex fails.

    Side effects: changes the current working directory to doc_dir and
    prints progress and warning messages.

    Parameters:
        doc_dir: Directory containing the LaTeX files; becomes the current
            working directory.
        csv_file_path: Path of the input CSV file. The transcript files
            are expected to be in the same directory.
    """
    # Resolve the path before changing directory so that a relative path
    # still refers to the same file afterwards.
    csv_file_path = os.path.abspath(csv_file_path)

    # The data files are expected to be in the
    # same directory as the input CSV file.
    data_dir = os.path.dirname(csv_file_path)

    # The directory where the LaTeX files are.
    os.chdir(doc_dir)

    with open(csv_file_path, "r", newline='') as input_file:
        dict_reader = csv.DictReader(input_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        # Require both columns to be present. fieldnames is None for an
        # empty file, so fall back to an empty list.
        fieldnames = dict_reader.fieldnames or []
        missing_columns = [name for name in ("file_name", "title") if name not in fieldnames]
        for name in missing_columns:
            print(f"!! The file \"{csv_file_path}\" must contain a column named {name}.")
        if missing_columns:
            return

        # Read the rows once into a list. Counting the rows by iterating
        # the reader would exhaust it and leave nothing to process.
        rows = list(dict_reader)

    print(len(rows))

    # Process each row in the CSV file.
    for row_dict in rows:
        # Short rows yield None for missing cells; skip those as well as
        # empty values.
        transcript_file_name = row_dict["file_name"]
        if not transcript_file_name:
            continue
        title = row_dict["title"]
        if not title:
            continue
        transcript_file_name = os.path.join(data_dir, transcript_file_name)

        # Set the output file name.
        file_prefix, file_extension = os.path.splitext(transcript_file_name)
        pdf_file_name = file_prefix + ".pdf"

        # Attempt to convert the transcript file to text.
        lines = []
        if file_extension == ".srt":
            lines += srt_to_txt(transcript_file_name)

        # If the transcript file cannot be converted.
        if not lines:
            print(f"!! Warning the file \"{transcript_file_name}\" cannot be convert to text.")
            continue

        # Write the tex files.
        write_title_tex(title)
        with open("body.tex", "w") as output_file:
            output_file.writelines(lines)

        # Create the PDF file. The number of pages is calculated
        # the first time LaTeX is run, so it is run twice; only the
        # return code of the second run is checked.
        for _ in range(2):
            ret_code = subprocess.call(["pdflatex", "transcript.tex"],
                                       stdout=subprocess.DEVNULL,
                                       stderr=subprocess.STDOUT)
        if ret_code != 0:
            print(f"!! pdflatex failed for {transcript_file_name}")
            break

        # Move the output file into place, overwriting any existing PDF.
        os.replace("transcript.pdf", pdf_file_name)
        print(f">> Written \"{pdf_file_name}\"")
if __name__ == "__main__":
    # The "data" and "doc" directories are looked up one level above the
    # directory the script is started from.
    parent_dir = os.path.dirname(os.getcwd())
    doc_dir = os.path.join(parent_dir, "doc")
    data_dir = os.path.join(parent_dir, "data")

    # Let the user pick the CSV file; an empty selection means there is
    # nothing to do.
    transcript_csv = get_input_file_path(data_dir)
    if transcript_csv:
        process_files(doc_dir, transcript_csv)