Skip to content
Snippets Groups Projects
Commit 86c923bc authored by William Bell's avatar William Bell
Browse files

Escaping special characters. Coping with non-UTF-8 characters from Microsoft Stream.

parent ce1f4a58
No related branches found
No related tags found
No related merge requests found
......@@ -28,11 +28,43 @@ def write_title_tex(title: str, title_file: str) -> None:
s += "\\date{}" + "\n"
s += "\\maketitle" + "\n"
s += "\\setstretch{1.2}" + "\n"
output_file = open(title_file, "w")
output_file = open(title_file, "w", encoding="UTF-8")
output_file.write(s)
output_file.close()
# Translation table mapping each LaTeX special character to its escaped form.
# Word-like macros (\textbackslash, \textless, ...) are terminated with "{}"
# so that a following letter is not absorbed into the macro name
# (e.g. "a<b" must not become the undefined "\textlessb") and a following
# space is not swallowed by LaTeX.
_TEX_ESCAPES = str.maketrans({
    "\\": "\\textbackslash{}",
    "%": "\\%",
    "$": "\\$",
    "{": "\\{",
    "}": "\\}",
    "_": "\\_",
    "#": "\\#",
    "&": "\\&",
    ">": "\\textgreater{}",
    "<": "\\textless{}",
    # "~" is a non-breaking space and "^" is math-mode only in LaTeX, so
    # both need a text-mode replacement to render literally.
    "~": "\\textasciitilde{}",
    "^": "\\textasciicircum{}",
})


def txt_to_tex(text: str) -> str:
    """
    A function to escape special text characters, such that
    LaTeX renders them correctly.

    Each character of the input is examined exactly once, so the
    backslashes and braces introduced by an escape sequence are never
    escaped a second time.

    text: the plain caption text to escape.
    returns: the text with every LaTeX special character replaced by
    its escaped form; an empty string is returned unchanged.
    """
    return text.translate(_TEX_ESCAPES)
def srt_to_txt(srt_file: str) -> list:
"""
A function to remove the numbers and times from an srt captions file.
......@@ -46,7 +78,11 @@ def srt_to_txt(srt_file: str) -> list:
if len(line) > 0:
if line[0].isdigit():
continue
lines.append(line)
# Replace special characters.
tex = txt_to_tex(line)
lines.append(tex)
return lines
......@@ -57,7 +93,13 @@ def vtt_to_txt(vtt_file: str) -> list:
vtt = webvtt.read(vtt_file)
lines = []
for line in vtt:
lines.append(line.text + "\n\n")
# Microsoft Stream vtt text sometimes includes non-UTF-8 characters.
text_line = str(line.text)
# Replace special characters.
tex = txt_to_tex(text_line)
lines.append(tex + "\n\n")
return lines
......@@ -115,7 +157,7 @@ def process_files(doc_dir: str, csv_file_path: str) -> None:
title_file = "title.tex"
body_file = "body.tex"
write_title_tex(title, title_file)
output_file = open(body_file, "w")
output_file = open(body_file, "w", encoding="UTF-8")
output_file.writelines(lines)
output_file.close()
......@@ -158,7 +200,8 @@ if __name__ == "__main__":
A program to convert .srt and .vtt caption files to
PDF transcript files.
"""
parent_dir = os.path.dirname(os.getcwd())
python_dir = os.path.dirname(__file__)
parent_dir = os.path.dirname(python_dir)
data_dir = os.path.join(parent_dir, "data")
doc_dir = os.path.join(parent_dir, "doc")
transcript_csv = get_input_file_path(data_dir)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment