Skip to content
Snippets Groups Projects
Commit 7ffb4b25 authored by William Bell's avatar William Bell
Browse files

Updating files

parent 2b07caec
No related branches found
No related tags found
No related merge requests found
file_name,title
test.srt,My title
test1.srt,A test srt file
test2.vtt,A test vtt file
1
00:00:04,090 --> 00:00:05,260
Hi.
2
00:00:05,260 --> 00:00:07,710
So in this video, I'm
going to talk about
3
00:00:07,710 --> 00:00:11,640
what exactly is a database, so
what do we mean by a database?
4
00:00:11,640 --> 00:00:14,220
So a database is an
organised collection of data
5
00:00:14,220 --> 00:00:17,760
that we want to exist
for a period of time.
6
00:00:17,760 --> 00:00:19,050
We want to have access to it.
7
00:00:19,050 --> 00:00:20,070
We want to update it.
8
00:00:20,070 --> 00:00:22,750
We want to be able to
manipulate it in some way.
9
00:00:22,750 --> 00:00:26,280
So we want to have essentially
persistent storage and access
10
00:00:26,280 --> 00:00:28,180
to this data.
11
00:00:28,180 --> 00:00:29,900
So access to a
database is generally
12
00:00:29,900 --> 00:00:32,400
managed by a database
management system.
13
00:00:32,400 --> 00:00:35,120
So that allows us to do
various things, like control
14
00:00:35,120 --> 00:00:38,780
who has access to what parts
of the database, control who
15
00:00:38,780 --> 00:00:43,010
can update the database, allow
for the database to be updated,
16
00:00:43,010 --> 00:00:45,770
allow for searches to be
conducted, for information
17
00:00:45,770 --> 00:00:48,710
to be extracted,
and generally just
18
00:00:48,710 --> 00:00:51,027
all aspects of how we
interact with the data that
19
00:00:51,027 --> 00:00:54,280
is stored within the database.
20
00:00:54,280 --> 00:00:57,390
So what sort of applications
do we use databases for?
21
00:00:57,390 --> 00:00:59,850
So there are a wide
variety of databases
22
00:00:59,850 --> 00:01:01,890
in terms of size and scope.
23
00:01:01,890 --> 00:01:03,900
So you may have
tiny databases that
24
00:01:03,900 --> 00:01:06,120
operate on your mobile
device, for example, just
25
00:01:06,120 --> 00:01:08,160
for some of your personal data.
26
00:01:08,160 --> 00:01:10,230
You may have more medium
size databases that
27
00:01:10,230 --> 00:01:12,700
live within applications
that we use,
28
00:01:12,700 --> 00:01:15,510
so things like Skype
or our browsers
29
00:01:15,510 --> 00:01:17,580
to, again, manage some
of the data that we have.
30
00:01:17,580 --> 00:01:20,650
And we move up to larger
scale databases, the things
31
00:01:20,650 --> 00:01:23,920
that you might see in
enterprises, for example.
32
00:01:23,920 --> 00:01:28,110
So your bank or your insurer
may have larger data sets
33
00:01:28,110 --> 00:01:30,930
that they want to
interact with and use.
34
00:01:30,930 --> 00:01:33,680
Then we have even bigger
types of applications.
35
00:01:33,680 --> 00:01:37,510
So if you imagine a search
engine or a recommender system
36
00:01:37,510 --> 00:01:43,690
or some sort of internet-based
video browser or service,
37
00:01:43,690 --> 00:01:45,700
for example, like
YouTube, then you're
38
00:01:45,700 --> 00:01:48,280
going to have millions or
billions of interactions
39
00:01:48,280 --> 00:01:50,620
every day, millions
or billions of items
40
00:01:50,620 --> 00:01:54,100
that you want to maintain,
that you want to log,
41
00:01:54,100 --> 00:01:56,230
that you want to have
information about.
42
00:01:56,230 --> 00:01:57,730
And it's at this
point that we may
43
00:01:57,730 --> 00:02:01,120
need to consider using
different types of technologies
44
00:02:01,120 --> 00:02:03,400
to traditional
relational database
45
00:02:03,400 --> 00:02:05,650
to allow us to
manage those volumes
46
00:02:05,650 --> 00:02:08,670
and rapidly changing
types of data.
47
00:02:08,670 --> 00:02:10,000
\documentclass[12pt,a4paper]{article}
\usepackage{strath-assignment}
\usepackage{strath-transcript}
\usepackage{amsmath}
\usepackage{listings}
\usepackage[parfill]{parskip}
......
......@@ -5,36 +5,37 @@ import tkinter
import tkinter.filedialog
import webvtt
def get_input_file_path(initial_dir: str) -> str:
    """
    A function to collect the input file name from the user.

    A file-open dialog restricted to CSV files is shown, starting in
    initial_dir. The path chosen in the dialog is returned; the caller
    is expected to check for an empty value, which indicates that no
    file was selected.
    """
    # A Tk root window has to exist for the dialog, but it should
    # not be visible to the user.
    root = tkinter.Tk()
    root.withdraw()
    try:
        file_path = tkinter.filedialog.askopenfilename(
            initialdir=initial_dir,
            filetypes=[('CSV', '.csv')],
            title='Select input file')
    finally:
        # Release the hidden root window once the dialog has closed.
        root.destroy()
    return file_path
def write_title_tex(title: str, title_file: str = "title.tex") -> None:
    """
    A function to write a title.tex file.

    title is inserted verbatim into the LaTeX title command, so any
    LaTeX special characters it contains are passed through unchanged.
    title_file is the path of the file to write; it defaults to
    "title.tex" in the current working directory. An existing file is
    overwritten.
    """
    # Single line spacing is used for the title block and 1.2 line
    # spacing for everything that follows it.
    tex_lines = [
        "\\setstretch{1.0}",
        "\\title{" + title + "}",
        "\\author{Audio transcript}",
        "\\date{}",
        "\\maketitle",
        "\\setstretch{1.2}",
    ]
    # The context manager closes the file even if the write fails.
    with open(title_file, "w") as output_file:
        output_file.write("\n".join(tex_lines) + "\n")
def srt_to_txt(srt_file):
def srt_to_txt(srt_file: str) -> list:
"""
A function to remove the numbers and times from an srt file.
A function to remove the numbers and times from an srt captions file.
"""
lines = []
input_file = open(srt_file, "r")
......@@ -49,8 +50,24 @@ def srt_to_txt(srt_file):
return lines
def process_files(doc_dir, csv_file_path):
# The data files are expected to be in the
def vtt_to_txt(vtt_file: str) -> list:
    """
    A function to extract the caption text from a vtt captions file.

    The file is read with webvtt and the text of each caption is
    returned, in file order, followed by a blank line.
    """
    captions = webvtt.read(vtt_file)
    return [caption.text + "\n\n" for caption in captions]
def process_files(doc_dir: str, csv_file_path: str) -> None:
"""
A function to process all caption files and produce
PDF files.
"""
# The data files are expected to be in the
# same directory as the input CSV file.
data_dir = os.path.dirname(csv_file_path)
......@@ -58,16 +75,14 @@ def process_files(doc_dir, csv_file_path):
os.chdir(doc_dir)
input_file = open(csv_file_path, "r", newline='')
dict_reader = csv.DictReader(input_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
dict_reader = csv.DictReader(input_file, delimiter=',',
quotechar='"', quoting=csv.QUOTE_MINIMAL)
# Require both columns to be present.
if "file_name" not in dict_reader.fieldnames:
print(f"!! The file \"{csv_file_path}\" must contain a column named file_name.")
if "title" not in dict_reader.fieldnames:
print(f"!! The file \"{csv_file_path}\" must contain a column named title.")
csv_lines = sum(1 for row in dict_reader)
print(csv_lines)
# Process each row in the CSV file.
for row_dict in dict_reader:
......@@ -83,29 +98,37 @@ def process_files(doc_dir, csv_file_path):
# Set the output file file name.
file_prefix, file_extension = os.path.splitext(transcript_file_name)
pdf_file_name = file_prefix + ".pdf"
# Attempt to convert the transcript file to text.
lines = []
if file_extension == ".srt":
lines += srt_to_txt(transcript_file_name)
elif file_extension == ".vtt":
lines += vtt_to_txt(transcript_file_name)
# If the transcript file cannot be converted.
if len(lines) == 0:
print(f"!! Warning the file \"{transcript_file_name}\" cannot be convert to text.")
continue
# Write the tex files.
write_title_tex(title)
output_file = open("body.tex", "w")
title_file = "title.tex"
body_file = "body.tex"
write_title_tex(title, title_file)
output_file = open(body_file, "w")
output_file.writelines(lines)
output_file.close()
# Create the PDF file. The number of pages is calculated
# the first time LaTeX is run.
ret_code = subprocess.call(["pdflatex", "transcript.tex"],
ret_code = subprocess.call(["pdflatex",
"-interaction=nonstopmode",
"transcript.tex"],
stdout=subprocess.DEVNULL,
stderr=subprocess.STDOUT)
ret_code = subprocess.call(["pdflatex", "transcript.tex"],
ret_code = subprocess.call(["pdflatex",
"-interaction=nonstopmode",
"transcript.tex"],
stdout=subprocess.DEVNULL,
stderr=subprocess.STDOUT)
if ret_code != 0:
......@@ -119,15 +142,29 @@ def process_files(doc_dir, csv_file_path):
# Rename the output file.
os.rename("transcript.pdf", pdf_file_name)
# Clean up the doc directory.
if os.path.isfile(title_file):
os.unlink(title_file)
if os.path.isfile(body_file):
os.unlink(body_file)
print(f">> Written \"{pdf_file_name}\"")
input_file.close()
if __name__ == "__main__":
    # A program to convert .srt and .vtt caption files to
    # PDF transcript files.

    # The data and doc directories are located relative to the parent
    # of the current working directory.
    parent_dir = os.path.dirname(os.getcwd())
    data_dir = os.path.join(parent_dir, "data")
    doc_dir = os.path.join(parent_dir, "doc")

    # Nothing is processed if no input file was selected.
    transcript_csv = get_input_file_path(data_dir)
    if transcript_csv:

        # Normalise the file path for Windows.
        transcript_csv = os.path.normpath(transcript_csv)

        # Process the files.
        process_files(doc_dir, transcript_csv)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment