Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • gxb20157/precision-and-recall
1 result
Show changes
Commits on Source (2)
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# For LaTeX
*.aux
*.glo
*.idx
*.log
*.toc
*.ist
*.acn
*.acr
*.alg
*.bbl
*.blg
*.dvi
*.glg
*.gls
*.ilg
*.ind
*.lof
*.lot
*.maf
*.mtc
*.mtc1
*.out
*.synctex.gz
*-eps-converted-to.pdf
# Ignore .png files that are produced.
*.png
\ No newline at end of file
def precision(relevant_ranks: list) -> list:
    """
    Return the precision as a function of rank.

    :param relevant_ranks: the rank positions (1-based) of the
        relevant documents.
    :return: a list whose element i holds the precision after the
        first i + 1 documents have been retrieved, up to the highest
        relevant rank; an empty list if no ranks are given.
    """
    # Guard the empty case: max() on an empty list raises ValueError.
    if not relevant_ranks:
        return []
    # A set gives O(1) membership tests inside the loop.
    relevant = set(relevant_ranks)
    n = max(relevant)
    results = [0.] * n
    n_relevant = 0
    for i in range(n):
        # Rank is 1-based; i + 1 documents have been retrieved so far.
        if i + 1 in relevant:
            n_relevant += 1
        results[i] = n_relevant / (i + 1)
    return results
def recall(relevant_ranks: list) -> list:
    """
    Return the recall as a function of rank.

    :param relevant_ranks: the rank positions (1-based) of the
        relevant documents.
    :return: a list whose element i holds the recall after the
        first i + 1 documents have been retrieved, up to the highest
        relevant rank; an empty list if no ranks are given.
    """
    # Guard the empty case: max() on an empty list raises ValueError.
    if not relevant_ranks:
        return []
    # A set gives O(1) membership tests inside the loop.
    relevant = set(relevant_ranks)
    n = max(relevant)
    results = [0.] * n
    # Recall is measured against the number of ranks supplied.
    total_relevant = len(relevant_ranks)
    n_relevant = 0
    for i in range(n):
        # Rank is 1-based.
        if i + 1 in relevant:
            n_relevant += 1
        results[i] = n_relevant / total_relevant
    return results
def interpolate_precision(recall: list, precision: list) -> tuple:
    """
    Interpolate precision values for a precision-recall curve.

    :param recall: recall values at each rank.
    :param precision: precision values at each rank, same length
        as recall.
    :return: a tuple of (interpolated recall, interpolated
        precision); a pair of empty lists if either input is empty.
    """
    if not recall or not precision:
        return ([], [])
    # Interpolate to the right: each point takes the maximum
    # precision at or beyond it.  A single reverse suffix-max pass
    # is O(n), replacing the O(n^2) max-of-slice per element.
    int_precision = list(precision)
    for i in range(len(int_precision) - 2, -1, -1):
        if int_precision[i + 1] > int_precision[i]:
            int_precision[i] = int_precision[i + 1]
    # Extend the curve back to recall = 0, holding the first
    # interpolated precision constant.
    int_recall = recall.copy()
    if int_recall[0] != 0:
        int_recall = [0] + int_recall
        int_precision = [int_precision[0]] + int_precision
    return (int_recall, int_precision)
def average_precision(relevant_ranks: list, precision: list) -> list:
    """
    Calculate the average precision at each rank.

    :param relevant_ranks: the rank positions (1-based) of the
        relevant documents.
    :param precision: the precision at each rank.
    :return: a list whose element i holds the mean of the precision
        values at the relevant ranks seen so far.
    """
    n = len(precision)
    # A set gives O(1) membership tests inside the loop.
    relevant = set(relevant_ranks)
    results = [0.] * n
    precision_sum = 0.
    n_relevant = 0
    for i in range(n):
        if i + 1 in relevant:
            precision_sum += precision[i]
            n_relevant += 1
        # Before the first relevant document the average is
        # undefined; report 0 rather than dividing by zero (the
        # original crashed when rank 1 was not relevant).
        results[i] = precision_sum / n_relevant if n_relevant else 0.
    return results
import matplotlib.pyplot as plt
def recall_precision(file_name: str,
                     rank: list,
                     recall: list,
                     precision: list) -> None:
    """
    Plot recall and precision against rank on a shared x-axis
    and save the figure to the given file.
    """
    fig, recall_axis = plt.subplots()
    # Recall curve on the left-hand y-axis.
    recall_axis.set_xlabel('Rank')
    recall_axis.set_ylabel('Recall')
    recall_lines = recall_axis.plot(rank, recall, label="Recall")
    recall_axis.tick_params(axis='y')
    # Precision curve on a second y-axis sharing the same x-axis.
    precision_axis = recall_axis.twinx()
    precision_axis.set_ylabel('Precision')
    precision_lines = precision_axis.plot(rank, precision, '--',
                                          label="Precision")
    precision_axis.tick_params(axis='y')
    # One combined legend covering the lines from both axes.
    all_lines = recall_lines + precision_lines
    all_labels = [line.get_label() for line in all_lines]
    recall_axis.legend(all_lines, all_labels,
                       bbox_to_anchor=(1.12, 1), borderaxespad=0)
    fig.tight_layout()
    plt.savefig(file_name)
    plt.close()
def precision_vs_recall(file_name: str,
                        recall: list,
                        precision: list,
                        interpolated_recall: list = None,
                        interpolated_precision: list = None,
                        use_legend: bool = True) -> None:
    """
    Plot precision vs recall, optionally overlaying the
    interpolated curve, and save the figure to the given file.

    :param file_name: path the figure is saved to.
    :param recall: recall values (x-axis).
    :param precision: precision values (y-axis).
    :param interpolated_recall: optional interpolated recall values.
    :param interpolated_precision: optional interpolated precision
        values; the overlay is drawn only when this is non-empty.
    :param use_legend: whether to draw a legend.
    """
    # Avoid mutable default arguments: default to None and
    # substitute fresh empty lists per call.
    if interpolated_recall is None:
        interpolated_recall = []
    if interpolated_precision is None:
        interpolated_precision = []
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.plot(recall, precision, label="Precision")
    if len(interpolated_precision) > 0:
        plt.plot(interpolated_recall, interpolated_precision,
                 linestyle='dotted', label="Interpolated", color='red')
    plt.tick_params(axis='y')
    if use_legend:
        plt.legend(loc=1)
    plt.tight_layout()
    plt.savefig(file_name)
    plt.close()
def line_plot(file_name: str,
              x: list,
              y: list,
              labels: dict) -> None:
    """
    A function to produce a line plot.

    :param file_name: path the figure is saved to (the image format
        is inferred from the extension by matplotlib).
    :param x: the x-axis values.
    :param y: the y-axis values, same length as x.
    :param labels: a dict with "title", "xlabel" and "ylabel"
        entries used to annotate the plot.
    """
    plt.title(labels["title"])
    plt.xlabel(labels["xlabel"])
    plt.ylabel(labels["ylabel"])
    plt.plot(x, y)
    plt.tick_params(axis='y')
    plt.tight_layout()
    # Write the figure to disk and release the figure resources.
    plt.savefig(file_name)
    plt.close()
import analysis_functions
import analysis_plots
import data_model
"""
A program to demonstrate average precision at a specific rank
and for all ranks.
"""
# Load the CSV data.
query_results = data_model.load_query_results()
# Find the precision.
precision_vs_rank = analysis_functions.precision(query_results[1])
rank = list(range(1, len(precision_vs_rank)+1))
# Find the average precision.
average_precision_vs_rank = analysis_functions.average_precision(query_results[1],
precision_vs_rank)
# Plot the average precision.
analysis_plots.line_plot("average_precision.png",
rank,
average_precision_vs_rank,
labels={
"title": "Average precision vs rank for query 1",
"xlabel": "Rank",
"ylabel": "Average precision"
})
# The average precision at the highest rank.
print(f"Average precision, after all ranks = {average_precision_vs_rank[-1]}")
# Calculating the average precision using recall.
recall_vs_rank = analysis_functions.recall(query_results[1])
avp = 0
n = len(recall_vs_rank)
previous_recall = 0
for i in range(n):
avp += (recall_vs_rank[i] - previous_recall) * precision_vs_rank[i]
previous_recall = recall_vs_rank[i]
print(f"Average precision, for all ranks = {avp}")
import csv
def load_query_results(file_name: str = "query_results.csv") -> dict:
    """
    Load the query and relevant document rank position data.

    :param file_name: path of the CSV file to read; must contain
        "QueryId" and "Rank" columns.  Defaults to
        "query_results.csv" for backward compatibility.
    :return: a dict mapping each query id (int) to the list of
        rank positions (int) of its relevant documents, in file
        order.
    """
    query_results = {}
    # The with-statement guarantees the file is closed even if a
    # row fails to parse; newline="" is the csv-module convention.
    with open(file_name, "r", newline="") as input_file:
        for row in csv.DictReader(input_file):
            query_id = int(row["QueryId"])
            # setdefault creates the list on first sight of a query.
            query_results.setdefault(query_id, []).append(int(row["Rank"]))
    return query_results
import analysis_functions
import analysis_plots
import data_model
"""
A program to demonstrate plotting
precision vs recall.
"""
# Load the CSV data.
query_results = data_model.load_query_results()
# Find the recall and precision.
recall_vs_rank = analysis_functions.recall(query_results[1])
precision_vs_rank = analysis_functions.precision(query_results[1])
# Interpolate the precision, to the right
# and back to zero.
interpolated_values = analysis_functions.interpolate_precision(recall_vs_rank,
precision_vs_rank)
# Plot the values and the interpolated values.
analysis_plots.precision_vs_recall("precision_vs_recall.png",
recall_vs_rank,
precision_vs_rank,
interpolated_values[0],
interpolated_values[1])
import analysis_functions
import analysis_plots
import data_model
import math
"""
A program to demonstrate forming a mean
precision vs recall distribution.
"""
# Load the CSV data.
query_results = data_model.load_query_results()
# Find all interpolated values and unique recall points.
all_values = []
recall_points = []
for query_id in query_results.keys():
# Find the recall and precision.
recall_vs_rank = analysis_functions.recall(query_results[query_id])
precision_vs_rank = analysis_functions.precision(query_results[query_id])
# Interpolate the precision, to the right
# and back to zero.
interpolated_values = analysis_functions.interpolate_precision(recall_vs_rank,
precision_vs_rank)
# Keep the interpolated values.
all_values.append(interpolated_values)
# Append the recall points, if they are not already
# in the list.
interpolated_recalls = interpolated_values[0]
for recall_value in interpolated_recalls:
matched = False
for recall_point in recall_points:
if math.isclose(recall_value, recall_point):
matched = True
break
if not matched:
recall_points.append(recall_value)
recall_points.sort()
# Find the precision average for each available recall point,
# interpolating to the right if needed.
n = len(recall_points)
precision_average = [0.] * n
n_counts = [0] * n
for i in range(n):
# Consider each known recall point.
recall_point = recall_points[i]
# Consider each set of interpolated values.
for interpolated_values in all_values:
interpolated_recalls = interpolated_values[0]
interpolated_precisions = interpolated_values[1]
n_values = len(interpolated_recalls)
# Find the matching precision value for this recall
# point.
for j in range(n_values):
interpolated_recall = interpolated_recalls[j]
# Get point to the right if needed.
if interpolated_recall >= recall_point:
precision_average[i] += interpolated_precisions[j]
n_counts[i] += 1
break
# Divide by the number of counts for the recall point.
for i in range(n):
precision_average[i] /= n_counts[i]
# Plot the average values.
analysis_plots.line_plot("precision_vs_recall_average.png",
recall_points,
precision_average,
labels={
"title": f"Average from {len(query_results.keys())} queries",
"xlabel": "Recall",
"ylabel": "Precision"
})
QueryId,Rank
1,1
1,2
1,5
1,7
2,1
2,4
2,5
3,1
3,5
3,8
4,2
4,3
4,5
4,7
\ No newline at end of file
import analysis_functions
import analysis_plots
import data_model
"""
A program to demonstrate plotting
precision and recall vs rank.
"""
# Load the CSV data.
query_results = data_model.load_query_results()
# Find the recall and precision.
recall_vs_rank = analysis_functions.recall(query_results[1])
precision_vs_rank = analysis_functions.precision(query_results[1])
rank = list(range(1, len(precision_vs_rank)+1))
# Plot the recall and precision.
analysis_plots.recall_precision("recall_precision.png",
rank,
recall_vs_rank,
precision_vs_rank)