# precision_vs_recall_average.py

import analysis_functions
import analysis_plots
import data_model
import math

"""
A program to demonstrate forming a mean
precision vs recall distribution.
"""

# Read the per-query results from the CSV data.
query_results = data_model.load_query_results()

# Gather the interpolated curve of every query, together with the
# distinct recall values seen across all of the curves.
all_values = []
recall_points = []
for query_id in query_results.keys():
    results = query_results[query_id]

    # Recall and precision as a function of rank for this query.
    recall_vs_rank = analysis_functions.recall(results)
    precision_vs_rank = analysis_functions.precision(results)

    # Interpolate the precision (to the right and back to zero) and
    # keep the result; element 0 holds the recalls, element 1 the
    # matching precisions.
    interpolated_values = analysis_functions.interpolate_precision(
        recall_vs_rank, precision_vs_rank)
    all_values.append(interpolated_values)

    # Record every recall value that is not yet known. Values are
    # compared with a tolerance so that floating point noise does not
    # create near-duplicate recall points.
    for recall_value in interpolated_values[0]:
        already_known = any(math.isclose(recall_value, known)
                            for known in recall_points)
        if not already_known:
            recall_points.append(recall_value)
recall_points.sort()

# Find the precision average for each available recall point,
# interpolating to the right if needed.
#
# Inputs:  recall_points (sorted, unique recall values) and all_values
#          (one entry per query; element 0 is the list of recalls and
#          element 1 the list of matching precisions).
# Outputs: precision_average[i] is the mean precision at recall_points[i]
#          and n_counts[i] is the number of queries that contributed.
n = len(recall_points)
precision_average = [0.] * n
n_counts = [0] * n
for i, recall_point in enumerate(recall_points):

    # Consider each set of interpolated values.
    for interpolated_values in all_values:
        interpolated_recalls = interpolated_values[0]
        interpolated_precisions = interpolated_values[1]

        # Use the first point at, or to the right of, this recall point.
        # The recall points were de-duplicated with math.isclose, so the
        # same tolerance is applied here. With an exact ">=" a recall
        # that is equal within tolerance but marginally smaller than the
        # stored point would be skipped, and the query would contribute
        # the wrong precision (or nothing at all) at this point.
        for j, interpolated_recall in enumerate(interpolated_recalls):
            if (interpolated_recall >= recall_point
                    or math.isclose(interpolated_recall, recall_point)):
                precision_average[i] += interpolated_precisions[j]
                n_counts[i] += 1
                break

# Divide by the number of counts for the recall point. Queries that never
# reach a recall point are not counted for it. Each recall point is taken
# from at least one query's interpolated recalls, so the count is expected
# to be non-zero.
for i in range(n):
    precision_average[i] /= n_counts[i]

# Draw the averaged precision against the recall points and write the
# figure to a PNG file; the title reports how many queries were averaged.
plot_labels = {
    "title": f"Average from {len(query_results.keys())} queries",
    "xlabel": "Recall",
    "ylabel": "Precision"
}
analysis_plots.line_plot("precision_vs_recall_average.png",
                         recall_points,
                         precision_average,
                         labels=plot_labels)