12 changes: 4 additions & 8 deletions docs/guide/GettingStarted.ipynb
@@ -297,24 +297,20 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"als_mc = mc.empty_copy()\n",
"als_mc.measure_collection(als_recs, all_test)\n",
"als_metrics = als_mc.list_metrics()"
"als_summary, als_metrics = mc.measure_run(als_recs, all_test)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iknn_mc = mc.empty_copy()\n",
"iknn_mc.measure_collection(iknn_recs, all_test)\n",
"iknn_metrics = iknn_mc.list_metrics()"
"iknn_summary, iknn_metrics = mc.measure_run(iknn_recs, all_test)"
]
},
{
2 changes: 1 addition & 1 deletion docs/guide/batch.rst
@@ -54,7 +54,7 @@ And measure their results:

>>> collect = MeasurementCollector()
>>> collect.add_metric(RBP())
>>> collect.measure_collection(recs, split.test)
>>> collect.add_collection_measurements(recs, split.test)
>>> collect.summary_metrics() # doctest: +ELLIPSIS
{... 'RBP.mean': 0.06..., ...}

67 changes: 39 additions & 28 deletions docs/guide/evaluation/collection.rst
@@ -35,14 +35,49 @@ Basic Principles
~~~~~~~~~~~~~~~~

A single measurement collector collects metrics for recommendation lists in a
**single run**: evaluating one pipeline on one test set. The basic use pattern
is as follows:
**single run**: evaluating one pipeline on one test set. The simplest way to
use a collector is as follows:

1. Create a :class:`MeasurementCollector`.
2. Add metrics to the collector with :meth:`~MeasurementCollector.add_metric`.
3. Measure a run with :meth:`~MeasurementCollector.measure_run` to get both
summary metrics and per-list metrics.

Example
~~~~~~~

If you have a dictionary of recommendation results in ``run_recs``, you can
measure them with:

.. code:: python

base_mc = MeasurementCollector()
base_mc.add_metric(NDCG(n=10))
base_mc.add_metric(RBP(n=10))
base_mc.add_metric(RecipRank(n=10))

run_list_metrics = {}
run_summaries = {}
for name, recs in run_recs.items():
result = base_mc.measure_run(recs, test)
run_summaries[name] = result.summary_metrics
run_list_metrics[name] = result.list_metrics

list_metrics = pd.concat(run_list_metrics, names=['recommender'])
metrics = pd.DataFrame.from_dict(run_summaries, orient="index")
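
The resulting ``list_metrics`` frame gains a ``recommender`` index level over
each run's per-list keys, and ``metrics`` has one row of summary metrics per
recommender.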


Advanced Usage
~~~~~~~~~~~~~~

The measurement collector is **stateful**, and can be used incrementally as follows (a sketch follows below):

1. Create a :class:`MeasurementCollector`.
2. Add metrics to the collector with :meth:`~MeasurementCollector.add_metric`.
3. Measure individual lists and their corresponding truth with
:meth:`~MeasurementCollector.measure_list` or an entire collection of
recommendations with :meth:`~MeasurementCollector.measure_collection`.
:meth:`~MeasurementCollector.add_list_measurement` or an entire collection
of recommendations with
:meth:`~MeasurementCollector.add_collection_measurements`.
4. Obtain individual list metrics with
:meth:`~MeasurementCollector.list_metrics` (returning a data frame with one
row per list), or aggregate metrics and summary statistics with
@@ -71,27 +106,3 @@ on the results.
Further analysis can be done by collecting metric results (either summary or
per-list) into larger data frames and analyzing with your preferred
analytics library.
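
A minimal sketch of this stateful pattern (assuming ``recs`` and ``test`` are
``ItemListCollection`` objects; the variable names are chosen for
illustration):

.. code:: python

    mc = MeasurementCollector()
    mc.add_metric(NDCG(n=10))
    mc.add_metric(RecipRank(n=10))

    # accumulate measurements into the collector's internal state
    mc.add_collection_measurements(recs, test)

    per_list = mc.list_metrics()    # one row per recommendation list
    summary = mc.summary_metrics()  # e.g. {'NDCG.mean': ..., 'RecipRank.mean': ...}

    # clear the accumulated measurements before measuring another run
    mc.reset()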

Example
~~~~~~~

If you have a dictionary of recommendation results in ``run_recs``, you can
measure them with:

.. code:: python

base_mc = MeasurementCollector()
base_mc.add_metric(NDCG(n=10))
base_mc.add_metric(RBP(n=10))
base_mc.add_metric(RecipRank(n=10))

run_list_metrics = {}
run_summaries = {}
for name, recs in run_recs.items():
mc = base_mc.empty_copy()
mc.measure_collection(recs, test)
run_list_metrics[name] = mc.list_metrics()
run_summaries[name] = mc.summary_metrics()

list_metrics = pd.concat(run_list_metrics, name=['recommender'])
metrics = pd.DataFrame.from_dict(run_summaries, orient="index")
3 changes: 2 additions & 1 deletion src/lenskit/metrics/__init__.py
@@ -14,7 +14,7 @@
from lenskit.data import ItemList

from ._base import ListMetric, Metric, MetricFunction, MetricResult, MetricVal
from ._collect import MeasurementCollector
from ._collect import MeasurementCollector, RunMetrics
from ._quick import quick_measure_model
from .basic import ListLength, TestItemCount
from .bulk import RunAnalysis, RunAnalysisResult
@@ -46,6 +46,7 @@
"MetricResult",
"MetricVal",
"MeasurementCollector",
"RunMetrics",
"ListMetric",
"RankingMetricBase",
"RunAnalysis",
6 changes: 6 additions & 0 deletions src/lenskit/metrics/_base.py
@@ -17,7 +17,13 @@
)

type MetricVal = float | int | object
"""
A single metric value.
"""
type MetricResult = MetricVal | Mapping[str, MetricVal]
"""
Results of a metric, either a single value or a dictionary of values.
"""


class MetricFunction(Protocol):
38 changes: 34 additions & 4 deletions src/lenskit/metrics/_collect.py
@@ -8,7 +8,7 @@

from collections.abc import Mapping
from dataclasses import dataclass, replace
from typing import Any
from typing import Any, NamedTuple

import pandas as pd

@@ -27,6 +27,21 @@
_log = get_logger(__name__)


class RunMetrics(NamedTuple):
"""
Results of measuring a single run in a metric collector.
"""

summary_metrics: Mapping[str, MetricVal]
"""
Overall summary metrics for the run.
"""
list_metrics: pd.DataFrame
"""
Metrics for each individual list in the run.
"""


@dataclass
class MetricState:
"""
@@ -114,7 +129,7 @@ def add_metric(
wrapper = _wrap_metric(metric, label)
self._metrics.append(wrapper)

def measure_list(self, output: ItemList, test: ItemList, **keys: Any):
def add_list_measurement(self, output: ItemList, test: ItemList, **keys: Any):
"""
Measure a single list and accumulate the intermediate results.

@@ -138,7 +153,7 @@ def measure_list(self, output: ItemList, test: ItemList, **keys: Any):

self._list_records.append(rec)

def measure_collection(
def add_collection_measurements(
self, outputs: ItemListCollection, test: ItemListCollection, **keys: Any
):
"""
@@ -164,12 +179,27 @@
no_test_count += 1
list_test = ItemList([])

self.measure_list(out, list_test, **key_kwargs)
self.add_list_measurement(out, list_test, **key_kwargs)
pb.update()

if no_test_count:
_log.warning("could not find test data for %d lists", no_test_count)

def measure_run(self, outputs: ItemListCollection, test: ItemListCollection) -> RunMetrics:
"""
Convenience method to measure a set of recommendations and return the
results.

The provided lists are measured with an empty copy of this collector,
so their measurements are **not** added to this collector's state.

This method is intended to free users from the need to manage
collector state.
"""
copy = self.empty_copy()
copy.add_collection_measurements(outputs, test)
return RunMetrics(copy.summary_metrics(), copy.list_metrics())

def list_metrics(self) -> pd.DataFrame:
"""
Get the per-list metric results as a DataFrame.
2 changes: 1 addition & 1 deletion src/lenskit/metrics/bulk.py
@@ -201,7 +201,7 @@ def measure(
copy = self.collector.empty_copy()
copy._validate_setup()

copy.measure_collection(outputs, test)
copy.add_collection_measurements(outputs, test)

res = RunAnalysisResult(
copy.list_metrics(), pd.Series(copy.summary_metrics()), self._defaults
62 changes: 46 additions & 16 deletions tests/eval/test_measurement_collector.py
@@ -47,7 +47,7 @@ def test_accumulator_empty_and_unmeasured_defaults():
assert not acc.summary_metrics()

# measuring with no metrics
acc.measure_list(ItemList([1]), ItemList([1]), user="u1")
acc.add_list_measurement(ItemList([1]), ItemList([1]), user="u1")
assert acc.list_metrics().empty
assert not acc.summary_metrics()

@@ -59,8 +59,8 @@ def test_accumulator_measures_list_and_summary(sample_lists):
acc = MeasurementCollector()
acc.add_metric(ListLength())

acc.measure_list(sample_lists["recs1"], sample_lists["test1"], user="u1")
acc.measure_list(sample_lists["recs2"], sample_lists["test2"], user="u2")
acc.add_list_measurement(sample_lists["recs1"], sample_lists["test1"], user="u1")
acc.add_list_measurement(sample_lists["recs2"], sample_lists["test2"], user="u2")

list_metrics = acc.list_metrics()
summary = acc.summary_metrics()
@@ -74,9 +74,9 @@ def test_accumulator_empty_itemlists():
acc = MeasurementCollector()
acc.add_metric(ListLength())

acc.measure_list(ItemList([]), ItemList([1, 2]), user="u1")
acc.measure_list(ItemList([1, 2]), ItemList([]), user="u2")
acc.measure_list(ItemList([]), ItemList([]), user="u3")
acc.add_list_measurement(ItemList([]), ItemList([1, 2]), user="u1")
acc.add_list_measurement(ItemList([1, 2]), ItemList([]), user="u2")
acc.add_list_measurement(ItemList([]), ItemList([]), user="u3")

metrics = acc.list_metrics()
assert len(metrics) == 3
@@ -86,7 +86,7 @@ def test_list_metrics_no_key_fields():
def test_list_metrics_no_key_fields():
acc = MeasurementCollector()
acc.add_metric(ListLength())
acc.measure_list(ItemList([1, 2]), ItemList([1]))
acc.add_list_measurement(ItemList([1, 2]), ItemList([1]))
metrics = acc.list_metrics()
assert len(metrics) == 1
assert metrics.index.names == [None]
@@ -123,7 +123,7 @@ def test_accumulator_key_fields(keys, expected_names):
def test_accumulator_key_fields(keys, expected_names):
acc = MeasurementCollector()
acc.add_metric(ListLength())
acc.measure_list(ItemList([1, 2]), ItemList([2]), **keys)
acc.add_list_measurement(ItemList([1, 2]), ItemList([2]), **keys)
assert acc.key_fields == expected_names
metrics = acc.list_metrics()
assert set(metrics.index.names) == set(expected_names)
@@ -181,7 +181,7 @@ def test_full_workflow_integration_improved(ml_ds):
for user, truth_il in test_users:
scores = scorer(ItemList(all_items, user=[user.user_id] * len(all_items)))
recs_il = ItemList(scores.top_n(10), user=[user.user_id] * 10, ordered=True)
acc.measure_list(recs_il, truth_il, user=user.user_id)
acc.add_list_measurement(recs_il, truth_il, user=user.user_id)

list_metrics = acc.list_metrics()
summary = acc.summary_metrics()
@@ -235,9 +235,9 @@ def measure_list(self, recs, test):
acc = MeasurementCollector()
acc.add_metric(TestMetric())

acc.measure_list(ItemList([3]), ItemList(), x=1)
acc.measure_list(ItemList([]), ItemList(), x=2)
acc.measure_list(ItemList([5, 20, 3]), ItemList(), x=3)
acc.add_list_measurement(ItemList([3]), ItemList(), x=1)
acc.add_list_measurement(ItemList([]), ItemList(), x=2)
acc.add_list_measurement(ItemList([5, 20, 3]), ItemList(), x=3)

lms = acc.list_metrics()
assert lms.loc[1, "test"] == 1
@@ -253,8 +253,8 @@ def test_reset():
acc.add_metric(ListLength())
acc.add_metric(RecipRank())

acc.measure_list(ItemList([1, 2, 3, 4, 5]), ItemList([4]))
acc.measure_list(ItemList([5, 4, 3, 2, 1]), ItemList([1]))
acc.add_list_measurement(ItemList([1, 2, 3, 4, 5]), ItemList([4]))
acc.add_list_measurement(ItemList([5, 4, 3, 2, 1]), ItemList([1]))

lms = acc.list_metrics()
assert len(lms) == 2
@@ -264,12 +264,42 @@ def test_reset():
assert sms["RecipRank.mean"] == approx(0.225)

acc.reset()
acc.measure_list(ItemList([1, 2, 3, 4, 5, 10]), ItemList([2]))
acc.measure_list(ItemList([5, 4, 3, 2, 1, 10]), ItemList([2]))
acc.add_list_measurement(ItemList([1, 2, 3, 4, 5, 10]), ItemList([2]))
acc.add_list_measurement(ItemList([5, 4, 3, 2, 1, 10]), ItemList([2]))

lms = acc.list_metrics()
assert len(lms) == 2
assert np.all(lms["N"] == 6)
sms = acc.summary_metrics()
assert sms["N.mean"] == approx(6.0)
assert sms["RecipRank.mean"] == approx(0.375)


def test_measure_run():
acc = MeasurementCollector()
acc.add_metric(ListLength())
acc.add_metric(RecipRank())

sms, lms = acc.measure_run(
ItemListCollection.from_dict(
{
1: ItemList([1, 2, 3, 4, 5]),
2: ItemList([5, 4, 3, 2, 1]),
},
key="user_id",
),
ItemListCollection.from_dict(
{
1: ItemList([4]),
2: ItemList([1]),
},
key="user_id",
),
)

assert len(lms) == 2
assert np.all(lms["N"] == 5)
assert sms["N.mean"] == approx(5.0)
assert sms["RecipRank.mean"] == approx(0.225)

assert len(acc.list_metrics()) == 0