Skip to content

Commit 5391abc

Browse files
authored
feat: tagging APIs can properly report implicit tag usage counts (#506)
See openedx/modular-learning#253
1 parent 52bcdcf commit 5391abc

7 files changed

Lines changed: 552 additions & 134 deletions

File tree

src/openedx_core/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@
66
"""
77

88
# The version for the entire repository
9-
__version__ = "0.39.1"
9+
__version__ = "0.39.2"

src/openedx_tagging/api.py

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
"""
1313
from __future__ import annotations
1414

15-
from typing import Any
15+
from collections import defaultdict
16+
from typing import Any, Counter
1617

1718
from django.db import models, transaction
1819
from django.db.models import F, QuerySet, Value
@@ -116,7 +117,6 @@ def search_tags(
116117
taxonomy: Taxonomy,
117118
search_term: str,
118119
exclude_object_id: str | None = None,
119-
include_counts: bool = False,
120120
) -> TagDataQuerySet:
121121
"""
122122
Returns a list of all tags that contains `search_term` of the given
@@ -138,7 +138,6 @@ def search_tags(
138138
qs = taxonomy.cast().get_filtered_tags(
139139
search_term=search_term,
140140
excluded_values=excluded_values,
141-
include_counts=include_counts,
142141
)
143142
return qs
144143

@@ -525,3 +524,49 @@ def unmark_copied_tags(object_id: str) -> None:
525524
Update copied object tags on the given object to mark them as "not copied".
526525
"""
527526
ObjectTag.objects.filter(object_id=object_id).update(is_copied=False)
527+
528+
529+
def add_usage_counts(taxonomy: Taxonomy, tag_data: TagDataQuerySet) -> TagDataQuerySet:
530+
"""
531+
Add usage counts to the query result.
532+
533+
Not a simple raw count of each tags usage. A tag can be directly
534+
applied to an object, which can be a course, library, module,
535+
or something else.
536+
537+
A tag can also be indirectly applied when some of its children
538+
are applied to an object, it is considered automatically applied.
539+
So, if the tags "Chemistry" and "Physics" are applied once
540+
each to different objects, their parent tag "Natural Science" is
541+
considered indirectly applied to 2 objects.
542+
543+
Deduplication: A tag can only be applied to a single object once.
544+
So if two child tags are applied to the same object, e.g.
545+
"Chemistry" and "Physics" are applied to the same course, the
546+
parent tag, "Natural Science" is only applied to it once,
547+
because no tag can be applied to the same object twice.
548+
549+
For performance reasons, we call this function with the list result of the
550+
QuerySet so we can then add the counts in-memory rather than annotate to a
551+
QuerySet which would require a very expensive annotation to join the
552+
in-memory data to the original QuerySet.
553+
"""
554+
555+
object_tags = taxonomy.objecttag_set.values_list("object_id", "tag__lineage")
556+
tag_counts: Counter[str] = Counter()
557+
object_tag_lineage_seen: defaultdict[str, set] = defaultdict(set)
558+
559+
for object_id, tag_lineage in object_tags:
560+
# split the lineages to get a dict of {tag.value: [lineages]}
561+
lineage_tags = list(tag_lineage.split('\t')) if tag_lineage else []
562+
# de-duplicate based on if the lineage is already 'seen' per object
563+
unseen_tags = [t for t in lineage_tags if t not in object_tag_lineage_seen[object_id]]
564+
565+
tag_counts.update(unseen_tags)
566+
object_tag_lineage_seen[object_id].update(unseen_tags)
567+
568+
# In-memory 'annotation'; this is faster than using annotate() on the QuerySet.
569+
for row in tag_data:
570+
row["usage_count"] = tag_counts.get(row["value"], 0)
571+
572+
return tag_data

src/openedx_tagging/models/base.py

Lines changed: 5 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,6 @@ def get_filtered_tags( # pylint: disable=too-many-positional-arguments
426426
depth: int | None = None,
427427
parent_tag_value: str | None = None,
428428
search_term: str | None = None,
429-
include_counts: bool = False,
430429
excluded_values: list[str] | None = None,
431430
) -> TagDataQuerySet:
432431
"""
@@ -451,7 +450,7 @@ def get_filtered_tags( # pylint: disable=too-many-positional-arguments
451450
if self.allow_free_text:
452451
if parent_tag_value is not None:
453452
raise ValueError("Cannot specify a parent tag ID for free text taxonomies")
454-
result = self._get_filtered_tags_free_text(search_term=search_term, include_counts=include_counts)
453+
result = self._get_filtered_tags_free_text(search_term=search_term)
455454
if excluded_values:
456455
return result.exclude(value__in=excluded_values)
457456
else:
@@ -460,7 +459,6 @@ def get_filtered_tags( # pylint: disable=too-many-positional-arguments
460459
result = self._get_filtered_tags_one_level(
461460
parent_tag_value=parent_tag_value,
462461
search_term=search_term,
463-
include_counts=include_counts,
464462
)
465463
if excluded_values:
466464
return result.exclude(value__in=excluded_values)
@@ -470,7 +468,6 @@ def get_filtered_tags( # pylint: disable=too-many-positional-arguments
470468
return self._get_filtered_tags_deep(
471469
parent_tag_value=parent_tag_value,
472470
search_term=search_term,
473-
include_counts=include_counts,
474471
excluded_values=excluded_values,
475472
)
476473
else:
@@ -479,7 +476,6 @@ def get_filtered_tags( # pylint: disable=too-many-positional-arguments
479476
def _get_filtered_tags_free_text(
480477
self,
481478
search_term: str | None,
482-
include_counts: bool,
483479
) -> TagDataQuerySet:
484480
"""
485481
Implementation of get_filtered_tags() for free text taxonomies.
@@ -499,16 +495,13 @@ def _get_filtered_tags_free_text(
499495
_id=Value(None, output_field=models.CharField()),
500496
)
501497
qs = qs.values("value", "child_count", "depth", "parent_value", "external_id", "_id").order_by("value")
502-
if include_counts:
503-
return qs.annotate(usage_count=models.Count("value"))
504-
else:
505-
return qs.distinct() # type: ignore[return-value]
498+
499+
return qs.distinct() # type: ignore[return-value]
506500

507501
def _get_filtered_tags_one_level(
508502
self,
509503
parent_tag_value: str | None,
510504
search_term: str | None,
511-
include_counts: bool,
512505
) -> TagDataQuerySet:
513506
"""
514507
Implementation of get_filtered_tags() for closed taxonomies, where
@@ -531,24 +524,13 @@ def _get_filtered_tags_one_level(
531524
qs = qs.annotate(_id=F("id")) # ID has an underscore to encourage use of 'value' rather than this internal ID
532525
qs = qs.values("value", "child_count", "depth", "parent_value", "external_id", "_id")
533526
qs = qs.order_by("value")
534-
if include_counts:
535-
# We need to include the count of how many times this tag is used to tag objects.
536-
# You'd think we could just use:
537-
# qs = qs.annotate(usage_count=models.Count("objecttag__pk"))
538-
# but that adds another join which starts creating a cross product and the children and usage_count become
539-
# intertwined and multiplied with each other. So we use a subquery.
540-
obj_tags = ObjectTag.objects.filter(tag_id=models.OuterRef("pk")).order_by().annotate(
541-
# We need to use Func() to get Count() without GROUP BY - see https://stackoverflow.com/a/69031027
542-
count=models.Func(F('id'), function='Count')
543-
)
544-
qs = qs.annotate(usage_count=models.Subquery(obj_tags.values('count')))
527+
545528
return qs # type: ignore[return-value]
546529

547530
def _get_filtered_tags_deep(
548531
self,
549532
parent_tag_value: str | None,
550533
search_term: str | None,
551-
include_counts: bool,
552534
excluded_values: list[str] | None,
553535
) -> TagDataQuerySet:
554536
"""
@@ -615,17 +597,7 @@ def _get_filtered_tags_deep(
615597
# lineage is a case-insensitive column storing "Root\tParent\t...\tThisValue\t", so
616598
# ordering by it gives the tree sort order that we want.
617599
qs = qs.order_by("lineage")
618-
if include_counts:
619-
# Including the counts is a bit tricky; see the comment above in _get_filtered_tags_one_level()
620-
obj_tags = (
621-
ObjectTag.objects.filter(tag_id=models.OuterRef("pk"))
622-
.order_by()
623-
.annotate(
624-
# We need to use Func() to get Count() without GROUP BY - see https://stackoverflow.com/a/69031027
625-
count=models.Func(F("id"), function="Count")
626-
)
627-
)
628-
qs = qs.annotate(usage_count=models.Subquery(obj_tags.values("count")))
600+
629601
return qs # type: ignore[return-value]
630602

631603
def add_tag(

src/openedx_tagging/rest_api/v1/views.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from ...api import (
1818
TagDoesNotExist,
1919
add_tag_to_taxonomy,
20+
add_usage_counts,
2021
create_taxonomy,
2122
delete_tags_from_taxonomy,
2223
get_object_tag_counts,
@@ -844,21 +845,33 @@ def get_queryset(self) -> TagDataQuerySet:
844845
parent_tag_value=parent_tag_value,
845846
search_term=search_term,
846847
depth=depth,
847-
include_counts=include_counts,
848848
)
849849
if depth == 1:
850850
# We're already returning just a single level. It will be paginated normally.
851+
if include_counts:
852+
results_with_counts = add_usage_counts(self.get_taxonomy(), results)
853+
return results_with_counts
854+
851855
return results
852856
elif full_depth_threshold and len(results) < full_depth_threshold:
853857
# We can load and display all the tags in this (sub)tree at once:
854858
self.pagination_class = DisabledTagsPagination
859+
if include_counts:
860+
results_with_counts = add_usage_counts(self.get_taxonomy(), results)
861+
return results_with_counts
862+
855863
return results
856864
else:
857865
# We had to do a deep query, but we will only return one level of results.
858866
# This is because the user did not request a deep response (via full_depth_threshold) or the result was too
859867
# large (larger than the threshold).
860868
# It will be paginated normally.
861-
return results.filter(parent_value=parent_tag_value)
869+
filtered_results = results.filter(parent_value=parent_tag_value)
870+
if include_counts:
871+
results_with_counts = add_usage_counts(self.get_taxonomy(), results)
872+
return results_with_counts
873+
874+
return filtered_results
862875

863876
def post(self, request, *args, **kwargs):
864877
"""

tests/openedx_tagging/test_api.py

Lines changed: 42 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -752,53 +752,53 @@ def get_object_tags():
752752

753753
@ddt.data(
754754
("ChA", [
755-
"Archaea (used: 1, children: 2)",
756-
" Euryarchaeida (used: 0, children: 0)",
757-
" Proteoarchaeota (used: 0, children: 0)",
758-
"Bacteria (used: 0, children: 1)", # does not contain "cha" but a child does
759-
" Archaebacteria (used: 1, children: 0)",
755+
"Archaea (children: 2)",
756+
" Euryarchaeida (children: 0)",
757+
" Proteoarchaeota (children: 0)",
758+
"Bacteria (children: 1)", # does not contain "cha" but a child does
759+
" Archaebacteria (children: 0)",
760760
]),
761761
("ar", [
762-
"Archaea (used: 1, children: 2)",
763-
" Euryarchaeida (used: 0, children: 0)",
764-
" Proteoarchaeota (used: 0, children: 0)",
765-
"Bacteria (used: 0, children: 1)", # does not contain "ar" but a child does
766-
" Archaebacteria (used: 1, children: 0)",
767-
"Eukaryota (used: 0, children: 1)",
768-
" Animalia (used: 1, children: 2)", # does not contain "ar" but a child does
769-
" Arthropoda (used: 1, children: 0)",
770-
" Cnidaria (used: 0, children: 0)",
762+
"Archaea (children: 2)",
763+
" Euryarchaeida (children: 0)",
764+
" Proteoarchaeota (children: 0)",
765+
"Bacteria (children: 1)", # does not contain "ar" but a child does
766+
" Archaebacteria (children: 0)",
767+
"Eukaryota (children: 1)",
768+
" Animalia (children: 2)", # does not contain "ar" but a child does
769+
" Arthropoda (children: 0)",
770+
" Cnidaria (children: 0)",
771771
]),
772772
("aE", [
773-
"Archaea (used: 1, children: 2)",
774-
" Euryarchaeida (used: 0, children: 0)",
775-
" Proteoarchaeota (used: 0, children: 0)",
776-
"Bacteria (used: 0, children: 1)", # does not contain "ae" but a child does
777-
" Archaebacteria (used: 1, children: 0)",
778-
"Eukaryota (used: 0, children: 1)", # does not contain "ae" but a child does
779-
" Plantae (used: 1, children: 0)",
773+
"Archaea (children: 2)",
774+
" Euryarchaeida (children: 0)",
775+
" Proteoarchaeota (children: 0)",
776+
"Bacteria (children: 1)", # does not contain "ae" but a child does
777+
" Archaebacteria (children: 0)",
778+
"Eukaryota (children: 1)", # does not contain "ae" but a child does
779+
" Plantae (children: 0)",
780780
]),
781781
("a", [
782-
"Archaea (used: 1, children: 3)",
783-
" DPANN (used: 0, children: 0)",
784-
" Euryarchaeida (used: 0, children: 0)",
785-
" Proteoarchaeota (used: 0, children: 0)",
786-
"Bacteria (used: 0, children: 2)",
787-
" Archaebacteria (used: 1, children: 0)",
788-
" Eubacteria (used: 0, children: 0)",
789-
"Eukaryota (used: 0, children: 4)",
790-
" Animalia (used: 1, children: 7)",
791-
" Arthropoda (used: 1, children: 0)",
792-
" Chordata (used: 0, children: 1)",
793-
" Mammalia (used: 0, children: 0)",
794-
" Cnidaria (used: 0, children: 0)",
795-
" Ctenophora (used: 0, children: 0)",
796-
" Gastrotrich (used: 1, children: 0)",
797-
" Placozoa (used: 1, children: 0)",
798-
" Porifera (used: 0, children: 0)",
799-
" Monera (used: 1, children: 0)",
800-
" Plantae (used: 1, children: 0)",
801-
" Protista (used: 0, children: 0)",
782+
"Archaea (children: 3)",
783+
" DPANN (children: 0)",
784+
" Euryarchaeida (children: 0)",
785+
" Proteoarchaeota (children: 0)",
786+
"Bacteria (children: 2)",
787+
" Archaebacteria (children: 0)",
788+
" Eubacteria (children: 0)",
789+
"Eukaryota (children: 4)",
790+
" Animalia (children: 7)",
791+
" Arthropoda (children: 0)",
792+
" Chordata (children: 1)",
793+
" Mammalia (children: 0)",
794+
" Cnidaria (children: 0)",
795+
" Ctenophora (children: 0)",
796+
" Gastrotrich (children: 0)",
797+
" Placozoa (children: 0)",
798+
" Porifera (children: 0)",
799+
" Monera (children: 0)",
800+
" Plantae (children: 0)",
801+
" Protista (children: 0)",
802802
]),
803803
)
804804
@ddt.unpack
@@ -817,7 +817,7 @@ def test_autocomplete_tags_closed(self, search: str, expected: list[str]) -> Non
817817
_value=value,
818818
).save()
819819

820-
result = tagging_api.search_tags(closed_taxonomy, search, include_counts=True)
820+
result = tagging_api.search_tags(closed_taxonomy, search)
821821
assert pretty_format_tags(result, parent=False) == expected
822822

823823
def test_autocomplete_tags_closed_omit_object(self) -> None:

0 commit comments

Comments
 (0)