Group ideographic characters in tag cloud (#613)

* Fix #588: ideographic characters should be grouped together.
This follows the suggestion in this SO answer to use a regex to detect
the ideographic range: https://stackoverflow.com/a/2718203/554903

We group tags that start with an ideographic character together, while
other Chinese, Japanese and Korean characters (e.g. kana and hangul)
keep their own per-character groups.

* cleanup

---------

Co-authored-by: Sascha Ißbrücker <sascha.issbruecker@gmail.com>
This commit is contained in:
Jonathan Sundqvist 2024-03-16 07:09:37 +01:00 committed by GitHub
parent 38204c87cf
commit 683cf529d7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 74 additions and 10 deletions

View file

@ -39,7 +39,7 @@ class TagCloudTemplateTest(TestCase, BookmarkFactoryMixin, HtmlTestMixin):
group_element = group_elements[group_index]
link_elements = group_element.select("a")
self.assertEqual(len(link_elements), len(tags))
self.assertEqual(len(link_elements), len(tags), tags)
for tag_index, tag in enumerate(tags, start=0):
link_element = link_elements[tag_index]
@ -50,6 +50,59 @@ class TagCloudTemplateTest(TestCase, BookmarkFactoryMixin, HtmlTestMixin):
link_elements = soup.select("p.selected-tags a")
self.assertEqual(len(link_elements), count)
def test_cjk_using_single_group(self):
    """
    Tags starting with an ideographic character share one group, whereas
    other Japanese and Korean tags are grouped by their first character.
    """
    # NOTE(review): the tenth name is an empty string as shown here; it may
    # have lost characters in rendering - confirm against the repository.
    tag_names = (
        "Aardvark",
        "Armadillo",
        "あひる",
        "あきらか",
        "アヒル",
        "アキラカ",
        "ひる",
        "アヒル",
        "오리",
        "",
        "家鴨",
        "感じ",
    )
    # The ideographic group is expected after all per-character groups
    expected_groups = [
        ["Aardvark", "Armadillo"],
        ["あきらか", "あひる"],
        ["ひる"],
        ["アキラカ", "アヒル"],
        [""],
        ["오리"],
        ["家鴨", "感じ"],
    ]

    bookmark_tags = [self.setup_tag(name=tag_name) for tag_name in tag_names]
    self.setup_bookmark(tags=bookmark_tags)

    html = self.render_template()

    self.assertTagGroups(html, expected_groups)
def test_group_alphabetically(self):
tags = [
self.setup_tag(name="Cockatoo"),

View file

@ -1,5 +1,6 @@
import urllib.parse
from typing import Set, List
import re
from django.core.handlers.wsgi import WSGIRequest
from django.core.paginator import Paginator
@ -11,13 +12,13 @@ from bookmarks import utils
from bookmarks.models import (
Bookmark,
BookmarkSearch,
BookmarkSearchForm,
User,
UserProfile,
Tag,
)
# Page size constant; the code that consumes it is not visible in this excerpt
DEFAULT_PAGE_SIZE = 30
# Matches a run of one or more characters in the range U+4E00-U+9FFF (the
# CJK Unified Ideographs block); used by TagGroup.create_tag_groups to
# detect tag names that start with an ideographic character
CJK_RE = re.compile(r"[\u4e00-\u9fff]+")
class BookmarkItem:
@ -123,13 +124,13 @@ class BookmarkListContext:
)
def get_base_url(self):
    """
    Abstract hook that concrete subclasses must override.

    Raises NotImplementedError (a subclass of Exception, so existing
    ``except Exception`` handlers keep working) in this base implementation.
    """
    raise NotImplementedError("Must be implemented by subclass")
def get_base_action_url(self):
    """
    Abstract hook that concrete subclasses must override.

    Raises NotImplementedError (a subclass of Exception, so existing
    ``except Exception`` handlers keep working) in this base implementation.
    """
    raise NotImplementedError("Must be implemented by subclass")
def get_bookmark_query_set(self):
    """
    Abstract hook that concrete subclasses must override.

    Raises NotImplementedError (a subclass of Exception, so existing
    ``except Exception`` handlers keep working) in this base implementation.
    """
    raise NotImplementedError("Must be implemented by subclass")
class ActiveBookmarkListContext(BookmarkListContext):
@ -178,23 +179,33 @@ class TagGroup:
self.tags = []
self.char = char
def __repr__(self):
    """Return a short debug representation of the form ``<char TagGroup>``."""
    return f"<{self.char} TagGroup>"
@staticmethod
def create_tag_groups(tags: Set[Tag]) -> List["TagGroup"]:
    """
    Partition tags into groups keyed by the lowercased first character of
    their name.

    Tags are sorted case-insensitively by name first, so the groups, as well
    as the tags within each group, come out in alphabetical order. Tags whose
    first character matches CJK_RE are all collected into one "Ideographic"
    group, which is appended after every other group and only if it is
    non-empty.

    tags: the tags to group; each must expose a ``name`` string.
    Returns the list of TagGroup objects described above.
    """
    sorted_tags = sorted(tags, key=lambda tag: tag.name.lower())
    groups = []
    current_group = None
    cjk_group = TagGroup("Ideographic")

    for tag in sorted_tags:
        # Slice instead of indexing so that an empty name yields "" rather
        # than raising IndexError
        tag_char = tag.name[:1].lower()

        if CJK_RE.match(tag_char):
            cjk_group.tags.append(tag)
            continue

        # Input is sorted, so a change of first character starts a new group
        if current_group is None or current_group.char != tag_char:
            current_group = TagGroup(tag_char)
            groups.append(current_group)
        current_group.tags.append(tag)

    # The ideographic group always goes last, and only when it has tags
    if cjk_group.tags:
        groups.append(cjk_group)

    return groups
@ -224,7 +235,7 @@ class TagCloudContext:
self.has_selected_tags = has_selected_tags
def get_tag_query_set(self):
    """
    Abstract hook that concrete subclasses must override.

    Raises NotImplementedError (a subclass of Exception, so existing
    ``except Exception`` handlers keep working) in this base implementation.
    """
    raise NotImplementedError("Must be implemented by subclass")
def get_selected_tags(self, tags: List[Tag]):
parsed_query = queries.parse_query_string(self.search.q)