v1.1.0

2026-05-10 16:37:46 +02:00
parent 6207cab27a
commit af21672b1c
3 changed files with 166 additions and 72 deletions
--- a/baloo_tools/baloo_tools.py
+++ b/baloo_tools/baloo_tools.py
@@ -10,6 +10,7 @@ import lmdb
 import os
 import re
 import sys
+import unicodedata
 from typing import Tuple

 PROPERTIES_ID_MAP = {
@@ -100,6 +101,18 @@ PROPERTIES_ID_MAP = {
 }


+def normalize_text(text):
+    """
+    Remove accents/diacritics for string comparison.
+
+    The text is decomposed to Unicode NFD form, every character whose
+    category is 'Mn' (nonspacing mark) is dropped, and leading/trailing
+    whitespace is stripped. Case is left unchanged, and the result stays
+    in decomposed form (it is not recomposed afterwards).
+
+    Returns an empty string for any falsy input (None, "").
+    """
+    if not text:
+        return ""
+    # NFD splits an accented character into its base character followed by
+    # separate combining marks, so the marks can be filtered out below.
+    text = unicodedata.normalize('NFD', text)
+    # Keep everything except nonspacing combining marks (category 'Mn').
+    text = "".join(c for c in text if unicodedata.category(c) != 'Mn')
+    # Lowercasing variant kept for reference; it is currently disabled, so
+    # the original case is preserved:
+    # return text.lower().strip()
+    return text.strip()
+
+
 class BalooTools:
    """Class to interact directly with the Baloo LMDB index."""

@@ -214,11 +227,46 @@ class BalooTools:
                            for p in parts:
                                p = p.strip()
                                if p:
-                                    tag = p.removeprefix('TAG-').removeprefix('TA')
-                                    tags.append(tag)
+                                    """ 'TA' elements are tags normalized to lowercase
+                                    and stripped of accents/diacritics, while 'TAG'
+                                    elements are the original tags as they were added by
+                                    the user. We need to process both to ensure we can
+                                    match tags in a case-insensitive and
+                                    accent-insensitive way. But we only want to add the
+                                    original tags to the final result, not the
+                                    normalized ones, because the normalized ones do
+                                    not correctly handle tags with spaces or words with
+                                    fewer than three characters.
+                                    """
+                                    if p.startswith('TAG-'):
+                                        tag = p.removeprefix('TAG-')
+                                        tags.append(tag)

-                            return {'tags': tags}
-                            # return {'tags': ",".join(tags)}
+                            result_set = set(tags)
+
+                            """ Must add individual parts of the tags to the result set
+                            to be able to match them with queries like 'tags:callas'
+                            or 'tags:maria' for tags "María Callas" or "Person/María
+                            Callas". To maintain Baloo tag behaviour with spaces, it's
+                            not possible to search for tags="María Callas" and must
+                            search for tags=María tags:Callas, items with spaces are
+                            not added to avoid confusion."""
+                            for item in tags:
+                                parts = re.split(r'[ /\n\t]+', item)
+
+                                for part in parts:
+                                    if part:
+                                        result_set.add(part)
+                                        normalize_part = normalize_text(part)
+                                        if normalize_part:
+                                            result_set.add(normalize_part)
+
+                            tags = sorted(list(result_set))
+
+                            if not tags:
+                                return {}
+                            else:
+                                return {'tags': tags}

        except lmdb.Error as e:
            print(f"Warning: Failed to access Baloo LMDB index: {e}", file=sys.stderr)