v1.1.0
This commit is contained in:
@@ -10,6 +10,7 @@ import lmdb
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import unicodedata
|
||||
from typing import Tuple
|
||||
|
||||
PROPERTIES_ID_MAP = {
|
||||
@@ -100,6 +101,18 @@ PROPERTIES_ID_MAP = {
|
||||
}
|
||||
|
||||
|
||||
def normalize_text(text) -> str:
    """
    Remove accents/diacritics for string comparison.

    The text is decomposed (NFD) so that every accent becomes a separate
    combining mark, the nonspacing marks (Unicode category 'Mn') are
    dropped, and the remainder is recomposed (NFC). Case is preserved;
    leading and trailing whitespace is removed.

    Args:
        text: The string to normalize. Any falsy value (None, "") is
            accepted and yields an empty string.

    Returns:
        The accent-free, stripped string in NFC form.
    """
    if not text:
        return ""
    decomposed = unicodedata.normalize('NFD', text)
    without_marks = "".join(
        c for c in decomposed if unicodedata.category(c) != 'Mn'
    )
    # Recompose: NFD also splits characters whose pieces are not 'Mn' marks
    # (e.g. Hangul syllables into jamo). Without this step such text would
    # come back in decomposed form and fail to compare equal to its usual
    # composed spelling.
    return unicodedata.normalize('NFC', without_marks).strip()
|
||||
|
||||
|
||||
class BalooTools:
|
||||
"""Class to interact directly with the Baloo LMDB index."""
|
||||
|
||||
@@ -214,11 +227,46 @@ class BalooTools:
|
||||
for p in parts:
|
||||
p = p.strip()
|
||||
if p:
|
||||
tag = p.removeprefix('TAG-').removeprefix('TA')
|
||||
tags.append(tag)
|
||||
""" 'TA' elements are tags normalized to lowercase
|
||||
and stripped of accents/diacritics, while 'TAG'
|
||||
elements are the original tags as they were added by
|
||||
the user. We need to process both to ensure we can
|
||||
match tags in a case-insensitive and
|
||||
accent-insensitive way. But we only want to add the
|
||||
original tags to the final result, not the
|
||||
normalized ones, because the normalized ones are
|
||||
not handled correctly for tags with spaces and words with
|
||||
less than three characters.
|
||||
"""
|
||||
if p.startswith('TAG-'):
|
||||
tag = p.removeprefix('TAG-')
|
||||
tags.append(tag)
|
||||
|
||||
return {'tags': tags}
|
||||
# return {'tags': ",".join(tags)}
|
||||
result_set = set(tags)
|
||||
|
||||
""" Must add individual parts of the tags to the result set
|
||||
to be able to match them with queries like 'tags:callas'
|
||||
or 'tags:maria' for tags "María Callas" or "Person/María
|
||||
Callas". To maintain Baloo tag behaviour with spaces, it's
|
||||
not possible to search for tags="María Callas" and must
|
||||
search for tags=María tags:Callas, items with spaces are
|
||||
not added to avoid confusion."""
|
||||
for item in tags:
|
||||
parts = re.split(r'[ /\n\t]+', item)
|
||||
|
||||
for part in parts:
|
||||
if part:
|
||||
result_set.add(part)
|
||||
normalize_part = normalize_text(part)
|
||||
if normalize_part:
|
||||
result_set.add(normalize_part)
|
||||
|
||||
tags = sorted(list(result_set))
|
||||
|
||||
if not tags:
|
||||
return {}
|
||||
else:
|
||||
return {'tags': tags}
|
||||
|
||||
except lmdb.Error as e:
|
||||
print(f"Warning: Failed to access Baloo LMDB index: {e}", file=sys.stderr)
|
||||
|
||||
Reference in New Issue
Block a user