This commit is contained in:
Ignacio Serantes
2026-05-10 16:37:46 +02:00
parent 6207cab27a
commit af21672b1c
3 changed files with 166 additions and 72 deletions

View File

@@ -33,31 +33,18 @@ def expression_contains_tags(text):
class EvaluateExpression: class EvaluateExpression:
def __init__(self): def __init__(self):
# Pre-define the grammar structure during initialization
self.grammar = self._build_grammar() self.grammar = self._build_grammar()
def _compare_single(self, l_val, op, r_val): def _compare_single(self, l_val, op, r_val):
""" # 1. CASE SENSITIVE (Strict)
Atomic comparison logic for individual values. if op == "==":
Handles numeric conversion and standard operators. return str(l_val) == str(r_val)
"""
# Numeric conversion for mathematical operators # 2. NUMERIC LOGIC
if op in (">", "<", ">=", "<="): if op in (">", "<", ">=", "<="):
try: try:
# Attempt to treat both sides as floats # We use float for numeric magnitude
curr_l, curr_r = float(l_val), float(r_val) curr_l, curr_r = float(l_val), float(r_val)
except (ValueError, TypeError):
# Fallback to string comparison if conversion fails
curr_l, curr_r = str(l_val), str(r_val)
else:
# Default to string representation for other operators
curr_l, curr_r = str(l_val), str(r_val)
# Standard operator logic
if op == "=":
return l_val == r_val
if op == "!=":
return l_val != r_val
if op == ">": if op == ">":
return curr_l > curr_r return curr_l > curr_r
if op == "<": if op == "<":
@@ -66,44 +53,63 @@ class EvaluateExpression:
return curr_l >= curr_r return curr_l >= curr_r
if op == "<=": if op == "<=":
return curr_l <= curr_r return curr_l <= curr_r
except (ValueError, TypeError):
# Fallback to case-insensitive string if not numeric
pass
# 3. CASE INSENSITIVE (Default for =, !=, :)
curr_l = str(l_val).lower()
curr_r = str(r_val).lower()
if op == "=":
return curr_l == curr_r
if op == "!=":
return curr_l != curr_r
if op == ":": if op == ":":
return str(r_val).lower() in str(l_val).lower() return curr_r in curr_l
# String fallback for magnitude if numeric failed
if op == ">":
return curr_l > curr_r
if op == "<":
return curr_l < curr_r
if op == ">=":
return curr_l >= curr_r
if op == "<=":
return curr_l <= curr_r
return False return False
def _compare(self, data, left_key, op, right_val): def _compare(self, data, left_key, op, right_val):
""" # Normalizing keys for lookup, but KEEPING the values intact
Main comparison router. Checks if the field is a list or a single value.
"""
# Normalize data keys to lowercase for case-insensitive lookup
normalized_data = {k.lower(): v for k, v in data.items()} normalized_data = {k.lower(): v for k, v in data.items()}
# Extract the left-hand value (the field from the JSON) # Get left value from data or use as literal
l_val = normalized_data.get(left_key.lower(), left_key) l_val = normalized_data.get(left_key.lower(), left_key)
# Extract the right-hand value (check if it's a literal or another field) # Resolve right value: if it's a key in data, use its value.
r_val = normalized_data.get(str(right_val).lower(), right_val) # Important: use lower() only for the KEY lookup, not the value itself.
r_key_lookup = str(right_val).lower()
if r_key_lookup in normalized_data:
r_val = normalized_data[r_key_lookup]
else:
r_val = right_val
# IF THE FIELD VALUE IS A LIST
if isinstance(l_val, list): if isinstance(l_val, list):
# Return True if ANY item in the list satisfies the condition
return any(self._compare_single(item, op, r_val) for item in l_val) return any(self._compare_single(item, op, r_val) for item in l_val)
# IF THE FIELD VALUE IS A SINGLE DATA POINT
return self._compare_single(l_val, op, r_val) return self._compare_single(l_val, op, r_val)
def _build_grammar(self): def _build_grammar(self):
""" # CRITICAL: '==' must come BEFORE '=' in the list
Defines the pyparsing grammar for the expression engine. # We use a list to ensure explicit priority in the parser
""" operators = one_of(["==", ">=", "<=", "!=", "=", ">", "<", ":"])
operators = one_of(">= <= != = > < :")
identifier = Word(alphanums + "_./\\") identifier = Word(alphanums + "_./\\")
quoted_string = QuotedString("'") | QuotedString('"') quoted_string = QuotedString("'") | QuotedString('"')
operand = quoted_string | identifier operand = quoted_string | identifier
# Define basic condition (e.g., "width > 100" or "word")
condition = Group((operand + operators + operand) | operand) condition = Group((operand + operators + operand) | operand)
# Attach the parse action to convert tokens into executable functions (lambdas)
condition.set_parse_action(lambda t: self._create_evaluator_func(t[0])) condition.set_parse_action(lambda t: self._create_evaluator_func(t[0]))
return infix_notation( return infix_notation(
@@ -119,25 +125,16 @@ class EvaluateExpression:
) )
def _create_evaluator_func(self, tokens): def _create_evaluator_func(self, tokens):
"""
Creates a closure that captures tokens and waits for the data dictionary.
"""
if len(tokens) == 1: if len(tokens) == 1:
# Rule: Single term -> path CONTAINS term
return lambda data: self._compare(data, 'path', ':', tokens[0]) return lambda data: self._compare(data, 'path', ':', tokens[0])
else: else:
# Rule: Explicit triplet (key, operator, value)
return lambda data: self._compare(data, tokens[0], tokens[1], tokens[2]) return lambda data: self._compare(data, tokens[0], tokens[1], tokens[2])
def compile(self, expression): def compile(self, expression):
"""
Parses the expression once and returns a reusable function.
"""
try: try:
return self.grammar.parse_string(expression, parse_all=True)[0] return self.grammar.parse_string(expression, parse_all=True)[0]
except Exception as e: except Exception as e:
print(f"Compilation Error: {e}") print(f"Compilation Error: {e}")
# Fallback: return a function that always fails gracefully
return lambda data: False return lambda data: False
@@ -250,7 +247,7 @@ class BagheeraSearcher:
self.ids_processed.add(file_id) self.ids_processed.add(file_id)
if exclude_evaluator: if exclude_evaluator:
file_info = {'path': item["path"]} file_info = {'path': item["path"], 'filename': Path(item["path"]).name}
if exclude_sources.get('properties'): if exclude_sources.get('properties'):
file_info = file_info | get_info(file_id) file_info = file_info | get_info(file_id)
if exclude_sources.get('tags'): if exclude_sources.get('tags'):
@@ -325,7 +322,7 @@ class BagheeraSearcher:
self.ids_processed.add(file_id) self.ids_processed.add(file_id)
if exclude_evaluator: if exclude_evaluator:
file_info = {'path': item["path"]} file_info = {'path': item["path"], 'filename': Path(item["path"]).name}
if exclude_sources.get('properties'): if exclude_sources.get('properties'):
file_info = file_info | get_info(file_id) file_info = file_info | get_info(file_id)
if exclude_sources.get('tags'): if exclude_sources.get('tags'):

View File

@@ -14,7 +14,7 @@ __status__ = "Production"
import argparse import argparse
import json import json
import signal import os
import sys import sys
from pathlib import Path from pathlib import Path
# from baloo_tools import get_resolution # from baloo_tools import get_resolution
@@ -26,7 +26,7 @@ PROG_NAME = "Bagheera Search Tool"
PROG_ID = "bagheerasearch" PROG_ID = "bagheerasearch"
PROG_VERSION = __version__ PROG_VERSION = __version__
PROG_BY = __author__ PROG_BY = __author__
PROG_DATE = "2026-05-09" PROG_DATE = "2026-05-10"
CONFIG_DIR = Path.home() / ".config" / PROG_ID CONFIG_DIR = Path.home() / ".config" / PROG_ID
CONFIG_FILE = CONFIG_DIR / "config.json" CONFIG_FILE = CONFIG_DIR / "config.json"
@@ -55,7 +55,7 @@ def save_config(config: dict) -> None:
def print_help_query() -> None: def print_help_query() -> None:
"""Prints the detailed help for query syntax.""" """Prints the detailed help for query syntax."""
help_query = f"""Help updated to 2025-01-01. help_query = f"""{PROG_NAME} uses the Baloo search engine, which is part of the KDE ecosystem, to perform file searches so next help is obtained from Baloo documentation on 2025-01-01, with some additional information, and it may not be up to date with the latest features or changes in Baloo. For the most current information, please refer to the official Baloo documentation or resources.
Baloo offers a rich syntax for searching through your files. Certain attributes of a file can be searched through. Baloo offers a rich syntax for searching through your files. Certain attributes of a file can be searched through.
@@ -63,7 +63,7 @@ For example 'type' can be used to filter for files based on their general type:
type:Audio OR type:Document type:Audio OR type:Document
The following comparison operators are supported, but note that 'not equal' (!=) operator is not available. The following comparison operators are supported, but note that 'not equal' (!=) operator is not available in Baloo search engine.
· : - contains (only for text comparison) · : - contains (only for text comparison)
· = - equal · = - equal
· > - greater than · > - greater than
@@ -71,7 +71,7 @@ The following comparison operators are supported, but note that 'not equal' (!=)
· < - less than · < - less than
· <= - less than or equal to · <= - less than or equal to
Currently the following types are supported: Currently the following types, to use in --type property, are supported:
· Archive · Archive
· Folder · Folder
· Audio · Audio
@@ -90,7 +90,7 @@ The full list of properties which can be searched is listed below. They are grou
All Files All Files
· filename · filename
· mimetype · mimetype
· modified · modified (formatted as yyyy-MM-dd[ hh[:mm[:ss]]])
· rating · rating
· tags · tags
· userComment · userComment
@@ -103,7 +103,7 @@ Audio
· Channels · Channels
· Comment · Comment
· Composer · Composer
· Duration · Duration (this value must be in seconds, for example use 'duration > 300' to find files longer than 5 minutes)
· Genre · Genre
· Lyricist · Lyricist
· ReleaseYear · ReleaseYear
@@ -113,7 +113,7 @@ Audio
Documents Documents
· Author · Author
· Copyright · Copyright
· CreationDate · CreationDate (formatted as yyyy-MM-dd[ hh[:mm[:ss]]])
· Generator · Generator
· Keywords · Keywords
· Language · Language
@@ -153,8 +153,44 @@ Media
· PhotoWhiteBalance · PhotoWhiteBalance
· Width · Width
The following properties are undocumented but present in the Baloo source code; they may or may not work, but are worth trying:
· AssistiveAlternateDescription
· Arranger
· AudioCodec
· ColorSpace
· Compilation
· Conductor
· Description
· DiscNumber
· Ensemble
· Label
· License
· Location
· Lyrics
· Manufacturer
· Model
· Opus
· OriginUrl
· OriginEmailSubject
· OriginEmailSender
· OriginEmailMessageId
· Performer
· PixelFormat
· ReplayGainAlbumPeak
· ReplayGainAlbumGain
· ReplayGainTrackPeak
· ReplayGainTrackGain
· TranslationUnitsTotal
· TranslationUnitsWithTranslation
· TranslationUnitsWithDraftTranslation
· TranslationLastAuthor
· TranslationLastUpDate
· TranslationTemplateDate
· VideoCodec
{PROG_NAME} recognizes some natural language sentences in English, as long as they are capitalized, and transforms them into queries that can be interpreted by the search engine. Baloo documentation ends here, but {PROG_NAME} adds some extra features on top of it.
Search engine recognizes some natural language sentences in English, as long as they are capitalized, and transforms them into queries that can be interpreted by the search engine.
Supported natural language sentences and patterns for queries are: Supported natural language sentences and patterns for queries are:
· MODIFIED TODAY · MODIFIED TODAY
@@ -166,12 +202,18 @@ Supported natural language sentences and patterns for queries are:
<NUMBER> can be any number or a number text from ONE to TWENTY. <NUMBER> can be any number or a number text from ONE to TWENTY.
The --exclude and --recursive-exclude options allow you to filter files out of the results. The syntax for both options supports parentheses and logical operators (AND, OR, and NOT) to combine multiple patterns. The --exclude and --recursive-exclude options allow you to filter files out of the results.
The syntax for both options supports parentheses and logical operators (AND, OR, and NOT) to combine multiple patterns.
In addition to standard query comparison operators, the not equal (!=) operator is available for comparing properties against specific values. Furthermore, you can compare two properties directly; for example, 'width > height' is a valid expression. In addition to standard query comparison operators, the not equal (!=) operator is available for comparing properties against specific values. Furthermore, you can compare two properties directly; for example, 'width > height' is a valid expression.
Remarks: Remarks:
· All text comparison are case insensitive. · Text comparisons are case sensitive with the '==' operator but case insensitive with the '=' and ':' operators. For example, 'filename:report' would match 'report.docx', 'Report.docx', and 'REPORT.docx', while 'filename==report.docx' would only match 'report.docx'.
· Tags comparisons are performed against both individual full tag string (using the '/' character as a level separator) and each individual level. All individual level values are normalized to lowercase and stripped of accents or diacritics. For example, a file tagged as 'Opera,Person/María Callas,Singer' would match any of the following elements: ['Opera', 'Person/María Callas', 'Singer', 'callas', 'maria', 'opera', 'person', 'singer']." · Tags comparisons are performed against both individual full tag string (using the '/' character as a level separator) and each individual level. All individual level values are normalized to lowercase and stripped of accents or diacritics. For example, a file tagged as 'Opera,Person/María Callas,Singer' would match any of the following elements: ['Opera', 'Person/María Callas', 'Singer', 'callas', 'maria', 'opera', 'person', 'singer']."
· Only text and numeric data are supported.""" · Only text and numeric data are supported, dates are not supported as of now.
· Baloo limit of at least three characters for property values is not applied in --exclude and --recursive-exclude options, so you can use shorter values in those options.
For example, if you have a tag named 'Science' and another one named 'Science Fiction', you can't obtain only the results tagged with 'Science', because the Baloo search engine will match both the 'Science' and 'Science Fiction' tags when you use 'tags:Science' in your query. To exclude results tagged with 'Science Fiction' you can use the following query:
{PROG_ID} --exclude tags:Fiction tags:Science"""
print(help_query) print(help_query)
@@ -183,19 +225,12 @@ def print_version() -> None:
"the good people at KDE" "the good people at KDE"
) )
def signal_handler(sig, frame) -> None:
"""Handles Ctrl+C gracefully."""
print("\nSearch canceled at user request.")
sys.exit(0)
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="An improved search tool for Baloo" description="An improved search tool for Baloo"
) )
parser.add_argument("query", nargs="?", help="list of words to query for") parser.add_argument("query", nargs="?", help="list of words to query for")
parser.add_argument("-d", "--directory", help="limit search to specified directory") parser.add_argument("-d", "--directory", help="limit search to specified directory tree")
parser.add_argument("-e", "--exclude", help="Search exclude pattern") parser.add_argument("-e", "--exclude", help="Search exclude pattern")
parser.add_argument("-i", "--id", action="store_true", help="show document IDs") parser.add_argument("-i", "--id", action="store_true", help="show document IDs")
parser.add_argument("-k", "--konsole", action="store_true", help="show files using file:/ and quotes") parser.add_argument("-k", "--konsole", action="store_true", help="show files using file:/ and quotes")
@@ -203,7 +238,7 @@ def main():
parser.add_argument("-o", "--offset", type=int, help="offset from which to start the search") parser.add_argument("-o", "--offset", type=int, help="offset from which to start the search")
parser.add_argument("-r", "--recursive", nargs="?", const="", default=None, help="enable recurse with or without a query") parser.add_argument("-r", "--recursive", nargs="?", const="", default=None, help="enable recurse with or without a query")
parser.add_argument("-n", "--recursive-indent", help="recursive indent character") parser.add_argument("-n", "--recursive-indent", help="recursive indent character")
parser.add_argument("-x", "--recursive-exclude", help="recursion exclude pattern") parser.add_argument("-x", "--recursive-exclude", help="recursion exclude query")
parser.add_argument("-s", "--sort", help="sorting criteria <auto|none>") parser.add_argument("-s", "--sort", help="sorting criteria <auto|none>")
parser.add_argument("-t", "--type", help="type of Baloo data to be searched") parser.add_argument("-t", "--type", help="type of Baloo data to be searched")
parser.add_argument("-v", "--verbose", action="store_true", help="Verbose mode") parser.add_argument("-v", "--verbose", action="store_true", help="Verbose mode")
@@ -318,16 +353,30 @@ def main():
except FileNotFoundError as e: except FileNotFoundError as e:
print(e) print(e)
sys.exit(1) sys.exit(1)
except KeyboardInterrupt:
# Catch Ctrl+C inside main for an immediate, clean exit
print("\nSearch canceled at user request.")
sys.exit(0)
except BrokenPipeError:
# Silence errors when the output is piped to 'head' or 'less' and the pipe is closed
devnull = os.open(os.devnull, os.O_WRONLY)
os.dup2(devnull, sys.stdout.fileno())
sys.exit(1)
except Exception as e: except Exception as e:
print(f"Error executing search: {e}") print(f"Error executing search: {e}")
sys.exit(1) sys.exit(1)
if __name__ == "__main__": if __name__ == "__main__":
signal.signal(signal.SIGINT, signal_handler)
try: try:
main() main()
except KeyboardInterrupt:
# Fallback in case the interrupt occurs outside the main try block of main
print("\nSearch canceled at user request.")
try:
sys.exit(0)
except SystemExit:
os._exit(0)
except Exception as e: except Exception as e:
print(f"Critical error: {e}") print(f"Critical error: {e}")
sys.exit(1) sys.exit(1)

View File

@@ -10,6 +10,7 @@ import lmdb
import os import os
import re import re
import sys import sys
import unicodedata
from typing import Tuple from typing import Tuple
PROPERTIES_ID_MAP = { PROPERTIES_ID_MAP = {
@@ -100,6 +101,18 @@ PROPERTIES_ID_MAP = {
} }
def normalize_text(text):
    """Return *text* with accents/diacritics removed, for string comparison.

    The text is decomposed (NFD) so that every accent becomes a separate
    combining mark, the nonspacing marks (Unicode category ``Mn``) are
    dropped, and the remainder is recomposed (NFC). Recomposing matters for
    scripts whose canonical decomposition is not made of ``Mn`` marks
    (e.g. Hangul syllables, which split into jamo): without it such text
    would come back in decomposed form and no longer compare equal to the
    original string.

    Case is preserved.

    Args:
        text: String to normalize. Falsy values (``None``, ``""``) are
            accepted.

    Returns:
        The accent-free string with surrounding whitespace stripped, or
        ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    decomposed = unicodedata.normalize('NFD', text)
    # Drop only nonspacing marks (accents); every other character is kept.
    stripped = "".join(
        char for char in decomposed if unicodedata.category(char) != 'Mn'
    )
    # Recompose so characters that never carried an accent are returned
    # in their original (composed) form.
    return unicodedata.normalize('NFC', stripped).strip()
class BalooTools: class BalooTools:
"""Class to interact directly with the Baloo LMDB index.""" """Class to interact directly with the Baloo LMDB index."""
@@ -214,11 +227,46 @@ class BalooTools:
for p in parts: for p in parts:
p = p.strip() p = p.strip()
if p: if p:
tag = p.removeprefix('TAG-').removeprefix('TA') """ 'TA' elements are tags normalized to lowercase
and stripped of accents/diacritics, while 'TAG'
elements are the original tags as they were added by
the user. We need to process both to ensure we can
match tags in a case-insensitive and
accent-insensitive way. But we only want to add the
original tags to the final result, not the
normalized ones, because the normalized ones do
not correctly handle tags with spaces or words with
fewer than three characters.
"""
if p.startswith('TAG-'):
tag = p.removeprefix('TAG-')
tags.append(tag) tags.append(tag)
result_set = set(tags)
""" Must add individual parts of the tags to the result set
to be able to match them with queries like 'tags:callas'
or 'tags:maria' for tags "María Callas" or "Person/María
Callas". To maintain Baloo tag behaviour with spaces, it's
not possible to search for tags="María Callas" and must
search for tags=María tags:Callas, items with spaces are
not added to avoid confusion."""
for item in tags:
parts = re.split(r'[ /\n\t]+', item)
for part in parts:
if part:
result_set.add(part)
normalize_part = normalize_text(part)
if normalize_part:
result_set.add(normalize_part)
tags = sorted(list(result_set))
if not tags:
return {}
else:
return {'tags': tags} return {'tags': tags}
# return {'tags': ",".join(tags)}
except lmdb.Error as e: except lmdb.Error as e:
print(f"Warning: Failed to access Baloo LMDB index: {e}", file=sys.stderr) print(f"Warning: Failed to access Baloo LMDB index: {e}", file=sys.stderr)