# BagheeraSearch/baloo_tools/baloo_tools.py

#!/usr/bin/env python3

"""
Baloo Tools Library
Helper functions to interact directly with the Baloo LMDB index.
"""

import json
import os
import re
import sys
import unicodedata
from typing import Any, Dict, List, Optional, Tuple

import lmdb

# Lookup table from property IDs (decimal numbers in string form, as they
# appear as keys of the JSON record read from the index) to human-readable
# property names. BalooTools.get_info() uses it to rename the keys of the
# record it returns; IDs missing from this table are passed through unchanged.
# NOTE(review): the names presumably mirror KFileMetaData's Property enum --
# confirm against the KFileMetaData version in use before adding entries.
PROPERTIES_ID_MAP = {
    '0': 'Empty',
    '1': 'BitRate',
    '2': 'Channels',
    '3': 'Duration',
    '4': 'Genre',
    '5': 'SampleRate',
    '6': 'TrackNumber',
    '7': 'ReleaseYear',
    '8': 'Comment',
    '9': 'Artist',
    '10': 'Album',
    '11': 'AlbumArtist',
    '12': 'Composer',
    '13': 'Lyricist',
    '14': 'Author',
    '15': 'Title',
    '16': 'Subject',
    '17': 'Generator',
    '18': 'PageCount',
    '19': 'WordCount',
    '20': 'LineCount',
    '21': 'Language',
    '22': 'Copyright',
    '23': 'Publisher',
    '24': 'CreationDate',
    '25': 'Keywords',
    '26': 'Width',
    '27': 'Height',
    '28': 'AspectRatio',
    '29': 'FrameRate',
    '30': 'Manufacturer',
    '31': 'Model',
    '32': 'ImageDateTime',
    '33': 'ImageOrientation',
    '34': 'PhotoFlash',
    '35': 'PhotoPixelXDimension',
    '36': 'PhotoPixelYDimension',
    '37': 'PhotoDateTimeOriginal',
    '38': 'PhotoFocalLength',
    '39': 'PhotoFocalLengthIn35mmFilm',
    '40': 'PhotoExposureTime',
    '41': 'PhotoFNumber',
    '42': 'PhotoApertureValue',
    '43': 'PhotoExposureBiasValue',
    '44': 'PhotoWhiteBalance',
    '45': 'PhotoMeteringMode',
    '46': 'PhotoISOSpeedRatings',
    '47': 'PhotoSaturation',
    '48': 'PhotoSharpness',
    '49': 'PhotoGpsLatitude',
    '50': 'PhotoGpsLongitude',
    '51': 'PhotoGpsAltitude',
    '52': 'TranslationUnitsTotal',
    '53': 'TranslationUnitsWithTranslation',
    '54': 'TranslationUnitsWithDraftTranslation',
    '55': 'TranslationLastAuthor',
    '56': 'TranslationLastUpDate',
    '57': 'TranslationTemplateDate',
    '58': 'OriginUrl',
    '59': 'OriginEmailSubject',
    '60': 'OriginEmailSender',
    '61': 'OriginEmailMessageId',
    '62': 'DiscNumber',
    '63': 'Location',
    '64': 'Performer',
    '65': 'Ensemble',
    '66': 'Arranger',
    '67': 'Conductor',
    '68': 'Opus',
    '69': 'Label',
    '70': 'Compilation',
    '71': 'License',
    '72': 'Rating',
    '73': 'Lyrics',
    '74': 'ReplayGainAlbumPeak',
    '75': 'ReplayGainAlbumGain',
    '76': 'ReplayGainTrackPeak',
    '77': 'ReplayGainTrackGain',
    '78': 'Description',
    '79': 'VideoCodec',
    '80': 'AudioCodec',
    '81': 'PixelFormat',
    '82': 'ColorSpace',
    '83': 'AssistiveAlternateDescription'
}


def normalize_text(text) -> str:
    """
    Strip accents/diacritics from *text* for accent-insensitive comparison.

    Letter case is deliberately preserved; only combining marks and
    surrounding whitespace are removed.

    Args:
        text: The string to normalize. Any falsy value (``None``, ``""``)
            yields an empty string.

    Returns:
        The text without nonspacing combining marks, in composed (NFC) form,
        with leading/trailing whitespace stripped.
    """
    if not text:
        return ""

    # Decompose so that every accent becomes a separate combining mark
    # (Unicode category 'Mn') that can be filtered out.
    decomposed = unicodedata.normalize('NFD', text)
    stripped = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )

    # Recompose what is left: NFD also splits characters that carry no accent
    # at all (e.g. Hangul syllables into individual jamo), and leaving them
    # decomposed would make the result compare unequal to the original text.
    return unicodedata.normalize('NFC', stripped).strip()


class BalooTools:
    """Read-only accessor for the Baloo LMDB index.

    Every lookup opens the index file, reads a single record and closes the
    environment again, so an instance holds no open handles between calls.
    """

    # Name of the LMDB sub-database holding the per-file JSON property record.
    _DOCUMENT_DATA_DB = b'documentdatadb'
    # Name of the LMDB sub-database holding the per-file tag terms.
    # NOTE(review): the "xatrr" spelling is the name this code has always
    # opened; presumably it matches the on-disk database name -- verify
    # against a real index before "correcting" it.
    _XATTR_TERMS_DB = b'docxatrrterms'

    def __init__(self) -> None:
        """Initializes the connection path to the Baloo index."""
        self.baloo_db_path = os.path.join(
            os.path.expanduser("~"), ".local/share/baloo/index"
        )

    def _read_value(self, db_name: bytes, file_id: int) -> Optional[bytes]:
        """
        Reads the raw record stored for a file in one sub-database.

        Args:
            db_name: Name of the LMDB sub-database to read from.
            file_id: The integer ID of the file.

        Returns:
            The stored bytes, or None when the ID cannot be encoded as an
            unsigned 64-bit key, when no record exists for it, or when the
            index cannot be accessed (a warning is printed to stderr then).
        """
        try:
            # Keys are the file ID in 8-byte little-endian format.
            key = int.to_bytes(
                file_id, length=8, byteorder='little', signed=False
            )
        except OverflowError:
            # Negative or wider than 64 bits: such an ID cannot be in the index.
            return None

        try:
            # The context managers make sure the transaction and the
            # environment are closed even when the lookup fails.
            with lmdb.Environment(
                self.baloo_db_path,
                subdir=False,
                readonly=True,
                lock=False,
                max_dbs=20
            ) as env:
                database = env.open_db(db_name)
                with env.begin() as txn:
                    # Exact-key lookup; yields None when the ID is absent.
                    return txn.get(key, db=database)
        except lmdb.Error as e:
            print(f"Warning: Failed to access Baloo LMDB index: {e}", file=sys.stderr)

        return None

    @staticmethod
    def _parse_tags(raw: bytes) -> List[str]:
        """
        Extracts the searchable tag strings from a raw tag-terms record.

        Args:
            raw: The bytes stored for one file in the tag-terms database.

        Returns:
            A sorted list without duplicates containing every user tag, every
            word of every tag, and the accent-stripped form of every word.
        """
        text = raw.decode('utf-8', errors='replace')
        # Drop NUL bytes unless they directly precede a 'T' (presumably the
        # start of a 'TA'/'TAG-' term), then split on the remaining NUL and
        # \x01 separators.
        text = re.sub(r'\x00(?![T])', '', text)

        # Only 'TAG-' entries are collected. Per the original author's notes
        # they hold the tags exactly as the user entered them, whereas 'TA'
        # entries are lowercased, accent-stripped forms that do not handle
        # tags with spaces or words shorter than three characters correctly.
        tags = []
        for term in re.split(r'[\x00\x01]', text):
            term = term.strip()
            if term.startswith('TAG-'):
                tag = term.removeprefix('TAG-')
                if tag:  # a bare 'TAG-' entry carries no tag
                    tags.append(tag)

        # Besides the complete tags, add each word of a tag (split on spaces,
        # slashes, newlines and tabs) plus its accent-stripped form, so that a
        # tag such as "Person/María Callas" can also be matched through
        # 'María', 'Maria' or 'Callas'.
        searchable = set(tags)
        for tag in tags:
            for word in re.split(r'[ /\n\t]+', tag):
                if word:
                    searchable.add(word)
                    folded = normalize_text(word)
                    if folded:
                        searchable.add(folded)

        return sorted(searchable)

    def get_info(self, file_id: int) -> Dict[str, Any]:
        """
        Retrieves file metadata from the Baloo index.

        Args:
            file_id: The integer ID of the file.

        Returns:
            A dict with all file metadata fields, keyed by property name
            (IDs unknown to PROPERTIES_ID_MAP keep their raw ID as key).
            Empty when the file has no record, the record cannot be decoded,
            or the index cannot be read.
        """
        raw = self._read_value(self._DOCUMENT_DATA_DB, file_id)
        if raw is None:
            return {}

        try:
            properties = json.loads(raw.decode())
        except ValueError:
            # Covers invalid UTF-8 (UnicodeDecodeError) as well as invalid
            # JSON (json.JSONDecodeError); both are ValueError subclasses.
            return {}

        if not isinstance(properties, dict):
            # Only a JSON object can be mapped to named fields.
            return {}

        return {
            PROPERTIES_ID_MAP.get(prop_id, prop_id): value
            for prop_id, value in properties.items()
        }

    def get_resolution(self, file_id: int, sep: str = 'x') -> Tuple[int, int]:
        """
        Retrieves the width and height of an image/video from the Baloo index.

        Args:
            file_id: The integer ID of the file.
            sep: Separator used (unused currently, kept for compatibility).

        Returns:
            A tuple of (width, height) as stored in the index. Each element
            is -1 when the corresponding value is not found.
        """
        file_info = self.get_info(file_id)
        # get_info() renames property IDs to property names, so the
        # dimensions are found under 'Width'/'Height'; the raw IDs '26'/'27'
        # are still accepted in case a record reaches us untranslated.
        width = file_info.get('Width', file_info.get('26', -1))
        height = file_info.get('Height', file_info.get('27', -1))
        return width, height

    def get_tags(self, file_id: int) -> Dict[str, List[str]]:
        """
        Retrieves all file tags from the Baloo index.

        Args:
            file_id: The integer ID of the file.

        Returns:
            A dict with a single field called 'tags' holding the sorted list
            of searchable tag strings, or an empty dict when the file has no
            tags or the index cannot be read.
        """
        raw = self._read_value(self._XATTR_TERMS_DB, file_id)
        if raw is None:
            return {}

        tags = self._parse_tags(raw)
        return {'tags': tags} if tags else {}


# Helper function to maintain compatibility with bagheera_search_lib.py
# since it imports `get_resolution` directly.
def get_resolution(file_id: int, sep: str = 'x') -> Tuple[int, int]:
    """
    Module-level shortcut for BalooTools.get_resolution.

    Creates a fresh BalooTools instance for the call and forwards both
    arguments to it unchanged.

    Args:
        file_id: The integer ID of the file.
        sep: Passed through as-is to BalooTools.get_resolution.

    Returns:
        Whatever BalooTools.get_resolution returns for the given arguments.
    """
    return BalooTools().get_resolution(file_id, sep)


if __name__ == '__main__':
    # CLI execution support for testing: takes one hexadecimal file ID and
    # prints "<width> <height>". Without an argument nothing is done.
    if len(sys.argv) > 1:
        try:
            target_id = int(sys.argv[1], 16)
            # File IDs are encoded as unsigned 8-byte keys for the lookup, so
            # reject anything outside that range here instead of letting the
            # encoding step abort with an uncaught OverflowError.
            if not 0 <= target_id < 2 ** 64:
                raise ValueError(f"file ID out of range: {sys.argv[1]}")
            width, height = get_resolution(target_id)
            print(f"{width} {height}")
        except ValueError:
            print("Error: Please provide a valid hexadecimal file ID.", file=sys.stderr)
            sys.exit(1)