v0.9.16

2026-03-31 23:35:57 +02:00
parent ff7c1aa373
commit cb751b2970
14 changed files with 2431 additions and 119 deletions
--- a/duplicatecache.py
+++ b/duplicatecache.py
@@ -0,0 +1,723 @@
+import os
+import logging
+import struct
+import time
+import collections
+import shutil
+import lmdb
+from pathlib import Path
+import PIL.Image
+
+from PySide6.QtCore import (
+    QObject, QThread, Signal, QMutex, QSemaphore, QReadWriteLock,
+    QMutexLocker, QReadLocker, QWriteLocker, QRunnable
+)
+
+import imagehash  # For perceptual hashing
+
+from constants import (
+    DUPLICATE_CACHE_PATH, DUPLICATE_HASH_DB_NAME,
+    DUPLICATE_EXCEPTIONS_DB_NAME, DUPLICATE_PENDING_DB_NAME,
+    MAX_DHASH_DISTANCE, UITexts
+)
+
+logger = logging.getLogger(__name__)
+
+# Result structure for duplicate detection
+DuplicateResult = collections.namedtuple('DuplicateResult', ['path1', 'path2', 'hash_value', 'is_exception', 'similarity'])
+
+
+class BKTree:
+    """A Burkhard-Keller tree for efficient similarity searching using Hamming distance."""
+    def __init__(self, distance_func):
+        self.distance_func = distance_func
+        self.tree = None
+
+    def add(self, item):
+        if self.tree is None:
+            self.tree = (item, {})
+            return
+        node = self.tree
+        while True:
+            val, children = node
+            dist = self.distance_func(item, val)
+            if dist == 0:
+                return
+            if dist in children:
+                node = children[dist]
+            else:
+                children[dist] = (item, {})
+                break
+
+    def query(self, item, max_dist):
+        if self.tree is None:
+            return []
+        results = []
+        candidates = [self.tree]
+        while candidates:
+            val, children = candidates.pop()
+            dist = self.distance_func(item, val)
+            if dist <= max_dist:
+                results.append((val, dist))
+            for d in range(max(0, dist - max_dist), dist + max_dist + 1):
+                if d in children:
+                    candidates.append(children[d])
+        return results
+
+
+class HashWorker(QRunnable):
+    """Worker to calculate image hash in a thread pool."""
+    def __init__(self, path, detector, result_dict, mutex, semaphore):
+        super().__init__()
+        self.path = path
+        self.detector = detector
+        self.result_dict = result_dict
+        self.mutex = mutex
+        self.semaphore = semaphore
+
+    def run(self):
+        if self.detector._is_running:
+            try:
+                # imagehash requires a PIL/Pillow image object.
+                with PIL.Image.open(self.path) as pil_img:
+                    # Using dHash from imagehash library as default
+                    h = str(imagehash.dhash(pil_img))
+                    with QMutexLocker(self.mutex):
+                        self.result_dict[self.path] = h
+            except Exception as e:
+                logger.warning(f"HashWorker failed for {self.path}: {e}")
+
+        self.semaphore.release()
+
+
+class DuplicateCache(QObject):
+    """
+    Manages a persistent LMDB cache for perceptual hashes and duplicate relationships.
+    Uses (device_id, inode) as primary keys for robustness against file renames/moves.
+    """
+    def __init__(self):
+        super().__init__()
+        self._lmdb_env = None
+        self._hash_db = None
+        self._exceptions_db = None
+        self._pending_db = None
+        self._db_lock = QMutex()  # Protects LMDB transactions
+        # In-memory cache for hashes: (dev, inode) -> (hash_value, path)
+        self._hash_cache = {}
+        self._hash_cache_lock = QReadWriteLock()
+
+        self.lmdb_open()
+
+    def lmdb_open(self):
+        cache_dir = Path(DUPLICATE_CACHE_PATH)
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        try:
+            self._lmdb_env = lmdb.open(
+                DUPLICATE_CACHE_PATH,
+                map_size=10 * 1024 * 1024 * 1024,  # 10GB default
+                max_dbs=3,  # For hashes, exceptions and pending
+                readonly=False,
+                create=True
+            )
+            self._hash_db = self._lmdb_env.open_db(DUPLICATE_HASH_DB_NAME)
+            self._exceptions_db = self._lmdb_env.open_db(DUPLICATE_EXCEPTIONS_DB_NAME)
+            self._pending_db = self._lmdb_env.open_db(DUPLICATE_PENDING_DB_NAME)
+            logger.info(f"Duplicate LMDB cache opened: {DUPLICATE_CACHE_PATH}")
+        except Exception as e:
+            logger.error(f"Failed to open duplicate LMDB cache: {e}")
+            self._lmdb_env = None
+
+    def lmdb_close(self):
+        if self._lmdb_env:
+            self._lmdb_env.close()
+            self._lmdb_env = None
+            self._hash_db = None
+            self._exceptions_db = None
+            self._pending_db = None
+
+    def get_hash_stats(self):
+        """Returns (count, size_bytes) for the hash database."""
+        count = 0
+        if not self._lmdb_env:
+            return 0, 0
+
+        with QMutexLocker(self._db_lock):
+            with self._lmdb_env.begin(write=False) as txn:
+                count = txn.stat(db=self._hash_db)['entries']
+
+        size = 0
+        disk_path = os.path.join(DUPLICATE_CACHE_PATH, "data.mdb")
+        if os.path.exists(disk_path):
+            size = os.path.getsize(disk_path)
+
+        return count, size
+
+    def clear_hashes(self):
+        """Clears all hashes from the database by recreating the environment."""
+        with QWriteLocker(self._hash_cache_lock):
+            self._hash_cache.clear()
+
+        self.lmdb_close()
+        try:
+            if os.path.exists(DUPLICATE_CACHE_PATH):
+                shutil.rmtree(DUPLICATE_CACHE_PATH)
+            self.lmdb_open()
+            logger.info("Duplicate hash cache cleared.")
+        except Exception as e:
+            logger.error(f"Error clearing duplicate LMDB: {e}")
+
+    def __del__(self):
+        self.lmdb_close()
+
+    @staticmethod
+    def _get_inode_info(path):
+        try:
+            stat_info = os.stat(path)
+            return stat_info.st_dev, struct.pack('Q', stat_info.st_ino)
+        except OSError:
+            return 0, None
+
+    def _get_lmdb_key(self, dev_id, inode_key_bytes):
+        return f"{dev_id}-{inode_key_bytes.hex()}".encode('utf-8')
+
+    def get_hash_and_path(self, dev_id, inode_key_bytes):
+        """Retrieves hash, mtime and path for a given (dev_id, inode_key_bytes)."""
+        # Check in-memory cache first
+        with QReadLocker(self._hash_cache_lock):
+            cached_data = self._hash_cache.get((dev_id, inode_key_bytes))
+            if cached_data:
+                return cached_data  # (hash_value, mtime, path)
+
+        # Check LMDB
+        if not self._lmdb_env:
+            return None, 0, None
+
+        with QMutexLocker(self._db_lock):
+            with self._lmdb_env.begin(write=False) as txn:
+                lmdb_key = self._get_lmdb_key(dev_id, inode_key_bytes)
+                value_bytes = txn.get(lmdb_key, db=self._hash_db)
+                if value_bytes:
+                    # Handle format "hash_value_str|mtime|path_str" or old "hash|path"
+                    parts = value_bytes.decode('utf-8').split('|', 2)
+                    if len(parts) == 3:
+                        hash_str, mtime_str, path_str = parts
+                        mtime = float(mtime_str)
+                    elif len(parts) == 2:
+                        hash_str, path_str = parts
+                        mtime = 0.0  # Force re-hash
+                    else:
+                        return None, 0, None
+
+                    with QWriteLocker(self._hash_cache_lock):
+                        self._hash_cache[(dev_id, inode_key_bytes)] = (hash_str, mtime, path_str)
+                    return hash_str, mtime, path_str
+        return None, 0, None
+
+    def get_hash_for_path(self, path, current_mtime, dev_id=None, inode_key_bytes=None):
+        if dev_id is None or inode_key_bytes is None:
+            dev_id, inode_key_bytes = self._get_inode_info(path)
+        if not inode_key_bytes:
+            return None
+        hash_value, cached_mtime, _ = self.get_hash_and_path(dev_id, inode_key_bytes)
+        # Return hash only if mtime matches (with small float tolerance)
+        if hash_value and abs(cached_mtime - current_mtime) < 0.001:
+            return hash_value
+        return None
+
+    def add_hash_for_path(self, path, hash_value, mtime, dev_id=None, inode_key_bytes=None):
+        if dev_id is None or inode_key_bytes is None:
+            dev_id, inode_key_bytes = self._get_inode_info(path)
+        if not inode_key_bytes or not self._lmdb_env:
+            return False
+
+        value_str = f"{hash_value}|{mtime}|{path}"
+        with QMutexLocker(self._db_lock):
+            with self._lmdb_env.begin(write=True) as txn:
+                lmdb_key = self._get_lmdb_key(dev_id, inode_key_bytes)
+                txn.put(lmdb_key, value_str.encode('utf-8'), db=self._hash_db)
+
+        with QWriteLocker(self._hash_cache_lock):
+            self._hash_cache[(dev_id, inode_key_bytes)] = (hash_value, mtime, path)
+        return True
+
+    def remove_hash_for_path(self, path):
+        dev_id, inode_key_bytes = self._get_inode_info(path)
+        if not inode_key_bytes or not self._lmdb_env:
+            return False
+
+        with QMutexLocker(self._db_lock):
+            with self._lmdb_env.begin(write=True) as txn:
+                lmdb_key = self._get_lmdb_key(dev_id, inode_key_bytes)
+                txn.delete(lmdb_key, db=self._hash_db)
+
+        with QWriteLocker(self._hash_cache_lock):
+            self._hash_cache.pop((dev_id, inode_key_bytes), None)
+
+        # Also remove any exceptions involving this path
+        self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._exceptions_db)
+        self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._pending_db)
+        return True
+
+    def _get_pair_lmdb_key_from_ids(self, dev1, inode1, dev2, inode2):
+        # Ensure canonical order for exception keys
+        key_parts = sorted([f"{dev1}-{inode1.hex()}", f"{dev2}-{inode2.hex()}"])
+        return f"{key_parts[0]}-{key_parts[1]}".encode('utf-8')
+
+    def _get_pair_lmdb_key(self, path1, path2):
+        dev1, inode1 = self._get_inode_info(path1)
+        dev2, inode2 = self._get_inode_info(path2)
+        if not inode1 or not inode2:
+            return None
+        return self._get_pair_lmdb_key_from_ids(dev1, inode1, dev2, inode2)
+
+    def mark_as_exception(self, path1, path2, is_exception=True, similarity=None):
+        if not self._lmdb_env:
+            return False
+
+        dev1, inode1 = self._get_inode_info(path1)
+        dev2, inode2 = self._get_inode_info(path2)
+        if not inode1 or not inode2:
+            return False
+
+        exception_key = self._get_pair_lmdb_key_from_ids(dev1, inode1, dev2, inode2)
+        if not exception_key:
+            return False
+
+        # Store paths in value to make exception recovery independent of hash DB
+        val_str = f"{path1}|{path2}"
+        if similarity is not None:
+            val_str += f"|{similarity}"
+        value = val_str.encode('utf-8')
+
+        with QMutexLocker(self._db_lock):
+            with self._lmdb_env.begin(write=True) as txn:
+                if is_exception:
+                    txn.put(exception_key, value, db=self._exceptions_db)
+                else:
+                    txn.delete(exception_key, db=self._exceptions_db)
+        return True
+
+    def is_exception(self, path1, path2):
+        if not self._lmdb_env:
+            return False
+
+        dev1, inode1 = self._get_inode_info(path1)
+        dev2, inode2 = self._get_inode_info(path2)
+        if not inode1 or not inode2:
+            return False
+
+        exception_key = self._get_pair_lmdb_key_from_ids(dev1, inode1, dev2, inode2)
+        if not exception_key:
+            return False
+
+        with QMutexLocker(self._db_lock):
+            with self._lmdb_env.begin(write=False) as txn:
+                return txn.get(exception_key, db=self._exceptions_db) is not None
+
+    def _remove_pair_entries_for_path(self, target_dev, target_inode, db_handle):
+        """Removes all entries involving a specific (dev, inode) pair from a pair-based DB."""
+        if not self._lmdb_env:
+            return
+
+        target_inode_hex = target_inode.hex()
+        with QMutexLocker(self._db_lock):
+            with self._lmdb_env.begin(write=True) as txn:
+                cursor = txn.cursor(db=db_handle)
+                keys_to_delete = []
+                for key_bytes, _ in cursor:
+                    key_str = key_bytes.decode('utf-8')
+                    # Key format: "dev1-inode1_hex-dev2-inode2_hex"
+                    parts = key_str.split('-')
+
+                    dev1 = int(parts[0])
+                    inode1_hex = parts[1]
+                    dev2 = int(parts[2])
+                    inode2_hex = parts[3]
+
+                    if (dev1 == target_dev and inode1_hex == target_inode_hex) or \
+                       (dev2 == target_dev and inode2_hex == target_inode_hex):
+                        keys_to_delete.append(key_bytes)
+
+                for key in keys_to_delete:
+                    txn.delete(key, db=db_handle)
+
+    def mark_as_pending(self, path1, path2, is_pending=True, similarity=None):
+        """Marks a pair as pending review."""
+        if not self._lmdb_env or self._pending_db is None:
+            return False
+
+        key = self._get_pair_lmdb_key(path1, path2)
+        if not key:
+            return False
+
+        # Store paths in value to allow reconstruction without scanning
+        val_str = f"{path1}|{path2}"
+        if similarity is not None:
+            val_str += f"|{similarity}"
+        value = val_str.encode('utf-8')
+
+        with QMutexLocker(self._db_lock):
+            with self._lmdb_env.begin(write=True) as txn:
+                if is_pending:
+                    txn.put(key, value, db=self._pending_db)
+                else:
+                    # Check if it exists before deleting to avoid errors
+                    if txn.get(key, db=self._pending_db):
+                        txn.delete(key, db=self._pending_db)
+        return True
+
+    def get_all_pending_duplicates(self):
+        """Retrieves all pending duplicate pairs from the database."""
+        results = []
+        if not self._lmdb_env or self._pending_db is None:
+            return results
+
+        keys_to_delete = []
+        with QMutexLocker(self._db_lock):
+            with self._lmdb_env.begin(write=False) as txn:
+                cursor = txn.cursor(db=self._pending_db)
+                for key, value_bytes in cursor:
+                    try:
+                        parts = value_bytes.decode('utf-8').split('|')
+                        p1, p2 = parts[0], parts[1]
+                        sim = int(parts[2]) if len(parts) > 2 else None
+                        if os.path.exists(p1) and os.path.exists(p2):
+                            results.append(DuplicateResult(p1, p2, None, False, sim))
+                        else:
+                            keys_to_delete.append(key)
+                    except Exception:
+                        keys_to_delete.append(key)
+                        continue
+
+            if keys_to_delete:
+                try:
+                    with self._lmdb_env.begin(write=True) as txn:
+                        for k in keys_to_delete:
+                            txn.delete(k, db=self._pending_db)
+                    logger.info(f"Cleaned up {len(keys_to_delete)} invalid pending duplicates (files deleted externally)")
+                except Exception as e:
+                    logger.error(f"Error cleaning up pending duplicates from DB: {e}")
+
+        return results
+
+    def get_all_exceptions(self):
+        """Retrieves all duplicate pairs marked as exceptions from the database."""
+        results = []
+        if not self._lmdb_env or self._exceptions_db is None:
+            return results
+
+        with QMutexLocker(self._db_lock):
+            with self._lmdb_env.begin(write=False) as txn:
+                cursor = txn.cursor(db=self._exceptions_db)
+                for key_bytes, value_bytes in cursor:
+                    try:
+                        p1, p2 = None, None
+                        sim = None
+                        val_str = value_bytes.decode('utf-8')
+
+                        if '|' in val_str:
+                            # New format: paths are stored in the value
+                            parts = val_str.split('|')
+                            if len(parts) >= 2:
+                                p1, p2 = parts[0], parts[1]
+                                if len(parts) > 2:
+                                    sim = int(parts[2])
+
+                        if not p1 or not p2:
+                            # Legacy format fallback: lookup paths in hash db
+                            key_str = key_bytes.decode('utf-8')
+                            kp = key_str.split('-')
+                            if len(kp) == 4:
+                                k1, k2 = f"{kp[0]}-{kp[1]}".encode(), f"{kp[2]}-{kp[3]}".encode()
+                                v1, v2 = txn.get(k1, db=self._hash_db), txn.get(k2, db=self._hash_db)
+                                if v1 and v2:
+                                    # Format is hash|mtime|path|dist... path is always index 2
+                                    p1 = v1.decode('utf-8').split('|')[2]
+                                    p2 = v2.decode('utf-8').split('|')[2]
+
+                        if p1 and p2:
+                            if os.path.exists(p1) and os.path.exists(p2):
+                                results.append(DuplicateResult(p1, p2, None, True, sim))
+                    except Exception:
+                        continue
+        return results
+
+    def clean_stale_hashes(self):
+        """
+        Removes hash entries from the database for files that no longer exist on disk.
+        """
+        if not self._lmdb_env or self._hash_db is None:
+            return 0
+
+        keys_to_delete = []
+        with QMutexLocker(self._db_lock):
+            with self._lmdb_env.begin(write=False) as txn:
+                cursor = txn.cursor(db=self._hash_db)
+                for key, value_bytes in cursor:
+                    try:
+                        # value_bytes is "hash|mtime|path|last_dist"
+                        parts = value_bytes.decode('utf-8').split('|')
+                        if len(parts) >= 3:
+                            path = parts[2]
+                            if not os.path.exists(path):
+                                keys_to_delete.append(key)
+                    except Exception:
+                        keys_to_delete.append(key)  # Corrupted entry
+                        continue
+
+            if keys_to_delete:
+                with self._lmdb_env.begin(write=True) as txn:
+                    for k in keys_to_delete:
+                        txn.delete(k, db=self._hash_db)
+                logger.info(f"Cleaned up {len(keys_to_delete)} stale hash entries (files deleted externally)")
+        return len(keys_to_delete)
+
+    def get_all_hashes_with_paths(self):
+        """Retrieves all hashes from the database along with their associated paths and inode info."""
+        # hash_value -> [(path, dev_id, inode_key_bytes)]
+        all_hashes = collections.defaultdict(list)
+        if not self._lmdb_env:
+            return all_hashes
+
+        with QMutexLocker(self._db_lock):
+            with self._lmdb_env.begin(write=False) as txn:
+                cursor = txn.cursor(db=self._hash_db)
+                for key_bytes, value_bytes in cursor:
+                    # key_bytes is like "dev_id-inode_hex"
+                    key_str = key_bytes.decode('utf-8')
+                    parts = key_str.split('-')
+                    dev_id = int(parts[0])
+                    inode_key_bytes = bytes.fromhex(parts[1])
+
+                    # value_bytes is "hash|mtime|path|last_dist"
+                    parts_val = value_bytes.decode('utf-8').split('|')
+                    if len(parts_val) >= 3:
+                        hash_value = parts_val[0]
+                        path = parts_val[2]
+                    else:
+                        continue
+
+                    all_hashes[hash_value].append((path, dev_id, inode_key_bytes))
+        return all_hashes
+
+    def rename_entry(self, old_path, new_path):
+        """
+        Updates the cache entry for a file that has been renamed or moved.
+        This involves deleting the old (dev, inode) entry and adding a new one
+        with the new (dev, inode) and path, preserving the hash value.
+        """
+        old_dev, old_inode_key_bytes = self._get_inode_info(old_path)
+        new_dev, new_inode_key_bytes = self._get_inode_info(new_path)
+
+        if not old_inode_key_bytes or not new_inode_key_bytes or not self._lmdb_env:
+            return False
+
+        # If the (dev, inode) pair is the same, only the path in the value needs updating.
+        # This happens if the file is renamed within the same filesystem.
+        if (old_dev, old_inode_key_bytes) == (new_dev, new_inode_key_bytes):
+            hash_value, mtime, _ = self.get_hash_and_path(old_dev, old_inode_key_bytes)
+            if hash_value:
+                self.add_hash_for_path(new_path, hash_value, mtime)
+                self._update_pair_paths(old_path, new_path, self._pending_db)
+                return True
+            return False
+
+        # If (dev, inode) changed (cross-filesystem move), we need to:
+        # 1. Get the hash from the old entry.
+        # 2. Remove the old entry.
+        # 3. Add a new entry with the new (dev, inode) and path, using the old hash.
+        hash_value, mtime, _ = self.get_hash_and_path(old_dev, old_inode_key_bytes)
+        if hash_value:
+            self.remove_hash_for_path(old_path)  # This removes the old (dev, inode) entry
+            self.add_hash_for_path(new_path, hash_value, mtime)  # Adds new (dev, inode) entry
+            self._update_pair_paths(old_path, new_path, self._pending_db)
+            return True
+        return False
+
+    def _update_pair_paths(self, old_path, new_path, db_handle):
+        """Updates stored paths in a pair-based DB value when a file is renamed."""
+        if not self._lmdb_env or db_handle is None:
+            return
+        with QMutexLocker(self._db_lock):
+            with self._lmdb_env.begin(write=True) as txn:
+                cursor = txn.cursor(db=db_handle)
+                for key, value_bytes in cursor:
+                    val_str = value_bytes.decode('utf-8')
+                    if old_path in val_str:
+                        p1, p2 = val_str.split('|')
+                        np1 = new_path if p1 == old_path else p1
+                        np2 = new_path if p2 == old_path else p2
+                        txn.put(key, f"{np1}|{np2}".encode('utf-8'), db=db_handle)
+
+
+class DuplicateDetector(QThread):
+    """
+    Worker thread for detecting duplicate images using perceptual hashing.
+    """
+    progress_update = Signal(int, int, str)  # current, total, message
+    duplicates_found = Signal(list)  # List of DuplicateResult
+    detection_finished = Signal()
+
+    def __init__(self, paths_to_scan, duplicate_cache, pool_manager, method="histogram_hashing", threshold=90, force_full=False):
+        super().__init__()
+        self.paths_to_scan = paths_to_scan
+        self.duplicate_cache = duplicate_cache
+        self.pool_manager = pool_manager
+        self.method = method
+        self.threshold = threshold  # Similarity percentage (50-100)
+        self.force_full = force_full
+        self._is_running = True
+
+    def stop(self):
+        self._is_running = False
+
+    def run(self):
+        total_files = len(self.paths_to_scan)
+        found_duplicates = []
+        unique_duplicate_pairs = set()  # To store frozenset((path1, path2)) for uniqueness
+        last_update_time = 0
+
+        pool = self.pool_manager.get_pool()
+
+        # 1. Load existing pending duplicates from cache to avoid recalculation (unless force_full)
+        if not self.force_full:
+            pending = self.duplicate_cache.get_all_pending_duplicates()
+            for p in pending:
+                if p.path1 in self.paths_to_scan and p.path2 in self.paths_to_scan:
+                    if p.similarity is None or p.similarity >= self.threshold:
+                        found_duplicates.append(p)
+                        unique_duplicate_pairs.add(frozenset((p.path1, p.path2)))
+
+        # Convert similarity threshold (percentage) to Hamming distance
+        distance_threshold = int(MAX_DHASH_DISTANCE * (100 - self.threshold) / 100)
+        logger.info(f"Duplicate detection: Method={self.method}, Similarity Threshold={self.threshold}%, Hamming Distance Threshold={distance_threshold}")
+
+        # 2. Phase 1: Hash Collection (Parallelized)
+        path_to_hash = {}
+        dirty_hashes_objs = set()
+        dirty_paths = set()
+        paths_to_hash_parallel = []
+
+        for path in self.paths_to_scan:
+            try:
+                stat_info = os.stat(path)
+                mtime = stat_info.st_mtime
+                dev, inode = stat_info.st_dev, struct.pack('Q', stat_info.st_ino)
+
+                cached_h = None if self.force_full else \
+                    self.duplicate_cache.get_hash_for_path(path, mtime, dev, inode)
+
+                if cached_h:
+                    path_to_hash[path] = (cached_h, dev, inode)
+                else:
+                    dirty_paths.add(path)
+                    paths_to_hash_parallel.append((path, mtime, dev, inode))
+            except OSError:
+                continue
+
+        # Phase 1 starts with files already found in cache or skipped
+        processed_hashing = total_files - len(paths_to_hash_parallel)
+
+        if paths_to_hash_parallel and self._is_running:
+            batch_size = pool.maxThreadCount() * 2
+            results_mutex = QMutex()
+            new_hashes = {}
+            sem = QSemaphore(0)
+
+            for i in range(0, len(paths_to_hash_parallel), batch_size):
+                if not self._is_running:
+                    break
+                current_batch = paths_to_hash_parallel[i : i + batch_size]
+                for p_data in current_batch:
+                    pool.start(HashWorker(p_data[0], self, new_hashes, results_mutex, sem))
+
+                for _ in range(len(current_batch)):
+                    while not sem.tryAcquire(1, 100):
+                        if not self._is_running:
+                            break
+                    if not self._is_running:
+                        break
+                    processed_hashing += 1
+                    if time.perf_counter() - last_update_time > 0.05:
+                        self.progress_update.emit(processed_hashing, total_files * 2, UITexts.DUPLICATE_MSG_HASHING.format(filename="..."))
+                        last_update_time = time.perf_counter()
+
+            for p, mtime, dev, inode in paths_to_hash_parallel:
+                h = new_hashes.get(p)
+                if h:
+                    path_to_hash[p] = (h, dev, inode)
+                    dirty_hashes_objs.add(imagehash.hex_to_hash(h))
+                    self.duplicate_cache.add_hash_for_path(p, h, mtime, dev, inode)
+
+        if not self._is_running:
+            self.detection_finished.emit()
+            return
+
+        # Signal phase transition to exactly 50%
+        self.progress_update.emit(total_files, total_files * 2, UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
+
+        if not self.force_full and not dirty_paths:
+            # No files changed and no re-scan forced.
+            # We can skip Phase 2 as all results were loaded from the pending cache.
+            self.duplicates_found.emit(found_duplicates)
+            self.detection_finished.emit()
+            return
+
+        # 3. Phase 2: Comparison (Optimized with BK-Tree)
+        hash_map = collections.defaultdict(list)
+        bk_tree = BKTree(lambda a, b: a - b)
+
+        for p, (h_str, dev, inode) in path_to_hash.items():
+            h_obj = imagehash.hex_to_hash(h_str)
+            if h_obj not in hash_map:
+                bk_tree.add(h_obj)
+            hash_map[h_obj].append((p, dev, inode))
+            if self.force_full or p in dirty_paths:
+                dirty_hashes_objs.add(h_obj)
+
+        # Optimization: Only query the tree for hashes associated with new or modified files.
+        # This finds pairs (Dirty, Clean) and (Dirty, Dirty). (Clean, Clean) were handled in previous runs.
+        hashes_to_query = list(dirty_hashes_objs) if not self.force_full else list(hash_map.keys())
+        total_queries = len(hashes_to_query)
+
+        for i, h1 in enumerate(hashes_to_query):
+            if not self._is_running:
+                break
+
+            items1 = hash_map[h1]
+
+            if time.perf_counter() - last_update_time > 0.1:
+                # Scale Phase 2 progress to the 50%-100% range
+                phase2_progress = int(((i + 1) / total_queries) * total_files) if total_queries > 0 else total_files
+                self.progress_update.emit(total_files + phase2_progress, total_files * 2, UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
+                last_update_time = time.perf_counter()
+
+            # Query tree for similar hashes
+            for h2, distance in bk_tree.query(h1, distance_threshold):
+                items2 = hash_map[h2]
+
+                for p1, dev1, ino1 in items1:
+                    for p2, dev2, ino2 in items2:
+                        if not self._is_running:
+                            break
+                        if (dev1, ino1) == (dev2, ino2):
+                            continue
+
+                        # Optimization: Skip pair if BOTH were already verified
+                        if not self.force_full and p1 not in dirty_paths and p2 not in dirty_paths:
+                            continue
+
+                        canonical = frozenset((p1, p2))
+                        if not self._is_running:
+                            break
+                        if canonical not in unique_duplicate_pairs:
+                            if not self.duplicate_cache.is_exception(p1, p2):
+                                sim = int((1.0 - (distance / MAX_DHASH_DISTANCE)) * 100)
+                                res = DuplicateResult(p1, p2, str(h1), False, sim)
+                                found_duplicates.append(res)
+                                unique_duplicate_pairs.add(canonical)
+                                self.duplicate_cache.mark_as_pending(p1, p2, True, similarity=sim)
+
+        self.duplicates_found.emit(found_duplicates)
+        self.detection_finished.emit()