Fixed hang with gifs in duplicates form

2026-04-06 23:20:27 +02:00
parent 45c95c1bb1
commit 964974431c
6 changed files with 298 additions and 143 deletions
--- a/duplicatecache.py
+++ b/duplicatecache.py
@@ -24,11 +24,14 @@ from constants import (
 logger = logging.getLogger(__name__)

 # Result structure for duplicate detection
-DuplicateResult = collections.namedtuple('DuplicateResult', ['path1', 'path2', 'hash_value', 'is_exception', 'similarity', 'timestamp'])
+DuplicateResult = collections.namedtuple(
+    'DuplicateResult',
+    ['path1', 'path2', 'hash_value', 'is_exception', 'similarity', 'timestamp'])


 class BKTree:
-    """A Burkhard-Keller tree for efficient similarity searching using Hamming distance."""
+    """A Burkhard-Keller tree for efficient similarity searching using Hamming
+    distance."""
    def __init__(self, distance_func):
        self.distance_func = distance_func
        self.tree = None
@@ -210,7 +213,8 @@ class DuplicateCache(QObject):
                        return None, 0, None

                    with QWriteLocker(self._hash_cache_lock):
-                        self._hash_cache[(dev_id, inode_key_bytes)] = (hash_str, mtime, path_str)
+                        self._hash_cache[(dev_id, inode_key_bytes)] = (
+                            hash_str, mtime, path_str)
                    return hash_str, mtime, path_str
        return None, 0, None

@@ -225,7 +229,8 @@ class DuplicateCache(QObject):
            return hash_value
        return None

-    def add_hash_for_path(self, path, hash_value, mtime, dev_id=None, inode_key_bytes=None):
+    def add_hash_for_path(self,
+                          path, hash_value, mtime, dev_id=None, inode_key_bytes=None):
        if dev_id is None or inode_key_bytes is None:
            dev_id, inode_key_bytes = self._get_inode_info(path)
        if not inode_key_bytes or not self._lmdb_env:
@@ -264,8 +269,10 @@ class DuplicateCache(QObject):

        # Also remove any exceptions involving this path
        if clear_relationships:
-            self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._exceptions_db)
-            self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._pending_db)
+            self._remove_pair_entries_for_path(
+                dev_id, inode_key_bytes, self._exceptions_db)
+            self._remove_pair_entries_for_path(
+                dev_id, inode_key_bytes, self._pending_db)
        return True

    def _get_pair_lmdb_key_from_ids(self, dev1, inode1, dev2, inode2):
@@ -280,7 +287,9 @@ class DuplicateCache(QObject):
            return None
        return self._get_pair_lmdb_key_from_ids(dev1, inode1, dev2, inode2)

-    def mark_as_exception(self, path1, path2, is_exception=True, similarity=None, timestamp=None):
+    def mark_as_exception(self,
+                          path1, path2, is_exception=True, similarity=None,
+                          timestamp=None):
        if not self._lmdb_env:
            return False

@@ -323,8 +332,10 @@ class DuplicateCache(QObject):
            with self._lmdb_env.begin(write=False) as txn:
                return txn.get(exception_key, db=self._exceptions_db) is not None

-    def _remove_pair_entries_for_path(self, target_dev, target_inode, db_handle, txn=None):
-        """Removes all entries involving a specific (dev, inode) pair from a pair-based DB."""
+    def _remove_pair_entries_for_path(self,
+                                      target_dev, target_inode, db_handle, txn=None):
+        """Removes all entries involving a specific (dev, inode) pair from a pair-based
+        DB."""
        if not self._lmdb_env:
            return

@@ -336,8 +347,10 @@ class DuplicateCache(QObject):
            for key_bytes, _ in cursor:
                key_str = key_bytes.decode('utf-8')
                parts = key_str.split('-')
-                if len(parts) < 4: continue
-                dev1, inode1_hex, dev2, inode2_hex = int(parts[0]), parts[1], int(parts[2]), parts[3]
+                if len(parts) < 4:
+                    continue
+                dev1, inode1_hex, dev2, inode2_hex = int(
+                    parts[0]), parts[1], int(parts[2]), parts[3]
                if (dev1 == target_dev and inode1_hex == target_inode_hex) or \
                   (dev2 == target_dev and inode2_hex == target_inode_hex):
                    keys_to_delete.append(key_bytes)
@@ -351,7 +364,8 @@ class DuplicateCache(QObject):
                with self._lmdb_env.begin(write=True) as t:
                    do_remove(t)

-    def mark_as_pending(self, path1, path2, is_pending=True, similarity=None, timestamp=None):
+    def mark_as_pending(self,
+                        path1, path2, is_pending=True, similarity=None, timestamp=None):
        """Marks a pair as pending review."""
        if not self._lmdb_env or self._pending_db is None:
            return False
@@ -392,7 +406,8 @@ class DuplicateCache(QObject):
                        sim = int(parts[2]) if len(parts) > 2 and parts[2] else None
                        ts = int(parts[3]) if len(parts) > 3 else 0
                        if os.path.exists(p1) and os.path.exists(p2):
-                            results.append(DuplicateResult(p1, p2, None, False, sim, ts))
+                            results.append(
+                                DuplicateResult(p1, p2, None, False, sim, ts))
                        else:
                            keys_to_delete.append(key)
                    except Exception:
@@ -404,7 +419,8 @@ class DuplicateCache(QObject):
                    with self._lmdb_env.begin(write=True) as txn:
                        for k in keys_to_delete:
                            txn.delete(k, db=self._pending_db)
-                    logger.info(f"Cleaned up {len(keys_to_delete)} invalid pending duplicates (files deleted externally)")
+                    logger.info(f"Cleaned up {len(keys_to_delete)} invalid "
+                                "pending duplicates (files deleted externally)")
                except Exception as e:
                    logger.error(f"Error cleaning up pending duplicates from DB: {e}")

@@ -436,23 +452,28 @@ class DuplicateCache(QObject):
                                if len(parts) > 3:
                                    ts = int(parts[3])
                                else:
-                                    ts = int(os.path.getmtime(p1)) if os.path.exists(p1) else 0
+                                    ts = int(os.path.getmtime(p1)) \
+                                        if os.path.exists(p1) else 0

                        if not p1 or not p2:
                            # Legacy format fallback: lookup paths in hash db
                            key_str = key_bytes.decode('utf-8')
                            kp = key_str.split('-')
                            if len(kp) == 4:
-                                k1, k2 = f"{kp[0]}-{kp[1]}".encode(), f"{kp[2]}-{kp[3]}".encode()
-                                v1, v2 = txn.get(k1, db=self._hash_db), txn.get(k2, db=self._hash_db)
+                                k1, k2 = (f"{kp[0]}-{kp[1]}".encode(),
+                                          f"{kp[2]}-{kp[3]}".encode())
+                                v1, v2 = txn.get(k1, db=self._hash_db), \
+                                    txn.get(k2, db=self._hash_db)
                                if v1 and v2:
-                                    # Format is hash|mtime|path|dist... path is always index 2
+                                    # Format is hash|mtime|path|dist... path is always
+                                    # index 2
                                    p1 = v1.decode('utf-8').split('|')[2]
                                    p2 = v2.decode('utf-8').split('|')[2]

                        if p1 and p2:
                            if os.path.exists(p1) and os.path.exists(p2):
-                                results.append(DuplicateResult(p1, p2, None, True, sim, ts))
+                                results.append(
+                                    DuplicateResult(p1, p2, None, True, sim, ts))
                    except Exception:
                        continue
        return results
@@ -484,11 +505,13 @@ class DuplicateCache(QObject):
                with self._lmdb_env.begin(write=True) as txn:
                    for k in keys_to_delete:
                        txn.delete(k, db=self._hash_db)
-                logger.info(f"Cleaned up {len(keys_to_delete)} stale hash entries (files deleted externally)")
+                logger.info(f"Cleaned up {len(keys_to_delete)} stale hash "
+                            "entries (files deleted externally)")
        return len(keys_to_delete)

    def get_all_hashes_with_paths(self):
-        """Retrieves all hashes from the database along with their associated paths and inode info."""
+        """Retrieves all hashes from the database along with their associated paths and
+        inode info."""
        # hash_value -> [(path, dev_id, inode_key_bytes)]
        all_hashes = collections.defaultdict(list)
        if not self._lmdb_env:
@@ -527,7 +550,8 @@ class DuplicateCache(QObject):
        if not old_inode_key_bytes or not new_inode_key_bytes or not self._lmdb_env:
            return False

-        # If the (dev, inode) pair is the same, only the path in the value needs updating.
+        # If the (dev, inode) pair is the same, only the path in the value needs
+        # updating.
        # This happens if the file is renamed within the same filesystem.
        if (old_dev, old_inode_key_bytes) == (new_dev, new_inode_key_bytes):
            hash_value, mtime, _ = self.get_hash_and_path(old_dev, old_inode_key_bytes)
@@ -543,8 +567,10 @@ class DuplicateCache(QObject):
        # 3. Add a new entry with the new (dev, inode) and path, using the old hash.
        hash_value, mtime, _ = self.get_hash_and_path(old_dev, old_inode_key_bytes)
        if hash_value:
-            self.remove_hash_for_path(old_path)  # This removes the old (dev, inode) entry
-            self.add_hash_for_path(new_path, hash_value, mtime)  # Adds new (dev, inode) entry
+            # This removes the old (dev, inode) entry
+            self.remove_hash_for_path(old_path)
+            # Adds new (dev, inode) entry
+            self.add_hash_for_path(new_path, hash_value, mtime)
            self._update_pair_paths(old_path, new_path, self._pending_db)
            return True
        return False
@@ -573,7 +599,9 @@ class DuplicateDetector(QThread):
    duplicates_found = Signal(list)  # List of DuplicateResult
    detection_finished = Signal()

-    def __init__(self, paths_to_scan, duplicate_cache, pool_manager, method="histogram_hashing", threshold=90, force_full=False):
+    def __init__(self,
+                 paths_to_scan, duplicate_cache, pool_manager,
+                 method="histogram_hashing", threshold=90, force_full=False):
        super().__init__()
        self.paths_to_scan = paths_to_scan
        self.duplicate_cache = duplicate_cache
@@ -585,17 +613,19 @@ class DuplicateDetector(QThread):

    def stop(self):
        self._is_running = False
-        self.wait() # Add this line
+        self.wait()  # Block until the thread has finished

    def run(self):
        total_files = len(self.paths_to_scan)
        found_duplicates = []
-        unique_duplicate_pairs = set()  # To store frozenset((path1, path2)) for uniqueness
+        # To store frozenset((path1, path2)) for uniqueness
+        unique_duplicate_pairs = set()
        last_update_time = 0

        pool = self.pool_manager.get_pool()

-        # 1. Load existing pending duplicates from cache to avoid recalculation (unless force_full)
+        # 1. Load existing pending duplicates from cache to avoid recalculation (unless
+        # force_full)
        if not self.force_full:
            pending = self.duplicate_cache.get_all_pending_duplicates()
            for p in pending:
@@ -606,7 +636,10 @@ class DuplicateDetector(QThread):

        # Convert similarity threshold (percentage) to Hamming distance
        distance_threshold = int(MAX_DHASH_DISTANCE * (100 - self.threshold) / 100)
-        logger.info(f"Duplicate detection: Method={self.method}, Similarity Threshold={self.threshold}%, Hamming Distance Threshold={distance_threshold}")
+        logger.info(
+            f"Duplicate detection: Method={self.method}, "
+            f"Similarity Threshold={self.threshold}%, Hamming "
+            f"Distance Threshold={distance_threshold}")

        # 2. Phase 1: Hash Collection (Parallelized)
        path_to_hash = {}
@@ -645,7 +678,8 @@ class DuplicateDetector(QThread):
                    break
                current_batch = paths_to_hash_parallel[i : i + batch_size]
                for p_data in current_batch:
-                    pool.start(HashWorker(p_data[0], self, new_hashes, results_mutex, sem))
+                    pool.start(HashWorker(
+                        p_data[0], self, new_hashes, results_mutex, sem))

                for _ in range(len(current_batch)):
                    while not sem.tryAcquire(1, 100):
@@ -655,7 +689,9 @@ class DuplicateDetector(QThread):
                        break
                    processed_hashing += 1
                    if time.perf_counter() - last_update_time > 0.05:
-                        self.progress_update.emit(processed_hashing, total_files * 2, UITexts.DUPLICATE_MSG_HASHING.format(filename="..."))
+                        self.progress_update.emit(
+                            processed_hashing, total_files * 2,
+                            UITexts.DUPLICATE_MSG_HASHING.format(filename="..."))
                        last_update_time = time.perf_counter()

            for p, mtime, dev, inode in paths_to_hash_parallel:
@@ -670,7 +706,9 @@ class DuplicateDetector(QThread):
            return

        # Signal phase transition to exactly 50%
-        self.progress_update.emit(total_files, total_files * 2, UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
+        self.progress_update.emit(
+            total_files, total_files * 2,
+            UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))

        # 3. Phase 2: Comparison (Optimized with BK-Tree)
        hash_map = collections.defaultdict(list)
@@ -684,9 +722,12 @@ class DuplicateDetector(QThread):
            if self.force_full or p in dirty_paths:
                dirty_hashes_objs.add(h_obj)

-        # Optimization: Only query the tree for hashes associated with new or modified files.
-        # This finds pairs (Dirty, Clean) and (Dirty, Dirty). (Clean, Clean) were handled in previous runs.
-        hashes_to_query = list(dirty_hashes_objs) if not self.force_full else list(hash_map.keys())
+        # Optimization: Only query the tree for hashes associated with new or modified
+        # files.
+        # This finds pairs (Dirty, Clean) and (Dirty, Dirty). (Clean, Clean) were
+        # handled in previous runs.
+        hashes_to_query = list(dirty_hashes_objs) \
+            if not self.force_full else list(hash_map.keys())
        total_queries = len(hashes_to_query)

        for i, h1 in enumerate(hashes_to_query):
@@ -697,8 +738,11 @@ class DuplicateDetector(QThread):

            if time.perf_counter() - last_update_time > 0.1:
                # Scale Phase 2 progress to the 50%-100% range
-                phase2_progress = int(((i + 1) / total_queries) * total_files) if total_queries > 0 else total_files
-                self.progress_update.emit(total_files + phase2_progress, total_files * 2, UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
+                phase2_progress = int(((i + 1) / total_queries) * total_files) \
+                    if total_queries > 0 else total_files
+                self.progress_update.emit(
+                    total_files + phase2_progress, total_files * 2,
+                    UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
                last_update_time = time.perf_counter()

            # Query tree for similar hashes
@@ -713,7 +757,8 @@ class DuplicateDetector(QThread):
                            continue

                        # Optimization: Skip pair if BOTH were already verified
-                        if not self.force_full and p1 not in dirty_paths and p2 not in dirty_paths:
+                        if not self.force_full \
+                           and p1 not in dirty_paths and p2 not in dirty_paths:
                            continue

                        canonical = frozenset((p1, p2))
@@ -726,7 +771,8 @@ class DuplicateDetector(QThread):
                                res = DuplicateResult(p1, p2, str(h1), False, sim, ts)
                                found_duplicates.append(res)
                                unique_duplicate_pairs.add(canonical)
-                                self.duplicate_cache.mark_as_pending(p1, p2, True, similarity=sim, timestamp=ts)
+                                self.duplicate_cache.mark_as_pending(
+                                    p1, p2, True, similarity=sim, timestamp=ts)

        self.duplicates_found.emit(found_duplicates)
        self.detection_finished.emit()