Fixed hang with gifs in duplicates form

This commit is contained in:
Ignacio Serantes
2026-04-06 23:20:27 +02:00
parent 45c95c1bb1
commit 964974431c
6 changed files with 298 additions and 143 deletions

View File

@@ -24,11 +24,14 @@ from constants import (
logger = logging.getLogger(__name__)
# Result structure for duplicate detection
DuplicateResult = collections.namedtuple('DuplicateResult', ['path1', 'path2', 'hash_value', 'is_exception', 'similarity', 'timestamp'])
DuplicateResult = collections.namedtuple(
'DuplicateResult',
['path1', 'path2', 'hash_value', 'is_exception', 'similarity', 'timestamp'])
class BKTree:
"""A Burkhard-Keller tree for efficient similarity searching using Hamming distance."""
"""A Burkhard-Keller tree for efficient similarity searching using Hamming
distance."""
def __init__(self, distance_func):
self.distance_func = distance_func
self.tree = None
@@ -210,7 +213,8 @@ class DuplicateCache(QObject):
return None, 0, None
with QWriteLocker(self._hash_cache_lock):
self._hash_cache[(dev_id, inode_key_bytes)] = (hash_str, mtime, path_str)
self._hash_cache[(dev_id, inode_key_bytes)] = (
hash_str, mtime, path_str)
return hash_str, mtime, path_str
return None, 0, None
@@ -225,7 +229,8 @@ class DuplicateCache(QObject):
return hash_value
return None
def add_hash_for_path(self, path, hash_value, mtime, dev_id=None, inode_key_bytes=None):
def add_hash_for_path(self,
path, hash_value, mtime, dev_id=None, inode_key_bytes=None):
if dev_id is None or inode_key_bytes is None:
dev_id, inode_key_bytes = self._get_inode_info(path)
if not inode_key_bytes or not self._lmdb_env:
@@ -264,8 +269,10 @@ class DuplicateCache(QObject):
# Also remove any exceptions involving this path
if clear_relationships:
self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._exceptions_db)
self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._pending_db)
self._remove_pair_entries_for_path(
dev_id, inode_key_bytes, self._exceptions_db)
self._remove_pair_entries_for_path(
dev_id, inode_key_bytes, self._pending_db)
return True
def _get_pair_lmdb_key_from_ids(self, dev1, inode1, dev2, inode2):
@@ -280,7 +287,9 @@ class DuplicateCache(QObject):
return None
return self._get_pair_lmdb_key_from_ids(dev1, inode1, dev2, inode2)
def mark_as_exception(self, path1, path2, is_exception=True, similarity=None, timestamp=None):
def mark_as_exception(self,
path1, path2, is_exception=True, similarity=None,
timestamp=None):
if not self._lmdb_env:
return False
@@ -323,8 +332,10 @@ class DuplicateCache(QObject):
with self._lmdb_env.begin(write=False) as txn:
return txn.get(exception_key, db=self._exceptions_db) is not None
def _remove_pair_entries_for_path(self, target_dev, target_inode, db_handle, txn=None):
"""Removes all entries involving a specific (dev, inode) pair from a pair-based DB."""
def _remove_pair_entries_for_path(self,
target_dev, target_inode, db_handle, txn=None):
"""Removes all entries involving a specific (dev, inode) pair from a pair-based
DB."""
if not self._lmdb_env:
return
@@ -336,8 +347,10 @@ class DuplicateCache(QObject):
for key_bytes, _ in cursor:
key_str = key_bytes.decode('utf-8')
parts = key_str.split('-')
if len(parts) < 4: continue
dev1, inode1_hex, dev2, inode2_hex = int(parts[0]), parts[1], int(parts[2]), parts[3]
if len(parts) < 4:
continue
dev1, inode1_hex, dev2, inode2_hex = int(
parts[0]), parts[1], int(parts[2]), parts[3]
if (dev1 == target_dev and inode1_hex == target_inode_hex) or \
(dev2 == target_dev and inode2_hex == target_inode_hex):
keys_to_delete.append(key_bytes)
@@ -351,7 +364,8 @@ class DuplicateCache(QObject):
with self._lmdb_env.begin(write=True) as t:
do_remove(t)
def mark_as_pending(self, path1, path2, is_pending=True, similarity=None, timestamp=None):
def mark_as_pending(self,
path1, path2, is_pending=True, similarity=None, timestamp=None):
"""Marks a pair as pending review."""
if not self._lmdb_env or self._pending_db is None:
return False
@@ -392,7 +406,8 @@ class DuplicateCache(QObject):
sim = int(parts[2]) if len(parts) > 2 and parts[2] else None
ts = int(parts[3]) if len(parts) > 3 else 0
if os.path.exists(p1) and os.path.exists(p2):
results.append(DuplicateResult(p1, p2, None, False, sim, ts))
results.append(
DuplicateResult(p1, p2, None, False, sim, ts))
else:
keys_to_delete.append(key)
except Exception:
@@ -404,7 +419,8 @@ class DuplicateCache(QObject):
with self._lmdb_env.begin(write=True) as txn:
for k in keys_to_delete:
txn.delete(k, db=self._pending_db)
logger.info(f"Cleaned up {len(keys_to_delete)} invalid pending duplicates (files deleted externally)")
logger.info(f"Cleaned up {len(keys_to_delete)} invalid "
"pending duplicates (files deleted externally)")
except Exception as e:
logger.error(f"Error cleaning up pending duplicates from DB: {e}")
@@ -436,23 +452,28 @@ class DuplicateCache(QObject):
if len(parts) > 3:
ts = int(parts[3])
else:
ts = int(os.path.getmtime(p1)) if os.path.exists(p1) else 0
ts = int(os.path.getmtime(p1)) \
if os.path.exists(p1) else 0
if not p1 or not p2:
# Legacy format fallback: lookup paths in hash db
key_str = key_bytes.decode('utf-8')
kp = key_str.split('-')
if len(kp) == 4:
k1, k2 = f"{kp[0]}-{kp[1]}".encode(), f"{kp[2]}-{kp[3]}".encode()
v1, v2 = txn.get(k1, db=self._hash_db), txn.get(k2, db=self._hash_db)
# Split the pair key into the two per-file keys used to look up the hash DB.
# Assigned one per statement: ending the first value with a bare trailing
# comma would turn the right-hand side into a 1-tuple and break the
# two-name unpacking.
k1 = f"{kp[0]}-{kp[1]}".encode()
k2 = f"{kp[2]}-{kp[3]}".encode()
v1, v2 = txn.get(k1, db=self._hash_db), \
txn.get(k2, db=self._hash_db)
if v1 and v2:
# Format is hash|mtime|path|dist... path is always index 2
# Format is hash|mtime|path|dist... path is always
# index 2
p1 = v1.decode('utf-8').split('|')[2]
p2 = v2.decode('utf-8').split('|')[2]
if p1 and p2:
if os.path.exists(p1) and os.path.exists(p2):
results.append(DuplicateResult(p1, p2, None, True, sim, ts))
results.append(
DuplicateResult(p1, p2, None, True, sim, ts))
except Exception:
continue
return results
@@ -484,11 +505,13 @@ class DuplicateCache(QObject):
with self._lmdb_env.begin(write=True) as txn:
for k in keys_to_delete:
txn.delete(k, db=self._hash_db)
logger.info(f"Cleaned up {len(keys_to_delete)} stale hash entries (files deleted externally)")
logger.info(f"Cleaned up {len(keys_to_delete)} stale hash "
"entries (files deleted externally)")
return len(keys_to_delete)
def get_all_hashes_with_paths(self):
"""Retrieves all hashes from the database along with their associated paths and inode info."""
"""Retrieves all hashes from the database along with their associated paths and
inode info."""
# hash_value -> [(path, dev_id, inode_key_bytes)]
all_hashes = collections.defaultdict(list)
if not self._lmdb_env:
@@ -527,7 +550,8 @@ class DuplicateCache(QObject):
if not old_inode_key_bytes or not new_inode_key_bytes or not self._lmdb_env:
return False
# If the (dev, inode) pair is the same, only the path in the value needs updating.
# If the (dev, inode) pair is the same, only the path in the value needs
# updating.
# This happens if the file is renamed within the same filesystem.
if (old_dev, old_inode_key_bytes) == (new_dev, new_inode_key_bytes):
hash_value, mtime, _ = self.get_hash_and_path(old_dev, old_inode_key_bytes)
@@ -543,8 +567,10 @@ class DuplicateCache(QObject):
# 3. Add a new entry with the new (dev, inode) and path, using the old hash.
hash_value, mtime, _ = self.get_hash_and_path(old_dev, old_inode_key_bytes)
if hash_value:
self.remove_hash_for_path(old_path) # This removes the old (dev, inode) entry
self.add_hash_for_path(new_path, hash_value, mtime) # Adds new (dev, inode) entry
# This removes the old (dev, inode) entry
self.remove_hash_for_path(old_path)
# Adds new (dev, inode) entry
self.add_hash_for_path(new_path, hash_value, mtime)
self._update_pair_paths(old_path, new_path, self._pending_db)
return True
return False
@@ -573,7 +599,9 @@ class DuplicateDetector(QThread):
duplicates_found = Signal(list) # List of DuplicateResult
detection_finished = Signal()
def __init__(self, paths_to_scan, duplicate_cache, pool_manager, method="histogram_hashing", threshold=90, force_full=False):
def __init__(self,
paths_to_scan, duplicate_cache, pool_manager,
method="histogram_hashing", threshold=90, force_full=False):
super().__init__()
self.paths_to_scan = paths_to_scan
self.duplicate_cache = duplicate_cache
@@ -585,17 +613,19 @@ class DuplicateDetector(QThread):
def stop(self):
self._is_running = False
self.wait() # Add this line
self.wait()  # Block until the worker thread has finished running
def run(self):
total_files = len(self.paths_to_scan)
found_duplicates = []
unique_duplicate_pairs = set() # To store frozenset((path1, path2)) for uniqueness
# To store frozenset((path1, path2)) for uniqueness
unique_duplicate_pairs = set()
last_update_time = 0
pool = self.pool_manager.get_pool()
# 1. Load existing pending duplicates from cache to avoid recalculation (unless force_full)
# 1. Load existing pending duplicates from cache to avoid recalculation (unless
# force_full)
if not self.force_full:
pending = self.duplicate_cache.get_all_pending_duplicates()
for p in pending:
@@ -606,7 +636,10 @@ class DuplicateDetector(QThread):
# Convert similarity threshold (percentage) to Hamming distance
distance_threshold = int(MAX_DHASH_DISTANCE * (100 - self.threshold) / 100)
logger.info(f"Duplicate detection: Method={self.method}, Similarity Threshold={self.threshold}%, Hamming Distance Threshold={distance_threshold}")
logger.info(
f"Duplicate detection: Method={self.method}, "
f"Similarity Threshold={self.threshold}%, Hamming "
f"Distance Threshold={distance_threshold}")
# 2. Phase 1: Hash Collection (Parallelized)
path_to_hash = {}
@@ -645,7 +678,8 @@ class DuplicateDetector(QThread):
break
current_batch = paths_to_hash_parallel[i : i + batch_size]
for p_data in current_batch:
pool.start(HashWorker(p_data[0], self, new_hashes, results_mutex, sem))
pool.start(HashWorker(
p_data[0], self, new_hashes, results_mutex, sem))
for _ in range(len(current_batch)):
while not sem.tryAcquire(1, 100):
@@ -655,7 +689,9 @@ class DuplicateDetector(QThread):
break
processed_hashing += 1
if time.perf_counter() - last_update_time > 0.05:
self.progress_update.emit(processed_hashing, total_files * 2, UITexts.DUPLICATE_MSG_HASHING.format(filename="..."))
self.progress_update.emit(
processed_hashing, total_files * 2,
UITexts.DUPLICATE_MSG_HASHING.format(filename="..."))
last_update_time = time.perf_counter()
for p, mtime, dev, inode in paths_to_hash_parallel:
@@ -670,7 +706,9 @@ class DuplicateDetector(QThread):
return
# Signal phase transition to exactly 50%
self.progress_update.emit(total_files, total_files * 2, UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
self.progress_update.emit(
total_files, total_files * 2,
UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
# 3. Phase 2: Comparison (Optimized with BK-Tree)
hash_map = collections.defaultdict(list)
@@ -684,9 +722,12 @@ class DuplicateDetector(QThread):
if self.force_full or p in dirty_paths:
dirty_hashes_objs.add(h_obj)
# Optimization: Only query the tree for hashes associated with new or modified files.
# This finds pairs (Dirty, Clean) and (Dirty, Dirty). (Clean, Clean) were handled in previous runs.
hashes_to_query = list(dirty_hashes_objs) if not self.force_full else list(hash_map.keys())
# Optimization: Only query the tree for hashes associated with new or modified
# files.
# This finds pairs (Dirty, Clean) and (Dirty, Dirty). (Clean, Clean) were
# handled in previous runs.
hashes_to_query = list(dirty_hashes_objs) \
if not self.force_full else list(hash_map.keys())
total_queries = len(hashes_to_query)
for i, h1 in enumerate(hashes_to_query):
@@ -697,8 +738,11 @@ class DuplicateDetector(QThread):
if time.perf_counter() - last_update_time > 0.1:
# Scale Phase 2 progress to the 50%-100% range
phase2_progress = int(((i + 1) / total_queries) * total_files) if total_queries > 0 else total_files
self.progress_update.emit(total_files + phase2_progress, total_files * 2, UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
phase2_progress = int(((i + 1) / total_queries) * total_files) \
if total_queries > 0 else total_files
self.progress_update.emit(
total_files + phase2_progress, total_files * 2,
UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
last_update_time = time.perf_counter()
# Query tree for similar hashes
@@ -713,7 +757,8 @@ class DuplicateDetector(QThread):
continue
# Optimization: Skip pair if BOTH were already verified
if not self.force_full and p1 not in dirty_paths and p2 not in dirty_paths:
if not self.force_full \
and p1 not in dirty_paths and p2 not in dirty_paths:
continue
canonical = frozenset((p1, p2))
@@ -726,7 +771,8 @@ class DuplicateDetector(QThread):
res = DuplicateResult(p1, p2, str(h1), False, sim, ts)
found_duplicates.append(res)
unique_duplicate_pairs.add(canonical)
self.duplicate_cache.mark_as_pending(p1, p2, True, similarity=sim, timestamp=ts)
self.duplicate_cache.mark_as_pending(
p1, p2, True, similarity=sim, timestamp=ts)
self.duplicates_found.emit(found_duplicates)
self.detection_finished.emit()