Wolf-Chat-for-Lastwar/simple_bubble_dedup.py
z060142 2ac63718a9 Implement perceptual image hash deduplication for bubble processing
- Added `simple_bubble_dedup.py` module using perceptual hashing (pHash) to detect duplicate chat bubbles based on visual similarity.
- System keeps recent N (default 5) hashes and skips bubbles with Hamming distance below threshold (default 5).
- Integrated into `run_ui_monitoring_loop()`:
  - Hash is computed upon bubble snapshot capture.
  - Duplicate check occurs before message enqueue.
  - Sender info is optionally attached to matching hash entries after successful processing.
- Deduplication state is persisted in `simple_bubble_dedup.json`.
- F7 (`clear_history`) and F8 (`reset_state`) now also clear image-based hash history.
- Removed or commented out legacy `recent_texts` text-based deduplication logic.

This visual deduplication system reduces false negatives caused by slight text variations and ensures higher confidence in skipping repeated bubble interactions.
2025-05-16 02:02:31 +08:00

155 lines
6.4 KiB
Python

import os
import json
import collections
import threading
from PIL import Image
import imagehash
import numpy as np
import io
class SimpleBubbleDeduplication:
    """Detect repeated chat bubbles by comparing perceptual image hashes.

    A bounded, insertion-ordered history of recent bubble hashes is kept in
    ``recent_bubbles`` (oldest first) and mirrored to a JSON file. A bubble
    is reported as a duplicate when the difference between its hash and any
    remembered hash is at most ``threshold``.

    ``is_duplicate`` and ``clear_all`` serialize their work through
    ``self.lock``; the underscore-prefixed helpers do not take the lock
    themselves.
    """

    def __init__(self, storage_file="simple_bubble_dedup.json", max_bubbles=5, threshold=5, hash_size=16):
        """Set up the history and load any previously saved records.

        Args:
            storage_file: Path of the JSON file used to persist the history.
            max_bubbles: Maximum number of recent bubbles to remember.
            threshold: Largest hash difference still treated as a duplicate
                (lower values are stricter).
            hash_size: ``hash_size`` passed to ``imagehash.phash``.
        """
        self.storage_file = storage_file
        self.max_bubbles = max_bubbles
        self.threshold = threshold
        self.hash_size = hash_size
        self.lock = threading.Lock()
        # Insertion order doubles as recency: first item is the oldest.
        self.recent_bubbles = collections.OrderedDict()
        self._load_storage()

    def _load_storage(self):
        """Load persisted bubble hashes from ``storage_file``.

        Does nothing when the file is absent. If the file holds more than
        ``max_bubbles`` records, only the most recent ones (the last ones in
        file order) are kept. Any failure while reading or parsing is
        reported and leaves the history empty.
        """
        if not os.path.exists(self.storage_file):
            return
        try:
            with open(self.storage_file, 'r') as f:
                data = json.load(f)
            self.recent_bubbles.clear()
            entries = list(data.items())
            # Records are saved oldest-first, so the tail holds the newest.
            # max(..., 0) keeps the slice well-defined when the file is
            # shorter than the limit; a non-positive limit loads nothing.
            first_kept = max(len(entries) - self.max_bubbles, 0)
            for bubble_id, bubble_data in entries[first_kept:]:
                self.recent_bubbles[bubble_id] = {
                    'hash': imagehash.hex_to_hash(bubble_data['hash']),
                    'sender': bubble_data.get('sender', 'Unknown'),
                }
            print(f"Loaded {len(self.recent_bubbles)} bubble hash records")
        except Exception as e:
            # Best effort: a corrupt file must not prevent start-up.
            print(f"Failed to load bubble hash records: {e}")
            self.recent_bubbles.clear()

    def _save_storage(self):
        """Write the current history to ``storage_file`` as JSON.

        Hashes are stored via ``str()``; failures are reported, not raised.
        """
        try:
            data_to_save = {
                bubble_id: {
                    'hash': str(bubble_data['hash']),
                    'sender': bubble_data.get('sender', 'Unknown'),
                }
                for bubble_id, bubble_data in self.recent_bubbles.items()
            }
            with open(self.storage_file, 'w') as f:
                json.dump(data_to_save, f, indent=2)
            print(f"Saved {len(data_to_save)} bubble hash records")
        except Exception as e:
            print(f"Failed to save bubble hash records: {e}")

    def compute_image_hash(self, bubble_snapshot):
        """Calculate the perceptual hash (pHash) of a bubble image.

        Args:
            bubble_snapshot: A ``PIL.Image.Image``, any object exposing a
                ``save`` attribute (used as-is), raw ``bytes``, an
                ``io.BytesIO`` or a ``numpy.ndarray``.

        Returns:
            The value of ``imagehash.phash`` for the image, or ``None`` when
            the input type is not recognised or hashing fails.
        """
        try:
            if isinstance(bubble_snapshot, Image.Image):
                img = bubble_snapshot
            elif hasattr(bubble_snapshot, 'save'):
                # Screenshot-like object; handed to imagehash unchanged.
                img = bubble_snapshot
            elif isinstance(bubble_snapshot, bytes):
                img = Image.open(io.BytesIO(bubble_snapshot))
            elif isinstance(bubble_snapshot, io.BytesIO):
                img = Image.open(bubble_snapshot)
            elif isinstance(bubble_snapshot, np.ndarray):
                img = Image.fromarray(bubble_snapshot)
            else:
                print(f"Unrecognized image format: {type(bubble_snapshot)}")
                return None
            return imagehash.phash(img, hash_size=self.hash_size)
        except Exception as e:
            print(f"Failed to calculate image hash: {e}")
            return None

    def generate_bubble_id(self, bubble_region):
        """Build an ID string from the first four items of ``bubble_region``."""
        return f"bubble_{bubble_region[0]}_{bubble_region[1]}_{bubble_region[2]}_{bubble_region[3]}"

    def is_duplicate(self, bubble_snapshot, bubble_region, sender_name=""):
        """Check whether a bubble matches one of the remembered bubbles.

        When no match is found the bubble is recorded as the most recent
        entry, the oldest entries beyond ``max_bubbles`` are dropped, and the
        history is saved.

        Args:
            bubble_snapshot: Image of the bubble (see ``compute_image_hash``).
            bubble_region: Indexable with at least four items; used to build
                the entry's ID.
            sender_name: Optional sender stored alongside the hash.

        Returns:
            ``True`` if a remembered hash differs by at most ``threshold``;
            ``False`` otherwise, including when the snapshot is ``None`` or
            cannot be hashed.
        """
        with self.lock:
            if bubble_snapshot is None:
                return False

            current_hash = self.compute_image_hash(bubble_snapshot)
            if current_hash is None:
                print("Unable to calculate bubble hash, cannot perform deduplication")
                return False

            bubble_id = self.generate_bubble_id(bubble_region)

            for stored_id, bubble_data in self.recent_bubbles.items():
                try:
                    hash_diff = current_hash - bubble_data['hash']
                except TypeError as e:
                    # imagehash raises TypeError for hashes of different
                    # shape (e.g. records saved with another hash_size).
                    # Such records can never match, so skip them instead of
                    # aborting the whole check.
                    print(f"Skipping incomparable bubble hash (ID: {stored_id}): {e}")
                    continue
                if hash_diff <= self.threshold:
                    print(f"Detected duplicate bubble (ID: {stored_id}, Hash difference: {hash_diff})")
                    if sender_name:
                        print(f"Sender: {sender_name}, Recorded sender: {bubble_data.get('sender', 'Unknown')}")
                    return True

            # Not a duplicate: remember it.
            self.recent_bubbles[bubble_id] = {
                'hash': current_hash,
                'sender': sender_name,
            }
            # IDs derive from the region, so a new bubble can reuse an
            # existing key. Plain assignment would leave it at its old
            # position and make it the first to be evicted; move it to the
            # end so it counts as the newest.
            self.recent_bubbles.move_to_end(bubble_id)

            # Clamp the limit at 0 so a negative max_bubbles cannot make
            # popitem() run on an empty dict.
            limit = max(self.max_bubbles, 0)
            while len(self.recent_bubbles) > limit:
                self.recent_bubbles.popitem(last=False)  # drop the oldest

            self._save_storage()
            return False

    def clear_all(self):
        """Remove every remembered bubble, persist the empty history, and
        return how many records were removed."""
        with self.lock:
            count = len(self.recent_bubbles)
            self.recent_bubbles.clear()
            self._save_storage()
            print(f"Cleared all {count} bubble records")
            return count

    def save_debug_image(self, bubble_snapshot, bubble_id, hash_value):
        """Save ``bubble_snapshot`` under ``bubble_debug/`` (optional feature).

        The file is named ``{bubble_id}_{hash_value}.png``. Failures are
        reported, not raised.
        """
        try:
            debug_dir = "bubble_debug"
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test.
            os.makedirs(debug_dir, exist_ok=True)
            img_path = os.path.join(debug_dir, f"{bubble_id}_{hash_value}.png")
            bubble_snapshot.save(img_path)
            print(f"Saved debug image: {img_path}")
        except Exception as e:
            print(f"Failed to save debug image: {e}")