Replace text deduplication with difflib-based similarity matching to reduce false negatives

2025-05-17 02:16:41 +08:00 · 2025-05-17 02:16:41 +08:00 · f9457bf992
commit f9457bf992
parent a8603d4d45
1 changed file with 41 additions and 25 deletions
--- a/ui_interaction.py
+++ b/ui_interaction.py
@@ -20,43 +20,59 @@ import threading # Import threading for Lock if needed, or just use a simple fla
 import math # Added for distance calculation in dual method
 import time # Ensure time is imported for MessageDeduplication
 from simple_bubble_dedup import SimpleBubbleDeduplication
+import difflib # Added for text similarity

 class MessageDeduplication:
    def __init__(self, expiry_seconds=3600):  # 1 hour expiry time
-        self.processed_messages = {}  # {composite_key: timestamp}
+        self.processed_messages = {}  # {message_key: timestamp}
        self.expiry_seconds = expiry_seconds

-    def create_key(self, sender, content):
-        """Create a standardized composite key."""
-        # Thoroughly standardize text - remove all whitespace and punctuation, lowercase
-        clean_content = ''.join(c.lower() for c in content if c.isalnum())
-        clean_sender = ''.join(c.lower() for c in sender if c.isalnum())
-
-        # Truncate content to first 100 chars to prevent overly long keys
-        if len(clean_content) > 100:
-            clean_content = clean_content[:100]
-
-        return f"{clean_sender}:{clean_content}"
-
    def is_duplicate(self, sender, content):
-        """Check if the message is a duplicate within the expiry period."""
+        """Check if the message is a duplicate within the expiry period using text similarity."""
        if not sender or not content:
            return False  # Missing necessary info, treat as new message

-        key = self.create_key(sender, content)
        current_time = time.time()
-
-        # Check if duplicate and not expired
-        if key in self.processed_messages:
-            last_time = self.processed_messages[key]
-            if current_time - last_time < self.expiry_seconds:
-                print(f"Deduplicator: Detected duplicate message: {sender} - {content[:20]}...")
-                return True
-
-        # Update processing time
-        self.processed_messages[key] = current_time
+        
+        # 遍歷所有已處理的消息
+        for key, timestamp in list(self.processed_messages.items()):
+            # 檢查是否過期
+            if current_time - timestamp >= self.expiry_seconds:
+                # 從 processed_messages 中移除過期的項目，避免集合在迭代時改變大小
+                # 但由於我們使用了 list(self.processed_messages.items())，所以這裡可以安全地 continue
+                # 或者，如果希望立即刪除，則需要不同的迭代策略或在 purge_expired 中處理
+                continue # 繼續檢查下一個，過期項目由 purge_expired 處理
+                
+            # 解析之前儲存的發送者和內容
+            stored_sender, stored_content = key.split(":", 1)
+            
+            # 檢查發送者是否相同
+            if sender.lower() == stored_sender.lower():
+                # Calculate text similarity
+                similarity = difflib.SequenceMatcher(None, content, stored_content).ratio()
+                if similarity >= 0.95:  # Use 0.95 as threshold
+                    print(f"Deduplicator: Detected similar message (similarity: {similarity:.2f}): {sender} - {content[:20]}...")
+                    return True
+        
+        # 不是重複消息，儲存它
+        # 注意：這裡儲存的 content 是原始 content，不是 clean_content
+        message_key = f"{sender.lower()}:{content}"
+        self.processed_messages[message_key] = current_time
        return False

+    # create_key 方法已不再需要，可以移除
+    # def create_key(self, sender, content):
+    #     """Create a standardized composite key."""
+    #     # Thoroughly standardize text - remove all whitespace and punctuation, lowercase
+    #     clean_content = ''.join(c.lower() for c in content if c.isalnum())
+    #     clean_sender = ''.join(c.lower() for c in sender if c.isalnum())
+
+    #     # Truncate content to first 100 chars to prevent overly long keys
+    #     if len(clean_content) > 100:
+    #         clean_content = clean_content[:100]
+
+    #     return f"{clean_sender}:{clean_content}"
+
    def purge_expired(self):
        """Remove expired message records."""
        current_time = time.time()