Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 100 additions & 10 deletions scripts/lib/dedupe.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,81 @@
"""Near-duplicate detection for last30days skill."""

import re
from typing import List, Set, Tuple, Union
from typing import List, Optional, Set, Tuple, Union
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

from . import schema


# Query parameters injected by analytics/tracking tools; stripped during URL
# canonicalization so the same article shared via different channels compares
# equal. Matched case-insensitively (see canonicalize_url).
_TRACKING_PARAMS = frozenset({
    # Google Analytics / UTM
    'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
    'utm_id', 'utm_source_platform', 'utm_creative_format',
    # Facebook / Meta
    'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_ref', 'fb_source',
    # Twitter / X
    'ref_src', 'ref_url',
    # Reddit
    'share_id', 'ref_source',
    # General trackers
    'gclid', 'gclsrc', 'dclid', 'msclkid', 'mc_cid', 'mc_eid',
    'igshid', '_ga', '_gl', 'yclid', 'twclid',
    # Misc
    'gi',
})


def canonicalize_url(url: Optional[str]) -> Optional[str]:
    """Canonicalize a URL for deduplication.

    - Lowercases scheme and host
    - Strips www. prefix
    - Removes tracking/analytics query parameters (sorts the rest)
    - Removes trailing slashes from path (root stays "/")
    - Removes fragments
    - Preserves non-default ports

    Args:
        url: Raw URL string; may be None or blank.

    Returns:
        The canonical URL string, or None for empty, malformed,
        or non-HTTP(S) URLs.
    """
    if not url or not url.strip():
        return None

    try:
        parsed = urlparse(url.strip())
        # .hostname and .port are computed lazily and raise ValueError on
        # malformed netlocs (e.g. "http://host:notaport/"). Evaluating them
        # inside the try makes such inputs count as invalid (-> None)
        # instead of propagating an exception to the caller.
        host = (parsed.hostname or '').lower()
        port = parsed.port
    except ValueError:
        return None

    if not parsed.scheme or not parsed.netloc:
        return None

    scheme = parsed.scheme.lower()
    if scheme not in ('http', 'https'):
        return None

    if host.startswith('www.'):
        host = host[4:]

    # Keep the port only when it differs from the scheme's default.
    netloc = host
    if port and not (scheme == 'http' and port == 80) and not (scheme == 'https' and port == 443):
        netloc = f"{host}:{port}"

    # Strip trailing slashes from path (but keep "/" for root)
    path = parsed.path.rstrip('/') or '/'

    # Filter out tracking params; sort survivors so param order never
    # affects equality of canonical forms.
    query_params = parse_qs(parsed.query, keep_blank_values=True)
    clean_params = {
        k: v for k, v in query_params.items()
        if k.lower() not in _TRACKING_PARAMS
    }
    query = urlencode(sorted(clean_params.items()), doseq=True) if clean_params else ''

    # Drop fragment entirely
    return urlunparse((scheme, netloc, path, '', query, ''))

# Stopwords for token-based Jaccard (cross-source linking)
STOPWORDS = frozenset({
'the', 'a', 'an', 'to', 'for', 'how', 'is', 'in', 'of', 'on',
Expand Down Expand Up @@ -69,6 +140,14 @@ def get_item_text(item: AnyItem) -> str:
return item.text


def _get_item_url(item: AnyItem) -> Optional[str]:
    """Return the canonicalized article URL for an item, or None if absent.

    HackerNews items carry both ``url`` (the article) and ``hn_url``
    (the discussion thread); the article URL is the one used for
    cross-source matching.
    """
    return canonicalize_url(getattr(item, 'url', None))


def _get_cross_source_text(item: AnyItem) -> str:
"""Get text for cross-source comparison.

Expand Down Expand Up @@ -256,9 +335,10 @@ def cross_source_link(
) -> None:
"""Annotate items with cross-source references.

Compares items across different source types using hybrid similarity
(max of char-trigram Jaccard and token Jaccard). When similarity exceeds
threshold, adds bidirectional cross_refs with the related item's ID.
First checks URL identity (via canonicalization) for exact matches,
then falls back to hybrid text similarity (max of char-trigram Jaccard
and token Jaccard). When similarity exceeds threshold, adds
bidirectional cross_refs with the related item's ID.
Modifies items in-place.

Args:
Expand All @@ -272,19 +352,29 @@ def cross_source_link(
if len(all_items) <= 1:
return

# Pre-compute cross-source text for each item
# Pre-compute canonical URLs and cross-source text for each item
urls = [_get_item_url(item) for item in all_items]
texts = [_get_cross_source_text(item) for item in all_items]

def _add_xref(i: int, j: int) -> None:
"""Add bidirectional cross-reference between items i and j."""
if all_items[j].id not in all_items[i].cross_refs:
all_items[i].cross_refs.append(all_items[j].id)
if all_items[i].id not in all_items[j].cross_refs:
all_items[j].cross_refs.append(all_items[i].id)

for i in range(len(all_items)):
for j in range(i + 1, len(all_items)):
# Skip same-source comparisons (handled by per-source dedupe)
if type(all_items[i]) is type(all_items[j]):
continue

# Fast path: exact URL match after canonicalization
if urls[i] and urls[j] and urls[i] == urls[j]:
_add_xref(i, j)
continue

# Slow path: text similarity
similarity = _hybrid_similarity(texts[i], texts[j])
if similarity >= threshold:
# Bidirectional cross-reference
if all_items[j].id not in all_items[i].cross_refs:
all_items[i].cross_refs.append(all_items[j].id)
if all_items[i].id not in all_items[j].cross_refs:
all_items[j].cross_refs.append(all_items[i].id)
_add_xref(i, j)