Fix dedupe_reservoir issues.

cpyle0819 · cpyle0819 · commit ea3978889f07 · 2025-08-18T15:15:28.000-04:00
When dedupe_reservoir was run, it did not account for title_abbrev
fields that already had a number appended (e.g. "some policy (2)").
This change compares the field values with any appended numbers
stripped, and then appends a new number based on the count. Initially
I wanted to just pick up where the count left off. However, it turned
out to be possible to generate duplicate titles on subsequent runs, so
there was no way to determine what the 'original' duplicate was.
diff --git a/aws_doc_sdk_examples_tools/lliam/__init__.py b/aws_doc_sdk_examples_tools/lliam/__init__.py
diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py b/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py
@@ -1,7 +1,10 @@
+import logging
+import re
+
 from collections import Counter
 from dataclasses import replace
-import logging
-from typing import Dict
+from pathlib import Path
+from typing import Dict, Iterable, List
 
 from aws_doc_sdk_examples_tools.doc_gen import DocGen
 from aws_doc_sdk_examples_tools.lliam.domain.commands import DedupeReservoir
@@ -12,33 +15,75 @@
 logger = logging.getLogger(__name__)
 
 
-def make_title_abbreviation(example: Example, counter: Counter):
+def make_abbrev(example: Example, counter: Counter) -> str:
+    if not example.title_abbrev:
+        return ""
+
     count = counter[example.title_abbrev]
     abbrev = f"{example.title_abbrev} ({count + 1})" if count else example.title_abbrev
     counter[example.title_abbrev] += 1
     return abbrev
 
 
-def handle_dedupe_reservoir(cmd: DedupeReservoir, uow: None):
-    doc_gen = DocGen.from_root(cmd.root, validation=ValidationConfig(check_aws=False))
+def reset_abbrev_count(examples: Dict[str, Example]) -> Dict[str, Example]:
+    """
+    Reset all duplicate title abbreviations back to their un-enumerated state.
+
+    I don't love this. Ideally we would only update new title_abbrev fields
+    with the incremented count. But there's no way to know which ones are new
+    or even which particular title_abbrev is the original.
 
-    examples: Dict[str, Example] = {}
+    Ex.
+    title_abbrev: some policy
+    title_abbrev: some policy (2)
+    title_abbrev: some policy
+    title_abbrev: some policy
 
-    for id, example in doc_gen.examples.items():
-        if cmd.packages and example.file:
-            package = example.file.name.split("_metadata.yaml")[0]
-            if package in cmd.packages:
-                examples[id] = example
-        else:
-            examples[id] = example
+    Which one is the original? Which ones are new?
+    """
 
-    title_abbrev_counts: Counter = Counter()
+    updated_examples = {}
 
     for id, example in examples.items():
-        examples[id] = replace(
+        updated_examples[id] = replace(
             example,
-            title_abbrev=make_title_abbreviation(example, title_abbrev_counts),
+            title_abbrev=re.sub(r"(\s\(\d+\))*$", "", example.title_abbrev or ""),
         )
 
+    return updated_examples
+
+
+def example_in_packages(example: Example, packages: List[str]) -> bool:
+    if packages and example.file:
+        example_pkg_name = example.file.name.split("_metadata.yaml")[0]
+        if not example_pkg_name in packages:
+            return False
+    return True
+
+
+def dedupe_examples(
+    examples: Dict[str, Example], packages: List[str]
+) -> Dict[str, Example]:
+    filtered = {
+        id: ex for id, ex in examples.items() if example_in_packages(ex, packages)
+    }
+
+    reset_examples = reset_abbrev_count(filtered)
+
+    counter = Counter()
+
+    return {
+        id: replace(ex, title_abbrev=make_abbrev(ex, counter))
+        for id, ex in reset_examples.items()
+    }
+
+
+def write_examples(examples: Dict[str, Example], root: Path):
     writes = prepare_write(examples)
-    write_many(cmd.root, writes)
+    write_many(root, writes)
+
+
+def handle_dedupe_reservoir(cmd: DedupeReservoir, uow: None):
+    doc_gen = DocGen.from_root(cmd.root, validation=ValidationConfig(check_aws=False))
+    examples = dedupe_examples(doc_gen.examples, cmd.packages)
+    write_examples(examples, cmd.root)
diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py b/aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py
@@ -22,6 +22,7 @@
     "ailly",
     "--max-depth",
     "10",
+    "--no-overwrite",
     "--root",
     str(AILLY_DIR_PATH),
 ]
diff --git a/aws_doc_sdk_examples_tools/lliam/test/dedupe_reservoir_test.py b/aws_doc_sdk_examples_tools/lliam/test/dedupe_reservoir_test.py
@@ -0,0 +1,122 @@
+from collections import Counter
+from pathlib import Path
+
+from aws_doc_sdk_examples_tools.metadata import Example
+from aws_doc_sdk_examples_tools.lliam.service_layer.dedupe_reservoir import (
+    make_abbrev,
+    example_in_packages,
+    reset_abbrev_count,
+    dedupe_examples,
+)
+
+
+def test_make_abbrev_continues_numbering():
+    """Test that numbering continues from the count already in the counter."""
+    counter = Counter({"Some abbrev": 2})
+    example = Example(id="test", file=Path(), languages={}, title_abbrev="Some abbrev")
+    result = make_abbrev(example, counter)
+
+    assert result == "Some abbrev (3)"
+
+
+def test_make_abbrev_first_occurrence():
+    """Test that first occurrence doesn't get numbered."""
+    counter = Counter()
+    example = Example(id="test", file=Path(), languages={}, title_abbrev="New abbrev")
+    result = make_abbrev(example, counter)
+
+    assert result == "New abbrev"
+    assert counter["New abbrev"] == 1
+
+
+def test_example_in_packages_no_packages():
+    """Test that example is included when no packages specified."""
+    example = Example(id="test", file=Path("test_metadata.yaml"), languages={})
+    result = example_in_packages(example, [])
+
+    assert result is True
+
+
+def test_example_in_packages_matching_package():
+    """Test that example is included when package matches."""
+    example = Example(id="test", file=Path("pkg1_metadata.yaml"), languages={})
+    result = example_in_packages(example, ["pkg1", "pkg2"])
+
+    assert result is True
+
+
+def test_example_in_packages_non_matching_package():
+    """Test that example is excluded when package doesn't match."""
+    example = Example(id="test", file=Path("pkg3_metadata.yaml"), languages={})
+    result = example_in_packages(example, ["pkg1", "pkg2"])
+
+    assert result is False
+
+
+def test_build_abbrev_counter():
+    """Test that reset_abbrev_count strips numeric suffixes from title_abbrev."""
+    examples = {
+        "1": Example(id="1", file=Path(), languages={}, title_abbrev="Test (1)"),
+        "2": Example(id="2", file=Path(), languages={}, title_abbrev="Test (2)"),
+        "3": Example(id="3", file=Path(), languages={}, title_abbrev="Other"),
+        "4": Example(id="4", file=Path(), languages={}, title_abbrev="Test"),
+    }
+
+    result = reset_abbrev_count(examples)
+
+    assert result["1"].title_abbrev == "Test"
+    assert result["2"].title_abbrev == "Test"
+    assert result["3"].title_abbrev == "Other"
+    assert result["4"].title_abbrev == "Test"
+
+
+def test_build_abbrev_counter_empty():
+    """Test that reset_abbrev_count returns an empty dict for empty input."""
+    result = reset_abbrev_count({})
+
+    assert len(result) == 0
+
+
+def test_dedupe_examples():
+    """Test deduping examples with existing numbered titles."""
+    examples = {
+        "ex1": Example(
+            id="ex1",
+            file=Path("pkg1_metadata.yaml"),
+            languages={},
+            title_abbrev="Test (2) (2)",
+        ),
+        "ex2": Example(
+            id="ex2",
+            file=Path("pkg1_metadata.yaml"),
+            languages={},
+            title_abbrev="Test (3) (3) (3)",
+        ),
+        "ex3": Example(
+            id="ex3", file=Path("pkg1_metadata.yaml"), languages={}, title_abbrev="Test"
+        ),
+        "ex4": Example(
+            id="ex4", file=Path("pkg1_metadata.yaml"), languages={}, title_abbrev="Test"
+        ),
+        "ex5": Example(
+            id="ex5", file=Path("pkg1_metadata.yaml"), languages={}, title_abbrev="Test"
+        ),
+        "ex6": Example(
+            id="ex6", file=Path("pkg2_metadata.yaml"), languages={}, title_abbrev="Test"
+        ),
+    }
+
+    result = dedupe_examples(examples, [])
+
+    assert len(result) == 6
+    title_abbrevs = sorted(
+        [ex.title_abbrev for ex in result.values() if ex.title_abbrev]
+    )
+    assert title_abbrevs == [
+        "Test",
+        "Test (2)",
+        "Test (3)",
+        "Test (4)",
+        "Test (5)",
+        "Test (6)",
+    ]
diff --git a/aws_doc_sdk_examples_tools/metadata.py b/aws_doc_sdk_examples_tools/metadata.py
@@ -139,7 +139,7 @@ def validate(self, errors: MetadataErrors, root: Path):
 @dataclass
 class Example:
     id: str
-    file: Optional[Path]
+    file: Path
     languages: Dict[str, Language]
     # Human readable title.
     title: Optional[str] = field(default="")

Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,7 @@`
`22`	`22`	`"ailly",`
`23`	`23`	`"--max-depth",`
`24`	`24`	`"10",`
	`25`	`+ "--no-overwrite",`
`25`	`26`	`"--root",`
`26`	`27`	`str(AILLY_DIR_PATH),`
`27`	`28`	`]`