Skip to content

Commit ea39788

Browse files
committed
Fix dedupe_reservoir issues.
When dedupe_reservoir was run, it was not accounting for title_abbrev fields that were already appended with a number. This change compares the field values without any appended numbers, and then appends a new number based on the count. Initially I wanted to just pick up where the count left off. However, it turned out to be possible to generate duplicate titles on subsequent runs, so there was no way to determine what the 'original' duplicate was.
1 parent a17df93 commit ea39788

File tree

5 files changed

+186
-18
lines changed

5 files changed

+186
-18
lines changed

aws_doc_sdk_examples_tools/lliam/__init__.py

Whitespace-only changes.
Lines changed: 62 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
import logging
2+
import re
3+
14
from collections import Counter
25
from dataclasses import replace
3-
import logging
4-
from typing import Dict
6+
from pathlib import Path
7+
from typing import Dict, Iterable, List
58

69
from aws_doc_sdk_examples_tools.doc_gen import DocGen
710
from aws_doc_sdk_examples_tools.lliam.domain.commands import DedupeReservoir
@@ -12,33 +15,75 @@
1215
logger = logging.getLogger(__name__)
1316

1417

15-
def make_title_abbreviation(example: Example, counter: Counter):
18+
def make_abbrev(example: Example, counter: Counter) -> str:
19+
if not example.title_abbrev:
20+
return ""
21+
1622
count = counter[example.title_abbrev]
1723
abbrev = f"{example.title_abbrev} ({count + 1})" if count else example.title_abbrev
1824
counter[example.title_abbrev] += 1
1925
return abbrev
2026

2127

22-
def handle_dedupe_reservoir(cmd: DedupeReservoir, uow: None):
23-
doc_gen = DocGen.from_root(cmd.root, validation=ValidationConfig(check_aws=False))
28+
def reset_abbrev_count(examples: Dict[str, Example]) -> Dict[str, Example]:
29+
"""
30+
Reset all duplicate title abbreviations back to their un-enumerated state.
31+
32+
I don't love this. Ideally we would only update new title_abbrev fields
33+
with the incremented count. But there's no way to know which ones are new
34+
or even which particular title_abbrev is the original.
2435
25-
examples: Dict[str, Example] = {}
36+
Ex.
37+
title_abbrev: some policy
38+
title_abbrev: some policy (2)
39+
title_abbrev: some policy
40+
title_abbrev: some policy
2641
27-
for id, example in doc_gen.examples.items():
28-
if cmd.packages and example.file:
29-
package = example.file.name.split("_metadata.yaml")[0]
30-
if package in cmd.packages:
31-
examples[id] = example
32-
else:
33-
examples[id] = example
42+
Which one is the original? Which ones are new?
43+
"""
3444

35-
title_abbrev_counts: Counter = Counter()
45+
updated_examples = {}
3646

3747
for id, example in examples.items():
38-
examples[id] = replace(
48+
updated_examples[id] = replace(
3949
example,
40-
title_abbrev=make_title_abbreviation(example, title_abbrev_counts),
50+
title_abbrev=re.sub(r"(\s\(\d+\))*$", "", example.title_abbrev or ""),
4151
)
4252

53+
return updated_examples
54+
55+
56+
def example_in_packages(example: Example, packages: List[str]) -> bool:
57+
if packages and example.file:
58+
example_pkg_name = example.file.name.split("_metadata.yaml")[0]
59+
if not example_pkg_name in packages:
60+
return False
61+
return True
62+
63+
64+
def dedupe_examples(
65+
examples: Dict[str, Example], packages: List[str]
66+
) -> Dict[str, Example]:
67+
filtered = {
68+
id: ex for id, ex in examples.items() if example_in_packages(ex, packages)
69+
}
70+
71+
reset_examples = reset_abbrev_count(filtered)
72+
73+
counter = Counter()
74+
75+
return {
76+
id: replace(ex, title_abbrev=make_abbrev(ex, counter))
77+
for id, ex in reset_examples.items()
78+
}
79+
80+
81+
def write_examples(examples: Dict[str, Example], root: Path):
4382
writes = prepare_write(examples)
44-
write_many(cmd.root, writes)
83+
write_many(root, writes)
84+
85+
86+
def handle_dedupe_reservoir(cmd: DedupeReservoir, uow: None):
87+
doc_gen = DocGen.from_root(cmd.root, validation=ValidationConfig(check_aws=False))
88+
examples = dedupe_examples(doc_gen.examples, cmd.packages)
89+
write_examples(examples, cmd.root)

aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
"ailly",
2323
"--max-depth",
2424
"10",
25+
"--no-overwrite",
2526
"--root",
2627
str(AILLY_DIR_PATH),
2728
]
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
from collections import Counter
2+
from pathlib import Path
3+
4+
from aws_doc_sdk_examples_tools.metadata import Example
5+
from aws_doc_sdk_examples_tools.lliam.service_layer.dedupe_reservoir import (
6+
make_abbrev,
7+
example_in_packages,
8+
reset_abbrev_count,
9+
dedupe_examples,
10+
)
11+
12+
13+
def test_make_abbrev_continues_numbering():
14+
"""Test that numbering continues from existing numbered titles."""
15+
counter = Counter({"Some abbrev": 2})
16+
example = Example(id="test", file=Path(), languages={}, title_abbrev="Some abbrev")
17+
result = make_abbrev(example, counter)
18+
19+
assert result == "Some abbrev (3)"
20+
21+
22+
def test_make_abbrev_first_occurrence():
23+
"""Test that first occurrence doesn't get numbered."""
24+
counter = Counter()
25+
example = Example(id="test", file=Path(), languages={}, title_abbrev="New abbrev")
26+
result = make_abbrev(example, counter)
27+
28+
assert result == "New abbrev"
29+
assert counter["New abbrev"] == 1
30+
31+
32+
def test_example_in_packages_no_packages():
33+
"""Test that example is included when no packages specified."""
34+
example = Example(id="test", file=Path("test_metadata.yaml"), languages={})
35+
result = example_in_packages(example, [])
36+
37+
assert result is True
38+
39+
40+
def test_example_in_packages_matching_package():
41+
"""Test that example is included when package matches."""
42+
example = Example(id="test", file=Path("pkg1_metadata.yaml"), languages={})
43+
result = example_in_packages(example, ["pkg1", "pkg2"])
44+
45+
assert result is True
46+
47+
48+
def test_example_in_packages_non_matching_package():
49+
"""Test that example is excluded when package doesn't match."""
50+
example = Example(id="test", file=Path("pkg3_metadata.yaml"), languages={})
51+
result = example_in_packages(example, ["pkg1", "pkg2"])
52+
53+
assert result is False
54+
55+
56+
def test_build_abbrev_counter():
57+
"""Test building counter from examples with existing numbered titles."""
58+
examples = {
59+
"1": Example(id="1", file=Path(), languages={}, title_abbrev="Test (1)"),
60+
"2": Example(id="2", file=Path(), languages={}, title_abbrev="Test (2)"),
61+
"3": Example(id="3", file=Path(), languages={}, title_abbrev="Other"),
62+
"4": Example(id="4", file=Path(), languages={}, title_abbrev="Test"),
63+
}
64+
65+
result = reset_abbrev_count(examples)
66+
67+
assert result["1"].title_abbrev == "Test"
68+
assert result["2"].title_abbrev == "Test"
69+
assert result["3"].title_abbrev == "Other"
70+
assert result["4"].title_abbrev == "Test"
71+
72+
73+
def test_build_abbrev_counter_empty():
74+
"""Test building counter from empty examples list."""
75+
result = reset_abbrev_count({})
76+
77+
assert len(result) == 0
78+
79+
80+
def test_dedupe_examples():
81+
"""Test deduping examples with existing numbered titles."""
82+
examples = {
83+
"ex1": Example(
84+
id="ex1",
85+
file=Path("pkg1_metadata.yaml"),
86+
languages={},
87+
title_abbrev="Test (2) (2)",
88+
),
89+
"ex2": Example(
90+
id="ex2",
91+
file=Path("pkg1_metadata.yaml"),
92+
languages={},
93+
title_abbrev="Test (3) (3) (3)",
94+
),
95+
"ex3": Example(
96+
id="ex3", file=Path("pkg1_metadata.yaml"), languages={}, title_abbrev="Test"
97+
),
98+
"ex4": Example(
99+
id="ex4", file=Path("pkg1_metadata.yaml"), languages={}, title_abbrev="Test"
100+
),
101+
"ex5": Example(
102+
id="ex5", file=Path("pkg1_metadata.yaml"), languages={}, title_abbrev="Test"
103+
),
104+
"ex6": Example(
105+
id="ex6", file=Path("pkg2_metadata.yaml"), languages={}, title_abbrev="Test"
106+
),
107+
}
108+
109+
result = dedupe_examples(examples, [])
110+
111+
assert len(result) == 6
112+
title_abbrevs = sorted(
113+
[ex.title_abbrev for ex in result.values() if ex.title_abbrev]
114+
)
115+
assert title_abbrevs == [
116+
"Test",
117+
"Test (2)",
118+
"Test (3)",
119+
"Test (4)",
120+
"Test (5)",
121+
"Test (6)",
122+
]

aws_doc_sdk_examples_tools/metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ def validate(self, errors: MetadataErrors, root: Path):
139139
@dataclass
140140
class Example:
141141
id: str
142-
file: Optional[Path]
142+
file: Path
143143
languages: Dict[str, Language]
144144
# Human readable title.
145145
title: Optional[str] = field(default="")

0 commit comments

Comments
 (0)