Skip to content

Commit 3149575

Browse files
committed
Made page-save HTML formatting much more efficient
Replaced the existing xpath-heavy system with a more manual traversal approach. Fixes following slow areas of old system: - Old system would repeat ID-setting action for elements (Headers could be processed up to three times). - Old system had a few very open xpath queries for headers. - Old system would update links on every ID change, which triggers it's own xpath query for links, leading to exponential scaling issues. New system only does one xpath query for links when changes are needed. Added test to cover. For #3932
1 parent c803961 commit 3149575

File tree

2 files changed

+73
-45
lines changed

2 files changed

+73
-45
lines changed

app/Entities/Tools/PageContent.php

Lines changed: 54 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -19,20 +19,15 @@
1919

2020
class PageContent
2121
{
22-
protected Page $page;
23-
24-
/**
25-
* PageContent constructor.
26-
*/
27-
public function __construct(Page $page)
28-
{
29-
$this->page = $page;
22+
public function __construct(
23+
protected Page $page
24+
) {
3025
}
3126

3227
/**
3328
* Update the content of the page with new provided HTML.
3429
*/
35-
public function setNewHTML(string $html)
30+
public function setNewHTML(string $html): void
3631
{
3732
$html = $this->extractBase64ImagesFromHtml($html);
3833
$this->page->html = $this->formatHtml($html);
@@ -43,7 +38,7 @@ public function setNewHTML(string $html)
4338
/**
4439
* Update the content of the page with new provided Markdown content.
4540
*/
46-
public function setNewMarkdown(string $markdown)
41+
public function setNewMarkdown(string $markdown): void
4742
{
4843
$markdown = $this->extractBase64ImagesFromMarkdown($markdown);
4944
$this->page->markdown = $markdown;
@@ -57,7 +52,7 @@ public function setNewMarkdown(string $markdown)
5752
*/
5853
protected function extractBase64ImagesFromHtml(string $htmlText): string
5954
{
60-
if (empty($htmlText) || strpos($htmlText, 'data:image') === false) {
55+
if (empty($htmlText) || !str_contains($htmlText, 'data:image')) {
6156
return $htmlText;
6257
}
6358

@@ -91,7 +86,7 @@ protected function extractBase64ImagesFromHtml(string $htmlText): string
9186
* Attempting to capture the whole data uri using regex can cause PHP
9287
* PCRE limits to be hit with larger, multi-MB, files.
9388
*/
94-
protected function extractBase64ImagesFromMarkdown(string $markdown)
89+
protected function extractBase64ImagesFromMarkdown(string $markdown): string
9590
{
9691
$matches = [];
9792
$contentLength = strlen($markdown);
@@ -183,32 +178,13 @@ protected function formatHtml(string $htmlText): string
183178
$childNodes = $body->childNodes;
184179
$xPath = new DOMXPath($doc);
185180

186-
// Set ids on top-level nodes
181+
// Map to hold used ID references
187182
$idMap = [];
188-
foreach ($childNodes as $index => $childNode) {
189-
[$oldId, $newId] = $this->setUniqueId($childNode, $idMap);
190-
if ($newId && $newId !== $oldId) {
191-
$this->updateLinks($xPath, '#' . $oldId, '#' . $newId);
192-
}
193-
}
183+
// Map to hold changing ID references
184+
$changeMap = [];
194185

195-
// Set ids on nested header nodes
196-
$nestedHeaders = $xPath->query('//body//*//h1|//body//*//h2|//body//*//h3|//body//*//h4|//body//*//h5|//body//*//h6');
197-
foreach ($nestedHeaders as $nestedHeader) {
198-
[$oldId, $newId] = $this->setUniqueId($nestedHeader, $idMap);
199-
if ($newId && $newId !== $oldId) {
200-
$this->updateLinks($xPath, '#' . $oldId, '#' . $newId);
201-
}
202-
}
203-
204-
// Ensure no duplicate ids within child items
205-
$idElems = $xPath->query('//body//*//*[@id]');
206-
foreach ($idElems as $domElem) {
207-
[$oldId, $newId] = $this->setUniqueId($domElem, $idMap);
208-
if ($newId && $newId !== $oldId) {
209-
$this->updateLinks($xPath, '#' . $oldId, '#' . $newId);
210-
}
211-
}
186+
$this->updateIdsRecursively($body, 0, $idMap, $changeMap);
187+
$this->updateLinks($xPath, $changeMap);
212188

213189
// Generate inner html as a string
214190
$html = '';
@@ -223,20 +199,53 @@ protected function formatHtml(string $htmlText): string
223199
}
224200

225201
/**
226-
* Update the all links to the $old location to instead point to $new.
202+
* For the given DOMNode, traverse its children recursively and update IDs
203+
* where required (Top-level, headers & elements with IDs).
204+
* Will update the provided $changeMap array with changes made, where keys are the old
205+
* ids and the corresponding values are the new ids.
206+
*/
207+
protected function updateIdsRecursively(DOMNode $element, int $depth, array &$idMap, array &$changeMap): void
208+
{
209+
/* @var DOMNode $child */
210+
foreach ($element->childNodes as $child) {
211+
if ($child instanceof DOMElement && ($depth === 0 || in_array($child->nodeName, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) || $child->getAttribute('id'))) {
212+
[$oldId, $newId] = $this->setUniqueId($child, $idMap);
213+
if ($newId && $newId !== $oldId && !isset($idMap[$oldId])) {
214+
$changeMap[$oldId] = $newId;
215+
}
216+
}
217+
218+
if ($child->hasChildNodes()) {
219+
$this->updateIdsRecursively($child, $depth + 1, $idMap, $changeMap);
220+
}
221+
}
222+
}
223+
224+
/**
225+
* Update the all links in the given xpath to apply requires changes within the
226+
* given $changeMap array.
227227
*/
228-
protected function updateLinks(DOMXPath $xpath, string $old, string $new)
228+
protected function updateLinks(DOMXPath $xpath, array $changeMap): void
229229
{
230-
$old = str_replace('"', '', $old);
231-
$matchingLinks = $xpath->query('//body//*//*[@href="' . $old . '"]');
232-
foreach ($matchingLinks as $domElem) {
233-
$domElem->setAttribute('href', $new);
230+
if (empty($changeMap)) {
231+
return;
232+
}
233+
234+
$links = $xpath->query('//body//*//*[@href]');
235+
/** @var DOMElement $domElem */
236+
foreach ($links as $domElem) {
237+
$href = ltrim($domElem->getAttribute('href'), '#');
238+
$newHref = $changeMap[$href] ?? null;
239+
if ($newHref) {
240+
$domElem->setAttribute('href', '#' . $newHref);
241+
}
234242
}
235243
}
236244

237245
/**
238246
* Set a unique id on the given DOMElement.
239-
* A map for existing ID's should be passed in to check for current existence.
247+
* A map for existing ID's should be passed in to check for current existence,
248+
* and this will be updated with any new IDs set upon elements.
240249
* Returns a pair of strings in the format [old_id, new_id].
241250
*/
242251
protected function setUniqueId(DOMNode $element, array &$idMap): array
@@ -247,7 +256,7 @@ protected function setUniqueId(DOMNode $element, array &$idMap): array
247256

248257
// Stop if there's an existing valid id that has not already been used.
249258
$existingId = $element->getAttribute('id');
250-
if (strpos($existingId, 'bkmrk') === 0 && !isset($idMap[$existingId])) {
259+
if (str_starts_with($existingId, 'bkmrk') && !isset($idMap[$existingId])) {
251260
$idMap[$existingId] = true;
252261

253262
return [$existingId, $existingId];
@@ -258,7 +267,7 @@ protected function setUniqueId(DOMNode $element, array &$idMap): array
258267
// the same content is passed through.
259268
$contentId = 'bkmrk-' . mb_substr(strtolower(preg_replace('/\s+/', '-', trim($element->nodeValue))), 0, 20);
260269
$newId = urlencode($contentId);
261-
$loopIndex = 0;
270+
$loopIndex = 1;
262271

263272
while (isset($idMap[$newId])) {
264273
$newId = urlencode($contentId . '-' . $loopIndex);

tests/Entity/PageContentTest.php

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,4 +732,23 @@ public function test_non_breaking_spaces_are_preserved()
732732

733733
$this->assertStringContainsString('<p id="bkmrk-%C2%A0">&nbsp;</p>', $page->refresh()->html);
734734
}
735+
736+
public function test_page_save_with_many_headers_and_links_is_reasonable()
737+
{
738+
$page = $this->entities->page();
739+
740+
$content = '';
741+
for ($i = 0; $i < 500; $i++) {
742+
$content .= "<table><tbody><tr><td><h5 id='header-{$i}'>Simple Test</h5><a href='#header-{$i}'></a></td></tr></tbody></table>";
743+
}
744+
745+
$time = time();
746+
$this->asEditor()->put($page->getUrl(), [
747+
'name' => $page->name,
748+
'html' => $content,
749+
])->assertRedirect();
750+
751+
$timeElapsed = time() - $time;
752+
$this->assertLessThan(3, $timeElapsed);
753+
}
735754
}

0 commit comments

Comments
 (0)