Skip to content

Commit 897e44c

Browse files
Kdecherf authored and j0k3r committed
Add ability to process prefetched content
A new method, setContentAsPrefetched(), lets callers provide the content of a page that was fetched before calling fetchContent(). Taking Wallabag as an example, this makes it possible to send the content of a page (through a browser extension, for example) without making network calls to fetch the page. Signed-off-by: Kevin Decherf <[email protected]>
1 parent 474bbe1 commit 897e44c

File tree

3 files changed

+60
-5
lines changed

3 files changed

+60
-5
lines changed

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,22 @@ array(
129129

130130
The `date` result is the same as displayed in the content. If `date` is not `null` in the result, we recommend parsing it with [`date_parse`](http://php.net/date_parse) (this is what we use to validate that the date is correct).
131131

132+
### Retrieve content from a prefetched page
133+
134+
If you want to extract content from a page you fetched outside of Graby, you can call `setContentAsPrefetched()` before calling `fetchContent()`, e.g.:
135+
136+
``` php
137+
use Graby\Graby;
138+
139+
$article = 'http://www.bbc.com/news/entertainment-arts-32547474';
140+
141+
$input = '<html>[...]</html>';
142+
143+
$graby = new Graby();
144+
$graby->setContentAsPrefetched($input);
145+
$result = $graby->fetchContent($article);
146+
```
147+
132148
### Cleanup content
133149

134150
Since version 1.9.0, you can also send HTML content to be cleaned up in the same way Graby cleans content retrieved from a URL. The URL is still needed to convert links to absolute ones, etc.

src/Graby.php

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class Graby
4242
private $punycode;
4343

4444
private $imgNoReferrer = false;
45+
private $prefetchedContent = null;
4546

4647
/**
4748
* @param array $config
@@ -161,6 +162,11 @@ public function getConfig($key)
161162
return $this->config[$key];
162163
}
163164

165+
/**
 * Provide the content of a page that was fetched outside of Graby.
 *
 * While a prefetched content is set, `doFetchContent()` uses it as the response body
 * instead of requesting the url through the HTTP client, and it skips the single-page
 * and multi-page lookups.
 *
 * This method only stores the value on the instance: unless it is reset elsewhere, it
 * stays in effect for the following `fetchContent()` calls made on the same instance.
 * Pass `null` to clear it and go back to fetching urls over the network.
 *
 * @param string|null $content Raw HTML of the page, or null to clear a previously set content
 */
public function setContentAsPrefetched(?string $content): void
{
    $this->prefetchedContent = $content;
}
169+
164170
/**
165171
* Fetch content from the given url and return a readable content.
166172
*
@@ -272,6 +278,18 @@ public function cleanupHtml($contentBlock, $url)
272278
return trim($this->cleanupXss((string) $html));
273279
}
274280

281+
/**
 * Wrap the prefetched content in a response array for `doFetchContent()`.
 *
 * The result is used in place of the value returned by the HTTP client, so it exposes
 * the same keys that are read afterwards (`body`, `effective_url`, `headers`, `status`).
 * The content is always announced as `text/html` with a 200 status, and the given url
 * is reported as the effective url since no redirect can have been followed.
 *
 * @param string $url Url the prefetched content belongs to
 *
 * @return array Response with the prefetched content as body
 */
private function getResponseForPrefetchedContent(string $url): array
{
    $headers = ['content-type' => 'text/html'];

    $response = [];
    $response['body'] = $this->prefetchedContent;
    $response['effective_url'] = $url;
    $response['headers'] = $headers;
    $response['status'] = 200;

    return $response;
}
292+
275293
/**
276294
* Do fetch content from an url.
277295
*
@@ -284,9 +302,13 @@ private function doFetchContent($url)
284302
$url = $this->validateUrl($url);
285303
$siteConfig = $this->configBuilder->buildFromUrl($url);
286304

287-
$this->logger->info('Fetching url: {url}', ['url' => $url]);
288-
289-
$response = $this->httpClient->fetch($url, false, $siteConfig->http_header);
305+
if (null === $this->prefetchedContent) {
306+
$this->logger->info('Fetching url: {url}', ['url' => $url]);
307+
$response = $this->httpClient->fetch($url, false, $siteConfig->http_header);
308+
} else {
309+
$this->logger->info('Content provided as prefetched for url: {url}', ['url' => $url]);
310+
$response = $this->getResponseForPrefetchedContent($url);
311+
}
290312

291313
$effectiveUrl = $response['effective_url'];
292314
$effectiveUrl = str_replace(' ', '%20', $effectiveUrl);
@@ -330,7 +352,7 @@ private function doFetchContent($url)
330352

331353
// check site config for single page URL - fetch it if found
332354
$isSinglePage = false;
333-
if ($this->config['singlepage'] && ($singlePageResponse = $this->getSinglePage($html, $effectiveUrl))) {
355+
if ($this->config['singlepage'] && null === $this->prefetchedContent && ($singlePageResponse = $this->getSinglePage($html, $effectiveUrl))) {
334356
$isSinglePage = true;
335357
$effectiveUrl = $singlePageResponse['effective_url'];
336358

@@ -370,7 +392,7 @@ private function doFetchContent($url)
370392

371393
// Deal with multi-page articles
372394
$isMultiPage = (!$isSinglePage && $extractResult && null !== $this->extractor->getNextPageUrl());
373-
if ($this->config['multipage'] && $isMultiPage) {
395+
if ($this->config['multipage'] && null === $this->prefetchedContent && $isMultiPage) {
374396
$this->logger->info('Attempting to process multi-page article');
375397
// store first page to avoid parsing it again (previous url content is in `$contentBlock`)
376398
$multiPageUrls = [$effectiveUrl];

tests/GrabyTest.php

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1396,6 +1396,23 @@ public function testWithTooLongHtmlJitFail(): void
13961396
$this->assertNotSame('No title found', $res['title']);
13971397
}
13981398

1399+
/**
 * Content handed over through `setContentAsPrefetched()` must be the one extracted
 * by `fetchContent()`, with the requested url reported as the resulting url.
 *
 * No response is queued on the mock HTTP client in this test.
 */
public function testPrefetchedContent(): void
{
    $url = 'https://example.com/prefetched-content';
    $paragraph = str_repeat('This is an awesome text with some links, here there are the awesome', 7);
    $prefetchedHtml = '<html><body><h1>This is my awesome article</h1><article><p>' . $paragraph . '</p></article></body></html>';

    $graby = new Graby(['debug' => true], new HttpMockClient());
    $graby->setContentAsPrefetched($prefetchedHtml);

    $result = $graby->fetchContent($url);

    $this->assertSame('This is my awesome article', $result['title']);
    $this->assertSame($url, $result['url']);
    $this->assertStringContainsString('here there are the awesome', $result['html']);
}
1415+
13991416
/**
14001417
* Return an instance of graby with a mocked Guzzle client returning data from a predefined file.
14011418
*/

0 commit comments

Comments
 (0)