Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 35 additions & 27 deletions src/Extractor/ContentExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,20 @@
*/
class ContentExtractor
{
private $html;
private $siteConfig;
private $title;
private $html = null;
private $config;
private $siteConfig = null;
private $title = null;
private $author = array();
private $language;
private $date;
private $body;
private $language = null;
private $date = null;
private $body = null;
private $success = false;
private $nextPageUrl;
private $nextPageUrl = null;
private $debug = false;
private $configBuilder = null;

public $readability;
public $readability = null;

public function __construct($config = array(), $debug = false)
{
Expand Down Expand Up @@ -134,41 +135,44 @@ public function buildSiteConfig($url, $html = '', $add_to_cache = true)
}

// if no match, use defaults
if (!$config) {
if (false === $config) {
$config = $this->configBuilder->create();
}

// load fingerprint config?
if ($config->autodetect_on_failure()) {
// check HTML for fingerprints
if (!empty($this->config['fingerprints']) && ($_fphost = $this->findHostUsingFingerprints($html)) && ($config_fingerprint = $this->configBuilder->build($_fphost))) {
// $this->debug("Appending site config settings from $_fphost (fingerprint match)");
$this->configBuilder->mergeConfig($config, $config_fingerprint);
$_fphost = $this->findHostUsingFingerprints($html);

if ($add_to_cache && !$this->configBuilder->getCachedVersion($_fphost)) {
//$config_fingerprint->cache_in_apc = true;
$this->configBuilder->addToCache($_fphost, $config_fingerprint);
if (false !== $_fphost) {
$config_fingerprint = $this->configBuilder->build($_fphost);

if (!empty($this->config['fingerprints']) && false !== $config_fingerprint) {
// $this->debug("Appending site config settings from $_fphost (fingerprint match)");
$this->configBuilder->mergeConfig($config, $config_fingerprint);

if ($add_to_cache && !$this->configBuilder->getCachedVersion($_fphost)) {
$this->configBuilder->addToCache($_fphost, $config_fingerprint);
}
}
}
}

// load global config?
if ($config->autodetect_on_failure() && ($config_global = $this->configBuilder->build('global', true))) {
$config_global = $this->configBuilder->build('global', true);
if ($config->autodetect_on_failure() && false !== $config_global) {
// $this->debug('Appending site config settings from global.txt');
$this->configBuilder->mergeConfig($config, $config_global);

if ($add_to_cache && !$this->configBuilder->getCachedVersion('global')) {
//$config_global->cache_in_apc = true;
$this->configBuilder->addToCache('global', $config_global);
}
}

// store copy of merged config
if ($add_to_cache) {
// do not store in APC if wildcard match
$use_apc = ($host == $config->cache_key);
$config->cache_key = null;
$this->configBuilder->addToCache("$host.merged", $config, $use_apc);
$this->configBuilder->addToCache("$host.merged", $config);
}

return $config;
Expand Down Expand Up @@ -302,7 +306,7 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smart_tidy
// $this->debug('Language matched: '.$this->language);
}

if ($this->language) {
if (null !== $this->language) {
break;
}
}
Expand Down Expand Up @@ -511,7 +515,6 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smart_tidy
//$elems->item(0)->parentNode->removeChild($elems->item(0));
if ($this->date) {
// $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date));
$detect_date = false;
} else {
$this->date = null;
}
Expand Down Expand Up @@ -556,7 +559,7 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smart_tidy
// what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
$e = $elems->item(0);

if ((strtolower($e->tagName) == 'img') || (trim($e->textContent) != '')) {
if ((strtolower($e->nodeName) == 'img') || (trim($e->textContent) != '')) {
$this->body = $elems->item(0);
// prune (clean up elements that may not be content)
if ($this->siteConfig->prune()) {
Expand Down Expand Up @@ -652,7 +655,7 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smart_tidy
// what if it's empty? (content placed outside an empty itemprop='articleBody' element)
$e = $elems->item(0);

if ((strtolower($e->tagName) == 'img') || (trim($e->textContent) != '')) {
if ((strtolower($e->nodeName) == 'img') || (trim($e->textContent) != '')) {
$this->body = $elems->item(0);
// prune (clean up elements that may not be content)
if ($this->siteConfig->prune()) {
Expand Down Expand Up @@ -720,7 +723,6 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smart_tidy
if ($author != '') {
// $this->debug("Author found (rel=\"author\"): $author");
$this->author[] = $author;
$detect_author = false;
}
}
}
Expand All @@ -738,14 +740,14 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smart_tidy
//$elems->item(0)->parentNode->removeChild($elems->item(0));
if ($this->date) {
// $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date));
$detect_date = false;
} else {
$this->date = null;
}
}
}

// still missing title or body, so we detect using Readability
$success = false;
if ($detect_title || $detect_body) {
// $this->debug('Using Readability');
// clone body if we're only using Readability for title (otherwise it may interfere with body element)
Expand Down Expand Up @@ -837,8 +839,14 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smart_tidy
// that tidy has messed up. So let's try again without tidy...
if (!$this->success && $tidied && $smart_tidy) {
unset($this->body, $xpath);

// $this->debug('Trying again without tidy');
return $this->process($this->readability->original_html, $url, $this->siteConfig, false);
return $this->process(
$this->readability->original_html,
$url,
$this->siteConfig,
false
);
}

return $this->success;
Expand Down
15 changes: 7 additions & 8 deletions src/Extractor/HttpClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
*/
class HttpClient
{
private $debug = false;
private $config = array();
private $httpClient = null;

Expand All @@ -39,10 +38,10 @@ public function __construct(Client $client, $config = array(), $debug = false)
// HTTP responses which match these content types will
// be returned without body.
'header_only_types' => array(
'application/pdf',
'image',
'audio',
'video',
'application/pdf',
'image',
'audio',
'video',
),
// URLs ending with one of these extensions will
// prompt Humble HTTP Agent to send a HEAD request first
Expand Down Expand Up @@ -151,7 +150,7 @@ public function fetch($url, $skipTypeVerification = false)

$redirectURL = $this->getMetaRefreshURL($effectiveUrl, $html) ?: $this->getUglyURL($effectiveUrl, $html);

if ($redirectURL) {
if (false !== $redirectURL) {
return $this->fetch($redirectURL, true);
}
}
Expand Down Expand Up @@ -289,7 +288,7 @@ private function getMetaRefreshURL($url, $html)
return false;
}

$redirect_url = $match[1];
$redirect_url = trim($match[1]);
if (preg_match('!^https?://!i', $redirect_url)) {
// already absolute
// $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url);
Expand All @@ -305,7 +304,7 @@ private function getMetaRefreshURL($url, $html)

if ($absolute = \SimplePie_IRI::absolutize($base, $redirect_url)) {
// $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
return $absolute;
return $absolute->get_iri();
}

return false;
Expand Down
50 changes: 26 additions & 24 deletions src/Graby.php
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@ public function __construct($config = array(), Client $client = null)
'blocked_urls' => array(),
'xss_filter' => true,
'content_type_exc' => array(
'application/pdf' => array('action' => 'link', 'name' => 'PDF'),
'image' => array('action' => 'link', 'name' => 'Image'),
'audio' => array('action' => 'link', 'name' => 'Audio'),
'video' => array('action' => 'link', 'name' => 'Video'),
'application/pdf' => array('action' => 'link', 'name' => 'PDF'),
'image' => array('action' => 'link', 'name' => 'Image'),
'audio' => array('action' => 'link', 'name' => 'Audio'),
'video' => array('action' => 'link', 'name' => 'Video'),
),
'content_links' => 'preserve',
'http_client' => array(),
Expand All @@ -68,6 +68,13 @@ public function __construct($config = array(), Client $client = null)
);
}

/**
* Return a config.
*
* @param string $key
*
* @return mixed
*/
public function getConfig($key)
{
if (!isset($this->config[$key])) {
Expand Down Expand Up @@ -127,20 +134,13 @@ private function doFetchContent($url)
}

$url = filter_var($url, FILTER_SANITIZE_URL);
$test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);

if (false === $this->isUrlAllowed($url)) {
throw new \Exception(sprintf('Url "%s" is not allowed to be parsed.', $url));
}

$response = $this->httpClient->fetch($url);

$do_content_extraction = true;
$extract_result = false;
$text_sample = null;
$permalink = $url;
$extracted_title = '';

$effective_url = $response['effective_url'];
if (!$this->isUrlAllowed($effective_url)) {
throw new \Exception(sprintf('Url "%s" is not allowed to be parsed.', $effective_url));
Expand Down Expand Up @@ -190,7 +190,7 @@ private function doFetchContent($url)

// Deal with multi-page articles
//die('Next: '.$this->extractor->getNextPageUrl());
$is_multi_page = (!$is_single_page && $extract_result && $this->extractor->getNextPageUrl());
$is_multi_page = (!$is_single_page && $extract_result && null !== $this->extractor->getNextPageUrl());
if ($this->config['multipage'] && $is_multi_page) {
// debug('--------');
// debug('Attempting to process multi-page article');
Expand Down Expand Up @@ -260,7 +260,7 @@ private function doFetchContent($url)
}

// if we failed to extract content...
if (!$extract_result) {
if (!$extract_result || null === $content_block) {
return array(
'html' => $this->config['error_message'],
'title' => $extracted_title,
Expand Down Expand Up @@ -466,7 +466,7 @@ private function getSinglePage($html, $url)
$single_page_url = $this->makeAbsoluteStr($url, $single_page_url);

// check it's not what we have already!
if ($single_page_url != $url) {
if (false !== $single_page_url && $single_page_url != $url) {
// it's not, so let's try to fetch it...
return $this->httpClient->fetch($single_page_url);
}
Expand All @@ -477,10 +477,10 @@ private function getSinglePage($html, $url)
/**
* Make an absolute url from an element.
*
* @param string $base The base url
* @param DomElement $elem Element on which we'll retrieve the attribute
* @param string $base The base url
* @param \DOMNode $elem Element on which we'll retrieve the attribute
*/
private function makeAbsolute($base, $elem)
private function makeAbsolute($base, \DOMNode $elem)
{
$base = new \SimplePie_IRI($base);

Expand All @@ -498,7 +498,7 @@ private function makeAbsolute($base, $elem)
$this->makeAbsoluteAttr($base, $e, $attr);
}

if (strtolower($elem->tagName) == $tag) {
if (strtolower($elem->nodeName) == $tag) {
$this->makeAbsoluteAttr($base, $elem, $attr);
}
}
Expand All @@ -507,13 +507,13 @@ private function makeAbsolute($base, $elem)
/**
* Make an attribute absolute (href or src).
*
* @param string $base The base url
* @param DomElement $e Element on which we'll retrieve the attribute
* @param string $attr Attribute that contains the url to absolutize
* @param string $base The base url
* @param \DOMNode $e Element on which we'll retrieve the attribute
* @param string $attr Attribute that contains the url to absolutize
*/
private function makeAbsoluteAttr($base, \DomElement $e, $attr)
private function makeAbsoluteAttr($base, \DOMNode $e, $attr)
{
if (!$e->hasAttribute($attr)) {
if (!$e->attributes->getNamedItem($attr)) {
return;
}

Expand Down Expand Up @@ -570,7 +570,9 @@ private function getExcerpt($text, $num_words = 55, $more = null)
$more = ' …';
}

$text = strip_tags($text);
// use a regex instead of strip_tags so that a space is left in place of each removed tag
$text = preg_replace('#<[^>]+>#', ' ', $text);

// @todo: Check if word count is based on single characters (East Asian characters)
/*
if (1==2) {
Expand Down
Loading