j0k3r · j0k3r · May 5, 2015 · May 4, 2015 · May 4, 2015 · May 5, 2015
diff --git a/src/Extractor/ContentExtractor.php b/src/Extractor/ContentExtractor.php
@@ -15,19 +15,20 @@
  */
 class ContentExtractor
 {
-    private $html;
-    private $siteConfig;
-    private $title;
+    private $html = null;
+    private $config;
+    private $siteConfig = null;
+    private $title = null;
     private $author = array();
-    private $language;
-    private $date;
-    private $body;
+    private $language = null;
+    private $date = null;
+    private $body = null;
     private $success = false;
-    private $nextPageUrl;
+    private $nextPageUrl = null;
     private $debug = false;
     private $configBuilder = null;
 
-    public $readability;
+    public $readability = null;
 
     public function __construct($config = array(), $debug = false)
     {
@@ -134,41 +135,44 @@ public function buildSiteConfig($url, $html = '', $add_to_cache = true)
         }
 
         // if no match, use defaults
-        if (!$config) {
+        if (false === $config) {
             $config = $this->configBuilder->create();
         }
 
         // load fingerprint config?
         if ($config->autodetect_on_failure()) {
             // check HTML for fingerprints
-            if (!empty($this->config['fingerprints']) && ($_fphost = $this->findHostUsingFingerprints($html)) && ($config_fingerprint = $this->configBuilder->build($_fphost))) {
-                // $this->debug("Appending site config settings from $_fphost (fingerprint match)");
-                $this->configBuilder->mergeConfig($config, $config_fingerprint);
+            $_fphost = $this->findHostUsingFingerprints($html);
 
-                if ($add_to_cache && !$this->configBuilder->getCachedVersion($_fphost)) {
-                    //$config_fingerprint->cache_in_apc = true;
-                    $this->configBuilder->addToCache($_fphost, $config_fingerprint);
+            if (false !== $_fphost) {
+                $config_fingerprint = $this->configBuilder->build($_fphost);
+
+                if (!empty($this->config['fingerprints']) && false !== $config_fingerprint) {
+                    // $this->debug("Appending site config settings from $_fphost (fingerprint match)");
+                    $this->configBuilder->mergeConfig($config, $config_fingerprint);
+
+                    if ($add_to_cache && !$this->configBuilder->getCachedVersion($_fphost)) {
+                        $this->configBuilder->addToCache($_fphost, $config_fingerprint);
+                    }
                 }
             }
         }
 
         // load global config?
-        if ($config->autodetect_on_failure() && ($config_global = $this->configBuilder->build('global', true))) {
+        $config_global = $this->configBuilder->build('global', true);
+        if ($config->autodetect_on_failure() && false !== $config_global) {
             // $this->debug('Appending site config settings from global.txt');
             $this->configBuilder->mergeConfig($config, $config_global);
 
             if ($add_to_cache && !$this->configBuilder->getCachedVersion('global')) {
-                //$config_global->cache_in_apc = true;
                 $this->configBuilder->addToCache('global', $config_global);
             }
         }
 
         // store copy of merged config
         if ($add_to_cache) {
-            // do not store in APC if wildcard match
-            $use_apc = ($host == $config->cache_key);
             $config->cache_key = null;
-            $this->configBuilder->addToCache("$host.merged", $config, $use_apc);
+            $this->configBuilder->addToCache("$host.merged", $config);
         }
 
         return $config;
@@ -302,7 +306,7 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smart_tidy
                     // $this->debug('Language matched: '.$this->language);
                 }
 
-                if ($this->language) {
+                if (null !== $this->language) {
                     break;
                 }
             }
@@ -511,7 +515,6 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smart_tidy
                         //$elems->item(0)->parentNode->removeChild($elems->item(0));
                         if ($this->date) {
                             // $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date));
-                            $detect_date = false;
                         } else {
                             $this->date = null;
                         }
@@ -556,7 +559,7 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smart_tidy
                             // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
                             $e = $elems->item(0);
 
-                            if ((strtolower($e->tagName) == 'img') || (trim($e->textContent) != '')) {
+                            if ((strtolower($e->nodeName) == 'img') || (trim($e->textContent) != '')) {
                                 $this->body = $elems->item(0);
                                 // prune (clean up elements that may not be content)
                                 if ($this->siteConfig->prune()) {
@@ -652,7 +655,7 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smart_tidy
                     // what if it's empty? (content placed outside an empty itemprop='articleBody' element)
                     $e = $elems->item(0);
 
-                    if ((strtolower($e->tagName) == 'img') || (trim($e->textContent) != '')) {
+                    if ((strtolower($e->nodeName) == 'img') || (trim($e->textContent) != '')) {
                         $this->body = $elems->item(0);
                         // prune (clean up elements that may not be content)
                         if ($this->siteConfig->prune()) {
@@ -720,7 +723,6 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smart_tidy
                 if ($author != '') {
                     // $this->debug("Author found (rel=\"author\"): $author");
                     $this->author[] = $author;
-                    $detect_author = false;
                 }
             }
         }
@@ -738,14 +740,14 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smart_tidy
                 //$elems->item(0)->parentNode->removeChild($elems->item(0));
                 if ($this->date) {
                     // $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date));
-                    $detect_date = false;
                 } else {
                     $this->date = null;
                 }
             }
         }
 
         // still missing title or body, so we detect using Readability
+        $success = false;
         if ($detect_title || $detect_body) {
             // $this->debug('Using Readability');
             // clone body if we're only using Readability for title (otherwise it may interfere with body element)
@@ -837,8 +839,14 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smart_tidy
         // that tidy has messed up. So let's try again without tidy...
         if (!$this->success && $tidied && $smart_tidy) {
             unset($this->body, $xpath);
+
             // $this->debug('Trying again without tidy');
-            return $this->process($this->readability->original_html, $url, $this->siteConfig, false);
+            return $this->process(
+                $this->readability->original_html,
+                $url,
+                $this->siteConfig,
+                false
+            );
         }
 
         return $this->success;

diff --git a/src/Extractor/HttpClient.php b/src/Extractor/HttpClient.php
@@ -13,7 +13,6 @@
  */
 class HttpClient
 {
-    private $debug = false;
     private $config = array();
     private $httpClient = null;
 
@@ -39,10 +38,10 @@ public function __construct(Client $client, $config = array(), $debug = false)
             // HTTP responses which match these content types will
             // be returned without body.
             'header_only_types' => array(
-               'application/pdf',
-               'image',
-               'audio',
-               'video',
+                'application/pdf',
+                'image',
+                'audio',
+                'video',
             ),
             // URLs ending with one of these extensions will
             // prompt Humble HTTP Agent to send a HEAD request first
@@ -151,7 +150,7 @@ public function fetch($url, $skipTypeVerification = false)
 
             $redirectURL = $this->getMetaRefreshURL($effectiveUrl, $html) ?: $this->getUglyURL($effectiveUrl, $html);
 
-            if ($redirectURL) {
+            if (false !== $redirectURL) {
                 return $this->fetch($redirectURL, true);
             }
         }
@@ -289,7 +288,7 @@ private function getMetaRefreshURL($url, $html)
             return false;
         }
 
-        $redirect_url = $match[1];
+        $redirect_url = trim($match[1]);
         if (preg_match('!^https?://!i', $redirect_url)) {
             // already absolute
             // $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url);
@@ -305,7 +304,7 @@ private function getMetaRefreshURL($url, $html)
 
         if ($absolute = \SimplePie_IRI::absolutize($base, $redirect_url)) {
             // $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
-            return $absolute;
+            return $absolute->get_iri();
         }
 
         return false;

diff --git a/src/Graby.php b/src/Graby.php
@@ -40,10 +40,10 @@ public function __construct($config = array(), Client $client = null)
             'blocked_urls' => array(),
             'xss_filter' => true,
             'content_type_exc' => array(
-               'application/pdf' => array('action' => 'link', 'name' => 'PDF'),
-               'image'           => array('action' => 'link', 'name' => 'Image'),
-               'audio'           => array('action' => 'link', 'name' => 'Audio'),
-               'video'           => array('action' => 'link', 'name' => 'Video'),
+                'application/pdf' => array('action' => 'link', 'name' => 'PDF'),
+                'image'           => array('action' => 'link', 'name' => 'Image'),
+                'audio'           => array('action' => 'link', 'name' => 'Audio'),
+                'video'           => array('action' => 'link', 'name' => 'Video'),
             ),
             'content_links' => 'preserve',
             'http_client' => array(),
@@ -68,6 +68,13 @@ public function __construct($config = array(), Client $client = null)
         );
     }
 
+    /**
+     * Return a config.
+     *
+     * @param string $key
+     *
+     * @return mixed
+     */
     public function getConfig($key)
     {
         if (!isset($this->config[$key])) {
@@ -127,20 +134,13 @@ private function doFetchContent($url)
         }
 
         $url = filter_var($url, FILTER_SANITIZE_URL);
-        $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
 
         if (false === $this->isUrlAllowed($url)) {
             throw new \Exception(sprintf('Url "%s" is not allowed to be parsed.', $url));
         }
 
         $response = $this->httpClient->fetch($url);
 
-        $do_content_extraction = true;
-        $extract_result = false;
-        $text_sample = null;
-        $permalink = $url;
-        $extracted_title = '';
-
         $effective_url = $response['effective_url'];
         if (!$this->isUrlAllowed($effective_url)) {
             throw new \Exception(sprintf('Url "%s" is not allowed to be parsed.', $effective_url));
@@ -190,7 +190,7 @@ private function doFetchContent($url)
 
         // Deal with multi-page articles
         //die('Next: '.$this->extractor->getNextPageUrl());
-        $is_multi_page = (!$is_single_page && $extract_result && $this->extractor->getNextPageUrl());
+        $is_multi_page = (!$is_single_page && $extract_result && null !== $this->extractor->getNextPageUrl());
         if ($this->config['multipage'] && $is_multi_page) {
             // debug('--------');
             // debug('Attempting to process multi-page article');
@@ -260,7 +260,7 @@ private function doFetchContent($url)
         }
 
         // if we failed to extract content...
-        if (!$extract_result) {
+        if (!$extract_result || null === $content_block) {
             return array(
                 'html' => $this->config['error_message'],
                 'title' => $extracted_title,
@@ -466,7 +466,7 @@ private function getSinglePage($html, $url)
         $single_page_url = $this->makeAbsoluteStr($url, $single_page_url);
 
         // check it's not what we have already!
-        if ($single_page_url != $url) {
+        if (false !== $single_page_url && $single_page_url != $url) {
             // it's not, so let's try to fetch it...
             return $this->httpClient->fetch($single_page_url);
         }
@@ -477,10 +477,10 @@ private function getSinglePage($html, $url)
     /**
      * Make an absolute url from an element.
      *
-     * @param string     $base The base url
-     * @param DomElement $elem Element on which we'll retrieve the attribute
+     * @param string   $base The base url
+     * @param \DOMNode $elem Element on which we'll retrieve the attribute
      */
-    private function makeAbsolute($base, $elem)
+    private function makeAbsolute($base, \DOMNode $elem)
     {
         $base = new \SimplePie_IRI($base);
 
@@ -498,7 +498,7 @@ private function makeAbsolute($base, $elem)
                 $this->makeAbsoluteAttr($base, $e, $attr);
             }
 
-            if (strtolower($elem->tagName) == $tag) {
+            if (strtolower($elem->nodeName) == $tag) {
                 $this->makeAbsoluteAttr($base, $elem, $attr);
             }
         }
@@ -507,13 +507,13 @@ private function makeAbsolute($base, $elem)
     /**
      * Make an attribute absolute (href or src).
      *
-     * @param string     $base The base url
-     * @param DomElement $e    Element on which we'll retrieve the attribute
-     * @param string     $attr Attribute that contains the url to absolutize
+     * @param string   $base The base url
+     * @param \DOMNode $e    Element on which we'll retrieve the attribute
+     * @param string   $attr Attribute that contains the url to absolutize
      */
-    private function makeAbsoluteAttr($base, \DomElement $e, $attr)
+    private function makeAbsoluteAttr($base, \DOMNode $e, $attr)
     {
-        if (!$e->hasAttribute($attr)) {
+        if (!$e->attributes->getNamedItem($attr)) {
             return;
         }
 
@@ -570,7 +570,9 @@ private function getExcerpt($text, $num_words = 55, $more = null)
             $more = ' &hellip;';
         }
 
-        $text = strip_tags($text);
+        // use regex instead of strip_tags to leave some spaces when removing tags
+        $text = preg_replace('#<[^>]+>#', ' ', $text);
+
         // @todo: Check if word count is based on single characters (East Asian characters)
         /*
         if (1==2) {