Skip to content

Commit 9da1b47

Browse files
committed
Don't fail on libxml errors if the RSD URL can still be found
Various things, such as duplicate element IDs or repeated attributes, break DOMDocument::loadHTML() and so this turns off the direct reporting (E_ERROR) of these and instead adds them to the RsdException if the RSD URL really can't be determined from the HTML. In most cases, the URL can be found correctly and the errors can be disregarded. A test is added for this as well, although it does *not* test for the case when the RSD URL can't be found and there are libxml errors (because we need to serve up a broken HTML file, and the mediawiki-api-base test system can only interact with MediaWiki via the API, which makes it hard to produce broken HTML that doesn't also have the correct LINK element for the RSD URL). A new TestEnvironment::savePage method is added, for easier creation of test wiki pages. Bug: https://phabricator.wikimedia.org/T163527
1 parent 246f0f5 commit 9da1b47

File tree

3 files changed

+57
-4
lines changed

3 files changed

+57
-4
lines changed

src/MediawikiApi.php

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,15 +79,30 @@ public static function newFromApiEndpoint( $apiEndpoint ) {
7979
* @throws RsdException If the RSD URL could not be found in the page's HTML.
8080
*/
8181
public static function newFromPage( $url ) {
82+
// Set up HTTP client and HTML document.
8283
$tempClient = new Client( [ 'headers' => [ 'User-Agent' => 'addwiki-mediawiki-client' ] ] );
83-
84-
// Get the page HTML and extract the RSD link.
8584
$pageHtml = $tempClient->get( $url )->getBody();
8685
$pageDoc = new DOMDocument();
86+
87+
// Try to load the HTML (turn off errors temporarily; most don't matter, and if they do get
88+
// in the way of finding the API URL, will be reported in the RsdException below).
89+
$internalErrors = libxml_use_internal_errors( true );
8790
$pageDoc->loadHTML( $pageHtml );
88-
$link = ( new DOMXpath( $pageDoc ) )->query( 'head/link[@type="application/rsd+xml"][@href]' );
91+
$libXmlErrors = libxml_get_errors();
92+
libxml_use_internal_errors( $internalErrors );
93+
94+
// Extract the RSD link.
95+
$xpath = 'head/link[@type="application/rsd+xml"][@href]';
96+
$link = ( new DOMXpath( $pageDoc ) )->query( $xpath );
8997
if ( $link->length === 0 ) {
90-
throw new RsdException( "Unable to find RSD URL in page: $url" );
98+
// Format libxml errors for display.
99+
$libXmlErrorStr = array_reduce( $libXmlErrors, function( $prevErr, $err ) {
100+
return $prevErr . ', ' . $err->message . ' (line '.$err->line . ')';
101+
} );
102+
if ( $libXmlErrorStr ) {
103+
$libXmlErrorStr = "In addition, libxml had the following errors: $libXmlErrorStr";
104+
}
105+
throw new RsdException( "Unable to find RSD URL in page: $url $libXmlErrorStr" );
91106
}
92107
$rsdUrl = $link->item( 0 )->attributes->getnamedItem( 'href' )->nodeValue;
93108

tests/Integration/MediawikiApiTest.php

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,28 @@ public function testNewFromPageInvalidHtml() {
3030
MediawikiApi::newFromPage( $nonWikiPage );
3131
}
3232

33+
/**
34+
* Duplicate element IDs break DOMDocument::loadHTML
35+
* @see https://phabricator.wikimedia.org/T163527#3219833
36+
* @covers Mediawiki\Api\MediawikiApi::newFromPage
37+
*/
38+
public function testNewFromPageWithDuplicateId() {
39+
$testPageName = __METHOD__;
40+
$testEnv = TestEnvironment::newInstance();
41+
$wikiPageUrl = str_replace( 'api.php', "index.php?title=$testPageName", $testEnv->getApiUrl() );
42+
43+
// Test with no duplicate IDs.
44+
$testEnv->savePage( $testPageName, '<p id="unique-id"></p>' );
45+
$api1 = MediawikiApi::newFromPage( $wikiPageUrl );
46+
$this->assertInstanceOf( MediawikiApi::class, $api1 );
47+
48+
// Test with duplicate ID.
49+
$wikiText = '<p id="duplicated-id"></p><div id="duplicated-id"></div>';
50+
$testEnv->savePage( $testPageName, $wikiText );
51+
$api2 = MediawikiApi::newFromPage( $wikiPageUrl );
52+
$this->assertInstanceOf( MediawikiApi::class, $api2 );
53+
}
54+
3355
/**
3456
* @covers Mediawiki\Api\MediawikiApi::getRequest
3557
* @covers Mediawiki\Api\MediawikiApi::getClientRequestOptions

tests/Integration/TestEnvironment.php

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
use Exception;
66
use Mediawiki\Api\MediawikiApi;
7+
use Mediawiki\Api\SimpleRequest;
78

89
/**
910
* @author Addshore
@@ -68,4 +69,19 @@ public function getApi() {
6869
return $this->api;
6970
}
7071

72+
/**
73+
* Save a wiki page.
74+
* @param string $title
75+
* @param string $content
76+
*/
77+
public function savePage( $title, $content ) {
78+
79+
$params = [
80+
'title' => $title,
81+
'text' => $content,
82+
'md5' => md5( $content ),
83+
'token' => $this->api->getToken(),
84+
];
85+
$this->api->postRequest( new SimpleRequest( 'edit', $params ) );
86+
}
7187
}

0 commit comments

Comments
 (0)