Skip to content

Update ada #9

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 7 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
[package]
name = "ada-url"
authors = ["Yagiz Nizipli <[email protected]>", "Daniel Lemire <[email protected]>", "LongYinan <[email protected]>"]
version = "0.1.0"
authors = [
"Yagiz Nizipli <[email protected]>",
"Daniel Lemire <[email protected]>",
"LongYinan <[email protected]>",
"Boshen <[email protected]>"
]
version = "1.0.0"
edition = "2021"
description = "Fast WHATWG Compliant URL parser"
readme = "README.md"
Expand Down
21 changes: 20 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Add the following as a dependency to your project (`Cargo.toml`):

```toml
[dependencies]
ada-url = "0.1"
ada-url = "1"
```

Here is an example illustrating a common usage:
Expand All @@ -24,4 +24,23 @@ fn main() {
u.set_port("9999");
println!("href: {:?}", u.href());
}
```

### Performance

Ada is fast. The benchmark below shows URL parsing that is about **2 times** faster than the `url` crate.

```
Running bench/parse.rs (target/release/deps/parse-dff65469468a2cec)
url_parse/ada_parse time: [2.5853 µs 2.5982 µs 2.6115 µs]
change: [-3.8745% -2.9874% -2.0620%] (p = 0.00 < 0.05)
Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
1 (1.00%) low mild
1 (1.00%) high severe
url_parse/servo_parse time: [5.5127 µs 5.6287 µs 5.8046 µs]
change: [+0.7618% +3.0977% +6.5694%] (p = 0.01 < 0.05)
Change within noise threshold.
Found 2 outliers among 100 measurements (2.00%)
2 (2.00%) high severe
```
72 changes: 67 additions & 5 deletions deps/ada.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* auto-generated on 2023-05-09 17:25:59 -0400. Do not edit! */
/* auto-generated on 2023-05-16 13:48:47 -0400. Do not edit! */
/* begin file src/ada.cpp */
#include "ada.h"
/* begin file src/checkers.cpp */
Expand Down Expand Up @@ -9786,6 +9786,11 @@ std::string to_unicode(std::string_view input) {
ADA_POP_DISABLE_WARNINGS

#include <algorithm>
#if ADA_NEON
#include <arm_neon.h>
#elif ADA_SSE2
#include <emmintrin.h>
#endif

namespace ada::unicode {

Expand Down Expand Up @@ -9817,8 +9822,58 @@ constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
}
return non_ascii == 0;
}

ada_really_inline constexpr bool has_tabs_or_newline(
#if ADA_NEON
/**
 * NEON implementation: reports whether user_input contains an ASCII tab
 * ('\t'), line feed ('\n') or carriage return ('\r').
 *
 * The input is scanned 16 bytes at a time. A trailing partial chunk is first
 * copied into a zero-filled 16-byte scratch buffer so that no byte outside of
 * user_input is ever read or compared.
 *
 * @param user_input the bytes to scan (may be empty).
 * @return true if at least one of the three characters is present.
 */
ada_really_inline bool has_tabs_or_newline(
    std::string_view user_input) noexcept {
  size_t i = 0;
  const uint8x16_t mask1 = vmovq_n_u8('\r');
  const uint8x16_t mask2 = vmovq_n_u8('\n');
  const uint8x16_t mask3 = vmovq_n_u8('\t');
  // A lane of `running` becomes non-zero once that lane has matched any of
  // the three characters in any chunk seen so far.
  uint8x16_t running = vmovq_n_u8(0);
  for (; i + 15 < user_input.size(); i += 16) {
    const uint8x16_t word =
        vld1q_u8(reinterpret_cast<const uint8_t*>(user_input.data()) + i);
    running = vorrq_u8(vorrq_u8(running, vorrq_u8(vceqq_u8(word, mask1),
                                                  vceqq_u8(word, mask2))),
                       vceqq_u8(word, mask3));
  }
  if (i < user_input.size()) {
    // Fewer than 16 bytes remain: work on a zero-padded copy. The padding
    // value 0 matches none of the masks, so it cannot cause a false positive.
    uint8_t buffer[16]{};
    memcpy(buffer, user_input.data() + i, user_input.size() - i);
    // Load from the padded copy, NOT from user_input, which would read past
    // the end of the view and could match bytes that are not part of it.
    const uint8x16_t word = vld1q_u8(buffer);
    running = vorrq_u8(vorrq_u8(running, vorrq_u8(vceqq_u8(word, mask1),
                                                  vceqq_u8(word, mask2))),
                       vceqq_u8(word, mask3));
  }
  // Horizontal max over all lanes: non-zero iff some lane matched.
  return vmaxvq_u8(running) != 0;
}
#elif ADA_SSE2
/**
 * SSE2 implementation: reports whether user_input contains an ASCII tab
 * ('\t'), line feed ('\n') or carriage return ('\r').
 *
 * Full 16-byte chunks are loaded straight from the input; a trailing partial
 * chunk is copied into a zero-filled 16-byte scratch buffer before loading.
 *
 * @param user_input the bytes to scan (may be empty).
 * @return true if at least one of the three characters is present.
 */
ada_really_inline bool has_tabs_or_newline(
    std::string_view user_input) noexcept {
  const __m128i carriage_return = _mm_set1_epi8('\r');
  const __m128i line_feed = _mm_set1_epi8('\n');
  const __m128i tab = _mm_set1_epi8('\t');

  // Yields 0xFF in every byte lane of `chunk` that equals one of the three
  // characters, 0x00 elsewhere.
  const auto match_lanes = [&](__m128i chunk) {
    const __m128i cr_or_lf =
        _mm_or_si128(_mm_cmpeq_epi8(chunk, carriage_return),
                     _mm_cmpeq_epi8(chunk, line_feed));
    return _mm_or_si128(cr_or_lf, _mm_cmpeq_epi8(chunk, tab));
  };

  const char* const bytes = user_input.data();
  const size_t length = user_input.size();
  __m128i found = _mm_setzero_si128();
  size_t offset = 0;

  // Whole 16-byte chunks.
  while (offset + 15 < length) {
    const __m128i chunk =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(bytes + offset));
    found = _mm_or_si128(found, match_lanes(chunk));
    offset += 16;
  }

  // Remaining 1..15 bytes: the zero padding matches none of the characters.
  if (offset < length) {
    uint8_t padded[16]{};
    memcpy(padded, bytes + offset, length - offset);
    const __m128i chunk =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(padded));
    found = _mm_or_si128(found, match_lanes(chunk));
  }

  // Any set lane means a match was seen somewhere in the input.
  return _mm_movemask_epi8(found) != 0;
}
#else
ada_really_inline bool has_tabs_or_newline(
std::string_view user_input) noexcept {
auto has_zero_byte = [](uint64_t v) {
return ((v - 0x0101010101010101) & ~(v)&0x8080808080808080);
Expand Down Expand Up @@ -9849,6 +9904,7 @@ ada_really_inline constexpr bool has_tabs_or_newline(
}
return running;
}
#endif

// A forbidden host code point is U+0000 NULL, U+0009 TAB, U+000A LF, U+000D CR,
// U+0020 SPACE, U+0023 (#), U+002F (/), U+003A (:), U+003C (<), U+003E (>),
Expand Down Expand Up @@ -13732,8 +13788,11 @@ bool url_aggregator::set_hostname(const std::string_view input) {

[[nodiscard]] std::string_view url_aggregator::get_host() const noexcept {
ada_log("url_aggregator::get_host");
// Technically, we should check if there is a hostname, but
// the code below works even if there isn't.
// if(!has_hostname()) { return ""; }
size_t start = components.host_start;
if (buffer.size() > components.host_start &&
if (components.host_end > components.host_start &&
buffer[components.host_start] == '@') {
start++;
}
Expand All @@ -13747,9 +13806,12 @@ bool url_aggregator::set_hostname(const std::string_view input) {

[[nodiscard]] std::string_view url_aggregator::get_hostname() const noexcept {
ada_log("url_aggregator::get_hostname");
// Technically, we should check if there is a hostname, but
// the code below works even if there isn't.
// if(!has_hostname()) { return ""; }
size_t start = components.host_start;
// So host_start is not where the host begins.
if (buffer.size() > components.host_start &&
if (components.host_end > components.host_start &&
buffer[components.host_start] == '@') {
start++;
}
Expand Down
32 changes: 23 additions & 9 deletions deps/ada.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* auto-generated on 2023-05-09 17:25:59 -0400. Do not edit! */
/* auto-generated on 2023-05-16 13:48:47 -0400. Do not edit! */
/* begin file include/ada.h */
/**
* @file ada.h
Expand Down Expand Up @@ -468,6 +468,17 @@ namespace ada {
if (!(COND)) __builtin_unreachable(); \
} while (0)
#endif

// ADA_SSE2 is defined to 1 when the compiler advertises SSE2 (__SSE2__), when
// an x86-64 target macro is present (__x86_64__, __x86_64, _M_AMD64, _M_X64),
// or when _M_IX86_FP is defined and equal to 2.
// NOTE(review): _M_IX86_FP == 2 is presumably MSVC's 32-bit /arch:SSE2
// setting -- confirm against the MSVC predefined-macro documentation.
#if defined(__SSE2__) || defined(__x86_64__) || defined(__x86_64) || \
(defined(_M_AMD64) || defined(_M_X64) || \
(defined(_M_IX86_FP) && _M_IX86_FP == 2))
#define ADA_SSE2 1
#endif

// ADA_NEON is defined to 1 when a 64-bit ARM target macro is present
// (__aarch64__ or _M_ARM64).
#if defined(__aarch64__) || defined(_M_ARM64)
#define ADA_NEON 1
#endif

#endif // ADA_COMMON_DEFS_H
/* end file include/ada/common_defs.h */
#include <stdint.h>
Expand Down Expand Up @@ -4320,7 +4331,7 @@ std::string to_unicode(std::string_view input);
* @attention The has_tabs_or_newline function is a bottleneck and it is simple
* enough that compilers like GCC can 'autovectorize it'.
*/
ada_really_inline constexpr bool has_tabs_or_newline(
ada_really_inline bool has_tabs_or_newline(
std::string_view user_input) noexcept;

/**
Expand Down Expand Up @@ -6473,14 +6484,14 @@ inline std::ostream &operator<<(std::ostream &out,
#ifndef ADA_ADA_VERSION_H
#define ADA_ADA_VERSION_H

#define ADA_VERSION "2.4.0"
#define ADA_VERSION "2.4.1"

namespace ada {

enum {
ADA_VERSION_MAJOR = 2,
ADA_VERSION_MINOR = 4,
ADA_VERSION_REVISION = 0,
ADA_VERSION_REVISION = 1,
};

} // namespace ada
Expand Down Expand Up @@ -6508,11 +6519,11 @@ using result = tl::expected<result_type, ada::errors>;

/**
* The URL parser takes a scalar value string input, with an optional null or
* base URL base (default null). The parser assumes the input has an UTF-8
* encoding.
* base URL base (default null). The parser assumes the input is a valid ASCII
* or UTF-8 string.
*
* @param input the string input to analyze.
* @param base_url the optional string input to use as a base url.
* @param input the string input to analyze (must be valid ASCII or UTF-8)
* @param base_url the optional URL input to use as a base url.
* @return a parsed URL.
*/
template <class result_type = ada::url_aggregator>
Expand All @@ -6525,14 +6536,17 @@ extern template ada::result<url_aggregator> parse<url_aggregator>(
std::string_view input, const url_aggregator* base_url);

/**
* Verifies whether the URL strings can be parsed. The function assumes
* that the inputs are valid ASCII or UTF-8 strings.
* @see https://url.spec.whatwg.org/#dom-url-canparse
* @return If URL can be parsed or not.
*/
bool can_parse(std::string_view input,
const std::string_view* base_input = nullptr);

/**
* Computes a href string from a file path.
* Computes a href string from a file path. The function assumes
* that the input is a valid ASCII or UTF-8 string.
* @return a href string (starts with file://)
*/
std::string href_from_file(std::string_view path);
Expand Down
2 changes: 1 addition & 1 deletion deps/ada_c.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ typedef struct {

typedef void* ada_url;

// input should be a null terminated C string
// input should be a null terminated C string (ASCII or UTF-8)
// you must call ada_free on the returned pointer
ada_url ada_parse(const char* input, size_t length);
ada_url ada_parse_with_base(const char* input, size_t input_length,
Expand Down