diff --git a/Cargo.lock b/Cargo.lock
index 26eb90b..167ccbd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,7 +4,7 @@ version = 3
 
 [[package]]
 name = "ada-url"
-version = "0.1.0"
+version = "1.0.0"
 dependencies = [
  "cc",
  "criterion",
diff --git a/Cargo.toml b/Cargo.toml
index fd66f9a..ea3f709 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,12 @@
 [package]
 name = "ada-url"
-authors = ["Yagiz Nizipli ", "Daniel Lemire ", "LongYinan "]
-version = "0.1.0"
+authors = [
+  "Yagiz Nizipli ",
+  "Daniel Lemire ",
+  "LongYinan ",
+  "Boshen "
+  ]
+version = "1.0.0"
 edition = "2021"
 description = "Fast WHATWG Compliant URL parser"
 readme = "README.md"
diff --git a/README.md b/README.md
index e51597c..0a0ecce 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ Add the following as a dependency to your project (`Cargo.toml`):
 
 ```toml
 [dependencies]
-ada-url = "0.1"
+ada-url = "1"
 ```
 
 Here is an example illustrating a common usage:
@@ -24,4 +24,23 @@ fn main() {
     u.set_port("9999");
     println!("href: {:?}", u.href());
 }
+```
+
+### Performance
+
+Ada is fast. The benchmark below shows **2 times** faster URL parsing compared to `url`
+
+```
+     Running bench/parse.rs (target/release/deps/parse-dff65469468a2cec)
+url_parse/ada_parse     time:   [2.5853 µs 2.5982 µs 2.6115 µs]
+                        change: [-3.8745% -2.9874% -2.0620%] (p = 0.00 < 0.05)
+                        Performance has improved.
+Found 2 outliers among 100 measurements (2.00%)
+  1 (1.00%) low mild
+  1 (1.00%) high severe
+url_parse/servo_parse   time:   [5.5127 µs 5.6287 µs 5.8046 µs]
+                        change: [+0.7618% +3.0977% +6.5694%] (p = 0.01 < 0.05)
+                        Change within noise threshold.
+Found 2 outliers among 100 measurements (2.00%)
+  2 (2.00%) high severe
 ```
\ No newline at end of file
diff --git a/deps/ada.cpp b/deps/ada.cpp
index 966720b..e0ee4a5 100644
--- a/deps/ada.cpp
+++ b/deps/ada.cpp
@@ -1,4 +1,4 @@
-/* auto-generated on 2023-05-09 17:25:59 -0400. Do not edit! */
+/* auto-generated on 2023-05-16 13:48:47 -0400. Do not edit! */
 /* begin file src/ada.cpp */
 #include "ada.h"
 /* begin file src/checkers.cpp */
@@ -9786,6 +9786,11 @@ std::string to_unicode(std::string_view input) {
 ADA_POP_DISABLE_WARNINGS
 
 #include
+#if ADA_NEON
+#include <arm_neon.h>
+#elif ADA_SSE2
+#include <emmintrin.h>
+#endif
 
 namespace ada::unicode {
 
@@ -9817,8 +9822,58 @@ constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
   }
   return non_ascii == 0;
 }
-
-ada_really_inline constexpr bool has_tabs_or_newline(
+#if ADA_NEON
+ada_really_inline bool has_tabs_or_newline(
+    std::string_view user_input) noexcept {
+  size_t i = 0;
+  const uint8x16_t mask1 = vmovq_n_u8('\r');
+  const uint8x16_t mask2 = vmovq_n_u8('\n');
+  const uint8x16_t mask3 = vmovq_n_u8('\t');
+  uint8x16_t running{0};
+  for (; i + 15 < user_input.size(); i += 16) {
+    uint8x16_t word = vld1q_u8((const uint8_t*)user_input.data() + i);
+    running = vorrq_u8(vorrq_u8(running, vorrq_u8(vceqq_u8(word, mask1),
+                                                  vceqq_u8(word, mask2))),
+                       vceqq_u8(word, mask3));
+  }
+  if (i < user_input.size()) {
+    uint8_t buffer[16]{};
+    memcpy(buffer, user_input.data() + i, user_input.size() - i);
+    uint8x16_t word = vld1q_u8(buffer);
+    running = vorrq_u8(vorrq_u8(running, vorrq_u8(vceqq_u8(word, mask1),
+                                                  vceqq_u8(word, mask2))),
+                       vceqq_u8(word, mask3));
+  }
+  return vmaxvq_u8(running) != 0;
+}
+#elif ADA_SSE2
+ada_really_inline bool has_tabs_or_newline(
+    std::string_view user_input) noexcept {
+  size_t i = 0;
+  const __m128i mask1 = _mm_set1_epi8('\r');
+  const __m128i mask2 = _mm_set1_epi8('\n');
+  const __m128i mask3 = _mm_set1_epi8('\t');
+  __m128i running{0};
+  for (; i + 15 < user_input.size(); i += 16) {
+    __m128i word = _mm_loadu_si128((const __m128i*)(user_input.data() + i));
+    running = _mm_or_si128(
+        _mm_or_si128(running, _mm_or_si128(_mm_cmpeq_epi8(word, mask1),
+                                           _mm_cmpeq_epi8(word, mask2))),
+        _mm_cmpeq_epi8(word, mask3));
+  }
+  if (i < user_input.size()) {
+    uint8_t buffer[16]{};
+    memcpy(buffer, user_input.data() + i, user_input.size() - i);
+    __m128i word = _mm_loadu_si128((const __m128i*)buffer);
+    running = _mm_or_si128(
+        _mm_or_si128(running, _mm_or_si128(_mm_cmpeq_epi8(word, mask1),
+                                           _mm_cmpeq_epi8(word, mask2))),
+        _mm_cmpeq_epi8(word, mask3));
+  }
+  return _mm_movemask_epi8(running) != 0;
+}
+#else
+ada_really_inline bool has_tabs_or_newline(
     std::string_view user_input) noexcept {
   auto has_zero_byte = [](uint64_t v) {
     return ((v - 0x0101010101010101) & ~(v)&0x8080808080808080);
@@ -9849,6 +9904,7 @@ ada_really_inline constexpr bool has_tabs_or_newline(
   }
   return running;
 }
+#endif
 
 // A forbidden host code point is U+0000 NULL, U+0009 TAB, U+000A LF, U+000D CR,
 // U+0020 SPACE, U+0023 (#), U+002F (/), U+003A (:), U+003C (<), U+003E (>),
@@ -13732,8 +13788,11 @@ bool url_aggregator::set_hostname(const std::string_view input) {
 
 [[nodiscard]] std::string_view url_aggregator::get_host() const noexcept {
   ada_log("url_aggregator::get_host");
+  // Technically, we should check if there is a hostname, but
+  // the code below works even if there isn't.
+  // if(!has_hostname()) { return ""; }
   size_t start = components.host_start;
-  if (buffer.size() > components.host_start &&
+  if (components.host_end > components.host_start &&
       buffer[components.host_start] == '@') {
     start++;
   }
@@ -13747,9 +13806,12 @@ bool url_aggregator::set_hostname(const std::string_view input) {
 
 [[nodiscard]] std::string_view url_aggregator::get_hostname() const noexcept {
   ada_log("url_aggregator::get_hostname");
+  // Technically, we should check if there is a hostname, but
+  // the code below works even if there isn't.
+  // if(!has_hostname()) { return ""; }
   size_t start = components.host_start;
   // So host_start is not where the host begins.
-  if (buffer.size() > components.host_start &&
+  if (components.host_end > components.host_start &&
       buffer[components.host_start] == '@') {
     start++;
   }
diff --git a/deps/ada.h b/deps/ada.h
index 2756d86..5afa7ab 100644
--- a/deps/ada.h
+++ b/deps/ada.h
@@ -1,4 +1,4 @@
-/* auto-generated on 2023-05-09 17:25:59 -0400. Do not edit! */
+/* auto-generated on 2023-05-16 13:48:47 -0400. Do not edit! */
 /* begin file include/ada.h */
 /**
  * @file ada.h
@@ -468,6 +468,17 @@ namespace ada {
     if (!(COND)) __builtin_unreachable(); \
   } while (0)
 #endif
+
+#if defined(__SSE2__) || defined(__x86_64__) || defined(__x86_64) || \
+    (defined(_M_AMD64) || defined(_M_X64) ||                         \
+     (defined(_M_IX86_FP) && _M_IX86_FP == 2))
+#define ADA_SSE2 1
+#endif
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define ADA_NEON 1
+#endif
+
 #endif  // ADA_COMMON_DEFS_H
 /* end file include/ada/common_defs.h */
 #include
@@ -4320,7 +4331,7 @@ std::string to_unicode(std::string_view input);
  * @attention The has_tabs_or_newline function is a bottleneck and it is simple
  * enough that compilers like GCC can 'autovectorize it'.
  */
-ada_really_inline constexpr bool has_tabs_or_newline(
+ada_really_inline bool has_tabs_or_newline(
     std::string_view user_input) noexcept;
 
 /**
@@ -6473,14 +6484,14 @@ inline std::ostream &operator<<(std::ostream &out,
 #ifndef ADA_ADA_VERSION_H
 #define ADA_ADA_VERSION_H
 
-#define ADA_VERSION "2.4.0"
+#define ADA_VERSION "2.4.1"
 
 namespace ada {
 
 enum {
   ADA_VERSION_MAJOR = 2,
   ADA_VERSION_MINOR = 4,
-  ADA_VERSION_REVISION = 0,
+  ADA_VERSION_REVISION = 1,
 };
 
 }  // namespace ada
@@ -6508,11 +6519,11 @@ using result = tl::expected;
 
 /**
  * The URL parser takes a scalar value string input, with an optional null or
- * base URL base (default null). The parser assumes the input has an UTF-8
- * encoding.
+ * base URL base (default null). The parser assumes the input is a valid ASCII
+ * or UTF-8 string.
  *
- * @param input the string input to analyze.
- * @param base_url the optional string input to use as a base url.
+ * @param input the string input to analyze (must be valid ASCII or UTF-8)
+ * @param base_url the optional URL input to use as a base url.
  * @return a parsed URL.
  */
 template
@@ -6525,6 +6536,8 @@ extern template ada::result parse(
     std::string_view input, const url_aggregator* base_url);
 
 /**
+ * Verifies whether the URL strings can be parsed. The function assumes
+ * that the inputs are valid ASCII or UTF-8 strings.
  * @see https://url.spec.whatwg.org/#dom-url-canparse
  * @return If URL can be parsed or not.
  */
@@ -6532,7 +6545,8 @@ bool can_parse(std::string_view input,
                const std::string_view* base_input = nullptr);
 
 /**
- * Computes a href string from a file path.
+ * Computes a href string from a file path. The function assumes
+ * that the input is a valid ASCII or UTF-8 string.
  * @return a href string (starts with file:://)
  */
 std::string href_from_file(std::string_view path);
diff --git a/deps/ada_c.h b/deps/ada_c.h
index 669f2d4..f8bcbdc 100644
--- a/deps/ada_c.h
+++ b/deps/ada_c.h
@@ -38,7 +38,7 @@ typedef struct {
 
 typedef void* ada_url;
 
-// input should be a null terminated C string
+// input should be a null terminated C string (ASCII or UTF-8)
 // you must call ada_free on the returned pointer
 ada_url ada_parse(const char* input, size_t length);
 ada_url ada_parse_with_base(const char* input, size_t input_length,