1#if ADA_INCLUDE_URL_PATTERN
16namespace ada::url_pattern_helpers {
// Walks `part_list` and returns a tuple of (regular expression string, list
// of part names).
// NOTE(review): this listing drops a number of source lines (blank lines,
// comments, closing braces and some statements, including the declaration of
// `result`); the comments below describe only the statements that are visible.
18std::tuple<std::string, std::vector<std::string>>
19generate_regular_expression_and_name_list(
20 const std::vector<url_pattern_part>& part_list,
21 url_pattern_compile_component_options options) {
// Reserve 16 bytes of output per part up front.
25 result.reserve(part_list.size() * 16);
// Names collected from the parts; capacity is pre-sized to the part count.
28 std::vector<std::string> name_list{};
29 name_list.reserve(part_list.size());
// Cache for the segment wildcard regexp; it stays empty until first needed.
32 std::string segment_wildcard_regexp;
35 for (
const url_pattern_part& part : part_list) {
// Fixed text is emitted regexp-escaped. When a modifier is present the
// escaped value is followed by the modifier string.
// NOTE(review): any grouping text appended around the value is not visible.
37 if (part.type == url_pattern_part_type::FIXED_TEXT) {
39 if (part.modifier == url_pattern_part_modifier::none) {
40 result.append(escape_regexp_string(part.value));
44 result.append(escape_regexp_string(part.value));
46 result.append(convert_modifier_to_string(part.modifier));
// Record the part name, then default the regexp body to the part value.
53 name_list.push_back(part.name);
56 std::string_view regexp_value = part.value;
// Segment wildcards reuse one regexp, generated on first use from `options`.
58 if (part.type == url_pattern_part_type::SEGMENT_WILDCARD) {
60 if (segment_wildcard_regexp.empty()) {
61 segment_wildcard_regexp = generate_segment_wildcard_regexp(options);
63 regexp_value = segment_wildcard_regexp;
64 }
else if (part.type == url_pattern_part_type::FULL_WILDCARD) {
// NOTE(review): the body of the FULL_WILDCARD branch is not visible here.
// No prefix and no suffix: the regexp value is followed by the modifier
// string in both modifier cases shown below.
70 if (part.prefix.empty() && part.suffix.empty()) {
72 if (part.modifier == url_pattern_part_modifier::none ||
73 part.modifier == url_pattern_part_modifier::optional) {
76 result.append(regexp_value);
78 result.append(convert_modifier_to_string(part.modifier));
82 result.append(regexp_value);
84 result.append(convert_modifier_to_string(part.modifier));
// Presumably reached only when a prefix or suffix exists (TODO confirm): for
// none/optional modifiers emit prefix, regexp, suffix, then the modifier.
91 if (part.modifier == url_pattern_part_modifier::none ||
92 part.modifier == url_pattern_part_modifier::optional) {
95 result.append(escape_regexp_string(part.prefix));
97 result.append(regexp_value);
99 result.append(escape_regexp_string(part.suffix));
101 result.append(convert_modifier_to_string(part.modifier));
// From here on the modifier is asserted to be one of the repeating ones.
106 ADA_ASSERT_TRUE(part.modifier == url_pattern_part_modifier::zero_or_more ||
107 part.modifier == url_pattern_part_modifier::one_or_more);
// The prefix / regexp / suffix triple is emitted twice for repeating parts.
// NOTE(review): the literal text appended between these calls is elided.
119 result.append(escape_regexp_string(part.prefix));
123 result.append(regexp_value);
128 result.append(escape_regexp_string(part.suffix));
131 result.append(escape_regexp_string(part.prefix));
135 result.append(regexp_value);
140 result.append(escape_regexp_string(part.suffix));
// zero_or_more gets extra handling whose body is not visible in this listing.
145 if (part.modifier == url_pattern_part_modifier::zero_or_more) {
// Both outputs are moved into the returned tuple.
154 return {std::move(result), std::move(name_list)};
// Reports whether `input` begins like an IPv6 literal in a pattern string:
// a bare '[', a group opener followed by '[' ("{["), or an escaped '[' ("\[").
// Inputs shorter than two characters never qualify.
bool is_ipv6_address(std::string_view input) noexcept {
  if (input.size() < 2) {
    return false;
  }
  // Only the first two characters decide the outcome.
  const std::string_view head = input.substr(0, 2);
  return head.front() == '[' || head == "{[" || head == "\\[";
}
// Maps a part modifier to the string that callers in this file append to the
// regexp and pattern strings they build.
// NOTE(review): the switch header, the value returned by each case and any
// default branch are not visible in this listing; only the handled
// enumerators are shown.
172std::string_view convert_modifier_to_string(
173 url_pattern_part_modifier modifier) {
176 case url_pattern_part_modifier::zero_or_more:
179 case url_pattern_part_modifier::optional:
182 case url_pattern_part_modifier::one_or_more:
// Builds the regexp used for segment wildcards from the delimiter stored in
// `options`.
// NOTE(review): the statements after the delimiter is appended (presumably
// closing the character class) and the return are elided from this listing.
190std::string generate_segment_wildcard_regexp(
191 url_pattern_compile_component_options options) {
// Start a negated character class.
193 std::string
result =
"[^";
// The delimiter is regexp-escaped before being placed inside the class.
196 result.append(escape_regexp_string(options.get_delimiter()));
200 ada_log(
"generate_segment_wildcard_regexp result: ", result);
// Bit flags stored per byte value in char_class_table. The visible values are
// distinct powers of two so they can be OR-ed together.
// NOTE(review): the value assigned to CHAR_SIMPLE_PATHNAME is on a line that
// this listing drops.
207constexpr uint8_t CHAR_SCHEME = 1;
208constexpr uint8_t CHAR_UPPER = 2;
209constexpr uint8_t CHAR_SIMPLE_HOSTNAME = 4;
210constexpr uint8_t CHAR_SIMPLE_PATHNAME =
// 256-entry lookup table, built at compile time, giving the flag set for each
// byte value. Entries not assigned below stay zero.
213constexpr std::array<uint8_t, 256> char_class_table = []()
consteval {
214 std::array<uint8_t, 256>
table{};
// Lowercase letters: scheme, simple hostname and simple pathname.
215 for (
int c =
'a'; c <=
'z'; c++)
216 table[c] = CHAR_SCHEME | CHAR_SIMPLE_HOSTNAME | CHAR_SIMPLE_PATHNAME;
// Uppercase letters: flagged CHAR_UPPER and not flagged as simple hostname.
217 for (
int c =
'A'; c <=
'Z'; c++)
218 table[c] = CHAR_SCHEME | CHAR_UPPER | CHAR_SIMPLE_PATHNAME;
// Digits carry the same flags as lowercase letters.
219 for (
int c =
'0'; c <=
'9'; c++)
220 table[c] = CHAR_SCHEME | CHAR_SIMPLE_HOSTNAME | CHAR_SIMPLE_PATHNAME;
221 table[
'+'] = CHAR_SCHEME;
222 table[
'-'] = CHAR_SCHEME | CHAR_SIMPLE_HOSTNAME | CHAR_SIMPLE_PATHNAME;
// NOTE(review): the left-hand side of the next assignment is elided (source
// line 223); which character it targets cannot be told from here.
224 CHAR_SCHEME | CHAR_SIMPLE_HOSTNAME;
// The remaining punctuation is only valid in simple pathnames.
225 table[
'/'] = CHAR_SIMPLE_PATHNAME;
226 table[
'_'] = CHAR_SIMPLE_PATHNAME;
227 table[
'~'] = CHAR_SIMPLE_PATHNAME;
// Canonicalizes a protocol (scheme) value. Returns the resulting string, or
// errors::type_error when a character is not allowed in a scheme.
// NOTE(review): the empty-input return and the statements after `result` is
// constructed are elided from this listing.
232tl::expected<std::string, errors> canonicalize_protocol(
233 std::string_view input) {
234 ada_log(
"canonicalize_protocol called with input=", input);
235 if (input.empty()) [[unlikely]] {
// A single trailing colon is dropped before validation.
239 if (input.ends_with(
":")) {
240 input.remove_suffix(1);
// Special schemes are returned unchanged without further checks.
244 if (scheme::is_special(input)) {
245 return std::string(input);
// NOTE(review): if the original input was exactly a single colon, `input` is
// empty at this point and input[0] indexes an empty view, unless an elided
// line guards against it. Confirm.
// The first character must be scheme-valid and must not be a plus, minus,
// dot or ASCII digit.
250 uint8_t first_flags = char_class_table[
static_cast<uint8_t
>(input[0])];
251 if (!(first_flags & CHAR_SCHEME) || input[0] ==
'+' || input[0] ==
'-' ||
252 input[0] ==
'.' || unicode::is_ascii_digit(input[0])) {
253 return tl::unexpected(errors::type_error);
// Non-zero once any uppercase letter has been seen.
256 uint8_t needs_lowercase = first_flags & CHAR_UPPER;
// Every remaining character must carry the CHAR_SCHEME flag.
257 for (
size_t i = 1; i < input.size(); i++) {
258 uint8_t flags = char_class_table[
static_cast<uint8_t
>(input[i])];
259 if (!(flags & CHAR_SCHEME)) {
260 return tl::unexpected(errors::type_error);
262 needs_lowercase |= flags & CHAR_UPPER;
// No uppercase letters: the input is returned as-is.
265 if (needs_lowercase == 0) {
266 return std::string(input);
// Otherwise work on an owned copy (presumably lowercased on elided lines).
269 std::string
result(input);
// Canonicalizes a username by percent-encoding it with the
// USERINFO_PERCENT_ENCODE character set.
// NOTE(review): the empty-input return and the start of the statement that
// defines `idx` are elided from this listing.
274tl::expected<std::string, errors> canonicalize_username(
275 std::string_view input) {
277 if (input.empty()) [[unlikely]] {
282 input, character_sets::USERINFO_PERCENT_ENCODE);
// idx equal to the input size means the input is returned as an unmodified
// copy.
283 if (idx == input.size()) {
285 return std::string(input);
// Otherwise percent-encode, passing idx along to the encoder.
288 return ada::unicode::percent_encode(
289 input, character_sets::USERINFO_PERCENT_ENCODE, idx);
// Canonicalizes a password by percent-encoding it with the
// USERINFO_PERCENT_ENCODE character set; the visible statements mirror
// canonicalize_username.
// NOTE(review): the empty-input return and the start of the statement that
// defines `idx` are elided from this listing.
292tl::expected<std::string, errors> canonicalize_password(
293 std::string_view input) {
295 if (input.empty()) [[unlikely]] {
300 input, character_sets::USERINFO_PERCENT_ENCODE);
// idx equal to the input size means the input is returned as an unmodified
// copy.
301 if (idx == input.size()) {
303 return std::string(input);
// Otherwise percent-encode, passing idx along to the encoder.
306 return ada::unicode::percent_encode(
307 input, character_sets::USERINFO_PERCENT_ENCODE, idx);
// Canonicalizes a hostname. Inputs made only of CHAR_SIMPLE_HOSTNAME
// characters are returned unchanged; other inputs go through a URL object
// whose set_hostname / get_hostname do the work.
// NOTE(review): the empty-input return, the left-hand side of the loop
// statement and the creation of `url` are elided from this listing.
310tl::expected<std::string, errors> canonicalize_hostname(
311 std::string_view input) {
312 ada_log(
"canonicalize_hostname input=", input);
313 if (input.empty()) [[unlikely]] {
// Scan for any character lacking the simple-hostname flag.
318 bool needs_processing =
false;
319 for (
char c : input) {
321 !(char_class_table[
static_cast<uint8_t
>(c)] & CHAR_SIMPLE_HOSTNAME);
// Fast path: nothing needs normalizing.
323 if (!needs_processing) {
324 return std::string(input);
// A rejected hostname is reported as a type error.
336 if (!url->set_hostname(input)) {
338 return tl::unexpected(errors::type_error);
341 return std::string(url->get_hostname());
// Canonicalizes an IPv6 hostname: rejects any character that is not a square
// bracket, a colon or an ASCII hex digit, then lowercases an owned copy.
// NOTE(review): the final return is elided from this listing.
344tl::expected<std::string, errors> canonicalize_ipv6_hostname(
345 std::string_view input) {
346 ada_log(
"canonicalize_ipv6_hostname input=", input);
// The predicate is true for a disallowed character.
348 if (std::ranges::any_of(input, [](
char c) {
349 return c !=
'[' && c !=
']' && c !=
':' &&
350 !unicode::is_ascii_hex_digit(c);
352 return tl::unexpected(errors::type_error);
// Lowercase in place on a copy so the caller's view is untouched.
356 auto hostname = std::string(input);
357 unicode::to_lower_ascii(hostname.data(), hostname.size());
// Canonicalizes a port value without parsing it into an integer. On success
// the leading run of ASCII digits is returned as a string.
// NOTE(review): what is returned for empty input (before and after trimming)
// is elided from this listing.
361tl::expected<std::string, errors> canonicalize_port(
362 std::string_view port_value) {
364 if (port_value.empty()) [[unlikely]] {
// Work on an owned copy with ASCII tabs and newlines removed.
369 std::string trimmed(port_value);
370 helpers::remove_ascii_tab_or_newline(trimmed);
372 if (trimmed.empty()) {
// The value must begin with a digit.
377 if (!unicode::is_ascii_digit(trimmed.front())) {
378 return tl::unexpected(errors::type_error);
// Keep only the leading digits; anything from the first non-digit onward is
// ignored.
382 auto first_non_digit =
383 std::ranges::find_if_not(trimmed, unicode::is_ascii_digit);
384 std::string_view digits_to_parse =
385 std::string_view(trimmed.data(), first_non_digit - trimmed.begin());
// Range check without integer parsing: for exactly five digits a
// lexicographic comparison against 65535 matches the numeric one, and more
// than five digits is always rejected.
393 if (digits_to_parse.size() == 5) {
394 if (digits_to_parse >
"65535") {
395 return tl::unexpected(errors::type_error);
397 }
else if (digits_to_parse.size() > 5) {
398 return tl::unexpected(errors::type_error);
// Multi-digit values with a leading zero are rejected.
400 if (digits_to_parse[0] ==
'0' && digits_to_parse.size() > 1) {
402 return tl::unexpected(errors::type_error);
405 return std::string(digits_to_parse);
// Canonicalizes a port value in the context of a protocol. The leading digits
// are parsed into a uint16_t and compared with the special port of the
// protocol.
// NOTE(review): the empty-input returns, the handling of an empty protocol,
// the last argument of std::from_chars and the body of the default-port
// branch are elided from this listing.
// NOTE(review): no leading-zero rejection is visible here, whereas
// canonicalize_port rejects multi-digit values starting with zero. Confirm
// the difference is intended.
408tl::expected<std::string, errors> canonicalize_port_with_protocol(
409 std::string_view port_value, std::string_view protocol) {
411 if (port_value.empty()) [[unlikely]] {
// A trailing colon on the protocol is dropped.
416 if (protocol.empty()) {
418 }
else if (protocol.ends_with(
":")) {
419 protocol.remove_suffix(1);
// Work on an owned copy with ASCII tabs and newlines removed.
423 std::string trimmed(port_value);
424 helpers::remove_ascii_tab_or_newline(trimmed);
426 if (trimmed.empty()) {
// The value must begin with a digit.
431 if (!unicode::is_ascii_digit(trimmed.front())) {
432 return tl::unexpected(errors::type_error);
// Keep only the leading digits.
436 auto first_non_digit =
437 std::ranges::find_if_not(trimmed, unicode::is_ascii_digit);
438 std::string_view digits_to_parse =
439 std::string_view(trimmed.data(), first_non_digit - trimmed.begin());
442 uint16_t parsed_port{};
443 auto result = std::from_chars(digits_to_parse.data(),
444 digits_to_parse.data() + digits_to_parse.size(),
// An out-of-range result is a type error.
447 if (
result.ec == std::errc::result_out_of_range) {
448 return tl::unexpected(errors::type_error);
451 if (
result.ec == std::errc()) {
// A zero default port is treated as no default for the protocol.
453 uint16_t default_port = scheme::get_special_port(protocol);
456 if (default_port != 0 && default_port == parsed_port) {
461 return std::to_string(parsed_port);
// Any other parse outcome is a type error.
464 return tl::unexpected(errors::type_error);
// Canonicalizes a pathname. Inputs made only of CHAR_SIMPLE_PATHNAME
// characters are returned unchanged; other inputs are appended to a fake base
// URL string and the pathname of the resulting `url` object is returned.
// NOTE(review): the empty-input return, the left-hand side of the loop
// statement and the statement that creates `url` are elided from this
// listing.
467tl::expected<std::string, errors> canonicalize_pathname(
468 std::string_view input) {
469 if (input.empty()) [[unlikely]] {
// Scan for any character lacking the simple-pathname flag.
475 bool needs_processing =
false;
476 for (
char c : input) {
478 !(char_class_table[
static_cast<uint8_t
>(c)] & CHAR_SIMPLE_PATHNAME);
// Fast path: nothing needs normalizing.
480 if (!needs_processing) {
481 return std::string(input);
486 const bool leading_slash = input.starts_with(
"/");
// Without a leading slash a two-character marker is inserted in front of the
// input; it is stripped again from the result below.
489 const auto modified_value = leading_slash ?
"" :
"/-";
490 const auto full_url =
491 std::string(
"fake://fake-url") + modified_value + std::string(input);
493 const auto pathname = url->get_pathname();
// Remove the two inserted characters; a result too short to contain them is
// a type error.
496 if (!leading_slash) {
499 if (pathname.size() < 2) {
500 return tl::unexpected(errors::type_error);
502 return std::string(pathname.substr(2));
504 return std::string(pathname);
// Fallback error return; presumably reached when `url` could not be created
// (TODO confirm against the elided lines).
507 return tl::unexpected(errors::type_error);
// Canonicalizes an opaque pathname: on success returns the pathname reported
// by a `url` object, otherwise errors::type_error.
// NOTE(review): the empty-input return and the statements that create `url`
// are elided from this listing.
510tl::expected<std::string, errors> canonicalize_opaque_pathname(
511 std::string_view input) {
513 if (input.empty()) [[unlikely]] {
523 return std::string(url->get_pathname());
526 return tl::unexpected(errors::type_error);
// Canonicalizes a search (query) value: drops one leading question mark,
// removes ASCII tabs and newlines, then percent-encodes with the
// QUERY_PERCENT_ENCODE set.
// NOTE(review): the empty-value returns, the start of the statement that
// defines `idx` and the body of the idx check are elided from this listing.
529tl::expected<std::string, errors> canonicalize_search(std::string_view input) {
531 if (input.empty()) [[unlikely]] {
// input is non-empty here, so input[0] is valid.
535 std::string new_value;
536 new_value = input[0] ==
'?' ? input.substr(1) : input;
538 helpers::remove_ascii_tab_or_newline(new_value);
540 if (new_value.empty()) {
548 new_value, character_sets::QUERY_PERCENT_ENCODE);
549 if (idx == new_value.size()) {
554 return ada::unicode::percent_encode(
555 new_value, character_sets::QUERY_PERCENT_ENCODE, idx);
// Canonicalizes a hash (fragment) value: drops one leading number sign,
// removes ASCII tabs and newlines, then percent-encodes with the
// FRAGMENT_PERCENT_ENCODE set.
// NOTE(review): the empty-value returns, the start of the statement that
// defines `idx` and the body of the idx check are elided from this listing.
558tl::expected<std::string, errors> canonicalize_hash(std::string_view input) {
560 if (input.empty()) [[unlikely]] {
// input is non-empty here, so input[0] is valid.
564 std::string new_value;
565 new_value = input[0] ==
'#' ? input.substr(1) : input;
567 helpers::remove_ascii_tab_or_newline(new_value);
569 if (new_value.empty()) {
575 new_value, character_sets::FRAGMENT_PERCENT_ENCODE);
576 if (idx == new_value.size()) {
581 return ada::unicode::percent_encode(
582 new_value, character_sets::FRAGMENT_PERCENT_ENCODE, idx);
// Splits a pattern string into a list of tokens using a Tokenizer configured
// with `policy`. Returns the token list, or the error reported by
// process_tokenizing_error when it yields one.
// NOTE(review): this listing drops many source lines (closing braces,
// `continue` statements, some call arguments and several assignments); the
// comments describe only the visible statements.
585tl::expected<std::vector<token>,
errors> tokenize(std::string_view input,
586 token_policy policy) {
587 ada_log(
"tokenize input: ", input);
591 auto tokenizer = Tokenizer(input, policy);
// Main loop: one iteration per token start position.
593 while (tokenizer.index < tokenizer.input.size()) {
596 tokenizer.seek_and_get_next_code_point(tokenizer.index);
// Asterisk token.
599 if (tokenizer.code_point ==
'*') {
602 tokenizer.add_token_with_defaults(token_type::ASTERISK);
603 ada_log(
"add ASTERISK token");
// Plus and question mark share one token type.
609 if (tokenizer.code_point ==
'+' || tokenizer.code_point ==
'?') {
612 tokenizer.add_token_with_defaults(token_type::OTHER_MODIFIER);
// Backslash escape.
618 if (tokenizer.code_point ==
'\\') {
// A backslash as the very last character has nothing to escape.
621 if (tokenizer.index == tokenizer.input.size() - 1) {
624 if (
auto error = tokenizer.process_tokenizing_error(
625 tokenizer.next_index, tokenizer.index)) {
626 ada_log(
"process_tokenizing_error failed");
627 return tl::unexpected(*error);
// Remember where the escaped character starts before consuming it.
633 auto escaped_index = tokenizer.next_index;
635 tokenizer.get_next_code_point();
638 tokenizer.add_token_with_default_length(
639 token_type::ESCAPED_CHAR, tokenizer.next_index, escaped_index);
640 ada_log(
"add ESCAPED_CHAR token on next_index ", tokenizer.next_index,
641 " with escaped index ", escaped_index);
// Group open.
647 if (tokenizer.code_point ==
'{') {
650 tokenizer.add_token_with_defaults(token_type::OPEN);
651 ada_log(
"add OPEN token");
// Group close.
656 if (tokenizer.code_point ==
'}') {
659 tokenizer.add_token_with_defaults(token_type::CLOSE);
660 ada_log(
"add CLOSE token");
// Named group: a colon followed by a run of valid name code points.
665 if (tokenizer.code_point ==
':') {
667 auto name_position = tokenizer.next_index;
669 auto name_start = name_position;
// Advance name_position while code points are valid for a name.
671 while (name_position < tokenizer.input.size()) {
674 tokenizer.seek_and_get_next_code_point(name_position);
// The first code point of a name is validated with a separate flag.
677 bool first_code_point = name_position == name_start;
680 auto valid_code_point =
681 idna::valid_name_code_point(tokenizer.code_point, first_code_point);
682 ada_log(
"tokenizer.code_point=", uint32_t(tokenizer.code_point),
683 " first_code_point=", first_code_point,
684 " valid_code_point=", valid_code_point);
686 if (!valid_code_point)
break;
688 name_position = tokenizer.next_index;
// No progress means the name is empty, which is a tokenizing error.
692 if (name_position <= name_start) {
695 if (
auto error = tokenizer.process_tokenizing_error(name_start,
697 ada_log(
"process_tokenizing_error failed");
698 return tl::unexpected(*error);
706 tokenizer.add_token_with_default_length(token_type::NAME, name_position,
// Regexp group: scan from the opening parenthesis.
// NOTE(review): the depth and error bookkeeping variables are on elided
// lines.
712 if (tokenizer.code_point ==
'(') {
716 auto regexp_position = tokenizer.next_index;
718 auto regexp_start = regexp_position;
724 while (regexp_position < tokenizer.input.size()) {
727 tokenizer.seek_and_get_next_code_point(regexp_position);
// Non-ASCII code points are not allowed inside the regexp.
732 if (!unicode::is_ascii(tokenizer.code_point)) {
735 if (
auto process_error = tokenizer.process_tokenizing_error(
736 regexp_start, tokenizer.index)) {
737 return tl::unexpected(*process_error);
// The regexp must not begin with a question mark.
746 if (regexp_position == regexp_start && tokenizer.code_point ==
'?') {
749 if (
auto process_error = tokenizer.process_tokenizing_error(
750 regexp_start, tokenizer.index)) {
751 return tl::unexpected(*process_error);
// Backslash inside the regexp: the next code point is consumed with it.
759 if (tokenizer.code_point ==
'\\') {
// A trailing backslash is an error.
761 if (regexp_position == tokenizer.input.size() - 1) {
764 if (
auto process_error = tokenizer.process_tokenizing_error(
765 regexp_start, tokenizer.index)) {
766 return tl::unexpected(*process_error);
773 tokenizer.get_next_code_point();
// The escaped code point must be ASCII as well.
776 if (!unicode::is_ascii(tokenizer.code_point)) {
779 if (
auto process_error = tokenizer.process_tokenizing_error(
780 regexp_start, tokenizer.index);
781 process_error.has_value()) {
782 return tl::unexpected(*process_error);
789 regexp_position = tokenizer.next_index;
// Closing parenthesis.
794 if (tokenizer.code_point ==
')') {
800 regexp_position = tokenizer.next_index;
804 }
else if (tokenizer.code_point ==
'(') {
// Nested opening parenthesis: it must not be the last character and must be
// followed by a question mark.
810 if (regexp_position == tokenizer.input.size() - 1) {
813 if (
auto process_error = tokenizer.process_tokenizing_error(
814 regexp_start, tokenizer.index)) {
815 return tl::unexpected(*process_error);
// Peek at the following code point, then restore next_index afterwards.
822 auto temporary_position = tokenizer.next_index;
824 tokenizer.get_next_code_point();
826 if (tokenizer.code_point !=
'?') {
829 if (
auto process_error = tokenizer.process_tokenizing_error(
830 regexp_start, tokenizer.index)) {
831 return tl::unexpected(*process_error);
838 tokenizer.next_index = temporary_position;
841 regexp_position = tokenizer.next_index;
// NOTE(review): the condition guarding this error report is elided.
850 if (
auto process_error = tokenizer.process_tokenizing_error(
851 regexp_start, tokenizer.index)) {
852 return tl::unexpected(*process_error);
// The length excludes one character (presumably the closing parenthesis;
// TODO confirm). An empty regexp is an error.
857 auto regexp_length = regexp_position - regexp_start - 1;
859 if (regexp_length == 0) {
862 if (
auto process_error = tokenizer.process_tokenizing_error(
863 regexp_start, tokenizer.index)) {
864 ada_log(
"process_tokenizing_error failed");
865 return tl::unexpected(*process_error);
871 tokenizer.add_token(token_type::REGEXP, regexp_position, regexp_start,
// Anything not matched above becomes a plain CHAR token.
877 tokenizer.add_token_with_defaults(token_type::CHAR);
// After the loop an END token is appended.
881 tokenizer.add_token_with_default_length(token_type::END, tokenizer.index,
884 ada_log(
"tokenizer.token_list size is: ", tokenizer.token_list.size());
886 return tokenizer.token_list;
// Compile-time 256-entry table indexed by byte value, consulted by
// should_escape_pattern_char. The loop visits the pattern-syntax characters
// listed below.
// NOTE(review): the loop body and the return of `out` are elided from this
// listing; presumably each listed character is marked non-zero.
890constexpr std::array<uint8_t, 256> escape_pattern_table = []()
consteval {
891 std::array<uint8_t, 256> out{};
892 for (
auto& c : {
'+',
'*',
'?',
':',
'{',
'}',
'(',
')',
'\\'}) {
898constexpr bool should_escape_pattern_char(
char c) {
899 return escape_pattern_table[
static_cast<uint8_t
>(c)];
// Escapes a string for use inside a pattern string, consulting
// should_escape_pattern_char for every input character.
// NOTE(review): the empty-input return, the declaration of `result`, the
// statements that append to it and the final return are elided from this
// listing.
903std::string escape_pattern_string(std::string_view input) {
904 ada_log(
"escape_pattern_string called with input=", input);
905 if (input.empty()) [[unlikely]] {
// Reserve twice the input size.
913 result.reserve(input.size() * 2);
916 for (
const char c : input) {
917 if (should_escape_pattern_char(c)) {
// Compile-time 256-entry table indexed by byte value, consulted by
// should_escape_regexp_char. The loop visits regexp-syntax characters.
// NOTE(review): the character list is cut off after the closing square
// bracket in this listing, and the loop body and the return of `out` are
// elided as well.
929constexpr std::array<uint8_t, 256> escape_regexp_table = []()
consteval {
930 std::array<uint8_t, 256> out{};
931 for (
auto& c : {
'.',
'+',
'*',
'?',
'^',
'$',
'{',
'}',
'(',
')',
'[',
']',
938constexpr bool should_escape_regexp_char(
char c) {
939 return escape_regexp_table[(uint8_t)c];
// Escapes a string for use inside a regular expression, consulting
// should_escape_regexp_char for every input character.
// NOTE(review): the declaration of `result`, the statements that append to it
// and the final return are elided from this listing.
943std::string escape_regexp_string(std::string_view input) {
// Reserve twice the input size.
949 result.reserve(input.size() * 2);
950 for (
const char c : input) {
951 if (should_escape_regexp_char(c)) {
962std::string process_base_url_string(std::string_view input,
963 url_pattern_init::process_type type) {
965 if (type != url_pattern_init::process_type::pattern) {
966 return std::string(input);
969 return escape_pattern_string(input);
// Reports whether `input` is an absolute pathname for the given processing
// type.
// NOTE(review): the value returned for empty input is elided from this
// listing.
972constexpr bool is_absolute_pathname(
973 std::string_view input, url_pattern_init::process_type type)
noexcept {
975 if (input.empty()) [[unlikely]] {
// A leading slash is absolute for every processing type.
979 if (input.starts_with(
"/"))
return true;
// For the url type nothing else counts as absolute.
981 if (type == url_pattern_init::process_type::url)
return false;
// The remaining forms need two characters.
983 if (input.size() < 2)
return false;
// Otherwise accept a slash preceded by a backslash or an opening brace.
987 return input[1] ==
'/' && (input[0] ==
'\\' || input[0] ==
'{');
// Serializes a part list back into a pattern string, using `options` for the
// prefix code point and for generating the segment wildcard regexp.
// NOTE(review): this listing drops many source lines (the declaration of
// `result`, closing braces, `continue` statements, appended literals and the
// final return); the comments describe only the visible statements.
990std::string generate_pattern_string(
991 std::vector<url_pattern_part>& part_list,
992 url_pattern_compile_component_options& options) {
997 for (
size_t index = 0; index < part_list.size(); index++) {
1000 const auto& part = part_list[index];
// Neighbouring parts; nullptr at either end of the list.
1004 const url_pattern_part* previous_part =
1005 index == 0 ? nullptr : &part_list[index - 1];
1008 const url_pattern_part* next_part =
1009 index < part_list.size() - 1 ? &part_list[index + 1] :
nullptr;
// Fixed text is emitted pattern-escaped; with a modifier present the
// modifier string follows the escaped value.
1011 if (part.type == url_pattern_part_type::FIXED_TEXT) {
1013 if (part.modifier == url_pattern_part_modifier::none) {
1016 result.append(escape_pattern_string(part.value));
1023 result.append(escape_pattern_string(part.value));
1028 result.append(convert_modifier_to_string(part.modifier));
// A name that does not start with an ASCII digit counts as a custom name.
1033 bool custom_name = !unicode::is_ascii_digit(part.name[0]);
// Grouping is needed when there is a suffix, or when the prefix differs from
// the configured prefix.
1039 bool needs_grouping =
1040 !part.suffix.empty() ||
1041 (!part.prefix.empty() && !options.get_prefix().empty() &&
1042 part.prefix[0] != options.get_prefix()[0]);
// A custom-named, unmodified segment wildcard followed by a part without
// prefix and suffix may also need grouping, depending on that next part.
1052 if (!needs_grouping && custom_name &&
1053 part.type == url_pattern_part_type::SEGMENT_WILDCARD &&
1054 part.modifier == url_pattern_part_modifier::none && next_part &&
1055 next_part->prefix.empty() && next_part->suffix.empty()) {
// Next part is fixed text starting with a valid name code point.
1057 if (next_part->type == url_pattern_part_type::FIXED_TEXT) {
1061 if (idna::valid_name_code_point(next_part->value[0],
false)) {
1062 needs_grouping =
true;
// Otherwise group when the next part has a name starting with a digit.
1066 needs_grouping = !next_part->name.empty() &&
1067 unicode::is_ascii_digit(next_part->name[0]);
// A prefix-less part also needs grouping when the preceding fixed text ends
// with the configured prefix code point.
1078 if (!needs_grouping && part.prefix.empty() && previous_part &&
1079 previous_part->type == url_pattern_part_type::FIXED_TEXT &&
1080 !previous_part->value.empty() && !options.get_prefix().empty() &&
1081 previous_part->value.back() == options.get_prefix()[0]) {
1082 needs_grouping =
true;
// NOTE(review): the body of this branch is elided from this listing.
1089 if (needs_grouping) {
1095 result.append(escape_pattern_string(part.prefix));
// NOTE(review): the condition guarding the name output is elided.
1102 result.append(part.name);
// Regexp parts emit their raw value.
1106 if (part.type == url_pattern_part_type::REGEXP) {
1110 result.append(part.value);
1113 }
else if (part.type == url_pattern_part_type::SEGMENT_WILDCARD &&
1120 result.append(generate_segment_wildcard_regexp(options));
1123 }
else if (part.type == url_pattern_part_type::FULL_WILDCARD) {
// NOTE(review): the start of this condition is elided. The visible operands
// look at the previous part, the grouping flag and the prefix.
1134 previous_part->type == url_pattern_part_type::FIXED_TEXT ||
1135 previous_part->modifier != url_pattern_part_modifier::none ||
1136 needs_grouping || !part.prefix.empty())) {
// A custom-named segment wildcard whose suffix begins with a valid name code
// point gets extra handling (body elided).
1153 if (part.type == url_pattern_part_type::SEGMENT_WILDCARD && custom_name &&
1154 !part.suffix.empty() &&
1155 idna::valid_name_code_point(part.suffix[0],
false)) {
1161 result.append(escape_pattern_string(part.suffix));
// Close the group when one is in use, then emit the modifier string.
1163 if (needs_grouping)
result.append(
"}");
1166 result.append(convert_modifier_to_string(part.modifier));
Declaration of the character sets used by unicode functions.
#define ADA_ASSERT_TRUE(COND)
Definitions for helper functions used within Ada.
bool constexpr is_ascii(std::u32string_view view)
const uint32_t table[8198][2]
ada_really_inline size_t percent_encode_index(const std::string_view input, const uint8_t character_set[])
errors
Error codes for URL parsing operations.
template ada::result< url_aggregator > parse< url_aggregator >(std::string_view input, const url_aggregator *base_url)
tl::expected< result_type, ada::errors > result
URL scheme type definitions and utilities.
Definitions for all unicode specific functions.
Declaration for the URLPattern helpers.