Skip to content

Commit 257227b

Browse files
committed
Strings: added support for UTF8 offsets in regexp
1 parent 0bbd296 commit 257227b

File tree

5 files changed

+114
-12
lines changed

5 files changed

+114
-12
lines changed

src/Utils/Strings.php

Lines changed: 56 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -477,11 +477,17 @@ public static function split(
477477
bool|int $captureOffset = false,
478478
bool $skipEmpty = false,
479479
int $limit = -1,
480+
bool $utf8 = false,
480481
): array {
481482
$flags = is_int($captureOffset) && $captureOffset // back compatibility
482483
? $captureOffset
483484
: ($captureOffset ? PREG_SPLIT_OFFSET_CAPTURE : 0) | ($skipEmpty ? PREG_SPLIT_NO_EMPTY : 0);
484-
return self::pcre('preg_split', [$pattern, $subject, $limit, $flags | PREG_SPLIT_DELIM_CAPTURE]);
485+
$pattern .= $utf8 ? 'u' : '';
486+
$m = self::pcre('preg_split', [$pattern, $subject, $limit, $flags | PREG_SPLIT_DELIM_CAPTURE]);
487+
if ($utf8 && ($flags & PREG_SPLIT_OFFSET_CAPTURE)) {
488+
return self::bytesToChars($subject, [$m])[0];
489+
}
490+
return $m;
485491
}
486492

487493

@@ -494,16 +500,25 @@ public static function match(
494500
bool|int $captureOffset = false,
495501
int $offset = 0,
496502
bool $unmatchedAsNull = false,
503+
bool $utf8 = false,
497504
): ?array {
498505
$flags = is_int($captureOffset) && $captureOffset // back compatibility
499506
? $captureOffset
500507
: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
508+
if ($utf8) {
509+
$offset = strlen(self::substring($subject, 0, $offset));
510+
$pattern .= 'u';
511+
}
501512
if ($offset > strlen($subject)) {
502513
return null;
503514
}
504-
return self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])
505-
? $m
506-
: null;
515+
if (!self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])) {
516+
return null;
517+
}
518+
if ($utf8 && ($flags & PREG_OFFSET_CAPTURE)) {
519+
return self::bytesToChars($subject, [$m])[0];
520+
}
521+
return $m;
507522
}
508523

509524

@@ -518,10 +533,15 @@ public static function matchAll(
518533
int $offset = 0,
519534
bool $unmatchedAsNull = false,
520535
bool $patternOrder = false,
536+
bool $utf8 = false,
521537
): array {
522538
$flags = is_int($captureOffset) && $captureOffset // back compatibility
523539
? $captureOffset
524540
: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0) | ($patternOrder ? PREG_PATTERN_ORDER : 0);
541+
if ($utf8) {
542+
$offset = strlen(self::substring($subject, 0, $offset));
543+
$pattern .= 'u';
544+
}
525545
if ($offset > strlen($subject)) {
526546
return [];
527547
}
@@ -530,6 +550,9 @@ public static function matchAll(
530550
($flags & PREG_PATTERN_ORDER) ? $flags : ($flags | PREG_SET_ORDER),
531551
$offset,
532552
]);
553+
if ($utf8 && ($flags & PREG_OFFSET_CAPTURE)) {
554+
return self::bytesToChars($subject, $m);
555+
}
533556
return $m;
534557
}
535558

@@ -544,23 +567,52 @@ public static function replace(
544567
int $limit = -1,
545568
bool $captureOffset = false,
546569
bool $unmatchedAsNull = false,
570+
bool $utf8 = false,
547571
): string {
548572
if (is_object($replacement) || is_array($replacement)) {
549573
if (!is_callable($replacement, false, $textual)) {
550574
throw new Nette\InvalidStateException("Callback '$textual' is not callable.");
551575
}
552576
$flags = ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
577+
if ($utf8) {
578+
$pattern .= 'u';
579+
if ($captureOffset) {
580+
$replacement = fn($m) => $replacement(self::bytesToChars($subject, [$m])[0]);
581+
}
582+
}
553583
return self::pcre('preg_replace_callback', [$pattern, $replacement, $subject, $limit, 0, $flags]);
554584

555585
} elseif (is_array($pattern) && is_string(key($pattern))) {
556586
$replacement = array_values($pattern);
557587
$pattern = array_keys($pattern);
558588
}
559589

590+
if ($utf8) {
591+
$pattern = array_map(fn($item) => $item . 'u', (array) $pattern);
592+
}
593+
560594
return self::pcre('preg_replace', [$pattern, $replacement, $subject, $limit]);
561595
}
562596

563597

598+
/**
 * Rewrites the byte offsets stored in offset-capturing regexp results as character offsets.
 *
 * Each entry is expected to be a pair [captured text, byte offset], as produced by the
 * preg_* functions with an offset-capture flag. The text in index 0 is left untouched;
 * only the offset in index 1 is replaced.
 *
 * The conversion is incremental: it remembers the last byte offset it converted and the
 * matching character offset, and only measures the slice of $s between that position and
 * the next offset. Offsets may move forward or backward between consecutive entries.
 *
 * @param  string  $s       subject string the byte offsets refer to
 * @param  array   $groups  list of match lists, each item being [text, byte offset]
 * @return array   the same structure with index 1 of every item converted
 */
private static function bytesToChars(string $s, array $groups): array
{
    $lastBytes = $lastChars = 0;
    foreach ($groups as &$matches) {
        foreach ($matches as &$match) {
            $bytes = $match[1];
            if ($bytes < 0) {
                // PCRE reports a subpattern that did not participate in the match with
                // offset -1. It has no position, so keep it as is and do not let it
                // disturb the running counters (substr() would treat -1 as "from the end").
                continue;
            }

            if ($bytes > $lastBytes) {
                // moved forward: add the characters between the previous and current offset
                $lastChars += self::length(substr($s, $lastBytes, $bytes - $lastBytes));
            } elseif ($bytes < $lastBytes) {
                // moved backward (e.g. a group starting before the previous entry): subtract
                $lastChars -= self::length(substr($s, $bytes, $lastBytes - $bytes));
            }

            $lastBytes = $bytes;
            $match[1] = $lastChars;
        }
        unset($match); // drop the reference so the last item is not aliased
    }
    unset($matches);

    return $groups;
}
614+
615+
564616
/** @internal */
565617
public static function pcre(string $func, array $args)
566618
{

tests/Utils/Strings.match().phpt

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,22 @@ Assert::same(['hell', 'l'], Strings::match('hello world!', '#([e-l])+#'));
1919

2020
Assert::same(['hell'], Strings::match('hello world!', '#[e-l]+#'));
2121

22-
Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', PREG_OFFSET_CAPTURE));
23-
Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', captureOffset: true));
22+
Assert::same([['l', 2]], Strings::match('žluťoučký kůň', '#[e-l]+#u', PREG_OFFSET_CAPTURE));
23+
Assert::same([['l', 2]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true));
2424

25+
Assert::same([['l', 1]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8: true));
2526
Assert::same(['e', null], Strings::match('hello world!', '#e(x)*#', unmatchedAsNull: true));
2627
Assert::same(['e', null], Strings::match('hello world!', '#e(x)*#', 0, 0, unmatchedAsNull: true)); // $flags = 0
2728

2829
Assert::same(['ll'], Strings::match('hello world!', '#[e-l]+#', offset: 2));
2930

31+
Assert::same(['l'], Strings::match('žluťoučký kůň', '#[e-l]+#u', offset: 2));
32+
33+
Assert::same(['k'], Strings::match('žluťoučký kůň', '#[e-l]+#u', utf8: true, offset: 2));
34+
35+
Assert::same(['žluťoučký'], Strings::match('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier
36+
37+
Assert::same([['k', 7]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8: true, offset: 2));
38+
3039
Assert::null(Strings::match('hello world!', '', offset: 50));
3140
Assert::null(Strings::match('', '', offset: 1));

tests/Utils/Strings.matchAll().phpt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,31 @@ Assert::same([
4545
[['u', 3], ['u', 7], ['', 11], ['', 15]],
4646
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', PREG_OFFSET_CAPTURE | PREG_PATTERN_ORDER));
4747

48+
Assert::same([
49+
[['lu', 1], ['l', 1], ['u', 2]],
50+
[['ou', 4], ['o', 4], ['u', 5]],
51+
[['k', 7], ['k', 7], ['', 8]],
52+
[['k', 10], ['k', 10], ['', 11]],
53+
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, utf8: true));
54+
4855
Assert::same([
4956
[['lu', 2], ['ou', 6], ['k', 10], ['k', 14]],
5057
[['l', 2], ['o', 6], ['k', 10], ['k', 14]],
5158
[['u', 3], ['u', 7], ['', 11], ['', 15]],
5259
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, patternOrder: true));
5360

61+
Assert::same([
62+
[['lu', 1], ['ou', 4], ['k', 7], ['k', 10]],
63+
[['l', 1], ['o', 4], ['k', 7], ['k', 10]],
64+
[['u', 2], ['u', 5], ['', 8], ['', 11]],
65+
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, patternOrder: true, utf8: true));
66+
5467
Assert::same([['l'], ['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', offset: 2));
5568

69+
Assert::same([['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', offset: 2, utf8: true));
70+
71+
Assert::same([['žluťoučký'], ['kůň']], Strings::matchAll('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier
72+
5673
Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', PREG_PATTERN_ORDER, 2));
5774
Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', offset: 2, patternOrder: true));
5875

tests/Utils/Strings.replace().phpt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,10 @@ Assert::same(' !', Strings::replace('hello world!', ['#\w#']));
3737

3838
// flags & callback
3939
Assert::same('hell0o worl9d!', Strings::replace('hello world!', '#[e-l]+#', fn($m) => implode($m[0]), captureOffset: true));
40+
Assert::same('žl1uťoučk7ý k10ůň!', Strings::replace('žluťoučký kůň!', '#[e-l]+#u', fn($m) => implode($m[0]), captureOffset: true, utf8: true));
4041
Strings::replace('hello world!', '#e(x)*#', fn($m) => Assert::null($m[1]), unmatchedAsNull: true);
42+
43+
// utf-8 without modifier
44+
Assert::same('* *', Strings::replace('žluťoučký kůň', '#\w+#', fn() => '*', utf8: true));
45+
Assert::same('* *', Strings::replace('žluťoučký kůň', '#\w+#', '*', utf8: true));
46+
Assert::same('* *', Strings::replace('žluťoučký kůň', ['#\w+#'], '*', utf8: true));

tests/Utils/Strings.split().phpt

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,29 @@ Assert::same([
4646
], Strings::split('a, b, c', '#(,)\s*#', PREG_SPLIT_OFFSET_CAPTURE));
4747

4848
Assert::same([
49-
['a', 0],
50-
[',', 1],
51-
['b', 3],
52-
[',', 4],
53-
['c', 6],
54-
], Strings::split('a, b, c', '#(,)\s*#', captureOffset: true));
49+
['ž', 0],
50+
['lu', 2],
51+
['ť', 4],
52+
['ou', 6],
53+
['č', 8],
54+
['k', 10],
55+
['ý ', 11],
56+
['k', 14],
57+
['ůň', 15],
58+
], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true));
59+
60+
Assert::same([
61+
['ž', 0],
62+
['lu', 1],
63+
['ť', 3],
64+
['ou', 4],
65+
['č', 6],
66+
['k', 7],
67+
['ý ', 8],
68+
['k', 10],
69+
['ůň', 11],
70+
], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true, utf8: true));
71+
72+
Assert::same(['', ' ', ''], Strings::split('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier
5573

5674
Assert::same(['a', ',', 'b, c'], Strings::split('a, b, c', '#(,)\s*#', limit: 2));

0 commit comments

Comments
 (0)