Skip to content

Commit 50ed873

Browse files
committed
Implement #scan_integer to efficiently parse Integer
Fix: #113. This allows parsing an Integer directly from a String without first allocating a substring. Note: the implementation is limited by design. It is meant as a first step, so only the most straightforward case, base 10 integers, is supported.
1 parent 81a80a1 commit 50ed873

File tree

3 files changed

+139
-0
lines changed

3 files changed

+139
-0
lines changed

ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
import org.jruby.runtime.builtin.IRubyObject;
5555
import org.jruby.util.ByteList;
5656
import org.jruby.util.StringSupport;
57+
import org.jruby.util.ConvertBytes;
5758

5859
import java.util.Iterator;
5960

@@ -556,6 +557,47 @@ public IRubyObject peep(ThreadContext context, IRubyObject length) {
556557
return peek(context, length);
557558
}
558559

560+
@JRubyMethod(name = "scan_integer")
561+
public IRubyObject scan_integer(ThreadContext context) {
562+
final Ruby runtime = context.runtime;
563+
check(context);
564+
clearMatched();
565+
566+
if (!str.getEncoding().isAsciiCompatible()) {
567+
throw getRuntime().newEncodingCompatibilityError("ASCII incompatible encoding: " + str.getEncoding());
568+
}
569+
570+
571+
ByteList bytes = str.getByteList();
572+
int curr = this.curr;
573+
574+
int bite = bytes.get(curr);
575+
if (bite == '-' || bite == '+') {
576+
curr++;
577+
bite = bytes.get(curr);
578+
}
579+
580+
if (!(bite >= '0' && bite <= '9')) {
581+
return runtime.getNil();
582+
}
583+
584+
while (bite >= '0' && bite <= '9') {
585+
curr++;
586+
if (curr >= bytes.getRealSize()) {
587+
break;
588+
}
589+
bite = bytes.get(curr);
590+
}
591+
592+
int length = curr - this.curr;
593+
prev = this.curr;
594+
this.curr = curr;
595+
setMatched();
596+
adjustRegisters();
597+
598+
return ConvertBytes.byteListToInum(runtime, bytes, prev, curr, 10, true);
599+
}
600+
559601
@JRubyMethod(name = "unscan")
560602
public IRubyObject unscan(ThreadContext context) {
561603
check(context);

ext/strscan/strscan.c

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ static VALUE strscan_get_byte _((VALUE self));
115115
static VALUE strscan_getbyte _((VALUE self));
116116
static VALUE strscan_peek _((VALUE self, VALUE len));
117117
static VALUE strscan_peep _((VALUE self, VALUE len));
118+
static VALUE strscan_scan_integer _((VALUE self));
118119
static VALUE strscan_unscan _((VALUE self));
119120
static VALUE strscan_bol_p _((VALUE self));
120121
static VALUE strscan_eos_p _((VALUE self));
@@ -1266,6 +1267,51 @@ strscan_peep(VALUE self, VALUE vlen)
12661267
return strscan_peek(self, vlen);
12671268
}
12681269

1270+
/*
1271+
* call-seq:
1272+
* scan_integer
1273+
*
1274+
* Equivalent to #scan with a \-?\d+ pattern, and returns an Integer or nil.
1275+
*/
1276+
static VALUE
1277+
strscan_scan_integer(VALUE self)
1278+
{
1279+
struct strscanner *p;
1280+
1281+
GET_SCANNER(self, p);
1282+
CLEAR_MATCH_STATUS(p);
1283+
1284+
rb_must_asciicompat(p->str);
1285+
1286+
char *ptr = CURPTR(p);
1287+
1288+
long len = 0;
1289+
if (ptr[len] == '-' || ptr[len] == '+') {
1290+
len++;
1291+
}
1292+
1293+
if (!isdigit(ptr[len])) {
1294+
return Qnil;
1295+
}
1296+
1297+
MATCHED(p);
1298+
p->prev = p->curr;
1299+
1300+
while(isdigit(ptr[len])) {
1301+
len++;
1302+
}
1303+
1304+
VALUE buffer_v;
1305+
char *buffer = ALLOCV_N(char, buffer_v, len + 1);
1306+
1307+
MEMCPY(buffer, CURPTR(p), char, len);
1308+
buffer[len] = '\0';
1309+
VALUE integer = rb_cstr2inum(buffer, 10);
1310+
RB_GC_GUARD(buffer_v);
1311+
p->curr += len;
1312+
return integer;
1313+
}
1314+
12691315
/*
12701316
* :markup: markdown
12711317
* :include: strscan/link_refs.txt
@@ -2204,6 +2250,8 @@ Init_strscan(void)
22042250
rb_define_method(StringScanner, "peek_byte", strscan_peek_byte, 0);
22052251
rb_define_method(StringScanner, "peep", strscan_peep, 1);
22062252

2253+
rb_define_method(StringScanner, "scan_integer", strscan_scan_integer, 0);
2254+
22072255
rb_define_method(StringScanner, "unscan", strscan_unscan, 0);
22082256

22092257
rb_define_method(StringScanner, "beginning_of_line?", strscan_bol_p, 0);

test/strscan/test_stringscanner.rb

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -890,6 +890,55 @@ def test_named_captures
890890
assert_equal(9, scan.match?(/(?<f>foo)(?<r>bar)(?<z>baz)/))
891891
assert_equal({"f" => "foo", "r" => "bar", "z" => "baz"}, scan.named_captures)
892892
end
893+
894+
# Covers the basic contract of #scan_integer: nil without movement when no
# integer starts at the scan pointer, and the parsed Integer with an advanced
# position and a set match flag otherwise.
def test_scan_integer
  # No digits at all.
  scanner = create_string_scanner('abc')
  assert_nil(scanner.scan_integer)
  assert_equal(0, scanner.pos)
  refute_predicate(scanner, :matched?)

  # Unsigned digits followed by other text.
  scanner = create_string_scanner('123abc')
  assert_equal(123, scanner.scan_integer)
  assert_equal(3, scanner.pos)
  assert_predicate(scanner, :matched?)

  # Leading minus sign.
  scanner = create_string_scanner('-123abc')
  assert_equal(-123, scanner.scan_integer)
  assert_equal(4, scanner.pos)
  assert_predicate(scanner, :matched?)

  # Leading plus sign, digits running to the end of the string.
  scanner = create_string_scanner('+123')
  assert_equal(123, scanner.scan_integer)
  assert_equal(4, scanner.pos)
  assert_predicate(scanner, :matched?)

  # A sign that is not followed by a digit is not an integer.
  scanner = create_string_scanner('-abc')
  assert_nil(scanner.scan_integer)
  assert_equal(0, scanner.pos)
  refute_predicate(scanner, :matched?)

  # A 2000-digit value, compared against String#to_i.
  huge_integer = '1' * 2_000
  scanner = create_string_scanner(huge_integer)
  assert_equal(huge_integer.to_i, scanner.scan_integer)
  assert_equal(2_000, scanner.pos)
  assert_predicate(scanner, :matched?)
end
926+
927+
# After a successful #scan_integer, #unscan must put the scan pointer back
# where it was before the integer was consumed.
def test_scan_integer_unmatch
  scanner = create_string_scanner('123abc')
  assert_equal(123, scanner.scan_integer)
  assert_equal(3, scanner.pos)

  scanner.unscan
  assert_equal(0, scanner.pos)
end
935+
936+
def test_scan_integer_encoding
937+
s = create_string_scanner('123abc'.encode(Encoding::UTF_32LE))
938+
assert_raise(Encoding::CompatibilityError) do
939+
s.scan_integer
940+
end
941+
end
893942
end
894943

895944
class TestStringScanner < Test::Unit::TestCase

0 commit comments

Comments
 (0)