join: Use hash lookup table by default #62

jpsamaroo · 2023-12-05T15:17:33Z

This should make joins significantly faster in the common case.

krynju · 2023-12-10T16:53:08Z

src/operations/join.jl

 function match_inner_indices(l, r, l_ind::NTuple{N,Int}, r_ind::NTuple{N,Int}) where {N}
+    # Use the smaller table to construct the lookup table
+    l_bigger = length(rows(l)) >= length(rows(r))
+    if l_bigger
+        build_table = r
+        build_ind = r_ind
+        probe_table = l
+        probe_ind = l_ind
+    else
+        build_table = l
+        build_ind = l_ind
+        probe_table = r
+        probe_ind = r_ind
+    end
+
+    # Construct the lookup table
+    row_tuple = (row, cols) -> ([getcolumn(row, x) for x in cols]...,)
+    row_type = Base.to_tuple_type([eltype(getcolumn(build_table, ind)) for ind in build_ind])
+    lookup = Dict{row_type,Vector{UInt}}()
+    for (idx, row) in enumerate(rows(build_table))
+        key = row_tuple(row, build_ind)
+        idxs = get!(Vector{UInt}, lookup, key)
+        push!(idxs, idx)
+    end
+
+    # Find rows in the larger (probe) table that match with the build table entries
    l_length = length(rows(l))
    vl = Vector{UInt}()
    vr = Vector{UInt}()
    sizehint!(vl, l_length)
    sizehint!(vr, l_length)
-    for (oind, oel) in enumerate(rows(l))
-        for (iind, iel) in enumerate(rows(r))
-            if compare_rows_eq(oel, iel, l_ind, r_ind)
-                push!(vl, oind)
-                push!(vr, iind)
+    for (idx, row) in enumerate(rows(probe_table))
+        key = row_tuple(row, probe_ind)
+        if haskey(lookup, key)
+            build_idxs = lookup[key]
+            for build_idx in build_idxs
+                if l_bigger
+                    push!(vl, idx)
+                    push!(vr, build_idx)
+                else
+                    push!(vl, build_idx)
+                    push!(vr, idx)
+                end
            end
        end
    end
+
    return vl, vr
 end


Something like this would work better for me

We don't introduce new names (`build_table`, `probe_table`, …) that merely rename the existing arguments.

We branch once on the `l_bigger` condition, instead of evaluating the `if` inside the hot loop.

sizehints are correct now (they were missing an if on l_bigger)

let me know what you think

"""
    match_inner_indices(l, r, l_ind, r_ind)

Compute the matching row pairs for an inner join of tables `l` and `r`.

`l_ind` and `r_ind` are the positions of the key columns in `l` and `r`.
Returns `(vl, vr)`, two `Vector{UInt}` of equal length, where row `vl[k]` of `l` and
row `vr[k]` of `r` have matching keys (matching is decided by `Dict` key lookup).

The smaller table is used to build the hash lookup; pairs are emitted in the row order
of the larger (probed) table.
"""
function match_inner_indices(l, r, l_ind::NTuple{N,Int}, r_ind::NTuple{N,Int}) where {N}
    # Use the smaller table to construct the lookup table
    if length(rows(l)) >= length(rows(r))
        vl, vr = match_inner_indices_hash(l, r, l_ind, r_ind)
    else
        # The helper is called with the tables swapped, so its first result holds row
        # numbers of `r` and its second holds row numbers of `l`: swap them back.
        vr, vl = match_inner_indices_hash(r, l, r_ind, l_ind)
    end
    return vl, vr
end

"""
    match_inner_indices_hash(l, r, l_ind, r_ind)

Hash-join kernel: build a lookup table from the rows of `r` and probe it with the rows
of `l`. The caller is expected to pass the bigger table as `l`.

Returns `(vl, vr)` where `vl` holds row numbers of `l` and `vr` the matching row numbers
of `r`, both as `Vector{UInt}`.
"""
function match_inner_indices_hash(
    l, r, l_ind::NTuple{N,Int}, r_ind::NTuple{N,Int}
) where {N}
    # `cols` is an NTuple, so `map` yields the key tuple directly without allocating
    # a temporary Vector and splatting it.
    row_key(row, cols) = map(c -> getcolumn(row, c), cols)

    # Construct the lookup table: key tuple => row numbers of `r` carrying that key
    key_type = Tuple{map(ind -> eltype(getcolumn(r, ind)), r_ind)...}
    lookup = Dict{key_type,Vector{UInt}}()
    for (idx, row) in enumerate(rows(r))
        push!(get!(Vector{UInt}, lookup, row_key(row, r_ind)), idx)
    end

    # Find rows in the larger (probe) table that match with the build table entries
    l_length = length(rows(l))
    vl = Vector{UInt}()
    vr = Vector{UInt}()
    sizehint!(vl, l_length)
    sizehint!(vr, l_length)
    for (idx, row) in enumerate(rows(l))
        # A single `get` replaces `haskey` followed by indexing (one hash probe, not two)
        build_idxs = get(lookup, row_key(row, l_ind), nothing)
        build_idxs === nothing && continue
        for build_idx in build_idxs
            push!(vl, idx)
            push!(vr, build_idx)
        end
    end

    return vl, vr
end

join: Use hash lookup table by default

7677813

jpsamaroo added the performance label Dec 5, 2023

jpsamaroo requested a review from krynju December 5, 2023 15:17

krynju reviewed Dec 10, 2023

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

join: Use hash lookup table by default #62

join: Use hash lookup table by default #62

Uh oh!

jpsamaroo commented Dec 5, 2023

Uh oh!

krynju Dec 10, 2023 •

edited

Loading

Uh oh!

Uh oh!

join: Use hash lookup table by default #62

Are you sure you want to change the base?

join: Use hash lookup table by default #62

Uh oh!

Conversation

jpsamaroo commented Dec 5, 2023

Uh oh!

krynju Dec 10, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

krynju Dec 10, 2023 •

edited

Loading