Skip to content

join: Use hash lookup table by default #62

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

jpsamaroo
Copy link
Member

This should make joins significantly faster in the common case.

Comment on lines 46 to 94
function match_inner_indices(l, r, l_ind::NTuple{N,Int}, r_ind::NTuple{N,Int}) where {N}
# Use the smaller table to construct the lookup table
l_bigger = length(rows(l)) >= length(rows(r))
if l_bigger
build_table = r
build_ind = r_ind
probe_table = l
probe_ind = l_ind
else
build_table = l
build_ind = l_ind
probe_table = r
probe_ind = r_ind
end

# Construct the lookup table
row_tuple = (row, cols) -> ([getcolumn(row, x) for x in cols]...,)
row_type = Base.to_tuple_type([eltype(getcolumn(build_table, ind)) for ind in build_ind])
lookup = Dict{row_type,Vector{UInt}}()
for (idx, row) in enumerate(rows(build_table))
key = row_tuple(row, build_ind)
idxs = get!(Vector{UInt}, lookup, key)
push!(idxs, idx)
end

# Find rows in the larger (probe) table that match with the build table entries
l_length = length(rows(l))
vl = Vector{UInt}()
vr = Vector{UInt}()
sizehint!(vl, l_length)
sizehint!(vr, l_length)
for (oind, oel) in enumerate(rows(l))
for (iind, iel) in enumerate(rows(r))
if compare_rows_eq(oel, iel, l_ind, r_ind)
push!(vl, oind)
push!(vr, iind)
for (idx, row) in enumerate(rows(probe_table))
key = row_tuple(row, probe_ind)
if haskey(lookup, key)
build_idxs = lookup[key]
for build_idx in build_idxs
if l_bigger
push!(vl, idx)
push!(vr, build_idx)
else
push!(vl, build_idx)
push!(vr, idx)
end
end
end
end

return vl, vr
end
Copy link
Member

@krynju krynju Dec 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something like this would work better for me

  1. we don't introduce new names by renaming existing args
  2. we if once on the l_bigger condition instead of running the if in the hot loop
  3. sizehints are correct now (they were missing an if on l_bigger)

let me know what you think

function match_inner_indices(l, r, l_ind::NTuple{N,Int}, r_ind::NTuple{N,Int}) where {N}
    # Use the smaller table to construct the lookup table
    vl, vr = if length(rows(l)) >= length(rows(r))
        match_inner_indices_hash(l, r, l_ind, r_ind)
    else
        match_inner_indices_hash(r, l, r_ind, l_ind)
    end
    return vl, vr
end

function match_inner_indices_hash(l, r, l_ind::NTuple{N,Int}, r_ind::NTuple{N,Int}) where {N}
    # assume left is the bigger table
    # Construct the lookup table
    row_tuple = (row, cols) -> ([getcolumn(row, x) for x in cols]...,)
    row_type = Base.to_tuple_type([eltype(getcolumn(r, ind)) for ind in r_ind])
    lookup = Dict{row_type,Vector{UInt}}()
    for (idx, row) in enumerate(rows(r))
        key = row_tuple(row, r_ind)
        idxs = get!(Vector{UInt}, lookup, key)
        push!(idxs, idx)
    end

    # Find rows in the larger (probe) table that match with the build table entries
    l_length = length(rows(l))
    vl = Vector{UInt}()
    vr = Vector{UInt}()
    sizehint!(vl, l_length)
    sizehint!(vr, l_length)

    for (idx, row) in enumerate(rows(l))
        key = row_tuple(row, l_ind)
        if haskey(lookup, key)
            build_idxs = lookup[key]
            for build_idx in build_idxs
                push!(vl, idx)
                push!(vr, build_idx)
            end
        end
    end

    return vl, vr
end

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants