Skip to content

join: Use hash lookup table by default #62

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 39 additions & 6 deletions src/operations/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,55 @@ end
match_inner_indices(l, r, l_ind::NTuple{N,Int}, r_ind::NTuple{N,Int})

Returns two vectors containing indices of matched rows.
Standard non-optimized use case.
Constructs a hash table to compare matching keys.
"""
function match_inner_indices(l, r, l_ind::NTuple{N,Int}, r_ind::NTuple{N,Int}) where {N}
# Use the smaller table to construct the lookup table
l_bigger = length(rows(l)) >= length(rows(r))
if l_bigger
build_table = r
build_ind = r_ind
probe_table = l
probe_ind = l_ind
else
build_table = l
build_ind = l_ind
probe_table = r
probe_ind = r_ind
end

# Construct the lookup table
row_tuple = (row, cols) -> ([getcolumn(row, x) for x in cols]...,)
row_type = Base.to_tuple_type([eltype(getcolumn(build_table, ind)) for ind in build_ind])
lookup = Dict{row_type,Vector{UInt}}()
for (idx, row) in enumerate(rows(build_table))
key = row_tuple(row, build_ind)
idxs = get!(Vector{UInt}, lookup, key)
push!(idxs, idx)
end

# Find rows in the larger (probe) table that match with the build table entries
l_length = length(rows(l))
vl = Vector{UInt}()
vr = Vector{UInt}()
sizehint!(vl, l_length)
sizehint!(vr, l_length)
for (oind, oel) in enumerate(rows(l))
for (iind, iel) in enumerate(rows(r))
if compare_rows_eq(oel, iel, l_ind, r_ind)
push!(vl, oind)
push!(vr, iind)
for (idx, row) in enumerate(rows(probe_table))
key = row_tuple(row, probe_ind)
if haskey(lookup, key)
build_idxs = lookup[key]
for build_idx in build_idxs
if l_bigger
push!(vl, idx)
push!(vr, build_idx)
else
push!(vl, build_idx)
push!(vr, idx)
end
end
end
end

return vl, vr
end
Comment on lines 46 to 94
Copy link
Member

@krynju krynju Dec 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something like this would work better for me

  1. we don't introduce new names by renaming existing args
  2. we if once on the l_bigger condition instead of running the if in the hot loop
  3. sizehints are correct now (they were missing an if on l_bigger)

let me know what you think

function match_inner_indices(l, r, l_ind::NTuple{N,Int}, r_ind::NTuple{N,Int}) where {N}
    # Use the smaller table to construct the lookup table
    vl, vr = if length(rows(l)) >= length(rows(r))
        match_inner_indices_hash(l, r, l_ind, r_ind)
    else
        match_inner_indices_hash(r, l, r_ind, l_ind)
    end
    return vl, vr
end

function match_inner_indices_hash(l, r, l_ind::NTuple{N,Int}, r_ind::NTuple{N,Int}) where {N}
    # assume left is the bigger table
    # Construct the lookup table
    row_tuple = (row, cols) -> ([getcolumn(row, x) for x in cols]...,)
    row_type = Base.to_tuple_type([eltype(getcolumn(r, ind)) for ind in r_ind])
    lookup = Dict{row_type,Vector{UInt}}()
    for (idx, row) in enumerate(rows(r))
        key = row_tuple(row, r_ind)
        idxs = get!(Vector{UInt}, lookup, key)
        push!(idxs, idx)
    end

    # Find rows in the larger (probe) table that match with the build table entries
    l_length = length(rows(l))
    vl = Vector{UInt}()
    vr = Vector{UInt}()
    sizehint!(vl, l_length)
    sizehint!(vr, l_length)

    for (idx, row) in enumerate(rows(l))
        key = row_tuple(row, l_ind)
        if haskey(lookup, key)
            build_idxs = lookup[key]
            for build_idx in build_idxs
                push!(vl, idx)
                push!(vr, build_idx)
            end
        end
    end

    return vl, vr
end


Expand Down