Skip to content

Commit 82ef744

Browse files
committed
Optimize join conditions to use direct column references
NOTE: This PR was created with AI tools and a human.

When matching patterns like (u)-[e]->(v), join conditions previously rebuilt entire vertex/edge agtype values just to extract IDs:

    age_id(_agtype_build_vertex(r.id, ...))::graphid

Added optimize_qual_expr_mutator() to replace these patterns with direct column access:

    age_id(_agtype_build_vertex(id, ...)) -> graphid_to_agtype(id)
    age_start_id(_agtype_build_edge(...)) -> graphid_to_agtype(start)
    age_end_id(_agtype_build_edge(...)) -> graphid_to_agtype(end)
    age_properties(...) -> direct properties column

Join conditions now use efficient comparisons like (e.start_id = u.id), enabling PostgreSQL to leverage index scans on edge tables.

Added regression tests. All regression tests passed.

modified: regress/expected/unified_vertex_table.out
modified: regress/sql/unified_vertex_table.sql
modified: src/backend/parser/cypher_clause.c
1 parent e384a96 commit 82ef744

File tree

3 files changed

+338
-1
lines changed

3 files changed

+338
-1
lines changed

regress/expected/unified_vertex_table.out

Lines changed: 116 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1309,11 +1309,124 @@ $$) AS (eid agtype, props agtype, sid agtype, eid2 agtype);
13091309
11540474045136897 | {"weight": 10} | 11258999068426241 | 11821949021847553
13101310
(1 row)
13111311

1312+
--
1313+
-- Test 29: Verify join condition optimization with EXPLAIN
1314+
--
1315+
-- When vertices/edges from previous clauses are joined, the optimization
1316+
-- should replace patterns like:
1317+
-- age_id(_agtype_build_vertex(r.id, ...))::graphid
1318+
-- with direct column access:
1319+
-- r.id
1320+
--
1321+
-- This avoids expensive vertex reconstruction in join conditions.
1322+
--
1323+
-- Create test data: Users following each other
1324+
SELECT * FROM cypher('unified_test', $$
1325+
CREATE (:JoinOptUser {name: 'Alice'}),
1326+
(:JoinOptUser {name: 'Bob'}),
1327+
(:JoinOptUser {name: 'Carol'})
1328+
$$) AS (v agtype);
1329+
v
1330+
---
1331+
(0 rows)
1332+
1333+
SELECT * FROM cypher('unified_test', $$
1334+
MATCH (a:JoinOptUser {name: 'Alice'}), (b:JoinOptUser {name: 'Bob'})
1335+
CREATE (a)-[:JOPT_FOLLOWS]->(b)
1336+
$$) AS (e agtype);
1337+
e
1338+
---
1339+
(0 rows)
1340+
1341+
SELECT * FROM cypher('unified_test', $$
1342+
MATCH (b:JoinOptUser {name: 'Bob'}), (c:JoinOptUser {name: 'Carol'})
1343+
CREATE (b)-[:JOPT_FOLLOWS]->(c)
1344+
$$) AS (e agtype);
1345+
e
1346+
---
1347+
(0 rows)
1348+
1349+
-- EXPLAIN showing join conditions use direct column access
1350+
-- Look for: graphid_to_agtype(id) instead of age_id(_agtype_build_vertex(...))
1351+
-- And: direct id comparisons instead of age_id(...)::graphid
1352+
EXPLAIN (COSTS OFF)
1353+
SELECT * FROM cypher('unified_test', $$
1354+
MATCH (u:JoinOptUser)-[e:JOPT_FOLLOWS]->(v:JoinOptUser)
1355+
RETURN u.name, v.name
1356+
$$) AS (u_name agtype, v_name agtype);
1357+
QUERY PLAN
1358+
------------------------------------------------------------
1359+
Nested Loop
1360+
Join Filter: (e.start_id = u.id)
1361+
-> Nested Loop
1362+
-> Seq Scan on _ag_label_vertex u
1363+
Filter: (labels = '23814'::oid)
1364+
-> Seq Scan on _ag_label_vertex v
1365+
Filter: (labels = '23814'::oid)
1366+
-> Bitmap Heap Scan on "JOPT_FOLLOWS" e
1367+
Recheck Cond: (end_id = v.id)
1368+
-> Bitmap Index Scan on "JOPT_FOLLOWS_end_id_idx"
1369+
Index Cond: (end_id = v.id)
1370+
(11 rows)
1371+
1372+
-- Verify the query still returns correct results
1373+
SELECT * FROM cypher('unified_test', $$
1374+
MATCH (u:JoinOptUser)-[e:JOPT_FOLLOWS]->(v:JoinOptUser)
1375+
RETURN u.name, v.name
1376+
ORDER BY u.name
1377+
$$) AS (u_name agtype, v_name agtype);
1378+
u_name | v_name
1379+
---------+---------
1380+
"Alice" | "Bob"
1381+
"Bob" | "Carol"
1382+
(2 rows)
1383+
1384+
-- Multi-hop pattern showing optimization across multiple joins
1385+
EXPLAIN (COSTS OFF)
1386+
SELECT * FROM cypher('unified_test', $$
1387+
MATCH (a:JoinOptUser)-[e1:JOPT_FOLLOWS]->(b:JoinOptUser)-[e2:JOPT_FOLLOWS]->(c:JoinOptUser)
1388+
RETURN a.name, b.name, c.name
1389+
$$) AS (a_name agtype, b_name agtype, c_name agtype);
1390+
QUERY PLAN
1391+
------------------------------------------------------------------------
1392+
Nested Loop
1393+
Join Filter: (e1.start_id = a.id)
1394+
-> Nested Loop
1395+
Join Filter: _ag_enforce_edge_uniqueness2(e1.id, e2.id)
1396+
-> Nested Loop
1397+
Join Filter: (e2.start_id = b.id)
1398+
-> Nested Loop
1399+
-> Seq Scan on _ag_label_vertex b
1400+
Filter: (labels = '23814'::oid)
1401+
-> Seq Scan on _ag_label_vertex c
1402+
Filter: (labels = '23814'::oid)
1403+
-> Bitmap Heap Scan on "JOPT_FOLLOWS" e2
1404+
Recheck Cond: (end_id = c.id)
1405+
-> Bitmap Index Scan on "JOPT_FOLLOWS_end_id_idx"
1406+
Index Cond: (end_id = c.id)
1407+
-> Bitmap Heap Scan on "JOPT_FOLLOWS" e1
1408+
Recheck Cond: (end_id = b.id)
1409+
-> Bitmap Index Scan on "JOPT_FOLLOWS_end_id_idx"
1410+
Index Cond: (end_id = b.id)
1411+
-> Seq Scan on _ag_label_vertex a
1412+
Filter: (labels = '23814'::oid)
1413+
(21 rows)
1414+
1415+
-- Verify multi-hop query results
1416+
SELECT * FROM cypher('unified_test', $$
1417+
MATCH (a:JoinOptUser)-[e1:JOPT_FOLLOWS]->(b:JoinOptUser)-[e2:JOPT_FOLLOWS]->(c:JoinOptUser)
1418+
RETURN a.name, b.name, c.name
1419+
$$) AS (a_name agtype, b_name agtype, c_name agtype);
1420+
a_name | b_name | c_name
1421+
---------+--------+---------
1422+
"Alice" | "Bob" | "Carol"
1423+
(1 row)
1424+
13121425
--
13131426
-- Cleanup
13141427
--
13151428
SELECT drop_graph('unified_test', true);
1316-
NOTICE: drop cascades to 42 other objects
1429+
NOTICE: drop cascades to 44 other objects
13171430
DETAIL: drop cascades to table unified_test._ag_label_vertex
13181431
drop cascades to table unified_test._ag_label_edge
13191432
drop cascades to table unified_test."Person"
@@ -1356,6 +1469,8 @@ drop cascades to table unified_test."OptimizeTest"
13561469
drop cascades to table unified_test."OptStart"
13571470
drop cascades to table unified_test."OPT_EDGE"
13581471
drop cascades to table unified_test."OptEnd"
1472+
drop cascades to table unified_test."JoinOptUser"
1473+
drop cascades to table unified_test."JOPT_FOLLOWS"
13591474
NOTICE: graph "unified_test" has been dropped
13601475
drop_graph
13611476
------------

regress/sql/unified_vertex_table.sql

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,64 @@ SELECT * FROM cypher('unified_test', $$
809809
RETURN id(e), properties(e), start_id(e), end_id(e)
810810
$$) AS (eid agtype, props agtype, sid agtype, eid2 agtype);
811811

812+
--
813+
-- Test 29: Verify join condition optimization with EXPLAIN
814+
--
815+
-- When vertices/edges from previous clauses are joined, the optimization
816+
-- should replace patterns like:
817+
-- age_id(_agtype_build_vertex(r.id, ...))::graphid
818+
-- with direct column access:
819+
-- r.id
820+
--
821+
-- This avoids expensive vertex reconstruction in join conditions.
822+
--
823+
824+
-- Create test data: Users following each other
825+
SELECT * FROM cypher('unified_test', $$
826+
CREATE (:JoinOptUser {name: 'Alice'}),
827+
(:JoinOptUser {name: 'Bob'}),
828+
(:JoinOptUser {name: 'Carol'})
829+
$$) AS (v agtype);
830+
831+
SELECT * FROM cypher('unified_test', $$
832+
MATCH (a:JoinOptUser {name: 'Alice'}), (b:JoinOptUser {name: 'Bob'})
833+
CREATE (a)-[:JOPT_FOLLOWS]->(b)
834+
$$) AS (e agtype);
835+
836+
SELECT * FROM cypher('unified_test', $$
837+
MATCH (b:JoinOptUser {name: 'Bob'}), (c:JoinOptUser {name: 'Carol'})
838+
CREATE (b)-[:JOPT_FOLLOWS]->(c)
839+
$$) AS (e agtype);
840+
841+
-- EXPLAIN showing join conditions use direct column access
842+
-- Look for: graphid_to_agtype(id) instead of age_id(_agtype_build_vertex(...))
843+
-- And: direct id comparisons instead of age_id(...)::graphid
844+
EXPLAIN (COSTS OFF)
845+
SELECT * FROM cypher('unified_test', $$
846+
MATCH (u:JoinOptUser)-[e:JOPT_FOLLOWS]->(v:JoinOptUser)
847+
RETURN u.name, v.name
848+
$$) AS (u_name agtype, v_name agtype);
849+
850+
-- Verify the query still returns correct results
851+
SELECT * FROM cypher('unified_test', $$
852+
MATCH (u:JoinOptUser)-[e:JOPT_FOLLOWS]->(v:JoinOptUser)
853+
RETURN u.name, v.name
854+
ORDER BY u.name
855+
$$) AS (u_name agtype, v_name agtype);
856+
857+
-- Multi-hop pattern showing optimization across multiple joins
858+
EXPLAIN (COSTS OFF)
859+
SELECT * FROM cypher('unified_test', $$
860+
MATCH (a:JoinOptUser)-[e1:JOPT_FOLLOWS]->(b:JoinOptUser)-[e2:JOPT_FOLLOWS]->(c:JoinOptUser)
861+
RETURN a.name, b.name, c.name
862+
$$) AS (a_name agtype, b_name agtype, c_name agtype);
863+
864+
-- Verify multi-hop query results
865+
SELECT * FROM cypher('unified_test', $$
866+
MATCH (a:JoinOptUser)-[e1:JOPT_FOLLOWS]->(b:JoinOptUser)-[e2:JOPT_FOLLOWS]->(c:JoinOptUser)
867+
RETURN a.name, b.name, c.name
868+
$$) AS (a_name agtype, b_name agtype, c_name agtype);
869+
812870
--
813871
-- Cleanup
814872
--

src/backend/parser/cypher_clause.c

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include "parser/parsetree.h"
3939
#include "parser/parse_relation.h"
4040
#include "rewrite/rewriteHandler.h"
41+
#include "utils/lsyscache.h"
4142

4243
#include "catalog/ag_graph.h"
4344
#include "catalog/ag_label.h"
@@ -136,6 +137,7 @@ static Node *make_edge_expr(cypher_parsestate *cpstate,
136137
ParseNamespaceItem *pnsi);
137138
static Node *make_qual(cypher_parsestate *cpstate,
138139
transform_entity *entity, char *name);
140+
static Node *optimize_qual_expr_mutator(Node *node, void *context);
139141
static TargetEntry *
140142
transform_match_create_path_variable(cypher_parsestate *cpstate,
141143
cypher_path *path, List *entities);
@@ -3375,11 +3377,173 @@ static void transform_match_pattern(cypher_parsestate *cpstate, Query *query,
33753377
expr = (Expr *)coerce_to_boolean(pstate, (Node *)expr, "WHERE");
33763378
}
33773379

3380+
/*
3381+
* Apply optimization to the transformed expression tree. This looks for
3382+
* patterns like age_id(_agtype_build_vertex(...)) and replaces them with
3383+
* direct column references.
3384+
*/
3385+
if (expr != NULL)
3386+
{
3387+
expr = (Expr *)optimize_qual_expr_mutator((Node *)expr, NULL);
3388+
}
3389+
33783390
query->rtable = cpstate->pstate.p_rtable;
33793391
query->rteperminfos = cpstate->pstate.p_rteperminfos;
33803392
query->jointree = makeFromExpr(cpstate->pstate.p_joinlist, (Node *)expr);
33813393
}
33823394

3395+
/*
3396+
* optimize_qual_expr_mutator - Walk expression tree and optimize vertex/edge
3397+
* accessor patterns.
3398+
*
3399+
* This mutator looks for patterns like:
3400+
* age_id(_agtype_build_vertex(id, label, props))
3401+
* and transforms them to:
3402+
* graphid_to_agtype(id)
3403+
*
3404+
* This avoids the expensive reconstruction of vertex/edge agtype values
3405+
* just to immediately extract a single field from them. This is particularly
3406+
* important for join conditions where the vertex/edge comes from a previous
3407+
* clause.
3408+
*/
3409+
static Node *optimize_qual_expr_mutator(Node *node, void *context)
3410+
{
3411+
if (node == NULL)
3412+
{
3413+
return NULL;
3414+
}
3415+
3416+
/*
3417+
* Look for FuncExpr nodes that wrap accessor functions around
3418+
* _agtype_build_vertex or _agtype_build_edge calls.
3419+
*/
3420+
if (IsA(node, FuncExpr))
3421+
{
3422+
FuncExpr *outer_func = (FuncExpr *)node;
3423+
char *outer_func_name;
3424+
Node *arg;
3425+
FuncExpr *inner_func;
3426+
char *inner_func_name;
3427+
List *inner_args;
3428+
int arg_index = -1;
3429+
3430+
/* Must have exactly one argument */
3431+
if (list_length(outer_func->args) != 1)
3432+
{
3433+
goto recurse;
3434+
}
3435+
3436+
outer_func_name = get_func_name(outer_func->funcid);
3437+
if (outer_func_name == NULL)
3438+
{
3439+
goto recurse;
3440+
}
3441+
3442+
/* Check if this is an accessor function we can optimize */
3443+
if (strcmp(outer_func_name, "age_id") != 0 &&
3444+
strcmp(outer_func_name, "age_start_id") != 0 &&
3445+
strcmp(outer_func_name, "age_end_id") != 0 &&
3446+
strcmp(outer_func_name, "age_properties") != 0)
3447+
{
3448+
goto recurse;
3449+
}
3450+
3451+
arg = (Node *)linitial(outer_func->args);
3452+
3453+
/* The argument must be a FuncExpr (the build function) */
3454+
if (!IsA(arg, FuncExpr))
3455+
{
3456+
goto recurse;
3457+
}
3458+
3459+
inner_func = (FuncExpr *)arg;
3460+
inner_func_name = get_func_name(inner_func->funcid);
3461+
if (inner_func_name == NULL)
3462+
{
3463+
goto recurse;
3464+
}
3465+
3466+
inner_args = inner_func->args;
3467+
3468+
/*
3469+
* Check for _agtype_build_vertex(id, label_name, properties)
3470+
* Arguments: 0=id (graphid), 1=label_name (cstring), 2=properties (agtype)
3471+
*/
3472+
if (strcmp(inner_func_name, "_agtype_build_vertex") == 0 &&
3473+
list_length(inner_args) == 3)
3474+
{
3475+
if (strcmp(outer_func_name, "age_id") == 0)
3476+
{
3477+
arg_index = 0; /* id */
3478+
}
3479+
else if (strcmp(outer_func_name, "age_properties") == 0)
3480+
{
3481+
arg_index = 2; /* properties */
3482+
}
3483+
}
3484+
/*
3485+
* Check for _agtype_build_edge(id, startid, endid, label_name, properties)
3486+
* Arguments: 0=id (graphid), 1=start_id (graphid), 2=end_id (graphid),
3487+
* 3=label_name (cstring), 4=properties (agtype)
3488+
*/
3489+
else if (strcmp(inner_func_name, "_agtype_build_edge") == 0 &&
3490+
list_length(inner_args) == 5)
3491+
{
3492+
if (strcmp(outer_func_name, "age_id") == 0)
3493+
{
3494+
arg_index = 0; /* id */
3495+
}
3496+
else if (strcmp(outer_func_name, "age_start_id") == 0)
3497+
{
3498+
arg_index = 1; /* start_id */
3499+
}
3500+
else if (strcmp(outer_func_name, "age_end_id") == 0)
3501+
{
3502+
arg_index = 2; /* end_id */
3503+
}
3504+
else if (strcmp(outer_func_name, "age_properties") == 0)
3505+
{
3506+
arg_index = 4; /* properties */
3507+
}
3508+
}
3509+
3510+
/* If we found a pattern to optimize */
3511+
if (arg_index >= 0)
3512+
{
3513+
Node *extracted_arg = (Node *)list_nth(inner_args, arg_index);
3514+
3515+
/* For properties, return directly (already agtype) */
3516+
if (strcmp(outer_func_name, "age_properties") == 0)
3517+
{
3518+
return extracted_arg;
3519+
}
3520+
else
3521+
{
3522+
/*
3523+
* For graphid fields (id, start_id, end_id), we need to wrap
3524+
* in graphid_to_agtype to match the original return type.
3525+
*/
3526+
Oid cast_func_oid;
3527+
FuncExpr *cast_expr;
3528+
3529+
cast_func_oid = get_ag_func_oid("graphid_to_agtype", 1,
3530+
GRAPHIDOID);
3531+
3532+
cast_expr = makeFuncExpr(cast_func_oid, AGTYPEOID,
3533+
list_make1(extracted_arg),
3534+
InvalidOid, InvalidOid,
3535+
COERCE_EXPLICIT_CALL);
3536+
cast_expr->location = outer_func->location;
3537+
3538+
return (Node *)cast_expr;
3539+
}
3540+
}
3541+
}
3542+
3543+
recurse:
3544+
return expression_tree_mutator(node, optimize_qual_expr_mutator, context);
3545+
}
3546+
33833547
/*
33843548
* Creates a FuncCall node that will prevent an edge from being joined
33853549
* to twice.

0 commit comments

Comments
 (0)