Skip to content

Commit c1d13f4

Browse files
authored
feat: add support for named vectorizers to ai.vectorizer_errors (#740)
1 parent 647985e commit c1d13f4

File tree

9 files changed

+258
-5
lines changed

9 files changed

+258
-5
lines changed

projects/pgai/db/sql/idempotent/999-privileges.sql

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ begin
44
if not admin then
55
execute 'grant usage, create on schema ai to ' || to_user;
66
execute 'grant select, insert, update, delete on table ai.vectorizer to ' || to_user;
7+
execute 'grant select on ai._vectorizer_errors to ' || to_user;
78
execute 'grant select on ai.vectorizer_errors to ' || to_user;
89
execute 'grant select on ai.vectorizer_status to ' || to_user;
910
execute 'grant select, usage on sequence ai.vectorizer_id_seq to ' || to_user;
@@ -13,6 +14,7 @@ begin
1314
execute 'grant all privileges on table ai.pgai_lib_version to ' || to_user;
1415
execute 'grant all privileges on table ai.pgai_lib_feature_flag to ' || to_user;
1516
execute 'grant all privileges on table ai.vectorizer to ' || to_user;
17+
execute 'grant all privileges on table ai._vectorizer_errors to ' || to_user;
1618
execute 'grant all privileges on table ai.vectorizer_errors to ' || to_user;
1719
execute 'grant all privileges on table ai.vectorizer_status to ' || to_user;
1820
execute 'grant all privileges on sequence ai.vectorizer_id_seq to ' || to_user;
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
-- rename the ai.vectorizer_errors table to ai._vectorizer_errors
2+
alter table ai.vectorizer_errors rename to _vectorizer_errors;
3+
4+
-- rename the existing index on the ai.vectorizer_error so it follows the right naming convention (adds the _ prefix)
5+
-- this is not strictly necessary, but it is a good practice to keep the naming consistent
6+
alter index ai.vectorizer_errors_id_recorded_idx rename to _vectorizer_errors_id_recorded_idx;
7+
8+
-- create a view including vectorizer name
9+
create or replace view ai.vectorizer_errors as
10+
select
11+
ve.*,
12+
v.name
13+
from
14+
ai._vectorizer_errors ve
15+
left join ai.vectorizer v on ve.id = v.id;
16+
17+
18+
-- grant privileges on new ai.vectorizer_errors view
19+
do language plpgsql $block$
20+
declare
21+
to_user text;
22+
priv_type text;
23+
with_grant text;
24+
rec record;
25+
begin
26+
-- find all users that have permissions on old ai.vectorizer_errors table and grant them to the view
27+
for rec in
28+
select distinct grantee as username, privilege_type, is_grantable
29+
from information_schema.role_table_grants
30+
where table_schema = 'ai'
31+
and table_name = '_vectorizer_errors'
32+
loop
33+
to_user := rec.username;
34+
priv_type := rec.privilege_type;
35+
with_grant := '';
36+
if rec.is_grantable then
37+
with_grant := ' WITH GRANT OPTION';
38+
end if;
39+
execute format('GRANT %s ON ai.vectorizer_errors TO %I %s', priv_type, to_user, with_grant);
40+
end loop;
41+
end
42+
$block$;

projects/pgai/db/tests/vectorizer/test_named_vectorizer.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,20 @@ def test_named_vectorizer():
144144
vectorizer_name = cur.fetchone()[0]
145145
assert vectorizer_name == "website_blog_embedding1"
146146

147+
# Test fetching errors by vectorizer name
148+
cur.execute(
149+
"insert into ai._vectorizer_errors (id, message) values (%s, %s)",
150+
(vectorizer_id_2, "test error message"),
151+
)
152+
153+
cur.execute(
154+
"select * from ai.vectorizer_errors where name = %s",
155+
(vectorizer_name,),
156+
)
157+
158+
error = cur.fetchone()
159+
assert error.message == "test error message"
160+
147161
# create a vectorizer with no name and check the default name
148162
cur.execute("""
149163
select ai.create_vectorizer

projects/pgai/pgai/data/ai.sql

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1103,6 +1103,74 @@ begin
11031103
end;
11041104
$outer_migration_block$;
11051105

1106+
-------------------------------------------------------------------------------
1107+
-- 030-add_vectorizer_errors_view.sql
1108+
do $outer_migration_block$ /*030-add_vectorizer_errors_view.sql*/
1109+
declare
1110+
_sql text;
1111+
_migration record;
1112+
_migration_name text = $migration_name$030-add_vectorizer_errors_view.sql$migration_name$;
1113+
_migration_body text =
1114+
$migration_body$
1115+
-- rename the ai.vectorizer_errors table to ai._vectorizer_errors
1116+
alter table ai.vectorizer_errors rename to _vectorizer_errors;
1117+
1118+
-- rename the existing index on the ai.vectorizer_error so it follows the right naming convention (adds the _ prefix)
1119+
-- this is not strictly necessary, but it is a good practice to keep the naming consistent
1120+
alter index ai.vectorizer_errors_id_recorded_idx rename to _vectorizer_errors_id_recorded_idx;
1121+
1122+
-- create a view including vectorizer name
1123+
create or replace view ai.vectorizer_errors as
1124+
select
1125+
ve.*,
1126+
v.name
1127+
from
1128+
ai._vectorizer_errors ve
1129+
left join ai.vectorizer v on ve.id = v.id;
1130+
1131+
1132+
-- grant privileges on new ai.vectorizer_errors view
1133+
do language plpgsql $block$
1134+
declare
1135+
to_user text;
1136+
priv_type text;
1137+
with_grant text;
1138+
rec record;
1139+
begin
1140+
-- find all users that have permissions on old ai.vectorizer_errors table and grant them to the view
1141+
for rec in
1142+
select distinct grantee as username, privilege_type, is_grantable
1143+
from information_schema.role_table_grants
1144+
where table_schema = 'ai'
1145+
and table_name = '_vectorizer_errors'
1146+
loop
1147+
to_user := rec.username;
1148+
priv_type := rec.privilege_type;
1149+
with_grant := '';
1150+
if rec.is_grantable then
1151+
with_grant := ' WITH GRANT OPTION';
1152+
end if;
1153+
execute format('GRANT %s ON ai.vectorizer_errors TO %I %s', priv_type, to_user, with_grant);
1154+
end loop;
1155+
end
1156+
$block$;
1157+
$migration_body$;
1158+
begin
1159+
select * into _migration from ai.pgai_lib_migration where "name" operator(pg_catalog.=) _migration_name;
1160+
if _migration is not null then
1161+
raise notice 'migration %s already applied. skipping.', _migration_name;
1162+
if _migration.body operator(pg_catalog.!=) _migration_body then
1163+
raise warning 'the contents of migration "%s" have changed', _migration_name;
1164+
end if;
1165+
return;
1166+
end if;
1167+
_sql = pg_catalog.format(E'do /*%s*/ $migration_body$\nbegin\n%s\nend;\n$migration_body$;', _migration_name, _migration_body);
1168+
execute _sql;
1169+
insert into ai.pgai_lib_migration ("name", body, applied_at_version)
1170+
values (_migration_name, _migration_body, $version$__version__$version$);
1171+
end;
1172+
$outer_migration_block$;
1173+
11061174
--------------------------------------------------------------------------------
11071175
-- 001-chunking.sql
11081176

@@ -4222,6 +4290,7 @@ begin
42224290
if not admin then
42234291
execute 'grant usage, create on schema ai to ' || to_user;
42244292
execute 'grant select, insert, update, delete on table ai.vectorizer to ' || to_user;
4293+
execute 'grant select on ai._vectorizer_errors to ' || to_user;
42254294
execute 'grant select on ai.vectorizer_errors to ' || to_user;
42264295
execute 'grant select on ai.vectorizer_status to ' || to_user;
42274296
execute 'grant select, usage on sequence ai.vectorizer_id_seq to ' || to_user;
@@ -4231,6 +4300,7 @@ begin
42314300
execute 'grant all privileges on table ai.pgai_lib_version to ' || to_user;
42324301
execute 'grant all privileges on table ai.pgai_lib_feature_flag to ' || to_user;
42334302
execute 'grant all privileges on table ai.vectorizer to ' || to_user;
4303+
execute 'grant all privileges on table ai._vectorizer_errors to ' || to_user;
42344304
execute 'grant all privileges on table ai.vectorizer_errors to ' || to_user;
42354305
execute 'grant all privileges on table ai.vectorizer_status to ' || to_user;
42364306
execute 'grant all privileges on sequence ai.vectorizer_id_seq to ' || to_user;

projects/pgai/pgai/vectorizer/features/features.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,13 @@ def __init__(
1515
has_worker_tracking_table: bool,
1616
has_loading_retries: bool,
1717
has_reveal_secret_function: bool,
18+
has_vectorizer_errors_view: bool,
1819
) -> None:
1920
self.has_disabled_column = has_disabled_column
2021
self.has_worker_tracking_table = has_worker_tracking_table
2122
self.has_loading_retries = has_loading_retries
2223
self.has_reveal_secret_function = has_reveal_secret_function
24+
self.has_vectorizer_errors_view = has_vectorizer_errors_view
2325

2426
@classmethod
2527
def from_db(cls: type[Self], cur: psycopg.Cursor) -> Self:
@@ -62,20 +64,31 @@ def from_db(cls: type[Self], cur: psycopg.Cursor) -> Self:
6264
cur.execute(query)
6365
has_reveal_secret_function = cur.fetchone() is not None
6466

67+
# Newer versions of pgai lib have the ai.vectorizer_errors view.
68+
# The table has been renamed to ai._vectorizer_errors
69+
query = """
70+
SELECT table_name
71+
FROM information_schema.views
72+
WHERE table_schema = 'ai' AND table_name = 'vectorizer_errors';
73+
"""
74+
cur.execute(query)
75+
has_vectorizer_errors_view = cur.fetchone() is not None
76+
6577
return cls(
6678
has_disabled_column,
6779
has_worker_tracking_table,
6880
has_loading_retries,
6981
has_reveal_secret_function,
82+
has_vectorizer_errors_view,
7083
)
7184

7285
@classmethod
7386
def for_testing_latest_version(cls: type[Self]) -> Self:
74-
return cls(True, True, True, True)
87+
return cls(True, True, True, True, True)
7588

7689
@classmethod
7790
def for_testing_no_features(cls: type[Self]) -> Self:
78-
return cls(False, False, False, False)
91+
return cls(False, False, False, False, False)
7992

8093
@cached_property
8194
def disable_vectorizers(self) -> bool:

projects/pgai/pgai/vectorizer/vectorizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
SourceRow: TypeAlias = dict[str, Any]
4444

4545
DEFAULT_CONCURRENCY = 1
46+
DEFAULT_VECTORIZER_ERRORS_TABLE = "_vectorizer_errors"
4647

4748
VECTORIZER_FAILED = "vectorizer failed with unexpected error"
4849

@@ -125,7 +126,6 @@ class Vectorizer(BaseModel):
125126
source_pk (list[PkAtt]): List of primary key attributes from the source table.
126127
errors_schema (str): The schema where the error log is saved. Default is "ai".
127128
errors_table (str): The table where errors are logged.
128-
Default is "vectorizer_errors".
129129
"""
130130

131131
id: int
@@ -137,7 +137,7 @@ class Vectorizer(BaseModel):
137137
source_pk: list[PkAtt]
138138
queue_failed_table: str | None = None
139139
errors_schema: str = "ai"
140-
errors_table: str = "vectorizer_errors"
140+
errors_table: str = DEFAULT_VECTORIZER_ERRORS_TABLE
141141
schema_: str = Field(alias="schema", default="ai")
142142
table: str = "vectorizer"
143143

projects/pgai/pgai/vectorizer/worker.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from .. import __version__
1616
from .embeddings import ApiKeyMixin
1717
from .features import Features
18-
from .vectorizer import Vectorizer
18+
from .vectorizer import DEFAULT_VECTORIZER_ERRORS_TABLE, Vectorizer
1919
from .worker_tracking import WorkerTracking
2020

2121
if sys.version_info >= (3, 11):
@@ -124,6 +124,13 @@ def _get_vectorizer(self, vectorizer_id: int, features: Features) -> Vectorizer:
124124
vectorizer = row["vectorizer"]
125125
embedding = vectorizer["config"]["embedding"]
126126
vectorizer = Vectorizer.model_validate(vectorizer)
127+
128+
if (
129+
vectorizer.errors_table == DEFAULT_VECTORIZER_ERRORS_TABLE
130+
and not features.has_vectorizer_errors_view
131+
):
132+
vectorizer.errors_table = "vectorizer_errors"
133+
127134
# The Ollama API doesn't need a key, so `api_key_name` may be unset
128135
if "api_key_name" in embedding:
129136
api_key_name = embedding["api_key_name"]
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
interactions:
2+
- request:
3+
body: '{"input": ["post_1", "post_2", "post_3"], "model": "intentionally-bad-embedding-model",
4+
"dimensions": 1536, "encoding_format": "float"}'
5+
headers:
6+
accept:
7+
- application/json
8+
accept-encoding:
9+
- gzip, deflate, br, zstd
10+
connection:
11+
- keep-alive
12+
content-length:
13+
- '135'
14+
content-type:
15+
- application/json
16+
host:
17+
- api.openai.com
18+
user-agent:
19+
- AsyncOpenAI/Python 1.70.0
20+
x-stainless-arch:
21+
- arm64
22+
x-stainless-async:
23+
- async:asyncio
24+
x-stainless-lang:
25+
- python
26+
x-stainless-os:
27+
- MacOS
28+
x-stainless-package-version:
29+
- 1.70.0
30+
x-stainless-raw-response:
31+
- stream
32+
x-stainless-read-timeout:
33+
- '600'
34+
x-stainless-retry-count:
35+
- '0'
36+
x-stainless-runtime:
37+
- CPython
38+
x-stainless-runtime-version:
39+
- 3.10.15
40+
method: POST
41+
uri: https://api.openai.com/v1/embeddings
42+
response:
43+
body:
44+
string: !!binary |
45+
IbgDACCWTuUm6PQ4rrgJLvdamfPpEQSGbUltWUFbEFGGYZjAxe42Bq34IaX8BF8PAwDy8zzMJFA4
46+
pmeHnyRA/+TRDc63ULlffb/moddte3GjHfed8c7lPnKYK4gh6/kTw4xr2KBU4Yp/YR2Q1x81d2O9
47+
xrbc27GnnL2FuUiKHdxS2O8k0G9tuwPs4JDNkP2wyqj8TKj62McMAw==
48+
headers:
49+
CF-RAY:
50+
- 93fa9285b965589a-BCN
51+
Connection:
52+
- keep-alive
53+
Content-Encoding:
54+
- br
55+
Content-Type:
56+
- application/json; charset=utf-8
57+
Date:
58+
- Wed, 14 May 2025 12:59:22 GMT
59+
Server:
60+
- cloudflare
61+
Transfer-Encoding:
62+
- chunked
63+
X-Content-Type-Options:
64+
- nosniff
65+
alt-svc:
66+
- h3=":443"; ma=86400
67+
cf-cache-status:
68+
- DYNAMIC
69+
strict-transport-security:
70+
- max-age=31536000; includeSubDomains; preload
71+
vary:
72+
- Origin
73+
x-request-id:
74+
- req_c8879e03af8f9ecd9ac5178dbc89604d
75+
status:
76+
code: 404
77+
message: Not Found
78+
version: 1

projects/pgai/tests/vectorizer/cli/test_compatibility.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,30 @@ def test_080_vectorizer_definition(
4545
with conn.cursor() as cur:
4646
cur.execute("SELECT * FROM blog_embedding_store;")
4747
assert len(cur.fetchall()) == 4
48+
49+
50+
@pytest.mark.postgres_params(ai_extension_version="0.8.0")
51+
def test_errors_table_compatibility(
52+
cli_db: tuple[TestDatabase, Connection], cli_db_url: str, vcr_: Any
53+
):
54+
conn = cli_db[1]
55+
setup_source_table(conn, 3)
56+
57+
with vcr_.use_cassette("test_errors_table_compatibility.yaml"):
58+
# Create vectorizer with intentionally bad embedding model to produce an error
59+
with conn.cursor() as cur:
60+
cur.execute("""
61+
SELECT ai.create_vectorizer(
62+
'blog'::regclass,
63+
embedding =>
64+
ai.embedding_openai('intentionally-bad-embedding-model', 1536),
65+
chunking => ai.chunking_character_text_splitter('content'),
66+
formatting => ai.formatting_python_template('$chunk')
67+
);
68+
""") # type: ignore
69+
vectorizer_id = int(cur.fetchone()[0]) # type: ignore
70+
run_vectorizer_worker(cli_db_url, vectorizer_id)
71+
72+
with conn.cursor() as cur:
73+
cur.execute("SELECT * FROM ai.vectorizer_errors;")
74+
assert len(cur.fetchall()) > 0

0 commit comments

Comments
 (0)