Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions projects/pgai/db/sql/idempotent/999-privileges.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ begin
if not admin then
execute 'grant usage, create on schema ai to ' || to_user;
execute 'grant select, insert, update, delete on table ai.vectorizer to ' || to_user;
execute 'grant select on ai._vectorizer_errors to ' || to_user;
execute 'grant select on ai.vectorizer_errors to ' || to_user;
execute 'grant select on ai.vectorizer_status to ' || to_user;
execute 'grant select, usage on sequence ai.vectorizer_id_seq to ' || to_user;
Expand All @@ -13,6 +14,7 @@ begin
execute 'grant all privileges on table ai.pgai_lib_version to ' || to_user;
execute 'grant all privileges on table ai.pgai_lib_feature_flag to ' || to_user;
execute 'grant all privileges on table ai.vectorizer to ' || to_user;
execute 'grant all privileges on table ai._vectorizer_errors to ' || to_user;
execute 'grant all privileges on table ai.vectorizer_errors to ' || to_user;
execute 'grant all privileges on table ai.vectorizer_status to ' || to_user;
execute 'grant all privileges on sequence ai.vectorizer_id_seq to ' || to_user;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
-- rename the ai.vectorizer_errors table to ai._vectorizer_errors
alter table ai.vectorizer_errors rename to _vectorizer_errors;

-- rename the existing index on the ai.vectorizer_error so it follows the right naming convention (adds the _ prefix)
-- this is not strictly necessary, but it is a good practice to keep the naming consistent
alter index ai.vectorizer_errors_id_recorded_idx rename to _vectorizer_errors_id_recorded_idx;

-- create a view including vectorizer name
create or replace view ai.vectorizer_errors as
select
ve.*,
v.name
from
ai._vectorizer_errors ve
left join ai.vectorizer v on ve.id = v.id;


-- grant privileges on new ai.vectorizer_errors view
do language plpgsql $block$
declare
to_user text;
priv_type text;
with_grant text;
rec record;
begin
-- find all users that have permissions on old ai.vectorizer_errors table and grant them to the view
for rec in
select distinct grantee as username, privilege_type, is_grantable
from information_schema.role_table_grants
where table_schema = 'ai'
and table_name = '_vectorizer_errors'
loop
to_user := rec.username;
priv_type := rec.privilege_type;
with_grant := '';
if rec.is_grantable then
with_grant := ' WITH GRANT OPTION';
end if;
execute format('GRANT %s ON ai.vectorizer_errors TO %I %s', priv_type, to_user, with_grant);
end loop;
end
$block$;
14 changes: 14 additions & 0 deletions projects/pgai/db/tests/vectorizer/test_named_vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,20 @@ def test_named_vectorizer():
vectorizer_name = cur.fetchone()[0]
assert vectorizer_name == "website_blog_embedding1"

# Test fetch errors by vectorizer name
cur.execute(
"insert into ai._vectorizer_errors (id, message) values (%s, %s)",
(vectorizer_id_2, "test error message"),
)

cur.execute(
"select * from ai.vectorizer_errors where name = %s",
(vectorizer_name,),
)

error = cur.fetchone()
assert error.message == "test error message"

# create a vectorizer with no name check default name
cur.execute("""
select ai.create_vectorizer
Expand Down
70 changes: 70 additions & 0 deletions projects/pgai/pgai/data/ai.sql
Original file line number Diff line number Diff line change
Expand Up @@ -1103,6 +1103,74 @@ begin
end;
$outer_migration_block$;

-------------------------------------------------------------------------------
-- 030-add_vectorizer_errors_view.sql
do $outer_migration_block$ /*030-add_vectorizer_errors_view.sql*/
declare
_sql text;
_migration record;
_migration_name text = $migration_name$030-add_vectorizer_errors_view.sql$migration_name$;
_migration_body text =
$migration_body$
-- rename the ai.vectorizer_errors table to ai._vectorizer_errors
alter table ai.vectorizer_errors rename to _vectorizer_errors;

-- rename the existing index on the ai.vectorizer_error so it follows the right naming convention (adds the _ prefix)
-- this is not strictly necessary, but it is a good practice to keep the naming consistent
alter index ai.vectorizer_errors_id_recorded_idx rename to _vectorizer_errors_id_recorded_idx;

-- create a view including vectorizer name
create or replace view ai.vectorizer_errors as
select
ve.*,
v.name
from
ai._vectorizer_errors ve
left join ai.vectorizer v on ve.id = v.id;


-- grant privileges on new ai.vectorizer_errors view
do language plpgsql $block$
declare
to_user text;
priv_type text;
with_grant text;
rec record;
begin
-- find all users that have permissions on old ai.vectorizer_errors table and grant them to the view
for rec in
select distinct grantee as username, privilege_type, is_grantable
from information_schema.role_table_grants
where table_schema = 'ai'
and table_name = '_vectorizer_errors'
loop
to_user := rec.username;
priv_type := rec.privilege_type;
with_grant := '';
if rec.is_grantable then
with_grant := ' WITH GRANT OPTION';
end if;
execute format('GRANT %s ON ai.vectorizer_errors TO %I %s', priv_type, to_user, with_grant);
end loop;
end
$block$;
$migration_body$;
begin
select * into _migration from ai.pgai_lib_migration where "name" operator(pg_catalog.=) _migration_name;
if _migration is not null then
raise notice 'migration %s already applied. skipping.', _migration_name;
if _migration.body operator(pg_catalog.!=) _migration_body then
raise warning 'the contents of migration "%s" have changed', _migration_name;
end if;
return;
end if;
_sql = pg_catalog.format(E'do /*%s*/ $migration_body$\nbegin\n%s\nend;\n$migration_body$;', _migration_name, _migration_body);
execute _sql;
insert into ai.pgai_lib_migration ("name", body, applied_at_version)
values (_migration_name, _migration_body, $version$__version__$version$);
end;
$outer_migration_block$;

--------------------------------------------------------------------------------
-- 001-chunking.sql

Expand Down Expand Up @@ -4222,6 +4290,7 @@ begin
if not admin then
execute 'grant usage, create on schema ai to ' || to_user;
execute 'grant select, insert, update, delete on table ai.vectorizer to ' || to_user;
execute 'grant select on ai._vectorizer_errors to ' || to_user;
execute 'grant select on ai.vectorizer_errors to ' || to_user;
execute 'grant select on ai.vectorizer_status to ' || to_user;
execute 'grant select, usage on sequence ai.vectorizer_id_seq to ' || to_user;
Expand All @@ -4231,6 +4300,7 @@ begin
execute 'grant all privileges on table ai.pgai_lib_version to ' || to_user;
execute 'grant all privileges on table ai.pgai_lib_feature_flag to ' || to_user;
execute 'grant all privileges on table ai.vectorizer to ' || to_user;
execute 'grant all privileges on table ai._vectorizer_errors to ' || to_user;
execute 'grant all privileges on table ai.vectorizer_errors to ' || to_user;
execute 'grant all privileges on table ai.vectorizer_status to ' || to_user;
execute 'grant all privileges on sequence ai.vectorizer_id_seq to ' || to_user;
Expand Down
17 changes: 15 additions & 2 deletions projects/pgai/pgai/vectorizer/features/features.py
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those are the changes introduced compared to #683

Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@ def __init__(
has_worker_tracking_table: bool,
has_loading_retries: bool,
has_reveal_secret_function: bool,
has_vectorizer_errors_view: bool,
) -> None:
self.has_disabled_column = has_disabled_column
self.has_worker_tracking_table = has_worker_tracking_table
self.has_loading_retries = has_loading_retries
self.has_reveal_secret_function = has_reveal_secret_function
self.has_vectorizer_errors_view = has_vectorizer_errors_view

@classmethod
def from_db(cls: type[Self], cur: psycopg.Cursor) -> Self:
Expand Down Expand Up @@ -62,20 +64,31 @@ def from_db(cls: type[Self], cur: psycopg.Cursor) -> Self:
cur.execute(query)
has_reveal_secret_function = cur.fetchone() is not None

# Newer versions of pgai lib have the ai.vectorizer_errors view.
# The table has been renamed to ai._vectorizer_errors
query = """
SELECT table_name
FROM information_schema.views
WHERE table_schema = 'ai' AND table_name = 'vectorizer_errors';
"""
cur.execute(query)
has_vectorizer_errors_view = cur.fetchone() is not None
Comment on lines +67 to +75
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would've probably checked if the _vectorizer_errors table exists and not if vectorizer_errors is a view. But I guess this works.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any benefit from that approach?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's just one step closer to what the vectorizer actually needs.
It doesn't really care about the view it only needs to know which table it should insert the errors into. But effectively your are checking this since the view and _ table are created in the same migration.


return cls(
has_disabled_column,
has_worker_tracking_table,
has_loading_retries,
has_reveal_secret_function,
has_vectorizer_errors_view,
)

@classmethod
def for_testing_latest_version(cls: type[Self]) -> Self:
return cls(True, True, True, True)
return cls(True, True, True, True, True)

@classmethod
def for_testing_no_features(cls: type[Self]) -> Self:
return cls(False, False, False, False)
return cls(False, False, False, False, False)

@cached_property
def disable_vectorizers(self) -> bool:
Expand Down
4 changes: 2 additions & 2 deletions projects/pgai/pgai/vectorizer/vectorizer.py
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those are the changes introduced compared to #683

Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
SourceRow: TypeAlias = dict[str, Any]

DEFAULT_CONCURRENCY = 1
DEFAULT_VECTORIZER_ERRORS_TABLE = "_vectorizer_errors"

VECTORIZER_FAILED = "vectorizer failed with unexpected error"

Expand Down Expand Up @@ -125,7 +126,6 @@ class Vectorizer(BaseModel):
source_pk (list[PkAtt]): List of primary key attributes from the source table.
errors_schema (str): The schema where the error log is saved. Default is "ai".
errors_table (str): The table where errors are logged.
Default is "vectorizer_errors".
"""

id: int
Expand All @@ -137,7 +137,7 @@ class Vectorizer(BaseModel):
source_pk: list[PkAtt]
queue_failed_table: str | None = None
errors_schema: str = "ai"
errors_table: str = "vectorizer_errors"
errors_table: str = DEFAULT_VECTORIZER_ERRORS_TABLE
schema_: str = Field(alias="schema", default="ai")
table: str = "vectorizer"

Expand Down
9 changes: 8 additions & 1 deletion projects/pgai/pgai/vectorizer/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from .. import __version__
from .embeddings import ApiKeyMixin
from .features import Features
from .vectorizer import Vectorizer
from .vectorizer import DEFAULT_VECTORIZER_ERRORS_TABLE, Vectorizer
from .worker_tracking import WorkerTracking

if sys.version_info >= (3, 11):
Expand Down Expand Up @@ -124,6 +124,13 @@ def _get_vectorizer(self, vectorizer_id: int, features: Features) -> Vectorizer:
vectorizer = row["vectorizer"]
embedding = vectorizer["config"]["embedding"]
vectorizer = Vectorizer.model_validate(vectorizer)

if (
vectorizer.errors_table == DEFAULT_VECTORIZER_ERRORS_TABLE
and not features.has_vectorizer_errors_view
):
vectorizer.errors_table = "vectorizer_errors"

# The Ollama API doesn't need a key, so `api_key_name` may be unset
if "api_key_name" in embedding:
api_key_name = embedding["api_key_name"]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
interactions:
- request:
body: '{"input": ["post_1", "post_2", "post_3"], "model": "intentionally-bad-embedding-model",
"dimensions": 1536, "encoding_format": "float"}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate, br, zstd
connection:
- keep-alive
content-length:
- '135'
content-type:
- application/json
host:
- api.openai.com
user-agent:
- AsyncOpenAI/Python 1.70.0
x-stainless-arch:
- arm64
x-stainless-async:
- async:asyncio
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.70.0
x-stainless-raw-response:
- stream
x-stainless-read-timeout:
- '600'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.10.15
method: POST
uri: https://api.openai.com/v1/embeddings
response:
body:
string: !!binary |
IbgDACCWTuUm6PQ4rrgJLvdamfPpEQSGbUltWUFbEFGGYZjAxe42Bq34IaX8BF8PAwDy8zzMJFA4
pmeHnyRA/+TRDc63ULlffb/moddte3GjHfed8c7lPnKYK4gh6/kTw4xr2KBU4Yp/YR2Q1x81d2O9
xrbc27GnnL2FuUiKHdxS2O8k0G9tuwPs4JDNkP2wyqj8TKj62McMAw==
headers:
CF-RAY:
- 93fa9285b965589a-BCN
Connection:
- keep-alive
Content-Encoding:
- br
Content-Type:
- application/json; charset=utf-8
Date:
- Wed, 14 May 2025 12:59:22 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
vary:
- Origin
x-request-id:
- req_c8879e03af8f9ecd9ac5178dbc89604d
status:
code: 404
message: Not Found
version: 1
27 changes: 27 additions & 0 deletions projects/pgai/tests/vectorizer/cli/test_compatibility.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,30 @@ def test_080_vectorizer_definition(
with conn.cursor() as cur:
cur.execute("SELECT * FROM blog_embedding_store;")
assert len(cur.fetchall()) == 4


@pytest.mark.postgres_params(ai_extension_version="0.8.0")
def test_errors_table_compatibility(
cli_db: tuple[TestDatabase, Connection], cli_db_url: str, vcr_: Any
):
conn = cli_db[1]
setup_source_table(conn, 3)

with vcr_.use_cassette("test_errors_table_compatibility.yaml"):
# Create vectorizer with intentionally bad embedding model to produce an error
with conn.cursor() as cur:
cur.execute("""
SELECT ai.create_vectorizer(
'blog'::regclass,
embedding =>
ai.embedding_openai('intentionally-bad-embedding-model', 1536),
chunking => ai.chunking_character_text_splitter('content'),
formatting => ai.formatting_python_template('$chunk')
);
""") # type: ignore
vectorizer_id = int(cur.fetchone()[0]) # type: ignore
run_vectorizer_worker(cli_db_url, vectorizer_id)

with conn.cursor() as cur:
cur.execute("SELECT * FROM ai.vectorizer_errors;")
assert len(cur.fetchall()) > 0
Loading