timescale · smoya · May 16, 2025 · May 14, 2025 · May 15, 2025 · smoya
@@ -4,6 +4,7 @@ begin
     if not admin then
         execute 'grant usage, create on schema ai to ' || to_user;
         execute 'grant select, insert, update, delete on table ai.vectorizer to ' || to_user;
+        execute 'grant select on ai._vectorizer_errors to ' || to_user;
         execute 'grant select on ai.vectorizer_errors to ' || to_user;
         execute 'grant select on ai.vectorizer_status to ' || to_user;
         execute 'grant select, usage on sequence ai.vectorizer_id_seq to ' || to_user;
@@ -13,6 +14,7 @@ begin
         execute 'grant all privileges on table ai.pgai_lib_version to ' || to_user;
         execute 'grant all privileges on table ai.pgai_lib_feature_flag to ' || to_user;
         execute 'grant all privileges on table ai.vectorizer to ' || to_user;
+        execute 'grant all privileges on table ai._vectorizer_errors to ' || to_user;
         execute 'grant all privileges on table ai.vectorizer_errors to ' || to_user;
         execute 'grant all privileges on table ai.vectorizer_status to ' || to_user;
         execute 'grant all privileges on sequence ai.vectorizer_id_seq to ' || to_user;

@@ -0,0 +1,42 @@
+-- rename the ai.vectorizer_errors table to ai._vectorizer_errors
+alter table ai.vectorizer_errors rename to _vectorizer_errors;
+
+-- rename the existing index on the ai.vectorizer_error so it follows the right naming convention (adds the _ prefix)
+-- this is not strictly necessary, but it is a good practice to keep the naming consistent
+alter index ai.vectorizer_errors_id_recorded_idx rename to _vectorizer_errors_id_recorded_idx;
+
+-- create a view including vectorizer name
+create or replace view ai.vectorizer_errors as
+select 
+  ve.*,
+  v.name
+from
+  ai._vectorizer_errors ve
+  left join ai.vectorizer v on ve.id = v.id;
+
+
+-- grant privileges on new ai.vectorizer_errors view
+do language plpgsql $block$
+declare
+    to_user text;
+    priv_type text;
+    with_grant text;
+    rec record;
+begin
+    -- find all users that have permissions on old ai.vectorizer_errors table and grant them to the view
+    for rec in
+        select distinct grantee as username, privilege_type, is_grantable
+        from information_schema.role_table_grants
+        where table_schema = 'ai'
+        and table_name = '_vectorizer_errors'
+    loop
+        to_user := rec.username;
+        priv_type := rec.privilege_type;
+        with_grant := '';
+        if rec.is_grantable then
+           with_grant := ' WITH GRANT OPTION';
+        end if;
+        execute format('GRANT %s ON ai.vectorizer_errors TO %I %s', priv_type, to_user, with_grant);
+    end loop;
+end
+$block$;
@@ -144,6 +144,20 @@ def test_named_vectorizer():
             vectorizer_name = cur.fetchone()[0]
             assert vectorizer_name == "website_blog_embedding1"
 
+            # Test fetch errors by vectorizer name
+            cur.execute(
+                "insert into ai._vectorizer_errors (id, message) values (%s, %s)",
+                (vectorizer_id_2, "test error message"),
+            )
+
+            cur.execute(
+                "select * from ai.vectorizer_errors where name = %s",
+                (vectorizer_name,),
+            )
+
+            error = cur.fetchone()
+            assert error.message == "test error message"
+
             # create a vectorizer with no name check default name
             cur.execute("""
                select ai.create_vectorizer

@@ -1103,6 +1103,74 @@ begin
 end;
 $outer_migration_block$;
 
+-------------------------------------------------------------------------------
+-- 030-add_vectorizer_errors_view.sql
+do $outer_migration_block$ /*030-add_vectorizer_errors_view.sql*/
+declare
+    _sql text;
+    _migration record;
+    _migration_name text = $migration_name$030-add_vectorizer_errors_view.sql$migration_name$;
+    _migration_body text =
+$migration_body$
+-- rename the ai.vectorizer_errors table to ai._vectorizer_errors
+alter table ai.vectorizer_errors rename to _vectorizer_errors;
+
+-- rename the existing index on the ai.vectorizer_error so it follows the right naming convention (adds the _ prefix)
+-- this is not strictly necessary, but it is a good practice to keep the naming consistent
+alter index ai.vectorizer_errors_id_recorded_idx rename to _vectorizer_errors_id_recorded_idx;
+
+-- create a view including vectorizer name
+create or replace view ai.vectorizer_errors as
+select 
+  ve.*,
+  v.name
+from
+  ai._vectorizer_errors ve
+  left join ai.vectorizer v on ve.id = v.id;
+
+
+-- grant privileges on new ai.vectorizer_errors view
+do language plpgsql $block$
+declare
+    to_user text;
+    priv_type text;
+    with_grant text;
+    rec record;
+begin
+    -- find all users that have permissions on old ai.vectorizer_errors table and grant them to the view
+    for rec in
+        select distinct grantee as username, privilege_type, is_grantable
+        from information_schema.role_table_grants
+        where table_schema = 'ai'
+        and table_name = '_vectorizer_errors'
+    loop
+        to_user := rec.username;
+        priv_type := rec.privilege_type;
+        with_grant := '';
+        if rec.is_grantable then
+           with_grant := ' WITH GRANT OPTION';
+        end if;
+        execute format('GRANT %s ON ai.vectorizer_errors TO %I %s', priv_type, to_user, with_grant);
+    end loop;
+end
+$block$;
+$migration_body$;
+begin
+    select * into _migration from ai.pgai_lib_migration where "name" operator(pg_catalog.=) _migration_name;
+    if _migration is not null then
+        raise notice 'migration %s already applied. skipping.', _migration_name;
+        if _migration.body operator(pg_catalog.!=) _migration_body then
+            raise warning 'the contents of migration "%s" have changed', _migration_name;
+        end if;
+        return;
+    end if;
+    _sql = pg_catalog.format(E'do /*%s*/ $migration_body$\nbegin\n%s\nend;\n$migration_body$;', _migration_name, _migration_body);
+    execute _sql;
+    insert into ai.pgai_lib_migration ("name", body, applied_at_version)
+    values (_migration_name, _migration_body, $version$__version__$version$);
+end;
+$outer_migration_block$;
+
 --------------------------------------------------------------------------------
 -- 001-chunking.sql
 
@@ -4222,6 +4290,7 @@ begin
     if not admin then
         execute 'grant usage, create on schema ai to ' || to_user;
         execute 'grant select, insert, update, delete on table ai.vectorizer to ' || to_user;
+        execute 'grant select on ai._vectorizer_errors to ' || to_user;
         execute 'grant select on ai.vectorizer_errors to ' || to_user;
         execute 'grant select on ai.vectorizer_status to ' || to_user;
         execute 'grant select, usage on sequence ai.vectorizer_id_seq to ' || to_user;
@@ -4231,6 +4300,7 @@ begin
         execute 'grant all privileges on table ai.pgai_lib_version to ' || to_user;
         execute 'grant all privileges on table ai.pgai_lib_feature_flag to ' || to_user;
         execute 'grant all privileges on table ai.vectorizer to ' || to_user;
+        execute 'grant all privileges on table ai._vectorizer_errors to ' || to_user;
         execute 'grant all privileges on table ai.vectorizer_errors to ' || to_user;
         execute 'grant all privileges on table ai.vectorizer_status to ' || to_user;
         execute 'grant all privileges on sequence ai.vectorizer_id_seq to ' || to_user;

@@ -15,11 +15,13 @@ def __init__(
         has_worker_tracking_table: bool,
         has_loading_retries: bool,
         has_reveal_secret_function: bool,
+        has_vectorizer_errors_view: bool,
     ) -> None:
         self.has_disabled_column = has_disabled_column
         self.has_worker_tracking_table = has_worker_tracking_table
         self.has_loading_retries = has_loading_retries
         self.has_reveal_secret_function = has_reveal_secret_function
+        self.has_vectorizer_errors_view = has_vectorizer_errors_view
 
     @classmethod
     def from_db(cls: type[Self], cur: psycopg.Cursor) -> Self:
@@ -62,20 +64,31 @@ def from_db(cls: type[Self], cur: psycopg.Cursor) -> Self:
         cur.execute(query)
         has_reveal_secret_function = cur.fetchone() is not None
 
+        # Newer versions of pgai lib have the ai.vectorizer_errors view.
+        # The table has been renamed to ai._vectorizer_errors
+        query = """
+        SELECT table_name
+        FROM information_schema.views
+        WHERE table_schema = 'ai' AND table_name = 'vectorizer_errors';
+        """
+        cur.execute(query)
+        has_vectorizer_errors_view = cur.fetchone() is not None
+
         return cls(
             has_disabled_column,
             has_worker_tracking_table,
             has_loading_retries,
             has_reveal_secret_function,
+            has_vectorizer_errors_view,
         )
 
     @classmethod
     def for_testing_latest_version(cls: type[Self]) -> Self:
-        return cls(True, True, True, True)
+        return cls(True, True, True, True, True)
 
     @classmethod
     def for_testing_no_features(cls: type[Self]) -> Self:
-        return cls(False, False, False, False)
+        return cls(False, False, False, False, False)
 
     @cached_property
     def disable_vectorizers(self) -> bool:

@@ -43,6 +43,7 @@
 SourceRow: TypeAlias = dict[str, Any]
 
 DEFAULT_CONCURRENCY = 1
+DEFAULT_VECTORIZER_ERRORS_TABLE = "_vectorizer_errors"
 
 VECTORIZER_FAILED = "vectorizer failed with unexpected error"
 
@@ -125,7 +126,6 @@ class Vectorizer(BaseModel):
         source_pk (list[PkAtt]): List of primary key attributes from the source table.
         errors_schema (str): The schema where the error log is saved. Default is "ai".
         errors_table (str): The table where errors are logged.
-            Default is "vectorizer_errors".
     """
 
     id: int
@@ -137,7 +137,7 @@ class Vectorizer(BaseModel):
     source_pk: list[PkAtt]
     queue_failed_table: str | None = None
     errors_schema: str = "ai"
-    errors_table: str = "vectorizer_errors"
+    errors_table: str = DEFAULT_VECTORIZER_ERRORS_TABLE
     schema_: str = Field(alias="schema", default="ai")
     table: str = "vectorizer"
 

@@ -15,7 +15,7 @@
 from .. import __version__
 from .embeddings import ApiKeyMixin
 from .features import Features
-from .vectorizer import Vectorizer
+from .vectorizer import DEFAULT_VECTORIZER_ERRORS_TABLE, Vectorizer
 from .worker_tracking import WorkerTracking
 
 if sys.version_info >= (3, 11):
@@ -124,6 +124,13 @@ def _get_vectorizer(self, vectorizer_id: int, features: Features) -> Vectorizer:
             vectorizer = row["vectorizer"]
             embedding = vectorizer["config"]["embedding"]
             vectorizer = Vectorizer.model_validate(vectorizer)
+
+            if (
+                vectorizer.errors_table == DEFAULT_VECTORIZER_ERRORS_TABLE
+                and not features.has_vectorizer_errors_view
+            ):
+                vectorizer.errors_table = "vectorizer_errors"
+
             # The Ollama API doesn't need a key, so `api_key_name` may be unset
             if "api_key_name" in embedding:
                 api_key_name = embedding["api_key_name"]

@@ -0,0 +1,78 @@
+interactions:
+- request:
+    body: '{"input": ["post_1", "post_2", "post_3"], "model": "intentionally-bad-embedding-model",
+      "dimensions": 1536, "encoding_format": "float"}'
+    headers:
+      accept:
+      - application/json
+      accept-encoding:
+      - gzip, deflate, br, zstd
+      connection:
+      - keep-alive
+      content-length:
+      - '135'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      user-agent:
+      - AsyncOpenAI/Python 1.70.0
+      x-stainless-arch:
+      - arm64
+      x-stainless-async:
+      - async:asyncio
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - MacOS
+      x-stainless-package-version:
+      - 1.70.0
+      x-stainless-raw-response:
+      - stream
+      x-stainless-read-timeout:
+      - '600'
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.10.15
+    method: POST
+    uri: https://api.openai.com/v1/embeddings
+  response:
+    body:
+      string: !!binary |
+        IbgDACCWTuUm6PQ4rrgJLvdamfPpEQSGbUltWUFbEFGGYZjAxe42Bq34IaX8BF8PAwDy8zzMJFA4
+        pmeHnyRA/+TRDc63ULlffb/moddte3GjHfed8c7lPnKYK4gh6/kTw4xr2KBU4Yp/YR2Q1x81d2O9
+        xrbc27GnnL2FuUiKHdxS2O8k0G9tuwPs4JDNkP2wyqj8TKj62McMAw==
+    headers:
+      CF-RAY:
+      - 93fa9285b965589a-BCN
+      Connection:
+      - keep-alive
+      Content-Encoding:
+      - br
+      Content-Type:
+      - application/json; charset=utf-8
+      Date:
+      - Wed, 14 May 2025 12:59:22 GMT
+      Server:
+      - cloudflare
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - nosniff
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Origin
+      x-request-id:
+      - req_c8879e03af8f9ecd9ac5178dbc89604d
+    status:
+      code: 404
+      message: Not Found
+version: 1
@@ -45,3 +45,30 @@ def test_080_vectorizer_definition(
         with conn.cursor() as cur:
             cur.execute("SELECT * FROM blog_embedding_store;")
             assert len(cur.fetchall()) == 4
+
+
+@pytest.mark.postgres_params(ai_extension_version="0.8.0")
+def test_errors_table_compatibility(
+    cli_db: tuple[TestDatabase, Connection], cli_db_url: str, vcr_: Any
+):
+    conn = cli_db[1]
+    setup_source_table(conn, 3)
+
+    with vcr_.use_cassette("test_errors_table_compatibility.yaml"):
+        # Create vectorizer with intentionally bad embedding model to produce an error
+        with conn.cursor() as cur:
+            cur.execute("""
+                SELECT ai.create_vectorizer(
+                'blog'::regclass,
+                embedding =>
+                    ai.embedding_openai('intentionally-bad-embedding-model', 1536),
+                chunking => ai.chunking_character_text_splitter('content'),
+                formatting => ai.formatting_python_template('$chunk')
+                );
+            """)  # type: ignore
+            vectorizer_id = int(cur.fetchone()[0])  # type: ignore
+        run_vectorizer_worker(cli_db_url, vectorizer_id)
+
+        with conn.cursor() as cur:
+            cur.execute("SELECT * FROM ai.vectorizer_errors;")
+            assert len(cur.fetchall()) > 0