Skip to content

Commit cbb688f

Browse files
committed
feat: add ai.set_scheduling function to change scheduling
The ai.set_scheduling function allows users to change the scheduling configuration of their vectorizers. This is important when moving from a self-hosted deployment using ai.scheduling_none to TigerData Cloud with ai.scheduling_timescaledb.
1 parent 30e06eb commit cbb688f

File tree

3 files changed

+311
-9
lines changed

3 files changed

+311
-9
lines changed

projects/pgai/db/sql/idempotent/012-vectorizer-api.sql

Lines changed: 73 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ begin
5151
if embedding is null then
5252
raise exception 'embedding configuration is required';
5353
end if;
54-
54+
5555
if loading is null then
5656
raise exception 'loading configuration is required';
5757
end if;
@@ -173,7 +173,7 @@ begin
173173
raise notice 'a vectorizer named % already exists, skipping', name;
174174
return _existing_vectorizer_id;
175175
end if;
176-
176+
177177
-- validate the destination can create objects after the if_not_exists check
178178
perform ai._validate_destination_can_create_objects(destination);
179179

@@ -751,4 +751,74 @@ as $func$
751751
where v.name operator(pg_catalog.=) vectorizer_embed.name
752752
;
753753
$func$ language sql stable security invoker
754-
set search_path to pg_catalog, pg_temp;
754+
set search_path to pg_catalog, pg_temp;
755+
756+
757+
-------------------------------------------------------------------------------
-- set_scheduling
-- Changes the scheduling (and optionally indexing) configuration of an
-- existing vectorizer. Deletes the vectorizer's current timescaledb job (if
-- any), schedules a new one when the new scheduling config requires it, and
-- persists the merged config. Returns the vectorizer's full updated config.
-- Raises if the vectorizer does not exist, if either config fails validation,
-- or if automatic indexing is requested without scheduling.
create or replace function ai.set_scheduling
( vectorizer_id pg_catalog.int4
, scheduling pg_catalog.jsonb default ai.scheduling_default()
, indexing pg_catalog.jsonb default ai.indexing_default()
) returns pg_catalog.jsonb
as $func$
declare
    _job_id pg_catalog.int8;
    _updated_config pg_catalog.jsonb;
begin
    -- fail fast: otherwise a bogus id would still schedule a timescaledb job
    -- that is orphaned from any vectorizer, and the function would return null
    if not exists
    (
        select 1
        from ai.vectorizer v
        where v.id operator(pg_catalog.=) vectorizer_id
    ) then
        raise exception 'vectorizer with id % does not exist', vectorizer_id;
    end if;

    -- if ai.indexing_default, resolve the default
    if indexing operator(pg_catalog.->>) 'implementation' = 'default' then
        indexing = ai._resolve_indexing_default();
    end if;

    -- validate the indexing config
    perform ai._validate_indexing(indexing);

    -- if ai.scheduling_default, resolve the default
    if scheduling operator(pg_catalog.->>) 'implementation' = 'default' then
        scheduling = ai._resolve_scheduling_default();
    end if;

    -- validate the scheduling config
    perform ai._validate_scheduling(scheduling);

    -- if scheduling is none then indexing must also be none
    if scheduling operator(pg_catalog.->>) 'implementation' = 'none'
    and indexing operator(pg_catalog.->>) 'implementation' != 'none' then
        raise exception 'automatic indexing is not supported without scheduling. set indexing=>ai.indexing_none() when scheduling=>ai.scheduling_none()';
    end if;

    -- delete the current timescaledb job if one exists
    perform public.delete_job(job_id::pg_catalog.int4)
    from
    (
        select config operator(pg_catalog.#>>) array['scheduling', 'job_id'] as job_id
        from ai.vectorizer
        where id operator(pg_catalog.=) vectorizer_id
    ) c
    where job_id is not null;

    -- schedule the async ext job
    select ai._vectorizer_schedule_job
    ( vectorizer_id
    , scheduling
    ) into _job_id
    ;
    -- record the new job id in the scheduling config (null when scheduling is none)
    if _job_id is not null then
        scheduling = pg_catalog.jsonb_insert(scheduling, array['job_id'], pg_catalog.to_jsonb(_job_id));
    end if;

    update ai.vectorizer
    set config = config operator(pg_catalog.||) pg_catalog.jsonb_build_object
    ( 'scheduling'
    , scheduling
    , 'indexing'
    , indexing
    )
    where id operator(pg_catalog.=) vectorizer_id
    returning config into _updated_config;

    return _updated_config;
end
$func$ language plpgsql volatile security invoker
set search_path to pg_catalog, pg_temp
;

projects/pgai/db/tests/vectorizer/test_vectorizer.py

Lines changed: 165 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1115,7 +1115,7 @@ def index_creation_tester(cur: psycopg.Cursor, vectorizer_id: int) -> None:
11151115

11161116
# insert 5 rows into the target
11171117
cur.execute(f"""
1118-
insert into {vectorizer.config['destination']['target_schema']}.{vectorizer.config['destination']['target_table']}
1118+
insert into {vectorizer.config["destination"]["target_schema"]}.{vectorizer.config["destination"]["target_table"]}
11191119
( embedding_uuid
11201120
, id
11211121
, chunk_seq
@@ -1151,7 +1151,7 @@ def index_creation_tester(cur: psycopg.Cursor, vectorizer_id: int) -> None:
11511151

11521152
# insert 5 rows into the target
11531153
cur.execute(f"""
1154-
insert into {vectorizer.config['destination']['target_schema']}.{vectorizer.config['destination']['target_table']}
1154+
insert into {vectorizer.config["destination"]["target_schema"]}.{vectorizer.config["destination"]["target_table"]}
11551155
( embedding_uuid
11561156
, id
11571157
, chunk_seq
@@ -1416,7 +1416,7 @@ def test_index_create_concurrency():
14161416

14171417
# insert 10 rows into the target
14181418
cur.execute(f"""
1419-
insert into {vectorizer.config['destination']['target_schema']}.{vectorizer.config['destination']['target_table']}
1419+
insert into {vectorizer.config["destination"]["target_schema"]}.{vectorizer.config["destination"]["target_table"]}
14201420
( embedding_uuid
14211421
, id
14221422
, chunk_seq
@@ -1808,7 +1808,7 @@ def test_grant_to_public():
18081808
cur.execute(f"""
18091809
select has_table_privilege
18101810
( 'public'
1811-
, '{vectorizer.config['destination']['target_schema']}.{vectorizer.config['destination']['target_table']}'
1811+
, '{vectorizer.config["destination"]["target_schema"]}.{vectorizer.config["destination"]["target_table"]}'
18121812
, 'select'
18131813
)""")
18141814
assert cur.fetchone()[0]
@@ -2222,3 +2222,164 @@ def test_install_library_before_ai_extension():
22222222
with psycopg.connect(db_url("test")) as con:
22232223
with con.cursor() as cur:
22242224
cur.execute("create extension ai cascade")
2225+
2226+
2227+
@pytest.mark.skipif(
    os.getenv("PG_MAJOR") == "15", reason="extension does not support pg15"
)
def test_set_scheduling():
    """Integration test for ai.set_scheduling.

    Creates a vectorizer with a timescaledb schedule, then calls
    ai.set_scheduling to change the schedule interval and indexing config,
    and verifies that the old timescaledb job is deleted, a new job is
    created, and the vectorizer's stored config reflects the new settings.
    Requires a live PostgreSQL with the ai and timescaledb extensions.
    """
    with psycopg.connect(db_url("test")) as con:
        with con.cursor() as cur:
            cur.execute("create extension ai cascade")

    # create the test roles used for grant_to checks (idempotent)
    with psycopg.connect(
        db_url("postgres"), autocommit=True, row_factory=namedtuple_row
    ) as con:
        with con.cursor() as cur:
            cur.execute("create extension if not exists timescaledb")
            cur.execute("select to_regrole('bob') is null")
            if cur.fetchone()[0] is True:
                cur.execute("create user bob")
            cur.execute("select to_regrole('adelaide') is null")
            if cur.fetchone()[0] is True:
                cur.execute("create user adelaide")
    with psycopg.connect(
        db_url("test"), autocommit=True, row_factory=namedtuple_row
    ) as con:
        con.add_notice_handler(detailed_notice_handler)
        with con.cursor() as cur:
            # set up a fresh source schema/table with a few rows
            cur.execute("drop schema if exists website cascade")
            cur.execute("create schema website")
            cur.execute("drop table if exists website.blog")
            cur.execute("""
                create table website.blog
                ( id int not null generated always as identity
                , title text not null
                , published timestamptz
                , body text not null
                , drop_me text
                , primary key (title, published)
                )
            """)
            cur.execute(
                """grant select, insert, update, delete on website.blog to bob, adelaide"""
            )
            cur.execute("""grant usage on schema website to adelaide""")
            cur.execute("""
                insert into website.blog(title, published, body)
                values
                  ('how to cook a hot dog', '2024-01-06'::timestamptz, 'put it on a hot grill')
                , ('how to make a sandwich', '2023-01-06'::timestamptz, 'put a slice of meat between two pieces of bread')
                , ('how to make stir fry', '2022-01-06'::timestamptz, 'pick up the phone and order takeout')
            """)

            # drop the drop_me column
            cur.execute("alter table website.blog drop column drop_me")

            # create a vectorizer for the blog table
            # language=PostgreSQL
            cur.execute("""
            select ai.create_vectorizer
            ( 'website.blog'::regclass
            , loading => ai.loading_column('body')
            , embedding=>ai.embedding_openai('text-embedding-3-small', 768)
            , chunking=>ai.chunking_character_text_splitter(128, 10)
            , formatting=>ai.formatting_python_template('title: $title published: $published $chunk')
            , scheduling=>ai.scheduling_timescaledb
                    ( interval '5m'
                    , initial_start=>'2050-01-06'::timestamptz
                    , timezone=>'America/Chicago'
                    )
            , grant_to=>ai.grant_to('bob', 'fernando') -- bob is good. fernando doesn't exist. don't grant to adelaide
            );
            """)
            vectorizer_id = cur.fetchone()[0]

            # check the vectorizer that was created
            cur.execute(
                """
                select jsonb_pretty(to_jsonb(x) #- array['config', 'version'])
                from ai.vectorizer x
                where x.id = %s
                """,
                (vectorizer_id,),
            )
            actual = json.dumps(json.loads(cur.fetchone()[0]), sort_keys=True, indent=2)
            expected = json.dumps(json.loads(VECTORIZER_ROW), sort_keys=True, indent=2)
            assert actual == expected

            # get timescaledb job's job_id
            cur.execute(
                """
                select (x.config->'scheduling'->>'job_id')::int
                from ai.vectorizer x
                where x.id = %s
                """,
                (vectorizer_id,),
            )
            current_job_id = cur.fetchone()[0]

            # check the timescaledb job that was created
            cur.execute(
                """
                select j.schedule_interval = interval '5m'
                and j.proc_schema = 'ai'
                and j.proc_name = '_vectorizer_job'
                and j.scheduled = true
                and j.fixed_schedule = true
                as is_ok
                from timescaledb_information.jobs j
                where j.job_id = %s
                """,
                (current_job_id,),
            )
            actual = cur.fetchone()[0]
            assert actual is True

            # change the schedule interval (5m -> 30m) and indexing via ai.set_scheduling
            cur.execute(
                """
                select ai.set_scheduling
                ( %s
                , scheduling=>ai.scheduling_timescaledb
                        ( interval '30m'
                        , initial_start=>'2050-01-06'::timestamptz
                        , timezone=>'America/Chicago'
                        )
                , indexing=>ai.indexing_hnsw()
                )
                """,
                (vectorizer_id,),
            )

            # check the timescaledb old job that was deleted
            cur.execute(
                "select exists (select from timescaledb_information.jobs j where j.job_id = %s)",
                (current_job_id,),
            )
            exists = cur.fetchone()[0]
            assert not exists

            # the stored config must reflect the new schedule and indexing,
            # and carry a new (different) timescaledb job id
            cur.execute(
                "select config from ai.vectorizer where id = %s", (vectorizer_id,)
            )
            config = cur.fetchone()[0]
            assert config["scheduling"]["schedule_interval"] == "00:30:00"
            assert config["indexing"]["implementation"] == "hnsw"
            job_id = config["scheduling"]["job_id"]
            assert job_id != current_job_id

            # the replacement timescaledb job must exist with the new interval
            cur.execute(
                """
                select j.schedule_interval = interval '30m'
                and j.proc_schema = 'ai'
                and j.proc_name = '_vectorizer_job'
                and j.scheduled = true
                and j.fixed_schedule = true
                as is_ok
                from timescaledb_information.jobs j
                where j.job_id = %s
                """,
                (job_id,),
            )
            actual = cur.fetchone()[0]
            assert actual is True

projects/pgai/pgai/data/ai.sql

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3603,7 +3603,7 @@ begin
36033603
if embedding is null then
36043604
raise exception 'embedding configuration is required';
36053605
end if;
3606-
3606+
36073607
if loading is null then
36083608
raise exception 'loading configuration is required';
36093609
end if;
@@ -3725,7 +3725,7 @@ begin
37253725
raise notice 'a vectorizer named % already exists, skipping', name;
37263726
return _existing_vectorizer_id;
37273727
end if;
3728-
3728+
37293729
-- validate the destination can create objects after the if_not_exists check
37303730
perform ai._validate_destination_can_create_objects(destination);
37313731

@@ -4305,6 +4305,77 @@ as $func$
43054305
$func$ language sql stable security invoker
43064306
set search_path to pg_catalog, pg_temp;
43074307

4308+
4309+
-------------------------------------------------------------------------------
-- set_scheduling
-- Changes the scheduling (and optionally indexing) configuration of an
-- existing vectorizer. Deletes the vectorizer's current timescaledb job (if
-- any), schedules a new one when the new scheduling config requires it, and
-- persists the merged config. Returns the vectorizer's full updated config.
-- Raises if the vectorizer does not exist, if either config fails validation,
-- or if automatic indexing is requested without scheduling.
create or replace function ai.set_scheduling
( vectorizer_id pg_catalog.int4
, scheduling pg_catalog.jsonb default ai.scheduling_default()
, indexing pg_catalog.jsonb default ai.indexing_default()
) returns pg_catalog.jsonb
as $func$
declare
    _job_id pg_catalog.int8;
    _updated_config pg_catalog.jsonb;
begin
    -- fail fast: otherwise a bogus id would still schedule a timescaledb job
    -- that is orphaned from any vectorizer, and the function would return null
    if not exists
    (
        select 1
        from ai.vectorizer v
        where v.id operator(pg_catalog.=) vectorizer_id
    ) then
        raise exception 'vectorizer with id % does not exist', vectorizer_id;
    end if;

    -- if ai.indexing_default, resolve the default
    if indexing operator(pg_catalog.->>) 'implementation' = 'default' then
        indexing = ai._resolve_indexing_default();
    end if;

    -- validate the indexing config
    perform ai._validate_indexing(indexing);

    -- if ai.scheduling_default, resolve the default
    if scheduling operator(pg_catalog.->>) 'implementation' = 'default' then
        scheduling = ai._resolve_scheduling_default();
    end if;

    -- validate the scheduling config
    perform ai._validate_scheduling(scheduling);

    -- if scheduling is none then indexing must also be none
    if scheduling operator(pg_catalog.->>) 'implementation' = 'none'
    and indexing operator(pg_catalog.->>) 'implementation' != 'none' then
        raise exception 'automatic indexing is not supported without scheduling. set indexing=>ai.indexing_none() when scheduling=>ai.scheduling_none()';
    end if;

    -- delete the current timescaledb job if one exists
    perform public.delete_job(job_id::pg_catalog.int4)
    from
    (
        select config operator(pg_catalog.#>>) array['scheduling', 'job_id'] as job_id
        from ai.vectorizer
        where id operator(pg_catalog.=) vectorizer_id
    ) c
    where job_id is not null;

    -- schedule the async ext job
    select ai._vectorizer_schedule_job
    ( vectorizer_id
    , scheduling
    ) into _job_id
    ;
    -- record the new job id in the scheduling config (null when scheduling is none)
    if _job_id is not null then
        scheduling = pg_catalog.jsonb_insert(scheduling, array['job_id'], pg_catalog.to_jsonb(_job_id));
    end if;

    update ai.vectorizer
    set config = config operator(pg_catalog.||) pg_catalog.jsonb_build_object
    ( 'scheduling'
    , scheduling
    , 'indexing'
    , indexing
    )
    where id operator(pg_catalog.=) vectorizer_id
    returning config into _updated_config;

    return _updated_config;
end
$func$ language plpgsql volatile security invoker
set search_path to pg_catalog, pg_temp
;
4377+
4378+
43084379
--------------------------------------------------------------------------------
43094380
-- 013-worker-tracking.sql
43104381
CREATE OR REPLACE FUNCTION ai._worker_start(version text, expected_heartbeat_interval interval) RETURNS uuid AS $$

0 commit comments

Comments
 (0)