From eaa47cf332be30de63bfa3c0e9b933230747c7fd Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Fri, 27 May 2022 00:12:53 +0100 Subject: [PATCH 1/7] Remove redundant references to `event_edges.room_id` We don't need to care about the room_id here, because we are already checking the event id. --- .../databases/main/event_federation.py | 6 ++-- .../storage/databases/main/events_worker.py | 33 +++++++++---------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py index 562dcbe94d3a..bd2dde2f7c4a 100644 --- a/synapse/storage/databases/main/event_federation.py +++ b/synapse/storage/databases/main/event_federation.py @@ -1318,16 +1318,14 @@ def _get_missing_events( query = ( "SELECT prev_event_id FROM event_edges " - "WHERE room_id = ? AND event_id = ? AND is_state = ? " + "WHERE event_id = ? AND is_state = ? " "LIMIT ?" ) while front and len(event_results) < limit: new_front = set() for event_id in front: - txn.execute( - query, (room_id, event_id, False, limit - len(event_results)) - ) + txn.execute(query, (event_id, False, limit - len(event_results))) new_results = {t[0] for t in txn} - seen_events diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py index a97d7e166444..b99b10778490 100644 --- a/synapse/storage/databases/main/events_worker.py +++ b/synapse/storage/databases/main/events_worker.py @@ -1928,6 +1928,18 @@ def is_event_next_to_gap_txn(txn: LoggingTransaction) -> bool: LIMIT 1 """ + # We consider any forward extremity as the latest in the room and + # not a forward gap. + # + # To expand, even though there is technically a gap at the front of + # the room where the forward extremities are, we consider those the + # latest messages in the room so asking other homeservers for more + # is useless. The new latest messages will just be federated as + # usual. + txn.execute(forward_extremity_query, (event.room_id, event.event_id)) + if txn.fetchone(): + return False + # Check to see whether the event in question is already referenced # by another event. If we don't see any edges, we're next to a # forward gap. @@ -1936,8 +1948,7 @@ def is_event_next_to_gap_txn(txn: LoggingTransaction) -> bool: /* Check to make sure the event referencing our event in question is not rejected */ LEFT JOIN rejections ON event_edges.event_id = rejections.event_id WHERE - event_edges.room_id = ? - AND event_edges.prev_event_id = ? + event_edges.prev_event_id = ? /* It's not a valid edge if the event referencing our event in * question is rejected. */ @@ -1945,25 +1956,11 @@ def is_event_next_to_gap_txn(txn: LoggingTransaction) -> bool: LIMIT 1 """ - # We consider any forward extremity as the latest in the room and - # not a forward gap. - # - # To expand, even though there is technically a gap at the front of - # the room where the forward extremities are, we consider those the - # latest messages in the room so asking other homeservers for more - # is useless. The new latest messages will just be federated as - # usual. - txn.execute(forward_extremity_query, (event.room_id, event.event_id)) - forward_extremities = txn.fetchall() - if len(forward_extremities): - return False - # If there are no forward edges to the event in question (another # event hasn't referenced this event in their prev_events), then we # assume there is a forward gap in the history. - txn.execute(forward_edge_query, (event.room_id, event.event_id)) - forward_edges = txn.fetchall() - if not len(forward_edges): + txn.execute(forward_edge_query, (event.event_id,)) + if not txn.fetchone(): return True return False From 4bc172f652abb3e4a55263b9c2a21d6783c64159 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Thu, 26 May 2022 23:45:36 +0100 Subject: [PATCH 2/7] Clean up the event_edges table We make a number of changes to `event_edges`: * We give the `room_id` and `is_state` columns defaults (null and false respectively) so that we can stop populating them. * We drop any rows that have `is_state` set true - they should no longer exist. * We drop any rows that do not exist in `events` - these should not exist either. * We drop the old unique constraint on all the colums, which wasn't much use. * We create a new unique index on `(event_id, prev_event_id)`. * We add a foreign key constraint to `events`. These happen rather differently depending on whether we are on Postgres or SQLite. For SQLite, we just rebuild the whole table, copying only the rows we want to keep. For Postgres, we try to do things in the background as much as possible. --- .../databases/main/events_bg_updates.py | 111 +++++++++++++++++- .../storage/databases/main/purge_events.py | 2 +- synapse/storage/schema/__init__.py | 5 +- .../71/01rebuild_event_edges.sql.postgres | 43 +++++++ .../delta/71/01rebuild_event_edges.sql.sqlite | 51 ++++++++ 5 files changed, 209 insertions(+), 3 deletions(-) create mode 100644 synapse/storage/schema/main/delta/71/01rebuild_event_edges.sql.postgres create mode 100644 synapse/storage/schema/main/delta/71/01rebuild_event_edges.sql.sqlite diff --git a/synapse/storage/databases/main/events_bg_updates.py b/synapse/storage/databases/main/events_bg_updates.py index d5f005966597..56208ce1f6cd 100644 --- a/synapse/storage/databases/main/events_bg_updates.py +++ b/synapse/storage/databases/main/events_bg_updates.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 The Matrix.org Foundation C.I.C. +# Copyright 2019-2022 The Matrix.org Foundation C.I.C. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -64,6 +64,9 @@ class _BackgroundUpdates: INDEX_STREAM_ORDERING2_TS = "index_stream_ordering2_ts" REPLACE_STREAM_ORDERING_COLUMN = "replace_stream_ordering_column" + EVENT_EDGES_DROP_INVALID_ROWS = "event_edges_drop_invalid_rows" + EVENT_EDGES_REPLACE_INDEX = "event_edges_replace_index" + @attr.s(slots=True, frozen=True, auto_attribs=True) class _CalculateChainCover: @@ -240,6 +243,21 @@ def __init__( ################################################################################ + self.db_pool.updates.register_background_update_handler( + _BackgroundUpdates.EVENT_EDGES_DROP_INVALID_ROWS, + self._background_drop_invalid_event_edges_rows, + ) + + self.db_pool.updates.register_background_index_update( + _BackgroundUpdates.EVENT_EDGES_REPLACE_INDEX, + index_name="event_edges_event_id_prev_event_id_idx", + table="event_edges", + columns=["event_id", "prev_event_id"], + unique=True, + # the old index which just covered event_id is now redundant. + replaces_index="ev_edges_id", + ) + async def _background_reindex_fields_sender( self, progress: JsonDict, batch_size: int ) -> int: @@ -1290,3 +1308,94 @@ def process(txn: Cursor) -> None: ) return 0 + + async def _background_drop_invalid_event_edges_rows( + self, progress: JsonDict, batch_size: int + ) -> int: + """Drop invalid rows from event_edges + + This only runs for postgres. For SQLite, it all happens synchronously. + + Firstly, drop any rows with is_state=True. These may have been added a long time + ago, but they are no longer used. + + We also drop rows that do not correspond to entries in `events`, and add a + foreign key. + """ + + last_event_id = progress.get("last_event_id", "") + + def drop_invalid_event_edges_txn(txn: LoggingTransaction) -> bool: + """Returns True if we're done.""" + + # first we need to find an endpoint. + txn.execute( + """ + SELECT event_id FROM event_edges + WHERE event_id > ? + ORDER BY event_id + LIMIT 1 OFFSET ? + """, + (last_event_id, batch_size), + ) + + endpoint = None + row = txn.fetchone() + + if row: + endpoint = row[0] + + where_clause = "ee.event_id > ?" + args = [last_event_id] + if endpoint: + where_clause += " AND ee.event_id <= ?" + args.append(endpoint) + + # now delete any that: + # - have is_state=TRUE, or + # - do not correspond to a row in `events` + txn.execute( + f""" + DELETE FROM event_edges + WHERE event_id IN ( + SELECT ee.event_id + FROM event_edges ee + LEFT JOIN events ev USING (event_id) + WHERE ({where_clause}) AND + (is_state OR ev.event_id IS NULL) + )""", + args, + ) + + logger.info( + "cleaned up event_edges up to %s: removed %i/%i rows", + endpoint, + txn.rowcount, + batch_size, + ) + + if endpoint is not None: + self.db_pool.updates._background_update_progress_txn( + txn, + _BackgroundUpdates.EVENT_EDGES_DROP_INVALID_ROWS, + {"last_event_id": endpoint}, + ) + return False + + # if that was the final batch, we validate the foreign key. + logger.info("cleaned up event_edges; enabling foreign key") + txn.execute( + "ALTER TABLE event_edges VALIDATE CONSTRAINT event_edges_event_id_fkey" + ) + return True + + done = await self.db_pool.runInteraction( + desc="drop_invalid_event_edges", func=drop_invalid_event_edges_txn + ) + + if done: + await self.db_pool.updates._end_background_update( + _BackgroundUpdates.EVENT_EDGES_DROP_INVALID_ROWS + ) + + return batch_size diff --git a/synapse/storage/databases/main/purge_events.py b/synapse/storage/databases/main/purge_events.py index 2353c120e9cc..9969c2a21023 100644 --- a/synapse/storage/databases/main/purge_events.py +++ b/synapse/storage/databases/main/purge_events.py @@ -214,10 +214,10 @@ def _purge_history_txn( # Delete all remote non-state events for table in ( + "event_edges", "events", "event_json", "event_auth", - "event_edges", "event_forward_extremities", "event_relations", "event_search", diff --git a/synapse/storage/schema/__init__.py b/synapse/storage/schema/__init__.py index da98f05e0348..aa157e8015c7 100644 --- a/synapse/storage/schema/__init__.py +++ b/synapse/storage/schema/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -SCHEMA_VERSION = 70 # remember to update the list below when updating +SCHEMA_VERSION = 71 # remember to update the list below when updating """Represents the expectations made by the codebase about the database schema This should be incremented whenever the codebase changes its requirements on the @@ -67,6 +67,9 @@ Changes in SCHEMA_VERSION = 70: - event_reference_hashes is no longer written to. + +Changes in SCHEMA_VERSION = 71: + - event_edges.(room_id, is_state) are no longer written to. """ diff --git a/synapse/storage/schema/main/delta/71/01rebuild_event_edges.sql.postgres b/synapse/storage/schema/main/delta/71/01rebuild_event_edges.sql.postgres new file mode 100644 index 000000000000..f32f445858f1 --- /dev/null +++ b/synapse/storage/schema/main/delta/71/01rebuild_event_edges.sql.postgres @@ -0,0 +1,43 @@ +/* Copyright 2022 The Matrix.org Foundation C.I.C + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +-- We're going to stop populating event_edges.room_id and event_edges.is_state, +-- which means we now need to give them defaults. + +-- We also drop the exising unique constraint which spans all four columns. Franky +-- it's not doing much, and there are other indexes on event_id and prev_event_id. +-- Later on we introduce a proper unique constraint on (event_id, prev_event_id). +-- +-- We also add a foreign key constraint (which will be enforced for new rows), but +-- don't yet validate it for existing rows (since that's slow, and we haven't yet +-- checked that all the rows are valid) + +ALTER TABLE event_edges + ALTER room_id DROP NOT NULL, + ALTER is_state SET DEFAULT FALSE, + DROP CONSTRAINT IF EXISTS event_edges_event_id_prev_event_id_room_id_is_state_key, + ADD CONSTRAINT event_edges_event_id_fkey FOREIGN KEY (event_id) REFERENCES events(event_id) NOT VALID; + +-- In the background, we drop any rows with is_state=True. These may have been +-- added a long time ago, but they are no longer used. +-- +-- We also drop rows that do not correspond to entries in `events`, and finally +-- validate the foreign key. +INSERT INTO background_updates (ordering, update_name, progress_json) VALUES + (7101, 'event_edges_drop_invalid_rows', '{}'); + +-- We'll then create a new unique index on (event_id, prev_event_id). +INSERT INTO background_updates (ordering, update_name, progress_json, depends_on) VALUES + (7101, 'event_edges_replace_index', '{}', 'event_edges_drop_invalid_rows'); diff --git a/synapse/storage/schema/main/delta/71/01rebuild_event_edges.sql.sqlite b/synapse/storage/schema/main/delta/71/01rebuild_event_edges.sql.sqlite new file mode 100644 index 000000000000..601d0e524b07 --- /dev/null +++ b/synapse/storage/schema/main/delta/71/01rebuild_event_edges.sql.sqlite @@ -0,0 +1,51 @@ +/* Copyright 2022 The Matrix.org Foundation C.I.C + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +-- We're going to stop populating event_edges.room_id and event_edges.is_state, +-- which means we now need to give them defaults. +-- +-- We also take the opportunity to: +-- - drop any rows with is_state=True (these were populated a long time ago, but +-- are no longer used.) +-- - drop any rows which do not correspond to entries in `events` +-- - tighten the unique index so that it applies just to (event_id, prev_event_id) +-- - drop the "ev_edges_id" index, which is redundant to the above. +-- - add a foreign key constraint from event_id to `events` + +CREATE TABLE new_event_edges ( + event_id TEXT NOT NULL, + prev_event_id TEXT NOT NULL, + room_id TEXT NULL, + is_state BOOL NOT NULL DEFAULT 0, + FOREIGN KEY(event_id) REFERENCES events(event_id) +); + +INSERT INTO new_event_edges + SELECT ee.event_id, ee.prev_event_id, ee.room_id, ee.is_state + FROM event_edges ee JOIN events ev USING (event_id) + WHERE NOT ee.is_state; + +DROP TABLE event_edges; + +ALTER TABLE new_event_edges RENAME TO event_edges; + +CREATE UNIQUE INDEX event_edges_event_id_prev_event_id_idx + ON event_edges (event_id, prev_event_id); + +CREATE INDEX ev_edges_prev_id ON event_edges (prev_event_id); + + + + From 9c70e8833deff7e10fd116acea4897a931351391 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Fri, 27 May 2022 00:13:38 +0100 Subject: [PATCH 3/7] Stop populateing `event_edges.room_id` and `is_state` We can just rely on the defaults. --- synapse/storage/databases/main/events.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index 17e35cf63e68..f48e29f913d0 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -2296,11 +2296,9 @@ def _handle_mult_prev_events( self.db_pool.simple_insert_many_txn( txn, table="event_edges", - keys=("event_id", "prev_event_id", "room_id", "is_state"), + keys=("event_id", "prev_event_id"), values=[ - (ev.event_id, e_id, ev.room_id, False) - for ev in events - for e_id in ev.prev_event_ids() + (ev.event_id, e_id) for ev in events for e_id in ev.prev_event_ids() ], ) From d885adcc019bcb3097b02e5cd86cf5821b99fdfe Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Fri, 27 May 2022 00:21:49 +0100 Subject: [PATCH 4/7] changelog --- changelog.d/12893.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/12893.misc diff --git a/changelog.d/12893.misc b/changelog.d/12893.misc new file mode 100644 index 000000000000..5705210303a8 --- /dev/null +++ b/changelog.d/12893.misc @@ -0,0 +1 @@ +Simplify the database schema for `event_edges`. From aa3a170d23e42fc9fd94c71195b4f9904496907e Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 30 May 2022 11:40:13 +0100 Subject: [PATCH 5/7] Add a comment --- synapse/storage/databases/main/events_bg_updates.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/synapse/storage/databases/main/events_bg_updates.py b/synapse/storage/databases/main/events_bg_updates.py index 56208ce1f6cd..a5ee6dce184e 100644 --- a/synapse/storage/databases/main/events_bg_updates.py +++ b/synapse/storage/databases/main/events_bg_updates.py @@ -1383,6 +1383,11 @@ def drop_invalid_event_edges_txn(txn: LoggingTransaction) -> bool: return False # if that was the final batch, we validate the foreign key. + # + # The constraint should have been in place and enforced for new rows since + # before we started deleting invalid rows, so there's no chance for any + # invalid rows to have snuck in the meantime. In other words, this really + # ought to succeed. logger.info("cleaned up event_edges; enabling foreign key") txn.execute( "ALTER TABLE event_edges VALIDATE CONSTRAINT event_edges_event_id_fkey" From b3013853815c3fbba38de014edc189fc47bf25c1 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 30 May 2022 11:41:19 +0100 Subject: [PATCH 6/7] remove spurious newlines --- .../schema/main/delta/71/01rebuild_event_edges.sql.sqlite | 4 ---- 1 file changed, 4 deletions(-) diff --git a/synapse/storage/schema/main/delta/71/01rebuild_event_edges.sql.sqlite b/synapse/storage/schema/main/delta/71/01rebuild_event_edges.sql.sqlite index 601d0e524b07..0bb86edd2ab4 100644 --- a/synapse/storage/schema/main/delta/71/01rebuild_event_edges.sql.sqlite +++ b/synapse/storage/schema/main/delta/71/01rebuild_event_edges.sql.sqlite @@ -45,7 +45,3 @@ CREATE UNIQUE INDEX event_edges_event_id_prev_event_id_idx ON event_edges (event_id, prev_event_id); CREATE INDEX ev_edges_prev_id ON event_edges (prev_event_id); - - - - From c074c498637648cdea1ad1a81aa3273dd63680df Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Wed, 1 Jun 2022 12:10:11 +0100 Subject: [PATCH 7/7] bump SCHEMA_COMPAT_VERSION --- synapse/storage/schema/__init__.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/synapse/storage/schema/__init__.py b/synapse/storage/schema/__init__.py index cb57cb20af46..5c4733842dac 100644 --- a/synapse/storage/schema/__init__.py +++ b/synapse/storage/schema/__init__.py @@ -77,10 +77,9 @@ SCHEMA_COMPAT_VERSION = ( - # We now assume that `device_lists_changes_in_room` has been filled out for - # recent device_list_updates. - # ... and that `application_services_state.last_txn` is not used. - 69 + # We no longer maintain `event_edges.room_id`, so synapses with SCHEMA_VERSION < 71 + # will break. + 71 ) """Limit on how far the synapse codebase can be rolled back without breaking db compat