From 7dc8fb1fe780888d5758bf665949e9e09d21e56a Mon Sep 17 00:00:00 2001 From: kegsay Date: Tue, 7 Sep 2021 15:07:14 +0100 Subject: [PATCH] Add more logs (#2005) * Add more logs To help debug the migration issue in #1924 along with manual data-loss-inducing fixes. Also log the origin server on processed txns to help debug buggy server origins. * Fix query --- federationapi/routing/send.go | 2 +- .../2021041615092700_state_blocks_refactor.go | 44 ++++++++++++++++--- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/federationapi/routing/send.go b/federationapi/routing/send.go index 2b2828266..aacc8a919 100644 --- a/federationapi/routing/send.go +++ b/federationapi/routing/send.go @@ -345,7 +345,7 @@ func (t *txnReq) processTransaction(ctx context.Context) (*gomatrixserverlib.Res } if c := len(results); c > 0 { - util.GetLogger(ctx).Infof("Processed %d PDUs from transaction %q", c, t.TransactionID) + util.GetLogger(ctx).Infof("Processed %d PDUs from %v in transaction %q", c, t.Origin, t.TransactionID) } return &gomatrixserverlib.RespSend{PDUs: results}, nil } diff --git a/roomserver/storage/postgres/deltas/2021041615092700_state_blocks_refactor.go b/roomserver/storage/postgres/deltas/2021041615092700_state_blocks_refactor.go index 6b72de51a..06740dc8b 100644 --- a/roomserver/storage/postgres/deltas/2021041615092700_state_blocks_refactor.go +++ b/roomserver/storage/postgres/deltas/2021041615092700_state_blocks_refactor.go @@ -220,7 +220,6 @@ func UpStateBlocksRefactor(tx *sql.Tx) error { index := stateSnapshotData{snapshot.StateSnapshotNID, snapshot.RoomNID} newsnapshots[index] = append(newsnapshots[index], blocknid) } - for snapshotdata, newblocks := range newsnapshots { var newblocksarray pq.Int64Array for _, b := range newblocks { @@ -229,11 +228,11 @@ func UpStateBlocksRefactor(tx *sql.Tx) error { var newNID types.StateSnapshotNID err = tx.QueryRow(` - INSERT INTO roomserver_state_snapshots (state_snapshot_hash, room_nid, state_block_nids) - VALUES ($1, $2, $3) - ON CONFLICT (state_snapshot_hash) DO UPDATE SET room_nid=$2 - RETURNING state_snapshot_nid - `, newblocks.Hash(), snapshotdata.RoomNID, newblocksarray).Scan(&newNID) + INSERT INTO roomserver_state_snapshots (state_snapshot_hash, room_nid, state_block_nids) + VALUES ($1, $2, $3) + ON CONFLICT (state_snapshot_hash) DO UPDATE SET room_nid=$2 + RETURNING state_snapshot_nid + `, newblocks.Hash(), snapshotdata.RoomNID, newblocksarray).Scan(&newNID) if err != nil { return fmt.Errorf("tx.QueryRow.Scan (insert new snapshot): %w", err) } @@ -252,16 +251,49 @@ func UpStateBlocksRefactor(tx *sql.Tx) error { // If we do, this is a problem if Dendrite tries to load the snapshot as it will not exist // in roomserver_state_snapshots var count int64 + if err = tx.QueryRow(`SELECT COUNT(*) FROM roomserver_events WHERE state_snapshot_nid < $1 AND state_snapshot_nid != 0`, maxsnapshotid).Scan(&count); err != nil { return fmt.Errorf("assertion query failed: %s", err) } if count > 0 { + var debugEventID, debugRoomID string + var debugEventTypeNID, debugStateKeyNID, debugSnapNID, debugDepth int64 + err = tx.QueryRow( + `SELECT event_id, event_type_nid, event_state_key_nid, roomserver_events.state_snapshot_nid, depth, room_id FROM roomserver_events + JOIN roomserver_rooms ON roomserver_rooms.room_nid = roomserver_events.room_nid WHERE roomserver_events.state_snapshot_nid < $1 AND roomserver_events.state_snapshot_nid != 0`, maxsnapshotid, + ).Scan(&debugEventID, &debugEventTypeNID, &debugStateKeyNID, &debugSnapNID, &debugDepth, &debugRoomID) + if err != nil { + logrus.Errorf("cannot extract debug info: %v", err) + } else { + logrus.Errorf( + "Affected row: event_id=%v room_id=%v type=%v state_key=%v snapshot=%v depth=%v", + debugEventID, debugRoomID, debugEventTypeNID, debugStateKeyNID, debugSnapNID, debugDepth, + ) + logrus.Errorf("To fix this manually, run this query first then retry the migration: "+ + "UPDATE roomserver_events SET state_snapshot_nid=0 WHERE event_id='%v'", debugEventID) + } return fmt.Errorf("%d events exist in roomserver_events which have not been converted to a new state_snapshot_nid; this is a bug, please report", count) } if err = tx.QueryRow(`SELECT COUNT(*) FROM roomserver_rooms WHERE state_snapshot_nid < $1 AND state_snapshot_nid != 0`, maxsnapshotid).Scan(&count); err != nil { return fmt.Errorf("assertion query failed: %s", err) } if count > 0 { + var debugRoomID string + var debugSnapNID, debugLastEventNID int64 + err = tx.QueryRow( + `SELECT room_id, state_snapshot_nid, last_event_sent_nid FROM roomserver_rooms WHERE state_snapshot_nid < $1 AND state_snapshot_nid != 0`, maxsnapshotid, + ).Scan(&debugRoomID, &debugSnapNID, &debugLastEventNID) + if err != nil { + logrus.Errorf("cannot extract debug info: %v", err) + } else { + logrus.Errorf( + "Affected row: room_id=%v snapshot=%v last_sent=%v", + debugRoomID, debugSnapNID, debugLastEventNID, + ) + logrus.Errorf("To fix this manually, run this query first then retry the migration: "+ + "UPDATE roomserver_rooms SET state_snapshot_nid=0 WHERE room_id='%v'", debugRoomID) + logrus.Errorf("Running this UPDATE will cause the room in question to become unavailable on this server. Leave and re-join the room afterwards.") + } return fmt.Errorf("%d rooms exist in roomserver_rooms which have not been converted to a new state_snapshot_nid; this is a bug, please report", count) }