From ebcee91cdb7a3e5c661a52de25109f9365c39f37 Mon Sep 17 00:00:00 2001 From: SqlRush Date: Tue, 30 Jun 2026 22:40:09 +0800 Subject: [PATCH 1/9] feat(cluster): add backup PITR substrate --- src/backend/catalog/system_views.sql | 65 ++ src/backend/cluster/Makefile | 7 +- src/backend/cluster/cluster_backup.c | 890 ++++++++++++++++++ src/backend/cluster/cluster_backup_manifest.c | 324 +++++++ src/backend/cluster/cluster_guc.c | 91 ++ src/backend/cluster/cluster_shmem.c | 5 + src/backend/storage/lmgr/lwlock.c | 2 + src/backend/utils/errcodes.txt | 10 + src/include/catalog/catversion.h | 6 +- src/include/catalog/pg_proc.dat | 61 ++ src/include/cluster/cluster_backup.h | 176 ++++ src/include/cluster/cluster_guc.h | 18 + src/include/storage/lwlock.h | 2 + 13 files changed, 1655 insertions(+), 2 deletions(-) create mode 100644 src/backend/cluster/cluster_backup.c create mode 100644 src/backend/cluster/cluster_backup_manifest.c create mode 100644 src/include/cluster/cluster_backup.h diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 7051436104..8dd6219cec 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1685,6 +1685,71 @@ GRANT SELECT ON pg_cluster_node_removal_state TO PUBLIC; -- REVOKE EXECUTE FROM PUBLIC for defense-in-depth (L7). REVOKE ALL ON FUNCTION pg_cluster_remove_node(int) FROM PUBLIC; +-- PGRAC: cluster-aware backup / restore / PITR surface (spec-6.5). +-- The state/history/restore-point/PITR views are read-only observability. +-- Mutating entry points are superuser-gated in C and revoked from PUBLIC. 
+CREATE VIEW pg_stat_cluster_backup AS + SELECT in_progress, + backup_id, + coordinator_node_id, + start_redo_lsn, + checkpoint_lsn, + stop_cut_lsn, + consistent_scn, + manifest_crc, + started_at, + stopped_at, + backup_parallel_channels, + backup_wal_retention, + restore_points_enabled, + restore_point_interval_ms + FROM cluster_get_backup_state(); + +REVOKE ALL ON pg_stat_cluster_backup FROM PUBLIC; +GRANT SELECT ON pg_stat_cluster_backup TO PUBLIC; + +CREATE VIEW pg_cluster_backup_history AS + SELECT backup_id, + consistent_scn, + scn_durable_peak, + timeline, + catversion, + storage_id, + node_count, + thread_count, + manifest_crc + FROM cluster_get_backup_history(); + +REVOKE ALL ON pg_cluster_backup_history FROM PUBLIC; +GRANT SELECT ON pg_cluster_backup_history TO PUBLIC; + +CREATE VIEW pg_cluster_restore_points AS + SELECT restore_point_name, + cut_scn, + thread_count, + incarnation, + created_at + FROM cluster_get_restore_points(); + +REVOKE ALL ON pg_cluster_restore_points FROM PUBLIC; +GRANT SELECT ON pg_cluster_restore_points TO PUBLIC; + +CREATE VIEW pg_cluster_pitr_status AS + SELECT target_type, + target_action, + reachable, + reason, + resolved_scn, + restore_point_name + FROM cluster_get_pitr_status(); + +REVOKE ALL ON pg_cluster_pitr_status FROM PUBLIC; +GRANT SELECT ON pg_cluster_pitr_status TO PUBLIC; + +REVOKE ALL ON FUNCTION pg_cluster_backup_start(text, bool) FROM PUBLIC; +REVOKE ALL ON FUNCTION pg_cluster_backup_stop(bool) FROM PUBLIC; +REVOKE ALL ON FUNCTION pg_cluster_create_restore_point(text) FROM PUBLIC; + -- PGRAC: pg_cluster_ic_msg_types (spec-2.3 D8; 2026-05-08). -- Lists every IC message type registered in the process-local -- dispatch_table[] under cluster_ic_router.c. 
Diagnostic / diff --git a/src/backend/cluster/Makefile b/src/backend/cluster/Makefile index b3d4103386..83e44d0aa6 100644 --- a/src/backend/cluster/Makefile +++ b/src/backend/cluster/Makefile @@ -41,6 +41,8 @@ OBJS = \ cluster.o \ cluster_advisory.o \ cluster_cancel_token.o \ + cluster_backup.o \ + cluster_backup_manifest.o \ cluster_cf_authority.o \ cluster_cf_enqueue.o \ cluster_cf_phase2.o \ @@ -208,7 +210,8 @@ else OBJS = cluster_conf.o cluster_debug.o cluster_ic.o cluster_inject.o cluster_undo_srf.o \ cluster_cr_srf.o cluster_block_apply_srf.o cluster_block_recovery_srf.o cluster_thread_recovery_apply_srf.o cluster_thread_recovery_replay_srf.o cluster_thread_recovery_driver_srf.o cluster_thread_recovery_orchestrator_srf.o cluster_pgstat.o cluster_scn.o cluster_views.o cluster_ges_mode_backend.o \ cluster_ir_srf.o cluster_ts_srf.o cluster_ko_srf.o \ - cluster_hang_resolve.o cluster_clean_leave_views.o cluster_node_remove_views.o + cluster_hang_resolve.o cluster_clean_leave_views.o cluster_node_remove_views.o \ + cluster_backup.o cluster_backup_manifest.o # spec-5.12: cluster_hang_resolve.o provides the pg_cluster_hang_victims / # pg_cluster_hang_resolve SQL symbols (real bodies #ifdef USE_PGRAC_CLUSTER, # --disable-cluster stubs raise ERRCODE_FEATURE_NOT_SUPPORTED); the symbols @@ -217,6 +220,8 @@ OBJS = cluster_conf.o cluster_debug.o cluster_ic.o cluster_inject.o cluster_undo # SRF + pg_cluster_clean_leave_request UDF symbols, same unconditional-link reason. # spec-5.18: cluster_node_remove_views.o provides cluster_get_node_removal_state + # pg_cluster_remove_node, same unconditional-link reason. +# spec-6.5: cluster_backup.o provides the cluster-aware backup/restore/PITR +# SQL symbols; --disable-cluster bodies raise ERRCODE_FEATURE_NOT_SUPPORTED. 
endif include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/cluster/cluster_backup.c b/src/backend/cluster/cluster_backup.c new file mode 100644 index 0000000000..9451310785 --- /dev/null +++ b/src/backend/cluster/cluster_backup.c @@ -0,0 +1,890 @@ +/*------------------------------------------------------------------------- + * + * cluster_backup.c + * Cluster-aware backup / restore / PITR SQL surface and shmem state. + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2026, pgrac contributors + * + * Author: SqlRush + * + * IDENTIFICATION + * src/backend/cluster/cluster_backup.c + * + * NOTES + * This is a pgrac-original file. Linked in both build modes because + * pg_proc.dat references the SQL symbols unconditionally. + * Spec: spec-6.5-cluster-aware-backup-restore-pitr.md + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/xlog.h" +#include "access/xlogbackup.h" +#include "catalog/catversion.h" +#include "catalog/pg_type.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "utils/builtins.h" +#include "utils/errcodes.h" +#include "utils/memutils.h" +#include "utils/pg_lsn.h" +#include "utils/timestamp.h" +#include "utils/tuplestore.h" + +#include "cluster/cluster_backup.h" +#include "cluster/cluster_conf.h" +#include "cluster/cluster_guc.h" +#include "cluster/cluster_shmem.h" + +PG_FUNCTION_INFO_V1(pg_cluster_backup_start); +PG_FUNCTION_INFO_V1(pg_cluster_backup_stop); +PG_FUNCTION_INFO_V1(pg_cluster_create_restore_point); +PG_FUNCTION_INFO_V1(cluster_get_backup_state); +PG_FUNCTION_INFO_V1(cluster_get_backup_history); +PG_FUNCTION_INFO_V1(cluster_get_restore_points); +PG_FUNCTION_INFO_V1(cluster_get_pitr_status); + +#ifdef 
USE_PGRAC_CLUSTER + +#include "cluster/cluster_wal_thread.h" + +typedef struct ClusterBackupSharedState { + LWLockPadded lock; + ClusterBackupStatus status; + bool have_manifest; + ClusterBackupManifest last_manifest; + int restore_point_count; + int restore_point_next; + ClusterRestorePoint restore_points[CLUSTER_BACKUP_RESTORE_POINT_MAX]; +} ClusterBackupSharedState; + +static ClusterBackupSharedState *cluster_backup_state = NULL; +static BackupState *cluster_backup_session_state = NULL; +static StringInfo cluster_backup_tablespace_map = NULL; +static MemoryContext cluster_backup_context = NULL; + +static const char * +cluster_pitr_action_name(int action) +{ + switch (action) { + case CLUSTER_RECOVERY_TARGET_ACTION_PAUSE: + return "pause"; + case CLUSTER_RECOVERY_TARGET_ACTION_PROMOTE: + return "promote"; + case CLUSTER_RECOVERY_TARGET_ACTION_SHUTDOWN: + return "shutdown"; + } + return "unknown"; +} + +Size +cluster_backup_shmem_size(void) +{ + return sizeof(ClusterBackupSharedState); +} + +void +cluster_backup_shmem_init(void) +{ + bool found; + + cluster_backup_state + = ShmemInitStruct("pgrac cluster backup", cluster_backup_shmem_size(), &found); + if (!found) { + MemSet(cluster_backup_state, 0, sizeof(*cluster_backup_state)); + LWLockInitialize(&cluster_backup_state->lock.lock, LWTRANCHE_CLUSTER_BACKUP); + } +} + +static const ClusterShmemRegion cluster_backup_region = { + .name = "pgrac cluster backup", + .size_fn = cluster_backup_shmem_size, + .init_fn = cluster_backup_shmem_init, + .lwlock_count = 1, + .owner_subsys = "cluster_backup", + .reserved_flags = 0, +}; + +void +cluster_backup_shmem_register(void) +{ + cluster_shmem_register_region(&cluster_backup_region); +} + +static void +cluster_backup_error_if_unavailable(const char *op) +{ + if (!cluster_enabled) + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("%s requires cluster.enabled", op))); + if (cluster_node_id < 0 || cluster_node_id >= CLUSTER_MAX_NODES) + ereport(ERROR, 
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("%s requires a valid cluster.node_id", op))); + if (cluster_backup_state == NULL) + ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cluster backup shared state is not initialized"))); + if (cluster_conf_has_peers()) + ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("%s is not available for multi-node clusters yet", op), + errhint("Use a single-node cluster topology, or wait for the 6.5 " + "coordinator/backup-set writer to land."))); +} + +static SCN +cluster_backup_current_scn(void) +{ + SCN scn; + + scn = cluster_scn_current(); + if (!SCN_VALID(scn)) + scn = cluster_scn_advance(); + return scn; +} + +static void +cluster_backup_update_start(const char *backup_id, const BackupState *state) +{ + ClusterBackupStatus status; + + MemSet(&status, 0, sizeof(status)); + status.in_progress = true; + strlcpy(status.backup_id, backup_id, sizeof(status.backup_id)); + status.coordinator_node_id = cluster_node_id; + status.start_redo_lsn = state->startpoint; + status.checkpoint_lsn = state->checkpointloc; + status.start_tli = state->starttli; + status.started_at = GetCurrentTimestamp(); + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); + cluster_backup_state->status = status; + LWLockRelease(&cluster_backup_state->lock.lock); +} + +static void +cluster_backup_update_stop(const BackupState *state, + const ClusterBackupManifest *manifest, + SCN cut_scn) +{ + LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); + cluster_backup_state->status.in_progress = false; + cluster_backup_state->status.stop_cut_lsn = state->stoppoint; + cluster_backup_state->status.consistent_scn = cut_scn; + cluster_backup_state->status.manifest_crc = manifest->manifest_crc; + cluster_backup_state->status.stopped_at = GetCurrentTimestamp(); + cluster_backup_state->last_manifest = *manifest; + cluster_backup_state->have_manifest = true; + 
LWLockRelease(&cluster_backup_state->lock.lock); +} + +static void +cluster_backup_mark_native_stopped(const BackupState *state) +{ + LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); + cluster_backup_state->status.in_progress = false; + if (state != NULL) + cluster_backup_state->status.stop_cut_lsn = state->stoppoint; + cluster_backup_state->status.stopped_at = GetCurrentTimestamp(); + LWLockRelease(&cluster_backup_state->lock.lock); +} + +static void +cluster_backup_add_restore_point(const ClusterRestorePoint *point) +{ + int slot; + + if (point == NULL || !point->present) + return; + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); + slot = cluster_backup_state->restore_point_next; + cluster_backup_state->restore_points[slot] = *point; + cluster_backup_state->restore_point_next + = (cluster_backup_state->restore_point_next + 1) % CLUSTER_BACKUP_RESTORE_POINT_MAX; + if (cluster_backup_state->restore_point_count < CLUSTER_BACKUP_RESTORE_POINT_MAX) + cluster_backup_state->restore_point_count++; + LWLockRelease(&cluster_backup_state->lock.lock); +} + +void +cluster_backup_get_status(ClusterBackupStatus *out) +{ + if (out == NULL) + return; + MemSet(out, 0, sizeof(*out)); + if (cluster_backup_state == NULL) + return; + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_SHARED); + *out = cluster_backup_state->status; + LWLockRelease(&cluster_backup_state->lock.lock); +} + +bool +cluster_backup_get_last_manifest(ClusterBackupManifest *out) +{ + bool have_manifest; + + if (out == NULL) + return false; + MemSet(out, 0, sizeof(*out)); + if (cluster_backup_state == NULL) + return false; + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_SHARED); + have_manifest = cluster_backup_state->have_manifest; + if (have_manifest) + *out = cluster_backup_state->last_manifest; + LWLockRelease(&cluster_backup_state->lock.lock); + return have_manifest; +} + +int +cluster_backup_get_restore_points(ClusterRestorePoint *out, int max_points) +{ + int 
count; + int start; + int i; + + if (out == NULL || max_points <= 0 || cluster_backup_state == NULL) + return 0; + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_SHARED); + count = Min(cluster_backup_state->restore_point_count, max_points); + start = cluster_backup_state->restore_point_next - cluster_backup_state->restore_point_count; + if (start < 0) + start += CLUSTER_BACKUP_RESTORE_POINT_MAX; + for (i = 0; i < count; i++) { + int slot = (start + i) % CLUSTER_BACKUP_RESTORE_POINT_MAX; + + out[i] = cluster_backup_state->restore_points[slot]; + } + LWLockRelease(&cluster_backup_state->lock.lock); + return count; +} + +static void +cluster_backup_fill_local_manifest(ClusterBackupManifest *manifest, + const BackupState *state, + SCN cut_scn) +{ + ClusterBackupManifestThread thread; + uint16 thread_id = cluster_wal_thread_id(); + int thread_index; + + if (thread_id == XLP_THREAD_ID_LEGACY) + thread_id = 1; + thread_index = (int)thread_id - 1; + + cluster_backup_manifest_init(manifest, state->name); + manifest->consistent_scn = cut_scn; + manifest->scn_durable_peak = cut_scn; + manifest->timeline = state->stoptli; + manifest->catversion = CATALOG_VERSION_NO; + manifest->incarnation = 0; + manifest->backend_storage_id = (uint32)cluster_shared_storage_backend; + manifest->node_count = 1; + manifest->control_included = true; + manifest->voting_included = false; + + MemSet(&thread, 0, sizeof(thread)); + thread.present = true; + thread.wal_included = true; + thread.undo_included = true; + thread.tt_included = true; + thread.thread_id = thread_id; + thread.node_id = cluster_node_id; + thread.start_redo_lsn = state->startpoint; + thread.checkpoint_lsn = state->checkpointloc; + thread.start_tli = state->starttli; + thread.stop_cut_lsn = state->stoppoint; + + if (!cluster_backup_manifest_set_thread(manifest, thread_index, &thread)) + ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("could not add local WAL thread to cluster backup manifest"))); + + if 
(cluster_backup_manifest_checksums != CLUSTER_BACKUP_MANIFEST_CHECKSUM_CRC32C) + ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("cluster backup manifests require crc32c checksums"))); + cluster_backup_manifest_seal(manifest); + if (cluster_backup_manifest_validate(manifest) != CLUSTER_BACKUP_MANIFEST_OK) + ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("cluster backup manifest failed self-validation"))); +} + +static char * +cluster_backup_build_label(const BackupState *state, + const ClusterBackupManifest *manifest, + SCN cut_scn) +{ + StringInfoData buf; + char *native; + + native = build_backup_content((BackupState *)state, false); + initStringInfo(&buf); + appendStringInfoString(&buf, native); + appendStringInfo(&buf, "CLUSTER_BACKUP_ID: %s\n", manifest->backup_id); + appendStringInfo(&buf, "CLUSTER_CONSISTENT_SCN: " UINT64_FORMAT "\n", (uint64)cut_scn); + appendStringInfo(&buf, "CLUSTER_MANIFEST_CRC32C: %u\n", manifest->manifest_crc); + appendStringInfo(&buf, "CLUSTER_NODE_COUNT: %u\n", manifest->node_count); + appendStringInfo(&buf, "CLUSTER_THREAD_COUNT: %u\n", manifest->thread_count); + pfree(native); + return buf.data; +} + +Datum +pg_cluster_backup_start(PG_FUNCTION_ARGS) +{ +#define PG_CLUSTER_BACKUP_START_COLS 4 + TupleDesc tupdesc; + Datum values[PG_CLUSTER_BACKUP_START_COLS] = {0}; + bool nulls[PG_CLUSTER_BACKUP_START_COLS] = {0}; + text *backupid = PG_GETARG_TEXT_PP(0); + bool fast = PG_GETARG_BOOL(1); + char *backupidstr; + SessionBackupState status; + MemoryContext oldcontext; + + if (!superuser()) + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to start a cluster backup"))); + cluster_backup_error_if_unavailable("pg_cluster_backup_start"); + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + backupidstr = text_to_cstring(backupid); + if (strlen(backupidstr) >= CLUSTER_BACKUP_ID_MAX) + 
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cluster backup id is too long"), + errdetail("Maximum length is %d bytes.", CLUSTER_BACKUP_ID_MAX - 1))); + + status = get_backup_status(); + if (status == SESSION_BACKUP_RUNNING) + ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_IN_PROGRESS), + errmsg("a backup is already in progress in this session"))); + + if (cluster_backup_context == NULL) + cluster_backup_context = AllocSetContextCreate(TopMemoryContext, + "cluster backup context", + ALLOCSET_START_SMALL_SIZES); + else { + cluster_backup_session_state = NULL; + cluster_backup_tablespace_map = NULL; + MemoryContextReset(cluster_backup_context); + } + + oldcontext = MemoryContextSwitchTo(cluster_backup_context); + cluster_backup_session_state = (BackupState *)palloc0(sizeof(BackupState)); + cluster_backup_tablespace_map = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); + + register_persistent_abort_backup_handler(); + do_pg_backup_start(backupidstr, fast, NULL, cluster_backup_session_state, + cluster_backup_tablespace_map); + cluster_backup_update_start(backupidstr, cluster_backup_session_state); + + values[0] = CStringGetTextDatum(backupidstr); + values[1] = LSNGetDatum(cluster_backup_session_state->startpoint); + values[2] = LSNGetDatum(cluster_backup_session_state->checkpointloc); + values[3] = Int32GetDatum((int32)cluster_backup_session_state->starttli); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + +Datum +pg_cluster_backup_stop(PG_FUNCTION_ARGS) +{ +#define PG_CLUSTER_BACKUP_STOP_COLS 4 + TupleDesc tupdesc; + Datum values[PG_CLUSTER_BACKUP_STOP_COLS] = {0}; + bool nulls[PG_CLUSTER_BACKUP_STOP_COLS] = {0}; + bool waitforarchive = PG_GETARG_BOOL(0); + ClusterBackupManifest manifest; + ClusterRestorePoint point; + SCN cut_scn; + char *backup_label; + XLogRecPtr thread_lsn[CLUSTER_MAX_NODES]; + SCN thread_scn[CLUSTER_MAX_NODES]; + int thread_index; + uint16 thread_id; + + if (!superuser()) + 
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to stop a cluster backup"))); + cluster_backup_error_if_unavailable("pg_cluster_backup_stop"); + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + if (get_backup_status() != SESSION_BACKUP_RUNNING) + ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cluster backup is not in progress"), + errhint("Did you call pg_cluster_backup_start()?"))); + if (cluster_backup_session_state == NULL || cluster_backup_tablespace_map == NULL) + ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cluster backup session state is missing"))); + + do_pg_backup_stop(cluster_backup_session_state, waitforarchive); + cluster_backup_mark_native_stopped(cluster_backup_session_state); + cut_scn = cluster_backup_current_scn(); + cluster_backup_fill_local_manifest(&manifest, cluster_backup_session_state, cut_scn); + backup_label = cluster_backup_build_label(cluster_backup_session_state, &manifest, cut_scn); + cluster_backup_update_stop(cluster_backup_session_state, &manifest, cut_scn); + + MemSet(thread_lsn, 0, sizeof(thread_lsn)); + MemSet(thread_scn, 0, sizeof(thread_scn)); + thread_id = cluster_wal_thread_id(); + if (thread_id == XLP_THREAD_ID_LEGACY) + thread_id = 1; + thread_index = (int)thread_id - 1; + thread_lsn[thread_index] = cluster_backup_session_state->stoppoint; + thread_scn[thread_index] = cut_scn; + if (cluster_restore_point_build(&point, manifest.backup_id, thread_scn, thread_lsn, + CLUSTER_MAX_NODES, true, true, manifest.incarnation) + == CLUSTER_RESTORE_POINT_CUT_OK) { + point.created_at = GetCurrentTimestamp(); + cluster_backup_add_restore_point(&point); + } + + values[0] = Int64GetDatum((int64)cut_scn); + values[1] = LSNGetDatum(cluster_backup_session_state->stoppoint); + values[2] = Int64GetDatum((int64)manifest.manifest_crc); + values[3] = 
CStringGetTextDatum(backup_label);
+
+    pfree(backup_label);
+    cluster_backup_session_state = NULL;
+    cluster_backup_tablespace_map = NULL;
+    MemoryContextDelete(cluster_backup_context);
+    cluster_backup_context = NULL;
+
+    PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
+}
+
+/*
+ * pg_cluster_create_restore_point
+ *      SQL-callable: write a native restore point record and remember the
+ *      matching cluster cut (SCN + LSN for the local WAL thread) in the
+ *      shared restore-point ring.
+ *
+ * Argument 0 is the restore point name (text); it must be shorter than
+ * CLUSTER_RESTORE_POINT_NAME_MAX bytes.
+ *
+ * Returns a three-column row: the name, the cut SCN and the LSN returned
+ * by XLogRestorePoint().
+ *
+ * Raises an error when the caller is not a superuser, when the cluster
+ * backup surface is unavailable (see cluster_backup_error_if_unavailable),
+ * during recovery, when wal_level is insufficient, when the name is too
+ * long, when the local WAL thread id does not map to a valid slot, or when
+ * the restore point cut cannot be built.
+ */
+Datum
+pg_cluster_create_restore_point(PG_FUNCTION_ARGS)
+{
+#define PG_CLUSTER_RESTORE_POINT_COLS 3
+    TupleDesc tupdesc;
+    Datum values[PG_CLUSTER_RESTORE_POINT_COLS] = {0};
+    bool nulls[PG_CLUSTER_RESTORE_POINT_COLS] = {0};
+    text *restore_name = PG_GETARG_TEXT_PP(0);
+    char *restore_name_str;
+    XLogRecPtr restorepoint;
+    SCN cut_scn;
+    SCN thread_scn[CLUSTER_MAX_NODES];
+    XLogRecPtr thread_lsn[CLUSTER_MAX_NODES];
+    uint16 thread_id;
+    int thread_index;
+    ClusterRestorePoint point;
+    ClusterRestorePointCutReason reason;
+
+    if (!superuser())
+        ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                        errmsg("must be superuser to create a cluster restore point")));
+    cluster_backup_error_if_unavailable("pg_cluster_create_restore_point");
+    if (RecoveryInProgress())
+        ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                        errmsg("recovery is in progress"),
+                        errhint("WAL control functions cannot be executed during recovery.")));
+    if (!XLogIsNeeded())
+        ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                        errmsg("WAL level not sufficient for creating a restore point"),
+                        errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
+
+    if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+        elog(ERROR, "return type must be a row type");
+
+    restore_name_str = text_to_cstring(restore_name);
+    if (strlen(restore_name_str) >= CLUSTER_RESTORE_POINT_NAME_MAX)
+        ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                        errmsg("cluster restore point name is too long"),
+                        errdetail("Maximum length is %d bytes.",
+                                  CLUSTER_RESTORE_POINT_NAME_MAX - 1)));
+
+    /*
+     * Resolve and validate the local WAL thread slot BEFORE writing any WAL.
+     * thread_index is used below to index the CLUSTER_MAX_NODES-sized stack
+     * arrays, so an out-of-range value must be rejected here rather than
+     * after the restore point record has already been emitted.
+     */
+    thread_id = cluster_wal_thread_id();
+    if (thread_id == XLP_THREAD_ID_LEGACY)
+        thread_id = 1;
+    thread_index = (int)thread_id - 1;
+    if (thread_index < 0 || thread_index >= CLUSTER_MAX_NODES)
+        ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                        errmsg("local WAL thread id %u is out of range for a cluster restore point",
+                               (unsigned int)thread_id)));
+
+    restorepoint = XLogRestorePoint(restore_name_str);
+    cut_scn = cluster_backup_current_scn();
+
+    /* Only the local thread's slot is populated; all other slots stay zero. */
+    MemSet(thread_scn, 0, sizeof(thread_scn));
+    MemSet(thread_lsn, 0, sizeof(thread_lsn));
+    thread_scn[thread_index] = cut_scn;
+    thread_lsn[thread_index] = restorepoint;
+    reason = cluster_restore_point_build(&point, restore_name_str, thread_scn, thread_lsn,
+                                         CLUSTER_MAX_NODES, true, true, 0);
+    if (reason != CLUSTER_RESTORE_POINT_CUT_OK)
+        ereport(ERROR, (errcode(ERRCODE_CLUSTER_RESTORE_POINT_DRAIN_TIMEOUT),
+                        errmsg("could not build cluster restore point cut: %s",
+                               cluster_restore_point_cut_reason_name(reason))));
+    point.created_at = GetCurrentTimestamp();
+    cluster_backup_add_restore_point(&point);
+
+    values[0] = CStringGetTextDatum(restore_name_str);
+    values[1] = Int64GetDatum((int64)cut_scn);
+    values[2] = LSNGetDatum(restorepoint);
+
+    PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
+}
+
+Datum
+cluster_get_backup_state(PG_FUNCTION_ARGS)
+{
+    ReturnSetInfo *rsinfo;
+    ClusterBackupStatus status;
+    Datum values[14];
+    bool nulls[14];
+
+    InitMaterializedSRF(fcinfo, 0);
+    rsinfo = (ReturnSetInfo *)fcinfo->resultinfo;
+    if (!cluster_enabled)
+        return (Datum)0;
+
+    cluster_backup_get_status(&status);
+    MemSet(nulls, false, sizeof(nulls));
+    values[0] = BoolGetDatum(status.in_progress);
+    if (status.backup_id[0] == '\0')
+        nulls[1] = true;
+    else
+        values[1] = CStringGetTextDatum(status.backup_id);
+    values[2] = Int32GetDatum(status.coordinator_node_id);
+    values[3] = LSNGetDatum(status.start_redo_lsn);
+    values[4] = LSNGetDatum(status.checkpoint_lsn);
+    values[5] = LSNGetDatum(status.stop_cut_lsn);
+    values[6] = Int64GetDatum((int64)status.consistent_scn);
+    values[7] = Int64GetDatum((int64)status.manifest_crc);
+    if (status.started_at == 0)
+        nulls[8] = true;
+    else
+        values[8] = TimestampTzGetDatum(status.started_at);
+ if (status.stopped_at == 0) + nulls[9] = true; + else + values[9] = TimestampTzGetDatum(status.stopped_at); + values[10] = Int32GetDatum(cluster_backup_parallel_channels); + values[11] = Int32GetDatum(cluster_backup_wal_retention); + values[12] = BoolGetDatum(cluster_enable_pitr_restore_points); + values[13] = Int32GetDatum(cluster_pitr_restore_point_interval_ms); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + return (Datum)0; +} + +Datum +cluster_get_backup_history(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo; + ClusterBackupManifest manifest; + Datum values[9]; + bool nulls[9]; + + InitMaterializedSRF(fcinfo, 0); + rsinfo = (ReturnSetInfo *)fcinfo->resultinfo; + if (!cluster_enabled || !cluster_backup_get_last_manifest(&manifest)) + return (Datum)0; + + MemSet(nulls, false, sizeof(nulls)); + values[0] = CStringGetTextDatum(manifest.backup_id); + values[1] = Int64GetDatum((int64)manifest.consistent_scn); + values[2] = Int64GetDatum((int64)manifest.scn_durable_peak); + values[3] = Int32GetDatum((int32)manifest.timeline); + values[4] = Int64GetDatum((int64)manifest.catversion); + values[5] = Int32GetDatum((int32)manifest.backend_storage_id); + values[6] = Int32GetDatum((int32)manifest.node_count); + values[7] = Int32GetDatum((int32)manifest.thread_count); + values[8] = Int64GetDatum((int64)manifest.manifest_crc); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + return (Datum)0; +} + +Datum +cluster_get_restore_points(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo; + ClusterRestorePoint points[CLUSTER_BACKUP_RESTORE_POINT_MAX]; + int count; + int i; + + InitMaterializedSRF(fcinfo, 0); + rsinfo = (ReturnSetInfo *)fcinfo->resultinfo; + if (!cluster_enabled) + return (Datum)0; + + count = cluster_backup_get_restore_points(points, CLUSTER_BACKUP_RESTORE_POINT_MAX); + for (i = 0; i < count; i++) { + Datum values[5]; + bool nulls[5] = {false}; + + values[0] = CStringGetTextDatum(points[i].name); + values[1] = 
Int64GetDatum((int64)points[i].cut_scn); + values[2] = Int32GetDatum((int32)points[i].thread_count); + values[3] = Int32GetDatum((int32)points[i].incarnation); + if (points[i].created_at == 0) + nulls[4] = true; + else + values[4] = TimestampTzGetDatum(points[i].created_at); + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } + return (Datum)0; +} + +Datum +cluster_get_pitr_status(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo; + ClusterBackupManifest manifest; + ClusterRestorePoint points[CLUSTER_BACKUP_RESTORE_POINT_MAX]; + ClusterRestorePoint chosen; + ClusterPitrTargetReason reason; + Datum values[6]; + bool nulls[6] = {false}; + int count; + int i; + const char *target_action = cluster_pitr_action_name(cluster_recovery_target_action); + bool have_requested_scn = false; + bool invalid_requested_scn = false; + SCN requested_scn = InvalidScn; + + InitMaterializedSRF(fcinfo, 0); + rsinfo = (ReturnSetInfo *)fcinfo->resultinfo; + if (!cluster_enabled) + return (Datum)0; + + if ((cluster_recovery_target_scn == NULL || cluster_recovery_target_scn[0] == '\0') + && (cluster_recovery_target_name == NULL || cluster_recovery_target_name[0] == '\0') + && cluster_recovery_target_cluster_time != NULL + && cluster_recovery_target_cluster_time[0] != '\0') { + values[0] = CStringGetTextDatum("cluster_time"); + values[1] = CStringGetTextDatum(target_action); + values[2] = BoolGetDatum(false); + values[3] = CStringGetTextDatum("unsupported_target_type"); + nulls[4] = true; + nulls[5] = true; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + return (Datum)0; + } + + if (cluster_recovery_target_scn != NULL && cluster_recovery_target_scn[0] != '\0') { + int64 parsed = pg_strtoint64(cluster_recovery_target_scn); + + have_requested_scn = true; + if (parsed > 0) + requested_scn = (SCN)parsed; + else + invalid_requested_scn = true; + } + + if (invalid_requested_scn) { + values[0] = CStringGetTextDatum("scn"); + values[1] = 
CStringGetTextDatum(target_action); + values[2] = BoolGetDatum(false); + values[3] = CStringGetTextDatum("invalid_target"); + nulls[4] = true; + nulls[5] = true; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + return (Datum)0; + } + + if (!have_requested_scn && cluster_recovery_target_name != NULL + && cluster_recovery_target_name[0] != '\0') { + if (!cluster_backup_get_last_manifest(&manifest)) { + values[0] = CStringGetTextDatum("name"); + values[1] = CStringGetTextDatum(target_action); + values[2] = BoolGetDatum(false); + values[3] = CStringGetTextDatum("manifest"); + nulls[4] = true; + nulls[5] = true; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + return (Datum)0; + } + + count = cluster_backup_get_restore_points(points, CLUSTER_BACKUP_RESTORE_POINT_MAX); + for (i = 0; i < count; i++) { + if (!points[i].present) + continue; + if (strcmp(points[i].name, cluster_recovery_target_name) == 0) { + if (scn_time_cmp(points[i].cut_scn, manifest.consistent_scn) < 0) { + values[0] = CStringGetTextDatum("name"); + values[1] = CStringGetTextDatum(target_action); + values[2] = BoolGetDatum(false); + values[3] = CStringGetTextDatum("before_backup"); + nulls[4] = true; + nulls[5] = true; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + return (Datum)0; + } + if (points[i].thread_count == 0 || points[i].thread_count > CLUSTER_MAX_NODES) { + values[0] = CStringGetTextDatum("name"); + values[1] = CStringGetTextDatum(target_action); + values[2] = BoolGetDatum(false); + values[3] = CStringGetTextDatum("missing_thread"); + nulls[4] = true; + nulls[5] = true; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + return (Datum)0; + } + + values[0] = CStringGetTextDatum("name"); + values[1] = CStringGetTextDatum(target_action); + values[2] = BoolGetDatum(true); + values[3] = CStringGetTextDatum("ok"); + values[4] = Int64GetDatum((int64)points[i].cut_scn); + values[5] = 
CStringGetTextDatum(points[i].name); + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + return (Datum)0; + } + } + values[0] = CStringGetTextDatum("name"); + values[1] = CStringGetTextDatum(target_action); + values[2] = BoolGetDatum(false); + values[3] = CStringGetTextDatum("no_restore_point"); + nulls[4] = true; + nulls[5] = true; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + return (Datum)0; + } + + if (!SCN_VALID(requested_scn)) { + values[0] = CStringGetTextDatum("latest"); + values[1] = CStringGetTextDatum(target_action); + values[2] = BoolGetDatum(true); + values[3] = CStringGetTextDatum("ok"); + nulls[4] = true; + nulls[5] = true; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + return (Datum)0; + } + + if (!cluster_backup_get_last_manifest(&manifest)) { + values[0] = CStringGetTextDatum("scn"); + values[1] = CStringGetTextDatum(target_action); + values[2] = BoolGetDatum(false); + values[3] = CStringGetTextDatum("manifest"); + nulls[4] = true; + nulls[5] = true; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + return (Datum)0; + } + + count = cluster_backup_get_restore_points(points, CLUSTER_BACKUP_RESTORE_POINT_MAX); + reason = cluster_pitr_resolve_scn(points, count, requested_scn, + manifest.consistent_scn, &chosen); + values[0] = CStringGetTextDatum("scn"); + values[1] = CStringGetTextDatum(target_action); + values[2] = BoolGetDatum(reason == CLUSTER_PITR_TARGET_OK); + values[3] = CStringGetTextDatum(cluster_pitr_target_reason_name(reason)); + if (reason == CLUSTER_PITR_TARGET_OK) { + values[4] = Int64GetDatum((int64)chosen.cut_scn); + values[5] = CStringGetTextDatum(chosen.name); + } else { + nulls[4] = true; + nulls[5] = true; + } + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + return (Datum)0; +} + +#else /* !USE_PGRAC_CLUSTER */ + +Size +cluster_backup_shmem_size(void) +{ + return 0; +} + +void 
+cluster_backup_shmem_init(void) +{ +} + +void +cluster_backup_shmem_register(void) +{ +} + +void +cluster_backup_get_status(ClusterBackupStatus *out) +{ + if (out != NULL) + MemSet(out, 0, sizeof(*out)); +} + +bool +cluster_backup_get_last_manifest(ClusterBackupManifest *out) +{ + if (out != NULL) + MemSet(out, 0, sizeof(*out)); + return false; +} + +int +cluster_backup_get_restore_points(ClusterRestorePoint *out, int max_points) +{ + return 0; +} + +Datum +pg_cluster_backup_start(PG_FUNCTION_ARGS) +{ + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("pg_cluster_backup_start requires a --enable-cluster build"))); + PG_RETURN_NULL(); +} + +Datum +pg_cluster_backup_stop(PG_FUNCTION_ARGS) +{ + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("pg_cluster_backup_stop requires a --enable-cluster build"))); + PG_RETURN_NULL(); +} + +Datum +pg_cluster_create_restore_point(PG_FUNCTION_ARGS) +{ + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("pg_cluster_create_restore_point requires a --enable-cluster build"))); + PG_RETURN_NULL(); +} + +Datum +cluster_get_backup_state(PG_FUNCTION_ARGS) +{ + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cluster_get_backup_state requires a --enable-cluster build"))); + PG_RETURN_NULL(); +} + +Datum +cluster_get_backup_history(PG_FUNCTION_ARGS) +{ + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cluster_get_backup_history requires a --enable-cluster build"))); + PG_RETURN_NULL(); +} + +Datum +cluster_get_restore_points(PG_FUNCTION_ARGS) +{ + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cluster_get_restore_points requires a --enable-cluster build"))); + PG_RETURN_NULL(); +} + +Datum +cluster_get_pitr_status(PG_FUNCTION_ARGS) +{ + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cluster_get_pitr_status requires a --enable-cluster build"))); + PG_RETURN_NULL(); +} + +#endif /* USE_PGRAC_CLUSTER */ diff --git 
a/src/backend/cluster/cluster_backup_manifest.c b/src/backend/cluster/cluster_backup_manifest.c new file mode 100644 index 0000000000..cf5b72ae72 --- /dev/null +++ b/src/backend/cluster/cluster_backup_manifest.c @@ -0,0 +1,324 @@ +/*------------------------------------------------------------------------- + * + * cluster_backup_manifest.c + * Dependency-light helpers for cluster backup manifests and PITR cuts. + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2026, pgrac contributors + * + * Author: SqlRush + * + * IDENTIFICATION + * src/backend/cluster/cluster_backup_manifest.c + * + * NOTES + * This is a pgrac-original file (no derivation from PostgreSQL). + * Spec: spec-6.5-cluster-aware-backup-restore-pitr.md + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "cluster/cluster_backup.h" +#include "port/pg_crc32c.h" + +void +cluster_backup_manifest_init(ClusterBackupManifest *manifest, const char *backup_id) +{ + if (manifest == NULL) + return; + + memset(manifest, 0, sizeof(*manifest)); + manifest->magic = CLUSTER_BACKUP_MANIFEST_MAGIC; + manifest->version = CLUSTER_BACKUP_MANIFEST_VERSION; + if (backup_id != NULL) + strlcpy(manifest->backup_id, backup_id, sizeof(manifest->backup_id)); +} + +bool +cluster_backup_manifest_set_thread(ClusterBackupManifest *manifest, + int thread_index, + const ClusterBackupManifestThread *thread) +{ + if (manifest == NULL || thread == NULL) + return false; + if (thread_index < 0 || thread_index >= CLUSTER_MAX_NODES) + return false; + if (thread->thread_id == 0 || thread->thread_id > CLUSTER_MAX_NODES) + return false; + + if (!manifest->threads[thread_index].present) + manifest->thread_count++; + manifest->threads[thread_index] = *thread; + manifest->threads[thread_index].present = true; + return true; +} + +uint32 
+cluster_backup_manifest_compute_crc(const ClusterBackupManifest *manifest) +{ + ClusterBackupManifest copy; + pg_crc32c crc; + + if (manifest == NULL) + return 0; + + copy = *manifest; + copy.manifest_crc = 0; + INIT_CRC32C(crc); + COMP_CRC32C(crc, &copy, sizeof(copy)); + FIN_CRC32C(crc); + return crc; +} + +void +cluster_backup_manifest_seal(ClusterBackupManifest *manifest) +{ + if (manifest == NULL) + return; + manifest->manifest_crc = cluster_backup_manifest_compute_crc(manifest); +} + +ClusterBackupManifestReason +cluster_backup_manifest_validate(const ClusterBackupManifest *manifest) +{ + int i; + int present_count = 0; + + if (manifest == NULL) + return CLUSTER_BACKUP_MANIFEST_NULL; + if (manifest->magic != CLUSTER_BACKUP_MANIFEST_MAGIC) + return CLUSTER_BACKUP_MANIFEST_BAD_MAGIC; + if (manifest->version != CLUSTER_BACKUP_MANIFEST_VERSION) + return CLUSTER_BACKUP_MANIFEST_BAD_VERSION; + if (manifest->node_count == 0 || manifest->node_count > CLUSTER_MAX_NODES || + manifest->thread_count == 0 || manifest->thread_count > CLUSTER_MAX_NODES) + return CLUSTER_BACKUP_MANIFEST_BAD_COUNTS; + if (!manifest->control_included) + return CLUSTER_BACKUP_MANIFEST_MISSING_CONTROL; + if (!SCN_VALID(manifest->consistent_scn) || !SCN_VALID(manifest->scn_durable_peak) || + scn_time_cmp(manifest->scn_durable_peak, manifest->consistent_scn) < 0) + return CLUSTER_BACKUP_MANIFEST_BAD_SCN_PEAK; + + for (i = 0; i < CLUSTER_MAX_NODES; i++) { + const ClusterBackupManifestThread *thread = &manifest->threads[i]; + + if (!thread->present) + continue; + + present_count++; + if (thread->thread_id == 0 || thread->thread_id > CLUSTER_MAX_NODES) + return CLUSTER_BACKUP_MANIFEST_MISSING_THREAD; + if (thread->start_redo_lsn == InvalidXLogRecPtr || + thread->checkpoint_lsn == InvalidXLogRecPtr || + thread->stop_cut_lsn == InvalidXLogRecPtr || + thread->stop_cut_lsn < thread->start_redo_lsn) + return CLUSTER_BACKUP_MANIFEST_BAD_LSN_RANGE; + if (!thread->wal_included) + return
CLUSTER_BACKUP_MANIFEST_MISSING_WAL; + if (!thread->undo_included) + return CLUSTER_BACKUP_MANIFEST_MISSING_UNDO; + if (!thread->tt_included) + return CLUSTER_BACKUP_MANIFEST_MISSING_TT; + } + + if (present_count != (int)manifest->thread_count) + return CLUSTER_BACKUP_MANIFEST_MISSING_THREAD; + if (manifest->manifest_crc != cluster_backup_manifest_compute_crc(manifest)) + return CLUSTER_BACKUP_MANIFEST_BAD_CRC; + + return CLUSTER_BACKUP_MANIFEST_OK; +} + +const char * +cluster_backup_manifest_reason_name(ClusterBackupManifestReason reason) +{ + switch (reason) { + case CLUSTER_BACKUP_MANIFEST_OK: + return "ok"; + case CLUSTER_BACKUP_MANIFEST_NULL: + return "null"; + case CLUSTER_BACKUP_MANIFEST_BAD_MAGIC: + return "bad_magic"; + case CLUSTER_BACKUP_MANIFEST_BAD_VERSION: + return "bad_version"; + case CLUSTER_BACKUP_MANIFEST_BAD_COUNTS: + return "bad_counts"; + case CLUSTER_BACKUP_MANIFEST_MISSING_THREAD: + return "missing_thread"; + case CLUSTER_BACKUP_MANIFEST_BAD_LSN_RANGE: + return "bad_lsn_range"; + case CLUSTER_BACKUP_MANIFEST_MISSING_WAL: + return "missing_wal"; + case CLUSTER_BACKUP_MANIFEST_MISSING_UNDO: + return "missing_undo"; + case CLUSTER_BACKUP_MANIFEST_MISSING_TT: + return "missing_tt"; + case CLUSTER_BACKUP_MANIFEST_MISSING_CONTROL: + return "missing_control"; + case CLUSTER_BACKUP_MANIFEST_BAD_SCN_PEAK: + return "bad_scn_peak"; + case CLUSTER_BACKUP_MANIFEST_BAD_CRC: + return "bad_crc"; + } + return "unknown"; +} + +ClusterRestorePointCutReason +cluster_restore_point_build(ClusterRestorePoint *out, + const char *name, + const SCN *thread_scn, + const XLogRecPtr *thread_lsn, + int max_threads, + bool pending_commits_empty, + bool commit_fence_held, + uint32 incarnation) +{ + SCN max_scn = InvalidScn; + int i; + int nthreads = 0; + + if (!pending_commits_empty) + return CLUSTER_RESTORE_POINT_CUT_PENDING_COMMITS; + if (!commit_fence_held) + return CLUSTER_RESTORE_POINT_CUT_NO_FENCE; + if (out == NULL || thread_scn == NULL || thread_lsn == NULL || + 
max_threads <= 0 || max_threads > CLUSTER_MAX_NODES) + return CLUSTER_RESTORE_POINT_CUT_NO_THREADS; + + memset(out, 0, sizeof(*out)); + out->present = true; + out->incarnation = incarnation; + if (name != NULL) + strlcpy(out->name, name, sizeof(out->name)); + + for (i = 0; i < max_threads; i++) { + if (!SCN_VALID(thread_scn[i]) && thread_lsn[i] == InvalidXLogRecPtr) + continue; + if (!SCN_VALID(thread_scn[i]) || thread_lsn[i] == InvalidXLogRecPtr) + return CLUSTER_RESTORE_POINT_CUT_BAD_THREAD; + + out->cut_lsn[i] = thread_lsn[i]; + if (!SCN_VALID(max_scn) || scn_time_cmp(thread_scn[i], max_scn) > 0) + max_scn = thread_scn[i]; + nthreads++; + } + + if (nthreads == 0) + return CLUSTER_RESTORE_POINT_CUT_NO_THREADS; + + out->cut_scn = max_scn; + out->thread_count = (uint32)nthreads; + return CLUSTER_RESTORE_POINT_CUT_OK; +} + +const char * +cluster_restore_point_cut_reason_name(ClusterRestorePointCutReason reason) +{ + switch (reason) { + case CLUSTER_RESTORE_POINT_CUT_OK: + return "ok"; + case CLUSTER_RESTORE_POINT_CUT_PENDING_COMMITS: + return "pending_commits"; + case CLUSTER_RESTORE_POINT_CUT_NO_FENCE: + return "no_fence"; + case CLUSTER_RESTORE_POINT_CUT_NO_THREADS: + return "no_threads"; + case CLUSTER_RESTORE_POINT_CUT_BAD_THREAD: + return "bad_thread"; + } + return "unknown"; +} + +ClusterPitrTargetReason +cluster_pitr_resolve_scn(const ClusterRestorePoint *points, + int npoints, + SCN requested_scn, + SCN backup_consistent_scn, + ClusterRestorePoint *out) +{ + const ClusterRestorePoint *best = NULL; + int i; + + if (!SCN_VALID(requested_scn) || !SCN_VALID(backup_consistent_scn) || + scn_time_cmp(requested_scn, backup_consistent_scn) < 0) + return CLUSTER_PITR_TARGET_BEFORE_BACKUP; + if (points == NULL || npoints <= 0) + return CLUSTER_PITR_TARGET_NO_RESTORE_POINT; + + for (i = 0; i < npoints; i++) { + const ClusterRestorePoint *point = &points[i]; + + if (!point->present || !SCN_VALID(point->cut_scn)) + continue; + if (scn_time_cmp(point->cut_scn, 
backup_consistent_scn) < 0) + continue; + if (scn_time_cmp(point->cut_scn, requested_scn) > 0) + continue; + if (point->thread_count == 0 || point->thread_count > CLUSTER_MAX_NODES) + return CLUSTER_PITR_TARGET_MISSING_THREAD; + if (best == NULL || scn_time_cmp(point->cut_scn, best->cut_scn) > 0) + best = point; + } + + if (best == NULL) + return CLUSTER_PITR_TARGET_NO_RESTORE_POINT; + + if (out != NULL) + *out = *best; + return CLUSTER_PITR_TARGET_OK; +} + +const char * +cluster_pitr_target_reason_name(ClusterPitrTargetReason reason) +{ + switch (reason) { + case CLUSTER_PITR_TARGET_OK: + return "ok"; + case CLUSTER_PITR_TARGET_NO_RESTORE_POINT: + return "no_restore_point"; + case CLUSTER_PITR_TARGET_BEFORE_BACKUP: + return "before_backup"; + case CLUSTER_PITR_TARGET_MISSING_THREAD: + return "missing_thread"; + case CLUSTER_PITR_TARGET_UNARCHIVED_WAL: + return "unarchived_wal"; + } + return "unknown"; +} + +ClusterRestoreCompatibilityReason +cluster_backup_manifest_compatible(const ClusterBackupManifest *manifest, + uint32 current_catversion, + uint32 current_storage_id, + uint32 expected_node_count) +{ + if (cluster_backup_manifest_validate(manifest) != CLUSTER_BACKUP_MANIFEST_OK) + return CLUSTER_RESTORE_COMPAT_MANIFEST; + if (manifest->catversion != current_catversion) + return CLUSTER_RESTORE_COMPAT_CATVERSION; + if (manifest->backend_storage_id != current_storage_id) + return CLUSTER_RESTORE_COMPAT_STORAGE; + if (manifest->node_count != expected_node_count) + return CLUSTER_RESTORE_COMPAT_TOPOLOGY; + return CLUSTER_RESTORE_COMPAT_OK; +} + +const char * +cluster_restore_compat_reason_name(ClusterRestoreCompatibilityReason reason) +{ + switch (reason) { + case CLUSTER_RESTORE_COMPAT_OK: + return "ok"; + case CLUSTER_RESTORE_COMPAT_CATVERSION: + return "catversion"; + case CLUSTER_RESTORE_COMPAT_STORAGE: + return "storage"; + case CLUSTER_RESTORE_COMPAT_TOPOLOGY: + return "topology"; + case CLUSTER_RESTORE_COMPAT_MANIFEST: + return "manifest"; + } + return 
"unknown"; +} diff --git a/src/backend/cluster/cluster_guc.c b/src/backend/cluster/cluster_guc.c index fca70c883e..e63afd288f 100644 --- a/src/backend/cluster/cluster_guc.c +++ b/src/backend/cluster/cluster_guc.c @@ -81,6 +81,16 @@ int cluster_recovery_workers_max = 4; /* spec-4.5 D9: merged k-way recovery (default OFF, Q8) + wait timeout. */ bool cluster_merged_recovery = false; int cluster_recovery_merge_wait_timeout = 10000; +/* spec-6.5: cluster-aware backup / restore / PITR target knobs. */ +char *cluster_recovery_target_scn = NULL; +char *cluster_recovery_target_cluster_time = NULL; +char *cluster_recovery_target_name = NULL; +int cluster_recovery_target_action = CLUSTER_RECOVERY_TARGET_ACTION_PAUSE; +bool cluster_enable_pitr_restore_points = false; +int cluster_pitr_restore_point_interval_ms = 0; +int cluster_backup_wal_retention = 0; +int cluster_backup_parallel_channels = 1; +int cluster_backup_manifest_checksums = CLUSTER_BACKUP_MANIFEST_CHECKSUM_CRC32C; int cluster_shared_storage_backend = CLUSTER_SHARED_FS_BACKEND_STUB; /* spec-4.5a D2: shared data root for the cluster_fs (shared_fs) backend. 
*/ char *cluster_shared_data_dir = NULL; @@ -817,6 +827,16 @@ static const struct config_enum_entry cluster_shared_storage_backend_options[] { "multi_attach", CLUSTER_SHARED_FS_BACKEND_MULTI_ATTACH, false }, { NULL, 0, false } }; +static const struct config_enum_entry cluster_recovery_target_action_options[] + = { { "pause", CLUSTER_RECOVERY_TARGET_ACTION_PAUSE, false }, + { "promote", CLUSTER_RECOVERY_TARGET_ACTION_PROMOTE, false }, + { "shutdown", CLUSTER_RECOVERY_TARGET_ACTION_SHUTDOWN, false }, + { NULL, 0, false } }; + +static const struct config_enum_entry cluster_backup_manifest_checksum_options[] + = { { "crc32c", CLUSTER_BACKUP_MANIFEST_CHECKSUM_CRC32C, false }, + { NULL, 0, false } }; + /* * check_cluster_shared_data_dir -- GUC check_hook for @@ -1145,6 +1165,77 @@ cluster_init_guc(void) &cluster_recovery_merge_wait_timeout, 10000, 0, 600000, PGC_POSTMASTER, GUC_UNIT_MS, NULL, NULL, NULL); + DefineCustomStringVariable( + "cluster.recovery_target_scn", + gettext_noop("Cluster PITR target SCN."), + gettext_noop("When set, cluster PITR status resolves the requested SCN against " + "cluster restore points and refuses unreachable targets."), + &cluster_recovery_target_scn, "", PGC_POSTMASTER, 0, NULL, NULL, NULL); + + DefineCustomStringVariable( + "cluster.recovery_target_cluster_time", + gettext_noop("Cluster PITR target timestamp."), + gettext_noop("Reserved target timestamp for cluster-aware recovery planning. " + "Spec-6.5 exposes the configuration and status surface; the " + "startup recovery action remains fail-closed until the " + "coordinator can prove all WAL threads are present."), + &cluster_recovery_target_cluster_time, "", PGC_POSTMASTER, 0, NULL, NULL, NULL); + + DefineCustomStringVariable( + "cluster.recovery_target_name", + gettext_noop("Cluster PITR named restore point target."), + gettext_noop("Reserved named cluster restore-point target. 
The status view " + "reports restore points produced by pg_cluster_create_restore_point."), + &cluster_recovery_target_name, "", PGC_POSTMASTER, 0, NULL, NULL, NULL); + + DefineCustomEnumVariable( + "cluster.recovery_target_action", + gettext_noop("Action to take when a cluster PITR target is reached."), + gettext_noop("Accepted values are pause, promote, and shutdown. The setting is " + "advertised with the 6.5 target surface; startup recovery remains " + "fail-closed until every required WAL thread is proven present."), + &cluster_recovery_target_action, CLUSTER_RECOVERY_TARGET_ACTION_PAUSE, + cluster_recovery_target_action_options, PGC_POSTMASTER, 0, NULL, NULL, NULL); + + DefineCustomBoolVariable( + "cluster.enable_pitr_restore_points", + gettext_noop("Enable automatic cluster restore point creation."), + gettext_noop("Manual pg_cluster_create_restore_point is available regardless of " + "this setting. Automatic background creation is reserved until a " + "cluster-wide cut coordinator is present."), + &cluster_enable_pitr_restore_points, false, PGC_SIGHUP, 0, NULL, NULL, NULL); + + DefineCustomIntVariable( + "cluster.pitr_restore_point_interval_ms", + gettext_noop("Interval for automatic cluster PITR restore points."), + gettext_noop("Zero disables automatic restore point scheduling."), + &cluster_pitr_restore_point_interval_ms, 0, 0, 86400000, PGC_SIGHUP, GUC_UNIT_MS, NULL, + NULL, NULL); + + DefineCustomIntVariable( + "cluster.backup_wal_retention", + gettext_noop("Cluster backup WAL retention hint in megabytes."), + gettext_noop("The 6.5 manifest/status surface records the setting; actual " + "multi-thread retention enforcement is deferred to the backup-set " + "writer."), + &cluster_backup_wal_retention, 0, 0, INT_MAX, PGC_SIGHUP, GUC_UNIT_MB, NULL, NULL, + NULL); + + DefineCustomIntVariable( + "cluster.backup_parallel_channels", + gettext_noop("Maximum cluster backup copy channels."), + gettext_noop("Reserved capacity knob for the cluster backup-set 
writer."), + &cluster_backup_parallel_channels, 1, 1, CLUSTER_MAX_NODES, PGC_SIGHUP, 0, NULL, NULL, + NULL); + + DefineCustomEnumVariable( + "cluster.backup_manifest_checksums", + gettext_noop("Checksum mode for cluster backup manifests."), + gettext_noop("crc32c protects the in-memory and SQL-visible manifest substrate; " + "6.5 does not provide an unchecked manifest mode."), + &cluster_backup_manifest_checksums, CLUSTER_BACKUP_MANIFEST_CHECKSUM_CRC32C, + cluster_backup_manifest_checksum_options, PGC_SIGHUP, 0, NULL, NULL, NULL); + /* * cluster.injection_points -- comma-separated list of injection point * names to auto-arm at startup with fault_type=WARNING (counter-only + diff --git a/src/backend/cluster/cluster_shmem.c b/src/backend/cluster/cluster_shmem.c index 524d8d3ac2..42d2386b37 100644 --- a/src/backend/cluster/cluster_shmem.c +++ b/src/backend/cluster/cluster_shmem.c @@ -64,6 +64,7 @@ #include "cluster/cluster_diag.h" /* cluster_diag_shmem_register (1.13 Sprint A) */ #include "cluster/cluster_clean_leave.h" /* cluster_clean_leave_shmem_register (spec-5.13 D2) */ #include "cluster/cluster_node_remove.h" /* cluster_node_remove_shmem_register (spec-5.18 D2) */ +#include "cluster/cluster_backup.h" /* cluster_backup_shmem_register (spec-6.5) */ #include "cluster/cluster_inject.h" /* CLUSTER_INJECTION_POINT */ #include "cluster/cluster_lck.h" /* cluster_lck_shmem_register (1.12 Sprint A) */ #include "cluster/cluster_epoch.h" /* cluster_epoch_shmem_register (2.4) */ @@ -596,6 +597,10 @@ cluster_init_shmem_module(void) if (cluster_shmem_lookup_region("pgrac cluster node_remove") == NULL) cluster_node_remove_shmem_register(); + /* spec-6.5: register cluster backup / restore / PITR state. */ + if (cluster_shmem_lookup_region("pgrac cluster backup") == NULL) + cluster_backup_shmem_register(); + /* spec-1.14 Sprint A D7: register cluster_stats shmem region. 
*/ if (cluster_shmem_lookup_region("pgrac cluster stats") == NULL) cluster_stats_shmem_register(); diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index c2113ba69f..051def95f8 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -204,6 +204,8 @@ static const char *const BuiltinTrancheNames[] = { "ClusterCleanLeave", /* PGRAC LWTRANCHE_CLUSTER_NODE_REMOVE (spec-5.18): */ "ClusterNodeRemove", + /* PGRAC LWTRANCHE_CLUSTER_BACKUP (spec-6.5): */ + "ClusterBackup", /* PGRAC LWTRANCHE_CLUSTER_STATS: */ "ClusterStats", /* PGRAC LWTRANCHE_CLUSTER_SCN: */ diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt index 07b3d6027f..35aaea8581 100644 --- a/src/backend/utils/errcodes.txt +++ b/src/backend/utils/errcodes.txt @@ -827,6 +827,16 @@ Section: Class 53 - Insufficient Resources (pgrac extension) # relfilenode (stale writeback, 8.A). 53RAA E ERRCODE_CLUSTER_OBJECT_FLUSH_UNAVAILABLE cluster_object_flush_unavailable +# spec-6.5: cluster-aware backup / restore / PITR correctness band. +# These states are raised before any path can silently produce a partial +# cluster backup, choose an unreachable PITR target, or start from an +# incomplete / incompatible restore substrate. 
+53RAB E ERRCODE_CLUSTER_BACKUP_IN_PROGRESS cluster_backup_in_progress +53RAC E ERRCODE_CLUSTER_PITR_TARGET_UNREACHABLE cluster_pitr_target_unreachable +53RAD E ERRCODE_CLUSTER_BACKUP_INCOMPLETE cluster_backup_incomplete +53RAE E ERRCODE_CLUSTER_RESTORE_INCOMPATIBLE cluster_restore_incompatible +53RAF E ERRCODE_CLUSTER_RESTORE_POINT_DRAIN_TIMEOUT cluster_restore_point_drain_timeout + Section: Class 54 - Program Limit Exceeded # this is for wired-in limits, not resource exhaustion problems (class borrowed from DB2) diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 2101479e99..4ff9ce4792 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -713,7 +713,11 @@ * backward replay and reconstructs commit_scn=InvalidScn for v3. No catalog * surface change; the bump fences an old binary from replaying v3-format WAL * (unknown format_version -> redo PANIC). Bump 202606330 -> 202606340. */ -#define CATALOG_VERSION_NO 202606340 +/* spec-6.5: cluster-aware backup / restore / PITR catalog surface — + * pg_cluster_backup_start/stop, pg_cluster_create_restore_point, 4 state SRFs, + * 4 system views, LWTRANCHE_CLUSTER_BACKUP, and 53RAB..53RAF SQLSTATEs. + * Bump 202606340 -> 202606350. */ +#define CATALOG_VERSION_NO 202606350 /* spec-5.13 (2026-06-27): clean-leave catalog surface — cluster_get_clean_leave_state * SRF (oid 8960) + pg_cluster_clean_leave_state view + pg_cluster_clean_leave_request diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 5653c11367..185d352c20 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12540,6 +12540,67 @@ prorettype => 'text', proargtypes => 'int4', prosrc => 'pg_cluster_remove_node' }, +# spec-6.5 -- cluster-aware backup / restore / PITR SQL surface. 
+{ oid => '8965', descr => 'start a cluster-aware physical backup', + proname => 'pg_cluster_backup_start', provolatile => 'v', + prorettype => 'record', proargtypes => 'text bool', + proallargtypes => '{text,bool,text,pg_lsn,pg_lsn,int4}', + proargmodes => '{i,i,o,o,o,o}', + proargnames => '{label,fast,backup_id,start_redo_lsn,checkpoint_lsn,start_tli}', + prosrc => 'pg_cluster_backup_start' }, + +{ oid => '8966', descr => 'stop a cluster-aware physical backup', + proname => 'pg_cluster_backup_stop', provolatile => 'v', + prorettype => 'record', proargtypes => 'bool', + proallargtypes => '{bool,int8,pg_lsn,int8,text}', + proargmodes => '{i,o,o,o,o}', + proargnames => '{waitforarchive,consistent_scn,stop_cut_lsn,manifest_crc,backup_label}', + prosrc => 'pg_cluster_backup_stop' }, + +{ oid => '8967', descr => 'create a cluster-aware restore point', + proname => 'pg_cluster_create_restore_point', provolatile => 'v', + prorettype => 'record', proargtypes => 'text', + proallargtypes => '{text,text,int8,pg_lsn}', + proargmodes => '{i,o,o,o}', + proargnames => '{name,restore_point_name,cut_scn,cut_lsn}', + prosrc => 'pg_cluster_create_restore_point' }, + +{ oid => '8968', descr => 'show current cluster backup state', + proname => 'cluster_get_backup_state', prorows => '1', + proretset => 't', provolatile => 'v', proparallel => 'r', + prorettype => 'record', proargtypes => '', + proallargtypes => '{bool,text,int4,pg_lsn,pg_lsn,pg_lsn,int8,int8,timestamptz,timestamptz,int4,int4,bool,int4}', + proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{in_progress,backup_id,coordinator_node_id,start_redo_lsn,checkpoint_lsn,stop_cut_lsn,consistent_scn,manifest_crc,started_at,stopped_at,backup_parallel_channels,backup_wal_retention,restore_points_enabled,restore_point_interval_ms}', + prosrc => 'cluster_get_backup_state' }, + +{ oid => '8969', descr => 'show latest cluster backup manifest summary', + proname => 'cluster_get_backup_history', prorows => '16', + proretset 
=> 't', provolatile => 'v', proparallel => 'r', + prorettype => 'record', proargtypes => '', + proallargtypes => '{text,int8,int8,int4,int8,int4,int4,int4,int8}', + proargmodes => '{o,o,o,o,o,o,o,o,o}', + proargnames => '{backup_id,consistent_scn,scn_durable_peak,timeline,catversion,storage_id,node_count,thread_count,manifest_crc}', + prosrc => 'cluster_get_backup_history' }, + +{ oid => '8970', descr => 'show cluster restore points', + proname => 'cluster_get_restore_points', prorows => '16', + proretset => 't', provolatile => 'v', proparallel => 'r', + prorettype => 'record', proargtypes => '', + proallargtypes => '{text,int8,int4,int4,timestamptz}', + proargmodes => '{o,o,o,o,o}', + proargnames => '{restore_point_name,cut_scn,thread_count,incarnation,created_at}', + prosrc => 'cluster_get_restore_points' }, + +{ oid => '8971', descr => 'show cluster PITR target resolution status', + proname => 'cluster_get_pitr_status', prorows => '1', + proretset => 't', provolatile => 'v', proparallel => 'r', + prorettype => 'record', proargtypes => '', + proallargtypes => '{text,text,bool,text,int8,text}', + proargmodes => '{o,o,o,o,o,o}', + proargnames => '{target_type,target_action,reachable,reason,resolved_scn,restore_point_name}', + prosrc => 'cluster_get_pitr_status' }, + # spec-3.2 D5b (2026-05-22) -- test-only visibility fork injection # functions. These are SQL-visible so TAP can drive a real # HeapTupleSatisfiesMVCC cluster-path miss and assert 53R97. Production diff --git a/src/include/cluster/cluster_backup.h b/src/include/cluster/cluster_backup.h new file mode 100644 index 0000000000..10b3d7a5e2 --- /dev/null +++ b/src/include/cluster/cluster_backup.h @@ -0,0 +1,176 @@ +/*------------------------------------------------------------------------- + * + * cluster_backup.h + * Cluster-aware backup / restore / PITR substrate. 
+ * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2026, pgrac contributors + * + * Author: SqlRush + * + * IDENTIFICATION + * src/include/cluster/cluster_backup.h + * + * NOTES + * This is a pgrac-original file (no derivation from PostgreSQL). + * Spec: spec-6.5-cluster-aware-backup-restore-pitr.md + * + *------------------------------------------------------------------------- + */ +#ifndef CLUSTER_BACKUP_H +#define CLUSTER_BACKUP_H + +#include "access/xlogdefs.h" +#include "c.h" +#include "cluster/cluster_conf.h" /* CLUSTER_MAX_NODES */ +#include "cluster/cluster_scn.h" +#include "datatype/timestamp.h" + +#define CLUSTER_BACKUP_ID_MAX 64 +#define CLUSTER_RESTORE_POINT_NAME_MAX 64 +#define CLUSTER_BACKUP_MANIFEST_MAGIC 0x5047424BU /* "PGBK" */ +#define CLUSTER_BACKUP_MANIFEST_VERSION 1 +#define CLUSTER_BACKUP_RESTORE_POINT_MAX 16 + +typedef enum ClusterBackupManifestReason { + CLUSTER_BACKUP_MANIFEST_OK = 0, + CLUSTER_BACKUP_MANIFEST_NULL, + CLUSTER_BACKUP_MANIFEST_BAD_MAGIC, + CLUSTER_BACKUP_MANIFEST_BAD_VERSION, + CLUSTER_BACKUP_MANIFEST_BAD_COUNTS, + CLUSTER_BACKUP_MANIFEST_MISSING_THREAD, + CLUSTER_BACKUP_MANIFEST_BAD_LSN_RANGE, + CLUSTER_BACKUP_MANIFEST_MISSING_WAL, + CLUSTER_BACKUP_MANIFEST_MISSING_UNDO, + CLUSTER_BACKUP_MANIFEST_MISSING_TT, + CLUSTER_BACKUP_MANIFEST_MISSING_CONTROL, + CLUSTER_BACKUP_MANIFEST_BAD_SCN_PEAK, + CLUSTER_BACKUP_MANIFEST_BAD_CRC +} ClusterBackupManifestReason; + +typedef enum ClusterPitrTargetReason { + CLUSTER_PITR_TARGET_OK = 0, + CLUSTER_PITR_TARGET_NO_RESTORE_POINT, + CLUSTER_PITR_TARGET_BEFORE_BACKUP, + CLUSTER_PITR_TARGET_MISSING_THREAD, + CLUSTER_PITR_TARGET_UNARCHIVED_WAL +} ClusterPitrTargetReason; + +typedef enum ClusterRestoreCompatibilityReason { + CLUSTER_RESTORE_COMPAT_OK = 0, + CLUSTER_RESTORE_COMPAT_CATVERSION, + CLUSTER_RESTORE_COMPAT_STORAGE, + CLUSTER_RESTORE_COMPAT_TOPOLOGY, + 
CLUSTER_RESTORE_COMPAT_MANIFEST +} ClusterRestoreCompatibilityReason; + +typedef enum ClusterRestorePointCutReason { + CLUSTER_RESTORE_POINT_CUT_OK = 0, + CLUSTER_RESTORE_POINT_CUT_PENDING_COMMITS, + CLUSTER_RESTORE_POINT_CUT_NO_FENCE, + CLUSTER_RESTORE_POINT_CUT_NO_THREADS, + CLUSTER_RESTORE_POINT_CUT_BAD_THREAD +} ClusterRestorePointCutReason; + +typedef struct ClusterBackupManifestThread { + bool present; + bool wal_included; + bool undo_included; + bool tt_included; + uint32 thread_id; + int32 node_id; + XLogRecPtr start_redo_lsn; + XLogRecPtr checkpoint_lsn; + TimeLineID start_tli; + XLogRecPtr stop_cut_lsn; +} ClusterBackupManifestThread; + +typedef struct ClusterBackupManifest { + uint32 magic; + uint32 version; + char backup_id[CLUSTER_BACKUP_ID_MAX]; + SCN consistent_scn; + SCN scn_durable_peak; + TimeLineID timeline; + uint32 catversion; + uint32 incarnation; + uint32 backend_storage_id; + uint32 node_count; + uint32 thread_count; + bool control_included; + bool voting_included; + ClusterBackupManifestThread threads[CLUSTER_MAX_NODES]; + uint32 manifest_crc; +} ClusterBackupManifest; + +typedef struct ClusterRestorePoint { + bool present; + char name[CLUSTER_RESTORE_POINT_NAME_MAX]; + SCN cut_scn; + XLogRecPtr cut_lsn[CLUSTER_MAX_NODES]; + uint32 thread_count; + uint32 incarnation; + TimestampTz created_at; +} ClusterRestorePoint; + +typedef struct ClusterBackupStatus { + bool in_progress; + char backup_id[CLUSTER_BACKUP_ID_MAX]; + int32 coordinator_node_id; + XLogRecPtr start_redo_lsn; + XLogRecPtr checkpoint_lsn; + TimeLineID start_tli; + XLogRecPtr stop_cut_lsn; + SCN consistent_scn; + uint32 manifest_crc; + TimestampTz started_at; + TimestampTz stopped_at; +} ClusterBackupStatus; + +extern void cluster_backup_manifest_init(ClusterBackupManifest *manifest, const char *backup_id); +extern bool cluster_backup_manifest_set_thread(ClusterBackupManifest *manifest, + int thread_index, + const ClusterBackupManifestThread *thread); +extern uint32 
cluster_backup_manifest_compute_crc(const ClusterBackupManifest *manifest); +extern void cluster_backup_manifest_seal(ClusterBackupManifest *manifest); +extern ClusterBackupManifestReason +cluster_backup_manifest_validate(const ClusterBackupManifest *manifest); +extern const char *cluster_backup_manifest_reason_name(ClusterBackupManifestReason reason); + +extern ClusterRestorePointCutReason +cluster_restore_point_build(ClusterRestorePoint *out, + const char *name, + const SCN *thread_scn, + const XLogRecPtr *thread_lsn, + int max_threads, + bool pending_commits_empty, + bool commit_fence_held, + uint32 incarnation); +extern const char *cluster_restore_point_cut_reason_name(ClusterRestorePointCutReason reason); + +extern ClusterPitrTargetReason +cluster_pitr_resolve_scn(const ClusterRestorePoint *points, + int npoints, + SCN requested_scn, + SCN backup_consistent_scn, + ClusterRestorePoint *out); +extern const char *cluster_pitr_target_reason_name(ClusterPitrTargetReason reason); + +extern ClusterRestoreCompatibilityReason +cluster_backup_manifest_compatible(const ClusterBackupManifest *manifest, + uint32 current_catversion, + uint32 current_storage_id, + uint32 expected_node_count); +extern const char *cluster_restore_compat_reason_name(ClusterRestoreCompatibilityReason reason); + +#ifndef FRONTEND +extern Size cluster_backup_shmem_size(void); +extern void cluster_backup_shmem_init(void); +extern void cluster_backup_shmem_register(void); +extern void cluster_backup_get_status(ClusterBackupStatus *out); +extern bool cluster_backup_get_last_manifest(ClusterBackupManifest *out); +extern int cluster_backup_get_restore_points(ClusterRestorePoint *out, int max_points); +#endif + +#endif /* CLUSTER_BACKUP_H */ diff --git a/src/include/cluster/cluster_guc.h b/src/include/cluster/cluster_guc.h index 03ea74a49e..a648ce1d90 100644 --- a/src/include/cluster/cluster_guc.h +++ b/src/include/cluster/cluster_guc.h @@ -497,6 +497,24 @@ extern int cluster_boc_sweep_interval_ms; */ 
extern bool cluster_enabled; +/* spec-6.5: cluster-aware backup / restore / PITR target configuration. */ +#define CLUSTER_RECOVERY_TARGET_ACTION_PAUSE 0 +#define CLUSTER_RECOVERY_TARGET_ACTION_PROMOTE 1 +#define CLUSTER_RECOVERY_TARGET_ACTION_SHUTDOWN 2 + +#define CLUSTER_BACKUP_MANIFEST_CHECKSUM_OFF 0 +#define CLUSTER_BACKUP_MANIFEST_CHECKSUM_CRC32C 1 + +extern char *cluster_recovery_target_scn; +extern char *cluster_recovery_target_cluster_time; +extern char *cluster_recovery_target_name; +extern int cluster_recovery_target_action; +extern bool cluster_enable_pitr_restore_points; +extern int cluster_pitr_restore_point_interval_ms; +extern int cluster_backup_wal_retention; +extern int cluster_backup_parallel_channels; +extern int cluster_backup_manifest_checksums; + /* spec-3.12 D5: own-instance undo/TT-slot retention horizon gate (default on). */ extern bool cluster_undo_retention_horizon_enabled; diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index d10d6c5804..54f9d65c14 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -249,6 +249,8 @@ typedef enum BuiltinTrancheIds { LWTRANCHE_CLUSTER_CLEAN_LEAVE, /* spec-5.18: guards the permanent-removal ClusterNodeRemoveState shmem block. */ LWTRANCHE_CLUSTER_NODE_REMOVE, + /* spec-6.5: guards the cluster backup / restore / PITR shmem block. */ + LWTRANCHE_CLUSTER_BACKUP, /* * PGRAC (stage 1.14 Sprint A): dedicated tranche for * ClusterStatsSharedState lwlock — same pattern as LMON / LCK / DIAG. 
From f81b5390c02baa9a0451e7f2e8b381c8533ef0dc Mon Sep 17 00:00:00 2001 From: SqlRush Date: Tue, 30 Jun 2026 22:40:36 +0800 Subject: [PATCH 2/9] test(cluster): cover backup PITR surface --- docs/reference/system-views.md | 106 ++++++- docs/user-guide/configuration.md | 25 +- src/test/cluster_tap/t/006_errcodes.pl | 5 +- src/test/cluster_tap/t/007_guc.pl | 23 ++ src/test/cluster_tap/t/020_shmem_registry.pl | 6 +- .../cluster_tap/t/332_cluster_backup_pitr.pl | 142 +++++++++ src/test/cluster_unit/Makefile | 13 +- src/test/cluster_unit/test_cluster_backup.c | 297 ++++++++++++++++++ src/test/cluster_unit/test_cluster_errcodes.c | 28 +- src/test/cluster_unit/test_cluster_shmem.c | 6 + 10 files changed, 635 insertions(+), 16 deletions(-) create mode 100644 src/test/cluster_tap/t/332_cluster_backup_pitr.pl create mode 100644 src/test/cluster_unit/test_cluster_backup.c diff --git a/docs/reference/system-views.md b/docs/reference/system-views.md index 912225bb8a..a8799ac9ce 100644 --- a/docs/reference/system-views.md +++ b/docs/reference/system-views.md @@ -1,14 +1,114 @@ # System views -linkdb adds three cluster-aware system views to the standard -PostgreSQL catalog. All three are present in `--enable-cluster` -builds; in `--disable-cluster` builds they return zero rows. +linkdb adds cluster-aware system views to the standard PostgreSQL +catalog. These views are present in `--enable-cluster` builds; in +`--disable-cluster` builds the backing functions are unavailable or +return zero rows, depending on whether the function is read-only or +operator-facing. 
| View | Purpose | |---|---| | `pg_cluster_nodes` | Cluster topology (the parsed `pgrac.conf`) | | `pg_stat_cluster_wait_events` | Cluster-specific wait events on the local node | | `pg_stat_gcluster_wait_events` | Cluster wait events globally (cross-node placeholder) | +| `pg_stat_cluster_backup` | Current or most recent cluster backup state on the local node | +| `pg_cluster_backup_history` | Latest cluster backup manifest summary | +| `pg_cluster_restore_points` | Cluster restore points visible to PITR status | +| `pg_cluster_pitr_status` | Cluster PITR target reachability status | + +## Cluster Backup / PITR Views + +The cluster backup surface exposes the manifest and target-resolution +state used by `pg_cluster_backup_start`, `pg_cluster_backup_stop`, and +`pg_cluster_create_restore_point`. + +Current 6.5 scope is conservative: + +- A single-node cluster can start and stop a cluster-aware physical + backup. The backup label returned by `pg_cluster_backup_stop` + includes native PostgreSQL label content followed by `CLUSTER_*` + metadata lines. +- If the node has declared peers, the mutating backup/restore-point + functions fail closed with a cluster backup SQLSTATE rather than + silently producing a partial backup. +- The manifest records WAL thread, undo, transaction-table, SCN, and + control-file inclusion state for the proven local cut. A later + backup-set writer can extend the same contract to coordinated + multi-node copying without changing these view shapes. + +### `pg_stat_cluster_backup` + +One row describing the current or most recent cluster backup on this +node. + +| Column | Type | Description | +|---|---|---| +| `in_progress` | `bool` | True while this session has an active cluster backup. | +| `backup_id` | `text` | Backup label/id, or NULL before the first backup. | +| `coordinator_node_id` | `int4` | Local node id that started the backup. | +| `start_redo_lsn` | `pg_lsn` | Checkpoint redo LSN used as the backup start contract.
| +| `checkpoint_lsn` | `pg_lsn` | Checkpoint record LSN captured at backup start. | +| `stop_cut_lsn` | `pg_lsn` | WAL cut LSN captured at backup stop. | +| `consistent_scn` | `int8` | Cluster SCN selected for the backup cut. | +| `manifest_crc` | `int8` | CRC32C of the latest manifest image. | +| `started_at` | `timestamptz` | Local timestamp when the backup started. | +| `stopped_at` | `timestamptz` | Local timestamp when the backup stopped. | +| `backup_parallel_channels` | `int4` | Configured copy-channel capacity for the backup substrate. | +| `backup_wal_retention` | `int4` | Configured WAL retention hint, in MB. | +| `restore_points_enabled` | `bool` | Whether automatic PITR restore-point scheduling is enabled. | +| `restore_point_interval_ms` | `int4` | Automatic restore-point scheduling interval, in milliseconds. | + +### `pg_cluster_backup_history` + +Returns the latest cluster backup manifest summary retained in shared +memory. + +| Column | Type | Description | +|---|---|---| +| `backup_id` | `text` | Backup label/id. | +| `consistent_scn` | `int8` | SCN that defines the backup cut. | +| `scn_durable_peak` | `int8` | Highest durable SCN covered by the cut. | +| `timeline` | `int4` | WAL timeline recorded at backup stop. | +| `catversion` | `int8` | Catalog version used to reject incompatible restores. | +| `storage_id` | `int4` | Cluster shared-storage backend id. | +| `node_count` | `int4` | Number of nodes proven in the manifest. | +| `thread_count` | `int4` | Number of WAL threads proven in the manifest. | +| `manifest_crc` | `int8` | CRC32C of the manifest image. | + +### `pg_cluster_restore_points` + +Shows restore points created by the cluster-aware restore-point entry +point. + +| Column | Type | Description | +|---|---|---| +| `restore_point_name` | `text` | Restore point name. | +| `cut_scn` | `int8` | SCN selected for the restore point cut. | +| `thread_count` | `int4` | WAL threads covered by the cut. 
| +| `incarnation` | `int4` | Cluster incarnation recorded with the cut. | +| `created_at` | `timestamptz` | Local timestamp when the point was recorded. | + +### `pg_cluster_pitr_status` + +Resolves the configured cluster PITR target against known restore +points and the latest manifest. + +| Column | Type | Description | +|---|---|---| +| `target_type` | `text` | `latest` when no target is configured; otherwise `scn`, `name`, or `cluster_time`. | +| `target_action` | `text` | Configured PITR action: `pause`, `promote`, or `shutdown`. | +| `reachable` | `bool` | True if the configured target is reachable. | +| `reason` | `text` | `ok` or the fail-closed reason. | +| `resolved_scn` | `int8` | Restore-point SCN selected for the target, when reachable. | +| `restore_point_name` | `text` | Restore point used for the target, when reachable. | + +Execution of the mutating functions is revoked from PUBLIC and superuser-gated, so run them as a superuser: + +```sql +SELECT * FROM pg_cluster_backup_start('b1', true); +SELECT * FROM pg_cluster_backup_stop(true); +SELECT * FROM pg_cluster_create_restore_point('rp1'); +``` ## pg_cluster_nodes diff --git a/docs/user-guide/configuration.md b/docs/user-guide/configuration.md index b01ce1fac5..26fac530fd 100644 --- a/docs/user-guide/configuration.md +++ b/docs/user-guide/configuration.md @@ -3,7 +3,7 @@ linkdb uses two configuration mechanisms layered on top of standard PostgreSQL configuration: -1. **`postgresql.conf`** — standard PG config plus the `cluster.*` +1. **`postgresql.conf`** — standard PG config plus `cluster.*` GUCs added by linkdb's cluster subsystem. 2. **`pgrac.conf`** — INI-style file describing the cluster topology (the list of nodes that participate in the cluster). @@ -62,6 +62,29 @@ an absolute path), `$PGDATA/pg_wal` must resolve to cluster.wal_threads_dir = '/shared/walroot' ``` +### Cluster backup / PITR GUCs + +These settings support the cluster-aware physical backup / restore / +PITR surface.
+ +| GUC | Type | Default | Context | Notes | +|---|---|---|---|---| +| `cluster.recovery_target_scn` | string | `''` | postmaster | Target SCN used by `pg_cluster_pitr_status`. Empty means latest unless a name or cluster-time target is set. | +| `cluster.recovery_target_cluster_time` | string | `''` | postmaster | Timestamp target reported by `pg_cluster_pitr_status`; current 6.5 recovery action remains fail-closed for this target type. | +| `cluster.recovery_target_name` | string | `''` | postmaster | Named restore-point target resolved by `pg_cluster_pitr_status`. | +| `cluster.recovery_target_action` | enum | `pause` | postmaster | Accepted values: `pause`, `promote`, `shutdown`; exposed in PITR status. | +| `cluster.enable_pitr_restore_points` | bool | `off` | sighup | Enables future automatic restore-point scheduling. Manual `pg_cluster_create_restore_point()` is independent. | +| `cluster.pitr_restore_point_interval_ms` | integer | `0` | sighup | Zero disables automatic scheduling. | +| `cluster.backup_wal_retention` | integer | `0` MB | sighup | Retention hint for the future backup-set writer. | +| `cluster.backup_parallel_channels` | integer | `1` | sighup | Reserved copy-channel capacity for the future backup-set writer. | +| `cluster.backup_manifest_checksums` | enum | `crc32c` | sighup | Manifest checksums are mandatory; unchecked manifests are not supported. | + +The current implementation is intentionally conservative. A +single-node cluster can create a cluster manifest via +`pg_cluster_backup_start()` / `pg_cluster_backup_stop()`. If declared +peers exist, cluster backup and restore-point mutation fail closed +instead of producing a partial backup or an unreachable PITR target. 
+ ### `cluster.interconnect_tier` | | | diff --git a/src/test/cluster_tap/t/006_errcodes.pl b/src/test/cluster_tap/t/006_errcodes.pl index 3a6999c2e9..a83e9ef874 100644 --- a/src/test/cluster_tap/t/006_errcodes.pl +++ b/src/test/cluster_tap/t/006_errcodes.pl @@ -1,8 +1,7 @@ #------------------------------------------------------------------------- # # 006_errcodes.pl -# End-to-end regression for the 45 cluster SQLSTATE error codes -# registered in stage 0.12. +# End-to-end regression for cluster SQLSTATE error codes. # # Cluster errcodes are registered in src/backend/utils/errcodes.txt # and become available to plpgsql via PG's auto-generated @@ -103,6 +102,8 @@ sub raise_unknown "cluster_lms_queue_full -> 53R01"); is(raise_and_get_sqlstate('cluster_reconfig_in_progress'), '53R60', "cluster_reconfig_in_progress -> 53R60"); +is(raise_and_get_sqlstate('cluster_backup_incomplete'), '53RAD', + "cluster_backup_incomplete -> 53RAD"); is(raise_and_get_sqlstate('cluster_shared_storage_failed'), '58R01', "cluster_shared_storage_failed -> 58R01"); diff --git a/src/test/cluster_tap/t/007_guc.pl b/src/test/cluster_tap/t/007_guc.pl index 7724eb2184..787953a368 100644 --- a/src/test/cluster_tap/t/007_guc.pl +++ b/src/test/cluster_tap/t/007_guc.pl @@ -179,5 +179,28 @@ qr/999 is outside the valid range for parameter "cluster.node_id"/, 'startup log contains GUC out-of-range WARNING for cluster.node_id'); +# ---------- +# spec-6.5 cluster backup / PITR GUCs. 
+# ---------- +is($node->safe_psql('postgres', + q{SELECT setting || '|' || vartype || '|' || context + FROM pg_settings WHERE name = 'cluster.recovery_target_scn'}), + '|string|postmaster', + 'cluster.recovery_target_scn default and context'); +is($node->safe_psql('postgres', + q{SELECT setting || '|' || vartype || '|' || context + FROM pg_settings WHERE name = 'cluster.recovery_target_action'}), + 'pause|enum|postmaster', + 'cluster.recovery_target_action default and context'); +is($node->safe_psql('postgres', + q{SELECT setting || '|' || vartype || '|' || context + FROM pg_settings WHERE name = 'cluster.backup_manifest_checksums'}), + 'crc32c|enum|sighup', + 'cluster.backup_manifest_checksums is mandatory crc32c'); +is($node->safe_psql('postgres', + q{SELECT setting || '|' || vartype || '|' || context + FROM pg_settings WHERE name = 'cluster.backup_parallel_channels'}), + '1|integer|sighup', + 'cluster.backup_parallel_channels default and context'); done_testing(); diff --git a/src/test/cluster_tap/t/020_shmem_registry.pl b/src/test/cluster_tap/t/020_shmem_registry.pl index d40aa6578f..4e6c119e29 100644 --- a/src/test/cluster_tap/t/020_shmem_registry.pl +++ b/src/test/cluster_tap/t/020_shmem_registry.pl @@ -93,9 +93,11 @@ # "cr counters"). # spec-5.18 D2: +1 "pgrac cluster node_remove" (permanent-removal driver state; # always registered; sorts between "multixact overlay" and "pcm grd"). -my $expected_region_count = $has_visibility_inject ? '68' : '67'; +# spec-6.5: +1 "pgrac cluster backup" (backup / restore / PITR state; sorts +# between "advisory" and "cf stats"). +my $expected_region_count = $has_visibility_inject ? 
'69' : '68'; my $expected_regions = - 'pgrac block recovery,pgrac cluster advisory,pgrac cluster cf stats,pgrac cluster clean_leave,pgrac cluster conf,pgrac cluster control,pgrac cluster cr admit stats,pgrac cluster cr coordinator,pgrac cluster cr counters,pgrac cluster cr pool,pgrac cluster cr relgen,pgrac cluster cr tuple stats,pgrac cluster cssd,pgrac cluster diag,pgrac cluster dl,pgrac cluster durable tt counters,pgrac cluster epoch,pgrac cluster fence,pgrac cluster gcs,pgrac cluster gcs block,pgrac cluster gcs block dedup,pgrac cluster ges,pgrac cluster ges dedup,pgrac cluster ges reply wait,pgrac cluster grd,pgrac cluster grd outbound,pgrac cluster grd pending,pgrac cluster grd work queue,pgrac cluster hw,pgrac cluster ir,pgrac cluster ko,pgrac cluster lck,pgrac cluster lmd,pgrac cluster lmd graph,pgrac cluster lmd probe,pgrac cluster lmon,pgrac cluster lms,pgrac cluster lock-path counters,pgrac cluster multixact overlay,pgrac cluster node_remove,pgrac cluster pcm grd,pgrac cluster qvotec,pgrac cluster reconfig,pgrac cluster resolver cache,pgrac cluster scn,pgrac cluster sequence,pgrac cluster sinval ack outbound,pgrac cluster sinval ack wait,pgrac cluster sinval inbound,pgrac cluster sinval outbound,pgrac cluster smgr,pgrac cluster startup phase,pgrac cluster stats,pgrac cluster subtrans state,pgrac cluster ts,pgrac cluster tt local seq,pgrac cluster tt slot allocator,pgrac cluster tt status hint outbound,pgrac cluster tt status overlay,pgrac cluster tx enqueue,pgrac cluster undo cleaner,pgrac cluster undo record cursor'; + 'pgrac block recovery,pgrac cluster advisory,pgrac cluster backup,pgrac cluster cf stats,pgrac cluster clean_leave,pgrac cluster conf,pgrac cluster control,pgrac cluster cr admit stats,pgrac cluster cr coordinator,pgrac cluster cr counters,pgrac cluster cr pool,pgrac cluster cr relgen,pgrac cluster cr tuple stats,pgrac cluster cssd,pgrac cluster diag,pgrac cluster dl,pgrac cluster durable tt counters,pgrac cluster epoch,pgrac cluster 
fence,pgrac cluster gcs,pgrac cluster gcs block,pgrac cluster gcs block dedup,pgrac cluster ges,pgrac cluster ges dedup,pgrac cluster ges reply wait,pgrac cluster grd,pgrac cluster grd outbound,pgrac cluster grd pending,pgrac cluster grd work queue,pgrac cluster hw,pgrac cluster ir,pgrac cluster ko,pgrac cluster lck,pgrac cluster lmd,pgrac cluster lmd graph,pgrac cluster lmd probe,pgrac cluster lmon,pgrac cluster lms,pgrac cluster lock-path counters,pgrac cluster multixact overlay,pgrac cluster node_remove,pgrac cluster pcm grd,pgrac cluster qvotec,pgrac cluster reconfig,pgrac cluster resolver cache,pgrac cluster scn,pgrac cluster sequence,pgrac cluster sinval ack outbound,pgrac cluster sinval ack wait,pgrac cluster sinval inbound,pgrac cluster sinval outbound,pgrac cluster smgr,pgrac cluster startup phase,pgrac cluster stats,pgrac cluster subtrans state,pgrac cluster ts,pgrac cluster tt local seq,pgrac cluster tt slot allocator,pgrac cluster tt status hint outbound,pgrac cluster tt status overlay,pgrac cluster tx enqueue,pgrac cluster undo cleaner,pgrac cluster undo record cursor'; $expected_regions .= ',pgrac cluster visibility inject' if $has_visibility_inject; # spec-4.12 D7: cooperative write-fence region; always registered. Sorts after diff --git a/src/test/cluster_tap/t/332_cluster_backup_pitr.pl b/src/test/cluster_tap/t/332_cluster_backup_pitr.pl new file mode 100644 index 0000000000..bda2d7d5cd --- /dev/null +++ b/src/test/cluster_tap/t/332_cluster_backup_pitr.pl @@ -0,0 +1,142 @@ +#!/usr/bin/env perl +#------------------------------------------------------------------------- +# +# 332_cluster_backup_pitr.pl +# spec-6.5 -- cluster-aware backup / restore / PITR SQL surface. 
+# +# IDENTIFICATION +# src/test/cluster_tap/t/332_cluster_backup_pitr.pl +# +# Author: SqlRush +# +# Portions Copyright (c) 2026, pgrac contributors +# +#------------------------------------------------------------------------- + +use strict; +use warnings; + +use FindBin; +use lib "$FindBin::RealBin/../lib"; + +use PgracClusterNode; +use PostgreSQL::Test::Utils; +use Test::More; + +my $node = PgracClusterNode->new('cluster_backup_single'); +$node->init(allows_streaming => 1); +$node->append_conf('postgresql.conf', + "cluster.enabled = on\n" + . "cluster.node_id = 0\n" + . "cluster.allow_single_node = on\n" + . "wal_level = replica\n"); +$node->start; + +is($node->safe_psql('postgres', + q{SELECT count(*) FROM pg_stat_cluster_backup}), + '1', + 'L1 backup state view is present'); +is($node->safe_psql('postgres', + q{SELECT backup_parallel_channels || ',' || backup_wal_retention || ',' || + CASE WHEN restore_points_enabled THEN 't' ELSE 'f' END || ',' || + restore_point_interval_ms + FROM pg_stat_cluster_backup}), + '1,0,f,0', + 'L1 backup state view exposes backup/PITR GUC readers'); +is($node->safe_psql('postgres', + q{SELECT target_type || ',' || target_action || ',' || + CASE WHEN reachable THEN 't' ELSE 'f' END || ',' || reason + FROM pg_cluster_pitr_status}), + 'latest,pause,t,ok', + 'L2 default PITR target status is latest/pause/ok'); + +my $backup_row = $node->safe_psql('postgres', + q{SELECT s.backup_id || ',' || + CASE WHEN s.start_redo_lsn IS NOT NULL THEN 't' ELSE 'f' END || ',' || + CASE WHEN s.checkpoint_lsn IS NOT NULL THEN 't' ELSE 'f' END || ',' || + CASE WHEN t.consistent_scn > 0 THEN 't' ELSE 'f' END || ',' || + CASE WHEN t.stop_cut_lsn IS NOT NULL THEN 't' ELSE 'f' END || ',' || + CASE WHEN t.manifest_crc > 0 THEN 't' ELSE 'f' END || ',' || + CASE WHEN t.backup_label LIKE '%CLUSTER_BACKUP_ID: b332%' THEN 't' ELSE 'f' END || ',' || + CASE WHEN t.backup_label LIKE '%CLUSTER_MANIFEST_CRC32C:%' THEN 't' ELSE 'f' END + FROM 
pg_cluster_backup_start('b332', true) AS s + CROSS JOIN LATERAL + pg_cluster_backup_stop(COALESCE(s.backup_id = '', false)) AS t}); +is($backup_row, 'b332,t,t,t,t,t,t,t', + 'L3 cluster backup start/stop returns checkpoint, SCN, LSN, CRC, and label contract'); + +is($node->safe_psql('postgres', + q{SELECT backup_id || ',' || node_count || ',' || thread_count || ',' || + CASE WHEN manifest_crc > 0 THEN 't' ELSE 'f' END + FROM pg_cluster_backup_history}), + 'b332,1,1,t', + 'L4 latest manifest summary is visible'); + +is($node->safe_psql('postgres', + q{SELECT restore_point_name || ',' || + CASE WHEN cut_scn > 0 THEN 't' ELSE 'f' END || ',' || + CASE WHEN cut_lsn IS NOT NULL THEN 't' ELSE 'f' END + FROM pg_cluster_create_restore_point('rp332')}), + 'rp332,t,t', + 'L5 cluster restore point records SCN and LSN'); + +is($node->safe_psql('postgres', + q{SELECT count(*) FROM pg_cluster_restore_points + WHERE restore_point_name IN ('b332', 'rp332')}), + '2', + 'L6 backup stop and manual restore point are retained'); + +$node->stop; + +my $peer_node = PgracClusterNode->new('cluster_backup_peers'); +$peer_node->init(allows_streaming => 1); +$peer_node->append_conf('postgresql.conf', + "cluster.enabled = on\n" + . "cluster.node_id = 0\n" + . "cluster.allow_single_node = on\n" + . "wal_level = replica\n"); +PostgreSQL::Test::Utils::append_to_file($peer_node->data_dir . 
'/pgrac.conf', <<'EOC'); +[cluster] +name = pgrac-backup-peer-failclosed + +[node.0] +interconnect_addr = 127.0.0.1:6432 + +[node.1] +interconnect_addr = 127.0.0.1:6433 +EOC +$peer_node->start; + +my ($ret, $out, $err) = $peer_node->psql('postgres', + "\\set VERBOSITY verbose\nSELECT * FROM pg_cluster_backup_start('partial', true)"); +isnt($ret, 0, 'L8 peer topology rejects cluster backup start'); +like($err, qr/53RAD|cluster_backup_incomplete/, + 'L8 peer topology fails closed with cluster_backup_incomplete'); +is($peer_node->safe_psql('postgres', + q{SELECT CASE WHEN in_progress THEN 't' ELSE 'f' END + FROM pg_stat_cluster_backup}), + 'f', + 'L9 failed peer backup did not leave in-progress state'); + +$peer_node->stop; + +my $bad_target_node = PgracClusterNode->new('cluster_backup_bad_target'); +$bad_target_node->init(allows_streaming => 1); +$bad_target_node->append_conf('postgresql.conf', + "cluster.enabled = on\n" + . "cluster.node_id = 0\n" + . "cluster.allow_single_node = on\n" + . "wal_level = replica\n" + . 
"cluster.recovery_target_scn = '0'\n"); +$bad_target_node->start; + +is($bad_target_node->safe_psql('postgres', + q{SELECT target_type || ',' || target_action || ',' || + CASE WHEN reachable THEN 't' ELSE 'f' END || ',' || reason + FROM pg_cluster_pitr_status}), + 'scn,pause,f,invalid_target', + 'L10 invalid SCN PITR target fails closed'); + +$bad_target_node->stop; + +done_testing(); diff --git a/src/test/cluster_unit/Makefile b/src/test/cluster_unit/Makefile index 28d409d569..6ee1bcb0a9 100644 --- a/src/test/cluster_unit/Makefile +++ b/src/test/cluster_unit/Makefile @@ -47,6 +47,7 @@ TESTS = test_cluster_basic test_cluster_version test_cluster_backend_types \ test_cluster_retention test_cluster_undo_cleaner test_cluster_visibility_variants test_cluster_tt_2pc \ test_cluster_stage3_acceptance test_cluster_undo_buf test_cluster_undo_extent \ test_cluster_wal_thread test_cluster_wal_state test_cluster_recovery_plan \ + test_cluster_backup \ test_cluster_recovery_worker test_cluster_recovery_merge test_cluster_remote_xact \ test_cluster_block_apply test_cluster_thread_apply test_cluster_thread_replay \ test_cluster_thread_driver test_cluster_thread_orchestrator \ @@ -144,13 +145,23 @@ test_cluster_membership: test_cluster_membership.c unit_test.h $(CLUSTER_VERSION $(CLUSTER_VERSION_O) $(CLUSTER_MEMBERSHIP_O) \ $(top_builddir)/src/port/libpgport_srv.a -o $@ +# spec-6.5 D0/D1: test_cluster_backup links only the dependency-light +# manifest/PITR helper object. Runtime SQL/shmem code stays in +# cluster_backup.o and is intentionally not pulled into this pure test. +CLUSTER_BACKUP_MANIFEST_O = $(top_builddir)/src/backend/cluster/cluster_backup_manifest.o +test_cluster_backup: test_cluster_backup.c unit_test.h $(CLUSTER_VERSION_O) \ + $(CLUSTER_BACKUP_MANIFEST_O) + $(CC) $(CFLAGS) $(CPPFLAGS) $< \ + $(CLUSTER_VERSION_O) $(CLUSTER_BACKUP_MANIFEST_O) \ + $(top_builddir)/src/port/libpgport_srv.a -o $@ + # Most tests link only cluster_version.o. 
test_cluster_guc / # test_cluster_shmem / test_cluster_signal / test_cluster_views / # test_cluster_gviews / test_cluster_ic / test_cluster_conf have # separate rules because they also link additional cluster_*.o # objects (the test files stub the PG backend symbols those # objects reference). -SIMPLE_TESTS = $(filter-out test_cluster_guc test_cluster_shmem test_cluster_signal test_cluster_views test_cluster_gviews test_cluster_ic test_cluster_conf test_cluster_ic_mock test_cluster_inject test_cluster_pgstat test_cluster_debug test_cluster_shared_fs test_cluster_shared_fs_sharedfs test_cluster_smgr test_cluster_startup_phase test_cluster_lmon test_cluster_lck test_cluster_diag test_cluster_stats test_cluster_cssd test_cluster_qvotec test_cluster_voting_disk_io test_cluster_quorum_decision test_cluster_scn test_cluster_epoch test_cluster_fence test_cluster_reconfig test_cluster_ges test_cluster_grd test_cluster_grd_starvation test_cluster_lmd test_cluster_lmd_graph test_cluster_lmd_wait_state test_cluster_cancel_token test_cluster_lmd_probe_collector test_cluster_lock_acquire test_cluster_advisory test_cluster_retention test_cluster_visibility_variants test_cluster_tt_2pc test_cluster_stage3_acceptance test_cluster_undo_buf test_cluster_block_apply test_cluster_thread_apply test_cluster_thread_replay test_cluster_thread_driver test_cluster_thread_orchestrator test_cluster_write_fence test_cluster_stage4_acceptance test_cluster_stage5_5_cr_acceptance test_cluster_stage5_integrated_acceptance test_cluster_ges_mode test_cluster_sequence test_cluster_hw test_cluster_dl test_cluster_extend_gate test_cluster_ir test_cluster_ts test_cluster_ko test_cluster_hw_snapshot test_cluster_cf_authority test_cluster_cf_storage test_cluster_cf_enqueue test_cluster_cf_phase2 test_cluster_cf_stats test_cluster_hang test_cluster_hang_resolve test_cluster_touched_peers test_cluster_clean_leave test_cluster_membership test_cluster_node_remove test_cluster_resolver_cache,$(TESTS)) 
+SIMPLE_TESTS = $(filter-out test_cluster_guc test_cluster_shmem test_cluster_signal test_cluster_views test_cluster_gviews test_cluster_ic test_cluster_conf test_cluster_ic_mock test_cluster_inject test_cluster_pgstat test_cluster_debug test_cluster_shared_fs test_cluster_shared_fs_sharedfs test_cluster_smgr test_cluster_startup_phase test_cluster_lmon test_cluster_lck test_cluster_diag test_cluster_stats test_cluster_cssd test_cluster_qvotec test_cluster_voting_disk_io test_cluster_quorum_decision test_cluster_scn test_cluster_epoch test_cluster_fence test_cluster_reconfig test_cluster_ges test_cluster_grd test_cluster_grd_starvation test_cluster_lmd test_cluster_lmd_graph test_cluster_lmd_wait_state test_cluster_cancel_token test_cluster_lmd_probe_collector test_cluster_lock_acquire test_cluster_advisory test_cluster_retention test_cluster_visibility_variants test_cluster_tt_2pc test_cluster_stage3_acceptance test_cluster_undo_buf test_cluster_block_apply test_cluster_thread_apply test_cluster_thread_replay test_cluster_thread_driver test_cluster_thread_orchestrator test_cluster_write_fence test_cluster_stage4_acceptance test_cluster_stage5_5_cr_acceptance test_cluster_stage5_integrated_acceptance test_cluster_ges_mode test_cluster_sequence test_cluster_hw test_cluster_dl test_cluster_extend_gate test_cluster_ir test_cluster_ts test_cluster_ko test_cluster_hw_snapshot test_cluster_cf_authority test_cluster_cf_storage test_cluster_cf_enqueue test_cluster_cf_phase2 test_cluster_cf_stats test_cluster_hang test_cluster_hang_resolve test_cluster_touched_peers test_cluster_clean_leave test_cluster_membership test_cluster_node_remove test_cluster_resolver_cache test_cluster_backup,$(TESTS)) # spec-2.4 D16: test_cluster_epoch links cluster_epoch.o standalone. 
# cluster_epoch.c references ShmemInitStruct + cluster_shmem_register_region diff --git a/src/test/cluster_unit/test_cluster_backup.c b/src/test/cluster_unit/test_cluster_backup.c new file mode 100644 index 0000000000..0c161774c1 --- /dev/null +++ b/src/test/cluster_unit/test_cluster_backup.c @@ -0,0 +1,297 @@ +/*------------------------------------------------------------------------- + * + * test_cluster_backup.c + * spec-6.5 unit tests for cluster backup / restore / PITR helpers. + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2026, pgrac contributors + * + * Author: SqlRush + * + * Spec: spec-6.5-cluster-aware-backup-restore-pitr.md + * + * IDENTIFICATION + * src/test/cluster_unit/test_cluster_backup.c + * + * NOTES + * pgrac-original file. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "catalog/catversion.h" +#include "cluster/cluster_backup.h" + +#undef printf +#undef fprintf +#undef snprintf + +#include "unit_test.h" + +UT_DEFINE_GLOBALS(); + +void +ExceptionalCondition(const char *conditionName, const char *fileName, int lineNumber) +{ + printf("# Assert failed: %s at %s:%d\n", conditionName, fileName, lineNumber); + abort(); +} + +int +scn_time_cmp(SCN a, SCN b) +{ + uint64 alocal = scn_local(a); + uint64 blocal = scn_local(b); + + if (alocal < blocal) + return -1; + if (alocal > blocal) + return 1; + return 0; +} + +static SCN +test_scn(uint64 local) +{ + return scn_encode(0, local); +} + +static void +fill_valid_manifest(ClusterBackupManifest *m) +{ + ClusterBackupManifestThread thread; + + cluster_backup_manifest_init(m, "b1"); + m->consistent_scn = test_scn(10); + m->scn_durable_peak = test_scn(12); + m->timeline = 1; + m->catversion = CATALOG_VERSION_NO; + m->backend_storage_id = 3; + m->node_count = 1; + m->control_included = true; + + 
memset(&thread, 0, sizeof(thread)); + thread.thread_id = 1; + thread.node_id = 0; + thread.start_redo_lsn = 10; + thread.checkpoint_lsn = 20; + thread.start_tli = 1; + thread.stop_cut_lsn = 40; + thread.wal_included = true; + thread.undo_included = true; + thread.tt_included = true; + UT_ASSERT(cluster_backup_manifest_set_thread(m, 0, &thread)); + cluster_backup_manifest_seal(m); +} + +UT_TEST(test_manifest_validates_complete_single_thread) +{ + ClusterBackupManifest m; + + fill_valid_manifest(&m); + UT_ASSERT_EQ(cluster_backup_manifest_validate(&m), CLUSTER_BACKUP_MANIFEST_OK); + UT_ASSERT_STR_EQ(cluster_backup_manifest_reason_name(CLUSTER_BACKUP_MANIFEST_OK), "ok"); + UT_ASSERT_NE(cluster_backup_manifest_compute_crc(&m), 0); +} + +UT_TEST(test_manifest_rejects_missing_control_wal_undo_tt) +{ + ClusterBackupManifest m; + + fill_valid_manifest(&m); + m.control_included = false; + cluster_backup_manifest_seal(&m); + UT_ASSERT_EQ(cluster_backup_manifest_validate(&m), CLUSTER_BACKUP_MANIFEST_MISSING_CONTROL); + + fill_valid_manifest(&m); + m.threads[0].wal_included = false; + cluster_backup_manifest_seal(&m); + UT_ASSERT_EQ(cluster_backup_manifest_validate(&m), CLUSTER_BACKUP_MANIFEST_MISSING_WAL); + + fill_valid_manifest(&m); + m.threads[0].undo_included = false; + cluster_backup_manifest_seal(&m); + UT_ASSERT_EQ(cluster_backup_manifest_validate(&m), CLUSTER_BACKUP_MANIFEST_MISSING_UNDO); + + fill_valid_manifest(&m); + m.threads[0].tt_included = false; + cluster_backup_manifest_seal(&m); + UT_ASSERT_EQ(cluster_backup_manifest_validate(&m), CLUSTER_BACKUP_MANIFEST_MISSING_TT); +} + +UT_TEST(test_manifest_rejects_bad_scn_lsn_count_and_crc) +{ + ClusterBackupManifest m; + + fill_valid_manifest(&m); + m.scn_durable_peak = test_scn(9); + cluster_backup_manifest_seal(&m); + UT_ASSERT_EQ(cluster_backup_manifest_validate(&m), CLUSTER_BACKUP_MANIFEST_BAD_SCN_PEAK); + + fill_valid_manifest(&m); + m.threads[0].stop_cut_lsn = 9; + cluster_backup_manifest_seal(&m); + 
UT_ASSERT_EQ(cluster_backup_manifest_validate(&m), CLUSTER_BACKUP_MANIFEST_BAD_LSN_RANGE); + + fill_valid_manifest(&m); + m.thread_count = 2; + cluster_backup_manifest_seal(&m); + UT_ASSERT_EQ(cluster_backup_manifest_validate(&m), CLUSTER_BACKUP_MANIFEST_MISSING_THREAD); + + fill_valid_manifest(&m); + m.manifest_crc++; + UT_ASSERT_EQ(cluster_backup_manifest_validate(&m), CLUSTER_BACKUP_MANIFEST_BAD_CRC); +} + +UT_TEST(test_manifest_set_thread_is_bounds_defensive) +{ + ClusterBackupManifest m; + ClusterBackupManifestThread thread; + + cluster_backup_manifest_init(&m, "bounds"); + memset(&thread, 0, sizeof(thread)); + thread.thread_id = 1; + UT_ASSERT(!cluster_backup_manifest_set_thread(NULL, 0, &thread)); + UT_ASSERT(!cluster_backup_manifest_set_thread(&m, -1, &thread)); + UT_ASSERT(!cluster_backup_manifest_set_thread(&m, CLUSTER_MAX_NODES, &thread)); + thread.thread_id = 0; + UT_ASSERT(!cluster_backup_manifest_set_thread(&m, 0, &thread)); + thread.thread_id = CLUSTER_MAX_NODES + 1; + UT_ASSERT(!cluster_backup_manifest_set_thread(&m, 0, &thread)); +} + +UT_TEST(test_restore_point_cut_requires_drain_and_fence) +{ + SCN scns[CLUSTER_MAX_NODES]; + XLogRecPtr lsns[CLUSTER_MAX_NODES]; + ClusterRestorePoint point; + + memset(scns, 0, sizeof(scns)); + memset(lsns, 0, sizeof(lsns)); + scns[0] = test_scn(20); + lsns[0] = 500; + + UT_ASSERT_EQ(cluster_restore_point_build(&point, "rp", scns, lsns, CLUSTER_MAX_NODES, + false, true, 0), + CLUSTER_RESTORE_POINT_CUT_PENDING_COMMITS); + UT_ASSERT_EQ(cluster_restore_point_build(&point, "rp", scns, lsns, CLUSTER_MAX_NODES, + true, false, 0), + CLUSTER_RESTORE_POINT_CUT_NO_FENCE); +} + +UT_TEST(test_restore_point_cut_records_all_threads) +{ + SCN scns[CLUSTER_MAX_NODES]; + XLogRecPtr lsns[CLUSTER_MAX_NODES]; + ClusterRestorePoint point; + + memset(scns, 0, sizeof(scns)); + memset(lsns, 0, sizeof(lsns)); + scns[0] = test_scn(20); + lsns[0] = 500; + scns[2] = test_scn(30); + lsns[2] = 700; + + 
UT_ASSERT_EQ(cluster_restore_point_build(&point, "rp", scns, lsns, CLUSTER_MAX_NODES, + true, true, 9), + CLUSTER_RESTORE_POINT_CUT_OK); + UT_ASSERT_EQ(point.thread_count, 2); + UT_ASSERT_EQ(point.cut_scn, test_scn(30)); + UT_ASSERT_EQ(point.incarnation, 9); + UT_ASSERT_EQ(point.cut_lsn[2], 700); +} + +UT_TEST(test_restore_point_cut_rejects_partial_thread) +{ + SCN scns[CLUSTER_MAX_NODES]; + XLogRecPtr lsns[CLUSTER_MAX_NODES]; + ClusterRestorePoint point; + + memset(scns, 0, sizeof(scns)); + memset(lsns, 0, sizeof(lsns)); + scns[0] = test_scn(20); + UT_ASSERT_EQ(cluster_restore_point_build(&point, "rp", scns, lsns, CLUSTER_MAX_NODES, + true, true, 0), + CLUSTER_RESTORE_POINT_CUT_BAD_THREAD); +} + +UT_TEST(test_pitr_resolves_latest_reachable_restore_point) +{ + ClusterRestorePoint points[3]; + ClusterRestorePoint chosen; + + memset(points, 0, sizeof(points)); + points[0].present = true; + points[0].cut_scn = test_scn(20); + points[0].thread_count = 1; + strlcpy(points[0].name, "a", sizeof(points[0].name)); + points[1].present = true; + points[1].cut_scn = test_scn(30); + points[1].thread_count = 1; + strlcpy(points[1].name, "b", sizeof(points[1].name)); + points[2].present = true; + points[2].cut_scn = test_scn(50); + points[2].thread_count = 1; + strlcpy(points[2].name, "c", sizeof(points[2].name)); + + UT_ASSERT_EQ(cluster_pitr_resolve_scn(points, 3, test_scn(35), test_scn(10), &chosen), + CLUSTER_PITR_TARGET_OK); + UT_ASSERT_EQ(chosen.cut_scn, test_scn(30)); + UT_ASSERT_STR_EQ(chosen.name, "b"); +} + +UT_TEST(test_pitr_fail_closed_reasons) +{ + ClusterRestorePoint point; + + memset(&point, 0, sizeof(point)); + point.present = true; + point.cut_scn = test_scn(20); + point.thread_count = 0; + UT_ASSERT_EQ(cluster_pitr_resolve_scn(&point, 1, test_scn(20), test_scn(10), NULL), + CLUSTER_PITR_TARGET_MISSING_THREAD); + UT_ASSERT_EQ(cluster_pitr_resolve_scn(NULL, 0, test_scn(20), test_scn(10), NULL), + CLUSTER_PITR_TARGET_NO_RESTORE_POINT); + 
UT_ASSERT_EQ(cluster_pitr_resolve_scn(&point, 1, test_scn(5), test_scn(10), NULL), + CLUSTER_PITR_TARGET_BEFORE_BACKUP); +} + +UT_TEST(test_restore_compatibility_rejects_mismatches) +{ + ClusterBackupManifest m; + + fill_valid_manifest(&m); + UT_ASSERT_EQ(cluster_backup_manifest_compatible(&m, CATALOG_VERSION_NO, 3, 1), + CLUSTER_RESTORE_COMPAT_OK); + UT_ASSERT_EQ(cluster_backup_manifest_compatible(&m, CATALOG_VERSION_NO + 1, 3, 1), + CLUSTER_RESTORE_COMPAT_CATVERSION); + UT_ASSERT_EQ(cluster_backup_manifest_compatible(&m, CATALOG_VERSION_NO, 4, 1), + CLUSTER_RESTORE_COMPAT_STORAGE); + UT_ASSERT_EQ(cluster_backup_manifest_compatible(&m, CATALOG_VERSION_NO, 3, 2), + CLUSTER_RESTORE_COMPAT_TOPOLOGY); + m.manifest_crc++; + UT_ASSERT_EQ(cluster_backup_manifest_compatible(&m, CATALOG_VERSION_NO, 3, 1), + CLUSTER_RESTORE_COMPAT_MANIFEST); +} + +int +main(void) +{ + UT_PLAN(10); + UT_RUN(test_manifest_validates_complete_single_thread); + UT_RUN(test_manifest_rejects_missing_control_wal_undo_tt); + UT_RUN(test_manifest_rejects_bad_scn_lsn_count_and_crc); + UT_RUN(test_manifest_set_thread_is_bounds_defensive); + UT_RUN(test_restore_point_cut_requires_drain_and_fence); + UT_RUN(test_restore_point_cut_records_all_threads); + UT_RUN(test_restore_point_cut_rejects_partial_thread); + UT_RUN(test_pitr_resolves_latest_reachable_restore_point); + UT_RUN(test_pitr_fail_closed_reasons); + UT_RUN(test_restore_compatibility_rejects_mismatches); + UT_DONE(); + return ut_failed_count == 0 ? 
0 : 1; +} diff --git a/src/test/cluster_unit/test_cluster_errcodes.c b/src/test/cluster_unit/test_cluster_errcodes.c index f160adb163..4b44f7841e 100644 --- a/src/test/cluster_unit/test_cluster_errcodes.c +++ b/src/test/cluster_unit/test_cluster_errcodes.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * test_cluster_errcodes.c - * Compile-time invariants for the 45 cluster SQLSTATE error codes + * Compile-time invariants for the cluster SQLSTATE error codes * registered in stage 0.12. * * All ERRCODE_CLUSTER_* macros are generated automatically by PG's @@ -14,7 +14,7 @@ * - Each ERRCODE_CLUSTER_* macro encodes the exact SQLSTATE string * via MAKE_SQLSTATE() (proves the .txt -> .h pipeline produced * correct values). - * - All 45 codes use the 'R' subclass character (pgrac namespace + * - All checked codes use the 'R' subclass character (pgrac namespace * discipline; design doc §2.3). * - The Class 58 pgrac block is dense from 58R01..58R12 (the * largest pgrac sub-class, anchors the count proof). 
@@ -118,7 +118,8 @@ UT_TEST(test_class_40_first_last) UT_TEST(test_class_53_first_last) { UT_ASSERT_EQ(ERRCODE_CLUSTER_LMS_QUEUE_FULL, MAKE_SQLSTATE('5', '3', 'R', '0', '1')); - UT_ASSERT_EQ(ERRCODE_CLUSTER_RECONFIG_IN_PROGRESS, MAKE_SQLSTATE('5', '3', 'R', '6', '0')); + UT_ASSERT_EQ(ERRCODE_CLUSTER_RESTORE_POINT_DRAIN_TIMEOUT, + MAKE_SQLSTATE('5', '3', 'R', 'A', 'F')); } UT_TEST(test_class_55_first_last) @@ -175,6 +176,17 @@ UT_TEST(test_class_58_complete) UT_ASSERT_EQ(ERRCODE_CLUSTER_RECOVERY_FAILED, MAKE_SQLSTATE('5', '8', 'R', '1', '2')); } +UT_TEST(test_class_53_backup_band) +{ + UT_ASSERT_EQ(ERRCODE_CLUSTER_BACKUP_IN_PROGRESS, MAKE_SQLSTATE('5', '3', 'R', 'A', 'B')); + UT_ASSERT_EQ(ERRCODE_CLUSTER_PITR_TARGET_UNREACHABLE, + MAKE_SQLSTATE('5', '3', 'R', 'A', 'C')); + UT_ASSERT_EQ(ERRCODE_CLUSTER_BACKUP_INCOMPLETE, MAKE_SQLSTATE('5', '3', 'R', 'A', 'D')); + UT_ASSERT_EQ(ERRCODE_CLUSTER_RESTORE_INCOMPATIBLE, MAKE_SQLSTATE('5', '3', 'R', 'A', 'E')); + UT_ASSERT_EQ(ERRCODE_CLUSTER_RESTORE_POINT_DRAIN_TIMEOUT, + MAKE_SQLSTATE('5', '3', 'R', 'A', 'F')); +} + /* ---------- * All 45 cluster errcodes use 'R' as their subclass character @@ -190,6 +202,7 @@ UT_TEST(test_all_use_r_subclass) UT_ASSERT_EQ(sqlstate_char(ERRCODE_CLUSTER_LMS_QUEUE_FULL, 3), 'R'); UT_ASSERT_EQ(sqlstate_char(ERRCODE_CLUSTER_PCM_STATE_INVALID, 3), 'R'); UT_ASSERT_EQ(sqlstate_char(ERRCODE_CLUSTER_RECONFIG_IN_PROGRESS, 3), 'R'); + UT_ASSERT_EQ(sqlstate_char(ERRCODE_CLUSTER_RESTORE_POINT_DRAIN_TIMEOUT, 3), 'R'); UT_ASSERT_EQ(sqlstate_char(ERRCODE_CLUSTER_SHARED_STORAGE_FAILED, 3), 'R'); UT_ASSERT_EQ(sqlstate_char(ERRCODE_CLUSTER_SNAPSHOT_TOO_OLD, 3), 'R'); UT_ASSERT_EQ(sqlstate_char(ERRCODE_CLUSTER_ASSERTION_FAILURE, 3), 'R'); @@ -229,9 +242,9 @@ UT_TEST(test_per_class_anchors) UT_ASSERT_EQ(sqlstate_char(ERRCODE_CLUSTER_PROTOCOL_VERSION_MISMATCH, 5), '5'); /* Class 40 has 4 entries: 40R01..40R04 */ UT_ASSERT_EQ(sqlstate_char(ERRCODE_CLUSTER_PI_INVALIDATED_RETRY, 5), '4'); - /* Class 53 spans base 
53R01..53R07 plus quorum/fence/reconfig ranges up to 53R60. */ - UT_ASSERT_EQ(sqlstate_char(ERRCODE_CLUSTER_RECONFIG_IN_PROGRESS, 4), '6'); - UT_ASSERT_EQ(sqlstate_char(ERRCODE_CLUSTER_RECONFIG_IN_PROGRESS, 5), '0'); + /* Class 53 spans base 53R01..53R07 plus later pgrac bands up to 53RAF. */ + UT_ASSERT_EQ(sqlstate_char(ERRCODE_CLUSTER_RESTORE_POINT_DRAIN_TIMEOUT, 4), 'A'); + UT_ASSERT_EQ(sqlstate_char(ERRCODE_CLUSTER_RESTORE_POINT_DRAIN_TIMEOUT, 5), 'F'); /* Class 55 has 6 entries: 55R01..55R06 */ UT_ASSERT_EQ(sqlstate_char(ERRCODE_CLUSTER_BLOCK_MISSING_TEMPORARY, 5), '6'); /* Class 57 keeps operator-intervention cluster codes 57R02..57R06. */ @@ -246,7 +259,7 @@ UT_TEST(test_per_class_anchors) int main(void) { - UT_PLAN(12); + UT_PLAN(13); UT_RUN(test_class_08_first_last); UT_RUN(test_class_40_first_last); UT_RUN(test_class_53_first_last); @@ -256,6 +269,7 @@ main(void) UT_RUN(test_class_72_first_last); UT_RUN(test_class_xx_first_last); UT_RUN(test_class_58_complete); + UT_RUN(test_class_53_backup_band); UT_RUN(test_all_use_r_subclass); UT_RUN(test_no_overlap_with_pg_native); UT_RUN(test_per_class_anchors); diff --git a/src/test/cluster_unit/test_cluster_shmem.c b/src/test/cluster_unit/test_cluster_shmem.c index a927deaa75..d375949d0a 100644 --- a/src/test/cluster_unit/test_cluster_shmem.c +++ b/src/test/cluster_unit/test_cluster_shmem.c @@ -762,6 +762,12 @@ void cluster_node_remove_shmem_register(void) {} +/* spec-6.5 stub: cluster backup / restore / PITR shmem region. */ +void cluster_backup_shmem_register(void); +void +cluster_backup_shmem_register(void) +{} + /* spec-4.12 D7 stub: cluster write-fence token region. 
*/ void cluster_write_fence_shmem_register(void); void From 435e6001e0c85d1a5d782c382b5e19c6d2ec26a8 Mon Sep 17 00:00:00 2001 From: SqlRush Date: Tue, 30 Jun 2026 22:42:52 +0800 Subject: [PATCH 3/9] fix(cluster): avoid cluster-only SCN symbol in backup manifest --- src/backend/cluster/cluster_backup_manifest.c | 29 +++++++++++++++---- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/backend/cluster/cluster_backup_manifest.c b/src/backend/cluster/cluster_backup_manifest.c index cf5b72ae72..66dd19cf9c 100644 --- a/src/backend/cluster/cluster_backup_manifest.c +++ b/src/backend/cluster/cluster_backup_manifest.c @@ -23,6 +23,23 @@ #include "cluster/cluster_backup.h" #include "port/pg_crc32c.h" +static int +cluster_backup_scn_cmp(SCN a, SCN b) +{ +#ifdef USE_PGRAC_CLUSTER + return scn_time_cmp(a, b); +#else + uint64 alocal = scn_local(a); + uint64 blocal = scn_local(b); + + if (alocal < blocal) + return -1; + if (alocal > blocal) + return 1; + return 0; +#endif +} + void cluster_backup_manifest_init(ClusterBackupManifest *manifest, const char *backup_id) { @@ -98,7 +115,7 @@ cluster_backup_manifest_validate(const ClusterBackupManifest *manifest) if (!manifest->control_included) return CLUSTER_BACKUP_MANIFEST_MISSING_CONTROL; if (!SCN_VALID(manifest->consistent_scn) || !SCN_VALID(manifest->scn_durable_peak) || - scn_time_cmp(manifest->scn_durable_peak, manifest->consistent_scn) < 0) + cluster_backup_scn_cmp(manifest->scn_durable_peak, manifest->consistent_scn) < 0) return CLUSTER_BACKUP_MANIFEST_BAD_SCN_PEAK; for (i = 0; i < CLUSTER_MAX_NODES; i++) { @@ -200,7 +217,7 @@ cluster_restore_point_build(ClusterRestorePoint *out, return CLUSTER_RESTORE_POINT_CUT_BAD_THREAD; out->cut_lsn[i] = thread_lsn[i]; - if (!SCN_VALID(max_scn) || scn_time_cmp(thread_scn[i], max_scn) > 0) + if (!SCN_VALID(max_scn) || cluster_backup_scn_cmp(thread_scn[i], max_scn) > 0) max_scn = thread_scn[i]; nthreads++; } @@ -242,7 +259,7 @@ cluster_pitr_resolve_scn(const 
ClusterRestorePoint *points, int i; if (!SCN_VALID(requested_scn) || !SCN_VALID(backup_consistent_scn) || - scn_time_cmp(requested_scn, backup_consistent_scn) < 0) + cluster_backup_scn_cmp(requested_scn, backup_consistent_scn) < 0) return CLUSTER_PITR_TARGET_BEFORE_BACKUP; if (points == NULL || npoints <= 0) return CLUSTER_PITR_TARGET_NO_RESTORE_POINT; @@ -252,13 +269,13 @@ cluster_pitr_resolve_scn(const ClusterRestorePoint *points, if (!point->present || !SCN_VALID(point->cut_scn)) continue; - if (scn_time_cmp(point->cut_scn, backup_consistent_scn) < 0) + if (cluster_backup_scn_cmp(point->cut_scn, backup_consistent_scn) < 0) continue; - if (scn_time_cmp(point->cut_scn, requested_scn) > 0) + if (cluster_backup_scn_cmp(point->cut_scn, requested_scn) > 0) continue; if (point->thread_count == 0 || point->thread_count > CLUSTER_MAX_NODES) return CLUSTER_PITR_TARGET_MISSING_THREAD; - if (best == NULL || scn_time_cmp(point->cut_scn, best->cut_scn) > 0) + if (best == NULL || cluster_backup_scn_cmp(point->cut_scn, best->cut_scn) > 0) best = point; } From e684c0f71f50acd0e24f5e2969950e4c8618532c Mon Sep 17 00:00:00 2001 From: SqlRush Date: Tue, 30 Jun 2026 22:58:13 +0800 Subject: [PATCH 4/9] style(cluster): format backup PITR files --- src/backend/cluster/cluster_backup.c | 59 +++++++++---------- src/backend/cluster/cluster_backup_manifest.c | 51 +++++++--------- src/backend/cluster/cluster_guc.c | 29 ++++----- src/include/cluster/cluster_backup.h | 30 ++++------ src/test/cluster_unit/test_cluster_backup.c | 24 ++++---- src/test/cluster_unit/test_cluster_errcodes.c | 3 +- 6 files changed, 81 insertions(+), 115 deletions(-) diff --git a/src/backend/cluster/cluster_backup.c b/src/backend/cluster/cluster_backup.c index 9451310785..4976f4f27a 100644 --- a/src/backend/cluster/cluster_backup.c +++ b/src/backend/cluster/cluster_backup.c @@ -168,8 +168,7 @@ cluster_backup_update_start(const char *backup_id, const BackupState *state) } static void -cluster_backup_update_stop(const 
BackupState *state, - const ClusterBackupManifest *manifest, +cluster_backup_update_stop(const BackupState *state, const ClusterBackupManifest *manifest, SCN cut_scn) { LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); @@ -270,8 +269,7 @@ cluster_backup_get_restore_points(ClusterRestorePoint *out, int max_points) } static void -cluster_backup_fill_local_manifest(ClusterBackupManifest *manifest, - const BackupState *state, +cluster_backup_fill_local_manifest(ClusterBackupManifest *manifest, const BackupState *state, SCN cut_scn) { ClusterBackupManifestThread thread; @@ -319,8 +317,7 @@ cluster_backup_fill_local_manifest(ClusterBackupManifest *manifest, } static char * -cluster_backup_build_label(const BackupState *state, - const ClusterBackupManifest *manifest, +cluster_backup_build_label(const BackupState *state, const ClusterBackupManifest *manifest, SCN cut_scn) { StringInfoData buf; @@ -343,8 +340,8 @@ pg_cluster_backup_start(PG_FUNCTION_ARGS) { #define PG_CLUSTER_BACKUP_START_COLS 4 TupleDesc tupdesc; - Datum values[PG_CLUSTER_BACKUP_START_COLS] = {0}; - bool nulls[PG_CLUSTER_BACKUP_START_COLS] = {0}; + Datum values[PG_CLUSTER_BACKUP_START_COLS] = { 0 }; + bool nulls[PG_CLUSTER_BACKUP_START_COLS] = { 0 }; text *backupid = PG_GETARG_TEXT_PP(0); bool fast = PG_GETARG_BOOL(1); char *backupidstr; @@ -361,9 +358,9 @@ pg_cluster_backup_start(PG_FUNCTION_ARGS) backupidstr = text_to_cstring(backupid); if (strlen(backupidstr) >= CLUSTER_BACKUP_ID_MAX) - ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("cluster backup id is too long"), - errdetail("Maximum length is %d bytes.", CLUSTER_BACKUP_ID_MAX - 1))); + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("cluster backup id is too long"), + errdetail("Maximum length is %d bytes.", CLUSTER_BACKUP_ID_MAX - 1))); status = get_backup_status(); if (status == SESSION_BACKUP_RUNNING) @@ -371,8 +368,7 @@ pg_cluster_backup_start(PG_FUNCTION_ARGS) errmsg("a backup is already in 
progress in this session"))); if (cluster_backup_context == NULL) - cluster_backup_context = AllocSetContextCreate(TopMemoryContext, - "cluster backup context", + cluster_backup_context = AllocSetContextCreate(TopMemoryContext, "cluster backup context", ALLOCSET_START_SMALL_SIZES); else { cluster_backup_session_state = NULL; @@ -403,8 +399,8 @@ pg_cluster_backup_stop(PG_FUNCTION_ARGS) { #define PG_CLUSTER_BACKUP_STOP_COLS 4 TupleDesc tupdesc; - Datum values[PG_CLUSTER_BACKUP_STOP_COLS] = {0}; - bool nulls[PG_CLUSTER_BACKUP_STOP_COLS] = {0}; + Datum values[PG_CLUSTER_BACKUP_STOP_COLS] = { 0 }; + bool nulls[PG_CLUSTER_BACKUP_STOP_COLS] = { 0 }; bool waitforarchive = PG_GETARG_BOOL(0); ClusterBackupManifest manifest; ClusterRestorePoint point; @@ -471,8 +467,8 @@ pg_cluster_create_restore_point(PG_FUNCTION_ARGS) { #define PG_CLUSTER_RESTORE_POINT_COLS 3 TupleDesc tupdesc; - Datum values[PG_CLUSTER_RESTORE_POINT_COLS] = {0}; - bool nulls[PG_CLUSTER_RESTORE_POINT_COLS] = {0}; + Datum values[PG_CLUSTER_RESTORE_POINT_COLS] = { 0 }; + bool nulls[PG_CLUSTER_RESTORE_POINT_COLS] = { 0 }; text *restore_name = PG_GETARG_TEXT_PP(0); char *restore_name_str; XLogRecPtr restorepoint; @@ -493,19 +489,20 @@ pg_cluster_create_restore_point(PG_FUNCTION_ARGS) errmsg("recovery is in progress"), errhint("WAL control functions cannot be executed during recovery."))); if (!XLogIsNeeded()) - ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("WAL level not sufficient for creating a restore point"), - errhint("wal_level must be set to \"replica\" or \"logical\" at server start."))); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL level not sufficient for creating a restore point"), + errhint("wal_level must be set to \"replica\" or \"logical\" at server start."))); if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); restore_name_str = text_to_cstring(restore_name); 
if (strlen(restore_name_str) >= CLUSTER_RESTORE_POINT_NAME_MAX) - ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("cluster restore point name is too long"), - errdetail("Maximum length is %d bytes.", - CLUSTER_RESTORE_POINT_NAME_MAX - 1))); + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cluster restore point name is too long"), + errdetail("Maximum length is %d bytes.", CLUSTER_RESTORE_POINT_NAME_MAX - 1))); restorepoint = XLogRestorePoint(restore_name_str); cut_scn = cluster_backup_current_scn(); @@ -621,7 +618,7 @@ cluster_get_restore_points(PG_FUNCTION_ARGS) count = cluster_backup_get_restore_points(points, CLUSTER_BACKUP_RESTORE_POINT_MAX); for (i = 0; i < count; i++) { Datum values[5]; - bool nulls[5] = {false}; + bool nulls[5] = { false }; values[0] = CStringGetTextDatum(points[i].name); values[1] = Int64GetDatum((int64)points[i].cut_scn); @@ -645,7 +642,7 @@ cluster_get_pitr_status(PG_FUNCTION_ARGS) ClusterRestorePoint chosen; ClusterPitrTargetReason reason; Datum values[6]; - bool nulls[6] = {false}; + bool nulls[6] = { false }; int count; int i; const char *target_action = cluster_pitr_action_name(cluster_recovery_target_action); @@ -775,8 +772,8 @@ cluster_get_pitr_status(PG_FUNCTION_ARGS) } count = cluster_backup_get_restore_points(points, CLUSTER_BACKUP_RESTORE_POINT_MAX); - reason = cluster_pitr_resolve_scn(points, count, requested_scn, - manifest.consistent_scn, &chosen); + reason + = cluster_pitr_resolve_scn(points, count, requested_scn, manifest.consistent_scn, &chosen); values[0] = CStringGetTextDatum("scn"); values[1] = CStringGetTextDatum(target_action); values[2] = BoolGetDatum(reason == CLUSTER_PITR_TARGET_OK); @@ -802,13 +799,11 @@ cluster_backup_shmem_size(void) void cluster_backup_shmem_init(void) -{ -} +{} void cluster_backup_shmem_register(void) -{ -} +{} void cluster_backup_get_status(ClusterBackupStatus *out) diff --git a/src/backend/cluster/cluster_backup_manifest.c 
b/src/backend/cluster/cluster_backup_manifest.c index 66dd19cf9c..7fbaa1fcfe 100644 --- a/src/backend/cluster/cluster_backup_manifest.c +++ b/src/backend/cluster/cluster_backup_manifest.c @@ -54,8 +54,7 @@ cluster_backup_manifest_init(ClusterBackupManifest *manifest, const char *backup } bool -cluster_backup_manifest_set_thread(ClusterBackupManifest *manifest, - int thread_index, +cluster_backup_manifest_set_thread(ClusterBackupManifest *manifest, int thread_index, const ClusterBackupManifestThread *thread) { if (manifest == NULL || thread == NULL) @@ -109,13 +108,13 @@ cluster_backup_manifest_validate(const ClusterBackupManifest *manifest) return CLUSTER_BACKUP_MANIFEST_BAD_MAGIC; if (manifest->version != CLUSTER_BACKUP_MANIFEST_VERSION) return CLUSTER_BACKUP_MANIFEST_BAD_VERSION; - if (manifest->node_count == 0 || manifest->node_count > CLUSTER_MAX_NODES || - manifest->thread_count == 0 || manifest->thread_count > CLUSTER_MAX_NODES) + if (manifest->node_count == 0 || manifest->node_count > CLUSTER_MAX_NODES + || manifest->thread_count == 0 || manifest->thread_count > CLUSTER_MAX_NODES) return CLUSTER_BACKUP_MANIFEST_BAD_COUNTS; if (!manifest->control_included) return CLUSTER_BACKUP_MANIFEST_MISSING_CONTROL; - if (!SCN_VALID(manifest->consistent_scn) || !SCN_VALID(manifest->scn_durable_peak) || - cluster_backup_scn_cmp(manifest->scn_durable_peak, manifest->consistent_scn) < 0) + if (!SCN_VALID(manifest->consistent_scn) || !SCN_VALID(manifest->scn_durable_peak) + || cluster_backup_scn_cmp(manifest->scn_durable_peak, manifest->consistent_scn) < 0) return CLUSTER_BACKUP_MANIFEST_BAD_SCN_PEAK; for (i = 0; i < CLUSTER_MAX_NODES; i++) { @@ -127,10 +126,10 @@ cluster_backup_manifest_validate(const ClusterBackupManifest *manifest) present_count++; if (thread->thread_id == 0 || thread->thread_id > CLUSTER_MAX_NODES) return CLUSTER_BACKUP_MANIFEST_MISSING_THREAD; - if (thread->start_redo_lsn == InvalidXLogRecPtr || - thread->checkpoint_lsn == InvalidXLogRecPtr || - 
thread->stop_cut_lsn == InvalidXLogRecPtr || - thread->stop_cut_lsn < thread->start_redo_lsn) + if (thread->start_redo_lsn == InvalidXLogRecPtr + || thread->checkpoint_lsn == InvalidXLogRecPtr + || thread->stop_cut_lsn == InvalidXLogRecPtr + || thread->stop_cut_lsn < thread->start_redo_lsn) return CLUSTER_BACKUP_MANIFEST_BAD_LSN_RANGE; if (!thread->wal_included) return CLUSTER_BACKUP_MANIFEST_MISSING_WAL; @@ -183,14 +182,9 @@ cluster_backup_manifest_reason_name(ClusterBackupManifestReason reason) } ClusterRestorePointCutReason -cluster_restore_point_build(ClusterRestorePoint *out, - const char *name, - const SCN *thread_scn, - const XLogRecPtr *thread_lsn, - int max_threads, - bool pending_commits_empty, - bool commit_fence_held, - uint32 incarnation) +cluster_restore_point_build(ClusterRestorePoint *out, const char *name, const SCN *thread_scn, + const XLogRecPtr *thread_lsn, int max_threads, + bool pending_commits_empty, bool commit_fence_held, uint32 incarnation) { SCN max_scn = InvalidScn; int i; @@ -200,8 +194,8 @@ cluster_restore_point_build(ClusterRestorePoint *out, return CLUSTER_RESTORE_POINT_CUT_PENDING_COMMITS; if (!commit_fence_held) return CLUSTER_RESTORE_POINT_CUT_NO_FENCE; - if (out == NULL || thread_scn == NULL || thread_lsn == NULL || - max_threads <= 0 || max_threads > CLUSTER_MAX_NODES) + if (out == NULL || thread_scn == NULL || thread_lsn == NULL || max_threads <= 0 + || max_threads > CLUSTER_MAX_NODES) return CLUSTER_RESTORE_POINT_CUT_NO_THREADS; memset(out, 0, sizeof(*out)); @@ -249,17 +243,14 @@ cluster_restore_point_cut_reason_name(ClusterRestorePointCutReason reason) } ClusterPitrTargetReason -cluster_pitr_resolve_scn(const ClusterRestorePoint *points, - int npoints, - SCN requested_scn, - SCN backup_consistent_scn, - ClusterRestorePoint *out) +cluster_pitr_resolve_scn(const ClusterRestorePoint *points, int npoints, SCN requested_scn, + SCN backup_consistent_scn, ClusterRestorePoint *out) { const ClusterRestorePoint *best = NULL; int i; - 
if (!SCN_VALID(requested_scn) || !SCN_VALID(backup_consistent_scn) || - cluster_backup_scn_cmp(requested_scn, backup_consistent_scn) < 0) + if (!SCN_VALID(requested_scn) || !SCN_VALID(backup_consistent_scn) + || cluster_backup_scn_cmp(requested_scn, backup_consistent_scn) < 0) return CLUSTER_PITR_TARGET_BEFORE_BACKUP; if (points == NULL || npoints <= 0) return CLUSTER_PITR_TARGET_NO_RESTORE_POINT; @@ -306,10 +297,8 @@ cluster_pitr_target_reason_name(ClusterPitrTargetReason reason) } ClusterRestoreCompatibilityReason -cluster_backup_manifest_compatible(const ClusterBackupManifest *manifest, - uint32 current_catversion, - uint32 current_storage_id, - uint32 expected_node_count) +cluster_backup_manifest_compatible(const ClusterBackupManifest *manifest, uint32 current_catversion, + uint32 current_storage_id, uint32 expected_node_count) { if (cluster_backup_manifest_validate(manifest) != CLUSTER_BACKUP_MANIFEST_OK) return CLUSTER_RESTORE_COMPAT_MANIFEST; diff --git a/src/backend/cluster/cluster_guc.c b/src/backend/cluster/cluster_guc.c index e63afd288f..0d46a1f12e 100644 --- a/src/backend/cluster/cluster_guc.c +++ b/src/backend/cluster/cluster_guc.c @@ -834,8 +834,7 @@ static const struct config_enum_entry cluster_recovery_target_action_options[] { NULL, 0, false } }; static const struct config_enum_entry cluster_backup_manifest_checksum_options[] - = { { "crc32c", CLUSTER_BACKUP_MANIFEST_CHECKSUM_CRC32C, false }, - { NULL, 0, false } }; + = { { "crc32c", CLUSTER_BACKUP_MANIFEST_CHECKSUM_CRC32C, false }, { NULL, 0, false } }; /* @@ -1166,15 +1165,13 @@ cluster_init_guc(void) NULL, NULL); DefineCustomStringVariable( - "cluster.recovery_target_scn", - gettext_noop("Cluster PITR target SCN."), + "cluster.recovery_target_scn", gettext_noop("Cluster PITR target SCN."), gettext_noop("When set, cluster PITR status resolves the requested SCN against " "cluster restore points and refuses unreachable targets."), &cluster_recovery_target_scn, "", PGC_POSTMASTER, 0, NULL, NULL, 
NULL); DefineCustomStringVariable( - "cluster.recovery_target_cluster_time", - gettext_noop("Cluster PITR target timestamp."), + "cluster.recovery_target_cluster_time", gettext_noop("Cluster PITR target timestamp."), gettext_noop("Reserved target timestamp for cluster-aware recovery planning. " "Spec-6.5 exposes the configuration and status surface; the " "startup recovery action remains fail-closed until the " @@ -1182,8 +1179,7 @@ cluster_init_guc(void) &cluster_recovery_target_cluster_time, "", PGC_POSTMASTER, 0, NULL, NULL, NULL); DefineCustomStringVariable( - "cluster.recovery_target_name", - gettext_noop("Cluster PITR named restore point target."), + "cluster.recovery_target_name", gettext_noop("Cluster PITR named restore point target."), gettext_noop("Reserved named cluster restore-point target. The status view " "reports restore points produced by pg_cluster_create_restore_point."), &cluster_recovery_target_name, "", PGC_POSTMASTER, 0, NULL, NULL, NULL); @@ -1205,12 +1201,11 @@ cluster_init_guc(void) "cluster-wide cut coordinator is present."), &cluster_enable_pitr_restore_points, false, PGC_SIGHUP, 0, NULL, NULL, NULL); - DefineCustomIntVariable( - "cluster.pitr_restore_point_interval_ms", - gettext_noop("Interval for automatic cluster PITR restore points."), - gettext_noop("Zero disables automatic restore point scheduling."), - &cluster_pitr_restore_point_interval_ms, 0, 0, 86400000, PGC_SIGHUP, GUC_UNIT_MS, NULL, - NULL, NULL); + DefineCustomIntVariable("cluster.pitr_restore_point_interval_ms", + gettext_noop("Interval for automatic cluster PITR restore points."), + gettext_noop("Zero disables automatic restore point scheduling."), + &cluster_pitr_restore_point_interval_ms, 0, 0, 86400000, PGC_SIGHUP, + GUC_UNIT_MS, NULL, NULL, NULL); DefineCustomIntVariable( "cluster.backup_wal_retention", @@ -1218,12 +1213,10 @@ cluster_init_guc(void) gettext_noop("The 6.5 manifest/status surface records the setting; actual " "multi-thread retention enforcement is 
deferred to the backup-set " "writer."), - &cluster_backup_wal_retention, 0, 0, INT_MAX, PGC_SIGHUP, GUC_UNIT_MB, NULL, NULL, - NULL); + &cluster_backup_wal_retention, 0, 0, INT_MAX, PGC_SIGHUP, GUC_UNIT_MB, NULL, NULL, NULL); DefineCustomIntVariable( - "cluster.backup_parallel_channels", - gettext_noop("Maximum cluster backup copy channels."), + "cluster.backup_parallel_channels", gettext_noop("Maximum cluster backup copy channels."), gettext_noop("Reserved capacity knob for the cluster backup-set writer."), &cluster_backup_parallel_channels, 1, 1, CLUSTER_MAX_NODES, PGC_SIGHUP, 0, NULL, NULL, NULL); diff --git a/src/include/cluster/cluster_backup.h b/src/include/cluster/cluster_backup.h index 10b3d7a5e2..eee33ae756 100644 --- a/src/include/cluster/cluster_backup.h +++ b/src/include/cluster/cluster_backup.h @@ -129,8 +129,7 @@ typedef struct ClusterBackupStatus { } ClusterBackupStatus; extern void cluster_backup_manifest_init(ClusterBackupManifest *manifest, const char *backup_id); -extern bool cluster_backup_manifest_set_thread(ClusterBackupManifest *manifest, - int thread_index, +extern bool cluster_backup_manifest_set_thread(ClusterBackupManifest *manifest, int thread_index, const ClusterBackupManifestThread *thread); extern uint32 cluster_backup_manifest_compute_crc(const ClusterBackupManifest *manifest); extern void cluster_backup_manifest_seal(ClusterBackupManifest *manifest); @@ -139,29 +138,20 @@ cluster_backup_manifest_validate(const ClusterBackupManifest *manifest); extern const char *cluster_backup_manifest_reason_name(ClusterBackupManifestReason reason); extern ClusterRestorePointCutReason -cluster_restore_point_build(ClusterRestorePoint *out, - const char *name, - const SCN *thread_scn, - const XLogRecPtr *thread_lsn, - int max_threads, - bool pending_commits_empty, - bool commit_fence_held, - uint32 incarnation); +cluster_restore_point_build(ClusterRestorePoint *out, const char *name, const SCN *thread_scn, + const XLogRecPtr *thread_lsn, int 
max_threads, + bool pending_commits_empty, bool commit_fence_held, uint32 incarnation); extern const char *cluster_restore_point_cut_reason_name(ClusterRestorePointCutReason reason); -extern ClusterPitrTargetReason -cluster_pitr_resolve_scn(const ClusterRestorePoint *points, - int npoints, - SCN requested_scn, - SCN backup_consistent_scn, - ClusterRestorePoint *out); +extern ClusterPitrTargetReason cluster_pitr_resolve_scn(const ClusterRestorePoint *points, + int npoints, SCN requested_scn, + SCN backup_consistent_scn, + ClusterRestorePoint *out); extern const char *cluster_pitr_target_reason_name(ClusterPitrTargetReason reason); extern ClusterRestoreCompatibilityReason -cluster_backup_manifest_compatible(const ClusterBackupManifest *manifest, - uint32 current_catversion, - uint32 current_storage_id, - uint32 expected_node_count); +cluster_backup_manifest_compatible(const ClusterBackupManifest *manifest, uint32 current_catversion, + uint32 current_storage_id, uint32 expected_node_count); extern const char *cluster_restore_compat_reason_name(ClusterRestoreCompatibilityReason reason); #ifndef FRONTEND diff --git a/src/test/cluster_unit/test_cluster_backup.c b/src/test/cluster_unit/test_cluster_backup.c index 0c161774c1..4a53ecd523 100644 --- a/src/test/cluster_unit/test_cluster_backup.c +++ b/src/test/cluster_unit/test_cluster_backup.c @@ -175,12 +175,12 @@ UT_TEST(test_restore_point_cut_requires_drain_and_fence) scns[0] = test_scn(20); lsns[0] = 500; - UT_ASSERT_EQ(cluster_restore_point_build(&point, "rp", scns, lsns, CLUSTER_MAX_NODES, - false, true, 0), - CLUSTER_RESTORE_POINT_CUT_PENDING_COMMITS); - UT_ASSERT_EQ(cluster_restore_point_build(&point, "rp", scns, lsns, CLUSTER_MAX_NODES, - true, false, 0), - CLUSTER_RESTORE_POINT_CUT_NO_FENCE); + UT_ASSERT_EQ( + cluster_restore_point_build(&point, "rp", scns, lsns, CLUSTER_MAX_NODES, false, true, 0), + CLUSTER_RESTORE_POINT_CUT_PENDING_COMMITS); + UT_ASSERT_EQ( + cluster_restore_point_build(&point, "rp", scns, lsns, 
CLUSTER_MAX_NODES, true, false, 0), + CLUSTER_RESTORE_POINT_CUT_NO_FENCE); } UT_TEST(test_restore_point_cut_records_all_threads) @@ -196,9 +196,9 @@ UT_TEST(test_restore_point_cut_records_all_threads) scns[2] = test_scn(30); lsns[2] = 700; - UT_ASSERT_EQ(cluster_restore_point_build(&point, "rp", scns, lsns, CLUSTER_MAX_NODES, - true, true, 9), - CLUSTER_RESTORE_POINT_CUT_OK); + UT_ASSERT_EQ( + cluster_restore_point_build(&point, "rp", scns, lsns, CLUSTER_MAX_NODES, true, true, 9), + CLUSTER_RESTORE_POINT_CUT_OK); UT_ASSERT_EQ(point.thread_count, 2); UT_ASSERT_EQ(point.cut_scn, test_scn(30)); UT_ASSERT_EQ(point.incarnation, 9); @@ -214,9 +214,9 @@ UT_TEST(test_restore_point_cut_rejects_partial_thread) memset(scns, 0, sizeof(scns)); memset(lsns, 0, sizeof(lsns)); scns[0] = test_scn(20); - UT_ASSERT_EQ(cluster_restore_point_build(&point, "rp", scns, lsns, CLUSTER_MAX_NODES, - true, true, 0), - CLUSTER_RESTORE_POINT_CUT_BAD_THREAD); + UT_ASSERT_EQ( + cluster_restore_point_build(&point, "rp", scns, lsns, CLUSTER_MAX_NODES, true, true, 0), + CLUSTER_RESTORE_POINT_CUT_BAD_THREAD); } UT_TEST(test_pitr_resolves_latest_reachable_restore_point) diff --git a/src/test/cluster_unit/test_cluster_errcodes.c b/src/test/cluster_unit/test_cluster_errcodes.c index 4b44f7841e..021eb5c5fc 100644 --- a/src/test/cluster_unit/test_cluster_errcodes.c +++ b/src/test/cluster_unit/test_cluster_errcodes.c @@ -179,8 +179,7 @@ UT_TEST(test_class_58_complete) UT_TEST(test_class_53_backup_band) { UT_ASSERT_EQ(ERRCODE_CLUSTER_BACKUP_IN_PROGRESS, MAKE_SQLSTATE('5', '3', 'R', 'A', 'B')); - UT_ASSERT_EQ(ERRCODE_CLUSTER_PITR_TARGET_UNREACHABLE, - MAKE_SQLSTATE('5', '3', 'R', 'A', 'C')); + UT_ASSERT_EQ(ERRCODE_CLUSTER_PITR_TARGET_UNREACHABLE, MAKE_SQLSTATE('5', '3', 'R', 'A', 'C')); UT_ASSERT_EQ(ERRCODE_CLUSTER_BACKUP_INCOMPLETE, MAKE_SQLSTATE('5', '3', 'R', 'A', 'D')); UT_ASSERT_EQ(ERRCODE_CLUSTER_RESTORE_INCOMPATIBLE, MAKE_SQLSTATE('5', '3', 'R', 'A', 'E')); 
UT_ASSERT_EQ(ERRCODE_CLUSTER_RESTORE_POINT_DRAIN_TIMEOUT, From 5fbe02d37ae9b0d108feaf388289de5e81121c61 Mon Sep 17 00:00:00 2001 From: SqlRush Date: Tue, 30 Jun 2026 23:10:48 +0800 Subject: [PATCH 5/9] test(cluster): update rules expected for backup views --- src/test/regress/expected/rules.out | 38 +++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 00df5b9286..34d56ee8ab 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1313,6 +1313,16 @@ pg_backend_memory_contexts| SELECT name, free_chunks, used_bytes FROM pg_get_backend_memory_contexts() pg_get_backend_memory_contexts(name, ident, parent, level, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes); +pg_cluster_backup_history| SELECT backup_id, + consistent_scn, + scn_durable_peak, + timeline, + catversion, + storage_id, + node_count, + thread_count, + manifest_crc + FROM cluster_get_backup_history() cluster_get_backup_history(backup_id, consistent_scn, scn_durable_peak, timeline, catversion, storage_id, node_count, thread_count, manifest_crc); pg_cluster_clean_leave_state| SELECT phase, leaving_node_id, leave_epoch, @@ -1431,6 +1441,13 @@ pg_cluster_nodes| SELECT node_id, region, is_self FROM cluster_get_nodes() cluster_get_nodes(node_id, hostname, interconnect_addr, public_addr, role, region, is_self); +pg_cluster_pitr_status| SELECT target_type, + target_action, + reachable, + reason, + resolved_scn, + restore_point_name + FROM cluster_get_pitr_status() cluster_get_pitr_status(target_type, target_action, reachable, reason, resolved_scn, restore_point_name); pg_cluster_quorum_state| SELECT in_quorum, quorum_size, disks_ok, @@ -1450,6 +1467,12 @@ pg_cluster_reconfig_state| SELECT event_id, cssd_dead_generation, reconfig_kind FROM cluster_get_reconfig_state() cluster_get_reconfig_state(event_id, coordinator_node_id, old_epoch, new_epoch, dead_bitmap, 
applied_at, observer_role, event_seq, cssd_dead_generation, reconfig_kind); +pg_cluster_restore_points| SELECT restore_point_name, + cut_scn, + thread_count, + incarnation, + created_at + FROM cluster_get_restore_points() cluster_get_restore_points(restore_point_name, cut_scn, thread_count, incarnation, created_at); pg_cluster_shmem| SELECT name, size_bytes, lwlock_count, @@ -1981,6 +2004,21 @@ pg_stat_bgwriter| SELECT pg_stat_get_bgwriter_timed_checkpoints() AS checkpoints pg_stat_get_buf_fsync_backend() AS buffers_backend_fsync, pg_stat_get_buf_alloc() AS buffers_alloc, pg_stat_get_bgwriter_stat_reset_time() AS stats_reset; +pg_stat_cluster_backup| SELECT in_progress, + backup_id, + coordinator_node_id, + start_redo_lsn, + checkpoint_lsn, + stop_cut_lsn, + consistent_scn, + manifest_crc, + started_at, + stopped_at, + backup_parallel_channels, + backup_wal_retention, + restore_points_enabled, + restore_point_interval_ms + FROM cluster_get_backup_state() cluster_get_backup_state(in_progress, backup_id, coordinator_node_id, start_redo_lsn, checkpoint_lsn, stop_cut_lsn, consistent_scn, manifest_crc, started_at, stopped_at, backup_parallel_channels, backup_wal_retention, restore_points_enabled, restore_point_interval_ms); pg_stat_cluster_counters| SELECT name, value FROM cluster_get_pgstat_counters() cluster_get_pgstat_counters(name, value); From c31c9c374e7229b4b226c17d807e1b8091bc4c93 Mon Sep 17 00:00:00 2001 From: SqlRush Date: Tue, 30 Jun 2026 23:27:54 +0800 Subject: [PATCH 6/9] test(cluster): mark backup rules expectation --- src/test/regress/expected/rules.out | 1 + src/test/regress/sql/rules.sql | 1 + 2 files changed, 2 insertions(+) diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 34d56ee8ab..3db2e5ca7e 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1281,6 +1281,7 @@ drop table cchild; -- -- Check that ruleutils are working -- +-- PGRAC: expected ruleutils output 
includes cluster backup/PITR catalog views. -- temporarily disable fancy output, so view changes create less diff noise \a\t SELECT viewname, definition FROM pg_views diff --git a/src/test/regress/sql/rules.sql b/src/test/regress/sql/rules.sql index 8b7e255dcd..f013d07628 100644 --- a/src/test/regress/sql/rules.sql +++ b/src/test/regress/sql/rules.sql @@ -772,6 +772,7 @@ drop table cchild; -- Check that ruleutils are working -- +-- PGRAC: expected ruleutils output includes cluster backup/PITR catalog views. -- temporarily disable fancy output, so view changes create less diff noise \a\t From f6aeff336c21b4171504cd0978fdf4d5350365fe Mon Sep 17 00:00:00 2001 From: SqlRush Date: Wed, 1 Jul 2026 06:51:21 +0800 Subject: [PATCH 7/9] feat(cluster): add backup peer ACK coordination --- docs/reference/system-views.md | 11 +- docs/user-guide/configuration.md | 7 +- src/backend/cluster/cluster_backup.c | 968 +++++++++++++++++- src/backend/cluster/cluster_backup_manifest.c | 109 ++ src/backend/cluster/cluster_lmon.c | 13 + src/include/cluster/cluster_backup.h | 59 ++ src/include/cluster/cluster_ic_envelope.h | 11 +- .../cluster_tap/t/332_cluster_backup_pitr.pl | 4 +- src/test/cluster_unit/test_cluster_backup.c | 56 +- src/test/cluster_unit/test_cluster_lmon.c | 12 + 10 files changed, 1215 insertions(+), 35 deletions(-) diff --git a/docs/reference/system-views.md b/docs/reference/system-views.md index a8799ac9ce..edcb92a622 100644 --- a/docs/reference/system-views.md +++ b/docs/reference/system-views.md @@ -29,12 +29,13 @@ Current 6.5 scope is conservative: includes native PostgreSQL label content followed by `CLUSTER_*` metadata lines. - If the node has declared peers, the mutating backup/restore-point - functions fail closed with a cluster backup SQLSTATE rather than - silently producing a partial backup. + functions require LMON-mediated peer ACKs. 
Missing ACKs, peer NAKs, + disconnected peers, or a changed peer set fail closed with a cluster + backup SQLSTATE rather than silently producing a partial backup. - The manifest records WAL thread, undo, transaction-table, SCN, and - control-file inclusion state for the proven local cut. A later - backup-set writer can extend the same contract to coordinated - multi-node copying without changing these view shapes. + control-file inclusion state for the proven cut. In a peer topology + the manifest is written only after every start-time peer has returned + STOP metadata for its WAL thread. ### `pg_stat_cluster_backup` diff --git a/docs/user-guide/configuration.md b/docs/user-guide/configuration.md index 26fac530fd..96cad0cc34 100644 --- a/docs/user-guide/configuration.md +++ b/docs/user-guide/configuration.md @@ -82,8 +82,11 @@ PITR surface. The current implementation is intentionally conservative. A single-node cluster can create a cluster manifest via `pg_cluster_backup_start()` / `pg_cluster_backup_stop()`. If declared -peers exist, cluster backup and restore-point mutation fail closed -instead of producing a partial backup or an unreachable PITR target. +peers exist, mutating backup and restore-point functions require the +LMON-mediated coordinator/peer ACK path to complete. Missing peer +ACKs, peer NAKs, disconnected peers, or topology changes during a +backup fail closed instead of producing a partial backup or an +unreachable PITR target. 
### `cluster.interconnect_tier` diff --git a/src/backend/cluster/cluster_backup.c b/src/backend/cluster/cluster_backup.c index 4976f4f27a..cf2ebd087e 100644 --- a/src/backend/cluster/cluster_backup.c +++ b/src/backend/cluster/cluster_backup.c @@ -32,6 +32,7 @@ #include "storage/lwlock.h" #include "storage/shmem.h" #include "utils/builtins.h" +#include "utils/elog.h" #include "utils/errcodes.h" #include "utils/memutils.h" #include "utils/pg_lsn.h" @@ -53,6 +54,9 @@ PG_FUNCTION_INFO_V1(cluster_get_pitr_status); #ifdef USE_PGRAC_CLUSTER +#include "cluster/cluster_ic_envelope.h" +#include "cluster/cluster_ic_router.h" +#include "cluster/cluster_lmon.h" #include "cluster/cluster_wal_thread.h" typedef struct ClusterBackupSharedState { @@ -63,12 +67,139 @@ typedef struct ClusterBackupSharedState { int restore_point_count; int restore_point_next; ClusterRestorePoint restore_points[CLUSTER_BACKUP_RESTORE_POINT_MAX]; + + uint64 next_request_id; + bool coordinator_send_pending; + ClusterBackupWireRequest coordinator_request; + uint8 coordinator_expected[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; + uint8 coordinator_backup_peers[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; + uint8 coordinator_acked[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; + uint8 coordinator_nacked[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; + ClusterBackupWireAck coordinator_acks[CLUSTER_MAX_NODES]; + ClusterBackupManifestThread coordinator_peer_threads[CLUSTER_MAX_NODES]; + SCN coordinator_peer_cut_scn[CLUSTER_MAX_NODES]; + + bool peer_command_pending; + ClusterBackupWireRequest peer_command; + bool peer_reply_pending; + int32 peer_reply_dest; + ClusterBackupWireAck peer_reply; } ClusterBackupSharedState; static ClusterBackupSharedState *cluster_backup_state = NULL; static BackupState *cluster_backup_session_state = NULL; static StringInfo cluster_backup_tablespace_map = NULL; static MemoryContext cluster_backup_context = NULL; +static BackupState *cluster_backup_lmon_state = NULL; +static StringInfo cluster_backup_lmon_tablespace_map = 
NULL; +static MemoryContext cluster_backup_lmon_context = NULL; + +static inline void +cluster_backup_bitmap_set(uint8 *bitmap, int node_id) +{ + if (bitmap == NULL || node_id < 0 || node_id >= CLUSTER_MAX_NODES) + return; + bitmap[node_id / 8] |= (uint8)(1u << (node_id % 8)); +} + +static inline bool +cluster_backup_bitmap_test(const uint8 *bitmap, int node_id) +{ + if (bitmap == NULL || node_id < 0 || node_id >= CLUSTER_MAX_NODES) + return false; + return (bitmap[node_id / 8] & (uint8)(1u << (node_id % 8))) != 0; +} + +static bool +cluster_backup_bitmap_all_acked(const uint8 *expected, const uint8 *acked) +{ + int i; + + for (i = 0; i < CLUSTER_BACKUP_NODE_BITMAP_BYTES; i++) { + if ((expected[i] & ~acked[i]) != 0) + return false; + } + return true; +} + +static bool +cluster_backup_bitmap_any_set(const uint8 *bitmap) +{ + int i; + + for (i = 0; i < CLUSTER_BACKUP_NODE_BITMAP_BYTES; i++) { + if (bitmap[i] != 0) + return true; + } + return false; +} + +static bool +cluster_backup_bitmap_equal(const uint8 *left, const uint8 *right) +{ + int i; + + if (left == NULL || right == NULL) + return false; + for (i = 0; i < CLUSTER_BACKUP_NODE_BITMAP_BYTES; i++) { + if (left[i] != right[i]) + return false; + } + return true; +} + +static int +cluster_backup_bitmap_first_missing(const uint8 *expected, const uint8 *acked, const uint8 *nacked) +{ + int i; + + for (i = 0; i < CLUSTER_MAX_NODES; i++) { + if (cluster_backup_bitmap_test(expected, i) && !cluster_backup_bitmap_test(acked, i) + && !cluster_backup_bitmap_test(nacked, i)) + return i; + } + return -1; +} + +static int +cluster_backup_bitmap_first_set(const uint8 *bitmap) +{ + int i; + + for (i = 0; i < CLUSTER_MAX_NODES; i++) { + if (cluster_backup_bitmap_test(bitmap, i)) + return i; + } + return -1; +} + +static uint16 +cluster_backup_local_thread_id(void) +{ + uint16 thread_id = cluster_wal_thread_id(); + + if (thread_id == XLP_THREAD_ID_LEGACY) + thread_id = 1; + return thread_id; +} + +static const char * 
+cluster_backup_wire_op_name(ClusterBackupWireOp op) +{ + switch (op) { + case CLUSTER_BACKUP_WIRE_OP_START: + return "start"; + case CLUSTER_BACKUP_WIRE_OP_STOP: + return "stop"; + case CLUSTER_BACKUP_WIRE_OP_ABORT: + return "abort"; + case CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT: + return "restore_point"; + case CLUSTER_BACKUP_WIRE_OP_NONE: + break; + } + return "unknown"; +} static const char * cluster_pitr_action_name(int action) @@ -130,11 +261,6 @@ cluster_backup_error_if_unavailable(const char *op) if (cluster_backup_state == NULL) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cluster backup shared state is not initialized"))); - if (cluster_conf_has_peers()) - ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("%s is not available for multi-node clusters yet", op), - errhint("Use a single-node cluster topology, or wait for the 6.5 " - "coordinator/backup-set writer to land."))); } static SCN @@ -148,6 +274,251 @@ cluster_backup_current_scn(void) return scn; } +typedef struct ClusterBackupCoordWaitResult { + bool ok; + bool timed_out; + int32 node_id; + ClusterBackupWireResult result; +} ClusterBackupCoordWaitResult; + +static void +cluster_backup_cleanup_session_context(void) +{ + cluster_backup_session_state = NULL; + cluster_backup_tablespace_map = NULL; + if (cluster_backup_context != NULL) { + MemoryContextDelete(cluster_backup_context); + cluster_backup_context = NULL; + } +} + +static void +cluster_backup_build_current_peer_bitmap(uint8 *bitmap) +{ + int i; + + MemSet(bitmap, 0, CLUSTER_BACKUP_NODE_BITMAP_BYTES); + for (i = 0; i < CLUSTER_MAX_NODES; i++) { + if (i == cluster_node_id) + continue; + if (cluster_conf_lookup_node(i) != NULL) + cluster_backup_bitmap_set(bitmap, i); + } +} + +static void +cluster_backup_init_wire_request(ClusterBackupWireRequest *request, ClusterBackupWireOp op, + uint64 request_id, const char *backup_id, + const char *restore_point_name, bool fast, bool waitforarchive, + SCN 
requested_scn) +{ + MemSet(request, 0, sizeof(*request)); + request->magic = CLUSTER_BACKUP_IC_MAGIC; + request->version = CLUSTER_BACKUP_IC_VERSION; + request->op = (uint16)op; + request->request_id = request_id; + request->coordinator_node_id = cluster_node_id; + request->fast = fast; + request->waitforarchive = waitforarchive; + request->requested_scn = requested_scn; + if (backup_id != NULL) + strlcpy(request->backup_id, backup_id, sizeof(request->backup_id)); + if (restore_point_name != NULL) + strlcpy(request->restore_point_name, restore_point_name, + sizeof(request->restore_point_name)); + cluster_backup_wire_request_compute_crc(request); +} + +static bool +cluster_backup_get_backup_peer_bitmap(uint8 *bitmap) +{ + bool have_peers; + + if (bitmap == NULL || cluster_backup_state == NULL) + return false; + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_SHARED); + memcpy(bitmap, cluster_backup_state->coordinator_backup_peers, + CLUSTER_BACKUP_NODE_BITMAP_BYTES); + have_peers = cluster_backup_bitmap_any_set(bitmap); + LWLockRelease(&cluster_backup_state->lock.lock); + return have_peers; +} + +static bool +cluster_backup_begin_coord_request(ClusterBackupWireOp op, const uint8 *expected_override, + const char *backup_id, const char *restore_point_name, bool fast, + bool waitforarchive, SCN requested_scn, uint64 *request_id) +{ + ClusterBackupWireRequest request; + uint8 expected[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; + uint64 id; + + if (request_id != NULL) + *request_id = 0; + if (cluster_backup_state == NULL) + return false; + + if (expected_override != NULL) + memcpy(expected, expected_override, sizeof(expected)); + else + cluster_backup_build_current_peer_bitmap(expected); + + if (!cluster_backup_bitmap_any_set(expected)) + return false; + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); + id = ++cluster_backup_state->next_request_id; + if (id == 0) + id = ++cluster_backup_state->next_request_id; + 
LWLockRelease(&cluster_backup_state->lock.lock); + + cluster_backup_init_wire_request(&request, op, id, backup_id, restore_point_name, fast, + waitforarchive, requested_scn); + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); + cluster_backup_state->coordinator_request = request; + memcpy(cluster_backup_state->coordinator_expected, expected, sizeof(expected)); + MemSet(cluster_backup_state->coordinator_acked, 0, + sizeof(cluster_backup_state->coordinator_acked)); + MemSet(cluster_backup_state->coordinator_nacked, 0, + sizeof(cluster_backup_state->coordinator_nacked)); + MemSet(cluster_backup_state->coordinator_acks, 0, + sizeof(cluster_backup_state->coordinator_acks)); + if (op == CLUSTER_BACKUP_WIRE_OP_START) { + memcpy(cluster_backup_state->coordinator_backup_peers, expected, sizeof(expected)); + MemSet(cluster_backup_state->coordinator_peer_threads, 0, + sizeof(cluster_backup_state->coordinator_peer_threads)); + MemSet(cluster_backup_state->coordinator_peer_cut_scn, 0, + sizeof(cluster_backup_state->coordinator_peer_cut_scn)); + } + cluster_backup_state->coordinator_send_pending = true; + LWLockRelease(&cluster_backup_state->lock.lock); + + if (request_id != NULL) + *request_id = id; + cluster_lmon_wakeup(); + return true; +} + +static bool +cluster_backup_wait_for_peer_acks(ClusterBackupWireOp op, uint64 request_id, + ClusterBackupCoordWaitResult *result) +{ + TimestampTz deadline; + + if (result != NULL) { + MemSet(result, 0, sizeof(*result)); + result->node_id = -1; + result->result = CLUSTER_BACKUP_WIRE_RESULT_OK; + } + + if (request_id == 0) + return true; + + deadline = GetCurrentTimestamp() + + (TimestampTz)cluster_recovery_merge_wait_timeout * INT64CONST(1000); + + for (;;) { + bool all_acked; + bool any_nacked; + int bad_node; + int missing_node; + ClusterBackupWireResult bad_result = CLUSTER_BACKUP_WIRE_RESULT_OK; + + CHECK_FOR_INTERRUPTS(); + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_SHARED); + if 
(cluster_backup_state->coordinator_request.request_id != request_id + || cluster_backup_state->coordinator_request.op != (uint16)op) { + LWLockRelease(&cluster_backup_state->lock.lock); + if (result != NULL) { + result->node_id = -1; + result->result = CLUSTER_BACKUP_WIRE_RESULT_BAD_REQUEST; + } + return false; + } + all_acked = cluster_backup_bitmap_all_acked(cluster_backup_state->coordinator_expected, + cluster_backup_state->coordinator_acked); + any_nacked = cluster_backup_bitmap_any_set(cluster_backup_state->coordinator_nacked); + bad_node = cluster_backup_bitmap_first_set(cluster_backup_state->coordinator_nacked); + if (bad_node >= 0) + bad_result + = (ClusterBackupWireResult)cluster_backup_state->coordinator_acks[bad_node].result; + missing_node = cluster_backup_bitmap_first_missing( + cluster_backup_state->coordinator_expected, cluster_backup_state->coordinator_acked, + cluster_backup_state->coordinator_nacked); + LWLockRelease(&cluster_backup_state->lock.lock); + + if (all_acked) { + if (result != NULL) + result->ok = true; + return true; + } + if (any_nacked) { + if (result != NULL) { + result->node_id = bad_node; + result->result = bad_result; + } + return false; + } + if (cluster_recovery_merge_wait_timeout <= 0 || GetCurrentTimestamp() >= deadline) { + if (result != NULL) { + result->timed_out = true; + result->node_id = missing_node; + } + return false; + } + + pg_usleep(10000L); + } +} + +static void +cluster_backup_raise_peer_failure(ClusterBackupWireOp op, + const ClusterBackupCoordWaitResult *result) +{ + const char *op_name = cluster_backup_wire_op_name(op); + int32 node_id = (result != NULL) ? 
result->node_id : -1; + + if (result != NULL && result->timed_out) + ereport(ERROR, + (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("cluster backup %s did not receive ACK from all peer nodes", op_name), + errdetail("First missing peer node: %d.", node_id))); + else + ereport(ERROR, + (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("cluster backup %s was rejected by a peer node", op_name), + errdetail("Peer node %d returned %s.", node_id, + cluster_backup_wire_result_name( + result != NULL ? result->result + : CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR)))); +} + +static void +cluster_backup_raise_coord_enqueue_failure(ClusterBackupWireOp op) +{ + ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("cluster backup %s could not enqueue a peer coordination request", + cluster_backup_wire_op_name(op)))); +} + +static void +cluster_backup_abort_peers_best_effort(const uint8 *expected, const char *backup_id) +{ + uint64 abort_request_id = 0; + ClusterBackupCoordWaitResult ignored; + + if (expected == NULL || !cluster_backup_bitmap_any_set(expected)) + return; + if (!cluster_backup_begin_coord_request(CLUSTER_BACKUP_WIRE_OP_ABORT, expected, backup_id, NULL, + false, false, InvalidScn, &abort_request_id)) + return; + (void)cluster_backup_wait_for_peer_acks(CLUSTER_BACKUP_WIRE_OP_ABORT, abort_request_id, + &ignored); +} + static void cluster_backup_update_start(const char *backup_id, const BackupState *state) { @@ -273,11 +644,9 @@ cluster_backup_fill_local_manifest(ClusterBackupManifest *manifest, const Backup SCN cut_scn) { ClusterBackupManifestThread thread; - uint16 thread_id = cluster_wal_thread_id(); + uint16 thread_id = cluster_backup_local_thread_id(); int thread_index; - if (thread_id == XLP_THREAD_ID_LEGACY) - thread_id = 1; thread_index = (int)thread_id - 1; cluster_backup_manifest_init(manifest, state->name); @@ -306,14 +675,140 @@ cluster_backup_fill_local_manifest(ClusterBackupManifest *manifest, const Backup if 
(!cluster_backup_manifest_set_thread(manifest, thread_index, &thread)) ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), errmsg("could not add local WAL thread to cluster backup manifest"))); +} + +static void +cluster_backup_seal_manifest_or_error(ClusterBackupManifest *manifest) +{ + ClusterBackupManifestReason reason; if (cluster_backup_manifest_checksums != CLUSTER_BACKUP_MANIFEST_CHECKSUM_CRC32C) ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), errmsg("cluster backup manifests require crc32c checksums"))); cluster_backup_manifest_seal(manifest); - if (cluster_backup_manifest_validate(manifest) != CLUSTER_BACKUP_MANIFEST_OK) + reason = cluster_backup_manifest_validate(manifest); + if (reason != CLUSTER_BACKUP_MANIFEST_OK) ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("cluster backup manifest failed self-validation"))); + errmsg("cluster backup manifest failed self-validation"), + errdetail("Manifest validation reason: %s.", + cluster_backup_manifest_reason_name(reason)))); +} + +static void +cluster_backup_add_peer_stop_threads(ClusterBackupManifest *manifest, uint64 stop_request_id, + SCN *thread_scn, XLogRecPtr *thread_lsn) +{ + int node_id; + + if (stop_request_id == 0) + return; + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_SHARED); + for (node_id = 0; node_id < CLUSTER_MAX_NODES; node_id++) { + ClusterBackupManifestThread thread; + ClusterBackupWireAck ack; + SCN cut_scn; + int thread_index; + + if (!cluster_backup_bitmap_test(cluster_backup_state->coordinator_expected, node_id)) + continue; + if (!cluster_backup_bitmap_test(cluster_backup_state->coordinator_acked, node_id)) { + LWLockRelease(&cluster_backup_state->lock.lock); + ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("cluster backup manifest is missing a peer STOP ACK"), + errdetail("Missing peer node: %d.", node_id))); + } + + ack = cluster_backup_state->coordinator_acks[node_id]; + thread = 
cluster_backup_state->coordinator_peer_threads[node_id]; + cut_scn = cluster_backup_state->coordinator_peer_cut_scn[node_id]; + + if (ack.request_id != stop_request_id || ack.op != CLUSTER_BACKUP_WIRE_OP_STOP + || ack.result != CLUSTER_BACKUP_WIRE_RESULT_OK || !thread.present + || ack.stop_cut_lsn == InvalidXLogRecPtr || !SCN_VALID(cut_scn)) { + LWLockRelease(&cluster_backup_state->lock.lock); + ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("cluster backup manifest has incomplete peer STOP metadata"), + errdetail("Peer node: %d.", node_id))); + } + + thread.stop_cut_lsn = ack.stop_cut_lsn; + thread_index = (int)thread.thread_id - 1; + if (thread_index < 0 || thread_index >= CLUSTER_MAX_NODES + || manifest->threads[thread_index].present) { + LWLockRelease(&cluster_backup_state->lock.lock); + ereport(ERROR, + (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("cluster backup manifest has duplicate or invalid WAL thread"), + errdetail("Peer node %d reported thread %u.", node_id, thread.thread_id))); + } + + if (!cluster_backup_manifest_set_thread(manifest, thread_index, &thread)) { + LWLockRelease(&cluster_backup_state->lock.lock); + ereport(ERROR, + (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("could not add peer WAL thread to cluster backup manifest"), + errdetail("Peer node %d reported thread %u.", node_id, thread.thread_id))); + } + + manifest->node_count++; + if (!SCN_VALID(manifest->consistent_scn) + || scn_time_cmp(cut_scn, manifest->consistent_scn) > 0) { + manifest->consistent_scn = cut_scn; + manifest->scn_durable_peak = cut_scn; + } + if (thread_scn != NULL) + thread_scn[thread_index] = cut_scn; + if (thread_lsn != NULL) + thread_lsn[thread_index] = ack.stop_cut_lsn; + } + LWLockRelease(&cluster_backup_state->lock.lock); +} + +static void +cluster_backup_add_peer_restore_point_acks(uint64 request_id, SCN *thread_scn, + XLogRecPtr *thread_lsn) +{ + int node_id; + + if (request_id == 0) + return; + + 
LWLockAcquire(&cluster_backup_state->lock.lock, LW_SHARED); + for (node_id = 0; node_id < CLUSTER_MAX_NODES; node_id++) { + ClusterBackupWireAck ack; + int thread_index; + + if (!cluster_backup_bitmap_test(cluster_backup_state->coordinator_expected, node_id)) + continue; + if (!cluster_backup_bitmap_test(cluster_backup_state->coordinator_acked, node_id)) { + LWLockRelease(&cluster_backup_state->lock.lock); + ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("cluster restore point is missing a peer ACK"), + errdetail("Missing peer node: %d.", node_id))); + } + + ack = cluster_backup_state->coordinator_acks[node_id]; + if (ack.request_id != request_id || ack.op != CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT + || ack.result != CLUSTER_BACKUP_WIRE_RESULT_OK || ack.stop_cut_lsn == InvalidXLogRecPtr + || !SCN_VALID(ack.cut_scn) || ack.thread_id == 0 || ack.thread_id > CLUSTER_MAX_NODES) { + LWLockRelease(&cluster_backup_state->lock.lock); + ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("cluster restore point has incomplete peer metadata"), + errdetail("Peer node: %d.", node_id))); + } + + thread_index = (int)ack.thread_id - 1; + if (thread_lsn[thread_index] != InvalidXLogRecPtr || SCN_VALID(thread_scn[thread_index])) { + LWLockRelease(&cluster_backup_state->lock.lock); + ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("cluster restore point has duplicate WAL thread metadata"), + errdetail("Peer node %d reported thread %u.", node_id, ack.thread_id))); + } + thread_lsn[thread_index] = ack.stop_cut_lsn; + thread_scn[thread_index] = ack.cut_scn; + } + LWLockRelease(&cluster_backup_state->lock.lock); } static char * @@ -335,6 +830,360 @@ cluster_backup_build_label(const BackupState *state, const ClusterBackupManifest return buf.data; } +static void +cluster_backup_init_wire_ack(ClusterBackupWireAck *ack, const ClusterBackupWireRequest *request, + ClusterBackupWireResult result) +{ + MemSet(ack, 0, sizeof(*ack)); + 
ack->magic = CLUSTER_BACKUP_IC_MAGIC; + ack->version = CLUSTER_BACKUP_IC_VERSION; + ack->op = request != NULL ? request->op : CLUSTER_BACKUP_WIRE_OP_NONE; + ack->result = (uint16)result; + ack->sender_node_id = cluster_node_id; + ack->thread_id = cluster_backup_local_thread_id(); + ack->request_id = request != NULL ? request->request_id : 0; +} + +static void +cluster_backup_lmon_reset_context(void) +{ + cluster_backup_lmon_state = NULL; + cluster_backup_lmon_tablespace_map = NULL; + if (cluster_backup_lmon_context != NULL) { + MemoryContextDelete(cluster_backup_lmon_context); + cluster_backup_lmon_context = NULL; + } +} + +static void +cluster_backup_lmon_prepare_context(void) +{ + MemoryContext oldcontext; + + if (cluster_backup_lmon_context == NULL) + cluster_backup_lmon_context = AllocSetContextCreate( + TopMemoryContext, "cluster backup lmon context", ALLOCSET_START_SMALL_SIZES); + else { + cluster_backup_lmon_state = NULL; + cluster_backup_lmon_tablespace_map = NULL; + MemoryContextReset(cluster_backup_lmon_context); + } + + oldcontext = MemoryContextSwitchTo(cluster_backup_lmon_context); + cluster_backup_lmon_state = (BackupState *)palloc0(sizeof(BackupState)); + cluster_backup_lmon_tablespace_map = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); +} + +static ClusterBackupWireAck +cluster_backup_lmon_execute_request(const ClusterBackupWireRequest *request) +{ /* Runs one coordinator request in this process and returns the CRC-stamped ack; errors thrown by the backup calls are caught and reported as EXECUTOR_ERROR. */ + ClusterBackupWireAck ack; + ClusterBackupWireResult result = CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR; + + cluster_backup_init_wire_ack(&ack, request, result); + + if (request == NULL || !cluster_backup_wire_request_valid(request) + || request->coordinator_node_id < 0 || request->coordinator_node_id >= CLUSTER_MAX_NODES) { + ack.result = CLUSTER_BACKUP_WIRE_RESULT_BAD_REQUEST; + cluster_backup_wire_ack_compute_crc(&ack); + return ack; + } + + PG_TRY(); + { + switch ((ClusterBackupWireOp)request->op) { + case CLUSTER_BACKUP_WIRE_OP_START: + if (request->backup_id[0] == '\0') + result =
CLUSTER_BACKUP_WIRE_RESULT_BAD_REQUEST; + else if (cluster_backup_lmon_state != NULL + || get_backup_status() == SESSION_BACKUP_RUNNING) + result = CLUSTER_BACKUP_WIRE_RESULT_BUSY; + else { + cluster_backup_lmon_prepare_context(); + register_persistent_abort_backup_handler(); + do_pg_backup_start(request->backup_id, request->fast, NULL, + cluster_backup_lmon_state, cluster_backup_lmon_tablespace_map); + ack.start_redo_lsn = cluster_backup_lmon_state->startpoint; + ack.checkpoint_lsn = cluster_backup_lmon_state->checkpointloc; + ack.timeline = cluster_backup_lmon_state->starttli; + result = CLUSTER_BACKUP_WIRE_RESULT_OK; + } + break; + + case CLUSTER_BACKUP_WIRE_OP_STOP: + if (cluster_backup_lmon_state == NULL || get_backup_status() != SESSION_BACKUP_RUNNING) + result = CLUSTER_BACKUP_WIRE_RESULT_NOT_IN_BACKUP; + else { + ack.start_redo_lsn = cluster_backup_lmon_state->startpoint; + ack.checkpoint_lsn = cluster_backup_lmon_state->checkpointloc; + do_pg_backup_stop(cluster_backup_lmon_state, request->waitforarchive); + ack.stop_cut_lsn = cluster_backup_lmon_state->stoppoint; + ack.cut_scn = cluster_backup_current_scn(); + ack.timeline = cluster_backup_lmon_state->stoptli; + cluster_backup_lmon_reset_context(); + result = CLUSTER_BACKUP_WIRE_RESULT_OK; + } + break; + + case CLUSTER_BACKUP_WIRE_OP_ABORT: + if (cluster_backup_lmon_state != NULL + || get_backup_status() == SESSION_BACKUP_RUNNING) { + do_pg_abort_backup(0, BoolGetDatum(false)); /* second argument is a Datum, so convert bool -> Datum (not the reverse) */ + cluster_backup_lmon_reset_context(); + } + result = CLUSTER_BACKUP_WIRE_RESULT_OK; + break; + + case CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT: + if (request->restore_point_name[0] == '\0') + result = CLUSTER_BACKUP_WIRE_RESULT_BAD_REQUEST; + else if (RecoveryInProgress() || !XLogIsNeeded()) + result = CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR; + else { + ack.stop_cut_lsn = XLogRestorePoint(request->restore_point_name); + ack.cut_scn = cluster_backup_current_scn(); + result = CLUSTER_BACKUP_WIRE_RESULT_OK; + } + break; + + case
CLUSTER_BACKUP_WIRE_OP_NONE: + result = CLUSTER_BACKUP_WIRE_RESULT_BAD_REQUEST; + break; + } + } + PG_CATCH(); + { + FlushErrorState(); /* NOTE(review): the error is discarded without being logged; consider EmitErrorReport() before flushing */ + if ((ClusterBackupWireOp)request->op == CLUSTER_BACKUP_WIRE_OP_START) + cluster_backup_lmon_reset_context(); + else if ((ClusterBackupWireOp)request->op == CLUSTER_BACKUP_WIRE_OP_STOP + && get_backup_status() != SESSION_BACKUP_RUNNING) + cluster_backup_lmon_reset_context(); + result = CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR; + } + PG_END_TRY(); + + ack.result = (uint16)result; + cluster_backup_wire_ack_compute_crc(&ack); + return ack; +} + +static void +cluster_backup_request_handler(const ClusterICEnvelope *env, const void *payload) +{ + const ClusterBackupWireRequest *request = (const ClusterBackupWireRequest *)payload; + + if (cluster_backup_state == NULL || env == NULL || payload == NULL) + return; + if (env->payload_length != sizeof(ClusterBackupWireRequest)) + return; + if (!cluster_backup_wire_request_valid(request)) + return; + if (request->coordinator_node_id != (int32)env->source_node_id) + return; + if (request->coordinator_node_id == cluster_node_id) + return; + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); + if (!cluster_backup_state->peer_command_pending && !cluster_backup_state->peer_reply_pending) { /* NOTE(review): a request that arrives while a command or reply is pending is dropped with no NAK -- confirm the coordinator is meant to find out only via its ACK wait */ + cluster_backup_state->peer_command = *request; + cluster_backup_state->peer_command_pending = true; + } + LWLockRelease(&cluster_backup_state->lock.lock); + cluster_lmon_wakeup(); +} + +static void +cluster_backup_ack_handler(const ClusterICEnvelope *env, const void *payload) +{ + const ClusterBackupWireAck *ack = (const ClusterBackupWireAck *)payload; + int32 node_id; + + if (cluster_backup_state == NULL || env == NULL || payload == NULL) + return; + if (env->payload_length != sizeof(ClusterBackupWireAck)) + return; + if (!cluster_backup_wire_ack_valid(ack)) + return; + if (ack->sender_node_id != (int32)env->source_node_id) + return; + node_id = ack->sender_node_id; + +
LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); + if (cluster_backup_state->coordinator_request.request_id == ack->request_id + && cluster_backup_state->coordinator_request.op == ack->op + && cluster_backup_bitmap_test(cluster_backup_state->coordinator_expected, node_id)) { + cluster_backup_state->coordinator_acks[node_id] = *ack; + if (ack->result == CLUSTER_BACKUP_WIRE_RESULT_OK) { + ClusterBackupManifestThread *thread + = &cluster_backup_state->coordinator_peer_threads[node_id]; + + cluster_backup_bitmap_set(cluster_backup_state->coordinator_acked, node_id); + if (ack->op == CLUSTER_BACKUP_WIRE_OP_START) { + MemSet(thread, 0, sizeof(*thread)); + thread->present = true; + thread->wal_included = true; + thread->undo_included = true; + thread->tt_included = true; + thread->thread_id = ack->thread_id; + thread->node_id = node_id; + thread->start_redo_lsn = ack->start_redo_lsn; + thread->checkpoint_lsn = ack->checkpoint_lsn; + thread->start_tli = ack->timeline; + } else if (ack->op == CLUSTER_BACKUP_WIRE_OP_STOP) { + if (!thread->present && ack->start_redo_lsn != InvalidXLogRecPtr + && ack->checkpoint_lsn != InvalidXLogRecPtr) { + thread->present = true; + thread->wal_included = true; + thread->undo_included = true; + thread->tt_included = true; + thread->thread_id = ack->thread_id; + thread->node_id = node_id; + thread->start_redo_lsn = ack->start_redo_lsn; + thread->checkpoint_lsn = ack->checkpoint_lsn; + thread->start_tli = ack->timeline; + } + thread->stop_cut_lsn = ack->stop_cut_lsn; + cluster_backup_state->coordinator_peer_cut_scn[node_id] = ack->cut_scn; + } + } else + cluster_backup_bitmap_set(cluster_backup_state->coordinator_nacked, node_id); + } + LWLockRelease(&cluster_backup_state->lock.lock); +} + +void +cluster_backup_register_ic_msg_types(void) +{ + const ClusterICMsgTypeInfo request_info = { + .msg_type = PGRAC_IC_MSG_BACKUP_REQUEST, + .name = "backup_request", + .allowed_producer_mask = CLUSTER_IC_PRODUCER_LMON, + .broadcast_ok = 
true, + .handler = cluster_backup_request_handler, + }; + const ClusterICMsgTypeInfo ack_info = { + .msg_type = PGRAC_IC_MSG_BACKUP_ACK, + .name = "backup_ack", + .allowed_producer_mask = CLUSTER_IC_PRODUCER_LMON, + .broadcast_ok = false, + .handler = cluster_backup_ack_handler, + }; + + cluster_ic_register_msg_type(&request_info); + cluster_ic_register_msg_type(&ack_info); +} + +static void +cluster_backup_record_send_nak(int32 node_id, const ClusterBackupWireRequest *request, + ClusterBackupWireResult result) +{ + ClusterBackupWireAck ack; + + if (node_id < 0 || node_id >= CLUSTER_MAX_NODES || request == NULL) + return; + + cluster_backup_init_wire_ack(&ack, request, result); + ack.sender_node_id = node_id; + cluster_backup_wire_ack_compute_crc(&ack); + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); + if (cluster_backup_state->coordinator_request.request_id == request->request_id + && cluster_backup_state->coordinator_request.op == request->op + && cluster_backup_bitmap_test(cluster_backup_state->coordinator_expected, node_id)) { + cluster_backup_state->coordinator_acks[node_id] = ack; + cluster_backup_bitmap_set(cluster_backup_state->coordinator_nacked, node_id); + } + LWLockRelease(&cluster_backup_state->lock.lock); +} + +static void +cluster_backup_lmon_send_coord_request(void) +{ + ClusterBackupWireRequest request; + uint8 expected[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; + ClusterICFanoutResult per_peer[CLUSTER_MAX_NODES]; + int node_id; + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); + if (!cluster_backup_state->coordinator_send_pending) { + LWLockRelease(&cluster_backup_state->lock.lock); + return; + } + request = cluster_backup_state->coordinator_request; + memcpy(expected, cluster_backup_state->coordinator_expected, sizeof(expected)); + cluster_backup_state->coordinator_send_pending = false; + LWLockRelease(&cluster_backup_state->lock.lock); + + cluster_ic_send_envelope_fanout(PGRAC_IC_MSG_BACKUP_REQUEST, &request, 
(uint32)sizeof(request), + per_peer); + for (node_id = 0; node_id < CLUSTER_MAX_NODES; node_id++) { + if (!cluster_backup_bitmap_test(expected, node_id)) + continue; + if (per_peer[node_id] == CLUSTER_IC_FANOUT_HARD_ERROR + || per_peer[node_id] == CLUSTER_IC_FANOUT_PEER_DOWN) + cluster_backup_record_send_nak(node_id, &request, + CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR); + } +} + +static void +cluster_backup_lmon_send_peer_reply(void) +{ + ClusterBackupWireAck reply; + int32 dest; + bool have_reply; + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); + have_reply = cluster_backup_state->peer_reply_pending; + reply = cluster_backup_state->peer_reply; + dest = cluster_backup_state->peer_reply_dest; + cluster_backup_state->peer_reply_pending = false; + LWLockRelease(&cluster_backup_state->lock.lock); + + if (!have_reply || dest < 0) + return; + (void)cluster_ic_send_envelope(PGRAC_IC_MSG_BACKUP_ACK, dest, &reply, (uint32)sizeof(reply)); +} + +static void +cluster_backup_lmon_process_peer_command(void) +{ + ClusterBackupWireRequest request; + ClusterBackupWireAck reply; + bool have_command; + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); + have_command = cluster_backup_state->peer_command_pending; + request = cluster_backup_state->peer_command; + cluster_backup_state->peer_command_pending = false; + LWLockRelease(&cluster_backup_state->lock.lock); + + if (!have_command) + return; + + reply = cluster_backup_lmon_execute_request(&request); + + LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); + cluster_backup_state->peer_reply = reply; + cluster_backup_state->peer_reply_dest = request.coordinator_node_id; + cluster_backup_state->peer_reply_pending = true; + LWLockRelease(&cluster_backup_state->lock.lock); +} + +void +cluster_backup_lmon_tick(void) +{ + if (cluster_backup_state == NULL || !cluster_enabled) + return; + + cluster_backup_lmon_send_peer_reply(); + cluster_backup_lmon_send_coord_request(); + 
cluster_backup_lmon_process_peer_command(); + cluster_backup_lmon_send_peer_reply(); +} + Datum pg_cluster_backup_start(PG_FUNCTION_ARGS) { @@ -347,6 +1196,10 @@ pg_cluster_backup_start(PG_FUNCTION_ARGS) char *backupidstr; SessionBackupState status; MemoryContext oldcontext; + uint8 start_peers[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; + bool have_peers; + uint64 start_request_id = 0; + ClusterBackupCoordWaitResult wait_result; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -367,6 +1220,9 @@ pg_cluster_backup_start(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_IN_PROGRESS), errmsg("a backup is already in progress in this session"))); + cluster_backup_build_current_peer_bitmap(start_peers); + have_peers = cluster_backup_bitmap_any_set(start_peers); + if (cluster_backup_context == NULL) cluster_backup_context = AllocSetContextCreate(TopMemoryContext, "cluster backup context", ALLOCSET_START_SMALL_SIZES); @@ -386,6 +1242,27 @@ pg_cluster_backup_start(PG_FUNCTION_ARGS) cluster_backup_tablespace_map); cluster_backup_update_start(backupidstr, cluster_backup_session_state); + if (have_peers) { + if (!cluster_backup_begin_coord_request(CLUSTER_BACKUP_WIRE_OP_START, start_peers, + backupidstr, NULL, fast, false, InvalidScn, + &start_request_id)) { + if (get_backup_status() == SESSION_BACKUP_RUNNING) + do_pg_abort_backup(0, DatumGetBool(false)); + cluster_backup_mark_native_stopped(NULL); + cluster_backup_cleanup_session_context(); + cluster_backup_raise_coord_enqueue_failure(CLUSTER_BACKUP_WIRE_OP_START); + } + if (!cluster_backup_wait_for_peer_acks(CLUSTER_BACKUP_WIRE_OP_START, start_request_id, + &wait_result)) { + cluster_backup_abort_peers_best_effort(start_peers, backupidstr); + if (get_backup_status() == SESSION_BACKUP_RUNNING) + do_pg_abort_backup(0, DatumGetBool(false)); + cluster_backup_mark_native_stopped(NULL); + cluster_backup_cleanup_session_context(); + cluster_backup_raise_peer_failure(CLUSTER_BACKUP_WIRE_OP_START, 
&wait_result); + } + } + values[0] = CStringGetTextDatum(backupidstr); values[1] = LSNGetDatum(cluster_backup_session_state->startpoint); values[2] = LSNGetDatum(cluster_backup_session_state->checkpointloc); @@ -410,6 +1287,11 @@ pg_cluster_backup_stop(PG_FUNCTION_ARGS) SCN thread_scn[CLUSTER_MAX_NODES]; int thread_index; uint16 thread_id; + uint8 stop_peers[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; + uint8 current_peers[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; + bool have_peers; + uint64 stop_request_id = 0; + ClusterBackupCoordWaitResult wait_result; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -426,21 +1308,46 @@ pg_cluster_backup_stop(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cluster backup session state is missing"))); + have_peers = cluster_backup_get_backup_peer_bitmap(stop_peers); + if (have_peers) { + cluster_backup_build_current_peer_bitmap(current_peers); + if (!cluster_backup_bitmap_equal(stop_peers, current_peers)) + ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), + errmsg("cluster backup peer topology changed during backup"), + errhint("Stop or abort the backup and retry after cluster membership " + "settles."))); + if (!cluster_backup_begin_coord_request(CLUSTER_BACKUP_WIRE_OP_STOP, stop_peers, + cluster_backup_session_state->name, NULL, false, + waitforarchive, InvalidScn, &stop_request_id)) + cluster_backup_raise_coord_enqueue_failure(CLUSTER_BACKUP_WIRE_OP_STOP); + } + do_pg_backup_stop(cluster_backup_session_state, waitforarchive); cluster_backup_mark_native_stopped(cluster_backup_session_state); cut_scn = cluster_backup_current_scn(); - cluster_backup_fill_local_manifest(&manifest, cluster_backup_session_state, cut_scn); - backup_label = cluster_backup_build_label(cluster_backup_session_state, &manifest, cut_scn); - cluster_backup_update_stop(cluster_backup_session_state, &manifest, cut_scn); MemSet(thread_lsn, 0, sizeof(thread_lsn)); MemSet(thread_scn, 0, 
sizeof(thread_scn)); - thread_id = cluster_wal_thread_id(); - if (thread_id == XLP_THREAD_ID_LEGACY) - thread_id = 1; + thread_id = cluster_backup_local_thread_id(); thread_index = (int)thread_id - 1; thread_lsn[thread_index] = cluster_backup_session_state->stoppoint; thread_scn[thread_index] = cut_scn; + + if (have_peers + && !cluster_backup_wait_for_peer_acks(CLUSTER_BACKUP_WIRE_OP_STOP, stop_request_id, + &wait_result)) { + cluster_backup_cleanup_session_context(); + cluster_backup_raise_peer_failure(CLUSTER_BACKUP_WIRE_OP_STOP, &wait_result); + } + + cluster_backup_fill_local_manifest(&manifest, cluster_backup_session_state, cut_scn); + if (have_peers) + cluster_backup_add_peer_stop_threads(&manifest, stop_request_id, thread_scn, thread_lsn); + cut_scn = manifest.consistent_scn; + cluster_backup_seal_manifest_or_error(&manifest); + backup_label = cluster_backup_build_label(cluster_backup_session_state, &manifest, cut_scn); + cluster_backup_update_stop(cluster_backup_session_state, &manifest, cut_scn); + if (cluster_restore_point_build(&point, manifest.backup_id, thread_scn, thread_lsn, CLUSTER_MAX_NODES, true, true, manifest.incarnation) == CLUSTER_RESTORE_POINT_CUT_OK) { @@ -454,10 +1361,7 @@ pg_cluster_backup_stop(PG_FUNCTION_ARGS) values[3] = CStringGetTextDatum(backup_label); pfree(backup_label); - cluster_backup_session_state = NULL; - cluster_backup_tablespace_map = NULL; - MemoryContextDelete(cluster_backup_context); - cluster_backup_context = NULL; + cluster_backup_cleanup_session_context(); PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); } @@ -479,6 +1383,10 @@ pg_cluster_create_restore_point(PG_FUNCTION_ARGS) int thread_index; ClusterRestorePoint point; ClusterRestorePointCutReason reason; + uint8 restore_point_peers[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; + bool have_peers; + uint64 restore_request_id = 0; + ClusterBackupCoordWaitResult wait_result; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ 
-504,17 +1412,30 @@ pg_cluster_create_restore_point(PG_FUNCTION_ARGS) errmsg("cluster restore point name is too long"), errdetail("Maximum length is %d bytes.", CLUSTER_RESTORE_POINT_NAME_MAX - 1))); + cluster_backup_build_current_peer_bitmap(restore_point_peers); + have_peers = cluster_backup_bitmap_any_set(restore_point_peers); + if (have_peers) { + if (!cluster_backup_begin_coord_request(CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT, + restore_point_peers, NULL, restore_name_str, false, + false, InvalidScn, &restore_request_id)) + cluster_backup_raise_coord_enqueue_failure(CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT); + } + restorepoint = XLogRestorePoint(restore_name_str); cut_scn = cluster_backup_current_scn(); MemSet(thread_scn, 0, sizeof(thread_scn)); MemSet(thread_lsn, 0, sizeof(thread_lsn)); - thread_id = cluster_wal_thread_id(); - if (thread_id == XLP_THREAD_ID_LEGACY) - thread_id = 1; + thread_id = cluster_backup_local_thread_id(); thread_index = (int)thread_id - 1; thread_scn[thread_index] = cut_scn; thread_lsn[thread_index] = restorepoint; + if (have_peers + && !cluster_backup_wait_for_peer_acks(CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT, + restore_request_id, &wait_result)) + cluster_backup_raise_peer_failure(CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT, &wait_result); + if (have_peers) + cluster_backup_add_peer_restore_point_acks(restore_request_id, thread_scn, thread_lsn); reason = cluster_restore_point_build(&point, restore_name_str, thread_scn, thread_lsn, CLUSTER_MAX_NODES, true, true, 0); if (reason != CLUSTER_RESTORE_POINT_CUT_OK) @@ -523,6 +1444,7 @@ pg_cluster_create_restore_point(PG_FUNCTION_ARGS) cluster_restore_point_cut_reason_name(reason)))); point.created_at = GetCurrentTimestamp(); cluster_backup_add_restore_point(&point); + cut_scn = point.cut_scn; values[0] = CStringGetTextDatum(restore_name_str); values[1] = Int64GetDatum((int64)cut_scn); diff --git a/src/backend/cluster/cluster_backup_manifest.c b/src/backend/cluster/cluster_backup_manifest.c index 
7fbaa1fcfe..641d456659 100644 --- a/src/backend/cluster/cluster_backup_manifest.c +++ b/src/backend/cluster/cluster_backup_manifest.c @@ -328,3 +328,112 @@ cluster_restore_compat_reason_name(ClusterRestoreCompatibilityReason reason) } return "unknown"; } + +void +cluster_backup_wire_request_compute_crc(ClusterBackupWireRequest *request) +{ + pg_crc32c crc; + + if (request == NULL) + return; + + request->crc = 0; + INIT_CRC32C(crc); + COMP_CRC32C(crc, request, offsetof(ClusterBackupWireRequest, crc)); + FIN_CRC32C(crc); + request->crc = crc; +} + +bool +cluster_backup_wire_request_valid(const ClusterBackupWireRequest *request) +{ + ClusterBackupWireRequest copy; + + if (request == NULL) + return false; + if (request->magic != CLUSTER_BACKUP_IC_MAGIC || request->version != CLUSTER_BACKUP_IC_VERSION) + return false; + if (request->op <= CLUSTER_BACKUP_WIRE_OP_NONE + || request->op > CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT) + return false; + if (request->request_id == 0) + return false; + if (request->coordinator_node_id < 0 || request->coordinator_node_id >= CLUSTER_MAX_NODES) + return false; + if (request->backup_id[CLUSTER_BACKUP_ID_MAX - 1] != '\0') + return false; + if (request->restore_point_name[CLUSTER_RESTORE_POINT_NAME_MAX - 1] != '\0') + return false; + + copy = *request; + cluster_backup_wire_request_compute_crc(&copy); + return copy.crc == request->crc; +} + +void +cluster_backup_wire_ack_compute_crc(ClusterBackupWireAck *ack) +{ + pg_crc32c crc; + + if (ack == NULL) + return; + + ack->crc = 0; + INIT_CRC32C(crc); + COMP_CRC32C(crc, ack, offsetof(ClusterBackupWireAck, crc)); + FIN_CRC32C(crc); + ack->crc = crc; +} + +bool +cluster_backup_wire_ack_valid(const ClusterBackupWireAck *ack) +{ + ClusterBackupWireAck copy; + + if (ack == NULL) + return false; + if (ack->magic != CLUSTER_BACKUP_IC_MAGIC || ack->version != CLUSTER_BACKUP_IC_VERSION) + return false; + if (ack->op <= CLUSTER_BACKUP_WIRE_OP_NONE || ack->op > CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT) + return
false; + if (ack->result > CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR) + return false; + if (ack->sender_node_id < 0 || ack->sender_node_id >= CLUSTER_MAX_NODES) + return false; + if (ack->request_id == 0) + return false; + if (ack->result == CLUSTER_BACKUP_WIRE_RESULT_OK) { + if (ack->thread_id == 0 || ack->thread_id > CLUSTER_MAX_NODES) + return false; + if (ack->op == CLUSTER_BACKUP_WIRE_OP_START + && (ack->start_redo_lsn == InvalidXLogRecPtr + || ack->checkpoint_lsn == InvalidXLogRecPtr)) + return false; + if ((ack->op == CLUSTER_BACKUP_WIRE_OP_STOP + || ack->op == CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT) + && (ack->stop_cut_lsn == InvalidXLogRecPtr || !SCN_VALID(ack->cut_scn))) + return false; + } + + copy = *ack; + cluster_backup_wire_ack_compute_crc(&copy); + return copy.crc == ack->crc; +} + +const char * +cluster_backup_wire_result_name(ClusterBackupWireResult result) +{ + switch (result) { + case CLUSTER_BACKUP_WIRE_RESULT_OK: + return "ok"; + case CLUSTER_BACKUP_WIRE_RESULT_BUSY: + return "busy"; + case CLUSTER_BACKUP_WIRE_RESULT_BAD_REQUEST: + return "bad_request"; + case CLUSTER_BACKUP_WIRE_RESULT_NOT_IN_BACKUP: + return "not_in_backup"; + case CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR: + return "executor_error"; + } + return "unknown"; +} diff --git a/src/backend/cluster/cluster_lmon.c b/src/backend/cluster/cluster_lmon.c index 1c4dae9ba3..cb6f4b1b69 100644 --- a/src/backend/cluster/cluster_lmon.c +++ b/src/backend/cluster/cluster_lmon.c @@ -55,6 +55,7 @@ #include "utils/ps_status.h" #include "utils/timestamp.h" +#include "cluster/cluster_backup.h" /* cluster_backup_register_ic_msg_types + lmon_tick (spec-6.5) */ #include "cluster/cluster_clean_leave.h" /* cluster_clean_leave_register_ic_msg_types (spec-5.13 D8) */ #include "cluster/cluster_node_remove.h" /* cluster_node_remove_lmon_tick + register (spec-5.18 D9/D10) */ #include "cluster/cluster_conf.h" @@ -425,6 +426,16 @@ cluster_lmon_shmem_init(void) node_remove_registered = true; } } + /* spec-6.5 D1/D4:
register cluster backup coordinator/peer request + ACK + * messages. Backends enqueue requests in shmem; LMON owns IC fanout. */ + { + static bool backup_registered = false; + + if (!backup_registered) { + cluster_backup_register_ic_msg_types(); + backup_registered = true; + } + } } @@ -1024,6 +1035,7 @@ LmonMain(void) /* spec-3.2 D6: LMON drain cross-node TT status hint outbound. * Fire-and-forget; L172 family — only LMON owns tier1 fds. */ cluster_tt_status_hint_drain_outbound(); + cluster_backup_lmon_tick(); /* * spec-2.34 D6 (HC93 leg a): TTL sweep of the GCS block @@ -1581,6 +1593,7 @@ LmonMain(void) /* spec-3.2 D6: LMON drain cross-node TT status hint outbound. * Fire-and-forget; L172 family — only LMON owns tier1 fds. */ cluster_tt_status_hint_drain_outbound(); + cluster_backup_lmon_tick(); /* spec-2.34 D6 (HC93 leg a): TTL sweep GCS block dedup HTAB. */ cluster_gcs_block_dedup_sweep_expired(GetCurrentTimestamp()); diff --git a/src/include/cluster/cluster_backup.h b/src/include/cluster/cluster_backup.h index eee33ae756..4bb663adf1 100644 --- a/src/include/cluster/cluster_backup.h +++ b/src/include/cluster/cluster_backup.h @@ -32,6 +32,9 @@ #define CLUSTER_BACKUP_MANIFEST_MAGIC 0x5047424BU /* "PGBK" */ #define CLUSTER_BACKUP_MANIFEST_VERSION 1 #define CLUSTER_BACKUP_RESTORE_POINT_MAX 16 +#define CLUSTER_BACKUP_NODE_BITMAP_BYTES (CLUSTER_MAX_NODES / 8) +#define CLUSTER_BACKUP_IC_MAGIC 0x50424249U /* "PBBI" */ +#define CLUSTER_BACKUP_IC_VERSION 1 typedef enum ClusterBackupManifestReason { CLUSTER_BACKUP_MANIFEST_OK = 0, @@ -73,6 +76,22 @@ typedef enum ClusterRestorePointCutReason { CLUSTER_RESTORE_POINT_CUT_BAD_THREAD } ClusterRestorePointCutReason; +typedef enum ClusterBackupWireOp { + CLUSTER_BACKUP_WIRE_OP_NONE = 0, + CLUSTER_BACKUP_WIRE_OP_START, + CLUSTER_BACKUP_WIRE_OP_STOP, + CLUSTER_BACKUP_WIRE_OP_ABORT, + CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT +} ClusterBackupWireOp; + +typedef enum ClusterBackupWireResult { + CLUSTER_BACKUP_WIRE_RESULT_OK = 0, + 
CLUSTER_BACKUP_WIRE_RESULT_BUSY, + CLUSTER_BACKUP_WIRE_RESULT_BAD_REQUEST, + CLUSTER_BACKUP_WIRE_RESULT_NOT_IN_BACKUP, + CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR +} ClusterBackupWireResult; + typedef struct ClusterBackupManifestThread { bool present; bool wal_included; @@ -128,6 +147,38 @@ typedef struct ClusterBackupStatus { TimestampTz stopped_at; } ClusterBackupStatus; +typedef struct ClusterBackupWireRequest { + uint32 magic; + uint16 version; + uint16 op; + uint64 request_id; + int32 coordinator_node_id; + bool fast; + bool waitforarchive; + uint16 _pad0; + SCN requested_scn; + char backup_id[CLUSTER_BACKUP_ID_MAX]; + char restore_point_name[CLUSTER_RESTORE_POINT_NAME_MAX]; + uint32 crc; +} ClusterBackupWireRequest; + +typedef struct ClusterBackupWireAck { + uint32 magic; + uint16 version; + uint16 op; + uint16 result; + int32 sender_node_id; + uint16 thread_id; + uint16 _pad0; + uint64 request_id; + XLogRecPtr start_redo_lsn; + XLogRecPtr checkpoint_lsn; + XLogRecPtr stop_cut_lsn; + SCN cut_scn; + TimeLineID timeline; + uint32 crc; +} ClusterBackupWireAck; + extern void cluster_backup_manifest_init(ClusterBackupManifest *manifest, const char *backup_id); extern bool cluster_backup_manifest_set_thread(ClusterBackupManifest *manifest, int thread_index, const ClusterBackupManifestThread *thread); @@ -154,10 +205,18 @@ cluster_backup_manifest_compatible(const ClusterBackupManifest *manifest, uint32 uint32 current_storage_id, uint32 expected_node_count); extern const char *cluster_restore_compat_reason_name(ClusterRestoreCompatibilityReason reason); +extern void cluster_backup_wire_request_compute_crc(ClusterBackupWireRequest *request); +extern bool cluster_backup_wire_request_valid(const ClusterBackupWireRequest *request); +extern void cluster_backup_wire_ack_compute_crc(ClusterBackupWireAck *ack); +extern bool cluster_backup_wire_ack_valid(const ClusterBackupWireAck *ack); +extern const char *cluster_backup_wire_result_name(ClusterBackupWireResult result); + 
#ifndef FRONTEND extern Size cluster_backup_shmem_size(void); extern void cluster_backup_shmem_init(void); extern void cluster_backup_shmem_register(void); +extern void cluster_backup_register_ic_msg_types(void); +extern void cluster_backup_lmon_tick(void); extern void cluster_backup_get_status(ClusterBackupStatus *out); extern bool cluster_backup_get_last_manifest(ClusterBackupManifest *out); extern int cluster_backup_get_restore_points(ClusterRestorePoint *out, int max_points); diff --git a/src/include/cluster/cluster_ic_envelope.h b/src/include/cluster/cluster_ic_envelope.h index 1f13f04a38..3ce935da88 100644 --- a/src/include/cluster/cluster_ic_envelope.h +++ b/src/include/cluster/cluster_ic_envelope.h @@ -230,11 +230,18 @@ typedef enum ClusterICMsgType { * survivors (ClusterNodeRemoveAnnouncePayload: coordinator + target + remove_epoch + * removal_event_id). Survivors drop their refs to the removed node + reply * REMOVE_CLEANUP_ACK. */ - PGRAC_IC_MSG_REMOVE_CLEANUP_ACK = 32 /* PGRAC: spec-5.18 D10 — survivor -> removal + PGRAC_IC_MSG_REMOVE_CLEANUP_ACK = 32, /* PGRAC: spec-5.18 D10 — survivor -> removal * coordinator (ClusterNodeRemoveCleanupAckPayload): "I dropped all refs to the removed * node + accepted the permanent remaster"; sets the survivor's bit in the coordinator's * cleanup ACK barrier. */ - /* values 33..255 available for future sub-spec; never reuse 0..32 */ + PGRAC_IC_MSG_BACKUP_REQUEST = 33, /* PGRAC: spec-6.5 D1/D4 — backup coordinator -> + * peers (ClusterBackupWireRequest): START / STOP / ABORT / RESTORE_POINT request. + * LMON-mediated; peer LMON executes the local native backup/restore-point leg and + * replies with BACKUP_ACK. */ + PGRAC_IC_MSG_BACKUP_ACK = 34 /* PGRAC: spec-6.5 D1/D4 — peer -> backup + * coordinator (ClusterBackupWireAck): local thread REDO/checkpoint/stop-cut + * metadata or fail-closed NAK reason. 
*/ + /* values 35..255 available for future sub-spec; never reuse 0..34 */ } ClusterICMsgType; diff --git a/src/test/cluster_tap/t/332_cluster_backup_pitr.pl b/src/test/cluster_tap/t/332_cluster_backup_pitr.pl index bda2d7d5cd..e3d6947da1 100644 --- a/src/test/cluster_tap/t/332_cluster_backup_pitr.pl +++ b/src/test/cluster_tap/t/332_cluster_backup_pitr.pl @@ -109,9 +109,9 @@ my ($ret, $out, $err) = $peer_node->psql('postgres', "\\set VERBOSITY verbose\nSELECT * FROM pg_cluster_backup_start('partial', true)"); -isnt($ret, 0, 'L8 peer topology rejects cluster backup start'); +isnt($ret, 0, 'L8 peer topology requires complete backup ACKs'); like($err, qr/53RAD|cluster_backup_incomplete/, - 'L8 peer topology fails closed with cluster_backup_incomplete'); + 'L8 missing peer ACK fails closed with cluster_backup_incomplete'); is($peer_node->safe_psql('postgres', q{SELECT CASE WHEN in_progress THEN 't' ELSE 'f' END FROM pg_stat_cluster_backup}), diff --git a/src/test/cluster_unit/test_cluster_backup.c b/src/test/cluster_unit/test_cluster_backup.c index 4a53ecd523..23776220d9 100644 --- a/src/test/cluster_unit/test_cluster_backup.c +++ b/src/test/cluster_unit/test_cluster_backup.c @@ -278,10 +278,62 @@ UT_TEST(test_restore_compatibility_rejects_mismatches) CLUSTER_RESTORE_COMPAT_MANIFEST); } +UT_TEST(test_backup_wire_request_crc_and_bounds) +{ + ClusterBackupWireRequest req; + + memset(&req, 0, sizeof(req)); + req.magic = CLUSTER_BACKUP_IC_MAGIC; + req.version = CLUSTER_BACKUP_IC_VERSION; + req.op = CLUSTER_BACKUP_WIRE_OP_START; + req.request_id = 42; + req.coordinator_node_id = 0; + strlcpy(req.backup_id, "b-wire", sizeof(req.backup_id)); + cluster_backup_wire_request_compute_crc(&req); + UT_ASSERT(cluster_backup_wire_request_valid(&req)); + + req.request_id = 43; + UT_ASSERT(!cluster_backup_wire_request_valid(&req)); + req.request_id = 42; + cluster_backup_wire_request_compute_crc(&req); + req.backup_id[CLUSTER_BACKUP_ID_MAX - 1] = 'x'; + 
UT_ASSERT(!cluster_backup_wire_request_valid(&req)); +} + +UT_TEST(test_backup_wire_ack_fail_closed_validation) +{ + ClusterBackupWireAck ack; + + memset(&ack, 0, sizeof(ack)); + ack.magic = CLUSTER_BACKUP_IC_MAGIC; + ack.version = CLUSTER_BACKUP_IC_VERSION; + ack.op = CLUSTER_BACKUP_WIRE_OP_STOP; + ack.result = CLUSTER_BACKUP_WIRE_RESULT_OK; + ack.sender_node_id = 1; + ack.thread_id = 2; + ack.request_id = 99; + ack.stop_cut_lsn = 500; + ack.cut_scn = test_scn(30); + cluster_backup_wire_ack_compute_crc(&ack); + UT_ASSERT(cluster_backup_wire_ack_valid(&ack)); + + ack.stop_cut_lsn = InvalidXLogRecPtr; + cluster_backup_wire_ack_compute_crc(&ack); + UT_ASSERT(!cluster_backup_wire_ack_valid(&ack)); + + ack.result = CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR; + ack.thread_id = 0; + ack.cut_scn = InvalidScn; + cluster_backup_wire_ack_compute_crc(&ack); + UT_ASSERT(cluster_backup_wire_ack_valid(&ack)); + UT_ASSERT_STR_EQ(cluster_backup_wire_result_name(CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR), + "executor_error"); +} + int main(void) { - UT_PLAN(10); + UT_PLAN(12); UT_RUN(test_manifest_validates_complete_single_thread); UT_RUN(test_manifest_rejects_missing_control_wal_undo_tt); UT_RUN(test_manifest_rejects_bad_scn_lsn_count_and_crc); @@ -292,6 +344,8 @@ main(void) UT_RUN(test_pitr_resolves_latest_reachable_restore_point); UT_RUN(test_pitr_fail_closed_reasons); UT_RUN(test_restore_compatibility_rejects_mismatches); + UT_RUN(test_backup_wire_request_crc_and_bounds); + UT_RUN(test_backup_wire_ack_fail_closed_validation); UT_DONE(); return ut_failed_count == 0 ? 
0 : 1; } diff --git a/src/test/cluster_unit/test_cluster_lmon.c b/src/test/cluster_unit/test_cluster_lmon.c index 3b36fae5b2..d33e7af507 100644 --- a/src/test/cluster_unit/test_cluster_lmon.c +++ b/src/test/cluster_unit/test_cluster_lmon.c @@ -427,6 +427,18 @@ void cluster_node_remove_register_ic_msg_types(void) {} +/* spec-6.5 D1/D4 stubs: cluster_lmon registers and ticks the backup + * coordinator/peer ACK path, but this standalone unit binary intentionally + * does not link cluster_backup.o or backend backup symbols. */ +void cluster_backup_register_ic_msg_types(void); +void +cluster_backup_register_ic_msg_types(void) +{} +void cluster_backup_lmon_tick(void); +void +cluster_backup_lmon_tick(void) +{} + /* spec-2.2 D5 LMON drive references cluster_conf_lookup_node + cluster_node_id. */ const struct ClusterNodeInfo * cluster_conf_lookup_node(int32 node_id pg_attribute_unused()) From c13ec816442c170d2b43a35b0a7caef85a015024 Mon Sep 17 00:00:00 2001 From: SqlRush Date: Wed, 1 Jul 2026 07:44:25 +0800 Subject: [PATCH 8/9] fix(cluster): clear static analysis findings --- src/backend/cluster/cluster_backup_manifest.c | 4 ++-- src/backend/cluster/cluster_ic_tier1.c | 1 + src/backend/cluster/cluster_reconfig.c | 9 +++++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/backend/cluster/cluster_backup_manifest.c b/src/backend/cluster/cluster_backup_manifest.c index 641d456659..31ed209093 100644 --- a/src/backend/cluster/cluster_backup_manifest.c +++ b/src/backend/cluster/cluster_backup_manifest.c @@ -353,7 +353,7 @@ cluster_backup_wire_request_valid(const ClusterBackupWireRequest *request) return false; if (request->magic != CLUSTER_BACKUP_IC_MAGIC || request->version != CLUSTER_BACKUP_IC_VERSION) return false; - if (request->op <= CLUSTER_BACKUP_WIRE_OP_NONE + if (request->op == CLUSTER_BACKUP_WIRE_OP_NONE || request->op > CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT) return false; if (request->request_id == 0) @@ -394,7 +394,7 @@ 
cluster_backup_wire_ack_valid(const ClusterBackupWireAck *ack) return false; if (ack->magic != CLUSTER_BACKUP_IC_MAGIC || ack->version != CLUSTER_BACKUP_IC_VERSION) return false; - if (ack->op <= CLUSTER_BACKUP_WIRE_OP_NONE || ack->op > CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT) + if (ack->op == CLUSTER_BACKUP_WIRE_OP_NONE || ack->op > CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT) return false; if (ack->result > CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR) return false; diff --git a/src/backend/cluster/cluster_ic_tier1.c b/src/backend/cluster/cluster_ic_tier1.c index a07a12837c..4288936966 100644 --- a/src/backend/cluster/cluster_ic_tier1.c +++ b/src/backend/cluster/cluster_ic_tier1.c @@ -1477,6 +1477,7 @@ cluster_ic_tier1_continue_hello_recv(int anon_slot, int peer_fd, int32 *out_lear const char *self_name = (ClusterConfShmem != NULL) ? ClusterConfShmem->cluster_name : "(no-conf)"; + memset(&peer_sa, 0, sizeof(peer_sa)); if (getpeername(peer_fd, (struct sockaddr *)&peer_sa, &peer_sa_len) == 0) { if (inet_ntop(AF_INET, &peer_sa.sin_addr, peer_ip, sizeof(peer_ip)) == NULL) strcpy(peer_ip, "?"); diff --git a/src/backend/cluster/cluster_reconfig.c b/src/backend/cluster/cluster_reconfig.c index af747c1154..12974307ab 100644 --- a/src/backend/cluster/cluster_reconfig.c +++ b/src/backend/cluster/cluster_reconfig.c @@ -1156,21 +1156,22 @@ cluster_reconfig_join_publish_proven(uint64 admitted_epoch) static void cluster_reconfig_drive_joins(int coordinator) { + ClusterReconfigState *state = ReconfigShmem; uint8 join_bitmap[CLUSTER_RECONFIG_DEAD_BITMAP_BYTES]; uint8 pending_snapshot[CLUSTER_RECONFIG_DEAD_BITMAP_BYTES]; uint64 joiner_incarnations[CLUSTER_MAX_NODES]; int n_join; int i; - if (ReconfigShmem == NULL) + if (state == NULL) return; /* Phase-1 detection + a snapshot of the current pending set, under the lock * (compute_join_bitmap reads membership_state). 
*/ - LWLockAcquire(&ReconfigShmem->lock, LW_SHARED); + LWLockAcquire(&state->lock, LW_SHARED); n_join = cluster_reconfig_compute_join_bitmap(join_bitmap); - memcpy(pending_snapshot, ReconfigShmem->pending_join_bitmap, sizeof(pending_snapshot)); - LWLockRelease(&ReconfigShmem->lock); + memcpy(pending_snapshot, state->pending_join_bitmap, sizeof(pending_snapshot)); + LWLockRelease(&state->lock); if (n_join > 0) { memset(joiner_incarnations, 0, sizeof(joiner_incarnations)); From 4afc589bc27c0b2d1d4c295f54b93947ad88798a Mon Sep 17 00:00:00 2001 From: SqlRush Date: Wed, 1 Jul 2026 10:42:01 +0800 Subject: [PATCH 9/9] fix(cluster): fail closed unfinished backup pitr paths --- docs/reference/system-views.md | 22 +- docs/user-guide/configuration.md | 15 +- src/backend/cluster/cluster_backup.c | 865 +----------------- .../cluster_tap/t/332_cluster_backup_pitr.pl | 103 ++- 4 files changed, 126 insertions(+), 879 deletions(-) diff --git a/docs/reference/system-views.md b/docs/reference/system-views.md index edcb92a622..14ff51a43f 100644 --- a/docs/reference/system-views.md +++ b/docs/reference/system-views.md @@ -24,18 +24,16 @@ state used by `pg_cluster_backup_start`, `pg_cluster_backup_stop`, and Current 6.5 scope is conservative: -- A single-node cluster can start and stop a cluster-aware physical - backup. The backup label returned by `pg_cluster_backup_stop` - includes native PostgreSQL label content followed by `CLUSTER_*` - metadata lines. -- If the node has declared peers, the mutating backup/restore-point - functions require LMON-mediated peer ACKs. Missing ACKs, peer NAKs, - disconnected peers, or a changed peer set fail closed with a cluster - backup SQLSTATE rather than silently producing a partial backup. -- The manifest records WAL thread, undo, transaction-table, SCN, and - control-file inclusion state for the proven cut. In a peer topology - the manifest is written only after every start-time peer has returned - STOP metadata for its WAL thread. 
+- The views, catalog entries, manifest validators, PITR target resolver, + shared-memory state, and IC wire format are present as substrate. +- Mutating physical backup and restore-point entry points fail closed + with `feature_not_supported` until the cluster physical capture, + durable WAL pin, restore-point commit-drain barrier, restore, and PITR + replay paths are implemented. +- No manifest is published unless WAL, undo, transaction-table, SCN, and + control-file inclusion are proven. The current substrate therefore + refuses to create a manifest instead of reporting a partial or unsound + backup as complete. ### `pg_stat_cluster_backup` diff --git a/docs/user-guide/configuration.md b/docs/user-guide/configuration.md index 96cad0cc34..af41d7e614 100644 --- a/docs/user-guide/configuration.md +++ b/docs/user-guide/configuration.md @@ -79,14 +79,13 @@ PITR surface. | `cluster.backup_parallel_channels` | integer | `1` | sighup | Reserved copy-channel capacity for the future backup-set writer. | | `cluster.backup_manifest_checksums` | enum | `crc32c` | sighup | Manifest checksums are mandatory; unchecked manifests are not supported. | -The current implementation is intentionally conservative. A -single-node cluster can create a cluster manifest via -`pg_cluster_backup_start()` / `pg_cluster_backup_stop()`. If declared -peers exist, mutating backup and restore-point functions require the -LMON-mediated coordinator/peer ACK path to complete. Missing peer -ACKs, peer NAKs, disconnected peers, or topology changes during a -backup fail closed instead of producing a partial backup or an -unreachable PITR target. +The current implementation is intentionally conservative. These GUCs +expose the 6.5 catalog and state surface, but mutating cluster physical +backup and restore-point entry points fail closed with +`feature_not_supported` until the physical capture, durable WAL pin, +commit-drain restore-point barrier, restore, and PITR replay paths are +implemented. 
The server refuses to publish a manifest or restore point +when those proofs are absent. ### `cluster.interconnect_tier` diff --git a/src/backend/cluster/cluster_backup.c b/src/backend/cluster/cluster_backup.c index cf2ebd087e..7688b7922c 100644 --- a/src/backend/cluster/cluster_backup.c +++ b/src/backend/cluster/cluster_backup.c @@ -110,69 +110,6 @@ cluster_backup_bitmap_test(const uint8 *bitmap, int node_id) return (bitmap[node_id / 8] & (uint8)(1u << (node_id % 8))) != 0; } -static bool -cluster_backup_bitmap_all_acked(const uint8 *expected, const uint8 *acked) -{ - int i; - - for (i = 0; i < CLUSTER_BACKUP_NODE_BITMAP_BYTES; i++) { - if ((expected[i] & ~acked[i]) != 0) - return false; - } - return true; -} - -static bool -cluster_backup_bitmap_any_set(const uint8 *bitmap) -{ - int i; - - for (i = 0; i < CLUSTER_BACKUP_NODE_BITMAP_BYTES; i++) { - if (bitmap[i] != 0) - return true; - } - return false; -} - -static bool -cluster_backup_bitmap_equal(const uint8 *left, const uint8 *right) -{ - int i; - - if (left == NULL || right == NULL) - return false; - for (i = 0; i < CLUSTER_BACKUP_NODE_BITMAP_BYTES; i++) { - if (left[i] != right[i]) - return false; - } - return true; -} - -static int -cluster_backup_bitmap_first_missing(const uint8 *expected, const uint8 *acked, const uint8 *nacked) -{ - int i; - - for (i = 0; i < CLUSTER_MAX_NODES; i++) { - if (cluster_backup_bitmap_test(expected, i) && !cluster_backup_bitmap_test(acked, i) - && !cluster_backup_bitmap_test(nacked, i)) - return i; - } - return -1; -} - -static int -cluster_backup_bitmap_first_set(const uint8 *bitmap) -{ - int i; - - for (i = 0; i < CLUSTER_MAX_NODES; i++) { - if (cluster_backup_bitmap_test(bitmap, i)) - return i; - } - return -1; -} - static uint16 cluster_backup_local_thread_id(void) { @@ -183,24 +120,6 @@ cluster_backup_local_thread_id(void) return thread_id; } -static const char * -cluster_backup_wire_op_name(ClusterBackupWireOp op) -{ - switch (op) { - case CLUSTER_BACKUP_WIRE_OP_START: - 
return "start"; - case CLUSTER_BACKUP_WIRE_OP_STOP: - return "stop"; - case CLUSTER_BACKUP_WIRE_OP_ABORT: - return "abort"; - case CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT: - return "restore_point"; - case CLUSTER_BACKUP_WIRE_OP_NONE: - break; - } - return "unknown"; -} - static const char * cluster_pitr_action_name(int action) { @@ -249,6 +168,9 @@ cluster_backup_shmem_register(void) cluster_shmem_register_region(&cluster_backup_region); } +static void cluster_backup_cleanup_session_context(void); +static void cluster_backup_mark_native_stopped(const BackupState *state); + static void cluster_backup_error_if_unavailable(const char *op) { @@ -263,15 +185,25 @@ cluster_backup_error_if_unavailable(const char *op) errmsg("cluster backup shared state is not initialized"))); } -static SCN -cluster_backup_current_scn(void) +static void +cluster_backup_fail_closed_unimplemented(const char *op, const char *missing) { - SCN scn; + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("%s is not available in the current cluster backup substrate", op), + errdetail("%s is required before this operation can return a sound " + "cluster backup/PITR result.", + missing), + errhint("Refusing to create an unsound cluster restore point or " + "backup manifest."))); +} - scn = cluster_scn_current(); - if (!SCN_VALID(scn)) - scn = cluster_scn_advance(); - return scn; +static void +cluster_backup_abort_local_session_if_running(void) +{ + if (get_backup_status() == SESSION_BACKUP_RUNNING) + do_pg_abort_backup(0, DatumGetBool(false)); + cluster_backup_mark_native_stopped(NULL); + cluster_backup_cleanup_session_context(); } typedef struct ClusterBackupCoordWaitResult { @@ -292,267 +224,6 @@ cluster_backup_cleanup_session_context(void) } } -static void -cluster_backup_build_current_peer_bitmap(uint8 *bitmap) -{ - int i; - - MemSet(bitmap, 0, CLUSTER_BACKUP_NODE_BITMAP_BYTES); - for (i = 0; i < CLUSTER_MAX_NODES; i++) { - if (i == cluster_node_id) - continue; - if 
(cluster_conf_lookup_node(i) != NULL) - cluster_backup_bitmap_set(bitmap, i); - } -} - -static void -cluster_backup_init_wire_request(ClusterBackupWireRequest *request, ClusterBackupWireOp op, - uint64 request_id, const char *backup_id, - const char *restore_point_name, bool fast, bool waitforarchive, - SCN requested_scn) -{ - MemSet(request, 0, sizeof(*request)); - request->magic = CLUSTER_BACKUP_IC_MAGIC; - request->version = CLUSTER_BACKUP_IC_VERSION; - request->op = (uint16)op; - request->request_id = request_id; - request->coordinator_node_id = cluster_node_id; - request->fast = fast; - request->waitforarchive = waitforarchive; - request->requested_scn = requested_scn; - if (backup_id != NULL) - strlcpy(request->backup_id, backup_id, sizeof(request->backup_id)); - if (restore_point_name != NULL) - strlcpy(request->restore_point_name, restore_point_name, - sizeof(request->restore_point_name)); - cluster_backup_wire_request_compute_crc(request); -} - -static bool -cluster_backup_get_backup_peer_bitmap(uint8 *bitmap) -{ - bool have_peers; - - if (bitmap == NULL || cluster_backup_state == NULL) - return false; - - LWLockAcquire(&cluster_backup_state->lock.lock, LW_SHARED); - memcpy(bitmap, cluster_backup_state->coordinator_backup_peers, - CLUSTER_BACKUP_NODE_BITMAP_BYTES); - have_peers = cluster_backup_bitmap_any_set(bitmap); - LWLockRelease(&cluster_backup_state->lock.lock); - return have_peers; -} - -static bool -cluster_backup_begin_coord_request(ClusterBackupWireOp op, const uint8 *expected_override, - const char *backup_id, const char *restore_point_name, bool fast, - bool waitforarchive, SCN requested_scn, uint64 *request_id) -{ - ClusterBackupWireRequest request; - uint8 expected[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; - uint64 id; - - if (request_id != NULL) - *request_id = 0; - if (cluster_backup_state == NULL) - return false; - - if (expected_override != NULL) - memcpy(expected, expected_override, sizeof(expected)); - else - 
cluster_backup_build_current_peer_bitmap(expected); - - if (!cluster_backup_bitmap_any_set(expected)) - return false; - - LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); - id = ++cluster_backup_state->next_request_id; - if (id == 0) - id = ++cluster_backup_state->next_request_id; - LWLockRelease(&cluster_backup_state->lock.lock); - - cluster_backup_init_wire_request(&request, op, id, backup_id, restore_point_name, fast, - waitforarchive, requested_scn); - - LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); - cluster_backup_state->coordinator_request = request; - memcpy(cluster_backup_state->coordinator_expected, expected, sizeof(expected)); - MemSet(cluster_backup_state->coordinator_acked, 0, - sizeof(cluster_backup_state->coordinator_acked)); - MemSet(cluster_backup_state->coordinator_nacked, 0, - sizeof(cluster_backup_state->coordinator_nacked)); - MemSet(cluster_backup_state->coordinator_acks, 0, - sizeof(cluster_backup_state->coordinator_acks)); - if (op == CLUSTER_BACKUP_WIRE_OP_START) { - memcpy(cluster_backup_state->coordinator_backup_peers, expected, sizeof(expected)); - MemSet(cluster_backup_state->coordinator_peer_threads, 0, - sizeof(cluster_backup_state->coordinator_peer_threads)); - MemSet(cluster_backup_state->coordinator_peer_cut_scn, 0, - sizeof(cluster_backup_state->coordinator_peer_cut_scn)); - } - cluster_backup_state->coordinator_send_pending = true; - LWLockRelease(&cluster_backup_state->lock.lock); - - if (request_id != NULL) - *request_id = id; - cluster_lmon_wakeup(); - return true; -} - -static bool -cluster_backup_wait_for_peer_acks(ClusterBackupWireOp op, uint64 request_id, - ClusterBackupCoordWaitResult *result) -{ - TimestampTz deadline; - - if (result != NULL) { - MemSet(result, 0, sizeof(*result)); - result->node_id = -1; - result->result = CLUSTER_BACKUP_WIRE_RESULT_OK; - } - - if (request_id == 0) - return true; - - deadline = GetCurrentTimestamp() - + (TimestampTz)cluster_recovery_merge_wait_timeout * 
INT64CONST(1000); - - for (;;) { - bool all_acked; - bool any_nacked; - int bad_node; - int missing_node; - ClusterBackupWireResult bad_result = CLUSTER_BACKUP_WIRE_RESULT_OK; - - CHECK_FOR_INTERRUPTS(); - - LWLockAcquire(&cluster_backup_state->lock.lock, LW_SHARED); - if (cluster_backup_state->coordinator_request.request_id != request_id - || cluster_backup_state->coordinator_request.op != (uint16)op) { - LWLockRelease(&cluster_backup_state->lock.lock); - if (result != NULL) { - result->node_id = -1; - result->result = CLUSTER_BACKUP_WIRE_RESULT_BAD_REQUEST; - } - return false; - } - all_acked = cluster_backup_bitmap_all_acked(cluster_backup_state->coordinator_expected, - cluster_backup_state->coordinator_acked); - any_nacked = cluster_backup_bitmap_any_set(cluster_backup_state->coordinator_nacked); - bad_node = cluster_backup_bitmap_first_set(cluster_backup_state->coordinator_nacked); - if (bad_node >= 0) - bad_result - = (ClusterBackupWireResult)cluster_backup_state->coordinator_acks[bad_node].result; - missing_node = cluster_backup_bitmap_first_missing( - cluster_backup_state->coordinator_expected, cluster_backup_state->coordinator_acked, - cluster_backup_state->coordinator_nacked); - LWLockRelease(&cluster_backup_state->lock.lock); - - if (all_acked) { - if (result != NULL) - result->ok = true; - return true; - } - if (any_nacked) { - if (result != NULL) { - result->node_id = bad_node; - result->result = bad_result; - } - return false; - } - if (cluster_recovery_merge_wait_timeout <= 0 || GetCurrentTimestamp() >= deadline) { - if (result != NULL) { - result->timed_out = true; - result->node_id = missing_node; - } - return false; - } - - pg_usleep(10000L); - } -} - -static void -cluster_backup_raise_peer_failure(ClusterBackupWireOp op, - const ClusterBackupCoordWaitResult *result) -{ - const char *op_name = cluster_backup_wire_op_name(op); - int32 node_id = (result != NULL) ? 
result->node_id : -1; - - if (result != NULL && result->timed_out) - ereport(ERROR, - (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("cluster backup %s did not receive ACK from all peer nodes", op_name), - errdetail("First missing peer node: %d.", node_id))); - else - ereport(ERROR, - (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("cluster backup %s was rejected by a peer node", op_name), - errdetail("Peer node %d returned %s.", node_id, - cluster_backup_wire_result_name( - result != NULL ? result->result - : CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR)))); -} - -static void -cluster_backup_raise_coord_enqueue_failure(ClusterBackupWireOp op) -{ - ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("cluster backup %s could not enqueue a peer coordination request", - cluster_backup_wire_op_name(op)))); -} - -static void -cluster_backup_abort_peers_best_effort(const uint8 *expected, const char *backup_id) -{ - uint64 abort_request_id = 0; - ClusterBackupCoordWaitResult ignored; - - if (expected == NULL || !cluster_backup_bitmap_any_set(expected)) - return; - if (!cluster_backup_begin_coord_request(CLUSTER_BACKUP_WIRE_OP_ABORT, expected, backup_id, NULL, - false, false, InvalidScn, &abort_request_id)) - return; - (void)cluster_backup_wait_for_peer_acks(CLUSTER_BACKUP_WIRE_OP_ABORT, abort_request_id, - &ignored); -} - -static void -cluster_backup_update_start(const char *backup_id, const BackupState *state) -{ - ClusterBackupStatus status; - - MemSet(&status, 0, sizeof(status)); - status.in_progress = true; - strlcpy(status.backup_id, backup_id, sizeof(status.backup_id)); - status.coordinator_node_id = cluster_node_id; - status.start_redo_lsn = state->startpoint; - status.checkpoint_lsn = state->checkpointloc; - status.start_tli = state->starttli; - status.started_at = GetCurrentTimestamp(); - - LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); - cluster_backup_state->status = status; - 
LWLockRelease(&cluster_backup_state->lock.lock); -} - -static void -cluster_backup_update_stop(const BackupState *state, const ClusterBackupManifest *manifest, - SCN cut_scn) -{ - LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); - cluster_backup_state->status.in_progress = false; - cluster_backup_state->status.stop_cut_lsn = state->stoppoint; - cluster_backup_state->status.consistent_scn = cut_scn; - cluster_backup_state->status.manifest_crc = manifest->manifest_crc; - cluster_backup_state->status.stopped_at = GetCurrentTimestamp(); - cluster_backup_state->last_manifest = *manifest; - cluster_backup_state->have_manifest = true; - LWLockRelease(&cluster_backup_state->lock.lock); -} - static void cluster_backup_mark_native_stopped(const BackupState *state) { @@ -564,24 +235,6 @@ cluster_backup_mark_native_stopped(const BackupState *state) LWLockRelease(&cluster_backup_state->lock.lock); } -static void -cluster_backup_add_restore_point(const ClusterRestorePoint *point) -{ - int slot; - - if (point == NULL || !point->present) - return; - - LWLockAcquire(&cluster_backup_state->lock.lock, LW_EXCLUSIVE); - slot = cluster_backup_state->restore_point_next; - cluster_backup_state->restore_points[slot] = *point; - cluster_backup_state->restore_point_next - = (cluster_backup_state->restore_point_next + 1) % CLUSTER_BACKUP_RESTORE_POINT_MAX; - if (cluster_backup_state->restore_point_count < CLUSTER_BACKUP_RESTORE_POINT_MAX) - cluster_backup_state->restore_point_count++; - LWLockRelease(&cluster_backup_state->lock.lock); -} - void cluster_backup_get_status(ClusterBackupStatus *out) { @@ -639,197 +292,6 @@ cluster_backup_get_restore_points(ClusterRestorePoint *out, int max_points) return count; } -static void -cluster_backup_fill_local_manifest(ClusterBackupManifest *manifest, const BackupState *state, - SCN cut_scn) -{ - ClusterBackupManifestThread thread; - uint16 thread_id = cluster_backup_local_thread_id(); - int thread_index; - - thread_index = (int)thread_id - 
1; - - cluster_backup_manifest_init(manifest, state->name); - manifest->consistent_scn = cut_scn; - manifest->scn_durable_peak = cut_scn; - manifest->timeline = state->stoptli; - manifest->catversion = CATALOG_VERSION_NO; - manifest->incarnation = 0; - manifest->backend_storage_id = (uint32)cluster_shared_storage_backend; - manifest->node_count = 1; - manifest->control_included = true; - manifest->voting_included = false; - - MemSet(&thread, 0, sizeof(thread)); - thread.present = true; - thread.wal_included = true; - thread.undo_included = true; - thread.tt_included = true; - thread.thread_id = thread_id; - thread.node_id = cluster_node_id; - thread.start_redo_lsn = state->startpoint; - thread.checkpoint_lsn = state->checkpointloc; - thread.start_tli = state->starttli; - thread.stop_cut_lsn = state->stoppoint; - - if (!cluster_backup_manifest_set_thread(manifest, thread_index, &thread)) - ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("could not add local WAL thread to cluster backup manifest"))); -} - -static void -cluster_backup_seal_manifest_or_error(ClusterBackupManifest *manifest) -{ - ClusterBackupManifestReason reason; - - if (cluster_backup_manifest_checksums != CLUSTER_BACKUP_MANIFEST_CHECKSUM_CRC32C) - ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("cluster backup manifests require crc32c checksums"))); - cluster_backup_manifest_seal(manifest); - reason = cluster_backup_manifest_validate(manifest); - if (reason != CLUSTER_BACKUP_MANIFEST_OK) - ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("cluster backup manifest failed self-validation"), - errdetail("Manifest validation reason: %s.", - cluster_backup_manifest_reason_name(reason)))); -} - -static void -cluster_backup_add_peer_stop_threads(ClusterBackupManifest *manifest, uint64 stop_request_id, - SCN *thread_scn, XLogRecPtr *thread_lsn) -{ - int node_id; - - if (stop_request_id == 0) - return; - - 
LWLockAcquire(&cluster_backup_state->lock.lock, LW_SHARED); - for (node_id = 0; node_id < CLUSTER_MAX_NODES; node_id++) { - ClusterBackupManifestThread thread; - ClusterBackupWireAck ack; - SCN cut_scn; - int thread_index; - - if (!cluster_backup_bitmap_test(cluster_backup_state->coordinator_expected, node_id)) - continue; - if (!cluster_backup_bitmap_test(cluster_backup_state->coordinator_acked, node_id)) { - LWLockRelease(&cluster_backup_state->lock.lock); - ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("cluster backup manifest is missing a peer STOP ACK"), - errdetail("Missing peer node: %d.", node_id))); - } - - ack = cluster_backup_state->coordinator_acks[node_id]; - thread = cluster_backup_state->coordinator_peer_threads[node_id]; - cut_scn = cluster_backup_state->coordinator_peer_cut_scn[node_id]; - - if (ack.request_id != stop_request_id || ack.op != CLUSTER_BACKUP_WIRE_OP_STOP - || ack.result != CLUSTER_BACKUP_WIRE_RESULT_OK || !thread.present - || ack.stop_cut_lsn == InvalidXLogRecPtr || !SCN_VALID(cut_scn)) { - LWLockRelease(&cluster_backup_state->lock.lock); - ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("cluster backup manifest has incomplete peer STOP metadata"), - errdetail("Peer node: %d.", node_id))); - } - - thread.stop_cut_lsn = ack.stop_cut_lsn; - thread_index = (int)thread.thread_id - 1; - if (thread_index < 0 || thread_index >= CLUSTER_MAX_NODES - || manifest->threads[thread_index].present) { - LWLockRelease(&cluster_backup_state->lock.lock); - ereport(ERROR, - (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("cluster backup manifest has duplicate or invalid WAL thread"), - errdetail("Peer node %d reported thread %u.", node_id, thread.thread_id))); - } - - if (!cluster_backup_manifest_set_thread(manifest, thread_index, &thread)) { - LWLockRelease(&cluster_backup_state->lock.lock); - ereport(ERROR, - (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("could not add peer WAL thread to 
cluster backup manifest"), - errdetail("Peer node %d reported thread %u.", node_id, thread.thread_id))); - } - - manifest->node_count++; - if (!SCN_VALID(manifest->consistent_scn) - || scn_time_cmp(cut_scn, manifest->consistent_scn) > 0) { - manifest->consistent_scn = cut_scn; - manifest->scn_durable_peak = cut_scn; - } - if (thread_scn != NULL) - thread_scn[thread_index] = cut_scn; - if (thread_lsn != NULL) - thread_lsn[thread_index] = ack.stop_cut_lsn; - } - LWLockRelease(&cluster_backup_state->lock.lock); -} - -static void -cluster_backup_add_peer_restore_point_acks(uint64 request_id, SCN *thread_scn, - XLogRecPtr *thread_lsn) -{ - int node_id; - - if (request_id == 0) - return; - - LWLockAcquire(&cluster_backup_state->lock.lock, LW_SHARED); - for (node_id = 0; node_id < CLUSTER_MAX_NODES; node_id++) { - ClusterBackupWireAck ack; - int thread_index; - - if (!cluster_backup_bitmap_test(cluster_backup_state->coordinator_expected, node_id)) - continue; - if (!cluster_backup_bitmap_test(cluster_backup_state->coordinator_acked, node_id)) { - LWLockRelease(&cluster_backup_state->lock.lock); - ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("cluster restore point is missing a peer ACK"), - errdetail("Missing peer node: %d.", node_id))); - } - - ack = cluster_backup_state->coordinator_acks[node_id]; - if (ack.request_id != request_id || ack.op != CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT - || ack.result != CLUSTER_BACKUP_WIRE_RESULT_OK || ack.stop_cut_lsn == InvalidXLogRecPtr - || !SCN_VALID(ack.cut_scn) || ack.thread_id == 0 || ack.thread_id > CLUSTER_MAX_NODES) { - LWLockRelease(&cluster_backup_state->lock.lock); - ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("cluster restore point has incomplete peer metadata"), - errdetail("Peer node: %d.", node_id))); - } - - thread_index = (int)ack.thread_id - 1; - if (thread_lsn[thread_index] != InvalidXLogRecPtr || SCN_VALID(thread_scn[thread_index])) { - 
LWLockRelease(&cluster_backup_state->lock.lock); - ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("cluster restore point has duplicate WAL thread metadata"), - errdetail("Peer node %d reported thread %u.", node_id, ack.thread_id))); - } - thread_lsn[thread_index] = ack.stop_cut_lsn; - thread_scn[thread_index] = ack.cut_scn; - } - LWLockRelease(&cluster_backup_state->lock.lock); -} - -static char * -cluster_backup_build_label(const BackupState *state, const ClusterBackupManifest *manifest, - SCN cut_scn) -{ - StringInfoData buf; - char *native; - - native = build_backup_content((BackupState *)state, false); - initStringInfo(&buf); - appendStringInfoString(&buf, native); - appendStringInfo(&buf, "CLUSTER_BACKUP_ID: %s\n", manifest->backup_id); - appendStringInfo(&buf, "CLUSTER_CONSISTENT_SCN: " UINT64_FORMAT "\n", (uint64)cut_scn); - appendStringInfo(&buf, "CLUSTER_MANIFEST_CRC32C: %u\n", manifest->manifest_crc); - appendStringInfo(&buf, "CLUSTER_NODE_COUNT: %u\n", manifest->node_count); - appendStringInfo(&buf, "CLUSTER_THREAD_COUNT: %u\n", manifest->thread_count); - pfree(native); - return buf.data; -} - static void cluster_backup_init_wire_ack(ClusterBackupWireAck *ack, const ClusterBackupWireRequest *request, ClusterBackupWireResult result) @@ -855,26 +317,6 @@ cluster_backup_lmon_reset_context(void) } } -static void -cluster_backup_lmon_prepare_context(void) -{ - MemoryContext oldcontext; - - if (cluster_backup_lmon_context == NULL) - cluster_backup_lmon_context = AllocSetContextCreate( - TopMemoryContext, "cluster backup lmon context", ALLOCSET_START_SMALL_SIZES); - else { - cluster_backup_lmon_state = NULL; - cluster_backup_lmon_tablespace_map = NULL; - MemoryContextReset(cluster_backup_lmon_context); - } - - oldcontext = MemoryContextSwitchTo(cluster_backup_lmon_context); - cluster_backup_lmon_state = (BackupState *)palloc0(sizeof(BackupState)); - cluster_backup_lmon_tablespace_map = makeStringInfo(); - MemoryContextSwitchTo(oldcontext); 
-} - static ClusterBackupWireAck cluster_backup_lmon_execute_request(const ClusterBackupWireRequest *request) { @@ -899,30 +341,17 @@ cluster_backup_lmon_execute_request(const ClusterBackupWireRequest *request) else if (cluster_backup_lmon_state != NULL || get_backup_status() == SESSION_BACKUP_RUNNING) result = CLUSTER_BACKUP_WIRE_RESULT_BUSY; - else { - cluster_backup_lmon_prepare_context(); - register_persistent_abort_backup_handler(); - do_pg_backup_start(request->backup_id, request->fast, NULL, - cluster_backup_lmon_state, cluster_backup_lmon_tablespace_map); - ack.start_redo_lsn = cluster_backup_lmon_state->startpoint; - ack.checkpoint_lsn = cluster_backup_lmon_state->checkpointloc; - ack.timeline = cluster_backup_lmon_state->starttli; - result = CLUSTER_BACKUP_WIRE_RESULT_OK; - } + else + result = CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR; break; case CLUSTER_BACKUP_WIRE_OP_STOP: if (cluster_backup_lmon_state == NULL || get_backup_status() != SESSION_BACKUP_RUNNING) result = CLUSTER_BACKUP_WIRE_RESULT_NOT_IN_BACKUP; else { - ack.start_redo_lsn = cluster_backup_lmon_state->startpoint; - ack.checkpoint_lsn = cluster_backup_lmon_state->checkpointloc; - do_pg_backup_stop(cluster_backup_lmon_state, request->waitforarchive); - ack.stop_cut_lsn = cluster_backup_lmon_state->stoppoint; - ack.cut_scn = cluster_backup_current_scn(); - ack.timeline = cluster_backup_lmon_state->stoptli; + do_pg_abort_backup(0, DatumGetBool(false)); cluster_backup_lmon_reset_context(); - result = CLUSTER_BACKUP_WIRE_RESULT_OK; + result = CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR; } break; @@ -938,13 +367,8 @@ cluster_backup_lmon_execute_request(const ClusterBackupWireRequest *request) case CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT: if (request->restore_point_name[0] == '\0') result = CLUSTER_BACKUP_WIRE_RESULT_BAD_REQUEST; - else if (RecoveryInProgress() || !XLogIsNeeded()) + else result = CLUSTER_BACKUP_WIRE_RESULT_EXECUTOR_ERROR; - else { - ack.stop_cut_lsn = 
XLogRestorePoint(request->restore_point_name); - ack.cut_scn = cluster_backup_current_scn(); - result = CLUSTER_BACKUP_WIRE_RESULT_OK; - } break; case CLUSTER_BACKUP_WIRE_OP_NONE: @@ -1023,9 +447,9 @@ cluster_backup_ack_handler(const ClusterICEnvelope *env, const void *payload) if (ack->op == CLUSTER_BACKUP_WIRE_OP_START) { MemSet(thread, 0, sizeof(*thread)); thread->present = true; - thread->wal_included = true; - thread->undo_included = true; - thread->tt_included = true; + thread->wal_included = false; + thread->undo_included = false; + thread->tt_included = false; thread->thread_id = ack->thread_id; thread->node_id = node_id; thread->start_redo_lsn = ack->start_redo_lsn; @@ -1035,9 +459,9 @@ cluster_backup_ack_handler(const ClusterICEnvelope *env, const void *payload) if (!thread->present && ack->start_redo_lsn != InvalidXLogRecPtr && ack->checkpoint_lsn != InvalidXLogRecPtr) { thread->present = true; - thread->wal_included = true; - thread->undo_included = true; - thread->tt_included = true; + thread->wal_included = false; + thread->undo_included = false; + thread->tt_included = false; thread->thread_id = ack->thread_id; thread->node_id = node_id; thread->start_redo_lsn = ack->start_redo_lsn; @@ -1187,206 +611,47 @@ cluster_backup_lmon_tick(void) Datum pg_cluster_backup_start(PG_FUNCTION_ARGS) { -#define PG_CLUSTER_BACKUP_START_COLS 4 - TupleDesc tupdesc; - Datum values[PG_CLUSTER_BACKUP_START_COLS] = { 0 }; - bool nulls[PG_CLUSTER_BACKUP_START_COLS] = { 0 }; text *backupid = PG_GETARG_TEXT_PP(0); - bool fast = PG_GETARG_BOOL(1); char *backupidstr; - SessionBackupState status; - MemoryContext oldcontext; - uint8 start_peers[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; - bool have_peers; - uint64 start_request_id = 0; - ClusterBackupCoordWaitResult wait_result; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to start a cluster backup"))); cluster_backup_error_if_unavailable("pg_cluster_backup_start"); - if 
(get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) - elog(ERROR, "return type must be a row type"); - backupidstr = text_to_cstring(backupid); if (strlen(backupidstr) >= CLUSTER_BACKUP_ID_MAX) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("cluster backup id is too long"), errdetail("Maximum length is %d bytes.", CLUSTER_BACKUP_ID_MAX - 1))); - - status = get_backup_status(); - if (status == SESSION_BACKUP_RUNNING) - ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_IN_PROGRESS), - errmsg("a backup is already in progress in this session"))); - - cluster_backup_build_current_peer_bitmap(start_peers); - have_peers = cluster_backup_bitmap_any_set(start_peers); - - if (cluster_backup_context == NULL) - cluster_backup_context = AllocSetContextCreate(TopMemoryContext, "cluster backup context", - ALLOCSET_START_SMALL_SIZES); - else { - cluster_backup_session_state = NULL; - cluster_backup_tablespace_map = NULL; - MemoryContextReset(cluster_backup_context); - } - - oldcontext = MemoryContextSwitchTo(cluster_backup_context); - cluster_backup_session_state = (BackupState *)palloc0(sizeof(BackupState)); - cluster_backup_tablespace_map = makeStringInfo(); - MemoryContextSwitchTo(oldcontext); - - register_persistent_abort_backup_handler(); - do_pg_backup_start(backupidstr, fast, NULL, cluster_backup_session_state, - cluster_backup_tablespace_map); - cluster_backup_update_start(backupidstr, cluster_backup_session_state); - - if (have_peers) { - if (!cluster_backup_begin_coord_request(CLUSTER_BACKUP_WIRE_OP_START, start_peers, - backupidstr, NULL, fast, false, InvalidScn, - &start_request_id)) { - if (get_backup_status() == SESSION_BACKUP_RUNNING) - do_pg_abort_backup(0, DatumGetBool(false)); - cluster_backup_mark_native_stopped(NULL); - cluster_backup_cleanup_session_context(); - cluster_backup_raise_coord_enqueue_failure(CLUSTER_BACKUP_WIRE_OP_START); - } - if (!cluster_backup_wait_for_peer_acks(CLUSTER_BACKUP_WIRE_OP_START, start_request_id, - 
&wait_result)) { - cluster_backup_abort_peers_best_effort(start_peers, backupidstr); - if (get_backup_status() == SESSION_BACKUP_RUNNING) - do_pg_abort_backup(0, DatumGetBool(false)); - cluster_backup_mark_native_stopped(NULL); - cluster_backup_cleanup_session_context(); - cluster_backup_raise_peer_failure(CLUSTER_BACKUP_WIRE_OP_START, &wait_result); - } - } - - values[0] = CStringGetTextDatum(backupidstr); - values[1] = LSNGetDatum(cluster_backup_session_state->startpoint); - values[2] = LSNGetDatum(cluster_backup_session_state->checkpointloc); - values[3] = Int32GetDatum((int32)cluster_backup_session_state->starttli); - - PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); + cluster_backup_fail_closed_unimplemented( + "pg_cluster_backup_start", + "cluster physical backup capture, durable WAL pinning, and restore integration"); + PG_RETURN_NULL(); } Datum pg_cluster_backup_stop(PG_FUNCTION_ARGS) { -#define PG_CLUSTER_BACKUP_STOP_COLS 4 - TupleDesc tupdesc; - Datum values[PG_CLUSTER_BACKUP_STOP_COLS] = { 0 }; - bool nulls[PG_CLUSTER_BACKUP_STOP_COLS] = { 0 }; - bool waitforarchive = PG_GETARG_BOOL(0); - ClusterBackupManifest manifest; - ClusterRestorePoint point; - SCN cut_scn; - char *backup_label; - XLogRecPtr thread_lsn[CLUSTER_MAX_NODES]; - SCN thread_scn[CLUSTER_MAX_NODES]; - int thread_index; - uint16 thread_id; - uint8 stop_peers[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; - uint8 current_peers[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; - bool have_peers; - uint64 stop_request_id = 0; - ClusterBackupCoordWaitResult wait_result; - if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to stop a cluster backup"))); cluster_backup_error_if_unavailable("pg_cluster_backup_stop"); - if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) - elog(ERROR, "return type must be a row type"); - if (get_backup_status() != SESSION_BACKUP_RUNNING) - ereport(ERROR, 
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cluster backup is not in progress"), - errhint("Did you call pg_cluster_backup_start()?"))); - if (cluster_backup_session_state == NULL || cluster_backup_tablespace_map == NULL) - ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cluster backup session state is missing"))); - - have_peers = cluster_backup_get_backup_peer_bitmap(stop_peers); - if (have_peers) { - cluster_backup_build_current_peer_bitmap(current_peers); - if (!cluster_backup_bitmap_equal(stop_peers, current_peers)) - ereport(ERROR, (errcode(ERRCODE_CLUSTER_BACKUP_INCOMPLETE), - errmsg("cluster backup peer topology changed during backup"), - errhint("Stop or abort the backup and retry after cluster membership " - "settles."))); - if (!cluster_backup_begin_coord_request(CLUSTER_BACKUP_WIRE_OP_STOP, stop_peers, - cluster_backup_session_state->name, NULL, false, - waitforarchive, InvalidScn, &stop_request_id)) - cluster_backup_raise_coord_enqueue_failure(CLUSTER_BACKUP_WIRE_OP_STOP); - } - - do_pg_backup_stop(cluster_backup_session_state, waitforarchive); - cluster_backup_mark_native_stopped(cluster_backup_session_state); - cut_scn = cluster_backup_current_scn(); - - MemSet(thread_lsn, 0, sizeof(thread_lsn)); - MemSet(thread_scn, 0, sizeof(thread_scn)); - thread_id = cluster_backup_local_thread_id(); - thread_index = (int)thread_id - 1; - thread_lsn[thread_index] = cluster_backup_session_state->stoppoint; - thread_scn[thread_index] = cut_scn; - - if (have_peers - && !cluster_backup_wait_for_peer_acks(CLUSTER_BACKUP_WIRE_OP_STOP, stop_request_id, - &wait_result)) { - cluster_backup_cleanup_session_context(); - cluster_backup_raise_peer_failure(CLUSTER_BACKUP_WIRE_OP_STOP, &wait_result); - } - - cluster_backup_fill_local_manifest(&manifest, cluster_backup_session_state, cut_scn); - if (have_peers) - cluster_backup_add_peer_stop_threads(&manifest, stop_request_id, thread_scn, thread_lsn); - cut_scn = 
manifest.consistent_scn; - cluster_backup_seal_manifest_or_error(&manifest); - backup_label = cluster_backup_build_label(cluster_backup_session_state, &manifest, cut_scn); - cluster_backup_update_stop(cluster_backup_session_state, &manifest, cut_scn); - - if (cluster_restore_point_build(&point, manifest.backup_id, thread_scn, thread_lsn, - CLUSTER_MAX_NODES, true, true, manifest.incarnation) - == CLUSTER_RESTORE_POINT_CUT_OK) { - point.created_at = GetCurrentTimestamp(); - cluster_backup_add_restore_point(&point); - } - - values[0] = Int64GetDatum((int64)cut_scn); - values[1] = LSNGetDatum(cluster_backup_session_state->stoppoint); - values[2] = Int64GetDatum((int64)manifest.manifest_crc); - values[3] = CStringGetTextDatum(backup_label); - - pfree(backup_label); - cluster_backup_cleanup_session_context(); - - PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); + if (get_backup_status() == SESSION_BACKUP_RUNNING) + cluster_backup_abort_local_session_if_running(); + cluster_backup_fail_closed_unimplemented( + "pg_cluster_backup_stop", + "cluster-wide restore-point commit-drain barrier and durable per-thread " + "WAL/undo/transaction-table capture"); + PG_RETURN_NULL(); } Datum pg_cluster_create_restore_point(PG_FUNCTION_ARGS) { -#define PG_CLUSTER_RESTORE_POINT_COLS 3 - TupleDesc tupdesc; - Datum values[PG_CLUSTER_RESTORE_POINT_COLS] = { 0 }; - bool nulls[PG_CLUSTER_RESTORE_POINT_COLS] = { 0 }; text *restore_name = PG_GETARG_TEXT_PP(0); char *restore_name_str; - XLogRecPtr restorepoint; - SCN cut_scn; - SCN thread_scn[CLUSTER_MAX_NODES]; - XLogRecPtr thread_lsn[CLUSTER_MAX_NODES]; - uint16 thread_id; - int thread_index; - ClusterRestorePoint point; - ClusterRestorePointCutReason reason; - uint8 restore_point_peers[CLUSTER_BACKUP_NODE_BITMAP_BYTES]; - bool have_peers; - uint64 restore_request_id = 0; - ClusterBackupCoordWaitResult wait_result; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -1402,55 +667,15 @@ 
pg_cluster_create_restore_point(PG_FUNCTION_ARGS) errmsg("WAL level not sufficient for creating a restore point"), errhint("wal_level must be set to \"replica\" or \"logical\" at server start."))); - if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) - elog(ERROR, "return type must be a row type"); - restore_name_str = text_to_cstring(restore_name); if (strlen(restore_name_str) >= CLUSTER_RESTORE_POINT_NAME_MAX) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("cluster restore point name is too long"), errdetail("Maximum length is %d bytes.", CLUSTER_RESTORE_POINT_NAME_MAX - 1))); - - cluster_backup_build_current_peer_bitmap(restore_point_peers); - have_peers = cluster_backup_bitmap_any_set(restore_point_peers); - if (have_peers) { - if (!cluster_backup_begin_coord_request(CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT, - restore_point_peers, NULL, restore_name_str, false, - false, InvalidScn, &restore_request_id)) - cluster_backup_raise_coord_enqueue_failure(CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT); - } - - restorepoint = XLogRestorePoint(restore_name_str); - cut_scn = cluster_backup_current_scn(); - - MemSet(thread_scn, 0, sizeof(thread_scn)); - MemSet(thread_lsn, 0, sizeof(thread_lsn)); - thread_id = cluster_backup_local_thread_id(); - thread_index = (int)thread_id - 1; - thread_scn[thread_index] = cut_scn; - thread_lsn[thread_index] = restorepoint; - if (have_peers - && !cluster_backup_wait_for_peer_acks(CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT, - restore_request_id, &wait_result)) - cluster_backup_raise_peer_failure(CLUSTER_BACKUP_WIRE_OP_RESTORE_POINT, &wait_result); - if (have_peers) - cluster_backup_add_peer_restore_point_acks(restore_request_id, thread_scn, thread_lsn); - reason = cluster_restore_point_build(&point, restore_name_str, thread_scn, thread_lsn, - CLUSTER_MAX_NODES, true, true, 0); - if (reason != CLUSTER_RESTORE_POINT_CUT_OK) - ereport(ERROR, (errcode(ERRCODE_CLUSTER_RESTORE_POINT_DRAIN_TIMEOUT), - errmsg("could not build 
cluster restore point cut: %s", - cluster_restore_point_cut_reason_name(reason)))); - point.created_at = GetCurrentTimestamp(); - cluster_backup_add_restore_point(&point); - cut_scn = point.cut_scn; - - values[0] = CStringGetTextDatum(restore_name_str); - values[1] = Int64GetDatum((int64)cut_scn); - values[2] = LSNGetDatum(restorepoint); - - PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); + cluster_backup_fail_closed_unimplemented("pg_cluster_create_restore_point", + "cluster-wide restore-point commit-drain barrier"); + PG_RETURN_NULL(); } Datum diff --git a/src/test/cluster_tap/t/332_cluster_backup_pitr.pl b/src/test/cluster_tap/t/332_cluster_backup_pitr.pl index e3d6947da1..51b0c2cd57 100644 --- a/src/test/cluster_tap/t/332_cluster_backup_pitr.pl +++ b/src/test/cluster_tap/t/332_cluster_backup_pitr.pl @@ -17,13 +17,37 @@ use warnings; use FindBin; +use IO::Socket::INET; use lib "$FindBin::RealBin/../lib"; use PgracClusterNode; use PostgreSQL::Test::Utils; use Test::More; -my $node = PgracClusterNode->new('cluster_backup_single'); +my $next_high_port = $ENV{PGRAC_BACKUP_TAP_PORT_BASE} // 60432; + +sub next_free_high_port +{ + for (1 .. 
256) + { + my $port = $next_high_port++; + my $sock = IO::Socket::INET->new( + Listen => 5, + LocalAddr => '127.0.0.1', + LocalPort => $port, + Proto => 'tcp', + ReuseAddr => 1); + if ($sock) + { + close $sock; + return $port; + } + } + die "could not find a free high TCP port for cluster backup TAP"; +} + +my $node = PgracClusterNode->new('cluster_backup_single', + port => next_free_high_port()); $node->init(allows_streaming => 1); $node->append_conf('postgresql.conf', "cluster.enabled = on\n" @@ -50,68 +74,68 @@ 'latest,pause,t,ok', 'L2 default PITR target status is latest/pause/ok'); -my $backup_row = $node->safe_psql('postgres', - q{SELECT s.backup_id || ',' || - CASE WHEN s.start_redo_lsn IS NOT NULL THEN 't' ELSE 'f' END || ',' || - CASE WHEN s.checkpoint_lsn IS NOT NULL THEN 't' ELSE 'f' END || ',' || - CASE WHEN t.consistent_scn > 0 THEN 't' ELSE 'f' END || ',' || - CASE WHEN t.stop_cut_lsn IS NOT NULL THEN 't' ELSE 'f' END || ',' || - CASE WHEN t.manifest_crc > 0 THEN 't' ELSE 'f' END || ',' || - CASE WHEN t.backup_label LIKE '%CLUSTER_BACKUP_ID: b332%' THEN 't' ELSE 'f' END || ',' || - CASE WHEN t.backup_label LIKE '%CLUSTER_MANIFEST_CRC32C:%' THEN 't' ELSE 'f' END - FROM pg_cluster_backup_start('b332', true) AS s - CROSS JOIN LATERAL - pg_cluster_backup_stop(COALESCE(s.backup_id = '', false)) AS t}); -is($backup_row, 'b332,t,t,t,t,t,t,t', - 'L3 cluster backup start/stop returns checkpoint, SCN, LSN, CRC, and label contract'); +my ($backup_ret, $backup_out, $backup_err) = $node->psql('postgres', + "\\set VERBOSITY verbose\nSELECT * FROM pg_cluster_backup_start('b332', true)"); +isnt($backup_ret, 0, + 'L3 cluster backup start fails closed until physical capture lands'); +like($backup_err, qr/0A000|feature_not_supported/, + 'L3 cluster backup start reports feature_not_supported'); +like($backup_err, qr/physical backup capture|durable WAL pinning|restore integration/, + 'L3 cluster backup start names the missing substrate'); is($node->safe_psql('postgres', - 
q{SELECT backup_id || ',' || node_count || ',' || thread_count || ',' || - CASE WHEN manifest_crc > 0 THEN 't' ELSE 'f' END - FROM pg_cluster_backup_history}), - 'b332,1,1,t', - 'L4 latest manifest summary is visible'); - + q{SELECT CASE WHEN in_progress THEN 't' ELSE 'f' END + FROM pg_stat_cluster_backup}), + 'f', + 'L4 rejected cluster backup does not leave in-progress state'); is($node->safe_psql('postgres', - q{SELECT restore_point_name || ',' || - CASE WHEN cut_scn > 0 THEN 't' ELSE 'f' END || ',' || - CASE WHEN cut_lsn IS NOT NULL THEN 't' ELSE 'f' END - FROM pg_cluster_create_restore_point('rp332')}), - 'rp332,t,t', - 'L5 cluster restore point records SCN and LSN'); + q{SELECT count(*) FROM pg_cluster_backup_history}), + '0', + 'L4 rejected cluster backup does not publish a manifest'); + +my ($rp_ret, $rp_out, $rp_err) = $node->psql('postgres', + "\\set VERBOSITY verbose\nSELECT * FROM pg_cluster_create_restore_point('rp332')"); +isnt($rp_ret, 0, + 'L5 cluster restore point fails closed until commit-drain lands'); +like($rp_err, qr/0A000|feature_not_supported/, + 'L5 cluster restore point reports feature_not_supported'); +like($rp_err, qr/restore-point commit-drain barrier/, + 'L5 cluster restore point names the missing barrier'); is($node->safe_psql('postgres', - q{SELECT count(*) FROM pg_cluster_restore_points - WHERE restore_point_name IN ('b332', 'rp332')}), - '2', - 'L6 backup stop and manual restore point are retained'); + q{SELECT count(*) FROM pg_cluster_restore_points}), + '0', + 'L6 rejected restore point is not retained'); $node->stop; -my $peer_node = PgracClusterNode->new('cluster_backup_peers'); +my $peer_node = PgracClusterNode->new('cluster_backup_peers', + port => next_free_high_port()); +my $peer_ic0 = next_free_high_port(); +my $peer_ic1 = next_free_high_port(); $peer_node->init(allows_streaming => 1); $peer_node->append_conf('postgresql.conf', "cluster.enabled = on\n" . "cluster.node_id = 0\n" . "cluster.allow_single_node = on\n" . 
"wal_level = replica\n"); -PostgreSQL::Test::Utils::append_to_file($peer_node->data_dir . '/pgrac.conf', <<'EOC'); +PostgreSQL::Test::Utils::append_to_file($peer_node->data_dir . '/pgrac.conf', <start; my ($ret, $out, $err) = $peer_node->psql('postgres', "\\set VERBOSITY verbose\nSELECT * FROM pg_cluster_backup_start('partial', true)"); -isnt($ret, 0, 'L8 peer topology requires complete backup ACKs'); -like($err, qr/53RAD|cluster_backup_incomplete/, - 'L8 missing peer ACK fails closed with cluster_backup_incomplete'); +isnt($ret, 0, 'L8 declared-peer backup remains fail-closed without capture substrate'); +like($err, qr/0A000|feature_not_supported/, + 'L8 declared-peer backup reports feature_not_supported'); is($peer_node->safe_psql('postgres', q{SELECT CASE WHEN in_progress THEN 't' ELSE 'f' END FROM pg_stat_cluster_backup}), @@ -120,7 +144,8 @@ $peer_node->stop; -my $bad_target_node = PgracClusterNode->new('cluster_backup_bad_target'); +my $bad_target_node = PgracClusterNode->new('cluster_backup_bad_target', + port => next_free_high_port()); $bad_target_node->init(allows_streaming => 1); $bad_target_node->append_conf('postgresql.conf', "cluster.enabled = on\n"