diff --git a/Makefile b/Makefile index 83a6b26b4..69d9b0595 100644 --- a/Makefile +++ b/Makefile @@ -66,6 +66,7 @@ RWOBJS = \ $(FETOOLS)/pg_rewind/local_source.o \ $(FETOOLS)/pg_rewind/parsexlog.o \ $(FETOOLS)/pg_rewind/pg_rewind.o \ + $(FETOOLS)/pg_rewind/tde_ops.o \ $(FETOOLS)/pg_rewind/timeline.o RMGRDESCSOURCES = $(sort $(wildcard $(FETOOLS)/rmgrdesc/*desc*.c)) diff --git a/fetools/pg18/pg_rewind/filemap.c b/fetools/pg18/pg_rewind/filemap.c index c933871ca..91b275eba 100644 --- a/fetools/pg18/pg_rewind/filemap.c +++ b/fetools/pg18/pg_rewind/filemap.c @@ -487,6 +487,8 @@ action_to_str(file_action_t action) return "CREATE"; case FILE_ACTION_REMOVE: return "REMOVE"; + case FILE_ACTION_ENSURE_TDE_KEY: + return "ENSURE_KEY"; default: return "unknown"; @@ -572,9 +574,40 @@ isRelDataFile(const char *path) { RelFileLocator rlocator; unsigned int segNo; - int nmatch; bool matched; + matched = path_rlocator(path, &rlocator, &segNo); + + /* + * path_rlocator() above can match files that have extra characters at the + * end. To eliminate such cases, cross-check that GetRelationPath creates + * the exact same filename, when passed the RelFileLocator information we + * extracted from the filename. + */ + if (matched) + { + char *check_path = datasegpath(rlocator, MAIN_FORKNUM, segNo); + + if (strcmp(check_path, path) != 0) + matched = false; + + pfree(check_path); + } + + return matched; +} + +/* + * Sets rlocator and segNo based on the given path. Returns false if no match + * is found. + * + * Only concerned with files belonging to the main fork. 
+ */ +bool +path_rlocator(const char *path, RelFileLocator *rlocator, unsigned int *segNo) +{ + int nmatch; + /*---- * Relation data files can be in one of the following directories: * @@ -594,55 +627,38 @@ isRelDataFile(const char *path) * *---- */ - rlocator.spcOid = InvalidOid; - rlocator.dbOid = InvalidOid; - rlocator.relNumber = InvalidRelFileNumber; - segNo = 0; - matched = false; + rlocator->spcOid = InvalidOid; + rlocator->dbOid = InvalidOid; + rlocator->relNumber = InvalidRelFileNumber; + *segNo = 0; - nmatch = sscanf(path, "global/%u.%u", &rlocator.relNumber, &segNo); + nmatch = sscanf(path, "global/%u.%u", &rlocator->relNumber, segNo); if (nmatch == 1 || nmatch == 2) { - rlocator.spcOid = GLOBALTABLESPACE_OID; - rlocator.dbOid = 0; - matched = true; + rlocator->spcOid = GLOBALTABLESPACE_OID; + rlocator->dbOid = 0; + return true; } else { nmatch = sscanf(path, "base/%u/%u.%u", - &rlocator.dbOid, &rlocator.relNumber, &segNo); + &rlocator->dbOid, &rlocator->relNumber, segNo); if (nmatch == 2 || nmatch == 3) { - rlocator.spcOid = DEFAULTTABLESPACE_OID; - matched = true; + rlocator->spcOid = DEFAULTTABLESPACE_OID; + return true; } else { nmatch = sscanf(path, "pg_tblspc/%u/" TABLESPACE_VERSION_DIRECTORY "/%u/%u.%u", - &rlocator.spcOid, &rlocator.dbOid, &rlocator.relNumber, - &segNo); + &rlocator->spcOid, &rlocator->dbOid, &rlocator->relNumber, + segNo); if (nmatch == 3 || nmatch == 4) - matched = true; + return true; } } - /* - * The sscanf tests above can match files that have extra characters at - * the end. To eliminate such cases, cross-check that GetRelationPath - * creates the exact same filename, when passed the RelFileLocator - * information we extracted from the filename. 
- */ - if (matched) - { - char *check_path = datasegpath(rlocator, MAIN_FORKNUM, segNo); - - if (strcmp(check_path, path) != 0) - matched = false; - - pfree(check_path); - } - - return matched; + return false; } /* @@ -712,6 +728,13 @@ decide_file_action(file_entry_t *entry) if (strstr(path, ".DS_Store") != NULL) return FILE_ACTION_NONE; + /* + * Skip pg_tde key data. This is handled separately by combining the + * source and target keys when processing relation files. + */ + if (strstr(path, "pg_tde/") != NULL) + return FILE_ACTION_NONE; + /* * Remove all files matching the exclusion filters in the target. */ @@ -754,7 +777,7 @@ decide_file_action(file_entry_t *entry) if (keepwal_entry_exists(path)) { pg_log_debug("Not removing file \"%s\" because it is required for recovery", path); - return FILE_ACTION_NONE; + return FILE_ACTION_ENSURE_WAL_SEG; } return FILE_ACTION_REMOVE; } @@ -831,14 +854,15 @@ decide_file_action(file_entry_t *entry) * in the target will be copied based on parsing the target * system's WAL, and any blocks modified in the source will be * updated after rewinding, when the source system's WAL is - * replayed. + * replayed. But we still have to sync source/target keys in + * case it is encrypted. 
*/ if (entry->target_size < entry->source_size) return FILE_ACTION_COPY_TAIL; else if (entry->target_size > entry->source_size) return FILE_ACTION_TRUNCATE; else - return FILE_ACTION_NONE; + return FILE_ACTION_ENSURE_TDE_KEY; } break; diff --git a/fetools/pg18/pg_rewind/filemap.h b/fetools/pg18/pg_rewind/filemap.h index df78a02e3..b933b282a 100644 --- a/fetools/pg18/pg_rewind/filemap.h +++ b/fetools/pg18/pg_rewind/filemap.h @@ -25,6 +25,10 @@ typedef enum * blocks based on the parsed WAL) */ FILE_ACTION_TRUNCATE, /* truncate local file to 'newsize' bytes */ FILE_ACTION_REMOVE, /* remove local file / directory / symlink */ + FILE_ACTION_ENSURE_TDE_KEY, /* data file with no action, but we need to check + * if it is encrypted and sync source/target + * keys */ + FILE_ACTION_ENSURE_WAL_SEG /* kept WAL segment might need reencryption */ } file_action_t; typedef enum @@ -113,4 +117,6 @@ extern void print_filemap(filemap_t *filemap); extern void keepwal_init(void); extern void keepwal_add_entry(const char *path); +extern bool path_rlocator(const char *path, RelFileLocator *rlocator, unsigned int *segNo); + #endif /* FILEMAP_H */ diff --git a/fetools/pg18/pg_rewind/libpq_source.c b/fetools/pg18/pg_rewind/libpq_source.c index 56c2ad55d..0678dd1e5 100644 --- a/fetools/pg18/pg_rewind/libpq_source.c +++ b/fetools/pg18/pg_rewind/libpq_source.c @@ -17,6 +17,9 @@ #include "pg_rewind.h" #include "port/pg_bswap.h" #include "rewind_source.h" +#include "tde_ops.h" + +#include "pg_tde.h" /* * Files are fetched MAX_CHUNK_SIZE bytes at a time, and with a @@ -31,6 +34,7 @@ typedef struct const char *path; /* path relative to data directory root */ off_t offset; size_t length; + bool encrypt; } fetch_range_request; typedef struct @@ -71,6 +75,10 @@ static char *libpq_fetch_file(rewind_source *source, const char *path, static XLogRecPtr libpq_get_current_wal_insert_lsn(rewind_source *source); static void libpq_destroy(rewind_source *source); +static void
libpq_queue_process_fetch_range(rewind_source *source, const char *path, + bool needs_encrypt, off_t off, size_t len); +static void libpq_fetch_tde_keys(rewind_source *source); + /* * Create a new libpq source. * @@ -100,6 +108,8 @@ init_libpq_source(PGconn *conn) initStringInfo(&src->offsets); initStringInfo(&src->lengths); + libpq_fetch_tde_keys(&src->common); + return &src->common; } @@ -345,7 +355,7 @@ libpq_queue_fetch_file(rewind_source *source, const char *path, size_t len) * fetch-requests are for a whole file. */ open_target_file(path, true); - libpq_queue_fetch_range(source, path, 0, Max(len, MAX_CHUNK_SIZE)); + libpq_queue_process_fetch_range(source, path, false, 0, Max(len, MAX_CHUNK_SIZE)); } /* @@ -354,6 +364,17 @@ libpq_queue_fetch_file(rewind_source *source, const char *path, size_t len) static void libpq_queue_fetch_range(rewind_source *source, const char *path, off_t off, size_t len) +{ + libpq_queue_process_fetch_range(source, path, true, off, len); +} + +/* + * A workhorse for libpq_queue_fetch_range. + * `needs_encrypt` indicates if file's blocks may need re-encryption. 
+ */ +static void +libpq_queue_process_fetch_range(rewind_source *source, const char *path, + bool needs_encrypt, off_t off, size_t len) { libpq_source *src = (libpq_source *) source; @@ -406,6 +427,7 @@ libpq_queue_fetch_range(rewind_source *source, const char *path, off_t off, src->request_queue[src->num_requests].path = path; src->request_queue[src->num_requests].offset = off; src->request_queue[src->num_requests].length = thislen; + src->request_queue[src->num_requests].encrypt = needs_encrypt; src->num_requests++; off += thislen; @@ -420,6 +442,7 @@ static void libpq_finish_fetch(rewind_source *source) { process_queued_fetch_requests((libpq_source *) source); + flush_current_tde_rel_key(); } static void @@ -592,6 +615,19 @@ process_queued_fetch_requests(libpq_source *src) open_target_file(filename, false); + if (rq->encrypt) + { + Assert(chunksize % BLCKSZ == 0); + + ensure_tde_keys(filename); + + for (int i = 0; i < chunksize / BLCKSZ; i++) + { + unsigned char *data = (unsigned char *) chunk + BLCKSZ * i; + + tde_reencrypt_block(data, chunkoff + BLCKSZ * i, MAIN_FORKNUM); + } + } write_target_range(chunk, chunkoff, chunksize); } @@ -682,3 +718,52 @@ libpq_destroy(rewind_source *source) /* NOTE: we don't close the connection here, as it was not opened by us. */ } + +static void +libpq_fetch_tde_keys(rewind_source *source) +{ + PGconn *conn = ((libpq_source *) source)->conn; + PGresult *res; + + res = PQexec(conn, "SELECT pg_ls_dir('" PG_TDE_DATA_DIR "', true, false)"); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pg_fatal("could not fetch file list: %s", + PQresultErrorMessage(res)); + + /* no tde dir, nothing to do */ + if (PQntuples(res) == 0) + { + PQclear(res); + return; + } + + init_tde(); + + for (int i = 0; i < PQntuples(res); i++) + { + char *path; + char *tde_file_buf; + size_t size; + char target_path[MAXPGPATH]; + + if (PQgetisnull(res, i, 0)) + { + /* + * The file was removed from the server while the query was + * running. Ignore it. 
+ */ + continue; + } + + path = PQgetvalue(res, i, 0); + + snprintf(target_path, MAXPGPATH, "%s/%s", PG_TDE_DATA_DIR, path); + tde_file_buf = libpq_fetch_file(source, target_path, &size); + + write_tmp_source_file(path, tde_file_buf, size); + pg_free(tde_file_buf); + } + + PQclear(res); +} diff --git a/fetools/pg18/pg_rewind/local_source.c b/fetools/pg18/pg_rewind/local_source.c index 5a6e805c1..c5c34607f 100644 --- a/fetools/pg18/pg_rewind/local_source.c +++ b/fetools/pg18/pg_rewind/local_source.c @@ -10,11 +10,19 @@ #include "postgres_fe.h" #include +#include #include +#include "catalog/pg_tablespace_d.h" #include "common/logging.h" #include "file_ops.h" +#include "pg_rewind.h" #include "rewind_source.h" +#include "tde_ops.h" + +#include "pg_tde.h" +#include "common/pg_tde_utils.h" +#include "access/pg_tde_tdemap.h" typedef struct { @@ -34,6 +42,8 @@ static void local_queue_fetch_range(rewind_source *source, const char *path, static void local_finish_fetch(rewind_source *source); static void local_destroy(rewind_source *source); +static void local_fetch_tde_keys(rewind_source *source); + rewind_source * init_local_source(const char *datadir) { @@ -51,6 +61,8 @@ init_local_source(const char *datadir) src->datadir = datadir; + local_fetch_tde_keys(&src->common); + return &src->common; } @@ -145,6 +157,8 @@ local_queue_fetch_range(rewind_source *source, const char *path, off_t off, open_target_file(path, false); + ensure_tde_keys(path); + while (end - begin > 0) { ssize_t readlen; @@ -162,6 +176,9 @@ local_queue_fetch_range(rewind_source *source, const char *path, off_t off, else if (readlen == 0) pg_fatal("unexpected EOF while reading file \"%s\"", srcpath); + /* Re-encrypt blocks with a proper key if needed. 
*/ + tde_reencrypt_block((unsigned char *) buf.data, begin, MAIN_FORKNUM); + write_target_range(buf.data, begin, readlen); begin += readlen; } @@ -170,12 +187,34 @@ local_queue_fetch_range(rewind_source *source, const char *path, off_t off, pg_fatal("could not close file \"%s\": %m", srcpath); } +static bool +directory_exists(const char *dir) +{ + struct stat st; + + return stat(dir, &st) == 0 && S_ISDIR(st.st_mode); +} + +static void +local_fetch_tde_keys(rewind_source *source) +{ + char tde_source_dir[MAXPGPATH]; + const char *datadir = ((local_source *) source)->datadir; + + snprintf(tde_source_dir, sizeof(tde_source_dir), "%s/%s", datadir, PG_TDE_DATA_DIR); + + if (!directory_exists(tde_source_dir)) + return; + + init_tde(); + copy_tmp_tde_files(tde_source_dir); +} + static void local_finish_fetch(rewind_source *source) { - /* - * Nothing to do, local_queue_fetch_range() copies the ranges immediately. - */ + /* Ensure the recent key used to process data is on the disk */ + flush_current_tde_rel_key(); } static void diff --git a/fetools/pg18/pg_rewind/pg_rewind.c b/fetools/pg18/pg_rewind/pg_rewind.c index b0d7f3b6e..fe0332bc8 100644 --- a/fetools/pg18/pg_rewind/pg_rewind.c +++ b/fetools/pg18/pg_rewind/pg_rewind.c @@ -31,6 +31,7 @@ #include "pg_rewind.h" #include "rewind_source.h" #include "storage/bufpage.h" +#include "tde_ops.h" #include "pg_tde.h" #include "access/pg_tde_fe_init.h" @@ -611,12 +612,32 @@ perform_rewind(filemap_t *filemap, rewind_source *source, /* nothing else to do */ break; + case FILE_ACTION_ENSURE_TDE_KEY: + + /* + * Partial rewrites will ensure the keys on their own. + * Moreover, some partial updates, when the source is libpq, + * may happen in the last turn, when source->finish_fetch() is + * called. So running ensure_tde_keys for such files + * prematurely will make them unreadable since the source key + * would be updated before we use it to decrypt source data. 
+ */ + if (entry->target_pages_to_overwrite.bitmapsize == 0) + ensure_tde_keys(entry->path); + break; + + case FILE_ACTION_ENSURE_WAL_SEG: + ensure_tde_wal_seg(entry->path); + break; + case FILE_ACTION_COPY: source->queue_fetch_file(source, entry->path, entry->source_size); break; case FILE_ACTION_TRUNCATE: truncate_target_file(entry->path, entry->source_size); + if (entry->target_pages_to_overwrite.bitmapsize == 0) + ensure_tde_keys(entry->path); break; case FILE_ACTION_COPY_TAIL: @@ -644,6 +665,8 @@ perform_rewind(filemap_t *filemap, rewind_source *source, close_target_file(); + fetch_tde_dir(); + progress_report(true); /* diff --git a/fetools/pg18/pg_rewind/tde_ops.c b/fetools/pg18/pg_rewind/tde_ops.c new file mode 100644 index 000000000..80dae227d --- /dev/null +++ b/fetools/pg18/pg_rewind/tde_ops.c @@ -0,0 +1,380 @@ +#include "postgres_fe.h" + +#include + +#include "access/xlog_internal.h" +#include "catalog/pg_tablespace_d.h" +#include "common/file_perm.h" + +#include "file_ops.h" +#include "filemap.h" +#include "pg_rewind.h" +#include "tde_ops.h" + +#include "access/pg_tde_tdemap.h" +#include "access/pg_tde_xlog_keys.h" +#include "access/pg_tde_xlog_smgr.h" +#include "common/pg_tde_utils.h" +#include "pg_tde.h" + +static void copy_dir(const char *src, const char *dst); +static void create_tde_tmp_dir(void); + +typedef struct +{ + InternalKey *source_key; + InternalKey *target_key; + char path[MAXPGPATH]; + RelFileLocator rlocator; + unsigned int segNo; +} current_file_data; + +static current_file_data current_tde_file = {0}; + +/* Dir for an operational copy of source's tde files (_keys, etc) */ +static char tde_tmp_source[MAXPGPATH] = "/tmp/pg_tde_rewindXXXXXX"; +static bool source_has_tde = false; + +static void +reencrypt_fork(ForkNumber fork) +{ + int srcfd; + int trgfd; + char srcpath[MAXPGPATH]; + PGIOAlignedBlock buf; + size_t written_len; + RelPathStr rp = relpathperm(current_tde_file.rlocator, fork); + static const char *const warning_hint = 
"Skipping the file, as the server can start and rebuild the broken VM/FSM file."; + + snprintf(srcpath, sizeof(srcpath), "%s/%s", datadir_target, rp.str); + + /* nothing to do in dry-run mode (target must stay untouched) or if the fork does not exist */ + if (dry_run || access(srcpath, F_OK) != 0) + return; + + srcfd = open(srcpath, O_RDONLY | PG_BINARY, 0); + if (srcfd < 0) + { + /* + * Server can recover from wrecked VM/FSM, hence only warnings here + * and in the rest of the function + */ + pg_log_warning("could not open fork file for reading \"%s\": %m", srcpath); + pg_log_warning_hint("%s", warning_hint); + return; + } + + trgfd = open(srcpath, O_WRONLY | PG_BINARY, 0); + if (trgfd < 0) + { + pg_log_warning("could not open fork file for writing \"%s\": %m", srcpath); + pg_log_warning_hint("%s", warning_hint); + close(srcfd); + return; + } + + written_len = 0; + for (;;) + { + ssize_t read_len; + + read_len = read(srcfd, buf.data, sizeof(buf.data)); + + if ((read_len <= 0)) + { + if (read_len < 0) + { + pg_log_warning("could not read block from fork file \"%s\": %m", srcpath); + pg_log_warning_hint("%s", warning_hint); + } + + break; /* EOF reached if read_len == 0 */ + } + + if (read_len != BLCKSZ) + { + pg_log_warning("unexpected read from fork file \"%s\"", srcpath); + pg_log_warning_detail("Expected %d bytes, but got %zd", BLCKSZ, read_len); + pg_log_warning_hint("%s", warning_hint); + + break; + } + + tde_reencrypt_block((unsigned char *) buf.data, written_len, fork); + + if (write(trgfd, buf.data, read_len) != read_len) + { + pg_log_warning("could not write block to fork file \"%s\": %m", srcpath); + pg_log_warning_hint("%s", warning_hint); + + break; + } + written_len += read_len; + } + + close(srcfd); + close(trgfd); +} + +/* + * Write the recent internal key that was used to re-encrypt relation data (if + * there is any).
+ */ +void +flush_current_tde_rel_key(void) +{ + if (current_tde_file.source_key == NULL) + return; + + pg_log_debug("ensure forks encryption for \"%s\"", current_tde_file.path); + + reencrypt_fork(FSM_FORKNUM); + reencrypt_fork(VISIBILITYMAP_FORKNUM); + + pg_log_debug("update internal key for \"%s\"", current_tde_file.path); + pg_tde_set_data_dir(tde_tmp_source); + pg_tde_save_smgr_key(current_tde_file.rlocator, current_tde_file.target_key, true); + + pfree(current_tde_file.source_key); + pfree(current_tde_file.target_key); + memset(&current_tde_file, 0, sizeof(current_tde_file)); +} + +void +ensure_tde_wal_seg(const char *relpath) +{ + char target_tde_path[MAXPGPATH]; + char wal_path[MAXPGPATH]; + PGAlignedXLogBlock buf; + int fd; + ssize_t read_len; + off_t offset = 0; + XLogSegNo segno; + TimeLineID tli; + const char *segname = last_dir_separator(relpath); + + pg_log_debug("re-encrypt target WAL segment %s", relpath); + + segname = (segname != NULL) ? segname + 1 : relpath; + XLogFromFileName(segname, &tli, &segno, WalSegSz); + + snprintf(wal_path, sizeof(wal_path), "%s/%s", datadir_target, relpath); + + fd = open(wal_path, O_RDWR | PG_BINARY, 0); + if (fd < 0) + { + /* + * Only a warning here and below, as the kept segment is not necessarily + * encrypted with the wrong key. Hence failing here may still result + * in a recoverable server. + */ + pg_log_warning("could not open WAL segment \"%s\": %m", wal_path); + return; + } + + snprintf(target_tde_path, sizeof(target_tde_path), "%s/%s", datadir_target, PG_TDE_DATA_DIR); + + /* + * XXX: Should we slurp the whole segment and not bother with switching + * keys every XLOG_BLCKSZ?
+ */ + while (!dry_run && (read_len = pg_pread(fd, buf.data, sizeof(buf.data), offset)) > 0) + { + /* decrypt with target keys */ + pg_tde_set_data_dir(target_tde_path); + TDEXLogCryptBuffer(buf.data, buf.data, read_len, offset, tli, segno, WalSegSz); + + /* reencrypt with source keys */ + pg_tde_set_data_dir(tde_tmp_source); + TDEXLogCryptBuffer(buf.data, buf.data, read_len, offset, tli, segno, WalSegSz); + + if (pg_pwrite(fd, buf.data, read_len, offset) != read_len) + { + pg_log_warning("could not write WAL segment \"%s\": %m", wal_path); + break; + } + offset += read_len; + } + + close(fd); +} + +void +ensure_tde_keys(const char *relpath) +{ + char target_tde_path[MAXPGPATH]; + RelFileLocator rlocator; + unsigned int segNo; + + /* no TDE on source, nothing to do */ + if (!source_has_tde) + return; + + /* the same file, nothing to do */ + if (strcmp(current_tde_file.path, relpath) == 0) + return; + + flush_current_tde_rel_key(); + + if (!path_rlocator(relpath, &rlocator, &segNo)) + return; + + pg_tde_set_data_dir(tde_tmp_source); + current_tde_file.source_key = pg_tde_get_smgr_key(rlocator); + + snprintf(target_tde_path, sizeof(target_tde_path), "%s/%s", datadir_target, PG_TDE_DATA_DIR); + pg_tde_set_data_dir(target_tde_path); + current_tde_file.target_key = pg_tde_get_smgr_key(rlocator); + + if (current_tde_file.source_key != NULL) + { + /* + * If there ever was a source_key, it must be a target_key for this + * rlocator. `ALTER TABLE ... SET ACCESS METHOD heap` would create a + * new rlocator, hence it would not be a range change. + * + * XXX: should be an elog FATAL instead?
+ */ + Assert(current_tde_file.target_key != NULL); + + memset(current_tde_file.path, 0, MAXPGPATH); + strlcpy(current_tde_file.path, relpath, MAXPGPATH); + current_tde_file.rlocator = rlocator; + current_tde_file.segNo = segNo; + } +} + +void +tde_reencrypt_block(unsigned char *buf, off_t file_offset, ForkNumber fork) +{ + BlockNumber blkno; + + /* not a tde file, nothing to do */ + if (current_tde_file.source_key == NULL) + return; + + Assert(file_offset % BLCKSZ == 0); + /* segNo locates the current main-fork segment only; FSM/VM fork files are read from their block 0 */ + blkno = file_offset / BLCKSZ + (fork == MAIN_FORKNUM ? current_tde_file.segNo * RELSEG_SIZE : 0); + + pg_log_debug("re-encrypt block in %s, offset: %ld, blockNum: %u", current_tde_file.path, (long) file_offset, blkno); + tde_decrypt_smgr_block(current_tde_file.source_key, fork, blkno, buf, buf); + tde_encrypt_smgr_block(current_tde_file.target_key, fork, blkno, buf, buf); +} + +static void +create_tde_tmp_dir(void) +{ + if (mkdtemp(tde_tmp_source) == NULL) + pg_fatal("could not create temporary directory \"%s\": %m", tde_tmp_source); + + pg_log_debug("created temporary pg_tde directory: %s", tde_tmp_source); +} + +void +destroy_tde_tmp_dir(void) +{ + rmtree(tde_tmp_source, true); +} + +static void +write_file(const char *path, char *buf, size_t size) +{ + int fd; + + fd = open(path, O_WRONLY | O_CREAT | O_TRUNC | PG_BINARY, pg_file_create_mode); + if (fd < 0) + pg_fatal("could not create temporary tde file \"%s\": %m", path); + + if (write(fd, buf, size) != size) + pg_fatal("could not write temporary tde file \"%s\": %m", path); + + if (close(fd) != 0) + pg_fatal("could not close temporary tde file \"%s\": %m", path); +} + +void +write_tmp_source_file(const char *fname, char *buf, size_t size) +{ + char path[MAXPGPATH]; + + snprintf(path, MAXPGPATH, "%s/%s", tde_tmp_source, fname); + + write_file(path, buf, size); +} + +static void +copy_dir(const char *src, const char *dst) +{ + DIR *xldir; + struct dirent *xlde; + char src_path[MAXPGPATH]; + char dst_path[MAXPGPATH]; + + xldir = opendir(src); + if (xldir == NULL) +
pg_fatal("could not open directory \"%s\": %m", src); + + while (errno = 0, (xlde = readdir(xldir)) != NULL) + { + struct stat fst; + + if (strcmp(xlde->d_name, ".") == 0 || + strcmp(xlde->d_name, "..") == 0) + continue; + + snprintf(src_path, sizeof(src_path), "%s/%s", src, xlde->d_name); + snprintf(dst_path, sizeof(dst_path), "%s/%s", dst, xlde->d_name); + + if (lstat(src_path, &fst) < 0) + pg_fatal("could not stat file \"%s\": %m", src_path); + + if (S_ISREG(fst.st_mode)) + { + char *buf; + size_t size; + + buf = slurpFile(src, xlde->d_name, &size); + + write_file(dst_path, buf, size); + pg_free(buf); + } + } + + if (errno) + pg_fatal("could not read directory \"%s\": %m", src); + + if (closedir(xldir)) + pg_fatal("could not close directory \"%s\": %m", src); +} + +void +init_tde(void) +{ + source_has_tde = true; + create_tde_tmp_dir(); + atexit(destroy_tde_tmp_dir); +} + +void +copy_tmp_tde_files(const char *from) +{ + copy_dir(from, tde_tmp_source); +} + +void +fetch_tde_dir(void) +{ + char target_tde_dir[MAXPGPATH]; + + if (dry_run) + return; + + if (!source_has_tde) + return; + + snprintf(target_tde_dir, MAXPGPATH, "%s/%s", datadir_target, PG_TDE_DATA_DIR); + + rmtree(target_tde_dir, false); + copy_dir(tde_tmp_source, target_tde_dir); +} diff --git a/fetools/pg18/pg_rewind/tde_ops.h b/fetools/pg18/pg_rewind/tde_ops.h new file mode 100644 index 000000000..14a96991a --- /dev/null +++ b/fetools/pg18/pg_rewind/tde_ops.h @@ -0,0 +1,17 @@ +#ifndef PG_REWIND_TDE_FILE_H +#define PG_REWIND_TDE_FILE_H + +#include "common/relpath.h" + +extern void flush_current_tde_rel_key(void); +extern void ensure_tde_wal_seg(const char *relpath); +extern void ensure_tde_keys(const char *relpath); +extern void tde_reencrypt_block(unsigned char *buf, off_t file_offset, ForkNumber fork); + +extern void destroy_tde_tmp_dir(void); +extern void write_tmp_source_file(const char *fname, char *buf, size_t size); +extern void fetch_tde_dir(void); +extern void copy_tmp_tde_files(const char 
*from); +extern void init_tde(void); + +#endif /* PG_REWIND_TDE_FILE_H */ diff --git a/meson.build b/meson.build index b35347a36..7836350a6 100644 --- a/meson.build +++ b/meson.build @@ -239,6 +239,7 @@ executable('pg_tde_rewind', fetools / 'pg_rewind/local_source.c', fetools / 'pg_rewind/parsexlog.c', fetools / 'pg_rewind/pg_rewind.c', + fetools / 'pg_rewind/tde_ops.c', fetools / 'pg_rewind/timeline.c', ), include_directories: incdirs, @@ -298,6 +299,11 @@ tap_tests = [ 't/pg_resetwal_corrupted.pl', 't/pg_rewind_basic.pl', 't/pg_rewind_databases.pl', + 't/pg_rewind_enc_copy_blocks.pl', + 't/pg_rewind_enc_ext_tablespace.pl', + 't/pg_rewind_enc_fsm.pl', + 't/pg_rewind_enc_keep_wal_seg.pl', + 't/pg_rewind_enc_unchanged_rel.pl', 't/pg_rewind_extrafiles.pl', 't/pg_rewind_growing_files.pl', 't/pg_rewind_keep_recycled_wals.pl', diff --git a/src/access/pg_tde_tdemap.c b/src/access/pg_tde_tdemap.c index 40f106d8a..e6b01bff6 100644 --- a/src/access/pg_tde_tdemap.c +++ b/src/access/pg_tde_tdemap.c @@ -82,8 +82,6 @@ static int pg_tde_open_file_basic(const char *tde_filename, int fileFlags, bool static int pg_tde_open_file_read(const char *tde_filename, bool ignore_missing, off_t *curr_pos); static void pg_tde_file_header_read(const char *tde_filename, int fd, TDEFileHeader *fheader, off_t *bytes_read); static bool pg_tde_read_one_map_entry(int fd, TDEMapEntry *map_entry, off_t *offset); - -#ifndef FRONTEND static void pg_tde_write_one_map_entry(int fd, const TDEMapEntry *map_entry, off_t *offset, const char *db_map_path); static int pg_tde_file_header_write(const char *tde_filename, int fd, const TDESignedPrincipalKeyInfo *signed_key_info, off_t *bytes_written); static void pg_tde_initialize_map_entry(TDEMapEntry *map_entry, const TDEPrincipalKey *principal_key, const RelFileLocator *rlocator, const InternalKey *rel_key_data); @@ -164,10 +162,15 @@ pg_tde_save_smgr_key(RelFileLocator rel, pg_tde_write_one_map_entry(fd, &write_entry, &write_offset, file_path); } +#ifdef 
FRONTEND + pfree(principal_key); +#endif + CloseTransientFile(fd); LWLockRelease(lock_pk); } +#ifndef FRONTEND const char * tde_sprint_key(InternalKey *k) { @@ -437,7 +440,6 @@ pg_tde_sign_principal_key_info(TDESignedPrincipalKeyInfo *signed_key_info, const signed_key_info->aead_tag, MAP_ENTRY_AEAD_TAG_SIZE); } -#ifndef FRONTEND static void pg_tde_initialize_map_entry(TDEMapEntry *map_entry, const TDEPrincipalKey *principal_key, const RelFileLocator *rlocator, const InternalKey *rel_key_data) { @@ -462,9 +464,7 @@ pg_tde_initialize_map_entry(TDEMapEntry *map_entry, const TDEPrincipalKey *princ map_entry->encrypted_key_data, map_entry->aead_tag, MAP_ENTRY_AEAD_TAG_SIZE); } -#endif -#ifndef FRONTEND static void pg_tde_write_one_map_entry(int fd, const TDEMapEntry *map_entry, off_t *offset, const char *db_map_path) { @@ -487,7 +487,6 @@ pg_tde_write_one_map_entry(int fd, const TDEMapEntry *map_entry, off_t *offset, *offset += bytes_written; } -#endif /* * Returns true if we find a valid match; e.g. type is not set to @@ -643,7 +642,6 @@ pg_tde_open_file_read(const char *tde_filename, bool ignore_missing, off_t *curr return fd; } -#ifndef FRONTEND /* * Open for write and Validate File Header: * header: {Format Version, Principal Key Name} @@ -677,7 +675,6 @@ pg_tde_open_file_write(const char *tde_filename, const TDESignedPrincipalKeyInfo *curr_pos = bytes_read + bytes_written; return fd; } -#endif /* * Read TDE file header from a TDE file and fill in the fheader data structure. @@ -701,7 +698,6 @@ pg_tde_file_header_read(const char *tde_filename, int fd, TDEFileHeader *fheader } } -#ifndef FRONTEND /* * Write TDE file header to a TDE file. 
*/ @@ -734,7 +730,6 @@ pg_tde_file_header_write(const char *tde_filename, int fd, const TDESignedPrinci return fd; } -#endif /* * Returns true if a map entry if found or false if we have reached the end of @@ -890,6 +885,9 @@ pg_tde_get_smgr_key(RelFileLocator rel) errmsg("length \"%u\" of principal key \"%s\" does not match the length \"%d\" of the internal key", principal_key->keyLength, principal_key->keyInfo.name, rel_key->key_len), errhint("Create a new principal key and set it instead of the current one.")); } +#ifdef FRONTEND + pfree(principal_key); +#endif return rel_key; } diff --git a/src/access/pg_tde_xlog_keys.c b/src/access/pg_tde_xlog_keys.c index 67c2b3724..50d8d91f5 100644 --- a/src/access/pg_tde_xlog_keys.c +++ b/src/access/pg_tde_xlog_keys.c @@ -23,8 +23,6 @@ #include "pg_tde_fe.h" #endif -#define PG_TDE_WAL_KEY_FILE_NAME "wal_keys" - typedef struct WalKeyFileHeader { int32 file_version; @@ -69,17 +67,6 @@ static int pg_tde_wal_key_file_header_write(const char *filename, int fd, const static void pg_tde_write_one_wal_key_file_entry(int fd, const WalKeyFileEntry *entry, off_t *offset, const char *db_map_path); static void pg_tde_write_wal_key_file_entry(const WalEncryptionRange *range, const TDEPrincipalKey *principal_key); -static const char * -get_wal_key_file_path(void) -{ - static char wal_key_file_path[MAXPGPATH] = ""; - - if (strlen(wal_key_file_path) == 0) - snprintf(wal_key_file_path, MAXPGPATH, "%s/" PG_TDE_WAL_KEY_FILE_NAME, pg_tde_get_data_dir()); - - return wal_key_file_path; -} - void pg_tde_free_wal_key_cache(void) { diff --git a/src/catalog/tde_principal_key.c b/src/catalog/tde_principal_key.c index fa7a4dd89..244783b32 100644 --- a/src/catalog/tde_principal_key.c +++ b/src/catalog/tde_principal_key.c @@ -939,6 +939,13 @@ pg_tde_get_key_info(PG_FUNCTION_ARGS, Oid dbOid) * Process-local cache for the server (GLOBAL_DATA_TDE_OID) principal key. 
*/ static TDEPrincipalKey *fe_server_principal_key_cache = NULL; + +void +clean_fe_server_principal_key_cache(void) +{ + pfree(fe_server_principal_key_cache); + fe_server_principal_key_cache = NULL; +} #endif /* FRONTEND */ /* diff --git a/src/common/pg_tde_utils.c b/src/common/pg_tde_utils.c index ce2beaf3a..61ddce35c 100644 --- a/src/common/pg_tde_utils.c +++ b/src/common/pg_tde_utils.c @@ -1,5 +1,7 @@ #include "postgres.h" +#include "access/pg_tde_xlog_keys.h" +#include "catalog/tde_principal_key.h" #include "common/pg_tde_utils.h" #include "pg_tde.h" @@ -39,16 +41,44 @@ pg_tde_is_encrypted(PG_FUNCTION_ARGS) #endif /* !FRONTEND */ static char tde_data_dir[MAXPGPATH] = PG_TDE_DATA_DIR; +static char wal_key_file_path[MAXPGPATH] = ""; + +#ifdef FRONTEND +/* + * Changes TDE data dir (keys location) and resets necessary caches. + * + * Currently, only frontend tools can change this. For backend it is always + * in PGDATA. + */ void pg_tde_set_data_dir(const char *dir) { Assert(dir != NULL); + + memset(tde_data_dir, 0, sizeof(tde_data_dir)); strlcpy(tde_data_dir, dir, sizeof(tde_data_dir)); + + memset(wal_key_file_path, 0, sizeof(wal_key_file_path)); + snprintf(wal_key_file_path, MAXPGPATH, "%s/" PG_TDE_WAL_KEY_FILE_NAME, tde_data_dir); + + /* New dir, new keys.
Reset caches */ + pg_tde_free_wal_key_cache(); + clean_fe_server_principal_key_cache(); } +#endif const char * pg_tde_get_data_dir(void) { return tde_data_dir; } + +const char * +get_wal_key_file_path(void) +{ + if (strlen(wal_key_file_path) == 0) + snprintf(wal_key_file_path, MAXPGPATH, "%s/" PG_TDE_WAL_KEY_FILE_NAME, tde_data_dir); + + return wal_key_file_path; +} diff --git a/src/encryption/enc_tde.c b/src/encryption/enc_tde.c index 495e9cb90..9de1ca29f 100644 --- a/src/encryption/enc_tde.c +++ b/src/encryption/enc_tde.c @@ -141,3 +141,77 @@ pg_tde_stream_crypt(const char *iv_prefix, batch_no++; } } + +/* + * The initialization vector of a block is its block number converted to a + * 128 bit big endian number plus the forknumber XOR the base IV of the + * relation file. + */ +static void +CalcBlockIv(ForkNumber forknum, BlockNumber bn, const unsigned char *base_iv, unsigned char *iv) +{ + memset(iv, 0, 16); + + /* The init fork is copied to the main fork so we must use the same IV */ + iv[7] = forknum == INIT_FORKNUM ? MAIN_FORKNUM : forknum; + + iv[12] = bn >> 24; + iv[13] = bn >> 16; + iv[14] = bn >> 8; + iv[15] = bn; + + for (int i = 0; i < 16; i++) + iv[i] ^= base_iv[i]; +} + +/* + * Decrypts one BLCKSZ-byte relation block from "in" into "out"; both may + * point to the same buffer. All-zero pages are stored unencrypted, so they + * are passed through unchanged instead of being decrypted. + */ +void +tde_decrypt_smgr_block(InternalKey *relKey, ForkNumber forknum, BlockNumber blocknum, const unsigned char *in, unsigned char *out) +{ + unsigned char iv[16]; + bool allZero = true; + + /* + * Detect unencrypted all-zero pages written by smgrzeroextend() by + * looking at the first 32 bytes of the page. + * + * Not encrypting all-zero pages is safe because they are only written at + * the end of the file when extending a table on disk so they tend to be + * short lived plus they only leak a slightly more accurate table size + * than one can glean from just the file size.
+ */ + for (int i = 0; i < 32; ++i) + { + if (in[i] != 0) + { + allZero = false; + break; + } + } + + if (allZero) + { + /* Callers with a separate output buffer still need the page there */ + if (out != in) + memcpy(out, in, BLCKSZ); + return; + } + + CalcBlockIv(forknum, blocknum, relKey->base_iv, iv); + + AesDecrypt(relKey->key, relKey->key_len, iv, in, BLCKSZ, out); +} + +void +tde_encrypt_smgr_block(InternalKey *relKey, ForkNumber forknum, BlockNumber blocknum, const unsigned char *in, unsigned char *out) +{ + unsigned char iv[16]; + + CalcBlockIv(forknum, blocknum, relKey->base_iv, iv); + + AesEncrypt(relKey->key, relKey->key_len, iv, in, BLCKSZ, out); +} diff --git a/src/include/catalog/tde_principal_key.h b/src/include/catalog/tde_principal_key.h index a3294c73b..fe37013c8 100644 --- a/src/include/catalog/tde_principal_key.h +++ b/src/include/catalog/tde_principal_key.h @@ -47,6 +47,8 @@ extern bool pg_tde_principal_key_configured(Oid databaseId); extern TDEPrincipalKey *GetPrincipalKey(Oid dbOid, LWLockMode lockMode); #else extern TDEPrincipalKey *GetPrincipalKey(Oid dbOid, void *lockMode); + +extern void clean_fe_server_principal_key_cache(void); #endif extern void xl_tde_perform_rotate_key(XLogPrincipalKeyRotate *xlrec); diff --git a/src/include/common/pg_tde_utils.h b/src/include/common/pg_tde_utils.h index f91b53f1c..c1c4dbfad 100644 --- a/src/include/common/pg_tde_utils.h +++ b/src/include/common/pg_tde_utils.h @@ -3,5 +3,6 @@ extern void pg_tde_set_data_dir(const char *dir); extern const char *pg_tde_get_data_dir(void); +extern const char *get_wal_key_file_path(void); #endif /* PG_TDE_UTILS_H */ diff --git a/src/include/encryption/enc_tde.h b/src/include/encryption/enc_tde.h index a3c85a5a8..9b93079e7 100644 --- a/src/include/encryption/enc_tde.h +++ b/src/include/encryption/enc_tde.h @@ -5,6 +5,9 @@ #ifndef ENC_TDE_H #define ENC_TDE_H +#include "common/relpath.h" +#include "storage/block.h" + #define TDE_KEY_NAME_LEN 256 #define KEY_DATA_SIZE_128 16 /* 128 bit encryption */ #define KEY_DATA_SIZE_256 32 /* 256 bit encryption */ @@ -38,4 +41,10 @@ extern void
pg_tde_stream_crypt(const char *iv_prefix, int key_len, void **ctxPtr); +extern void tde_decrypt_smgr_block(InternalKey *relKey, ForkNumber forknum, + BlockNumber blocknum, const unsigned char *in, + unsigned char *out); +extern void tde_encrypt_smgr_block(InternalKey *relKey, ForkNumber forknum, + BlockNumber blocknum, const unsigned char *in, + unsigned char *out); #endif /* ENC_TDE_H */ diff --git a/src/include/pg_tde.h b/src/include/pg_tde.h index ad3101d09..c61b7576f 100644 --- a/src/include/pg_tde.h +++ b/src/include/pg_tde.h @@ -6,6 +6,7 @@ #define PG_TDE_VERSION_STRING PG_TDE_NAME " " PG_TDE_VERSION #define PG_TDE_DATA_DIR "pg_tde" +#define PG_TDE_WAL_KEY_FILE_NAME "wal_keys" #define TDE_TRANCHE_NAME "pg_tde_tranche" diff --git a/src/smgr/pg_tde_smgr.c b/src/smgr/pg_tde_smgr.c index f0a91ac57..06daa425f 100644 --- a/src/smgr/pg_tde_smgr.c +++ b/src/smgr/pg_tde_smgr.c @@ -77,7 +77,6 @@ static void tde_smgr_save_temp_key(const RelFileLocator *newrlocator, const Inte static InternalKey *tde_smgr_get_temp_key(const RelFileLocator *rel); static bool tde_smgr_has_temp_key(const RelFileLocator *rel); static void tde_smgr_delete_temp_key(const RelFileLocator *rel); -static void CalcBlockIv(ForkNumber forknum, BlockNumber bn, const unsigned char *base_iv, unsigned char *iv); static void tde_smgr_log_create_key(const RelFileLocator *rlocator) @@ -261,13 +260,10 @@ tde_mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, for (int i = 0; i < nblocks; ++i) { BlockNumber bn = blocknum + i; - unsigned char iv[16]; local_buffers[i] = &local_blocks[i * BLCKSZ]; - CalcBlockIv(forknum, bn, tdereln->relKey.base_iv, iv); - - AesEncrypt(tdereln->relKey.key, tdereln->relKey.key_len, iv, ((unsigned char **) buffers)[i], BLCKSZ, local_buffers[i]); + tde_encrypt_smgr_block(&tdereln->relKey, forknum, bn, ((unsigned char **) buffers)[i], local_buffers[i]); } mdwritev(reln, forknum, blocknum, @@ -320,11 +316,8 @@ tde_mdextend(SMgrRelation reln, ForkNumber forknum, 
BlockNumber blocknum, else { unsigned char *local_blocks = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); - unsigned char iv[16]; - - CalcBlockIv(forknum, blocknum, tdereln->relKey.base_iv, iv); - AesEncrypt(tdereln->relKey.key, tdereln->relKey.key_len, iv, ((unsigned char *) buffer), BLCKSZ, local_blocks); + tde_encrypt_smgr_block(&tdereln->relKey, forknum, blocknum, ((unsigned char *) buffer), local_blocks); mdextend(reln, forknum, blocknum, local_blocks, skipFsync); @@ -347,33 +340,10 @@ tde_mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, for (int i = 0; i < nblocks; ++i) { - bool allZero = true; BlockNumber bn = blocknum + i; - unsigned char iv[16]; + unsigned char *buf = ((unsigned char **) buffers)[i]; - /* - * Detect unencrypted all-zero pages written by smgrzeroextend() by - * looking at the first 32 bytes of the page. - * - * Not encrypting all-zero pages is safe because they are only written - * at the end of the file when extending a table on disk so they tend - * to be short lived plus they only leak a slightly more accurate - * table size than one can glean from just the file size. - */ - for (int j = 0; j < 32; ++j) - { - if (((char **) buffers)[i][j] != 0) - { - allZero = false; - break; - } - } - if (allZero) - continue; - - CalcBlockIv(forknum, bn, tdereln->relKey.base_iv, iv); - - AesDecrypt(tdereln->relKey.key, tdereln->relKey.key_len, iv, ((unsigned char **) buffers)[i], BLCKSZ, ((unsigned char **) buffers)[i]); + tde_decrypt_smgr_block(&tdereln->relKey, forknum, bn, buf, buf); } } @@ -511,36 +481,12 @@ tde_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data) { Buffer buf = io_data[buf_off]; char *buf_ptr = BufferGetBlock(buf); - bool allZero = true; BlockNumber bn = td->smgr.blockNum + buf_off; - unsigned char iv[16]; if (prior_result.result <= buf_off) break; - /* - * Detect unencrypted all-zero pages written by smgrzeroextend() by - * looking at the first 32 bytes of the page. 
- * - * Not encrypting all-zero pages is safe because they are only written - * at the end of the file when extending a table on disk so they tend - * to be short lived plus they only leak a slightly more accurate - * table size than one can glean from just the file size. - */ - for (int i = 0; i < 32; i++) - { - if (buf_ptr[i] != 0) - { - allZero = false; - break; - } - } - if (allZero) - continue; - - CalcBlockIv(td->smgr.forkNum, bn, int_key->base_iv, iv); - - AesDecrypt(int_key->key, int_key->key_len, iv, ((unsigned char *) buf_ptr), BLCKSZ, ((unsigned char *) buf_ptr)); + tde_decrypt_smgr_block(int_key, td->smgr.forkNum, bn, ((unsigned char *) buf_ptr), ((unsigned char *) buf_ptr)); } return prior_result; @@ -716,25 +662,3 @@ tde_smgr_delete_temp_key(const RelFileLocator *rel) Assert(TempRelKeys); hash_search(TempRelKeys, rel, HASH_REMOVE, NULL); } - -/* - * The intialization vector of a block is its block number conmverted to a - * 128 bit big endian number plus the forknumber XOR the base IV of the - * relation file. - */ -static void -CalcBlockIv(ForkNumber forknum, BlockNumber bn, const unsigned char *base_iv, unsigned char *iv) -{ - memset(iv, 0, 16); - - /* The init fork is copied to the main fork so we must use the same IV */ - iv[7] = forknum == INIT_FORKNUM ? 
MAIN_FORKNUM : forknum; - - iv[12] = bn >> 24; - iv[13] = bn >> 16; - iv[14] = bn >> 8; - iv[15] = bn; - - for (int i = 0; i < 16; i++) - iv[i] ^= base_iv[i]; -} diff --git a/t/RewindTest.pm b/t/RewindTest.pm index e2eb2792e..33a813764 100644 --- a/t/RewindTest.pm +++ b/t/RewindTest.pm @@ -165,9 +165,20 @@ shared_preload_libraries = 'pg_tde' "SELECT pg_tde_set_server_key_using_global_key_provider('global-db-principal-key', 'file-keyring-wal');" ); + $node_primary->safe_psql('postgres', + "SELECT pg_tde_add_database_key_provider_file('file-keyring','${tde_keyring_file}');" + ); + $node_primary->safe_psql('postgres', + "SELECT pg_tde_create_key_using_database_key_provider('test-db-key', 'file-keyring');" + ); + $node_primary->safe_psql('postgres', + "SELECT pg_tde_set_key_using_database_key_provider('test-db-key', 'file-keyring');" + ); + $node_primary->append_conf( 'postgresql.conf', q{ pg_tde.wal_encrypt = on +default_table_access_method='tde_heap' }); $node_primary->stop; @@ -201,13 +212,20 @@ sub start_primary sub create_standby { - my $extra_name = shift; + my ($extra_name, %params) = @_; + + my @backup_options = + exists $params{backup_options} ? @{ $params{backup_options} } : (); $node_standby = PostgreSQL::Test::Cluster->new( 'standby' . ($extra_name ? 
"_${extra_name}" : '')); - PGTDE::backup($node_primary, 'my_backup'); - $node_standby->init_from_backup($node_primary, 'my_backup'); + + PGTDE::backup($node_primary, 'my_backup', + backup_options => [@backup_options]); + + $node_standby->init_from_backup($node_primary, 'my_backup', + tablespace_map => $params{tablespace_map}); my $connstr_primary = $node_primary->connstr(); $node_standby->append_conf( @@ -215,6 +233,11 @@ sub create_standby primary_conninfo='$connstr_primary' )); + foreach my $param_item (@{ $params{extra_conf} // [] }) + { + $node_standby->append_conf('postgresql.conf', qq($param_item)); + } + $node_standby->set_standby_mode(); # Start standby diff --git a/t/pg_rewind_databases.pl b/t/pg_rewind_databases.pl index 755ea80e3..d5e526d0f 100644 --- a/t/pg_rewind_databases.pl +++ b/t/pg_rewind_databases.pl @@ -20,24 +20,25 @@ sub run_test # Create a database in primary with a table. primary_psql('CREATE DATABASE inprimary'); - primary_psql('CREATE TABLE inprimary_tab (a int)', 'inprimary'); + primary_psql('CREATE TABLE inprimary_tab (a int) USING heap', + 'inprimary'); RewindTest::create_standby($test_mode); # Create another database with another table, the creation is # replicated to the standby. primary_psql('CREATE DATABASE beforepromotion'); - primary_psql('CREATE TABLE beforepromotion_tab (a int)', + primary_psql('CREATE TABLE beforepromotion_tab (a int) USING heap', 'beforepromotion'); RewindTest::promote_standby(); # Create databases in the old primary and the new promoted standby. primary_psql('CREATE DATABASE primary_afterpromotion'); - primary_psql('CREATE TABLE primary_promotion_tab (a int)', + primary_psql('CREATE TABLE primary_promotion_tab (a int) USING heap', 'primary_afterpromotion'); standby_psql('CREATE DATABASE standby_afterpromotion'); - standby_psql('CREATE TABLE standby_promotion_tab (a int)', + standby_psql('CREATE TABLE standby_promotion_tab (a int) USING heap', 'standby_afterpromotion'); # The clusters are now diverged. 
diff --git a/t/pg_rewind_enc_copy_blocks.pl b/t/pg_rewind_enc_copy_blocks.pl new file mode 100644 index 000000000..9f9b133d3 --- /dev/null +++ b/t/pg_rewind_enc_copy_blocks.pl @@ -0,0 +1,84 @@ +# Tests the scenario when only changed blocks of the encrypted relation are +# copied from the source, hence mixes data encrypted with different keys. So +# we check if pg_tde_rewind handles that properly, and the data is re-encrypted +# correctly. +# +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use RewindTest; + +sub run_test +{ + my $test_mode = shift; + my $extra_name = shift; + my $extra_conf = shift; + + my $cluster_name = $test_mode; + + $cluster_name = $cluster_name . $extra_name if defined $extra_name; + + RewindTest::setup_cluster($cluster_name, [], $extra_conf); + RewindTest::start_primary(); + RewindTest::create_standby($cluster_name); + + primary_psql( + "CREATE TABLE tail_t (id INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY, f1 TEXT) USING tde_heap" + ); + primary_psql( + "INSERT INTO tail_t (f1) SELECT repeat('abcdeF', 1000) FROM generate_series(1, 1000)" + ); + primary_psql( + "CREATE TABLE block_t (id INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY, f1 TEXT) USING tde_heap" + ); + primary_psql( + "INSERT INTO block_t (f1) SELECT repeat('abcdeF', 1000) FROM generate_series(1, 1000)" + ); + primary_psql("CHECKPOINT"); + + RewindTest::promote_standby(); + + # Makes pg_rewind to copy some blocks of the relation + # (mixing data encrypted with different keys on the target). + primary_psql("UPDATE block_t SET f1='YYYYYYY' WHERE id % 10 = 0;"); + + # Insert some data making rewind to copy the tail of this relation + # (mixing data encrypted with different keys on the target). 
+ standby_psql( + "INSERT INTO tail_t (f1) SELECT repeat('ghijk', 100) FROM generate_series(1, 1000)" + ); + standby_psql("CHECKPOINT"); + + + RewindTest::run_pg_rewind($test_mode); + + check_query( + 'SELECT count(*) FROM tail_t', + qq(2000 +), + 'tail-copy'); + + check_query( + 'SELECT count(*) FROM block_t', + qq(1000 +), + 'blocks-copy'); + + RewindTest::clean_rewind_test(); + return; +} + +# Run the test in both modes +run_test('local'); +run_test('remote'); +run_test('archive'); + +my @conf_params = ("pg_tde.cipher = 'aes_256'"); +run_test('local', "_aes_256", \@conf_params); + +done_testing(); diff --git a/t/pg_rewind_enc_ext_tablespace.pl b/t/pg_rewind_enc_ext_tablespace.pl new file mode 100644 index 000000000..8ae3ecbe0 --- /dev/null +++ b/t/pg_rewind_enc_ext_tablespace.pl @@ -0,0 +1,98 @@ +# Check encrypted relations in external tablespaces +# +use strict; +use warnings FATAL => 'all'; +use File::Path qw(rmtree make_path); +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use RewindTest; + +sub run_test +{ + my $test_mode = shift; + my $extra_name = shift; + my $extra_conf = shift; + + my $cluster_name = $test_mode; + + my $tempdir = PostgreSQL::Test::Utils::tempdir_short(); + + # Tablespace directories live inside the per-test temporary directory. + my $primary_tblspc = $tempdir . '/tblsp_primary'; + my $primary_tblspc_bcp = $tempdir . '/tblsp_primary_bcp'; + my $standby_tblspc = $tempdir . '/tblsp_standby'; + + $cluster_name = $cluster_name .
$extra_name if defined $extra_name; + + mkdir($primary_tblspc) || die "mkdir $primary_tblspc: $!"; + + RewindTest::setup_cluster($cluster_name, [], $extra_conf); + RewindTest::start_primary(); + + primary_psql("CREATE TABLESPACE ts1 LOCATION '$primary_tblspc'"); + + RewindTest::create_standby( + $cluster_name, + backup_options => + [ '--tablespace-mapping', "$primary_tblspc=$primary_tblspc_bcp" ], + tablespace_map => { 16433 => $standby_tblspc }); + + primary_psql( + "CREATE TABLE tail_t (id INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY, f1 TEXT) USING tde_heap TABLESPACE ts1" + ); + primary_psql( + "INSERT INTO tail_t (f1) SELECT repeat('abcdeF', 1000) FROM generate_series(1, 1000)" + ); + primary_psql( + "CREATE TABLE block_t (id INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY, f1 TEXT) USING tde_heap TABLESPACE ts1" + ); + primary_psql( + "INSERT INTO block_t (f1) SELECT repeat('abcdeF', 1000) FROM generate_series(1, 1000)" + ); + primary_psql("CHECKPOINT"); + + RewindTest::promote_standby(); + + # Makes pg_rewind to copy some blocks of the relation + # (mixing data encrypted with different keys on the target). + primary_psql("UPDATE block_t SET f1='YYYYYYY' WHERE id % 10 = 0;"); + + # Insert some data making rewind to copy the tail of this relation + # (mixing data encrypted with different keys on the target). 
+ standby_psql( + "INSERT INTO tail_t (f1) SELECT repeat('ghijk', 100) FROM generate_series(1, 1000)" + ); + standby_psql("CHECKPOINT"); + + + RewindTest::run_pg_rewind($test_mode); + + check_query( + 'SELECT count(*) FROM tail_t', + qq(2000 +), + 'tail-copy'); + + check_query( + 'SELECT count(*) FROM block_t', + qq(1000 +), + 'blocks-copy'); + + RewindTest::clean_rewind_test(); + return; +} + +# Run the test in both modes +run_test('local'); +run_test('remote'); +run_test('archive'); + +my @conf_params = ("pg_tde.cipher = 'aes_256'"); +run_test('local', "_aes_256", \@conf_params); + +done_testing(); diff --git a/t/pg_rewind_enc_fsm.pl b/t/pg_rewind_enc_fsm.pl new file mode 100644 index 000000000..c042a011b --- /dev/null +++ b/t/pg_rewind_enc_fsm.pl @@ -0,0 +1,65 @@ +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use RewindTest; + +sub run_test +{ + my $test_mode = shift; + my $extra_name = shift; + my $extra_conf = shift; + + my $cluster_name = $test_mode; + + $cluster_name = $cluster_name . 
$extra_name if defined $extra_name; + + RewindTest::setup_cluster($cluster_name, [], $extra_conf); + RewindTest::start_primary(); + RewindTest::create_standby($cluster_name); + + primary_psql( + "CREATE TABLE tbl1 (id INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY, f1 TEXT) USING tde_heap" + ); + primary_psql( + "INSERT INTO tbl1 (f1) SELECT repeat('abcdeF', 1000) FROM generate_series(1, 1000)" + ); + primary_psql("CHECKPOINT"); + + RewindTest::promote_standby(); + + # Trigger updated blocks in FSM + standby_psql("DELETE FROM tbl1 WHERE id % 15 = 0;"); + standby_psql( + "INSERT INTO tbl1 (f1) SELECT repeat('ghijk', 100) FROM generate_series(1, 1000)" + ); + + + RewindTest::run_pg_rewind($test_mode); + + ok(!$RewindTest::node_primary->log_contains('; zeroing out page'), + 'verify there are no corrupted _fsm relations'); + + check_query( + 'SELECT count(*) FROM tbl1', + qq(1934 +), + 'check table'); + + RewindTest::clean_rewind_test(); + return; +} + +# Run the test in both modes +run_test('local'); +run_test('remote'); +run_test('archive'); + +my @conf_params = ("pg_tde.cipher = 'aes_256'"); +run_test('local', "_aes_256", \@conf_params); + +done_testing(); diff --git a/t/pg_rewind_enc_keep_wal_seg.pl b/t/pg_rewind_enc_keep_wal_seg.pl new file mode 100644 index 000000000..3ebf54096 --- /dev/null +++ b/t/pg_rewind_enc_keep_wal_seg.pl @@ -0,0 +1,100 @@ +# Make rewind keep some WAL segments on the target and make the archive +# restore_command generate new WAL keys while creating the replica. So the kept +# WAL segments should be re-encrypted by rewind. +# +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use RewindTest; + +sub run_test +{ + my $test_mode = shift; + my $extra_name = shift; + my $extra_conf = shift; + + my $cluster_name = $test_mode; + + my $tempdir = PostgreSQL::Test::Utils::tempdir_short(); + + my $archive_dir = $tempdir . '/archive'; + + $cluster_name = $cluster_name .
$extra_name if defined $extra_name; + + mkdir($archive_dir) || die "mkdir $archive_dir: $!"; + + push @$extra_conf, "wal_level=replica"; + push @$extra_conf, "archive_mode=on"; + push @$extra_conf, + "archive_command='pg_tde_archive_decrypt %f %p \"cp %%p $archive_dir/%%f\"'"; + push @$extra_conf, + "restore_command='pg_tde_restore_encrypt %f %p \"cp $archive_dir/%%f %%p\"'"; + + RewindTest::setup_cluster($cluster_name, [], $extra_conf); + RewindTest::start_primary(); + + $RewindTest::node_primary->stop; + $RewindTest::node_primary->start; + + primary_psql( + "CREATE TABLE tail_t (id INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY, f1 TEXT) USING tde_heap" + ); + primary_psql( + "INSERT INTO tail_t (f1) SELECT repeat('abcdeF', 1000) FROM generate_series(1, 1000)" + ); + primary_psql( + "CREATE TABLE block_t (id INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY, f1 TEXT) USING tde_heap" + ); + primary_psql( + "INSERT INTO block_t (f1) SELECT repeat('abcdeF', 1000) FROM generate_series(1, 1000)" + ); + primary_psql("CHECKPOINT"); + + RewindTest::create_standby($cluster_name); + + RewindTest::promote_standby(); + + # # Makes pg_rewind to copy some blocks of the relation + # # (mixing data encrypted with different keys on the target). + primary_psql("UPDATE block_t SET f1='YYYYYYY' WHERE id % 10 = 0;"); + + # Insert some data making rewind to copy the tail of this relation + # (mixing data encrypted with different keys on the target). 
+ standby_psql( + "INSERT INTO tail_t (f1) SELECT repeat('ghijk', 100) FROM generate_series(1, 1000)" + ); + standby_psql("CHECKPOINT"); + + + RewindTest::run_pg_rewind($test_mode); + + check_query( + 'SELECT count(*) FROM tail_t', + qq(2000 +), + 'tail-copy'); + + check_query( + 'SELECT count(*) FROM block_t', + qq(1000 +), + 'blocks-copy'); + + RewindTest::clean_rewind_test(); + return; +} + +# Run the test in both modes +run_test('local'); +run_test('remote'); +run_test('archive'); + +my @conf_params = ("pg_tde.cipher = 'aes_256'"); +run_test('local', "_aes_256", \@conf_params); + +done_testing(); diff --git a/t/pg_rewind_enc_unchanged_rel.pl b/t/pg_rewind_enc_unchanged_rel.pl new file mode 100644 index 000000000..afd5c0bab --- /dev/null +++ b/t/pg_rewind_enc_unchanged_rel.pl @@ -0,0 +1,65 @@ + +# Copyright (c) 2021-2024, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use RewindTest; + +sub run_test +{ + my $test_mode = shift; + my $extra_name = shift; + my $extra_conf = shift; + + my $cluster_name = $test_mode; + + $cluster_name = $cluster_name . $extra_name if defined $extra_name; + + RewindTest::setup_cluster($cluster_name, [], $extra_conf); + RewindTest::start_primary(); + RewindTest::create_standby($cluster_name); + + primary_psql( + "CREATE TABLE tbl (id INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY, f1 TEXT) USING tde_heap" + ); + primary_psql( + "INSERT INTO tbl (f1) SELECT repeat('abcdeF', 1000) FROM generate_series(1, 1000)" + ); + primary_psql("CHECKPOINT"); + + RewindTest::promote_standby(); + + + # Makes an index relation to remain unchanged on target. 
So test that we + # preserve a target's internal key for this rel + standby_psql("CHECKPOINT"); + + + RewindTest::run_pg_rewind($test_mode); + + check_query( + 'SELECT count(*) FROM tbl', + qq(1000 +), + 'read-unchanged'); + + + RewindTest::clean_rewind_test(); + return; +} + +# Run the test in both modes +run_test('local'); +run_test('remote'); +run_test('archive'); + +my @conf_params = ("pg_tde.cipher = 'aes_256'"); +run_test('local', "_aes_256", \@conf_params); + +done_testing(); diff --git a/t/wal_key_tli.pl b/t/wal_key_tli.pl index c9ba70798..7731ef401 100644 --- a/t/wal_key_tli.pl +++ b/t/wal_key_tli.pl @@ -23,7 +23,8 @@ sub run_test # Create a database in primary with a table. primary_psql('CREATE DATABASE inprimary'); - primary_psql('CREATE TABLE inprimary_tab (a int)', 'inprimary'); + primary_psql('CREATE TABLE inprimary_tab (a int) USING heap', + 'inprimary'); RewindTest::create_standby($test_mode); @@ -35,17 +36,17 @@ sub run_test # Create another database with another table, the creation is # replicated to the standby. primary_psql('CREATE DATABASE beforepromotion'); - primary_psql('CREATE TABLE beforepromotion_tab (a int)', + primary_psql('CREATE TABLE beforepromotion_tab (a int) USING heap', 'beforepromotion'); RewindTest::promote_standby(); # Create databases in the old primary and the new promoted standby. primary_psql('CREATE DATABASE primary_afterpromotion'); - primary_psql('CREATE TABLE primary_promotion_tab (a int)', + primary_psql('CREATE TABLE primary_promotion_tab (a int) USING heap', 'primary_afterpromotion'); standby_psql('CREATE DATABASE standby_afterpromotion'); - standby_psql('CREATE TABLE standby_promotion_tab (a int)', + standby_psql('CREATE TABLE standby_promotion_tab (a int) USING heap', 'standby_afterpromotion'); # The clusters are now diverged.