From 1db2e6310275d6dbf2efa01e712281257ea1fc86 Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Wed, 30 Dec 2020 20:26:52 -0800
Subject: [PATCH] aio: wal: padding of partial records.

Author:
Reviewed-By:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/backend/access/rmgrdesc/xlogdesc.c |   4 +
 src/backend/access/transam/xlog.c      | 168 +++++++++++++++++++++++++
 src/backend/storage/buffer/bufmgr.c    |   1 +
 src/backend/utils/misc/guc.c           |   9 ++
 src/include/catalog/pg_control.h       |   1 +
 src/include/storage/bufmgr.h           |   1 +
 6 files changed, 184 insertions(+)

diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 92cc7ea073..a876915fa1 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -140,6 +140,10 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 						 xlrec.ThisTimeLineID, xlrec.PrevTimeLineID,
 						 timestamptz_to_str(xlrec.end_time));
 	}
+	else if (info == XLOG_WASTE)
+	{
+		appendStringInfo(buf, "waste");
+	}
 }
 
 const char *
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 37b0cf32a4..cd7ad390de 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -1513,6 +1513,126 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
 	return true;
 }
 
+/*
+ * Pad the WAL with a single no-op record so that the insert position reaches
+ * at least 'upto', which must be a multiple of XLOG_BLCKSZ.
+ *
+ * If the insert position is already at or beyond 'upto', nothing is inserted,
+ * *final_pad is set to InvalidXLogRecPtr and false is returned.  Otherwise a
+ * record whose payload is copied from XLogCtl->zerobuf is inserted,
+ * *final_pad is set to the end of that record and true is returned.
+ *
+ * The record is never smaller than a record header plus a long data header,
+ * so it can end beyond 'upto' when less room than that was left.
+ *
+ * NOTE(review): the record is emitted with xl_info = XLOG_NOOP although this
+ * patch also introduces XLOG_WASTE; confirm which of the two is intended.
+ */
+static bool
+PadPartialPage(XLogRecPtr upto, XLogRecPtr *final_pad)
+{
+	XLogCtlInsert *Insert = &XLogCtl->Insert;
+	bool		padded = false;
+	uint64		startbytepos;
+	uint64		endbytepos = 0;
+	uint64		prevbytepos = 0;
+	size_t		pad_size = 0;
+	XLogRecord	rechdr = {0};
+	pg_crc32c	rdata_crc;
+
+	rechdr.xl_xid = InvalidTransactionId;
+	rechdr.xl_info = XLOG_NOOP;
+	rechdr.xl_rmid = RM_XLOG_ID;
+
+	Assert(upto % XLOG_BLCKSZ == 0);
+
+	START_CRIT_SECTION();
+
+	WALInsertLockAcquire();
+
+	/* Reserve space for the padding record, if there is anything to pad. */
+	SpinLockAcquire(&Insert->insertpos_lck);
+
+	startbytepos = Insert->CurrBytePos;
+	if (startbytepos < XLogRecPtrToBytePos(upto))
+	{
+		padded = true;
+
+		/* never reserve less than the smallest record we can build below */
+		pad_size = upto - XLogBytePosToEndRecPtr(startbytepos);
+		pad_size = Max(pad_size,
+					   MAXALIGN64(SizeOfXLogRecord + SizeOfXLogRecordDataHeaderLong));
+
+		endbytepos = startbytepos + pad_size;
+		prevbytepos = Insert->PrevBytePos;
+		Insert->CurrBytePos = endbytepos;
+		Insert->PrevBytePos = startbytepos;
+	}
+
+	SpinLockRelease(&Insert->insertpos_lck);
+
+	if (padded)
+	{
+		XLogRecData recdata[4];
+		uint8		block_id = XLR_BLOCK_ID_DATA_LONG;
+		uint32		rdata_len;
+
+		rdata_len = pad_size - SizeOfXLogRecord - SizeOfXLogRecordDataHeaderLong;
+
+		rechdr.xl_prev = XLogBytePosToRecPtr(prevbytepos);
+		rechdr.xl_tot_len = pad_size;
+
+		/* record header */
+		recdata[0].data = (char *) &rechdr;
+		recdata[0].len = SizeOfXLogRecord;
+		recdata[0].next = &recdata[1];
+
+		/*
+		 * Long data header: one-byte block id followed by a four-byte length.
+		 * block_id must be a uint8; copying the first byte of a wider
+		 * integer would yield 0 on big-endian machines.
+		 */
+		recdata[1].data = (char *) &block_id;
+		recdata[1].len = sizeof(uint8);
+		recdata[1].next = &recdata[2];
+
+		recdata[2].data = (char *) &rdata_len;
+		recdata[2].len = sizeof(uint32);
+		recdata[2].next = &recdata[3];
+
+		/* payload */
+		recdata[3].data = XLogCtl->zerobuf;
+		recdata[3].len = rdata_len;
+		recdata[3].next = NULL;
+
+		/* CRC covers the data first, then the header up to xl_crc */
+		INIT_CRC32C(rdata_crc);
+		for (XLogRecData *rdt = &recdata[1]; rdt != NULL; rdt = rdt->next)
+			COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
+		COMP_CRC32C(rdata_crc, &rechdr, offsetof(XLogRecord, xl_crc));
+		FIN_CRC32C(rdata_crc);
+		rechdr.xl_crc = rdata_crc;
+
+		CopyXLogRecordToWAL(rechdr.xl_tot_len, false, recdata,
+							XLogBytePosToRecPtr(startbytepos),
+							XLogBytePosToEndRecPtr(endbytepos));
+	}
+
+	WALInsertLockRelease();
+
+	END_CRIT_SECTION();
+
+	if (!padded)
+	{
+		*final_pad = InvalidXLogRecPtr;
+		return false;
+	}
+
+	pgWalUsage.wal_bytes += pad_size;
+	*final_pad = XLogBytePosToEndRecPtr(endbytepos);
+	return true;
+}
+
 /*
  * Checks whether the current buffer page and backup page stored in the
  * WAL record are consistent or not. Before comparing the two pages, a
@@ -3092,6 +3212,54 @@ XLogWriteIssueWrites(XLogWritePos *write_pos, bool flexible)
 #endif
 	}
 
+	// FIXME: need to figure out how to not cause problems during
+	// shutdown checkpoints etc.
+	if (ispartialpage && io_wal_pad_partial && XLogInsertAllowed())
+	{
+		XLogRecPtr pad_upto;
+		XLogRecPtr final_pad;
+
+		//elog(DEBUG1, "WALWriteLock padd");
+		LWLockRelease(WALWriteLock);
+
+		pgaio_submit_pending(true);
+
+		pad_upto = newinsertpos - newinsertpos % XLOG_BLCKSZ + XLOG_BLCKSZ;
+
+		if (0)
+		{
+			XLogRecPtr insert_lsn = XLogBytePosToRecPtr(XLogCtl->Insert.CurrBytePos);
+
+			elog(DEBUG1, "for min %X/%X, pad %X/%X up to: %X/%X (%d bytes), insert %X/%X",
+				 (uint32)(write_pos->write_init_min >> 32), (uint32) write_pos->write_init_min,
+				 (uint32)(newinsertpos >> 32), (uint32) newinsertpos,
+				 (uint32)(pad_upto >> 32), (uint32) pad_upto,
+				 (int32)(pad_upto - newinsertpos),
+				 (uint32)(insert_lsn >> 32), (uint32) insert_lsn);
+		}
+
+		if (PadPartialPage(pad_upto, &final_pad))
+		{
+			if (0)
+			{
+				elog(DEBUG1, "actually pad req %X/%X new %X/%X up to: %X/%X (%d/%d bytes), started at %X/%X",
+					 (uint32)(write_pos->write_init_min >> 32), (uint32) write_pos->write_init_min,
+					 (uint32)(newinsertpos >> 32), (uint32) newinsertpos,
+					 (uint32)(final_pad >> 32), (uint32) final_pad,
+					 (int32)(final_pad - write_pos->write_init_min),
+					 (int32)(final_pad - newinsertpos),
+					 (uint32)(startwrite_first >> 32), (uint32) startwrite_first);
+			}
+			write_pos->write_init_opt = WaitXLogInsertionsToFinish(EndRecPtr);
+		}
+		else
+		{
+			//elog(LOG, "didn't need to pad");
+		}
+
+		goto write_out_wait;
+	}
+
 	if ((write_pos->write_init_opt % XLOG_BLCKSZ) != 0)
 		lastnonpartialidx = XLogRecPtrToBufIdx(write_pos->write_init_opt - XLOG_BLCKSZ);
 	else
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 679d7e4b20..6bbcde4083 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -163,6 +163,7 @@ bool		io_data_direct = 0;
 bool		io_data_force_async = 1;
 bool		io_wal_direct = 0;
 bool		io_wal_init_direct = 0;
+bool		io_wal_pad_partial = false;
 int			io_wal_concurrency = 32;
 int			io_wal_target_blocks = 8;
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 7f8369e939..b8152a080f 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2084,6 +2084,15 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"io_wal_pad_partial", PGC_SIGHUP, RESOURCES_DISK,
+			gettext_noop("pad WAL files upon flush"),
+		},
+		&io_wal_pad_partial,
+		false,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index e3f48158ce..7997dc7bba 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -76,6 +76,7 @@ typedef struct CheckPoint
 #define XLOG_END_OF_RECOVERY			0x90
 #define XLOG_FPI_FOR_HINT				0xA0
 #define XLOG_FPI						0xB0
+#define XLOG_WASTE						0xC0
 
 
 /*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index bd868ea273..779ccd7df3 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -80,6 +80,7 @@ extern bool io_data_direct;
 extern bool io_data_force_async;
 extern bool io_wal_direct;
 extern bool io_wal_init_direct;
+extern bool io_wal_pad_partial;
 extern int	io_wal_concurrency;
 extern int	io_wal_target_blocks;
 
-- 
2.39.5