Allow to trigger kernel writeback after a configurable number of writes.

author Andres Freund <andres@anarazel.de>

Fri, 19 Feb 2016 20:13:05 +0000 (12:13 -0800)

committer Andres Freund <andres@anarazel.de>

Fri, 19 Feb 2016 20:13:05 +0000 (12:13 -0800)
author Andres Freund <andres@anarazel.de>
Fri, 19 Feb 2016 20:13:05 +0000 (12:13 -0800)
committer Andres Freund <andres@anarazel.de>
Fri, 19 Feb 2016 20:13:05 +0000 (12:13 -0800)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml

index a09ceb2fea78f8f02efbba5074ad6bb4909108e5..3dc671985e8584a8e850ace98c70d3e230e65a86 100644 (file)
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1843,6 +1843,32 @@ include_dir 'conf.d'
          </para>
         </listitem>
        </varlistentry>
+
+      <varlistentry id="guc-bgwriter-flush-after" xreflabel="bgwriter_flush_after">
+       <term><varname>bgwriter_flush_after</varname> (<type>integer</type>)
+       <indexterm>
+        <primary><varname>bgwriter_flush_after</> configuration parameter</primary>
+       </indexterm>
+       </term>
+       <listitem>
+        <para>
+         Whenever more than <varname>bgwriter_flush_after</varname> bytes have
+         been written by the bgwriter, hint to OS to flush these writes to the
+         underlying storage.  Doing so will limit the amount of dirty data in
+         the kernel's page cache, reducing the likelihood of stalls when fsync
+         is issued at the end of a checkpoint, or when the OS writes out data
+         in larger batches in the background.  Often that will result in
+         greatly reduced transaction latency, but there also are some cases,
+         especially with workloads that are bigger than <xref
+         linkend="guc-shared-buffers">, but smaller than the OS's page cache,
+         where performance might degrade.  This setting may have no effect on
+         some platforms.  <literal>0</literal> disables controlled flushing.
+         The default is <literal>256kB</> on Linux, <literal>0</> otherwise.
+         This parameter can only be set in the <filename>postgresql.conf</>
+         file or on the server command line.
+        </para>
+       </listitem>
+      </varlistentry>
       </variablelist>
  
       <para>
@@ -1944,6 +1970,35 @@ include_dir 'conf.d'
          </para>
         </listitem>
        </varlistentry>
+
+      <varlistentry id="guc-backend-flush-after" xreflabel="backend_flush_after">
+       <term><varname>backend_flush_after</varname> (<type>integer</type>)
+       <indexterm>
+        <primary><varname>backend_flush_after</> configuration parameter</primary>
+       </indexterm>
+       </term>
+       <listitem>
+        <para>
+         Whenever more than <varname>backend_flush_after</varname> bytes have
+         been written by a single backend, hint to OS to flush these writes to
+         the underlying storage.  Doing so will limit the amount of dirty data
+         in the kernel's page cache, reducing the likelihood of stalls when
+         fsync is issued at the end of a checkpoint, or when the OS writes out
+         data in larger batches in the background.  Often that will result in
+         greatly reduced transaction latency, but there also are some cases,
+         especially with workloads that are bigger than <xref
+         linkend="guc-shared-buffers">, but smaller than the OS's page cache,
+         where performance might degrade. Note that because
+         <varname>backend_flush_after</varname> is per-backend, the total
+         amount of dirty data in the kernel's page cache can be considerably
+         bigger than this setting.  This setting may have no effect on some
+         platforms.  <literal>0</literal> disables controlled flushing.  The
+         default is <literal>256kB</> on Linux, <literal>0</> otherwise.  This
+         parameter can only be set in the <filename>postgresql.conf</> file or
+         on the server command line.
+        </para>
+       </listitem>
+      </varlistentry>
       </variablelist>
      </sect2>
     </sect1>
@@ -2475,6 +2530,32 @@ include_dir 'conf.d'
        </listitem>
       </varlistentry>
  
+     <varlistentry id="guc-checkpoint-flush-after" xreflabel="checkpoint_flush_after">
+      <term><varname>checkpoint_flush_after</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>checkpoint_flush_after</> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Whenever more than <varname>checkpoint_flush_after</varname> bytes
+        have been written while performing a checkpoint, hint to OS to flush
+        these writes to the underlying storage.  Doing so will limit the
+        amount of dirty data in the kernel's page cache, reducing the
+        likelihood of stalls when fsync is issued at the end of a checkpoint,
+        or when the OS writes out data in larger batches in the background.
+        Often that will result in greatly reduced transaction latency, but
+        there also are some cases, especially with workloads that are bigger
+        than <xref linkend="guc-shared-buffers">, but smaller than the OS's
+        page cache, where performance might degrade.  This setting may have no
+        effect on some platforms.  <literal>0</literal> disables controlled
+        flushing.  The default is <literal>256kB</> on Linux, <literal>0</>
+        otherwise.  This parameter can only be set in the
+        <filename>postgresql.conf</> file or on the server command line.
+       </para>
+      </listitem>
+     </varlistentry>
+
       <varlistentry id="guc-checkpoint-warning" xreflabel="checkpoint_warning">
        <term><varname>checkpoint_warning</varname> (<type>integer</type>)
        <indexterm>
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml

index e3941c9391b7fe4f277e673e7a2f31ba7a168714..96496b0bd7d2e8000e40dde372731b27bcea4bbb 100644 (file)
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -545,6 +545,19 @@
     unexpected variation in the number of WAL segments needed.
    </para>
  
+  <para>
+   On Linux and POSIX platforms <xref linkend="guc-checkpoint-flush-after">
+   can be used to hint to the OS that pages written by the checkpoint should
+   be flushed to disk.  Otherwise, these pages may be kept in the OS's page
+   cache, inducing a stall when <literal>fsync</> is called later.  This
+   setting helps to reduce transaction latency, but it can also have an
+   adverse effect on performance, particularly for workloads that are bigger than
+   <xref linkend="guc-shared-buffers">, but smaller than the OS's page cache.
+   It should be beneficial for high write loads on HDD.  This feature probably
+   brings no benefit on SSD, as the I/O write latency is small on such
+   hardware, thus it may be disabled.
+  </para>
+
    <para>
     The number of WAL segment files in <filename>pg_xlog</> directory depends on
     <varname>min_wal_size</>, <varname>max_wal_size</> and
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c

index 4ff4caf232126a7cb6627ea72e3c1b2629b26872..7d0371d807cfa4006165eb6500222ee83da67642 100644 (file)
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -111,6 +111,7 @@ BackgroundWriterMain(void)
     sigjmp_buf  local_sigjmp_buf;
     MemoryContext bgwriter_context;
     bool        prev_hibernate;
+   WritebackContext wb_context;
  
     /*
      * Properly accept or ignore signals the postmaster might send us.
@@ -164,6 +165,8 @@ BackgroundWriterMain(void)
                                              ALLOCSET_DEFAULT_MAXSIZE);
     MemoryContextSwitchTo(bgwriter_context);
  
+   WritebackContextInit(&wb_context, &bgwriter_flush_after);
+
     /*
      * If an exception is encountered, processing resumes here.
      *
@@ -208,6 +211,9 @@ BackgroundWriterMain(void)
         /* Flush any leaked data in the top-level context */
         MemoryContextResetAndDeleteChildren(bgwriter_context);
  
+       /* re-initialize to avoid repeated errors causing problems */
+       WritebackContextInit(&wb_context, &bgwriter_flush_after);
+
         /* Now we can allow interrupts again */
         RESUME_INTERRUPTS();
  
@@ -269,7 +275,7 @@ BackgroundWriterMain(void)
         /*
          * Do one cycle of dirty-buffer writing.
          */
-       can_hibernate = BgBufferSync();
+       can_hibernate = BgBufferSync(&wb_context);
  
         /*
          * Send off activity statistics to the stats collector
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c

index f013a4d9581553bf30bf15b9b2bd745d20e138c9..e10071d9c0ed76d3b17176238e62f535bc99f9ca 100644 (file)
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -23,6 +23,7 @@ char     *BufferBlocks;
  LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
  LWLockTranche BufferIOLWLockTranche;
  LWLockTranche BufferContentLWLockTranche;
+WritebackContext BackendWritebackContext;
  
  
  /*
@@ -149,6 +150,10 @@ InitBufferPool(void)
  
     /* Init other shared buffer-management stuff */
     StrategyInitialize(!foundDescs);
+
+   /* Initialize per-backend file flush context */
+   WritebackContextInit(&BackendWritebackContext,
+                        &backend_flush_after);
  }
  
  /*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c

index 7141eb83c68c297de1c67d01e2170828c54c08b8..cdbda0ff0a692dded140b6438750bdae3558e8e0 100644 (file)
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -82,6 +82,14 @@ double       bgwriter_lru_multiplier = 2.0;
  bool       track_io_timing = false;
  int            effective_io_concurrency = 0;
  
+/*
+ * GUC variables about triggering kernel writeback for buffers written; OS
+ * dependent defaults are set via the GUC mechanism.
+ */
+int            checkpoint_flush_after = 0;
+int            bgwriter_flush_after = 0;
+int            backend_flush_after = 0;
+
  /*
   * How many buffers PrefetchBuffer callers should try to stay ahead of their
   * ReadBuffer calls by.  This is maintained by the assign hook for
@@ -399,7 +407,7 @@ static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
  static void PinBuffer_Locked(BufferDesc *buf);
  static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
  static void BufferSync(int flags);
-static int SyncOneBuffer(int buf_id, bool skip_recently_used);
+static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *flush_context);
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
@@ -416,6 +424,7 @@ static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
  static void AtProcExit_Buffers(int code, Datum arg);
  static void CheckForBufferLeaks(void);
  static int rnode_comparator(const void *p1, const void *p2);
+static int buffertag_comparator(const void *p1, const void *p2);
  
  
  /*
@@ -818,6 +827,12 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
         MemSet((char *) bufBlock, 0, BLCKSZ);
         /* don't set checksum for all-zero page */
         smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
+
+       /*
+        * XXX: Note that we're *not* doing a ScheduleBufferTagForWriteback
+        * here. At least on linux doing so defeats 'delayed allocation',
+        * leading to more fragmented files.
+        */
     }
     else
     {
@@ -1084,6 +1099,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                 FlushBuffer(buf, NULL);
                 LWLockRelease(BufferDescriptorGetContentLock(buf));
  
+               ScheduleBufferTagForWriteback(&BackendWritebackContext,
+                                             &buf->tag);
+
                 TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
                                                smgr->smgr_rnode.node.spcNode,
                                                 smgr->smgr_rnode.node.dbNode,
@@ -1642,6 +1660,7 @@ BufferSync(int flags)
     int         num_to_write;
     int         num_written;
     int         mask = BM_DIRTY;
+   WritebackContext wb_context;
  
     /* Make sure we can handle the pin inside SyncOneBuffer */
     ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
@@ -1694,6 +1713,9 @@ BufferSync(int flags)
     if (num_to_write == 0)
         return;                 /* nothing to do */
  
+
+   WritebackContextInit(&wb_context, &checkpoint_flush_after);
+
     TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write);
  
     /*
@@ -1725,7 +1747,7 @@ BufferSync(int flags)
          */
         if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
         {
-           if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
+           if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
             {
                 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
                 BgWriterStats.m_buf_written_checkpoints++;
@@ -1777,7 +1799,7 @@ BufferSync(int flags)
   * bgwriter_lru_maxpages to 0.)
   */
  bool
-BgBufferSync(void)
+BgBufferSync(WritebackContext *wb_context)
  {
     /* info obtained from freelist.c */
     int         strategy_buf_id;
@@ -2002,7 +2024,8 @@ BgBufferSync(void)
     /* Execute the LRU scan */
     while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
     {
-       int         buffer_state = SyncOneBuffer(next_to_clean, true);
+       int         buffer_state = SyncOneBuffer(next_to_clean, true,
+                                                wb_context);
  
         if (++next_to_clean >= NBuffers)
         {
@@ -2079,10 +2102,11 @@ BgBufferSync(void)
   * Note: caller must have done ResourceOwnerEnlargeBuffers.
   */
  static int
-SyncOneBuffer(int buf_id, bool skip_recently_used)
+SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
  {
     BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
     int         result = 0;
+   BufferTag   tag;
  
     ReservePrivateRefCountEntry();
  
@@ -2123,8 +2147,13 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
     FlushBuffer(bufHdr, NULL);
  
     LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+
+   tag = bufHdr->tag;
+
     UnpinBuffer(bufHdr, true);
  
+   ScheduleBufferTagForWriteback(wb_context, &tag);
+
     return result | BUF_WRITTEN;
  }
  
@@ -3724,3 +3753,149 @@ rnode_comparator(const void *p1, const void *p2)
     else
         return 0;
  }
+
+
+/*
+ * qsort-style comparator for BufferTags: orders by relfilenode first (via
+ * rnode_comparator), then by fork number, then by block number.
+ */
+static int
+buffertag_comparator(const void *a, const void *b)
+{
+   const BufferTag *lhs = (const BufferTag *) a;
+   const BufferTag *rhs = (const BufferTag *) b;
+   int         cmp = rnode_comparator(&lhs->rnode, &rhs->rnode);
+
+   /* different relations: the rnode ordering decides */
+   if (cmp != 0)
+       return cmp;
+
+   /* same relation: order by fork */
+   if (lhs->forkNum != rhs->forkNum)
+       return (lhs->forkNum < rhs->forkNum) ? -1 : 1;
+
+   /* same fork: order by block number */
+   if (lhs->blockNum != rhs->blockNum)
+       return (lhs->blockNum < rhs->blockNum) ? -1 : 1;
+
+   return 0;
+}
+
+
+/*
+ * Reset a writeback context to an empty state, dropping whatever requests
+ * it may have been tracking.
+ *
+ * max_pending points at the variable holding the current maximum number of
+ * writeback requests that are collected before being issued together. A
+ * value <= 0 means that no writeback control will be performed. A pointer
+ * is stored rather than a copy of the value, so the limit can easily be
+ * changed by the GUC mechanism, and so calling code does not have to check
+ * the current config variables.
+ */
+void
+WritebackContextInit(WritebackContext *context, int *max_pending)
+{
+   Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
+
+   /* start out with no queued requests */
+   context->nr_pending = 0;
+   context->max_pending = max_pending;
+}
+
+
+/*
+ * Queue the buffer identified by *tag in the context's array of pending
+ * writeback requests, and issue the accumulated requests once the
+ * configured limit has been reached.
+ */
+void
+ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
+{
+   int         slot;
+
+   /*
+    * Writeback control is off and nothing is queued: nothing to do.  If
+    * requests are still queued we carry on, so that they get issued by the
+    * limit check below.
+    */
+   if (context->nr_pending <= 0 && *context->max_pending <= 0)
+       return;
+
+   Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
+
+   /* remember the tag in the next free slot */
+   slot = context->nr_pending++;
+   context->pending_writebacks[slot].tag = *tag;
+
+   if (context->nr_pending >= *context->max_pending)
+       IssuePendingWritebacks(context);
+}
+
+/*
+ * Issue all pending writeback requests, previously scheduled with
+ * ScheduleBufferTagForWriteback, to the OS.
+ *
+ * The requests are sorted, runs of consecutive blocks within the same
+ * relation fork are merged into a single smgrwriteback() call, and the
+ * context is left empty afterwards.
+ *
+ * Because this is only used to improve the OS's IO scheduling we try to
+ * never error out - it's just a hint.
+ */
+void
+IssuePendingWritebacks(WritebackContext *context)
+{
+   int         i;
+
+   if (context->nr_pending == 0)
+       return;
+
+   /*
+    * Executing the writes in-order can make them a lot faster, and allows to
+    * merge writeback requests to consecutive blocks into larger writebacks.
+    */
+   qsort(&context->pending_writebacks, context->nr_pending,
+         sizeof(PendingWriteback), buffertag_comparator);
+
+   /*
+    * Coalesce neighbouring writes, but nothing else. For that we iterate
+    * through the, now sorted, array of pending flushes, and look forward to
+    * find all neighbouring (or identical) writes.
+    */
+   for (i = 0; i < context->nr_pending; i++)
+   {
+       PendingWriteback *cur;
+       PendingWriteback *next;
+       SMgrRelation reln;
+       int ahead;
+       BufferTag tag;
+       Size nblocks = 1;
+
+       cur = &context->pending_writebacks[i];
+       /* remember where the merged range starts; cur advances below */
+       tag = cur->tag;
+
+       /*
+        * Peek ahead, into following writeback requests, to see if they can
+        * be combined with the current one.
+        */
+       for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
+       {
+           next = &context->pending_writebacks[i + ahead + 1];
+
+           /* different file (relation or fork), stop merging */
+           if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
+               cur->tag.forkNum != next->tag.forkNum)
+               break;
+
+           /* ok, block flushed twice, skip */
+           if (cur->tag.blockNum == next->tag.blockNum)
+               continue;
+
+           /* only merge consecutive writes */
+           if (cur->tag.blockNum + 1 != next->tag.blockNum)
+               break;
+
+           nblocks++;
+           cur = next;
+       }
+
+       /* step over the entries that were folded into this request */
+       i += ahead;
+
+       /* and finally tell the kernel to write the data to storage */
+       reln = smgropen(tag.rnode, InvalidBackendId);
+       smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
+   }
+
+   context->nr_pending = 0;
+}
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c

index 522f42079ed3bfcceef3bd2dfec0ab3f334d4c4f..a51ee815662d4b0dbb37cec9c7f28a723dd46852 100644 (file)
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -190,9 +190,9 @@ copy_file(char *fromfile, char *tofile)
         /*
          * We fsync the files later but first flush them to avoid spamming the
          * cache and hopefully get the kernel to start writing them out before
-        * the fsync comes.  Ignore any error, since it's only a hint.
+        * the fsync comes.
          */
-       (void) pg_flush_data(dstfd, offset, nbytes);
+       pg_flush_data(dstfd, offset, nbytes);
     }
  
     if (CloseTransientFile(dstfd))
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c

index 1b301001f09b9e6542f2ecb61b550750d04fb8e1..5b8a765760d1c807992af98a885df154b0feedde 100644 (file)
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -61,6 +61,9 @@
  #include <sys/file.h>
  #include <sys/param.h>
  #include <sys/stat.h>
+#ifndef WIN32
+#include <sys/mman.h>
+#endif
  #include <unistd.h>
  #include <fcntl.h>
  #ifdef HAVE_SYS_RESOURCE_H
@@ -82,6 +85,8 @@
  /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
  #if defined(HAVE_SYNC_FILE_RANGE)
  #define PG_FLUSH_DATA_WORKS 1
+#elif !defined(WIN32) && defined(MS_ASYNC)
+#define PG_FLUSH_DATA_WORKS 1
  #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
  #define PG_FLUSH_DATA_WORKS 1
  #endif
@@ -380,29 +385,126 @@ pg_fdatasync(int fd)
  }
  
  /*
- * pg_flush_data --- advise OS that the data described won't be needed soon
+ * pg_flush_data --- advise OS that the described dirty data should be flushed
   *
- * Not all platforms have sync_file_range or posix_fadvise; treat as no-op
- * if not available.  Also, treat as no-op if enableFsync is off; this is
- * because the call isn't free, and some platforms such as Linux will actually
- * block the requestor until the write is scheduled.
+ * An offset of 0 with an amount of 0 means that the entire file should be
+ * flushed.
   */
-int
-pg_flush_data(int fd, off_t offset, off_t amount)
+void
+pg_flush_data(int fd, off_t offset, off_t nbytes)
  {
  #ifdef PG_FLUSH_DATA_WORKS
-   if (enableFsync)
-   {
+
+   /*
+    * Right now file flushing is primarily used to avoid making later
+    * fsync()/fdatasync() calls have a less impact. Thus don't trigger
+    * flushes if fsyncs are disabled - that's a decision we might want to
+    * make configurable at some point.
+    */
+   if (!enableFsync)
+       return;
+
  #if defined(HAVE_SYNC_FILE_RANGE)
-       return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE);
+   {
+       int         rc = 0;
+
+       /*
+        * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
+        * tells the OS that writeback for the passed in blocks should be
+        * started, but that we don't want to wait for completion.  Note that
+        * this call might block if too much dirty data exists in the range.
+        * This is the preferrable method on OSs supporting it, as it works
+        * reliably when available (contrast to msync()) and doesn't flush out
+        * clean data (like FADV_DONTNEED).
+        */
+       rc = sync_file_range(fd, offset, nbytes,
+                            SYNC_FILE_RANGE_WRITE);
+
+       /* don't error out, this is just a performance optimization */
+       if (rc != 0)
+       {
+           ereport(WARNING,
+                   (errcode_for_file_access(),
+                    errmsg("could not flush dirty data: %m")));
+       }
+   }
+#elif !defined(WIN32) && defined(MS_ASYNC)
+   {
+       int         rc = 0;
+       void       *p;
+
+       /*
+        * On many OSs msync() on a mmap'ed file triggers writeback. On linux
+        * it only does so when MS_SYNC is specified, but then it does the
+        * writeback synchronously. Luckily all common linux systems have
+        * sync_file_range().  This is preferrable over FADV_DONTNEED because
+        * it doesn't flush out clean data.
+        *
+        * We map the file (mmap()), tell the kernel to sync back the contents
+        * (msync()), and then remove the mapping again (munmap()).
+        */
+       p = mmap(NULL, context->nbytes,
+                PROT_READ | PROT_WRITE, MAP_SHARED,
+                context->fd, context->offset);
+       if (p == MAP_FAILED)
+       {
+           ereport(WARNING,
+                   (errcode_for_file_access(),
+                    errmsg("could not mmap while flushing dirty data in file \"%s\": %m",
+                           context->filename ? context->filename : "")));
+           goto out;
+       }
+
+       rc = msync(p, context->nbytes, MS_ASYNC);
+       if (rc != 0)
+       {
+           ereport(WARNING,
+                   (errcode_for_file_access(),
+                    errmsg("could not flush dirty data in file \"%s\": %m",
+                           context->filename ? context->filename : "")));
+           /* NB: need to fall through to munmap()! */
+       }
+
+       rc = munmap(p, context->nbytes);
+       if (rc != 0)
+       {
+           /* FATAL error because mapping would remain */
+           ereport(FATAL,
+                   (errcode_for_file_access(),
+                    errmsg("could not munmap while flushing blocks in file \"%s\": %m",
+                           context->filename ? context->filename : "")));
+       }
+   }
  #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-       return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
+   {
+       int         rc = 0;
+
+       /*
+        * Signal the kernel that the passed in range should not be cached
+        * anymore. This has the, desired, side effect of writing out dirty
+        * data, and the, undesired, side effect of likely discarding useful
+        * clean cached blocks.  For the latter reason this is the least
+        * preferrable method.
+        */
+
+       rc = posix_fadvise(context->fd, context->offset, context->nbytes,
+                          POSIX_FADV_DONTNEED);
+
+       /* don't error out, this is just a performance optimization */
+       if (rc != 0)
+       {
+           ereport(WARNING,
+                   (errcode_for_file_access(),
+                    errmsg("could not flush dirty data in file \"%s\": %m",
+                           context->filename ? context->filename : "")));
+           goto out;
+       }
+   }
  #else
  #error PG_FLUSH_DATA_WORKS should not have been defined
  #endif
-   }
-#endif
-   return 0;
+
+#endif /* PG_FLUSH_DATA_WORKS */
  }
  
  
@@ -1289,6 +1391,24 @@ FilePrefetch(File file, off_t offset, int amount)
  #endif
  }
  
+/*
+ * FileWriteback --- pass a flush request for the given byte range of an
+ * open File on to pg_flush_data().
+ *
+ * Returns silently, without flushing, if FileAccess() reports a failure.
+ */
+void
+FileWriteback(File file, off_t offset, int amount)
+{
+   Assert(FileIsValid(file));
+
+   DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " %d",
+              file, VfdCache[file].fileName,
+              (int64) offset, amount));
+
+   /* bail out if FileAccess() could not make the file usable */
+   if (FileAccess(file) < 0)
+       return;
+
+   pg_flush_data(VfdCache[file].fd, offset, amount);
+}
+
  int
  FileRead(File file, char *buffer, int amount)
  {
@@ -2655,9 +2775,10 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
     }
  
     /*
-    * We ignore errors from pg_flush_data() because this is only a hint.
+    * pg_flush_data() ignores errors, which is ok because this is only a
+    * hint.
      */
-   (void) pg_flush_data(fd, 0, 0);
+   pg_flush_data(fd, 0, 0);
  
     (void) CloseTransientFile(fd);
  }
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c

index f6b79a9968969dda533777e610e046fd4bab01b8..bb2b465505f64131d8f62901e4719313b4ed3d84 100644 (file)
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -662,6 +662,55 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
  #endif   /* USE_PREFETCH */
  }
  
+/*
+ * mdwriteback() -- Tell the kernel to write pages back to storage.
+ *
+ * This accepts a range of blocks because flushing several pages at once is
+ * considerably more efficient than doing so individually.
+ *
+ * The range starts at blocknum and covers nblocks blocks; a non-positive
+ * nblocks leaves nothing to do.
+ */
+void
+mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
+{
+   off_t       seekpos;
+   MdfdVec    *v;
+
+   /*
+    * Issue flush requests in as few requests as possible; have to split at
+    * segment boundaries though, since those are actually separate files.
+    */
+   while (nblocks > 0)
+   {
+       int nflush = nblocks;
+       int segnum_start, segnum_end;
+
+       v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_RETURN_NULL);
+
+       /*
+        * We might be flushing buffers of already removed relations, that's
+        * ok, just ignore that case.
+        */
+       if (!v)
+           return;
+
+       /* segment number holding the first block of the remaining range */
+       segnum_start = blocknum / RELSEG_SIZE;
+
+       /*
+        * Segment number holding the last block of the remaining range; if
+        * it differs, only flush up to the end of the current segment.
+        */
+       segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
+       if (segnum_start != segnum_end)
+           nflush = RELSEG_SIZE - (blocknum  % ((BlockNumber) RELSEG_SIZE) );
+
+       Assert(nflush >= 1);
+       Assert(nflush <= nblocks);
+
+       /* byte offset of the first block within its segment file */
+       seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+       FileWriteback(v->mdfd_vfd, seekpos, BLCKSZ * nflush);
+
+       nblocks -= nflush;
+       blocknum += nflush;
+   }
+}
  
  /*
   * mdread() -- Read the specified block from a relation.
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c

index 87ff3583ff89746973ea1bef7bfead6c7b26f682..2cae5aae1c1044eebd73c69f3da9cdd81127ff8a 100644 (file)
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -53,6 +53,8 @@ typedef struct f_smgr
                                           BlockNumber blocknum, char *buffer);
     void        (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
                          BlockNumber blocknum, char *buffer, bool skipFsync);
+   void        (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
+                        BlockNumber blocknum, int nblocks);
     BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
     void        (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
                                               BlockNumber nblocks);
@@ -66,8 +68,8 @@ typedef struct f_smgr
  static const f_smgr smgrsw[] = {
     /* magnetic disk */
     {mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend,
-       mdprefetch, mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
-       mdpreckpt, mdsync, mdpostckpt
+       mdprefetch, mdread, mdwrite, mdwriteback, mdnblocks, mdtruncate,
+       mdimmedsync, mdpreckpt, mdsync, mdpostckpt
     }
  };
  
@@ -649,6 +651,19 @@ smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                                               buffer, skipFsync);
  }
  
+
+/*
+ * smgrwriteback() -- Trigger kernel writeback for the supplied range of
+ *                    blocks.
+ *
+ * Simply dispatches to the smgr_writeback callback of the storage manager
+ * selected by reln->smgr_which.
+ */
+void
+smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+             int nblocks)
+{
+   const f_smgr *handler = &smgrsw[reln->smgr_which];
+
+   (*handler->smgr_writeback) (reln, forknum, blocknum, nblocks);
+}
+
  /*
   * smgrnblocks() -- Calculate the number of blocks in the
   *                  supplied relation.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c

index ea5a09ac1be785604d0a8fd74323dfa2aaa51880..789efbc8c73431565433bf4a2312834907f380aa 100644 (file)
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2384,6 +2384,42 @@ static struct config_int ConfigureNamesInt[] =
         check_effective_io_concurrency, assign_effective_io_concurrency, NULL
     },
  
+   {
+       {"checkpoint_flush_after", PGC_SIGHUP, RESOURCES_ASYNCHRONOUS,
+           gettext_noop("Number of pages after which previously performed writes are flushed to disk."),
+           NULL,
+           GUC_UNIT_BLOCKS
+       },
+       &checkpoint_flush_after,
+       /* see bufmgr.h: OS dependant default */
+       DEFAULT_CHECKPOINT_FLUSH_AFTER, 0, WRITEBACK_MAX_PENDING_FLUSHES,
+       NULL, NULL, NULL
+   },
+
+   {
+       {"backend_flush_after", PGC_USERSET, WAL_CHECKPOINTS,
+           gettext_noop("Number of pages after which previously performed writes are flushed to disk."),
+           NULL,
+           GUC_UNIT_BLOCKS
+       },
+       &backend_flush_after,
+       /* see bufmgr.h: OS dependant default */
+       DEFAULT_BACKEND_FLUSH_AFTER, 0, WRITEBACK_MAX_PENDING_FLUSHES,
+       NULL, NULL, NULL
+   },
+
+   {
+       {"bgwriter_flush_after", PGC_SIGHUP, WAL_CHECKPOINTS,
+           gettext_noop("Number of pages after which previously performed writes are flushed to disk."),
+           NULL,
+           GUC_UNIT_BLOCKS
+       },
+       &bgwriter_flush_after,
+       /* see bufmgr.h: 16 on Linux, 0 otherwise */
+       DEFAULT_BGWRITER_FLUSH_AFTER, 0, WRITEBACK_MAX_PENDING_FLUSHES,
+       NULL, NULL, NULL
+   },
+
     {
         {"max_worker_processes",
             PGC_POSTMASTER,
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h

index cbc4843377bf3a9ff11a488c6df75f9273817eaf..fe8b423b62ad01fd2cbf849af527774efa10d0c6 100644 (file)
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -16,6 +16,7 @@
  #define BUFMGR_INTERNALS_H
  
  #include "storage/buf.h"
+#include "storage/bufmgr.h"
  #include "storage/latch.h"
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
@@ -208,16 +209,44 @@ extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
  #define UnlockBufHdr(bufHdr)   SpinLockRelease(&(bufHdr)->buf_hdr_lock)
  
  
+/*
+ * The PendingWriteback & WritebackContext structures are used to keep
+ * information about pending flush requests to be issued to the OS.
+ */
+typedef struct PendingWriteback
+{
+   /* could store different types of pending flushes here */
+   BufferTag tag;
+}  PendingWriteback;
+
+/* typedef forward declared in bufmgr.h */
+typedef struct WritebackContext
+{
+   /* pointer to the max number of writeback requests to coalesce */
+   int        *max_pending;
+
+   /* current number of pending writeback requests */
+   int         nr_pending;
+
+   /* pending requests */
+   PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
+}  WritebackContext;
+
  /* in buf_init.c */
  extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
+extern PGDLLIMPORT WritebackContext BackendWritebackContext;
  
  /* in localbuf.c */
  extern BufferDesc *LocalBufferDescriptors;
  
  
  /*
- * Internal routines: only called by bufmgr
+ * Internal buffer management routines
   */
+/* bufmgr.c */
+extern void WritebackContextInit(WritebackContext *context, int *max_coalesce);
+extern void IssuePendingWritebacks(WritebackContext *context);
+extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
  extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h

index 92c4bc543e234eda7f46e0e172a1a1b22cdb36e8..a4b1b370cebf0b52c8ea0d0966379e5640e64952 100644 (file)
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -45,16 +45,36 @@ typedef enum
                                  * replay; otherwise same as RBM_NORMAL */
  } ReadBufferMode;
  
+/* forward declared, to avoid having to expose buf_internals.h here */
+struct WritebackContext;
+
  /* in globals.c ... this duplicates miscadmin.h */
  extern PGDLLIMPORT int NBuffers;
  
  /* in bufmgr.c */
+#define WRITEBACK_MAX_PENDING_FLUSHES 128
+
+/* FIXME: Also default to on for mmap && msync(MS_ASYNC)? */
+#ifdef HAVE_SYNC_FILE_RANGE
+#define DEFAULT_CHECKPOINT_FLUSH_AFTER 32
+#define DEFAULT_BACKEND_FLUSH_AFTER 16
+#define DEFAULT_BGWRITER_FLUSH_AFTER 64
+#else
+#define DEFAULT_CHECKPOINT_FLUSH_AFTER 0
+#define DEFAULT_BACKEND_FLUSH_AFTER 0
+#define DEFAULT_BGWRITER_FLUSH_AFTER 0
+#endif   /* HAVE_SYNC_FILE_RANGE */
+
  extern bool zero_damaged_pages;
  extern int bgwriter_lru_maxpages;
  extern double bgwriter_lru_multiplier;
  extern bool track_io_timing;
  extern int target_prefetch_pages;
  
+extern int checkpoint_flush_after;
+extern int backend_flush_after;
+extern int bgwriter_flush_after;
+
  /* in buf_init.c */
  extern PGDLLIMPORT char *BufferBlocks;
  
@@ -209,7 +229,7 @@ extern bool HoldingBufferPinThatDelaysRecovery(void);
  extern void AbortBufferIO(void);
  
  extern void BufmgrCommit(void);
-extern bool BgBufferSync(void);
+extern bool BgBufferSync(struct WritebackContext *wb_context);
  
  extern void AtProcExit_LocalBuffers(void);
  
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h

index 4a3fccbaa9b8372889c4948f97d0ea00a74f0718..0f67760e614a1cd18dd8244313ec9595f0c86880 100644 (file)
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -75,6 +75,7 @@ extern int    FileSync(File file);
  extern off_t FileSeek(File file, off_t offset, int whence);
  extern int FileTruncate(File file, off_t offset);
  extern char *FilePathName(File file);
+extern void FileWriteback(File file, off_t offset, int amount);
  
  /* Operations that allow use of regular stdio --- USE WITH CAUTION */
  extern FILE *AllocateFile(const char *name, const char *mode);
@@ -112,7 +113,7 @@ extern int  pg_fsync(int fd);
  extern int pg_fsync_no_writethrough(int fd);
  extern int pg_fsync_writethrough(int fd);
  extern int pg_fdatasync(int fd);
-extern int pg_flush_data(int fd, off_t offset, off_t amount);
+extern void pg_flush_data(int fd, off_t offset, off_t amount);
  extern void fsync_fname(char *fname, bool isdir);
  extern void SyncDataDirectory(void);
  
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h

index a7267ea7e2d2062e9482b0dba79bb7f27bfbfabb..0483fa3097075975fd522a3ebf7e982a089b3b33 100644 (file)
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -96,6 +96,8 @@ extern void smgrread(SMgrRelation reln, ForkNumber forknum,
          BlockNumber blocknum, char *buffer);
  extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
           BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
+         BlockNumber blocknum, int nblocks);
  extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
  extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum,
              BlockNumber nblocks);
@@ -122,6 +124,8 @@ extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
        char *buffer);
  extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
         BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
+       BlockNumber blocknum, int nblocks);
  extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
  extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
            BlockNumber nblocks);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list

index d96896b4c27d83eabf3fe8228b31318e8da89af8..f501f556af872e4f88db4976f6d0a4c7ad1c397c 100644 (file)
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1411,6 +1411,7 @@ Pattern_Type
  PendingOperationEntry
  PendingRelDelete
  PendingUnlinkEntry
+PendingWriteback
  PerlInterpreter
  Perl_ppaddr_t
  Permutation
@@ -2142,6 +2143,7 @@ WriteBytePtr
  WriteDataPtr
  WriteExtraTocPtr
  WriteFunc
+WritebackContext
  X509
  X509_NAME
  X509_NAME_ENTRY
author	Andres Freund <andres@anarazel.de>
	Fri, 19 Feb 2016 20:13:05 +0000 (12:13 -0800)
committer	Andres Freund <andres@anarazel.de>
	Fri, 19 Feb 2016 20:13:05 +0000 (12:13 -0800)
doc/src/sgml/config.sgml		patch \| blob \| blame \| history
doc/src/sgml/wal.sgml		patch \| blob \| blame \| history
src/backend/postmaster/bgwriter.c		patch \| blob \| blame \| history
src/backend/storage/buffer/buf_init.c		patch \| blob \| blame \| history
src/backend/storage/buffer/bufmgr.c		patch \| blob \| blame \| history
src/backend/storage/file/copydir.c		patch \| blob \| blame \| history
src/backend/storage/file/fd.c		patch \| blob \| blame \| history
src/backend/storage/smgr/md.c		patch \| blob \| blame \| history
src/backend/storage/smgr/smgr.c		patch \| blob \| blame \| history
src/backend/utils/misc/guc.c		patch \| blob \| blame \| history
src/include/storage/buf_internals.h		patch \| blob \| blame \| history
src/include/storage/bufmgr.h		patch \| blob \| blame \| history
src/include/storage/fd.h		patch \| blob \| blame \| history
src/include/storage/smgr.h		patch \| blob \| blame \| history
src/tools/pgindent/typedefs.list		patch \| blob \| blame \| history