Fix bundle bugs of GIN:
authorTeodor Sigaev <teodor@sigaev.ru>
Mon, 4 Jun 2007 15:59:20 +0000 (15:59 +0000)
committerTeodor Sigaev <teodor@sigaev.ru>
Mon, 4 Jun 2007 15:59:20 +0000 (15:59 +0000)
- Fix possible deadlock between UPDATE and VACUUM queries. Bug never was
  observed in 8.2, but it still exist there. HEAD is more sensitive to
  bug after recent "ring" of buffer improvements.
- Fix WAL creation: if parent page is stored as is after split then
  incomplete split isn't removed during replay. This happens rather rare, only
  on large tables with a lot of updates/inserts.
- Fix WAL replay: there was wrong test of XLR_BKP_BLOCK_* for left
  page after deletion of page. That causes wrong rightlink field: it pointed
  to deleted page.
- add checking of match of clearing incomplete split
- cleanup incomplete split list after proceeding

All of this chages doesn't change on-disk storage, so backpatch...
But second point may be an issue for replaying logs from previous version.

src/backend/access/gin/gindatapage.c
src/backend/access/gin/ginentrypage.c
src/backend/access/gin/ginget.c
src/backend/access/gin/ginvacuum.c
src/backend/access/gin/ginxlog.c

index a80f053e7f80bfc72e6b2d5960628ac510878c36..a034ab6bcadf9b1278fe86eae82d0f324d615556 100644 (file)
@@ -358,6 +358,7 @@ dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prda
        static XLogRecData rdata[3];
        int                     sizeofitem = GinSizeOfItem(page);
        static ginxlogInsert data;
+       int             cnt=0;
 
        *prdata = rdata;
        Assert(GinPageIsData(page));
@@ -372,21 +373,33 @@ dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prda
        data.isData = TRUE;
        data.isLeaf = GinPageIsLeaf(page) ? TRUE : FALSE;
 
-       rdata[0].buffer = buf;
-       rdata[0].buffer_std = FALSE;
-       rdata[0].data = NULL;
-       rdata[0].len = 0;
-       rdata[0].next = &rdata[1];
+       /* 
+        * Prevent full page write if child's split occurs. That is needed
+        * to remove incomplete splits while replaying WAL
+        * 
+        * data.updateBlkno contains new block number (of newly created right page)
+        * for recently splited page.
+        */
+       if ( data.updateBlkno == InvalidBlockNumber ) 
+       {
+               rdata[0].buffer = buf;
+               rdata[0].buffer_std = FALSE;
+               rdata[0].data = NULL;
+               rdata[0].len = 0;
+               rdata[0].next = &rdata[1];
+               cnt++;
+       }
 
-       rdata[1].buffer = InvalidBuffer;
-       rdata[1].data = (char *) &data;
-       rdata[1].len = sizeof(ginxlogInsert);
-       rdata[1].next = &rdata[2];
+       rdata[cnt].buffer = InvalidBuffer;
+       rdata[cnt].data = (char *) &data;
+       rdata[cnt].len = sizeof(ginxlogInsert);
+       rdata[cnt].next = &rdata[cnt+1];
+       cnt++;
 
-       rdata[2].buffer = InvalidBuffer;
-       rdata[2].data = (GinPageIsLeaf(page)) ? ((char *) (btree->items + btree->curitem)) : ((char *) &(btree->pitem));
-       rdata[2].len = sizeofitem;
-       rdata[2].next = NULL;
+       rdata[cnt].buffer = InvalidBuffer;
+       rdata[cnt].data = (GinPageIsLeaf(page)) ? ((char *) (btree->items + btree->curitem)) : ((char *) &(btree->pitem));
+       rdata[cnt].len = sizeofitem;
+       rdata[cnt].next = NULL;
 
        if (GinPageIsLeaf(page))
        {
@@ -402,7 +415,7 @@ dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prda
                                btree->curitem++;
                        }
                        data.nitem = btree->curitem - savedPos;
-                       rdata[2].len = sizeofitem * data.nitem;
+                       rdata[cnt].len = sizeofitem * data.nitem;
                }
                else
                {
index 74f172a01e4d9aab9a53a6f56e87ca755f7d0269..d22d26b4ca92d1ec31a568b852e58c2e979ab79d 100644 (file)
@@ -354,6 +354,7 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prd
        static XLogRecData rdata[3];
        OffsetNumber placed;
        static ginxlogInsert data;
+       int     cnt=0;
 
        *prdata = rdata;
        data.updateBlkno = entryPreparePage(btree, page, off);
@@ -371,21 +372,33 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prd
        data.isData = false;
        data.isLeaf = GinPageIsLeaf(page) ? TRUE : FALSE;
 
-       rdata[0].buffer = buf;
-       rdata[0].buffer_std = TRUE;
-       rdata[0].data = NULL;
-       rdata[0].len = 0;
-       rdata[0].next = &rdata[1];
+    /*
+        * Prevent full page write if child's split occurs. That is needed
+        * to remove incomplete splits while replaying WAL
+        *
+        * data.updateBlkno contains new block number (of newly created right page)
+        * for recently splited page.
+        */
+       if ( data.updateBlkno == InvalidBlockNumber ) 
+       {
+               rdata[0].buffer = buf;
+               rdata[0].buffer_std = TRUE;
+               rdata[0].data = NULL;
+               rdata[0].len = 0;
+               rdata[0].next = &rdata[1];
+               cnt++;
+       }
 
-       rdata[1].buffer = InvalidBuffer;
-       rdata[1].data = (char *) &data;
-       rdata[1].len = sizeof(ginxlogInsert);
-       rdata[1].next = &rdata[2];
-
-       rdata[2].buffer = InvalidBuffer;
-       rdata[2].data = (char *) btree->entry;
-       rdata[2].len = IndexTupleSize(btree->entry);
-       rdata[2].next = NULL;
+       rdata[cnt].buffer = InvalidBuffer;
+       rdata[cnt].data = (char *) &data;
+       rdata[cnt].len = sizeof(ginxlogInsert);
+       rdata[cnt].next = &rdata[cnt+1];
+       cnt++;
+
+       rdata[cnt].buffer = InvalidBuffer;
+       rdata[cnt].data = (char *) btree->entry;
+       rdata[cnt].len = IndexTupleSize(btree->entry);
+       rdata[cnt].next = NULL;
 
        btree->entry = NULL;
 }
index 435f3fe4efd2db1aa2bd2403c0f87484e6ea7dda..2b5635700c7a94069f3a665d7d17e44ee00f0f9a 100644 (file)
 #include "catalog/index.h"
 #include "utils/memutils.h"
 
-static OffsetNumber
-findItemInPage(Page page, ItemPointer item, OffsetNumber off)
+static bool
+findItemInPage(Page page, ItemPointer item, OffsetNumber *off)
 {
        OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
        int                     res;
 
-       for (; off <= maxoff; off++)
+       if ( GinPageGetOpaque(page)->flags & GIN_DELETED )
+               /* page was deleted by concurrent  vacuum */
+               return false;
+
+       if ( *off > maxoff || *off == InvalidOffsetNumber )
+               res = -1;
+       else
+               res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off));
+
+       if ( res == 0 ) 
+       {
+               /* page isn't changed */
+               return true; 
+       } 
+       else if ( res > 0 ) 
        {
-               res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, off));
-               Assert(res >= 0);
+               /* 
+                * some items was added before our position, look further to find 
+                * it or first greater 
+                */
+       
+               (*off)++;
+               for (; *off <= maxoff; (*off)++) 
+               {
+                       res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off));
+
+                       if (res == 0)
+                               return true;
 
-               if (res == 0)
-                       return off;
+                       if (res < 0)
+                       {       
+                               (*off)--;
+                               return true;
+                       }
+               }
        }
+       else
+       {
+               /* 
+                * some items was deleted before our position, look from begining
+                * to find it or first greater
+                */
+
+               for(*off = FirstOffsetNumber; *off<= maxoff; (*off)++) 
+               {
+                       res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off));
 
-       return InvalidOffsetNumber;
+                       if ( res == 0 )
+                               return true;
+
+                       if (res < 0)
+                       {       
+                               (*off)--;
+                               return true;
+                       }
+               }
+       }
+
+       return false;
 }
 
 /*
@@ -111,7 +160,7 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry, bool firs
        }
        else if (entry->buffer != InvalidBuffer)
        {
-               /* we should find place were we was stopped */
+               /* we should find place where we was stopped */
                BlockNumber blkno;
                Page            page;
 
@@ -125,7 +174,7 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry, bool firs
                page = BufferGetPage(entry->buffer);
 
                /* try to find curItem in current buffer */
-               if ((entry->offset = findItemInPage(page, &entry->curItem, entry->offset)) != InvalidOffsetNumber)
+               if ( findItemInPage(page, &entry->curItem, &entry->offset) )
                        return;
 
                /* walk to right */
@@ -136,11 +185,15 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry, bool firs
                        LockBuffer(entry->buffer, GIN_SHARE);
                        page = BufferGetPage(entry->buffer);
 
-                       if ((entry->offset = findItemInPage(page, &entry->curItem, FirstOffsetNumber)) != InvalidOffsetNumber)
+                       entry->offset = InvalidOffsetNumber;
+                       if ( findItemInPage(page, &entry->curItem, &entry->offset) )
                                return;
                }
 
-               elog(ERROR, "Logic error: lost previously founded ItemId");
+               /*
+                * curItem and any greated items was deleted by concurrent vacuum,
+                * so we finished scan with currrent entry
+                */
        }
 }
 
index df3738cb8b84cd0d659453fdbe399492e1088940..571a9411b5ff1e50dd44762c2b6121e97f3a73e5 100644 (file)
@@ -160,14 +160,14 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
        /*
         * We should be sure that we don't concurrent with inserts, insert process
         * never release root page until end (but it can unlock it and lock
-        * again). If we lock root with with LockBufferForCleanup, new scan
-        * process can't begin, but previous may run. ginmarkpos/start* keeps
-        * buffer pinned, so we will wait for it. We lock only one posting tree in
-        * whole index, so, it's concurrent enough.. Side effect: after this is
-        * full complete, tree is unused by any other process
+        * again). New scan can't start but previously started 
+        * ones work concurrently.
         */
 
-       LockBufferForCleanup(buffer);
+       if ( isRoot ) 
+               LockBufferForCleanup(buffer);
+       else
+               LockBuffer(buffer, GIN_EXCLUSIVE); 
 
        Assert(GinPageIsData(page));
 
@@ -250,6 +250,8 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
        if (!isParentRoot)                      /* parent is already locked by
                                                                 * LockBufferForCleanup() */
                LockBuffer(pBuffer, GIN_EXCLUSIVE);
+       if (leftBlkno != InvalidBlockNumber)
+               LockBuffer(lBuffer, GIN_EXCLUSIVE);
 
        START_CRIT_SECTION();
 
@@ -257,8 +259,6 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
        {
                BlockNumber rightlink;
 
-               LockBuffer(lBuffer, GIN_EXCLUSIVE);
-
                page = BufferGetPage(dBuffer);
                rightlink = GinPageGetOpaque(page)->rightlink;
 
@@ -276,6 +276,10 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
        PageDeletePostingItem(parentPage, myoff);
 
        page = BufferGetPage(dBuffer);
+       /*
+        * we shouldn't change rightlink field to save 
+        * workability of running search scan
+        */
        GinPageGetOpaque(page)->flags = GIN_DELETED;
 
        if (!gvs->index->rd_istemp)
index 276b610bc77184d87a41d4d1a87e6f8810139856..811f6d9609a7deff14a7f19f5999b5f0dc51d93a 100644 (file)
@@ -53,6 +53,7 @@ static void
 forgetIncompleteSplit(RelFileNode node, BlockNumber leftBlkno, BlockNumber updateBlkno)
 {
        ListCell   *l;
+       bool            found = false;
 
        foreach(l, incomplete_splits)
        {
@@ -61,9 +62,16 @@ forgetIncompleteSplit(RelFileNode node, BlockNumber leftBlkno, BlockNumber updat
                if (RelFileNodeEquals(node, split->node) && leftBlkno == split->leftBlkno && updateBlkno == split->rightBlkno)
                {
                        incomplete_splits = list_delete_ptr(incomplete_splits, split);
+                       found = true;
                        break;
                }
        }
+
+       if (!found)
+       {
+               elog(ERROR, "failed to identify corresponding split record for %u/%u/%u",
+                                       node.relNode, leftBlkno, updateBlkno);
+       }
 }
 
 static void
@@ -416,7 +424,7 @@ ginRedoDeletePage(XLogRecPtr lsn, XLogRecord *record)
                UnlockReleaseBuffer(buffer);
        }
 
-       if (!(record->xl_info & XLR_BKP_BLOCK_2) && data->leftBlkno != InvalidBlockNumber)
+       if (!(record->xl_info & XLR_BKP_BLOCK_3) && data->leftBlkno != InvalidBlockNumber)
        {
                buffer = XLogReadBuffer(reln, data->leftBlkno, false);
                page = BufferGetPage(buffer);
@@ -594,6 +602,7 @@ gin_xlog_cleanup(void)
 
        MemoryContextSwitchTo(topCtx);
        MemoryContextDelete(opCtx);
+       incomplete_splits = NIL;
 }
 
 bool