Unify scan code.
authorRobert Haas <rhaas@postgresql.org>
Thu, 26 Jul 2012 21:43:15 +0000 (17:43 -0400)
committerRobert Haas <rhaas@postgresql.org>
Thu, 26 Jul 2012 21:43:15 +0000 (17:43 -0400)
src/backend/utils/hash/chash.c

index 6533d46b49a47fabd4485147c83b41fd05b48d89..4232294fd8c5ba743f0e5099103c05f8aff62853 100644 (file)
@@ -163,6 +163,17 @@ typedef struct CHashTableData
        (AssertMacro((ptr) != InvalidCHashPtr), \
         CHashTableGetRaw((table), CHashPtrGetOffset((ptr))))
 
+/*
+ * Bucket scan.
+ */
+typedef struct
+{
+       CHashPtr        target;
+       CHashPtr   *pointer_to_target;
+       CHashNode  *target_node;
+       bool            found;
+} CHashScanResult;
+
 /*
  * We need a memory barrier to make sure that our bucket advertisement is
  * committed to memory before we begin scanning the bucket, and another one
@@ -186,8 +197,11 @@ typedef struct CHashTableData
 static CHashPtr CHashAllocate(CHashTable table);
 static void CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c);
 static void CHashImmediateFree(CHashTable table, CHashPtr c);
-static bool CHashRemoveMarked(CHashTable table, uint32 bucket,
-                                 CHashPtr *cp, CHashPtr *p);
+static void CHashBucketScan(CHashTable table,
+                               CHashPtr *start,
+                               uint32 hashcode,
+                               const void *key,
+                               CHashScanResult *res);
 
 /*
  * First stage of CHashTable initialization.  We fill in all the constants
@@ -364,62 +378,22 @@ CHashSearch(CHashTable table, void *entry)
 {
        uint32  hashcode = hash_any(entry, table->desc.key_size);
        uint32  bucket = hashcode & table->bucket_mask;
-       CHashPtr        c;
-       CHashNode  *n;
-       bool    found = false;
-       int             cmp = 1;
+       CHashScanResult scan;
 
-       /* Suppress garbage collection for target bucket. */
+       /* Prevent garbage collection for this bucket. */
        CHashTableSuppressGC(table, bucket);
 
-       /* Scan bucket. */
-       c = table->bucket[bucket];
-       while (c != InvalidCHashPtr)
-       {
-               uint32  h;
-
-               /* Compare current node by hashcode, then by memcmp. */
-               n = CHashTableGetNode(table, c);
-               pg_read_barrier_depends();
-               h = n->un.hashcode;
-               if (h == hashcode)
-                       cmp = memcmp(CHashNodeGetItem(n), entry, table->desc.key_size);
-               else if (h > hashcode)
-                       cmp = 1;
-               else
-                       cmp = -1;
-
-               /* Stop if we've passed the list position where the entry should be. */
-               if (cmp >= 0)
-                       break;
-
-               /* Fetch next-pointer. */
-               c = n->next;
-       }
-
-       /*
-        * Is it the item we're looking for?
-        * 
-        * If the pointer is marked, it will be followed (if at all) by
-        * a node which is "later" than this one in terms of either
-        * hashcode or memcmp ordering; we won't find a duplicate of the
-        * current key.  This is because a marked CHashPtr is never
-        * further updated (or at least, not until after garbage
-        * collection, which we've already prevented for this bucket),
-        * so the successor element must have been present before this
-        * one was deleted.
-        */
-       if (cmp == 0 && !CHashPtrIsMarked(n->next))
-       {
+       /* Scan bucket and return data from any matching entry. */
+       CHashBucketScan(table, &table->bucket[bucket], hashcode, entry, &scan);
+       if (scan.found)
                memcpy(((char *) entry) + table->desc.key_size,
-                          CHashNodeGetItem(n) + table->desc.key_size,
+                          CHashNodeGetItem(scan.target_node) + table->desc.key_size,
                           table->desc.element_size - table->desc.key_size);
-               found = true;
-       }
 
-       /* We're done. */
+       /* Allow garbage collection for this bucket. */
        CHashTableUnsuppressGC();
-       return found;
+
+       return scan.found;
 }
 
 /*
@@ -441,11 +415,8 @@ CHashInsert(CHashTable table, void *entry)
        uint32  hashcode = hash_any(entry, table->desc.key_size);
        uint32  bucket = hashcode & table->bucket_mask;
        CHashPtr        new;
-       CHashPtr        c;
-       CHashPtr   *p;
-       CHashNode  *n;
        CHashNode  *nnew;
-       bool            found = false;
+       CHashScanResult scan;
 
        /*
         * Allocate and initialize a new entry, on the assumption that the insert
@@ -457,82 +428,52 @@ CHashInsert(CHashTable table, void *entry)
        nnew->un.hashcode = hashcode;
        memcpy(CHashNodeGetItem(nnew), entry, table->desc.element_size);
 
-       /* Suppress garbage collection for target bucket. */
+       /* Prevent garbage collection for this bucket. */
        CHashTableSuppressGC(table, bucket);
 
-       /* Scan bucket. */
+       /*
+        * Scan the bucket.  If we don't find a match, use compare-and-swap to
+        * insert the new node at the insert position.  If we do find a match,
+        * return the data to the caller.
+        */
 retry:
-       p = &table->bucket[bucket];
-       c = *p;
-       while (c != InvalidCHashPtr)
-       {
-               int             cmp;
-               uint32  h;
-
-               /* Compare current node by hashcode, then by memcmp. */
-               n = CHashTableGetNode(table, c);
-               pg_read_barrier_depends();
-               h = n->un.hashcode;
-               if (h == hashcode)
-                       cmp = memcmp(CHashNodeGetItem(n), entry, table->desc.key_size);
-               else if (h > hashcode)
-                       cmp = 1;
-               else
-                       cmp = -1;
-
-               /* If we found the key, or passed where it should be, we're done. */
-               if (cmp >= 0)
-               {
-                       found = (cmp == 0);
-                       break;
-               }
-
-               /* Move to next node. */
-               p = &n->next;
-               c = *p;
-
-               /*
-                * We can't safely update a delete-marked pointer, so remove any
-                * such pointers we find from the bucket chain.  Sometimes, concurrent
-                * activity may force us to restart the whole scan, but that should
-                * be rare.
-                */
-               if (CHashPtrIsMarked(c) && CHashRemoveMarked(table, bucket, &c, p))
-                       goto retry;
-       }
-
-       if (!found)
-       {
-               /*
-                * If we fail, it means that somebody concurrently inserted or
-                * deleted an element.  The correct insertion point might have changed,
-                * or the key we're trying to insert might now be present when it
-                * wasn't before, so we'll have to search the bucket chain anew.
-                */
-               nnew->next = c;
-               if (!__sync_bool_compare_and_swap(p, c, new))
-                       goto retry;
-       }
+       CHashBucketScan(table, &table->bucket[bucket], hashcode, entry, &scan);
+       if (scan.found)
+               memcpy(((char *) entry) + table->desc.key_size,
+                          CHashNodeGetItem(scan.target_node) + table->desc.key_size,
+                          table->desc.element_size - table->desc.key_size);
        else
        {
                /*
-                * If we did find the key, return the corresponding value to the
-                * caller.
+                * We didn't find a matching element, so use compare-and-swap to
+                * attempt to insert the new element we've prepared.  If this fails,
+                * someone currently inserted or deleted an element.  The correct
+                * insertion point might have changed, or the key we're trying to
+                * insert might now be present when it wasn't before, so we'll have
+                * to search the bucket chain anew.
+                *
+                * There is a risk of starvation here, but we hope it will not happen
+                * in practice.  Contention for inserting new elements should be
+                * spread out pretty much evenly across N+M possible insertion points,
+                * where N is the number of buckets and M is the number of elements
+                * in the table.  Even for a quite modestly size table this is likely
+                * to exceed the number of CPU cores.
                 */
-               memcpy(((char *) entry) + table->desc.key_size,
-                          CHashNodeGetItem(n) + table->desc.key_size,
-                          table->desc.element_size - table->desc.key_size);
+               nnew->next = scan.target;
+               if (!__sync_bool_compare_and_swap(scan.pointer_to_target,
+                                                                                 scan.target, new))
+                       goto retry;
        }
 
-       /* Done scanning bucket. */
+       /* Allow garbage collection for this bucket. */
        CHashTableUnsuppressGC();
 
        /* If the insert failed, free the entry we allocated. */
-       if (found)
+       if (scan.found)
                CHashImmediateFree(table, new);
 
        /* The insert succeeded if and only if no duplicate was found. */
-       return !found;
+       return !scan.found;
 }
 
 /*
@@ -544,96 +485,219 @@ CHashDelete(CHashTable table, void *entry)
 {
        uint32  hashcode = hash_any(entry, table->desc.key_size);
        uint32  bucket = hashcode & table->bucket_mask;
-       CHashPtr        c;
-       CHashPtr   *p;
-       CHashNode  *n;
-       bool            found = false;
+       bool    cleanup_scan = false;
+       CHashScanResult scan;
 
-       /* Suppress garbage collection for target bucket. */
+       /* Prevent garbage collection for this bucket. */
        CHashTableSuppressGC(table, bucket);
 
        /* Scan bucket. */
-retry:
-       p = &table->bucket[bucket];
-       c = *p;
-       while (c != InvalidCHashPtr)
-       {
-               int             cmp;
-               uint32  h;
+       CHashBucketScan(table, &table->bucket[bucket], hashcode, entry, &scan);
 
-               /* Compare current node by hashcode, then by memcmp. */
-               n = CHashTableGetNode(table, c);
-               pg_read_barrier_depends();
-               h = n->un.hashcode;
-               if (h == hashcode)
-                       cmp = memcmp(CHashNodeGetItem(n), entry, table->desc.key_size);
-               else if (h > hashcode)
-                       cmp = 1;
-               else
-                       cmp = -1;
+       /*
+        * If we found a match, then loop until we succesfully delete-mark the
+        * target, or until someone else does so.  There's no real risk of
+        * starvation here; the maximum number of times we can iterate here is
+        * equal to the number of backends other than this one, and we'd have
+        * to be very unlucky to have things turn out even that badly.
+        */
+       while (scan.found)
+       {
+               CHashPtr        next = scan.target_node->next;
 
-               /* If we found the key, or passed where it should be, we're done. */
-               if (cmp >= 0)
+               if (CHashPtrIsMarked(scan.target))
+               {
+                       /* Someone else deleted it before we did. */
+                       scan.found = false;
+               }
+               else if (__sync_bool_compare_and_swap(&scan.target_node,
+                                                                                         next,
+                                                                                         CHashPtrMark(next)))
                {
-                       found = (cmp == 0);
+                       /* Deletion is done; attempt to remove node from list. */
+                       if (__sync_bool_compare_and_swap(scan.pointer_to_target,
+                                                                                        scan.target,
+                                                                                        next))
+                               CHashAddToGarbage(table, bucket, scan.target);
+                       else
+                               cleanup_scan = true;
                        break;
                }
+       }
+
+       if (cleanup_scan)
+       {
+               /* XXX */
+       }
 
-               /* Move to next node. */
-               p = &n->next;
-               c = *p;
+       /* Allow garbage collection for this bucket. */
+       CHashTableUnsuppressGC();
+
+       /* We're done. */
+       CHashTableUnsuppressGC();
+       return scan.found;
+}
+
+/*
+ * Scan one bucket of a concurrent hash table, storing the results in a
+ * CHashResult object provided by the caller.
+ *
+ * Caller must suppress garbage collection for the target bucket before
+ * calling this function, and resume it afterwards.
+ *
+ * If a matching item is found, res->target will be its arena offset and
+ * res->found will be true.  If not, res->target will be the arena offset of
+ * the item following the insert position and found will be false.  In either
+ * case, res->pointer_to_target will a pointer to the address where the value
+ * of target was found.  res->target_node will be a pointer to the address of
+ * the CHashNode with offset res->target, unless res->target is invalid.
+ */
+static void
+CHashBucketScan(CHashTable table,
+                               CHashPtr *start,
+                               uint32 hashcode,
+                               const void *key,
+                               CHashScanResult *res)
+{
+       CHashPtr        target;
+       CHashPtr   *pointer_to_target;
+       CHashNode  *target_node = NULL;
+
+retry:
+       pointer_to_target = start;
+       target = *pointer_to_target;
+       for (;;)
+       {
+               CHashPtr        next;
+               uint32          h;
+               int                     cmp;
 
                /*
-                * We can't safely update a delete-marked pointer, so remove any
-                * such pointers we find from the bucket chain.  Sometimes, concurrent
-                * activity may force us to restart the whole scan, but that should
-                * be rare.
+                * Read arena offset of target.  If we've reached the end of the
+                * bucket chain, stop; otherwise, figure out the actual address of
+                * the next item.
                 */
-               if (CHashPtrIsMarked(c) && CHashRemoveMarked(table, bucket, &c, p))
-                       goto retry;
-       }
+               if (target == InvalidCHashPtr)
+               {
+                       res->found = false;
+                       break;
+               }
+               target_node = CHashTableGetNode(table, target);
 
-       if (found)
-       {
-               CHashPtr        cc;
+               /*
+                * target may have been fetched from an arena entry that could be
+                * concurrently modified, so a dependency barrier is required before
+                * dereferencing the derived pointer.
+                */
+               pg_read_barrier_depends();
+               next = target_node->next;
 
                /*
-                * Really do the deletion.  Since we've held no lock up to this
-                * point, it may well be that someone else has deleted the item out
-                * from under us, so we recheck that after taking the lock.
+                * For simplicity, any bucket scan, even if it's servicing a search,
+                * will attempt to remove delete-marked items from the bucket.  This
+                * ensures that delete-marked elements are removed from bucket chains
+                * as quickly as possible and reduces code duplication.  See
+                * CHashDelete for further comments about why delete-marking is
+                * necessary and how it allows safe deletion.
                 */
-               do
+               if (CHashPtrIsMarked(next))
                {
-                       cc = n->next;
-                       if (CHashPtrIsMarked(cc))
+zap:
+                       if (__sync_bool_compare_and_swap(pointer_to_target, target, next))
                        {
-                               found = false;
-                               break;
+                               /*
+                                * No one else can possibly have modified target_node->next,
+                                * because such modifications are not allowed after a
+                                * delete-mark has been applied.  Thus, if we just keep
+                                * following the next pointers, we're guaranteed to visit
+                                * all non-deleted items (and possibly some deleted items)
+                                * that were present at the time we began the scan.
+                                */
+                               /* XXX We must put target on a garbage list now! */
+                               target = next;
+                               continue;
                        }
-               } while (!__sync_bool_compare_and_swap(&n->next,
-                                                                                          cc, CHashPtrMark(cc)));
+                       else
+                       {
+                               /*
+                                * If the previous node has been delete-marked, we can't
+                                * further update the next-pointer, so restart the scan.
+                                * Otherwise, we know that some other backend removed one
+                                * or more deleted nodes from the bucket chain just as we
+                                * were trying to do, and we can simply continue the scan
+                                * from wherever the previous node is pointing now.  It's
+                                * possible that some concurrent inserts have happened, too,
+                                * but that's OK; we can view those as having happened "before"
+                                * whatever operation this scan is supporting.
+                                *
+                                * Although starvation is a theoretical possibility here if
+                                * we are forced to retry repeatedly, even a single retry is
+                                * vanishingly unlikely in practice.  It requires some other
+                                * backend to delete both the node that we're looking at and
+                                * the node which precedes it before we advance to the next
+                                * node.  That could certainly happen occasionally, but we'd
+                                * have to be pretty unlucky to have it happen even twice in
+                                * a row.
+                                */
+                               target = *pointer_to_target;
+                               if (CHashPtrIsMarked(target))
+                                       goto retry;
+                               continue;
+                       }
+               }
 
                /*
-                * At this point the deletion is done.  However, it's possible that
-                * someone else did it before we did it, in which case we need to
-                * return false rather than true.  Also, we may have successfully
-                * removed the deleted from the linked list, in which case we need to
-                * add it to the garbage list; or that we failed to do so, in which
-                * case we need to rescan the list and remove any deleted items we
-                * find.
+                * Bucket chains are kept in order, so that there is exactly one legal
+                * point at which any given key can be inserted.  The ordering is by
+                * hashcode first, and then by memcmp ordering of the keys involved.
                 */
-               Assert(!CHashPtrIsMarked(cc));
-               if (__sync_bool_compare_and_swap(p, c, cc))
-                       CHashAddToGarbage(table, bucket, c);
+               h = target_node->un.hashcode;
+               if (h == hashcode)
+                       cmp = memcmp(CHashNodeGetItem(target_node), key,
+                                                table->desc.key_size);
+               else if (h > hashcode)
+                       cmp = 1;
                else
+                       cmp = -1;
+
+               /*
+                * If cmp < 0, then we haven't yet reached the point at which we'd
+                * expect to find the key, so we must continue the scan.  If cmp == 0,
+                * we've found the key and can stop.  If cmp > 0, we've either passed
+                * the point where we expect to find the key OR someone delete-marked
+                * the item and overwrote the hashcode with a gcnext pointer.  In the
+                * latter case we must take care not to be fooled into stopping the
+                * scan early.
+                */
+               if (cmp >= 0)
                {
-                       /* XXX Rescan bucket. */
+                       if (cmp == 0)
+                       {
+                               res->found = true;
+                               break;
+                       }
+                       else
+                       {
+                               /*
+                                * pg_read_barrier() prevents the reread of the next pointer
+                                * from being speculated ahead of the read of the hash value.
+                                */
+                               pg_read_barrier();
+                               next = target_node->next;
+                               if (CHashPtrIsMarked(next))
+                                       goto zap;
+                       }
                }
+
+               /* Continue scan from next node. */
+               pointer_to_target = &target_node->next;
+               target = next;
        }
 
-       /* We're done. */
-       CHashTableUnsuppressGC();
-       return found;
+       /* Send results back to caller. */
+       res->target = target;
+       res->pointer_to_target = pointer_to_target;
+       res->target_node = target_node;
 }
 
 /*
@@ -821,57 +885,3 @@ CHashImmediateFree(CHashTable table, CHashPtr c)
                n->un.gcnext = f;
        } while (!__sync_bool_compare_and_swap(free, f, c));
 }
-
-/*
- * Attempt to remove marked elements from a bucket chain.
- *
- * p is a pointer into shared memory; it points to the CHashPtr that must be
- * updated to remove deleted elements from the chain.
- *
- * cp is a pointer into backend-private memory; it is the delete-marked pointer
- * fetched from the node to which p points.
- *
- * The return value is true if the caller must retry, or false if the caller
- * may continue the scan.  In the latter case, *cp is updated to contain a
- * pointer to the node from which the scan should be resumed.
- */
-static bool
-CHashRemoveMarked(CHashTable table, uint32 bucket, CHashPtr *cp,
-                                 CHashPtr *p)
-{
-       CHashPtr        c = *cp;
-       CHashPtr        cc;
-
-       do
-       {
-               CHashNode  *n;
-
-               /*
-                * c is logically a pointer, so we must insert a dependency barrier
-                * before deferencing it.
-                */
-               pg_read_barrier_depends();
-
-               /*
-                * Attempt to remove the deleted node from the linked list.
-                * If we fail to update the logical pointer, caller must rescan
-                * the bucket.  There's no intelligent way to continue the scan,
-                * because for all we know the node that contains the pointer we're
-                * try to update may itself be deleted by now.
-                */
-               n = CHashTableGetNode(table, c);
-               cc = n->next;
-               if (!__sync_bool_compare_and_swap(p, c, cc))
-                       return true;
-
-               /* Add c to garbage list. */
-               CHashAddToGarbage(table, bucket, c);
-
-               /* The new target of the pointer may also be delete-marked, so loop. */
-               c = cc;
-       } while (CHashPtrIsMarked(c));
-
-       /* Success! */
-       *cp = c;
-       return false;
-}