Further simplifications.
author: Robert Haas <rhaas@postgresql.org>
Thu, 26 Jul 2012 19:51:04 +0000 (15:51 -0400)
committer: Robert Haas <rhaas@postgresql.org>
Tue, 27 Jan 2015 02:24:22 +0000 (02:24 +0000)
src/backend/utils/hash/chash.c

index be409f1749a9141e8ffa2d6597403ce9da7ac376..6533d46b49a47fabd4485147c83b41fd05b48d89 100644 (file)
@@ -150,9 +150,9 @@ typedef struct CHashTableData
        uint32                  nfreelists;             /* # of freelists */
        uint32                  arena_limit;    /* # of arena elements */
        uint32                  arena_stride;   /* bytes allocated per arena element */
-       CHashBucket        *bucket;                     /* array of size nbuckets */
-       CHashBucket        *garbage;            /* array of size ngarbage */
-       CHashBucket        *freelist;           /* array of size nfreelists */
+       CHashPtr           *bucket;                     /* array of size nbuckets */
+       CHashPtr           *garbage;            /* array of size ngarbage */
+       CHashPtr           *freelist;           /* array of size nfreelists */
        char               *arena;                      /* arena */
 } CHashTableData;
 
@@ -187,7 +187,7 @@ static CHashPtr CHashAllocate(CHashTable table);
 static void CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c);
 static void CHashImmediateFree(CHashTable table, CHashPtr c);
 static bool CHashRemoveMarked(CHashTable table, uint32 bucket,
-                                 CHashPtr *cp, volatile CHashPtr *p);
+                                 CHashPtr *cp, CHashPtr *p);
 
 /*
  * First stage of CHashTable initialization.  We fill in all the constants
@@ -272,7 +272,7 @@ CHashEstimateSize(CHashTable table)
        total_buckets = add_size(total_buckets, table->nfreelists);
 
        size = MAXALIGN(sizeof(CHashTableData));
-       size = add_size(size, mul_size(sizeof(CHashBucket), total_buckets));
+       size = add_size(size, mul_size(sizeof(CHashPtr), total_buckets));
        size = add_size(size, mul_size(table->arena_stride, table->arena_limit));
 
        return size;
@@ -317,7 +317,7 @@ CHashInitialize(CHashTable table, CHashDescriptor *desc)
        Assert(!found);
 
        /* Bucket, garbage, and freelist arrays follow table info. */
-       table->bucket = (CHashBucket *)
+       table->bucket = (CHashPtr *)
                (((char *) shmem) + MAXALIGN(sizeof(CHashTableData)));
        table->garbage = &table->bucket[table->nbuckets];
        table->freelist = &table->garbage[table->ngarbage];
@@ -327,20 +327,20 @@ CHashInitialize(CHashTable table, CHashDescriptor *desc)
 
        /* Initialize all three sets of lists to empty. */
        for (i = 0; i < table->nbuckets; ++i)
-               table->bucket[i].head = InvalidCHashPtr;
+               table->bucket[i] = InvalidCHashPtr;
        for (i = 0; i < table->ngarbage; ++i)
-               table->garbage[i].head = InvalidCHashPtr;
+               table->garbage[i] = InvalidCHashPtr;
        for (i = 0; i < table->nfreelists; ++i)
-               table->freelist[i].head = InvalidCHashPtr;
+               table->freelist[i] = InvalidCHashPtr;
 
        /* Put all arena elements on the free lists. */
        for (i = 0; i < table->arena_limit; ++i)
        {
-               CHashBucket        *f = &table->freelist[i % table->nfreelists];
-               CHashNode          *n = CHashTableGetRaw(table, i);
+               CHashPtr   *f = &table->freelist[i % table->nfreelists];
+               CHashNode  *n = CHashTableGetRaw(table, i);
 
-               n->un.gcnext = f->head;
-               f->head = MakeCHashPtr(i);
+               n->un.gcnext = *f;
+               *f = MakeCHashPtr(i);
        }
 
        /*
@@ -365,7 +365,7 @@ CHashSearch(CHashTable table, void *entry)
        uint32  hashcode = hash_any(entry, table->desc.key_size);
        uint32  bucket = hashcode & table->bucket_mask;
        CHashPtr        c;
-       volatile CHashNode  *n;
+       CHashNode  *n;
        bool    found = false;
        int             cmp = 1;
 
@@ -373,7 +373,7 @@ CHashSearch(CHashTable table, void *entry)
        CHashTableSuppressGC(table, bucket);
 
        /* Scan bucket. */
-       c = table->bucket[bucket].head;
+       c = table->bucket[bucket];
        while (c != InvalidCHashPtr)
        {
                uint32  h;
@@ -442,9 +442,9 @@ CHashInsert(CHashTable table, void *entry)
        uint32  bucket = hashcode & table->bucket_mask;
        CHashPtr        new;
        CHashPtr        c;
-       volatile CHashPtr   *p;
-       volatile CHashNode  *n;
-       volatile CHashNode  *nnew;
+       CHashPtr   *p;
+       CHashNode  *n;
+       CHashNode  *nnew;
        bool            found = false;
 
        /*
@@ -462,7 +462,7 @@ CHashInsert(CHashTable table, void *entry)
 
        /* Scan bucket. */
 retry:
-       p = &table->bucket[bucket].head;
+       p = &table->bucket[bucket];
        c = *p;
        while (c != InvalidCHashPtr)
        {
@@ -545,8 +545,8 @@ CHashDelete(CHashTable table, void *entry)
        uint32  hashcode = hash_any(entry, table->desc.key_size);
        uint32  bucket = hashcode & table->bucket_mask;
        CHashPtr        c;
-       volatile CHashPtr   *p;
-       volatile CHashNode  *n;
+       CHashPtr   *p;
+       CHashNode  *n;
        bool            found = false;
 
        /* Suppress garbage collection for target bucket. */
@@ -554,7 +554,7 @@ CHashDelete(CHashTable table, void *entry)
 
        /* Scan bucket. */
 retry:
-       p = &table->bucket[bucket].head;
+       p = &table->bucket[bucket];
        c = *p;
        while (c != InvalidCHashPtr)
        {
@@ -640,15 +640,20 @@ retry:
  * Allocate an arena slot for a new item to be inserted into a hash table.
  *
  * We don't want to wait until every single free-list is completely empty
- * before beginning to garbage collect, because that could have undesirable
- * latency characteristics, and might possibly render free-lists thoroughly
- * worthless from the point of view of contention avoidance.  Instead, we
- * check free lists and garbage lists in alternation.  If we find a non-empty
- * free list, we allocate from it; if we find a non-empty garbage list, we
- * garbage collect it and put the contents on our free list.
+ * before beginning to garbage collect, because that could result in very
+ * fast allocation followed by a storm of garbage collection activity.
+ * It could also lead to every inserting backend ganging up on the only
+ * non-empty freelist.
  *
+ * To avoid that, we check free lists and garbage lists in alternation.
  * We always start with the same free list - which one is based on our
- * backend ID - but we try to round-robin among all the garbage lists.
+ * backend ID - but we try to round-robin over all the available garbage
+ * lists.  Whenever we successfully garbage collect, we put the recovered
+ * items on our own free list.  In this way, if there's only one backend
+ * active, it will typically find a free buffer in the first place it looks:
+ * its own free list.  It will also settle into a pattern of garbage
+ * collecting the garbage list which it has visited least recently, which
+ * is what we want.
  */
 static CHashPtr
 CHashAllocate(CHashTable table)
@@ -668,55 +673,48 @@ CHashAllocate(CHashTable table)
        /* Loop until we allocate a buffer. */
        for (;;)
        {
-               volatile CHashBucket *b;
+               CHashPtr  *b;
 
-               /*
-                * Attempt to pop a buffer from a freelist using compare-and-swap.
-                */
+               /* Try to pop a buffer from a freelist using compare-and-swap. */
                b = &table->freelist[f_current];
-               new = b->head;
+               new = *b;
                if (new != InvalidCHashPtr)
                {
-                       volatile CHashNode  *n;
+                       CHashNode  *n = CHashTableGetNode(table, new);
 
-                       n = CHashTableGetNode(table, new);
+                       /*
+                        * n is computed from table->freelist[f_current], which could
+                        * be modified by concurrent activity, so we need a dependency
+                        * barrier here.
+                        */
                        pg_read_barrier_depends();
-                       if (__sync_bool_compare_and_swap(&b->head, new, n->un.gcnext))
+                       if (__sync_bool_compare_and_swap(b, new, n->un.gcnext))
                                return new;
                }
 
-               /*
-                * Check the next garbage list for recyclable buffers.  If we
-                * find any, try to garbage collect them.
-                */
+               /* If next garbage list is non-empty, empty it via compare-and-swap. */
                table->gc_next = (table->gc_next + 1) % table->ngarbage;
                b = &table->garbage[table->gc_next];
-               garbage = b->head;
+               garbage = *b;
                if (garbage != InvalidCHashPtr &&
-                       __sync_bool_compare_and_swap(&b->head, garbage, InvalidCHashPtr))
+                       __sync_bool_compare_and_swap(b, garbage, InvalidCHashPtr))
                {
-                       CHashPtr        fhead;
-                       CHashPtr        fcurrent;
-                       CHashPtr        fnext;
-                       CHashPtr        oldhead;
                        uint64          chash_bucket;
                        uint32          i;
-                       volatile CHashNode *n;
-
-                       /*
-                        * Be certain that the writes associated with popping the
-                        * garbage list are complete before we start checking whether
-                        * the garbage is recycleable.
-                        */
-                       pg_memory_barrier();
+                       CHashPtr        fhead;
+                       CHashNode *n;
 
                        /*
-                        * Spin until garbage is recyclable.  We could have a "soft"
-                        * version of this that merely requeues the garbage if it's not
-                        * immediately recycleable, but it's not clear that we need
-                        * such a thing.  On the flip side we might want to eventually
-                        * enter a longer sleep here, or PANIC, but it's not clear
-                        * exactly how to calibrate that, either.
+                        * Spin until garbage is recyclable.  We can't begin this operation
+                        * until the clearing of the garbage list has been committed to
+                        * memory, but since that was done using an atomic operation no
+                        * explicit barrier is needed here.
+                        *
+                        * Note: We could have a "soft" version of this that merely
+                        * requeues the garbage if it's not immediately recyclable, but
+                        * it's not clear that we need such a thing.  On the flip side we
+                        * might want to eventually enter a longer sleep here, or PANIC,
+                        * but it's not clear exactly how to calibrate that.
                         */
                        chash_bucket = ((uint64) table->desc.id)<<32 | table->gc_next;
                        for (i = 0; i < ProcGlobal->allProcCount; i++)
@@ -727,40 +725,38 @@ CHashAllocate(CHashTable table)
                                        ;
                        }
 
-                       /*
-                        * Be certain that all prior reads are done before starting
-                        * the next batch of writes.
-                        */
-                       pg_memory_barrier();
-
                        /* Remove one item from list to satisfy current allocation. */
                        new = garbage;
                        n = CHashTableGetNode(table, new);
                        fhead = n->un.gcnext;
 
-                       /* If that's all there was, we're done. */
-                       if (fhead == InvalidCHashPtr)
-                               return new;
-
-                       /* Walk list of reclaimed elements to end. */
-                       fcurrent = fhead;
-                       for (;;)
+                       /* Put any remaining elements back on the free list. */
+                       if (fhead != InvalidCHashPtr)
                        {
-                               n = CHashTableGetNode(table, fcurrent);
-                               fnext = n->un.gcnext;
-                               if (fnext == InvalidCHashPtr)
-                                       break;
-                               fcurrent = fnext;
+                               CHashPtr        fcurrent;
+                               CHashPtr        fnext;
+                               CHashPtr        oldhead;
+
+                               /* Walk list of reclaimed elements to end. */
+                               fcurrent = fhead;
+                               for (;;)
+                               {
+                                       n = CHashTableGetNode(table, fcurrent);
+                                       fnext = n->un.gcnext;
+                                       if (fnext == InvalidCHashPtr)
+                                               break;
+                                       fcurrent = fnext;
+                               }
+
+                               /* Push reclaimed elements onto home free list. */
+                               b = &table->freelist[f_home];
+                               do
+                               {
+                                       oldhead = *b;
+                                       n->un.gcnext = oldhead;
+                               } while (__sync_bool_compare_and_swap(b, oldhead, fhead));
                        }
 
-                       /* Push reclaimed elements onto home free list. */
-                       b = &table->freelist[f_home];
-                       do
-                       {
-                               oldhead = b->head;
-                               n->un.gcnext = oldhead;
-                       } while (__sync_bool_compare_and_swap(&b->head, oldhead, fhead));
-
                        /* Return the element we saved for ourselves. */
                        return new;
                }
@@ -785,8 +781,8 @@ CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c)
 {
        uint32          garbage_bucket;
        CHashPtr        g;
-       volatile CHashNode *n;
-       volatile CHashBucket   *garbage;
+       CHashNode *n;
+       CHashPtr *garbage;
 
        garbage_bucket = bucket >> table->garbage_shift;
        n = CHashTableGetNode(table, c);
@@ -794,9 +790,9 @@ CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c)
 
        do
        {
-               g = garbage->head;
+               g = *garbage;
                n->un.gcnext = g;
-       } while (!__sync_bool_compare_and_swap(&garbage->head, g, c));
+       } while (!__sync_bool_compare_and_swap(garbage, g, c));
 }
 
 /*
@@ -810,8 +806,8 @@ CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c)
 static void
 CHashImmediateFree(CHashTable table, CHashPtr c)
 {
-       volatile CHashNode *n;
-       volatile CHashBucket   *free;
+       CHashNode  *n;
+       CHashPtr   *free;
        uint32          f_home;
        CHashPtr        f;
 
@@ -821,9 +817,9 @@ CHashImmediateFree(CHashTable table, CHashPtr c)
 
        do
        {
-               f = free->head;
+               f = *free;
                n->un.gcnext = f;
-       } while (!__sync_bool_compare_and_swap(&free->head, f, c));
+       } while (!__sync_bool_compare_and_swap(free, f, c));
 }
 
 /*
@@ -841,14 +837,14 @@ CHashImmediateFree(CHashTable table, CHashPtr c)
  */
 static bool
 CHashRemoveMarked(CHashTable table, uint32 bucket, CHashPtr *cp,
-                                 volatile CHashPtr *p)
+                                 CHashPtr *p)
 {
        CHashPtr        c = *cp;
        CHashPtr        cc;
 
        do
        {
-               volatile CHashNode  *n;
+               CHashNode  *n;
 
                /*
                 * c is logically a pointer, so we must insert a dependency barrier