From 72bf08d6ac83c8df11b88a0c1c4b90dc84ee6209 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 26 Jul 2012 17:43:15 -0400 Subject: [PATCH] Unify scan code. --- src/backend/utils/hash/chash.c | 474 +++++++++++++++++---------------- 1 file changed, 242 insertions(+), 232 deletions(-) diff --git a/src/backend/utils/hash/chash.c b/src/backend/utils/hash/chash.c index 6533d46b49..4232294fd8 100644 --- a/src/backend/utils/hash/chash.c +++ b/src/backend/utils/hash/chash.c @@ -163,6 +163,17 @@ typedef struct CHashTableData (AssertMacro((ptr) != InvalidCHashPtr), \ CHashTableGetRaw((table), CHashPtrGetOffset((ptr)))) +/* + * Bucket scan. + */ +typedef struct +{ + CHashPtr target; + CHashPtr *pointer_to_target; + CHashNode *target_node; + bool found; +} CHashScanResult; + /* * We need a memory barrier to make sure that our bucket advertisement is * committed to memory before we begin scanning the bucket, and another one @@ -186,8 +197,11 @@ typedef struct CHashTableData static CHashPtr CHashAllocate(CHashTable table); static void CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c); static void CHashImmediateFree(CHashTable table, CHashPtr c); -static bool CHashRemoveMarked(CHashTable table, uint32 bucket, - CHashPtr *cp, CHashPtr *p); +static void CHashBucketScan(CHashTable table, + CHashPtr *start, + uint32 hashcode, + const void *key, + CHashScanResult *res); /* * First stage of CHashTable initialization. We fill in all the constants @@ -364,62 +378,22 @@ CHashSearch(CHashTable table, void *entry) { uint32 hashcode = hash_any(entry, table->desc.key_size); uint32 bucket = hashcode & table->bucket_mask; - CHashPtr c; - CHashNode *n; - bool found = false; - int cmp = 1; + CHashScanResult scan; - /* Suppress garbage collection for target bucket. */ + /* Prevent garbage collection for this bucket. */ CHashTableSuppressGC(table, bucket); - /* Scan bucket. */ - c = table->bucket[bucket]; - while (c != InvalidCHashPtr) - { - uint32 h; - - /* Compare current node by hashcode, then by memcmp. */ - n = CHashTableGetNode(table, c); - pg_read_barrier_depends(); - h = n->un.hashcode; - if (h == hashcode) - cmp = memcmp(CHashNodeGetItem(n), entry, table->desc.key_size); - else if (h > hashcode) - cmp = 1; - else - cmp = -1; - - /* Stop if we've passed the list position where the entry should be. */ - if (cmp >= 0) - break; - - /* Fetch next-pointer. */ - c = n->next; - } - - /* - * Is it the item we're looking for? - * - * If the pointer is marked, it will be followed (if at all) by - * a node which is "later" than this one in terms of either - * hashcode or memcmp ordering; we won't find a duplicate of the - * current key. This is because a marked CHashPtr is never - * further updated (or at least, not until after garbage - * collection, which we've already prevented for this bucket), - * so the successor element must have been present before this - * one was deleted. - */ - if (cmp == 0 && !CHashPtrIsMarked(n->next)) - { + /* Scan bucket and return data from any matching entry. */ + CHashBucketScan(table, &table->bucket[bucket], hashcode, entry, &scan); + if (scan.found) memcpy(((char *) entry) + table->desc.key_size, - CHashNodeGetItem(n) + table->desc.key_size, + CHashNodeGetItem(scan.target_node) + table->desc.key_size, table->desc.element_size - table->desc.key_size); - found = true; - } - /* We're done. */ + /* Allow garbage collection for this bucket. */ CHashTableUnsuppressGC(); - return found; + + return scan.found; } /* @@ -441,11 +415,8 @@ CHashInsert(CHashTable table, void *entry) uint32 hashcode = hash_any(entry, table->desc.key_size); uint32 bucket = hashcode & table->bucket_mask; CHashPtr new; - CHashPtr c; - CHashPtr *p; - CHashNode *n; CHashNode *nnew; - bool found = false; + CHashScanResult scan; /* * Allocate and initialize a new entry, on the assumption that the insert @@ -457,82 +428,52 @@ CHashInsert(CHashTable table, void *entry) nnew->un.hashcode = hashcode; memcpy(CHashNodeGetItem(nnew), entry, table->desc.element_size); - /* Suppress garbage collection for target bucket. */ + /* Prevent garbage collection for this bucket. */ CHashTableSuppressGC(table, bucket); - /* Scan bucket. */ + /* + * Scan the bucket. If we don't find a match, use compare-and-swap to + * insert the new node at the insert position. If we do find a match, + * return the data to the caller. + */ retry: - p = &table->bucket[bucket]; - c = *p; - while (c != InvalidCHashPtr) - { - int cmp; - uint32 h; - - /* Compare current node by hashcode, then by memcmp. */ - n = CHashTableGetNode(table, c); - pg_read_barrier_depends(); - h = n->un.hashcode; - if (h == hashcode) - cmp = memcmp(CHashNodeGetItem(n), entry, table->desc.key_size); - else if (h > hashcode) - cmp = 1; - else - cmp = -1; - - /* If we found the key, or passed where it should be, we're done. */ - if (cmp >= 0) - { - found = (cmp == 0); - break; - } - - /* Move to next node. */ - p = &n->next; - c = *p; - - /* - * We can't safely update a delete-marked pointer, so remove any - * such pointers we find from the bucket chain. Sometimes, concurrent - * activity may force us to restart the whole scan, but that should - * be rare. - */ - if (CHashPtrIsMarked(c) && CHashRemoveMarked(table, bucket, &c, p)) - goto retry; - } - - if (!found) - { - /* - * If we fail, it means that somebody concurrently inserted or - * deleted an element. The correct insertion point might have changed, - * or the key we're trying to insert might now be present when it - * wasn't before, so we'll have to search the bucket chain anew. - */ - nnew->next = c; - if (!__sync_bool_compare_and_swap(p, c, new)) - goto retry; - } + CHashBucketScan(table, &table->bucket[bucket], hashcode, entry, &scan); + if (scan.found) + memcpy(((char *) entry) + table->desc.key_size, + CHashNodeGetItem(scan.target_node) + table->desc.key_size, + table->desc.element_size - table->desc.key_size); else { /* - * If we did find the key, return the corresponding value to the - * caller. + * We didn't find a matching element, so use compare-and-swap to + * attempt to insert the new element we've prepared. If this fails, + * someone currently inserted or deleted an element. The correct + * insertion point might have changed, or the key we're trying to + * insert might now be present when it wasn't before, so we'll have + * to search the bucket chain anew. + * + * There is a risk of starvation here, but we hope it will not happen + * in practice. Contention for inserting new elements should be + * spread out pretty much evenly across N+M possible insertion points, + * where N is the number of buckets and M is the number of elements + * in the table. Even for a quite modestly size table this is likely + * to exceed the number of CPU cores. */ - memcpy(((char *) entry) + table->desc.key_size, - CHashNodeGetItem(n) + table->desc.key_size, - table->desc.element_size - table->desc.key_size); + nnew->next = scan.target; + if (!__sync_bool_compare_and_swap(scan.pointer_to_target, + scan.target, new)) + goto retry; } - /* Done scanning bucket. */ + /* Allow garbage collection for this bucket. */ CHashTableUnsuppressGC(); /* If the insert failed, free the entry we allocated. */ - if (found) + if (scan.found) CHashImmediateFree(table, new); /* The insert succeeded if and only if no duplicate was found. */ - return !found; + return !scan.found; } /* @@ -544,96 +485,219 @@ CHashDelete(CHashTable table, void *entry) { uint32 hashcode = hash_any(entry, table->desc.key_size); uint32 bucket = hashcode & table->bucket_mask; - CHashPtr c; - CHashPtr *p; - CHashNode *n; - bool found = false; + bool cleanup_scan = false; + CHashScanResult scan; - /* Suppress garbage collection for target bucket. */ + /* Prevent garbage collection for this bucket. */ CHashTableSuppressGC(table, bucket); /* Scan bucket. */ -retry: - p = &table->bucket[bucket]; - c = *p; - while (c != InvalidCHashPtr) - { - int cmp; - uint32 h; + CHashBucketScan(table, &table->bucket[bucket], hashcode, entry, &scan); - /* Compare current node by hashcode, then by memcmp. */ - n = CHashTableGetNode(table, c); - pg_read_barrier_depends(); - h = n->un.hashcode; - if (h == hashcode) - cmp = memcmp(CHashNodeGetItem(n), entry, table->desc.key_size); - else if (h > hashcode) - cmp = 1; - else - cmp = -1; + /* + * If we found a match, then loop until we succesfully delete-mark the + * target, or until someone else does so. There's no real risk of + * starvation here; the maximum number of times we can iterate here is + * equal to the number of backends other than this one, and we'd have + * to be very unlucky to have things turn out even that badly. + */ + while (scan.found) + { + CHashPtr next = scan.target_node->next; - /* If we found the key, or passed where it should be, we're done. */ - if (cmp >= 0) + if (CHashPtrIsMarked(scan.target)) + { + /* Someone else deleted it before we did. */ + scan.found = false; + } + else if (__sync_bool_compare_and_swap(&scan.target_node, + next, + CHashPtrMark(next))) { - found = (cmp == 0); + /* Deletion is done; attempt to remove node from list. */ + if (__sync_bool_compare_and_swap(scan.pointer_to_target, + scan.target, + next)) + CHashAddToGarbage(table, bucket, scan.target); + else + cleanup_scan = true; break; } + } + + if (cleanup_scan) + { + /* XXX */ + } - /* Move to next node. */ - p = &n->next; - c = *p; + /* Allow garbage collection for this bucket. */ + CHashTableUnsuppressGC(); + + /* We're done. */ + CHashTableUnsuppressGC(); + return scan.found; +} + +/* + * Scan one bucket of a concurrent hash table, storing the results in a + * CHashResult object provided by the caller. + * + * Caller must suppress garbage collection for the target bucket before + * calling this function, and resume it afterwards. + * + * If a matching item is found, res->target will be its arena offset and + * res->found will be true. If not, res->target will be the arena offset of + * the item following the insert position and found will be false. In either + * case, res->pointer_to_target will a pointer to the address where the value + * of target was found. res->target_node will be a pointer to the address of + * the CHashNode with offset res->target, unless res->target is invalid. + */ +static void +CHashBucketScan(CHashTable table, + CHashPtr *start, + uint32 hashcode, + const void *key, + CHashScanResult *res) +{ + CHashPtr target; + CHashPtr *pointer_to_target; + CHashNode *target_node = NULL; + +retry: + pointer_to_target = start; + target = *pointer_to_target; + for (;;) + { + CHashPtr next; + uint32 h; + int cmp; /* - * We can't safely update a delete-marked pointer, so remove any - * such pointers we find from the bucket chain. Sometimes, concurrent - * activity may force us to restart the whole scan, but that should - * be rare. + * Read arena offset of target. If we've reached the end of the + * bucket chain, stop; otherwise, figure out the actual address of + * the next item. */ - if (CHashPtrIsMarked(c) && CHashRemoveMarked(table, bucket, &c, p)) - goto retry; - } + if (target == InvalidCHashPtr) + { + res->found = false; + break; + } + target_node = CHashTableGetNode(table, target); - if (found) - { - CHashPtr cc; + /* + * target may have been fetched from an arena entry that could be + * concurrently modified, so a dependency barrier is required before + * dereferencing the derived pointer. + */ + pg_read_barrier_depends(); + next = target_node->next; /* - * Really do the deletion. Since we've held no lock up to this - * point, it may well be that someone else has deleted the item out - * from under us, so we recheck that after taking the lock. + * For simplicity, any bucket scan, even if it's servicing a search, + * will attempt to remove delete-marked items from the bucket. This + * ensures that delete-marked elements are removed from bucket chains + * as quickly as possible and reduces code duplication. See + * CHashDelete for further comments about why delete-marking is + * necessary and how it allows safe deletion. */ - do + if (CHashPtrIsMarked(next)) { - cc = n->next; - if (CHashPtrIsMarked(cc)) +zap: + if (__sync_bool_compare_and_swap(pointer_to_target, target, next)) { - found = false; - break; + /* + * No one else can possibly have modified target_node->next, + * because such modifications are not allowed after a + * delete-mark has been applied. Thus, if we just keep + * following the next pointers, we're guaranteed to visit + * all non-deleted items (and possibly some deleted items) + * that were present at the time we began the scan. + */ + /* XXX We must put target on a garbage list now! */ + target = next; + continue; } - } while (!__sync_bool_compare_and_swap(&n->next, - cc, CHashPtrMark(cc))); + else + { + /* + * If the previous node has been delete-marked, we can't + * further update the next-pointer, so restart the scan. + * Otherwise, we know that some other backend removed one + * or more deleted nodes from the bucket chain just as we + * were trying to do, and we can simply continue the scan + * from wherever the previous node is pointing now. It's + * possible that some concurrent inserts have happened, too, + * but that's OK; we can view those as having happened "before" + * whatever operation this scan is supporting. + * + * Although starvation is a theoretical possibility here if + * we are forced to retry repeatedly, even a single retry is + * vanishingly unlikely in practice. It requires some other + * backend to delete both the node that we're looking at and + * the node which precedes it before we advance to the next + * node. That could certainly happen occasionally, but we'd + * have to be pretty unlucky to have it happen even twice in + * a row. + */ + target = *pointer_to_target; + if (CHashPtrIsMarked(target)) + goto retry; + continue; + } + } /* - * At this point the deletion is done. However, it's possible that - * someone else did it before we did it, in which case we need to - * return false rather than true. Also, we may have successfully - * removed the deleted from the linked list, in which case we need to - * add it to the garbage list; or that we failed to do so, in which - * case we need to rescan the list and remove any deleted items we - * find. + * Bucket chains are kept in order, so that there is exactly one legal + * point at which any given key can be inserted. The ordering is by + * hashcode first, and then by memcmp ordering of the keys involved. */ - Assert(!CHashPtrIsMarked(cc)); - if (__sync_bool_compare_and_swap(p, c, cc)) - CHashAddToGarbage(table, bucket, c); + h = target_node->un.hashcode; + if (h == hashcode) + cmp = memcmp(CHashNodeGetItem(target_node), key, + table->desc.key_size); + else if (h > hashcode) + cmp = 1; else + cmp = -1; + + /* + * If cmp < 0, then we haven't yet reached the point at which we'd + * expect to find the key, so we must continue the scan. If cmp == 0, + * we've found the key and can stop. If cmp > 0, we've either passed + * the point where we expect to find the key OR someone delete-marked + * the item and overwrote the hashcode with a gcnext pointer. In the + * latter case we must take care not to be fooled into stopping the + * scan early. + */ + if (cmp >= 0) { - /* XXX Rescan bucket. */ + if (cmp == 0) + { + res->found = true; + break; + } + else + { + /* + * pg_read_barrier() prevents the reread of the next pointer + * from being speculated ahead of the read of the hash value. + */ + pg_read_barrier(); + next = target_node->next; + if (CHashPtrIsMarked(next)) + goto zap; + } } + + /* Continue scan from next node. */ + pointer_to_target = &target_node->next; + target = next; } - /* We're done. */ - CHashTableUnsuppressGC(); - return found; + /* Send results back to caller. */ + res->target = target; + res->pointer_to_target = pointer_to_target; + res->target_node = target_node; } /* @@ -821,57 +885,3 @@ CHashImmediateFree(CHashTable table, CHashPtr c) n->un.gcnext = f; } while (!__sync_bool_compare_and_swap(free, f, c)); } - -/* - * Attempt to remove marked elements from a bucket chain. - * - * p is a pointer into shared memory; it points to the CHashPtr that must be - * updated to remove deleted elements from the chain. - * - * cp is a pointer into backend-private memory; it is the delete-marked pointer - * fetched from the node to which p points. - * - * The return value is true if the caller must retry, or false if the caller - * may continue the scan. In the latter case, *cp is updated to contain a - * pointer to the node from which the scan should be resumed. - */ -static bool -CHashRemoveMarked(CHashTable table, uint32 bucket, CHashPtr *cp, - CHashPtr *p) -{ - CHashPtr c = *cp; - CHashPtr cc; - - do - { - CHashNode *n; - - /* - * c is logically a pointer, so we must insert a dependency barrier - * before deferencing it. - */ - pg_read_barrier_depends(); - - /* - * Attempt to remove the deleted node from the linked list. - * If we fail to update the logical pointer, caller must rescan - * the bucket. There's no intelligent way to continue the scan, - * because for all we know the node that contains the pointer we're - * try to update may itself be deleted by now. - */ - n = CHashTableGetNode(table, c); - cc = n->next; - if (!__sync_bool_compare_and_swap(p, c, cc)) - return true; - - /* Add c to garbage list. */ - CHashAddToGarbage(table, bucket, c); - - /* The new target of the pointer may also be delete-marked, so loop. */ - c = cc; - } while (CHashPtrIsMarked(c)); - - /* Success! */ - *cp = c; - return false; -} -- 2.39.5