#include "postgres.h"
#include "lib/stringinfo.h"
#include "miscadmin.h"
-#include "utils/freepage.h"
+#include "utils/sb_region.h"
/* Magic numbers to identify various page types */
#define FREE_PAGE_SPAN_LEADER_MAGIC 0xea4020f0
/* Helper functions */
static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm,
FreePageBtree *btp);
-static void FreePageBtreeCleanup(FreePageManager *fpm);
+static Size FreePageBtreeCleanup(FreePageManager *fpm);
static FreePageBtree *FreePageBtreeFindLeftSibling(char *base,
FreePageBtree *btp);
static FreePageBtree *FreePageBtreeFindRightSibling(char *base,
StringInfo buf);
static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages,
Size *first_page);
-static bool FreePageManagerPutInternal(FreePageManager *fpm, Size first_page,
+static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page,
Size npages, bool soft);
static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno);
static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page,
fpm->btree_recycle_count = 0;
fpm->singleton_first_page = 0;
fpm->singleton_npages = 0;
+ fpm->largest_reported_chunk = 0;
for (f = 0; f < FPM_NUM_FREELISTS; f++)
relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL);
{
LWLock *lock = fpm_lock(fpm);
bool result;
+ Size contiguous_pages;
if (lock != NULL)
LWLockAcquire(lock, LW_EXCLUSIVE);
result = FreePageManagerGetInternal(fpm, npages, first_page);
- FreePageBtreeCleanup(fpm);
+
+ /*
+ * It's a bit counterintuitive, but allocating pages can actually create
+ * opportunities for cleanup that produce larger contiguous ranges. We
+ * might pull a key out of the btree that enables the item at the head of
+ * the btree recycle list to be inserted; and then, if there are more
+ * items behind it, one of those might cause two currently-separated
+ * ranges to merge, creating a single range of contiguous pages larger
+ * than any that existed previously. It might be worth trying to improve
+ * the cleanup algorithm to avoid such corner cases, but for now we just
+ * notice the condition and do the appropriate reporting.
+ *
+ * Reporting is only needed for backend-private regions, so we can skip
+ * it when locking is in use, or if we discover that the region has an
+ * associated dynamic shared memory segment.
+ */
+ contiguous_pages = FreePageBtreeCleanup(fpm);
+ if (lock == NULL && contiguous_pages > fpm->largest_reported_chunk)
+ {
+ sb_region *region = sb_lookup_region(fpm);
+
+ if (region != NULL && region->seg == NULL)
+ {
+ sb_report_contiguous_freespace(region, contiguous_pages);
+ fpm->largest_reported_chunk = contiguous_pages;
+ }
+ else
+ {
+ /* No backend-private region here, so try to avoid future work. */
+ fpm->largest_reported_chunk = (Size) -1;
+ }
+ }
+
if (lock != NULL)
LWLockRelease(lock);
/*
* Return the size of the largest run of pages that the user could
- * succesfully get.
+ * successfully get. (If this value subsequently increases, it will trigger
+ * a callback to sb_report_contiguous_freespace.)
*/
Size
FreePageManagerInquireLargest(FreePageManager *fpm)
} while (f > 0);
}
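+ /*
+ * The caller now knows the exact figure, so a further report to the
+ * sb_region layer is needed only if the largest run subsequently grows
+ * beyond it.
+ */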
+ fpm->largest_reported_chunk = largest;
+
if (lock != NULL)
LWLockRelease(lock);
}
/*
- * Transfer a run of pages to the free page manager.
+ * Transfer a run of pages to the free page manager. (If the number of
+ * contiguous pages now available is larger than it was previously, then
+ * we attempt to report this to the sb_region module.)
*/
void
FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages)
{
LWLock *lock = fpm_lock(fpm);
+ Size contiguous_pages;
Assert(npages > 0);
/* Acquire lock (if there is one). */
if (lock != NULL)
LWLockAcquire(lock, LW_EXCLUSIVE);
- FreePageManagerPutInternal(fpm, first_page, npages, false);
- FreePageBtreeCleanup(fpm);
+ /* Record the new pages. */
+ contiguous_pages =
+ FreePageManagerPutInternal(fpm, first_page, npages, false);
+
+ /*
+ * If the new range we inserted into the page manager was contiguous
+ * with an existing range, it may have opened up cleanup opportunities.
+ */
+ if (contiguous_pages > npages)
+ {
+ Size cleanup_contiguous_pages;
+
+ cleanup_contiguous_pages = FreePageBtreeCleanup(fpm);
+ if (cleanup_contiguous_pages > contiguous_pages)
+ contiguous_pages = cleanup_contiguous_pages;
+ }
+
+ /*
+ * If we now have more contiguous pages available than previously
+ * reported, attempt to notify the sb_region module.
+ *
+ * Reporting is only needed for backend-private regions, so we can skip
+ * it when locking is in use, or if we discover that the region has an
+ * associated dynamic shared memory segment.
+ */
+ if (lock == NULL && contiguous_pages > fpm->largest_reported_chunk)
+ {
+ sb_region *region = sb_lookup_region(fpm);
+
+ if (region != NULL && region->seg == NULL)
+ {
+ sb_report_contiguous_freespace(region, contiguous_pages);
+ fpm->largest_reported_chunk = contiguous_pages;
+ }
+ else
+ {
+ /* No backend-private region here, so try to avoid future work. */
+ fpm->largest_reported_chunk = (Size) -1;
+ }
+ }
/* Release lock (if there is one). */
if (lock != NULL)
}
/*
- * Attempt to reclaim space from the free-page btree.
+ * Attempt to reclaim space from the free-page btree. The return value is
+ * the size of the largest run of contiguous pages created by the cleanup
+ * operation, or 0 if no such run was created.
*/
-static void
+static Size
FreePageBtreeCleanup(FreePageManager *fpm)
{
char *base = fpm_segment_base(fpm);
+ Size max_contiguous_pages = 0;
/* Attempt to shrink the depth of the btree. */
while (!relptr_is_null(fpm->btree_root))
{
FreePageBtree *btp;
Size first_page;
+ Size contiguous_pages;
btp = FreePageBtreeGetRecycled(fpm);
first_page = fpm_pointer_to_page(base, btp);
- if (!FreePageManagerPutInternal(fpm, first_page, 1, true))
+ contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true);
+ if (contiguous_pages == 0)
{
FreePageBtreeRecycle(fpm, first_page);
break;
}
+ else if (contiguous_pages > max_contiguous_pages)
+ max_contiguous_pages = contiguous_pages;
}
+
+ return max_contiguous_pages;
}
/*
* Put a range of pages into the btree and freelists, consolidating it with
* existing free spans just before and/or after it. If 'soft' is true,
* only perform the insertion if it can be done without allocating new btree
- * pages; if false, do it always. Returns true if the insertion was performed;
- * false if the soft flag caused it to be skipped.
+ * pages; if false, do it always. Returns 0 if the soft flag caused the
+ * insertion to be skipped, or otherwise the size of the contiguous span
+ * created by the insertion. This may be larger than npages if we're able
+ * to consolidate with an adjacent range.
*/
-static bool
+static Size
FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages,
bool soft)
{
fpm->singleton_first_page = first_page;
fpm->singleton_npages = npages;
FreePagePushSpanLeader(fpm, first_page, npages);
- return true;
+ return fpm->singleton_npages;
}
else if (fpm->singleton_first_page + fpm->singleton_npages ==
first_page)
FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
fpm->singleton_npages);
- return true;
+ return fpm->singleton_npages;
}
else if (first_page + npages == fpm->singleton_first_page)
{
fpm->singleton_npages += npages;
FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
fpm->singleton_npages);
- return true;
+ return fpm->singleton_npages;
}
else
{
if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page)
{
bool remove_next = false;
+ Size result;
Assert(prevkey->first_page + prevkey->npages == first_page);
prevkey->npages = (first_page - prevkey->first_page) + npages;
remove_next = true;
}
- /* Put the span on the correct freelist. */
+ /* Put the span on the correct freelist and remember its size. */
FreePagePopSpanLeader(fpm, prevkey->first_page);
FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages);
+ result = prevkey->npages;
/*
* If we consolidated with both the preceding and following entries,
if (remove_next)
FreePageBtreeRemove(fpm, np, nindex);
- return true;
+ return result;
}
/* Consolidate with the next entry if possible. */
if (nindex == 0)
FreePageBtreeAdjustAncestorKeys(fpm, np);
- return true;
+ return nextkey->npages;
}
/* Split leaf page and as many of its ancestors as necessary. */
/* If this is a soft insert, it's time to give up. */
if (soft)
- return false;
+ return 0;
/* Check whether we need to allocate more btree pages to split. */
if (result.split_pages > fpm->btree_recycle_count)
*/
FreePagePushSpanLeader(fpm, first_page, npages);
- return true;
+ return npages;
}
}
/* Put it on the free list. */
FreePagePushSpanLeader(fpm, first_page, npages);
- return true;
+ return npages;
}
/*
sb_lookup_l2 **l2;
} sb_lookup_root;
+/* Toplevel address lookup structure. */
#if SIZEOF_SIZE_T > 4
static sb_lookup_root lookup_root;
#else
static sb_lookup_leaf lookup_root_leaf;
#endif
+/*
+ * Backend-private regions, binned by the size of their largest run of
+ * contiguous free pages. Lists are doubly-linked using fl_node. List 0
+ * contains regions with no free pages at all. List I, for I>0, contains
+ * regions whose largest run of contiguous free pages is less than 2^I,
+ * except for the last list, which also holds everything with too many
+ * contiguous pages for any other list.
+ * A region may be on a higher-numbered list than where it actually belongs,
+ * but it cannot be any lower. Thus it's safe to assume that searching
+ * lower-numbered lists is always pointless, but higher-numbered lists may
+ * contain regions that can't actually satisfy a requested allocation.
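+ * For example, a region whose largest run of free pages is 5 lives on
+ * list fls(5) = 3, which covers runs of 4-7 pages; see
+ * sb_report_contiguous_freespace for the fls()-based mapping.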
+ */
+#define NUM_PRIVATE_FREELISTS 16
+static dlist_head private_freelist[NUM_PRIVATE_FREELISTS];
+
+/* Static functions. */
+static bool sb_adjust_lookup(sb_region *region, bool insert);
+static bool sb_adjust_lookup_leaf(sb_lookup_leaf *leaf, sb_region *region,
+ bool insert);
+#if SIZEOF_SIZE_T > 4
+static sb_lookup_leaf *sb_find_leaf(Size highbits, bool insert);
+#endif
+
/*
* Find the region to which a pointer belongs.
*/
*/
#if SIZEOF_SIZE_T > 4
{
- uint32 highbits = p >> 32;
- sb_lookup_l2 *l2 = NULL;
- int i;
-
- /* Check for entry in cache. */
- for (i = 0; i < lookup_root.ncached; ++i)
- if (lookup_root.cache_key[i] == highbits)
- l2 = lookup_root.cache_value[i];
-
- /*
- * If there's nothing in cache but the full table has been initialized,
- * find the l2 entry there and pull it into the cache. Since we expect
- * this path to be taken virtually never, we don't worry about LRU but
- * just pick a slot more or less arbitrarily.
- */
- if (l2 == NULL && lookup_root.l2 != NULL)
- {
- uint32 rootbits = highbits >> SB_LOOKUP_L2_BITS;
- rootbits &= SB_LOOKUP_ROOT_ENTRIES - 1;
- l2 = lookup_root.l2[rootbits];
-
- if (l2 != NULL)
- {
- i = highbits % SB_LOOKUP_ROOT_CACHE_SIZE;
- lookup_root.cache_key[i] = highbits;
- lookup_root.cache_value[i] = l2;
- }
- }
+ Size highbits = p >> 32;
- /* Now use the L2 map (if any) to find the correct leaf node. */
- if (l2 != NULL)
- leaf = l2->leaf[highbits & (SB_LOOKUP_L2_ENTRIES - 1)];
+ leaf = sb_find_leaf(highbits, false);
/* No lookup table for this 4GB range? OK, no matching region. */
if (leaf == NULL)
else
return region;
}
+
return NULL;
}
+
+/*
+ * When a free page manager detects that the maximum contiguous freespace in
+ * a backend-private region has increased, it calls this function. Our job
+ * is to move the region to a higher-numbered freelist if necessary.
+ */
+void
+sb_report_contiguous_freespace(sb_region *region, Size npages)
+{
+ Size old_freelist;
+ Size new_freelist;
+
+ /* This should only be called for private regions. */
+ Assert(region->seg == NULL);
+ Assert(region->allocator == NULL);
+
+ /*
+ * If there have been allocations from the region since the last report,
+ * it's possible that the number of pages reported is less than what we
+ * already know about. In that case, exit quickly; else update our
+ * cached value.
+ */
+ if (npages < region->contiguous_pages)
+ return;
+
+ /*
+ * If the entire region is free, deallocate it. Any FreePageManager
+ * or sb_map for this region is stored within the region itself, so
+ * we needn't do anything special to get rid of them.
+ */
+ if (npages == region->usable_pages)
+ {
+ /* Return the managed space to the operating system. */
+ free(region->region_start);
+
+ /* Pull the region out of the lookup table. */
+ sb_adjust_lookup(region, false);
+
+ /* Remove the region object from the private freelist. */
+ dlist_delete(&region->fl_node);
+
+ /* Finally, free the region object itself. */
+ free(region);
+ return;
+ }
+
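+ /*
+ * fls() reports the 1-based position of the highest set bit, and the
+ * result is clamped so that oversized runs land on the last list. Note
+ * that we only ever move a region to a higher-numbered list; if its
+ * largest run has shrunk since the last report, it simply stays on its
+ * old, higher-numbered list, which the freelist rules above permit.
+ */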
+ /* If necessary, move the region to a higher-numbered freelist. */
+ old_freelist = Min(fls(region->contiguous_pages), NUM_PRIVATE_FREELISTS - 1);
+ new_freelist = Min(fls(npages), NUM_PRIVATE_FREELISTS - 1);
+ if (new_freelist > old_freelist)
+ {
+ dlist_delete(&region->fl_node);
+ dlist_push_head(&private_freelist[new_freelist], &region->fl_node);
+ }
+
+ /* Record the reported value for future calls to this function. */
+ region->contiguous_pages = npages;
+}
+
+/*
+ * Insert a region into, or delete a region from, the address-based lookup
+ * tables. Returns true on success and false if we fail due to memory
+ * exhaustion; delete always succeeds.
+ */
+static bool
+sb_adjust_lookup(sb_region *region, bool insert)
+{
+ bool ok = true;
+
+ /*
+ * If this is a 64-bit system, we need to loop over all of the relevant
+ * tables and update each one. On a 32-bit system, there's only one table
+ * and we simply update that.
+ */
+#if SIZEOF_SIZE_T > 4
+ Size tabstart;
+ Size tabstop;
+ Size i;
+
+ tabstart = ((Size) region->region_start) >> 32;
+ tabstop = ((Size) region->region_start + region->region_size - 1) >> 32;
+
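+ /*
+ * A region that crosses a 4GB boundary is covered by more than one
+ * lookup leaf, so it must be entered in (or removed from) every leaf
+ * whose address range it touches.
+ */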
+ for (i = tabstart; i <= tabstop; ++i)
+ {
+ sb_lookup_leaf *leaf = sb_find_leaf(i, insert);
+
+ /*
+ * Finding the leaf might fail if we're inserting and can't allocate
+ * memory for a new lookup table. Even if we get the leaf, inserting
+ * the new region pointer into it might also fail for lack of memory.
+ */
+ Assert(insert || leaf != NULL);
+ if (leaf == NULL)
+ ok = false;
+ else
+ ok = sb_adjust_lookup_leaf(leaf, region, insert);
+
+ if (!ok)
+ {
+ /* We ran out of memory; back out changes already made. */
+ ok = false;
+ tabstop = i - 1;
+ for (i = tabstart; i <= tabstop; ++i)
+ sb_adjust_lookup_leaf(sb_find_leaf(i, false), region, false);
+ break;
+ }
+ }
+#else
+ ok = sb_adjust_lookup_leaf(&lookup_root_leaf, region, insert);
+#endif
+
+ return ok;
+}
+
+/*
+ * Insert a region into, or remove a region from, a particular sb_lookup_leaf.
+ * Returns true on success and false if we fail due to memory exhaustion;
+ * delete always succeeds.
+ */
+static bool
+sb_adjust_lookup_leaf(sb_lookup_leaf *leaf, sb_region *region, bool insert)
+{
+ int high, low;
+
+ /* If we're inserting, we might need to allocate more space. */
+ if (insert && leaf->nused >= leaf->nallocated)
+ {
+ Size newsize;
+ sb_region **newtab;
+
+ newsize = leaf->nallocated == 0 ? 16 : leaf->nallocated * 2;
+ newtab = malloc(sizeof(sb_region *) * newsize);
+ if (newtab == NULL)
+ return false;
+ if (leaf->nused > 0)
+ memcpy(newtab, leaf->region, sizeof(sb_region *) * leaf->nused);
+ if (leaf->region != NULL)
+ free(leaf->region);
+ leaf->region = newtab;
+ }
+
+ /* Use binary search on the sb_lookup_leaf. */
+ high = leaf->nused;
+ low = 0;
+ while (low < high)
+ {
+ int mid;
+ sb_region *candidate;
+
+ mid = (high + low) / 2;
+ candidate = leaf->region[mid];
+ if (candidate->region_start > region->region_start)
+ high = mid;
+ else if (candidate->region_start < region->region_start)
+ low = mid + 1;
+ else
+ low = high = mid;
+ }
+
+ /* Now insert or remove the region at index 'low'. */
+ if (insert)
+ {
+ Assert(low == leaf->nused ||
+ leaf->region[low]->region_start > region->region_start);
+ if (low < leaf->nused)
+ memmove(&leaf->region[low + 1], &leaf->region[low],
+ sizeof(sb_region *) * (leaf->nused - low));
+ leaf->region[low] = region;
+ ++leaf->nused;
+ }
+ else
+ {
+ Assert(leaf->region[low] == region);
+ if (low < leaf->nused - 1)
+ memmove(&leaf->region[low], &leaf->region[low + 1],
+ sizeof(sb_region *) * (leaf->nused - low - 1));
+ --leaf->nused;
+ }
+
+ return true;
+}
+
+#if SIZEOF_SIZE_T > 4
+static sb_lookup_leaf *
+sb_find_leaf(Size highbits, bool insert)
+{
+ Size rootbits;
+ sb_lookup_l2 *l2 = NULL;
+ sb_lookup_leaf **leafptr;
+ int i;
+ int unused = -1;
+
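+ /*
+ * The upper half of the address ('highbits') is split further: its
+ * high-order bits select a slot in the root table, while its low-order
+ * SB_LOOKUP_L2_BITS select a leaf slot within that L2 node.
+ */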
+ rootbits = (highbits >> SB_LOOKUP_L2_BITS) & (SB_LOOKUP_ROOT_ENTRIES - 1);
+
+ /* Check for L2 entry in toplevel cache. */
+ for (i = 0; i < lookup_root.ncached; ++i)
+ {
+ if (lookup_root.cache_key[i] == highbits)
+ l2 = lookup_root.cache_value[i];
+ else if (lookup_root.cache_value[i] == NULL)
+ unused = i;
+ }
+
+ /* If no hit, check the full L2 lookup table, if it's been initialized. */
+ if (l2 == NULL && lookup_root.l2 != NULL)
+ {
+ rootbits &= SB_LOOKUP_ROOT_ENTRIES - 1;
+ l2 = lookup_root.l2[rootbits];
+
+ /* Pull entry into cache. */
+ if (l2 != NULL)
+ {
+ /*
+ * No need to be smart about replacement policy; we expect to
+ * arrive here virtually never.
+ */
+ i = highbits % SB_LOOKUP_ROOT_CACHE_SIZE;
+ lookup_root.cache_key[i] = highbits;
+ lookup_root.cache_value[i] = l2;
+ }
+ }
+
+ /* If inserting and no L2 entry found, create one. */
+ if (insert && l2 == NULL)
+ {
+ l2 = calloc(1, sizeof(sb_lookup_l2));
+ if (l2 == NULL)
+ return NULL;
+ if (unused != -1)
+ {
+ lookup_root.cache_key[unused] = highbits;
+ lookup_root.cache_value[unused] = l2;
+ }
+ else if (lookup_root.l2 != NULL)
+ lookup_root.l2[rootbits] = l2;
+ else
+ {
+ lookup_root.l2 = calloc(SB_LOOKUP_ROOT_ENTRIES,
+ sizeof(sb_lookup_l2 *));
+ if (lookup_root.l2 == NULL)
+ {
+ free(l2);
+ return NULL;
+ }
+ /* Transfer cached entries into the new table, then add ours. */
+ for (i = 0; i < lookup_root.ncached; ++i)
+ lookup_root.l2[(lookup_root.cache_key[i] >> SB_LOOKUP_L2_BITS)
+ & (SB_LOOKUP_ROOT_ENTRIES - 1)] =
+ lookup_root.cache_value[i];
+ lookup_root.l2[rootbits] = l2;
+ }
+ }
+
+ /* If we still don't have an L2 node, there's no matching leaf. */
+ if (l2 == NULL)
+ return NULL;
+
+ /* Find slot for the leaf, and try to initialize it if needed. */
+ leafptr = &l2->leaf[highbits & (SB_LOOKUP_L2_ENTRIES - 1)];
+ if (insert && *leafptr == NULL)
+ *leafptr = calloc(1, sizeof(sb_lookup_leaf));
+
+ return *leafptr;
+}
+#endif
unsigned btree_recycle_count;
Size singleton_first_page;
Size singleton_npages;
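+ /* Last contiguous-run size reported to the sb_region layer. */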
+ Size largest_reported_chunk;
relptr(FreePageSpanLeader) freelist[FPM_NUM_FREELISTS];
};
#ifndef SB_REGION_H
#define SB_REGION_H
+#include "lib/ilist.h"
#include "storage/dsm.h"
#include "storage/shm_toc.h"
#include "utils/freepage.h"
*/
typedef struct sb_region
{
- char *region_start;
- Size region_size;
- Size usable_pages;
- dsm_segment *seg;
- FreePageManager *fpm;
- sb_map *pagemap;
- sb_allocator *allocator;
+ char *region_start; /* Address of region. */
+ Size region_size; /* Number of bytes in region. */
+ Size usable_pages; /* Number of usable pages in region. */
+ dsm_segment *seg; /* If not backend-private, DSM handle. */
+ sb_allocator *allocator; /* If not backend-private, shared allocator. */
+ FreePageManager *fpm; /* Free page manager for region (if any). */
+ sb_map *pagemap; /* Page map for region (if any). */
+ Size contiguous_pages; /* Last reported contiguous free pages. */
+ dlist_node fl_node; /* Freelist links. */
} sb_region;
/*
extern sb_allocator *sb_attach_shared_region(dsm_segment *,
sb_shared_region *);
-/* For use by sb_alloc/sb_free. */
+/* For internal use by cooperating modules. */
extern sb_region *sb_lookup_region(void *);
extern sb_region *sb_private_region_for_allocator(Size npages);
-extern void sb_release_private_region(sb_region *);
+extern void sb_report_contiguous_freespace(sb_region *, Size npages);
#endif /* SB_REGION_H */