int bucketNumber);
 static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable);
 
+static void *dense_alloc(HashJoinTable hashtable, Size size);
 
 /* ----------------------------------------------------------------
  *     ExecHash
    hashtable->spaceUsedSkew = 0;
    hashtable->spaceAllowedSkew =
        hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100;
+   hashtable->chunks = NULL;
 
    /*
     * Get info about the hash functions to be used for each hash key. Also
    int         oldnbatch = hashtable->nbatch;
    int         curbatch = hashtable->curbatch;
    int         nbatch;
-   int         i;
    MemoryContext oldcxt;
    long        ninmemory;
    long        nfreed;
+   HashMemoryChunk oldchunks;
 
    /* do nothing if we've decided to shut off growth */
    if (!hashtable->growEnabled)
     */
    ninmemory = nfreed = 0;
 
-   for (i = 0; i < hashtable->nbuckets; i++)
-   {
-       HashJoinTuple prevtuple;
-       HashJoinTuple tuple;
+   /*
+    * We will scan through the chunks directly, so that we can reset the
+    * buckets now and not have to keep track of which tuples in the buckets
+    * have already been processed. We will free the old chunks as we go.
+    */
+   memset(hashtable->buckets, 0, sizeof(HashJoinTuple *) * hashtable->nbuckets);
+   oldchunks = hashtable->chunks;
+   hashtable->chunks = NULL;
 
-       prevtuple = NULL;
-       tuple = hashtable->buckets[i];
+   /* so, let's scan through the old chunks, and all tuples in each chunk */
+   while (oldchunks != NULL)
+   {
+       HashMemoryChunk nextchunk = oldchunks->next;
+       /* position within the buffer (up to oldchunks->used) */
+       size_t      idx = 0;
 
-       while (tuple != NULL)
+       /* process all tuples stored in this chunk (and then free it) */
+       while (idx < oldchunks->used)
        {
-           /* save link in case we delete */
-           HashJoinTuple nexttuple = tuple->next;
+           HashJoinTuple hashTuple = (HashJoinTuple) (oldchunks->data + idx);
+           MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple);
+           int         hashTupleSize = (HJTUPLE_OVERHEAD + tuple->t_len);
            int         bucketno;
            int         batchno;
 
            ninmemory++;
-           ExecHashGetBucketAndBatch(hashtable, tuple->hashvalue,
+           ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
                                      &bucketno, &batchno);
-           Assert(bucketno == i);
+
            if (batchno == curbatch)
            {
-               /* keep tuple */
-               prevtuple = tuple;
+               /* keep tuple in memory - copy it into the new chunk */
+               HashJoinTuple copyTuple =
+                   (HashJoinTuple) dense_alloc(hashtable, hashTupleSize);
+               memcpy(copyTuple, hashTuple, hashTupleSize);
+
+               /* and add it back to the appropriate bucket */
+               copyTuple->next = hashtable->buckets[bucketno];
+               hashtable->buckets[bucketno] = copyTuple;
            }
            else
            {
                /* dump it out */
                Assert(batchno > curbatch);
-               ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(tuple),
-                                     tuple->hashvalue,
+               ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(hashTuple),
+                                     hashTuple->hashvalue,
                                      &hashtable->innerBatchFile[batchno]);
-               /* and remove from hash table */
-               if (prevtuple)
-                   prevtuple->next = nexttuple;
-               else
-                   hashtable->buckets[i] = nexttuple;
-               /* prevtuple doesn't change */
-               hashtable->spaceUsed -=
-                   HJTUPLE_OVERHEAD + HJTUPLE_MINTUPLE(tuple)->t_len;
-               pfree(tuple);
+
+               hashtable->spaceUsed -= hashTupleSize;
                nfreed++;
            }
 
-           tuple = nexttuple;
+           /* next tuple in this chunk */
+           idx += MAXALIGN(hashTupleSize);
        }
+
+       /* we're done with this chunk - free it and proceed to the next one */
+       pfree(oldchunks);
+       oldchunks = nextchunk;
    }
 
 #ifdef HJDEBUG
 
        /* Create the HashJoinTuple */
        hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
-       hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
-                                                      hashTupleSize);
+       hashTuple = (HashJoinTuple) dense_alloc(hashtable, hashTupleSize);
+
        hashTuple->hashvalue = hashvalue;
        memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
 
    hashtable->spaceUsed = 0;
 
    MemoryContextSwitchTo(oldcxt);
+
+   /* Forget the chunks (the memory was freed by the context reset above). */
+   hashtable->chunks = NULL;
 }
 
 /*
        hashtable->spaceUsedSkew = 0;
    }
 }
+
+/*
+ * dense_alloc
+ *     Allocate 'size' bytes from the current "dense" memory chunk of the
+ *     hash table, starting a fresh HASH_CHUNK_SIZE chunk when the current
+ *     one cannot fit the request.  Requests larger than
+ *     HASH_CHUNK_THRESHOLD get a dedicated, exactly-sized chunk instead,
+ *     so the regular chunks stay densely packed.
+ *
+ * All chunks are allocated in hashtable->batchCxt, so this memory is
+ * released in bulk when that context is reset; tuples allocated here are
+ * never pfree'd individually.
+ */
+static void *
+dense_alloc(HashJoinTable hashtable, Size size)
+{
+   HashMemoryChunk newChunk;
+   char       *ptr;
+
+   /*
+    * just in case the size is not already aligned properly -- rounding up
+    * here also keeps every subsequent allocation in the chunk MAXALIGNed
+    */
+   size = MAXALIGN(size);
+
+   /*
+    * If tuple size is larger than 1/4 of chunk size, allocate a separate
+    * chunk.
+    */
+   if (size > HASH_CHUNK_THRESHOLD)
+   {
+       /* allocate new chunk and put it at the beginning of the list */
+       newChunk = (HashMemoryChunk) MemoryContextAlloc(hashtable->batchCxt,
+                                 offsetof(HashMemoryChunkData, data) + size);
+       newChunk->maxlen = size;
+       newChunk->used = 0;
+       newChunk->ntuples = 0;
+
+       /*
+        * Add this chunk to the list after the first existing chunk, so that
+        * we don't lose the remaining space in the "current" chunk.
+        */
+       if (hashtable->chunks != NULL)
+       {
+           newChunk->next = hashtable->chunks->next;
+           hashtable->chunks->next = newChunk;
+       }
+       else
+       {
+           /* list is empty, so the oversized chunk simply becomes its head */
+           newChunk->next = hashtable->chunks;
+           hashtable->chunks = newChunk;
+       }
+
+       /* the single tuple occupies the whole dedicated chunk */
+       newChunk->used += size;
+       newChunk->ntuples += 1;
+
+       return newChunk->data;
+   }
+
+   /*
+    * See if we have enough space for it in the current chunk (if any).
+    * If not, allocate a fresh chunk.
+    */
+   if ((hashtable->chunks == NULL) ||
+       (hashtable->chunks->maxlen - hashtable->chunks->used) < size)
+   {
+       /* allocate new chunk and put it at the beginning of the list */
+       newChunk = (HashMemoryChunk) MemoryContextAlloc(hashtable->batchCxt,
+                      offsetof(HashMemoryChunkData, data) + HASH_CHUNK_SIZE);
+
+       /* the first 'size' bytes of the new chunk are handed out right away */
+       newChunk->maxlen = HASH_CHUNK_SIZE;
+       newChunk->used = size;
+       newChunk->ntuples = 1;
+
+       newChunk->next = hashtable->chunks;
+       hashtable->chunks = newChunk;
+
+       return newChunk->data;
+   }
+
+   /* There is enough space in the current chunk, let's add the tuple */
+   ptr = hashtable->chunks->data + hashtable->chunks->used;
+   hashtable->chunks->used += size;
+   hashtable->chunks->ntuples += 1;
+
+   /* return pointer to the start of the tuple memory */
+   return ptr;
+}