Teach tuplesort.c about "top N" sorting, in which only the first N tuples

author Tom Lane <tgl@sss.pgh.pa.us>

Fri, 4 May 2007 01:13:45 +0000 (01:13 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Fri, 4 May 2007 01:13:45 +0000 (01:13 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 4 May 2007 01:13:45 +0000 (01:13 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 4 May 2007 01:13:45 +0000 (01:13 +0000)
diff --git a/src/backend/executor/nodeLimit.c b/src/backend/executor/nodeLimit.c

index bb52b0fada3bfa44d00b07d38ef42ed8660f3df3..5a32ccfc47fc38092af3b7252704e456b7dcad22 100644 (file)
--- a/src/backend/executor/nodeLimit.c
+++ b/src/backend/executor/nodeLimit.c
@@ -280,6 +280,37 @@ recompute_limits(LimitState *node)
         /* Reset position to start-of-scan */
         node->position = 0;
         node->subSlot = NULL;
+
+       /*
+        * If we have a COUNT, and our input is a Sort node, notify it that it can
+        * use bounded sort.
+        *
+        * This is a bit of a kluge, but we don't have any more-abstract way of
+        * communicating between the two nodes; and it doesn't seem worth trying
+        * to invent one without some more examples of special communication needs.
+        *
+        * Note: it is the responsibility of nodeSort.c to react properly to
+        * changes of these parameters.  If we ever do redesign this, it'd be
+        * a good idea to integrate this signaling with the parameter-change
+        * mechanism.
+        */
+       if (IsA(outerPlanState(node), SortState))
+       {
+               SortState *sortState = (SortState *) outerPlanState(node);
+               int64 tuples_needed = node->count + node->offset;
+
+               /* negative test checks for overflow */
+               if (node->noCount || tuples_needed < 0)
+               {
+                       /* make sure flag gets reset if needed upon rescan */
+                       sortState->bounded = false;
+               }
+               else
+               {
+                       sortState->bounded = true;
+                       sortState->bound = tuples_needed;
+               }
+       }
  }
  
  /* ----------------------------------------------------------------
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c

index 11553c9bb6e9a180615b32ee637fa613a2a00702..01d727a6ec91485aa9b6e0f92c993952891427f1 100644 (file)
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@@ -89,6 +89,8 @@ ExecSort(SortState *node)
                                                                                           plannode->nullsFirst,
                                                                                           work_mem,
                                                                                           node->randomAccess);
+               if (node->bounded)
+                       tuplesort_set_bound(tuplesortstate, node->bound);
                 node->tuplesortstate = (void *) tuplesortstate;
  
                 /*
@@ -119,6 +121,8 @@ ExecSort(SortState *node)
                  * finally set the sorted flag to true
                  */
                 node->sort_Done = true;
+               node->bounded_Done = node->bounded;
+               node->bound_Done = node->bound;
                 SO1_printf("ExecSort: %s\n", "sorting done");
         }
  
@@ -167,6 +171,7 @@ ExecInitSort(Sort *node, EState *estate, int eflags)
                                                                                  EXEC_FLAG_BACKWARD |
                                                                                  EXEC_FLAG_MARK)) != 0;
  
+       sortstate->bounded = false;
         sortstate->sort_Done = false;
         sortstate->tuplesortstate = NULL;
  
@@ -307,11 +312,14 @@ ExecReScanSort(SortState *node, ExprContext *exprCtxt)
  
         /*
          * If subnode is to be rescanned then we forget previous sort results; we
-        * have to re-read the subplan and re-sort.
+        * have to re-read the subplan and re-sort.  Also must re-sort if the
+        * bounded-sort parameters changed or we didn't select randomAccess.
          *
          * Otherwise we can just rewind and rescan the sorted output.
          */
         if (((PlanState *) node)->lefttree->chgParam != NULL ||
+               node->bounded != node->bounded_Done ||
+               node->bound != node->bound_Done ||
                 !node->randomAccess)
         {
                 node->sort_Done = false;
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c

index 74b4c8ed268b16ad89ce7d80b0bdf0299be29a70..c428c8159f1ea170372be8581d551ba3d670e478 100644 (file)
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -922,6 +922,10 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
   *             disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem)))
   *             cpu = comparison_cost * t * log2(t)
   *
+ * If the sort is bounded (i.e., only the first k result tuples are needed)
+ * and k tuples can fit into work_mem, we use a heap method that keeps only
+ * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
+ *
   * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
   * accesses (XXX can't we refine that guess?)
   *
@@ -932,6 +936,7 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
   * 'input_cost' is the total cost for reading the input data
   * 'tuples' is the number of tuples in the relation
   * 'width' is the average tuple width in bytes
+ * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
   *
   * NOTE: some callers currently pass NIL for pathkeys because they
   * can't conveniently supply the sort keys.  Since this routine doesn't
@@ -942,11 +947,14 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
   */
  void
  cost_sort(Path *path, PlannerInfo *root,
-                 List *pathkeys, Cost input_cost, double tuples, int width)
+                 List *pathkeys, Cost input_cost, double tuples, int width,
+                 double limit_tuples)
  {
         Cost            startup_cost = input_cost;
         Cost            run_cost = 0;
-       double          nbytes = relation_byte_size(tuples, width);
+       double          input_bytes = relation_byte_size(tuples, width);
+       double          output_bytes;
+       double          output_tuples;
         long            work_mem_bytes = work_mem * 1024L;
  
         if (!enable_sort)
@@ -959,23 +967,39 @@ cost_sort(Path *path, PlannerInfo *root,
         if (tuples < 2.0)
                 tuples = 2.0;
  
-       /*
-        * CPU costs
-        *
-        * Assume about two operator evals per tuple comparison and N log2 N
-        * comparisons
-        */
-       startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+       /* Do we have a useful LIMIT? */
+       if (limit_tuples > 0 && limit_tuples < tuples)
+       {
+               output_tuples = limit_tuples;
+               output_bytes = relation_byte_size(output_tuples, width);
+       }
+       else
+       {
+               output_tuples = tuples;
+               output_bytes = input_bytes;
+       }
  
-       /* disk costs */
-       if (nbytes > work_mem_bytes)
+       if (output_bytes > work_mem_bytes)
         {
-               double          npages = ceil(nbytes / BLCKSZ);
-               double          nruns = (nbytes / work_mem_bytes) * 0.5;
+               /*
+                * We'll have to use a disk-based sort of all the tuples
+                */
+               double          npages = ceil(input_bytes / BLCKSZ);
+               double          nruns = (input_bytes / work_mem_bytes) * 0.5;
                 double          mergeorder = tuplesort_merge_order(work_mem_bytes);
                 double          log_runs;
                 double          npageaccesses;
  
+               /*
+                * CPU costs
+                *
+                * Assume about two operator evals per tuple comparison and N log2 N
+                * comparisons
+                */
+               startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+
+               /* Disk costs */
+
                 /* Compute logM(r) as log(r) / log(M) */
                 if (nruns > mergeorder)
                         log_runs = ceil(log(nruns) / log(mergeorder));
@@ -986,10 +1010,27 @@ cost_sort(Path *path, PlannerInfo *root,
                 startup_cost += npageaccesses *
                         (seq_page_cost * 0.75 + random_page_cost * 0.25);
         }
+       else if (tuples > 2 * output_tuples || input_bytes > work_mem_bytes)
+       {
+               /*
+                * We'll use a bounded heap-sort keeping just K tuples in memory,
+                * for a total number of tuple comparisons of N log2 K; but the
+                * constant factor is a bit higher than for quicksort.  Tweak it
+                * so that the cost curve is continuous at the crossover point.
+                */
+               startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(2.0 * output_tuples);
+       }
+       else
+       {
+               /* We'll use plain quicksort on all the input tuples */
+               startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+       }
  
         /*
          * Also charge a small amount (arbitrarily set equal to operator cost) per
-        * extracted tuple.
+        * extracted tuple.  Note it's correct to use tuples not output_tuples
+        * here --- the upper LIMIT will pro-rate the run cost so we'd be double
+        * counting the LIMIT otherwise.
          */
         run_cost += cpu_operator_cost * tuples;
  
@@ -1431,7 +1472,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
                                   outersortkeys,
                                   outer_path->total_cost,
                                   outer_path_rows,
-                                 outer_path->parent->width);
+                                 outer_path->parent->width,
+                                 -1.0);
                 startup_cost += sort_path.startup_cost;
                 run_cost += (sort_path.total_cost - sort_path.startup_cost)
                         * outerscansel;
@@ -1450,7 +1492,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
                                   innersortkeys,
                                   inner_path->total_cost,
                                   inner_path_rows,
-                                 inner_path->parent->width);
+                                 inner_path->parent->width,
+                                 -1.0);
                 startup_cost += sort_path.startup_cost;
                 run_cost += (sort_path.total_cost - sort_path.startup_cost)
                         * innerscansel * rescanratio;
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c

index 57a5a078377444c7ec1e94cc762cc7855e5f8ed5..eb4a276ff6b105d0a1bfe7427e04fb7385eb7535 100644 (file)
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -122,7 +122,8 @@ static MergeJoin *make_mergejoin(List *tlist,
                            Plan *lefttree, Plan *righttree,
                            JoinType jointype);
  static Sort *make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
-                 AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst);
+                 AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst,
+                 double limit_tuples);
  
  
  /*
@@ -1579,7 +1580,8 @@ create_mergejoin_plan(PlannerInfo *root,
                 outer_plan = (Plan *)
                         make_sort_from_pathkeys(root,
                                                                         outer_plan,
-                                                                       best_path->outersortkeys);
+                                                                       best_path->outersortkeys,
+                                                                       -1.0);
                 outerpathkeys = best_path->outersortkeys;
         }
         else
@@ -1591,7 +1593,8 @@ create_mergejoin_plan(PlannerInfo *root,
                 inner_plan = (Plan *)
                         make_sort_from_pathkeys(root,
                                                                         inner_plan,
-                                                                       best_path->innersortkeys);
+                                                                       best_path->innersortkeys,
+                                                                       -1.0);
                 innerpathkeys = best_path->innersortkeys;
         }
         else
@@ -2589,11 +2592,13 @@ make_mergejoin(List *tlist,
   * make_sort --- basic routine to build a Sort plan node
   *
   * Caller must have built the sortColIdx, sortOperators, and nullsFirst
- * arrays already.
+ * arrays already.  limit_tuples is as for cost_sort (in particular, pass
+ * -1 if no limit)
   */
  static Sort *
  make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
-                 AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst)
+                 AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst,
+                 double limit_tuples)
  {
         Sort       *node = makeNode(Sort);
         Plan       *plan = &node->plan;
@@ -2603,7 +2608,8 @@ make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
         cost_sort(&sort_path, root, NIL,
                           lefttree->total_cost,
                           lefttree->plan_rows,
-                         lefttree->plan_width);
+                         lefttree->plan_width,
+                         limit_tuples);
         plan->startup_cost = sort_path.startup_cost;
         plan->total_cost = sort_path.total_cost;
         plan->targetlist = lefttree->targetlist;
@@ -2664,6 +2670,8 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first,
   *
   *       'lefttree' is the node which yields input tuples
   *       'pathkeys' is the list of pathkeys by which the result is to be sorted
+ *       'limit_tuples' is the bound on the number of output tuples;
+ *                             -1 if no bound
   *
   * We must convert the pathkey information into arrays of sort key column
   * numbers and sort operator OIDs.
@@ -2675,7 +2683,8 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first,
   * adding a Result node just to do the projection.
   */
  Sort *
-make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys)
+make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys,
+                                               double limit_tuples)
  {
         List       *tlist = lefttree->targetlist;
         ListCell   *i;
@@ -2810,7 +2819,7 @@ make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys)
         Assert(numsortkeys > 0);
  
         return make_sort(root, lefttree, numsortkeys,
-                                        sortColIdx, sortOperators, nullsFirst);
+                                        sortColIdx, sortOperators, nullsFirst, limit_tuples);
  }
  
  /*
@@ -2859,7 +2868,7 @@ make_sort_from_sortclauses(PlannerInfo *root, List *sortcls, Plan *lefttree)
         Assert(numsortkeys > 0);
  
         return make_sort(root, lefttree, numsortkeys,
-                                        sortColIdx, sortOperators, nullsFirst);
+                                        sortColIdx, sortOperators, nullsFirst, -1.0);
  }
  
  /*
@@ -2919,7 +2928,7 @@ make_sort_from_groupcols(PlannerInfo *root,
         Assert(numsortkeys > 0);
  
         return make_sort(root, lefttree, numsortkeys,
-                                        sortColIdx, sortOperators, nullsFirst);
+                                        sortColIdx, sortOperators, nullsFirst, -1.0);
  }
  
  Material *
diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c

index c0b8dab4cbcfeba8ce1850d44eb6ef528cabd177..cc053142255cc8fafdf1b77a80370c5563354bd2 100644 (file)
--- a/src/backend/optimizer/plan/planmain.c
+++ b/src/backend/optimizer/plan/planmain.c
@@ -47,6 +47,8 @@
   * tlist is the target list the query should produce
   *             (this is NOT necessarily root->parse->targetList!)
   * tuple_fraction is the fraction of tuples we expect will be retrieved
+ * limit_tuples is a hard limit on number of tuples to retrieve,
+ *             or -1 if no limit
   *
   * Output parameters:
   * *cheapest_path receives the overall-cheapest path for the query
@@ -74,9 +76,13 @@
   *             from the plan to be retrieved
   *       tuple_fraction >= 1: tuple_fraction is the absolute number of tuples
   *             expected to be retrieved (ie, a LIMIT specification)
+ * Note that a nonzero tuple_fraction could come from outer context; it is
+ * therefore not redundant with limit_tuples.  We use limit_tuples to determine
+ * whether a bounded sort can be used at runtime.
   */
  void
-query_planner(PlannerInfo *root, List *tlist, double tuple_fraction,
+query_planner(PlannerInfo *root, List *tlist,
+                         double tuple_fraction, double limit_tuples,
                           Path **cheapest_path, Path **sorted_path,
                           double *num_groups)
  {
@@ -354,7 +360,8 @@ query_planner(PlannerInfo *root, List *tlist, double tuple_fraction,
                         /* Figure cost for sorting */
                         cost_sort(&sort_path, root, root->query_pathkeys,
                                           cheapestpath->total_cost,
-                                         final_rel->rows, final_rel->width);
+                                         final_rel->rows, final_rel->width,
+                                         limit_tuples);
                 }
  
                 if (compare_fractional_path_costs(sortedpath, &sort_path,
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c

index 0aba6826102b01bb3fe1af384efcdfd659d74c8c..bb9f468f2b69d426c539689bdcc13357e281c53e 100644 (file)
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -61,7 +61,8 @@ static double preprocess_limit(PlannerInfo *root,
                                  double tuple_fraction,
                                  int64 *offset_est, int64 *count_est);
  static Oid *extract_grouping_ops(List *groupClause);
-static bool choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
+static bool choose_hashed_grouping(PlannerInfo *root,
+                                          double tuple_fraction, double limit_tuples,
                                            Path *cheapest_path, Path *sorted_path,
                                            Oid *groupOperators, double dNumGroups,
                                            AggClauseCounts *agg_counts);
@@ -696,6 +697,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
         List       *tlist = parse->targetList;
         int64           offset_est = 0;
         int64           count_est = 0;
+       double          limit_tuples = -1.0;
         Plan       *result_plan;
         List       *current_pathkeys;
         List       *sort_pathkeys;
@@ -703,8 +705,16 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
  
         /* Tweak caller-supplied tuple_fraction if have LIMIT/OFFSET */
         if (parse->limitCount || parse->limitOffset)
+       {
                 tuple_fraction = preprocess_limit(root, tuple_fraction,
                                                                                   &offset_est, &count_est);
+               /*
+                * If we have a known LIMIT, and don't have an unknown OFFSET,
+                * we can estimate the effects of using a bounded sort.
+                */
+               if (count_est > 0 && offset_est >= 0)
+                       limit_tuples = (double) count_est + (double) offset_est;
+       }
  
         if (parse->setOperations)
         {
@@ -850,7 +860,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
                  * estimate the number of groups in the query, and canonicalize all
                  * the pathkeys.
                  */
-               query_planner(root, sub_tlist, tuple_fraction,
+               query_planner(root, sub_tlist, tuple_fraction, limit_tuples,
                                           &cheapest_path, &sorted_path, &dNumGroups);
  
                 group_pathkeys = root->group_pathkeys;
@@ -864,7 +874,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
                 {
                         groupOperators = extract_grouping_ops(parse->groupClause);
                         use_hashed_grouping =
-                               choose_hashed_grouping(root, tuple_fraction,
+                               choose_hashed_grouping(root, tuple_fraction, limit_tuples,
                                                                            cheapest_path, sorted_path,
                                                                            groupOperators, dNumGroups,
                                                                            &agg_counts);
@@ -1099,7 +1109,8 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
                 {
                         result_plan = (Plan *) make_sort_from_pathkeys(root,
                                                                                                                    result_plan,
-                                                                                                                  sort_pathkeys);
+                                                                                                                  sort_pathkeys,
+                                                                                                                  limit_tuples);
                         current_pathkeys = sort_pathkeys;
                 }
         }
@@ -1414,7 +1425,8 @@ extract_grouping_ops(List *groupClause)
   * choose_hashed_grouping - should we use hashed grouping?
   */
  static bool
-choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
+choose_hashed_grouping(PlannerInfo *root,
+                                          double tuple_fraction, double limit_tuples,
                                            Path *cheapest_path, Path *sorted_path,
                                            Oid *groupOperators, double dNumGroups,
                                            AggClauseCounts *agg_counts)
@@ -1499,7 +1511,7 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
         /* Result of hashed agg is always unsorted */
         if (root->sort_pathkeys)
                 cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost,
-                                 dNumGroups, cheapest_path_width);
+                                 dNumGroups, cheapest_path_width, limit_tuples);
  
         if (sorted_path)
         {
@@ -1516,7 +1528,7 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
         if (!pathkeys_contained_in(root->group_pathkeys, current_pathkeys))
         {
                 cost_sort(&sorted_p, root, root->group_pathkeys, sorted_p.total_cost,
-                                 cheapest_path_rows, cheapest_path_width);
+                                 cheapest_path_rows, cheapest_path_width, -1.0);
                 current_pathkeys = root->group_pathkeys;
         }
  
@@ -1533,7 +1545,7 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
         if (root->sort_pathkeys &&
                 !pathkeys_contained_in(root->sort_pathkeys, current_pathkeys))
                 cost_sort(&sorted_p, root, root->sort_pathkeys, sorted_p.total_cost,
-                                 dNumGroups, cheapest_path_width);
+                                 dNumGroups, cheapest_path_width, limit_tuples);
  
         /*
          * Now make the decision using the top-level tuple fraction.  First we
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c

index ce25b077db442a0903afd30d61abdc21722c52e9..929e3eb0f23ab8fff7583cd13446a5374fed9c6d 100644 (file)
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -842,7 +842,8 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
         cost_sort(&sort_path, root, NIL,
                           subpath->total_cost,
                           rel->rows,
-                         rel->width);
+                         rel->width,
+                         -1.0);
  
         /*
          * Charge one cpu_operator_cost per comparison per input tuple. We assume
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c

index 31fbb949fc3cacbbd7b7de18da2eeec2ed5940a4..c6124676b110f54be24cbfe69aa8b389c2b91170 100644 (file)
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -108,6 +108,9 @@ extern bool fullPageWrites;
  #ifdef TRACE_SORT
  extern bool trace_sort;
  #endif
+#ifdef DEBUG_BOUNDED_SORT
+extern bool optimize_bounded_sort;
+#endif
  
  #ifdef USE_SSL
  extern char *SSLCipherSuites;
@@ -966,6 +969,20 @@ static struct config_bool ConfigureNamesBool[] =
         },
  #endif
  
+#ifdef DEBUG_BOUNDED_SORT
+       /* this is undocumented because not exposed in a standard build */
+       {
+               {
+                       "optimize_bounded_sort", PGC_USERSET, QUERY_TUNING_METHOD,
+                       gettext_noop("Enable bounded sorting using heap sort."),
+                       NULL,
+                       GUC_NOT_IN_SAMPLE
+               },
+               &optimize_bounded_sort,
+               true, NULL, NULL
+       },
+#endif
+
  #ifdef WAL_DEBUG
         {
                 {"wal_debug", PGC_SUSET, DEVELOPER_OPTIONS,
@@ -1711,7 +1728,7 @@ static struct config_int ConfigureNamesInt[] =
                 &server_version_num,
                 PG_VERSION_NUM, PG_VERSION_NUM, PG_VERSION_NUM, NULL, NULL
         },
-                               
+
         {
                 {"log_temp_files", PGC_USERSET, LOGGING_WHAT,
                         gettext_noop("Log the use of temporary files larger than this number of kilobytes."),
@@ -2883,7 +2900,7 @@ InitializeGUCOptions(void)
                                                                                                    PGC_S_DEFAULT))
                                                         elog(FATAL, "failed to initialize %s to %d",
                                                                  conf->gen.name, conf->boot_val);
-                                       *conf->variable = conf->reset_val = conf->boot_val; 
+                                       *conf->variable = conf->reset_val = conf->boot_val;
                                         break;
                                 }
                         case PGC_REAL:
@@ -2897,7 +2914,7 @@ InitializeGUCOptions(void)
                                                                                                    PGC_S_DEFAULT))
                                                         elog(FATAL, "failed to initialize %s to %g",
                                                                  conf->gen.name, conf->boot_val);
-                                       *conf->variable = conf->reset_val = conf->boot_val; 
+                                       *conf->variable = conf->reset_val = conf->boot_val;
                                         break;
                                 }
                         case PGC_STRING:
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c

index 50cfe13098e9abe55984b7c4029b576aecac856b..b775dd5f4cc7ce1a768026f68f275c281e070f74 100644 (file)
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -98,6 +98,8 @@
  
  #include "postgres.h"
  
+#include <limits.h>
+
  #include "access/heapam.h"
  #include "access/nbtree.h"
  #include "catalog/pg_amop.h"
@@ -112,10 +114,13 @@
  #include "utils/tuplesort.h"
  
  
-/* GUC variable */
+/* GUC variables */
  #ifdef TRACE_SORT
  bool           trace_sort = false;
  #endif
+#ifdef DEBUG_BOUNDED_SORT
+bool           optimize_bounded_sort = true;
+#endif
  
  
  /*
@@ -159,6 +164,7 @@ typedef struct
  typedef enum
  {
         TSS_INITIAL,                            /* Loading tuples; still within memory limit */
+       TSS_BOUNDED,                            /* Loading tuples into bounded-size heap */
         TSS_BUILDRUNS,                          /* Loading tuples; writing to tape */
         TSS_SORTEDINMEM,                        /* Sort completed entirely in memory */
         TSS_SORTEDONTAPE,                       /* Sort completed, final run is on tape */
@@ -188,6 +194,9 @@ struct Tuplesortstate
         TupSortStatus status;           /* enumerated value as shown above */
         int                     nKeys;                  /* number of columns in sort key */
         bool            randomAccess;   /* did caller request random access? */
+       bool            bounded;                /* did caller specify a maximum number of
+                                                                * tuples to return? */
+       int                     bound;                  /* if bounded, the maximum number of tuples */
         long            availMem;               /* remaining memory available, in bytes */
         long            allowedMem;             /* total memory allowed, in bytes */
         int                     maxTapes;               /* number of tapes (Knuth's T) */
@@ -234,6 +243,13 @@ struct Tuplesortstate
         void            (*readtup) (Tuplesortstate *state, SortTuple *stup,
                                                                                 int tapenum, unsigned int len);
  
+       /*
+        * Function to reverse the sort direction from its current state.
+        * (We could dispense with this if we wanted to enforce that all variants
+        * represent the sort key information alike.)
+        */
+       void            (*reversedirection) (Tuplesortstate *state);
+
         /*
          * This array holds the tuples now in sort memory.      If we are in state
          * INITIAL, the tuples are in no particular order; if we are in state
@@ -347,6 +363,7 @@ struct Tuplesortstate
  #define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
  #define WRITETUP(state,tape,stup)      ((*(state)->writetup) (state, tape, stup))
  #define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
+#define REVERSEDIRECTION(state) ((*(state)->reversedirection) (state))
  #define LACKMEM(state)         ((state)->availMem < 0)
  #define USEMEM(state,amt)      ((state)->availMem -= (amt))
  #define FREEMEM(state,amt)     ((state)->availMem += (amt))
@@ -403,6 +420,8 @@ static void beginmerge(Tuplesortstate *state);
  static void mergepreread(Tuplesortstate *state);
  static void mergeprereadone(Tuplesortstate *state, int srcTape);
  static void dumptuples(Tuplesortstate *state, bool alltuples);
+static void make_bounded_heap(Tuplesortstate *state);
+static void sort_bounded_heap(Tuplesortstate *state);
  static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
                                           int tupleindex, bool checkIndex);
  static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);
@@ -415,6 +434,7 @@ static void writetup_heap(Tuplesortstate *state, int tapenum,
                           SortTuple *stup);
  static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
                          int tapenum, unsigned int len);
+static void reversedirection_heap(Tuplesortstate *state);
  static int comparetup_index(const SortTuple *a, const SortTuple *b,
                                  Tuplesortstate *state);
  static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
@@ -422,6 +442,7 @@ static void writetup_index(Tuplesortstate *state, int tapenum,
                            SortTuple *stup);
  static void readtup_index(Tuplesortstate *state, SortTuple *stup,
                           int tapenum, unsigned int len);
+static void reversedirection_index(Tuplesortstate *state);
  static int comparetup_datum(const SortTuple *a, const SortTuple *b,
                                  Tuplesortstate *state);
  static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
@@ -429,6 +450,8 @@ static void writetup_datum(Tuplesortstate *state, int tapenum,
                            SortTuple *stup);
  static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
                           int tapenum, unsigned int len);
+static void reversedirection_datum(Tuplesortstate *state);
+static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
  
  
  /*
@@ -538,6 +561,7 @@ tuplesort_begin_heap(TupleDesc tupDesc,
         state->copytup = copytup_heap;
         state->writetup = writetup_heap;
         state->readtup = readtup_heap;
+       state->reversedirection = reversedirection_heap;
  
         state->tupDesc = tupDesc;       /* assume we need not copy tupDesc */
         state->scanKeys = (ScanKey) palloc0(nkeys * sizeof(ScanKeyData));
@@ -601,6 +625,7 @@ tuplesort_begin_index(Relation indexRel,
         state->copytup = copytup_index;
         state->writetup = writetup_index;
         state->readtup = readtup_index;
+       state->reversedirection = reversedirection_index;
  
         state->indexRel = indexRel;
         /* see comments below about btree dependence of this code... */
@@ -639,6 +664,7 @@ tuplesort_begin_datum(Oid datumType,
         state->copytup = copytup_datum;
         state->writetup = writetup_datum;
         state->readtup = readtup_datum;
+       state->reversedirection = reversedirection_datum;
  
         state->datumType = datumType;
  
@@ -664,6 +690,40 @@ tuplesort_begin_datum(Oid datumType,
         return state;
  }
  
+/*
+ * tuplesort_set_bound
+ *
+ *     Advise tuplesort that at most the first N result tuples are required.
+ *
+ * Must be called before inserting any tuples.  (Actually, we could allow it
+ * as long as the sort hasn't spilled to disk, but there seems no need for
+ * delayed calls at the moment.)
+ *
+ * This is a hint only. The tuplesort may still return more tuples than
+ * requested.
+ */
+void
+tuplesort_set_bound(Tuplesortstate *state, int64 bound)
+{
+       /* Assert we're called before loading any tuples */
+       Assert(state->status == TSS_INITIAL);
+       Assert(state->memtupcount == 0);
+       Assert(!state->bounded);
+
+#ifdef DEBUG_BOUNDED_SORT
+       /* Honor GUC setting that disables the feature (for easy testing) */
+       if (!optimize_bounded_sort)
+               return;
+#endif
+
+       /* We want to be able to compute bound * 2, so limit the setting */
+       if (bound > (int64) (INT_MAX/2))
+               return;
+
+       state->bounded = true;
+       state->bound = (int) bound;
+}
+
  /*
   * tuplesort_end
   *
@@ -862,6 +922,32 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
                         }
                         state->memtuples[state->memtupcount++] = *tuple;
  
+                       /*
+                        * Check if it's time to switch over to a bounded heapsort.
+                        * We do so if the input tuple count exceeds twice the desired
+                        * tuple count (this is a heuristic for where heapsort becomes
+                        * cheaper than a quicksort), or if we've just filled workMem
+                        * and have enough tuples to meet the bound.
+                        *
+                        * Note that once we enter TSS_BOUNDED state we will always try
+                        * to complete the sort that way.  In the worst case, if later
+                        * input tuples are larger than earlier ones, this might cause
+                        * us to exceed workMem significantly.
+                        */
+                       if (state->bounded &&
+                               (state->memtupcount > state->bound * 2 ||
+                                (state->memtupcount > state->bound && LACKMEM(state))))
+                       {
+#ifdef TRACE_SORT
+                               if (trace_sort)
+                                       elog(LOG, "switching to bounded heapsort at %d tuples: %s",
+                                                state->memtupcount,
+                                                pg_rusage_show(&state->ru_start));
+#endif
+                               make_bounded_heap(state);
+                               return;
+                       }
+
                         /*
                          * Done if we still fit in available memory and have array slots.
                          */
@@ -878,6 +964,31 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
                          */
                         dumptuples(state, false);
                         break;
+
+               case TSS_BOUNDED:
+                       /*
+                        * We don't want to grow the array here, so check whether the
+                        * new tuple can be discarded before putting it in.  This should
+                        * be a good speed optimization, too, since when there are many
+                        * more input tuples than the bound, most input tuples can be
+                        * discarded with just this one comparison.  Note that because
+                        * we currently have the sort direction reversed, we must check
+                        * for <= not >=.
+                        */
+                       if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
+                       {
+                               /* new tuple <= top of the heap, so we can discard it */
+                               free_sort_tuple(state, tuple);
+                       }
+                       else
+                       {
+                               /* discard top of heap, sift up, insert new tuple */
+                               free_sort_tuple(state, &state->memtuples[0]);
+                               tuplesort_heap_siftup(state, false);
+                               tuplesort_heap_insert(state, tuple, 0, false);
+                       }
+                       break;
+
                 case TSS_BUILDRUNS:
  
                         /*
@@ -904,6 +1015,7 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
                          */
                         dumptuples(state, false);
                         break;
+
                 default:
                         elog(ERROR, "invalid tuplesort state");
                         break;
@@ -944,6 +1056,23 @@ tuplesort_performsort(Tuplesortstate *state)
                         state->markpos_eof = false;
                         state->status = TSS_SORTEDINMEM;
                         break;
+
+               case TSS_BOUNDED:
+
+                       /*
+                        * We were able to accumulate all the tuples required for output
+                        * in memory, using a heap to eliminate excess tuples.  Now we have
+                        * to transform the heap to a properly-sorted array.
+                        */
+                       if (state->memtupcount > 1)
+                               sort_bounded_heap(state);
+                       state->current = 0;
+                       state->eof_reached = false;
+                       state->markpos_offset = 0;
+                       state->markpos_eof = false;
+                       state->status = TSS_SORTEDINMEM;
+                       break;
+
                 case TSS_BUILDRUNS:
  
                         /*
@@ -959,6 +1088,7 @@ tuplesort_performsort(Tuplesortstate *state)
                         state->markpos_offset = 0;
                         state->markpos_eof = false;
                         break;
+
                 default:
                         elog(ERROR, "invalid tuplesort state");
                         break;
@@ -1004,6 +1134,15 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
                                         return true;
                                 }
                                 state->eof_reached = true;
+
+                               /*
+                                * Complain if caller tries to retrieve more tuples than
+                                * originally asked for in a bounded sort.  This is because
+                                * returning EOF here might be the wrong thing.
+                                */
+                               if (state->bounded && state->current >= state->bound)
+                                       elog(ERROR, "retrieved too many tuples in a bounded sort");
+
                                 return false;
                         }
                         else
@@ -1987,6 +2126,98 @@ tuplesort_restorepos(Tuplesortstate *state)
          ((tup1)->tupindex) - ((tup2)->tupindex) : \
          COMPARETUP(state, tup1, tup2))
  
+/*
+ * Convert the existing unordered array of SortTuples to a bounded heap,
+ * discarding all but the smallest "state->bound" tuples.
+ *
+ * When working with a bounded heap, we want to keep the largest entry
+ * at the root (array entry zero), instead of the smallest as in the normal
+ * sort case.  This allows us to discard the largest entry cheaply.
+ * Therefore, we temporarily reverse the sort direction.
+ *
+ * We assume that all entries in a bounded heap will always have tupindex
+ * zero; it therefore doesn't matter that HEAPCOMPARE() doesn't reverse
+ * the direction of comparison for tupindexes.
+ */
+static void
+make_bounded_heap(Tuplesortstate *state)
+{
+       int             tupcount = state->memtupcount;
+       int             i;
+
+       Assert(state->status == TSS_INITIAL);
+       Assert(state->bounded);
+       Assert(tupcount >= state->bound);
+
+       /* Reverse sort direction so largest entry will be at root */
+       REVERSEDIRECTION(state);
+
+       state->memtupcount = 0;         /* make the heap empty */
+       for (i=0; i<tupcount; i++)
+       {
+               if (state->memtupcount >= state->bound &&
+                       COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
+               {
+                       /* New tuple would just get thrown out, so skip it */
+                       free_sort_tuple(state, &state->memtuples[i]);
+               }
+               else
+               {
+                       /* Insert next tuple into heap */
+                       /* Must copy source tuple to avoid possible overwrite */
+                       SortTuple stup = state->memtuples[i];
+
+                       tuplesort_heap_insert(state, &stup, 0, false);
+
+                       /* If heap too full, discard largest entry */
+                       if (state->memtupcount > state->bound)
+                       {
+                               free_sort_tuple(state, &state->memtuples[0]);
+                               tuplesort_heap_siftup(state, false);
+                       }
+               }
+       }
+
+       Assert(state->memtupcount == state->bound);
+       state->status = TSS_BOUNDED;
+}
+
+/*
+ * Convert the bounded heap to a properly-sorted array
+ */
+static void
+sort_bounded_heap(Tuplesortstate *state)
+{
+       int             tupcount = state->memtupcount;
+
+       Assert(state->status == TSS_BOUNDED);
+       Assert(state->bounded);
+       Assert(tupcount == state->bound);
+
+       /*
+        * We can unheapify in place because each sift-up will remove the largest
+        * entry, which we can promptly store in the newly freed slot at the end.
+        * Once we're down to a single-entry heap, we're done.
+        */
+       while (state->memtupcount > 1)
+       {
+               SortTuple stup = state->memtuples[0];
+
+               /* this sifts-up the next-largest entry and decreases memtupcount */
+               tuplesort_heap_siftup(state, false);
+               state->memtuples[state->memtupcount] = stup;
+       }
+       state->memtupcount = tupcount;
+
+       /*
+        * Reverse sort direction back to the original state.  This is not
+        * actually necessary but seems like a good idea for tidiness.
+        */
+       REVERSEDIRECTION(state);
+
+       state->status = TSS_SORTEDINMEM;
+}
+
  /*
   * Insert a new tuple into an empty or existing heap, maintaining the
   * heap invariant.     Caller is responsible for ensuring there's room.
@@ -2322,6 +2553,18 @@ readtup_heap(Tuplesortstate *state, SortTuple *stup,
                                                                 &stup->isnull1);
  }
  
+static void
+reversedirection_heap(Tuplesortstate *state)
+{
+       ScanKey         scanKey = state->scanKeys;
+       int                     nkey;
+
+       for (nkey = 0; nkey < state->nKeys; nkey++, scanKey++)
+       {
+               scanKey->sk_flags ^= (SK_BT_DESC | SK_BT_NULLS_FIRST);
+       }
+}
+
  
  /*
   * Routines specialized for IndexTuple case
@@ -2497,6 +2740,18 @@ readtup_index(Tuplesortstate *state, SortTuple *stup,
                                                                  &stup->isnull1);
  }
  
+static void
+reversedirection_index(Tuplesortstate *state)
+{
+       ScanKey         scanKey = state->indexScanKey;
+       int                     nkey;
+
+       for (nkey = 0; nkey < state->nKeys; nkey++, scanKey++)
+       {
+               scanKey->sk_flags ^= (SK_BT_DESC | SK_BT_NULLS_FIRST);
+       }
+}
+
  
  /*
   * Routines specialized for DatumTuple case
@@ -2601,3 +2856,19 @@ readtup_datum(Tuplesortstate *state, SortTuple *stup,
                                                         sizeof(tuplen)) != sizeof(tuplen))
                         elog(ERROR, "unexpected end of data");
  }
+
+static void
+reversedirection_datum(Tuplesortstate *state)
+{
+       state->sortFnFlags ^= (SK_BT_DESC | SK_BT_NULLS_FIRST);
+}
+
+/*
+ * Convenience routine to free a tuple previously loaded into sort memory
+ */
+static void
+free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
+{
+       FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
+       pfree(stup->tuple);
+}
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h

index 8178739cf3547494d189c556cde738f647a85c6a..a00482548a2f9a7d05da392abb624ae27ea62a68 100644 (file)
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1294,7 +1294,11 @@ typedef struct SortState
  {
         ScanState       ss;                             /* its first field is NodeTag */
         bool            randomAccess;   /* need random access to sort output? */
+       bool            bounded;                /* is the result set bounded? */
+       int64           bound;                  /* if bounded, how many tuples are needed */
         bool            sort_Done;              /* sort completed yet? */
+       bool            bounded_Done;   /* value of bounded we did the sort with */
+       int64           bound_Done;             /* value of bound we did the sort with */
         void       *tuplesortstate; /* private state of tuplesort.c */
  } SortState;
  
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h

index 98266145fc2efa03cf3749363f0733f94cdcbca8..858afa6a6b2f1cd956cd6a09197a1ea1ae52ca10 100644 (file)
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -73,7 +73,8 @@ extern void cost_functionscan(Path *path, PlannerInfo *root,
  extern void cost_valuesscan(Path *path, PlannerInfo *root,
                                 RelOptInfo *baserel);
  extern void cost_sort(Path *path, PlannerInfo *root,
-                 List *pathkeys, Cost input_cost, double tuples, int width);
+                 List *pathkeys, Cost input_cost, double tuples, int width,
+                 double limit_tuples);
  extern void cost_material(Path *path,
                           Cost input_cost, double tuples, int width);
  extern void cost_agg(Path *path, PlannerInfo *root,
diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h

index 1b09ebf11b46ccb36d0a572fca6b16fcea0e6b92..c6b0308e3ab2a2fa5c8acfadc25542cea1d33575 100644 (file)
--- a/src/include/optimizer/planmain.h
+++ b/src/include/optimizer/planmain.h
@@ -21,7 +21,7 @@
   * prototypes for plan/planmain.c
   */
  extern void query_planner(PlannerInfo *root, List *tlist,
-                         double tuple_fraction,
+                         double tuple_fraction, double limit_tuples,
                           Path **cheapest_path, Path **sorted_path,
                           double *num_groups);
  
@@ -39,7 +39,7 @@ extern SubqueryScan *make_subqueryscan(List *qptlist, List *qpqual,
                                   Index scanrelid, Plan *subplan, List *subrtable);
  extern Append *make_append(List *appendplans, bool isTarget, List *tlist);
  extern Sort *make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree,
-                                               List *pathkeys);
+                                               List *pathkeys, double limit_tuples);
  extern Sort *make_sort_from_sortclauses(PlannerInfo *root, List *sortcls,
                                                    Plan *lefttree);
  extern Sort *make_sort_from_groupcols(PlannerInfo *root, List *groupcls,
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h

index a0c68909dfe119151a38cf9d1e5a6dc794f1ce44..8ce165a8fcf98da874a69374fa7e7fa299e5bca0 100644 (file)
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -55,6 +55,8 @@ extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
                                           Oid sortOperator, bool nullsFirstFlag,
                                           int workMem, bool randomAccess);
  
+extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
+
  extern void tuplesort_puttupleslot(Tuplesortstate *state,
                                            TupleTableSlot *slot);
  extern void tuplesort_putindextuple(Tuplesortstate *state, IndexTuple tuple);
author	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 4 May 2007 01:13:45 +0000 (01:13 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 4 May 2007 01:13:45 +0000 (01:13 +0000)
src/backend/executor/nodeLimit.c		patch \| blob \| blame \| history
src/backend/executor/nodeSort.c		patch \| blob \| blame \| history
src/backend/optimizer/path/costsize.c		patch \| blob \| blame \| history
src/backend/optimizer/plan/createplan.c		patch \| blob \| blame \| history
src/backend/optimizer/plan/planmain.c		patch \| blob \| blame \| history
src/backend/optimizer/plan/planner.c		patch \| blob \| blame \| history
src/backend/optimizer/util/pathnode.c		patch \| blob \| blame \| history
src/backend/utils/misc/guc.c		patch \| blob \| blame \| history
src/backend/utils/sort/tuplesort.c		patch \| blob \| blame \| history
src/include/nodes/execnodes.h		patch \| blob \| blame \| history
src/include/optimizer/cost.h		patch \| blob \| blame \| history
src/include/optimizer/planmain.h		patch \| blob \| blame \| history
src/include/utils/tuplesort.h		patch \| blob \| blame \| history