Split out into a separate function the code in grouping_planner() that
author: Tom Lane <tgl@sss.pgh.pa.us>
Sun, 10 Apr 2005 19:50:08 +0000 (19:50 +0000)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Sun, 10 Apr 2005 19:50:08 +0000 (19:50 +0000)
decides whether to use hashed grouping instead of sort-plus-uniq
grouping. The function needs an annoyingly large number of parameters,
but this still seems like a win for legibility, since it removes over
a hundred lines from grouping_planner (which is still too big :-().

src/backend/optimizer/plan/planner.c

index 9f898997f0064882ddf981ab4074afae6d71ff0b..eea58e45a15ee90f38d1911cd269153a5e21513f 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.182 2005/04/06 16:34:05 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.183 2005/04/10 19:50:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -58,6 +58,10 @@ static Node *preprocess_expression(Query *parse, Node *expr, int kind);
 static void preprocess_qual_conditions(Query *parse, Node *jtnode);
 static Plan *inheritance_planner(Query *parse, List *inheritlist);
 static Plan *grouping_planner(Query *parse, double tuple_fraction);
+static bool choose_hashed_grouping(Query *parse, double tuple_fraction,
+                      Path *cheapest_path, Path *sorted_path,
+                      List *sort_pathkeys, List *group_pathkeys,
+                      double dNumGroups, AggClauseCounts *agg_counts);
 static bool hash_safe_grouping(Query *parse);
 static List *make_subplanTargetList(Query *parse, List *tlist,
                       AttrNumber **groupColIdx, bool *need_tlist_eval);
@@ -920,34 +924,25 @@ grouping_planner(Query *parse, double tuple_fraction)
        sort_pathkeys = canonicalize_pathkeys(parse, sort_pathkeys);
 
        /*
-        * Consider whether we might want to use hashed grouping.
+        * If grouping, estimate the number of groups.  (We can't do this
+        * until after running query_planner(), either.)  Then decide
+        * whether we want to use hashed grouping.
         */
        if (parse->groupClause)
        {
            List       *groupExprs;
            double      cheapest_path_rows;
-           int         cheapest_path_width;
 
            /*
-            * Beware in this section of the possibility that
-            * cheapest_path->parent is NULL.  This could happen if user
-            * does something silly like SELECT 'foo' GROUP BY 1;
+            * Beware of the possibility that cheapest_path->parent is NULL.
+            * This could happen if user does something silly like
+            *      SELECT 'foo' GROUP BY 1;
             */
            if (cheapest_path->parent)
-           {
                cheapest_path_rows = cheapest_path->parent->rows;
-               cheapest_path_width = cheapest_path->parent->width;
-           }
            else
-           {
                cheapest_path_rows = 1; /* assume non-set result */
-               cheapest_path_width = 100;      /* arbitrary */
-           }
 
-           /*
-            * Always estimate the number of groups.  We can't do this
-            * until after running query_planner(), either.
-            */
            groupExprs = get_sortgrouplist_exprs(parse->groupClause,
                                                 parse->targetList);
            dNumGroups = estimate_num_groups(parse,
@@ -956,130 +951,11 @@ grouping_planner(Query *parse, double tuple_fraction)
            /* Also want it as a long int --- but 'ware overflow! */
            numGroups = (long) Min(dNumGroups, (double) LONG_MAX);
 
-           /*
-            * Check can't-do-it conditions, including whether the
-            * grouping operators are hashjoinable.
-            *
-            * Executor doesn't support hashed aggregation with DISTINCT
-            * aggregates.  (Doing so would imply storing *all* the input
-            * values in the hash table, which seems like a certain
-            * loser.)
-            */
-           if (!enable_hashagg || !hash_safe_grouping(parse))
-               use_hashed_grouping = false;
-           else if (agg_counts.numDistinctAggs != 0)
-               use_hashed_grouping = false;
-           else
-           {
-               /*
-                * Use hashed grouping if (a) we think we can fit the
-                * hashtable into work_mem, *and* (b) the estimated cost
-                * is no more than doing it the other way.  While avoiding
-                * the need for sorted input is usually a win, the fact
-                * that the output won't be sorted may be a loss; so we
-                * need to do an actual cost comparison.
-                */
-               Size        hashentrysize;
-
-               /* Estimate per-hash-entry space at tuple width... */
-               hashentrysize = cheapest_path_width;
-               /* plus space for pass-by-ref transition values... */
-               hashentrysize += agg_counts.transitionSpace;
-               /* plus the per-hash-entry overhead */
-               hashentrysize += hash_agg_entry_size(agg_counts.numAggs);
-
-               if (hashentrysize * dNumGroups <= work_mem * 1024L)
-               {
-                   /*
-                    * Okay, do the cost comparison.  We need to consider
-                    * cheapest_path + hashagg [+ final sort] versus
-                    * either cheapest_path [+ sort] + group or agg [+
-                    * final sort] or presorted_path + group or agg [+
-                    * final sort] where brackets indicate a step that may
-                    * not be needed. We assume query_planner() will have
-                    * returned a presorted path only if it's a winner
-                    * compared to cheapest_path for this purpose.
-                    *
-                    * These path variables are dummies that just hold cost
-                    * fields; we don't make actual Paths for these steps.
-                    */
-                   Path        hashed_p;
-                   Path        sorted_p;
-
-                   cost_agg(&hashed_p, parse,
-                            AGG_HASHED, agg_counts.numAggs,
-                            numGroupCols, dNumGroups,
-                            cheapest_path->startup_cost,
-                            cheapest_path->total_cost,
-                            cheapest_path_rows);
-                   /* Result of hashed agg is always unsorted */
-                   if (sort_pathkeys)
-                       cost_sort(&hashed_p, parse, sort_pathkeys,
-                                 hashed_p.total_cost,
-                                 dNumGroups,
-                                 cheapest_path_width);
-
-                   if (sorted_path)
-                   {
-                       sorted_p.startup_cost = sorted_path->startup_cost;
-                       sorted_p.total_cost = sorted_path->total_cost;
-                       current_pathkeys = sorted_path->pathkeys;
-                   }
-                   else
-                   {
-                       sorted_p.startup_cost = cheapest_path->startup_cost;
-                       sorted_p.total_cost = cheapest_path->total_cost;
-                       current_pathkeys = cheapest_path->pathkeys;
-                   }
-                   if (!pathkeys_contained_in(group_pathkeys,
-                                              current_pathkeys))
-                   {
-                       cost_sort(&sorted_p, parse, group_pathkeys,
-                                 sorted_p.total_cost,
-                                 cheapest_path_rows,
-                                 cheapest_path_width);
-                       current_pathkeys = group_pathkeys;
-                   }
-                   if (parse->hasAggs)
-                       cost_agg(&sorted_p, parse,
-                                AGG_SORTED, agg_counts.numAggs,
-                                numGroupCols, dNumGroups,
-                                sorted_p.startup_cost,
-                                sorted_p.total_cost,
-                                cheapest_path_rows);
-                   else
-                       cost_group(&sorted_p, parse,
-                                  numGroupCols, dNumGroups,
-                                  sorted_p.startup_cost,
-                                  sorted_p.total_cost,
-                                  cheapest_path_rows);
-                   /* The Agg or Group node will preserve ordering */
-                   if (sort_pathkeys &&
-                       !pathkeys_contained_in(sort_pathkeys,
-                                              current_pathkeys))
-                   {
-                       cost_sort(&sorted_p, parse, sort_pathkeys,
-                                 sorted_p.total_cost,
-                                 dNumGroups,
-                                 cheapest_path_width);
-                   }
-
-                   /*
-                    * Now make the decision using the top-level tuple
-                    * fraction.  First we have to convert an absolute
-                    * count (LIMIT) into fractional form.
-                    */
-                   if (tuple_fraction >= 1.0)
-                       tuple_fraction /= dNumGroups;
-
-                   if (compare_fractional_path_costs(&hashed_p, &sorted_p,
-                                                     tuple_fraction) < 0)
-                   {
-                       /* Hashed is cheaper, so use it */
-                       use_hashed_grouping = true;
-                   }
-               }
-           }
+           use_hashed_grouping =
+               choose_hashed_grouping(parse, tuple_fraction,
+                                      cheapest_path, sorted_path,
+                                      sort_pathkeys, group_pathkeys,
+                                      dNumGroups, &agg_counts);
        }
 
        /*
@@ -1331,6 +1207,146 @@ grouping_planner(Query *parse, double tuple_fraction)
    return result_plan;
 }
 
+/*
+ * choose_hashed_grouping - should we use hashed grouping?
+ *
+ * Returns true only when hashed grouping is permitted, its estimated hash
+ * table size is within the work_mem limit, and its estimated cost compares
+ * as cheaper than the sort-based alternative; returns false otherwise.
+ *
+ * parse: the query; supplies the GROUP BY list (for the grouping column
+ *      count) and hasAggs (which selects cost_agg vs. cost_group below).
+ * tuple_fraction: top-level tuple fraction; a value >= 1.0 is treated as
+ *      an absolute row count and is divided by dNumGroups before use.
+ * cheapest_path: source of the input costs for the hashed alternative, and
+ *      for the sorted alternative when sorted_path is NULL.  Its parent may
+ *      be NULL, in which case fallback row/width values are used.
+ * sorted_path: presorted input path for the sorted alternative, or NULL.
+ * sort_pathkeys: ordering required of the final output; when it is empty
+ *      no final sort is costed for either alternative.
+ * group_pathkeys: ordering the sorted alternative needs before grouping.
+ * dNumGroups: estimated number of groups.
+ * agg_counts: only read here; numAggs, numDistinctAggs and transitionSpace
+ *      are consulted.
+ */
+static bool
+choose_hashed_grouping(Query *parse, double tuple_fraction,
+                      Path *cheapest_path, Path *sorted_path,
+                      List *sort_pathkeys, List *group_pathkeys,
+                      double dNumGroups, AggClauseCounts *agg_counts)
+{
+   int         numGroupCols = list_length(parse->groupClause);
+   double      cheapest_path_rows;
+   int         cheapest_path_width;
+   Size        hashentrysize;
+   List       *current_pathkeys;
+   Path        hashed_p;
+   Path        sorted_p;
+
+   /*
+    * Check can't-do-it conditions, including whether the grouping operators
+    * are hashjoinable.
+    *
+    * Executor doesn't support hashed aggregation with DISTINCT aggregates.
+    * (Doing so would imply storing *all* the input values in the hash table,
+    * which seems like a certain loser.)
+    *
+    * Each failed check returns false immediately, so none of the size or
+    * cost estimation below runs in that case.
+    */
+   if (!enable_hashagg)
+       return false;
+   if (agg_counts->numDistinctAggs != 0)
+       return false;
+   if (!hash_safe_grouping(parse))
+       return false;
+
+   /*
+    * Don't do it if it doesn't look like the hashtable will fit into
+    * work_mem.
+    *
+    * Beware here of the possibility that cheapest_path->parent is NULL.
+    * This could happen if user does something silly like
+    *      SELECT 'foo' GROUP BY 1;
+    *
+    * The size test multiplies the per-entry estimate by dNumGroups and
+    * compares it against work_mem * 1024L (presumably work_mem is expressed
+    * in kilobytes, given that scaling -- confirm against its definition).
+    */
+   if (cheapest_path->parent)
+   {
+       cheapest_path_rows = cheapest_path->parent->rows;
+       cheapest_path_width = cheapest_path->parent->width;
+   }
+   else
+   {
+       cheapest_path_rows = 1;             /* assume non-set result */
+       cheapest_path_width = 100;          /* arbitrary */
+   }
+
+   /* Estimate per-hash-entry space at tuple width... */
+   hashentrysize = cheapest_path_width;
+   /* plus space for pass-by-ref transition values... */
+   hashentrysize += agg_counts->transitionSpace;
+   /* plus the per-hash-entry overhead */
+   hashentrysize += hash_agg_entry_size(agg_counts->numAggs);
+
+   if (hashentrysize * dNumGroups > work_mem * 1024L)
+       return false;
+
+   /*
+    * See if the estimated cost is no more than doing it the other way.
+    * While avoiding the need for sorted input is usually a win, the fact
+    * that the output won't be sorted may be a loss; so we need to do an
+    * actual cost comparison.
+    *
+    * We need to consider
+    *      cheapest_path + hashagg [+ final sort]
+    * versus either
+    *      cheapest_path [+ sort] + group or agg [+ final sort]
+    * or
+    *      presorted_path + group or agg [+ final sort]
+    * where brackets indicate a step that may not be needed. We assume
+    * query_planner() will have returned a presorted path only if it's a
+    * winner compared to cheapest_path for this purpose.
+    *
+    * These path variables are dummies that just hold cost fields; we don't
+    * make actual Paths for these steps.
+    *
+    * current_pathkeys is a local variable tracking the ordering the sorted
+    * alternative would have after each step; it is what decides whether the
+    * bracketed sort steps get costed.
+    */
+   cost_agg(&hashed_p, parse, AGG_HASHED, agg_counts->numAggs,
+            numGroupCols, dNumGroups,
+            cheapest_path->startup_cost, cheapest_path->total_cost,
+            cheapest_path_rows);
+   /* Result of hashed agg is always unsorted */
+   if (sort_pathkeys)
+       cost_sort(&hashed_p, parse, sort_pathkeys, hashed_p.total_cost,
+                 dNumGroups, cheapest_path_width);
+
+   if (sorted_path)
+   {
+       sorted_p.startup_cost = sorted_path->startup_cost;
+       sorted_p.total_cost = sorted_path->total_cost;
+       current_pathkeys = sorted_path->pathkeys;
+   }
+   else
+   {
+       sorted_p.startup_cost = cheapest_path->startup_cost;
+       sorted_p.total_cost = cheapest_path->total_cost;
+       current_pathkeys = cheapest_path->pathkeys;
+   }
+   if (!pathkeys_contained_in(group_pathkeys,
+                              current_pathkeys))
+   {
+       cost_sort(&sorted_p, parse, group_pathkeys, sorted_p.total_cost,
+                 cheapest_path_rows, cheapest_path_width);
+       current_pathkeys = group_pathkeys;
+   }
+
+   if (parse->hasAggs)
+       cost_agg(&sorted_p, parse, AGG_SORTED, agg_counts->numAggs,
+                numGroupCols, dNumGroups,
+                sorted_p.startup_cost, sorted_p.total_cost,
+                cheapest_path_rows);
+   else
+       cost_group(&sorted_p, parse, numGroupCols, dNumGroups,
+                  sorted_p.startup_cost, sorted_p.total_cost,
+                  cheapest_path_rows);
+   /* The Agg or Group node will preserve ordering */
+   if (sort_pathkeys &&
+       !pathkeys_contained_in(sort_pathkeys, current_pathkeys))
+       cost_sort(&sorted_p, parse, sort_pathkeys, sorted_p.total_cost,
+                 dNumGroups, cheapest_path_width);
+
+   /*
+    * Now make the decision using the top-level tuple fraction.  First we
+    * have to convert an absolute count (LIMIT) into fractional form.
+    *
+    * tuple_fraction is passed by value, so this adjustment is local to this
+    * function.
+    */
+   if (tuple_fraction >= 1.0)
+       tuple_fraction /= dNumGroups;
+
+   if (compare_fractional_path_costs(&hashed_p, &sorted_p,
+                                     tuple_fraction) < 0)
+   {
+       /* Hashed is cheaper, so use it */
+       return true;
+   }
+   return false;
+}
+
 /*
  * hash_safe_grouping - are grouping operators hashable?
  *