Remove GROUP BY columns that are functionally dependent on other columns.

author Tom Lane <tgl@sss.pgh.pa.us>

Thu, 11 Feb 2016 22:34:59 +0000 (17:34 -0500)

committer Tom Lane <tgl@sss.pgh.pa.us>

Thu, 11 Feb 2016 22:34:59 +0000 (17:34 -0500)
author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 11 Feb 2016 22:34:59 +0000 (17:34 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 11 Feb 2016 22:34:59 +0000 (17:34 -0500)
diff --git a/src/backend/catalog/pg_constraint.c b/src/backend/catalog/pg_constraint.c

index d85484976a910feca3106e982483ccc2f592863b..74007856f35c50be8f2e9c231bc15067844d7c55 100644 (file)
--- a/src/backend/catalog/pg_constraint.c
+++ b/src/backend/catalog/pg_constraint.c
@@ -17,6 +17,7 @@
  #include "access/genam.h"
  #include "access/heapam.h"
  #include "access/htup_details.h"
+#include "access/sysattr.h"
  #include "catalog/dependency.h"
  #include "catalog/indexing.h"
  #include "catalog/objectaccess.h"
@@ -871,6 +872,99 @@ get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok)
         return conOid;
  }
  
+/*
+ * get_primary_key_attnos
+ *             Identify the columns in a relation's primary key, if any.
+ *
+ * Returns a Bitmapset of the column attnos of the primary key's columns,
+ * with attnos being offset by FirstLowInvalidHeapAttributeNumber so that
+ * system columns can be represented.
+ *
+ * If there is no primary key, return NULL.  We also return NULL if the pkey
+ * constraint is deferrable and deferrableOk is false.
+ *
+ * *constraintOid is set to the OID of the pkey constraint, or InvalidOid
+ * on failure.
+ */
+Bitmapset *
+get_primary_key_attnos(Oid relid, bool deferrableOk, Oid *constraintOid)
+{
+       Bitmapset  *pkattnos = NULL;
+       Relation        pg_constraint;
+       HeapTuple       tuple;
+       SysScanDesc scan;
+       ScanKeyData skey[1];
+
+       /* Set *constraintOid, to avoid complaints about uninitialized vars */
+       *constraintOid = InvalidOid;
+
+       /* Scan pg_constraint for constraints of the target rel */
+       pg_constraint = heap_open(ConstraintRelationId, AccessShareLock);
+
+       ScanKeyInit(&skey[0],
+                               Anum_pg_constraint_conrelid,
+                               BTEqualStrategyNumber, F_OIDEQ,
+                               ObjectIdGetDatum(relid));
+
+       scan = systable_beginscan(pg_constraint, ConstraintRelidIndexId, true,
+                                                         NULL, 1, skey);
+
+       while (HeapTupleIsValid(tuple = systable_getnext(scan)))
+       {
+               Form_pg_constraint con = (Form_pg_constraint) GETSTRUCT(tuple);
+               Datum           adatum;
+               bool            isNull;
+               ArrayType  *arr;
+               int16      *attnums;
+               int                     numkeys;
+               int                     i;
+
+               /* Skip constraints that are not PRIMARY KEYs */
+               if (con->contype != CONSTRAINT_PRIMARY)
+                       continue;
+
+               /*
+                * If the primary key is deferrable, but we've been instructed to
+                * ignore deferrable constraints, then we might as well give up
+                * searching, since there can only be a single primary key on a table.
+                */
+               if (con->condeferrable && !deferrableOk)
+                       break;
+
+               /* Extract the conkey array, ie, attnums of PK's columns */
+               adatum = heap_getattr(tuple, Anum_pg_constraint_conkey,
+                                                         RelationGetDescr(pg_constraint), &isNull);
+               if (isNull)
+                       elog(ERROR, "null conkey for constraint %u",
+                                HeapTupleGetOid(tuple));
+               arr = DatumGetArrayTypeP(adatum);               /* ensure not toasted */
+               numkeys = ARR_DIMS(arr)[0];
+               if (ARR_NDIM(arr) != 1 ||
+                       numkeys < 0 ||
+                       ARR_HASNULL(arr) ||
+                       ARR_ELEMTYPE(arr) != INT2OID)
+                       elog(ERROR, "conkey is not a 1-D smallint array");
+               attnums = (int16 *) ARR_DATA_PTR(arr);
+
+               /* Construct the result value */
+               for (i = 0; i < numkeys; i++)
+               {
+                       pkattnos = bms_add_member(pkattnos,
+                                                       attnums[i] - FirstLowInvalidHeapAttributeNumber);
+               }
+               *constraintOid = HeapTupleGetOid(tuple);
+
+               /* No need to search further */
+               break;
+       }
+
+       systable_endscan(scan);
+
+       heap_close(pg_constraint, AccessShareLock);
+
+       return pkattnos;
+}
+
  /*
   * Determine whether a relation can be proven functionally dependent on
   * a set of grouping columns.  If so, return TRUE and add the pg_constraint
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c

index af4a2198465964ba8e1c7d73902dd63b2fff075e..f77c804b702ad826f2e07fb5c44978a47cc1b8a3 100644 (file)
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -20,7 +20,9 @@
  
  #include "access/htup_details.h"
  #include "access/parallel.h"
+#include "access/sysattr.h"
  #include "access/xact.h"
+#include "catalog/pg_constraint_fn.h"
  #include "executor/executor.h"
  #include "executor/nodeAgg.h"
  #include "foreign/fdwapi.h"
@@ -89,6 +91,7 @@ static double preprocess_limit(PlannerInfo *root,
                                  double tuple_fraction,
                                  int64 *offset_est, int64 *count_est);
  static bool limit_needed(Query *parse);
+static void remove_useless_groupby_columns(PlannerInfo *root);
  static List *preprocess_groupclause(PlannerInfo *root, List *force);
  static List *extract_rollup_sets(List *groupingSets);
  static List *reorder_grouping_sets(List *groupingSets, List *sortclause);
@@ -710,6 +713,9 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
         }
         parse->havingQual = (Node *) newHaving;
  
+       /* Remove any redundant GROUP BY columns */
+       remove_useless_groupby_columns(root);
+
         /*
          * If we have any outer joins, try to reduce them to plain inner joins.
          * This step is most easily done after we've done expression
@@ -3207,6 +3213,159 @@ limit_needed(Query *parse)
  }
  
  
+/*
+ * remove_useless_groupby_columns
+ *             Remove any columns in the GROUP BY clause that are redundant due to
+ *             being functionally dependent on other GROUP BY columns.
+ *
+ * Since some other DBMSes do not allow references to ungrouped columns, it's
+ * not unusual to find all columns listed in GROUP BY even though listing the
+ * primary-key columns would be sufficient.  Deleting such excess columns
+ * avoids redundant sorting work, so it's worth doing.  When we do this, we
+ * must mark the plan as dependent on the pkey constraint (compare the
+ * parser's check_ungrouped_columns() and check_functional_grouping()).
+ *
+ * In principle, we could treat any NOT-NULL columns appearing in a UNIQUE
+ * index as the determining columns.  But as with check_functional_grouping(),
+ * there's currently no way to represent dependency on a NOT NULL constraint,
+ * so we consider only the pkey for now.
+ */
+static void
+remove_useless_groupby_columns(PlannerInfo *root)
+{
+       Query      *parse = root->parse;
+       Bitmapset **groupbyattnos;
+       Bitmapset **surplusvars;
+       ListCell   *lc;
+       int                     relid;
+
+       /* No chance to do anything if there are less than two GROUP BY items */
+       if (list_length(parse->groupClause) < 2)
+               return;
+
+       /* Don't fiddle with the GROUP BY clause if the query has grouping sets */
+       if (parse->groupingSets)
+               return;
+
+       /*
+        * Scan the GROUP BY clause to find GROUP BY items that are simple Vars.
+        * Fill groupbyattnos[k] with a bitmapset of the column attnos of RTE k
+        * that are GROUP BY items.
+        */
+       groupbyattnos = (Bitmapset **) palloc0(sizeof(Bitmapset *) *
+                                                                                  (list_length(parse->rtable) + 1));
+       foreach(lc, parse->groupClause)
+       {
+               SortGroupClause *sgc = (SortGroupClause *) lfirst(lc);
+               TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList);
+               Var                *var = (Var *) tle->expr;
+
+               /*
+                * Ignore non-Vars and Vars from other query levels.
+                *
+                * XXX in principle, stable expressions containing Vars could also be
+                * removed, if all the Vars are functionally dependent on other GROUP
+                * BY items.  But it's not clear that such cases occur often enough to
+                * be worth troubling over.
+                */
+               if (!IsA(var, Var) ||
+                       var->varlevelsup > 0)
+                       continue;
+
+               /* OK, remember we have this Var */
+               relid = var->varno;
+               Assert(relid <= list_length(parse->rtable));
+               groupbyattnos[relid] = bms_add_member(groupbyattnos[relid],
+                                                var->varattno - FirstLowInvalidHeapAttributeNumber);
+       }
+
+       /*
+        * Consider each relation and see if it is possible to remove some of its
+        * Vars from GROUP BY.  For simplicity and speed, we do the actual removal
+        * in a separate pass.  Here, we just fill surplusvars[k] with a bitmapset
+        * of the column attnos of RTE k that are removable GROUP BY items.
+        */
+       surplusvars = NULL;                     /* don't allocate array unless required */
+       relid = 0;
+       foreach(lc, parse->rtable)
+       {
+               RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc);
+               Bitmapset  *relattnos;
+               Bitmapset  *pkattnos;
+               Oid                     constraintOid;
+
+               relid++;
+
+               /* Only plain relations could have primary-key constraints */
+               if (rte->rtekind != RTE_RELATION)
+                       continue;
+
+               /* Nothing to do unless this rel has multiple Vars in GROUP BY */
+               relattnos = groupbyattnos[relid];
+               if (bms_membership(relattnos) != BMS_MULTIPLE)
+                       continue;
+
+               /*
+                * Can't remove any columns for this rel if there is no suitable
+                * (i.e., nondeferrable) primary key constraint.
+                */
+               pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid);
+               if (pkattnos == NULL)
+                       continue;
+
+               /*
+                * If the primary key is a proper subset of relattnos then we have
+                * some items in the GROUP BY that can be removed.
+                */
+               if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1)
+               {
+                       /*
+                        * To easily remember whether we've found anything to do, we don't
+                        * allocate the surplusvars[] array until we find something.
+                        */
+                       if (surplusvars == NULL)
+                               surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) *
+                                                                                  (list_length(parse->rtable) + 1));
+
+                       /* Remember the attnos of the removable columns */
+                       surplusvars[relid] = bms_difference(relattnos, pkattnos);
+
+                       /* Also, mark the resulting plan as dependent on this constraint */
+                       parse->constraintDeps = lappend_oid(parse->constraintDeps,
+                                                                                               constraintOid);
+               }
+       }
+
+       /*
+        * If we found any surplus Vars, build a new GROUP BY clause without them.
+        * (Note: this may leave some TLEs with unreferenced ressortgroupref
+        * markings, but that's harmless.)
+        */
+       if (surplusvars != NULL)
+       {
+               List       *new_groupby = NIL;
+
+               foreach(lc, parse->groupClause)
+               {
+                       SortGroupClause *sgc = (SortGroupClause *) lfirst(lc);
+                       TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList);
+                       Var                *var = (Var *) tle->expr;
+
+                       /*
+                        * New list must include non-Vars, outer Vars, and anything not
+                        * marked as surplus.
+                        */
+                       if (!IsA(var, Var) ||
+                               var->varlevelsup > 0 ||
+                       !bms_is_member(var->varattno - FirstLowInvalidHeapAttributeNumber,
+                                                  surplusvars[var->varno]))
+                               new_groupby = lappend(new_groupby, sgc);
+               }
+
+               parse->groupClause = new_groupby;
+       }
+}
+
  /*
   * preprocess_groupclause - do preparatory work on GROUP BY clause
   *
diff --git a/src/include/catalog/pg_constraint_fn.h b/src/include/catalog/pg_constraint_fn.h

index ccd6dffc27af91d22d60b4be883cb0c9406e2290..1f1117421022f4e3629e492af9164650b7e30312 100644 (file)
--- a/src/include/catalog/pg_constraint_fn.h
+++ b/src/include/catalog/pg_constraint_fn.h
@@ -71,6 +71,9 @@ extern void AlterConstraintNamespaces(Oid ownerId, Oid oldNspId,
  extern Oid     get_relation_constraint_oid(Oid relid, const char *conname, bool missing_ok);
  extern Oid     get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok);
  
+extern Bitmapset *get_primary_key_attnos(Oid relid, bool deferrableOk,
+                                          Oid *constraintOid);
+
  extern bool check_functional_grouping(Oid relid,
                                                   Index varno, Index varlevelsup,
                                                   List *grouping_columns,
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out

index de826b5e50f8b69f3f80331674757eef6bbc48d6..e434c5d8cd08cfbd901be79572ca4a091de50dd1 100644 (file)
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -856,6 +856,70 @@ ERROR:  aggregate function calls cannot be nested
  LINE 1: select (select max(min(unique1)) from int8_tbl) from tenk1;
                             ^
  --
+-- Test removal of redundant GROUP BY columns
+--
+create temp table t1 (a int, b int, c int, d int, primary key (a, b));
+create temp table t2 (x int, y int, z int, primary key (x, y));
+create temp table t3 (a int, b int, c int, primary key(a, b) deferrable);
+-- Non-primary-key columns can be removed from GROUP BY
+explain (costs off) select * from t1 group by a,b,c,d;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on t1
+(3 rows)
+
+-- No removal can happen if the complete PK is not present in GROUP BY
+explain (costs off) select a,c from t1 group by a,c,d;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, c, d
+   ->  Seq Scan on t1
+(3 rows)
+
+-- Test removal across multiple relations
+explain (costs off) select *
+from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
+group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.y,t2.z;
+                      QUERY PLAN                       
+-------------------------------------------------------
+ Group
+   Group Key: t1.a, t1.b, t2.x, t2.y
+   ->  Merge Join
+         Merge Cond: ((t1.a = t2.x) AND (t1.b = t2.y))
+         ->  Index Scan using t1_pkey on t1
+         ->  Index Scan using t2_pkey on t2
+(6 rows)
+
+-- Test case where t1 can be optimized but not t2
+explain (costs off) select t1.*,t2.x,t2.z
+from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
+group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.z;
+                      QUERY PLAN                       
+-------------------------------------------------------
+ HashAggregate
+   Group Key: t1.a, t1.b, t2.x, t2.z
+   ->  Merge Join
+         Merge Cond: ((t1.a = t2.x) AND (t1.b = t2.y))
+         ->  Index Scan using t1_pkey on t1
+         ->  Index Scan using t2_pkey on t2
+(6 rows)
+
+-- Cannot optimize when PK is deferrable
+explain (costs off) select * from t3 group by a,b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b, c
+   ->  Seq Scan on t3
+(3 rows)
+
+drop table t1;
+drop table t2;
+drop table t3;
+--
  -- Test combinations of DISTINCT and/or ORDER BY
  --
  select array_agg(a order by b)
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out

index 0ac21bb813e91770e3971362aa59b3a9f9aa7dc6..e4028349d13f90ee6897f4a25a9347f96916690d 100644 (file)
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -3943,12 +3943,14 @@ select d.* from d left join (select distinct * from b) s
  (1 row)
  
  -- join removal is not possible when the GROUP BY contains a column that is
--- not in the join condition
+-- not in the join condition.  (Note: as of 9.6, we notice that b.id is a
+-- primary key and so drop b.c_id from the GROUP BY of the resulting plan;
+-- but this happens too late for join removal in the outer plan level.)
  explain (costs off)
  select d.* from d left join (select * from b group by b.id, b.c_id) s
    on d.a = s.id;
-                 QUERY PLAN                  
----------------------------------------------
+              QUERY PLAN               
+---------------------------------------
   Merge Left Join
     Merge Cond: (d.a = s.id)
     ->  Sort
@@ -3958,7 +3960,7 @@ select d.* from d left join (select * from b group by b.id, b.c_id) s
           Sort Key: s.id
           ->  Subquery Scan on s
                 ->  HashAggregate
-                     Group Key: b.id, b.c_id
+                     Group Key: b.id
                       ->  Seq Scan on b
  (11 rows)
  
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql

index 8d501dc008d862138ebaf79d4ec44776a0c22292..80ef14cda22cfda17a7a90e1d2a717da66e52007 100644 (file)
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -300,6 +300,37 @@ drop table minmaxtest cascade;
  select max(min(unique1)) from tenk1;
  select (select max(min(unique1)) from int8_tbl) from tenk1;
  
+--
+-- Test removal of redundant GROUP BY columns
+--
+
+create temp table t1 (a int, b int, c int, d int, primary key (a, b));
+create temp table t2 (x int, y int, z int, primary key (x, y));
+create temp table t3 (a int, b int, c int, primary key(a, b) deferrable);
+
+-- Non-primary-key columns can be removed from GROUP BY
+explain (costs off) select * from t1 group by a,b,c,d;
+
+-- No removal can happen if the complete PK is not present in GROUP BY
+explain (costs off) select a,c from t1 group by a,c,d;
+
+-- Test removal across multiple relations
+explain (costs off) select *
+from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
+group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.y,t2.z;
+
+-- Test case where t1 can be optimized but not t2
+explain (costs off) select t1.*,t2.x,t2.z
+from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
+group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.z;
+
+-- Cannot optimize when PK is deferrable
+explain (costs off) select * from t3 group by a,b,c;
+
+drop table t1;
+drop table t2;
+drop table t3;
+
  --
  -- Test combinations of DISTINCT and/or ORDER BY
  --
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql

index fafbb3fb003a5500f72834ea6dc8cf5704dfe980..3430f91812461b84a9156d3514c5fc2547a48c8e 100644 (file)
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -1279,7 +1279,9 @@ select d.* from d left join (select distinct * from b) s
    on d.a = s.id and d.b = s.c_id;
  
  -- join removal is not possible when the GROUP BY contains a column that is
--- not in the join condition
+-- not in the join condition.  (Note: as of 9.6, we notice that b.id is a
+-- primary key and so drop b.c_id from the GROUP BY of the resulting plan;
+-- but this happens too late for join removal in the outer plan level.)
  explain (costs off)
  select d.* from d left join (select * from b group by b.id, b.c_id) s
    on d.a = s.id;
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 11 Feb 2016 22:34:59 +0000 (17:34 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 11 Feb 2016 22:34:59 +0000 (17:34 -0500)
src/backend/catalog/pg_constraint.c		patch \| blob \| blame \| history
src/backend/optimizer/plan/planner.c		patch \| blob \| blame \| history
src/include/catalog/pg_constraint_fn.h		patch \| blob \| blame \| history
src/test/regress/expected/aggregates.out		patch \| blob \| blame \| history
src/test/regress/expected/join.out		patch \| blob \| blame \| history
src/test/regress/sql/aggregates.sql		patch \| blob \| blame \| history
src/test/regress/sql/join.sql		patch \| blob \| blame \| history