Teach nbtree to avoid evaluating row compare keys.
author Peter Geoghegan <pg@bowt.ie>
Mon, 15 Sep 2025 20:56:49 +0000 (16:56 -0400)
committer Peter Geoghegan <pg@bowt.ie>
Mon, 15 Sep 2025 20:56:49 +0000 (16:56 -0400)
Add logic to _bt_set_startikey that determines whether row compare keys
are guaranteed to be satisfied by every tuple on a page that is about to
be read by _bt_readpage.  This works in essentially the same way as the
existing scalar inequality logic.  Testing has shown that the new logic
improves performance to about the same degree as the existing scalar
inequality logic (compared to the unoptimized case).  In other words,
the new logic makes many row compare scans significantly faster.

Note that the new row compare inequality logic is only effective when
the same individual row member is the deciding subkey for all tuples on
the page (obviously, all tuples have to satisfy the row compare, too).
This is what makes the new row compare logic very similar to the
existing logic for scalar inequalities.  Note, in particular, that this
makes it safe to ignore whether all row compare members are against
either ASC or DESC index attributes (i.e. it doesn't matter if
individual subkeys don't all use the same inequality strategy).

Also stop refusing to set pstate.startikey to an offset beyond any
nonrequired key (don't add logic that'll do that for an individual row
compare subkey, either).  We can fully rely on our firstchangingattnum
tests instead.  This will do the right thing when a page has a group of
tuples with NULLs in a lower-order attribute that makes the tuples fail
to satisfy a row compare key -- we won't incorrectly conclude that all
tuples must satisfy the row compare, just because firsttup and lasttup
happen to.  Our firstchangingattnum test prevents that from happening.
(Note that the original "avoid evaluating nbtree scan keys" mechanism
added by commit e0b1ee17 couldn't support row compares due to issues
with tuples that contain NULLs in a lower-order subkey's attribute.
That original mechanism relied on requiredness markings, which the
replacement _bt_set_startikey mechanism never really needed.)

Follow up to commit 8a510275, which added the _bt_set_startikey
optimization.  _bt_set_startikey is now feature complete; there's no
remaining kind of nbtree scan key that it still doesn't support.

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/CAH2-WznL6Z3H_GTQze9d8T_Ls=cYbnd-_9f-Jo7aYgTGRUD58g@mail.gmail.com

src/backend/access/nbtree/nbtutils.c

index edfea2acaff664161f3b89c9bf2284fb86e20714..41b4fbd1c37e8a301f0ddb4f2f90d84bc9fd3256 100644 (file)
@@ -62,6 +62,7 @@ static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
                              IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
                              bool advancenonrequired, bool forcenonrequired,
                              bool *continuescan, int *ikey);
+static bool _bt_rowcompare_cmpresult(ScanKey subkey, int cmpresult);
 static bool _bt_check_rowcompare(ScanKey skey,
                                 IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
                                 ScanDirection dir, bool forcenonrequired, bool *continuescan);
@@ -2438,15 +2439,103 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate)
         * Determine if it's safe to set pstate.startikey to an offset to a
         * key that comes after this key, by examining this key
         */
-       if (!(key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
-       {
-           /* Scan key isn't marked required (corner case) */
-           break;              /* unsafe */
-       }
        if (key->sk_flags & SK_ROW_HEADER)
        {
-           /* RowCompare inequalities currently aren't supported */
-           break;              /* "unsafe" */
+           /* RowCompare inequality (header key) */
+           ScanKey     subkey = (ScanKey) DatumGetPointer(key->sk_argument);
+           bool        satisfied = false;
+
+           for (;;)
+           {
+               int         cmpresult;
+               bool        firstsatisfies = false;
+
+               if (subkey->sk_attno > firstchangingattnum) /* >, not >= */
+                   break;      /* unsafe, preceding attr has multiple
+                                * distinct values */
+
+               if (subkey->sk_flags & SK_ISNULL)
+                   break;      /* unsafe, unsatisfiable NULL subkey arg */
+
+               firstdatum = index_getattr(firsttup, subkey->sk_attno,
+                                          tupdesc, &firstnull);
+               lastdatum = index_getattr(lasttup, subkey->sk_attno,
+                                         tupdesc, &lastnull);
+
+               if (firstnull || lastnull)
+                   break;      /* unsafe, NULL value won't satisfy subkey */
+
+               /*
+                * Compare the first tuple's datum for this row compare member
+                */
+               cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
+                                                           subkey->sk_collation,
+                                                           firstdatum,
+                                                           subkey->sk_argument));
+               if (subkey->sk_flags & SK_BT_DESC)
+                   INVERT_COMPARE_RESULT(cmpresult);
+
+               if (cmpresult != 0 || (subkey->sk_flags & SK_ROW_END))
+               {
+                   firstsatisfies = _bt_rowcompare_cmpresult(subkey,
+                                                             cmpresult);
+                   if (!firstsatisfies)
+                   {
+                       /* Unsafe, firstdatum does not satisfy subkey */
+                       break;
+                   }
+               }
+
+               /*
+                * Compare the last tuple's datum for this row compare member
+                */
+               cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
+                                                           subkey->sk_collation,
+                                                           lastdatum,
+                                                           subkey->sk_argument));
+               if (subkey->sk_flags & SK_BT_DESC)
+                   INVERT_COMPARE_RESULT(cmpresult);
+
+               if (cmpresult != 0 || (subkey->sk_flags & SK_ROW_END))
+               {
+                   if (!firstsatisfies)
+                   {
+                       /*
+                        * It's only safe to set startikey beyond the row
+                        * compare header key when both firsttup and lasttup
+                        * satisfy the key as a whole based on the same
+                        * deciding subkey/attribute.  That can't happen now.
+                        */
+                       break;  /* unsafe */
+                   }
+
+                   satisfied = _bt_rowcompare_cmpresult(subkey, cmpresult);
+                   break;      /* safe iff 'satisfied' is true */
+               }
+
+               /* Move on to next row member/subkey */
+               if (subkey->sk_flags & SK_ROW_END)
+                   break;      /* defensive */
+               subkey++;
+
+               /*
+                * We deliberately don't check if the next subkey has the same
+                * strategy as this iteration's subkey (which happens when
+                * subkeys for both ASC and DESC columns are used together),
+                * nor if any subkey is marked required.  This is safe because
+                * in general all prior index attributes must have only one
+                * distinct value (across all of the tuples on the page) in
+                * order for us to even consider any subkey's attribute.
+                */
+           }
+
+           if (satisfied)
+           {
+               /* Safe, row compare satisfied by every tuple on page */
+               continue;
+           }
+
+           break;              /* unsafe */
        }
        if (key->sk_strategy != BTEqualStrategyNumber)
        {
@@ -2914,6 +3003,42 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
    return true;
 }
 
+/*
+ * Decide whether a row compare is satisfied, given its deciding row member.
+ * Call here with the first member whose comparison result is non-zero, or
+ * with the final ROW_END member (no matter the cmpresult).  cmpresult must
+ * already be commuted for DESC subkeys.  Returns true iff cmpresult passes
+ * subkey's sk_strategy (<, <=, >=, >); other strategies raise elog(ERROR).
+ */
+static bool
+_bt_rowcompare_cmpresult(ScanKey subkey, int cmpresult)
+{
+   bool        satisfied;
+
+   switch (subkey->sk_strategy)
+   {
+       case BTLessStrategyNumber:
+           satisfied = (cmpresult < 0);
+           break;
+       case BTLessEqualStrategyNumber:
+           satisfied = (cmpresult <= 0);
+           break;
+       case BTGreaterEqualStrategyNumber:
+           satisfied = (cmpresult >= 0);
+           break;
+       case BTGreaterStrategyNumber:
+           satisfied = (cmpresult > 0);
+           break;
+       default:
+           /* EQ and NE cases aren't allowed here */
+           elog(ERROR, "unexpected strategy number %d", subkey->sk_strategy);
+           satisfied = false;  /* keep compiler quiet */
+           break;
+   }
+
+   return satisfied;
+}
+
 /*
  * Test whether an indextuple satisfies a row-comparison scan condition.
  *
@@ -3094,31 +3219,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
        subkey++;
    }
 
-   /*
-    * At this point cmpresult indicates the overall result of the row
-    * comparison, and subkey points to the deciding column (or the last
-    * column if the result is "=").
-    */
-   switch (subkey->sk_strategy)
-   {
-           /* EQ and NE cases aren't allowed here */
-       case BTLessStrategyNumber:
-           result = (cmpresult < 0);
-           break;
-       case BTLessEqualStrategyNumber:
-           result = (cmpresult <= 0);
-           break;
-       case BTGreaterEqualStrategyNumber:
-           result = (cmpresult >= 0);
-           break;
-       case BTGreaterStrategyNumber:
-           result = (cmpresult > 0);
-           break;
-       default:
-           elog(ERROR, "unexpected strategy number %d", subkey->sk_strategy);
-           result = 0;         /* keep compiler quiet */
-           break;
-   }
+   /* Final subkey/column determines if row compare is satisfied */
+   result = _bt_rowcompare_cmpresult(subkey, cmpresult);
 
    if (!result && !forcenonrequired)
    {