diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 90335cb8b9..b697d2324b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,7 +20,7 @@ repos: hooks: - id: trailing-whitespace - id: end-of-file-fixer - exclude: "^tests/unit/core/compile/sqlglot/snapshots" + exclude: "^tests/unit/core/compile/sqlglot/.*snapshots" - id: check-yaml - repo: https://github.com/pycqa/isort rev: 5.12.0 diff --git a/CHANGELOG.md b/CHANGELOG.md index c1868c0dbc..9911d2cb2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,27 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.22.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.21.0...v2.22.0) (2025-09-25) + + +### Features + +* Add `GroupBy.__iter__` ([#1394](https://github.com/googleapis/python-bigquery-dataframes/issues/1394)) ([c56a78c](https://github.com/googleapis/python-bigquery-dataframes/commit/c56a78cd509a535d4998d5b9a99ec3ecd334b883)) +* Add ai.generate_int to bigframes.bigquery package ([#2109](https://github.com/googleapis/python-bigquery-dataframes/issues/2109)) ([af6b862](https://github.com/googleapis/python-bigquery-dataframes/commit/af6b862de5c3921684210ec169338815f45b19dd)) +* Add Groupby.describe() ([#2088](https://github.com/googleapis/python-bigquery-dataframes/issues/2088)) ([328a765](https://github.com/googleapis/python-bigquery-dataframes/commit/328a765e746138806a021bea22475e8c03512aeb)) +* Implement `Index.to_list()` ([#2106](https://github.com/googleapis/python-bigquery-dataframes/issues/2106)) ([60056ca](https://github.com/googleapis/python-bigquery-dataframes/commit/60056ca06511f99092647fe55fc02eeab486b4ca)) +* Implement inplace parameter for `DataFrame.drop` ([#2105](https://github.com/googleapis/python-bigquery-dataframes/issues/2105)) ([3487f13](https://github.com/googleapis/python-bigquery-dataframes/commit/3487f13d12e34999b385c2e11551b5e27bfbf4ff)) +* Support callable for series map method ([#2100](https://github.com/googleapis/python-bigquery-dataframes/issues/2100)) ([ac25618](https://github.com/googleapis/python-bigquery-dataframes/commit/ac25618feed2da11fe4fb85058d498d262c085c0)) +* Support df.info() with null index ([#2094](https://github.com/googleapis/python-bigquery-dataframes/issues/2094)) ([fb81eea](https://github.com/googleapis/python-bigquery-dataframes/commit/fb81eeaf13af059f32cb38e7f117fb3504243d51)) + + +### Bug Fixes + +* Avoid ibis fillna warning in compiler ([#2113](https://github.com/googleapis/python-bigquery-dataframes/issues/2113)) ([7ef667b](https://github.com/googleapis/python-bigquery-dataframes/commit/7ef667b0f46f13bcc8ad4f2ed8f81278132b5aec)) +* Negative start and stop parameter values in Series.str.slice() ([#2104](https://github.com/googleapis/python-bigquery-dataframes/issues/2104)) ([f57a348](https://github.com/googleapis/python-bigquery-dataframes/commit/f57a348f1935a4e2bb14c501bb4c47cd552d102a)) +* Throw type error for incomparable join keys ([#2098](https://github.com/googleapis/python-bigquery-dataframes/issues/2098)) ([9dc9695](https://github.com/googleapis/python-bigquery-dataframes/commit/9dc96959a84b751d18b290129c2926df6e50b3f5)) +* Transformers with non-standard column names throw errors ([#2089](https://github.com/googleapis/python-bigquery-dataframes/issues/2089)) ([a2daa3f](https://github.com/googleapis/python-bigquery-dataframes/commit/a2daa3fffe6743327edb9f4c74db93198bd12f8e)) + ## [2.21.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.20.0...v2.21.0) (2025-09-17) diff --git 
a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 3bafce6166..f0b4f51611 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -113,6 +113,81 @@ def generate_bool( return series_list[0]._apply_nary_op(operator, series_list[1:]) +@log_adapter.method_logger(custom_base_name="bigquery_ai") +def generate_int( + prompt: PROMPT_TYPE, + *, + connection_id: str | None = None, + endpoint: str | None = None, + request_type: Literal["dedicated", "shared", "unspecified"] = "unspecified", + model_params: Mapping[Any, Any] | None = None, +) -> series.Series: + """ + Returns the AI analysis based on the prompt, which can be any combination of text and unstructured data. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) + >>> bbq.ai.generate_int(("How many legs does a ", animal, " have?")) + 0 {'result': 2, 'full_response': '{"candidates":... + 1 {'result': 4, 'full_response': '{"candidates":... + 2 {'result': 8, 'full_response': '{"candidates":... + dtype: struct<result: int64, full_response: extension<dbjson<JSONArrowType>>, status: string>[pyarrow] + + >>> bbq.ai.generate_int(("How many legs does a ", animal, " have?")).struct.field("result") + 0 2 + 1 4 + 2 8 + Name: result, dtype: Int64 + + Args: + prompt (Series | List[str|Series] | Tuple[str|Series, ...]): + A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series + or pandas Series. + connection_id (str, optional): + Specifies the connection to use to communicate with the model. For example, `myproject.us.myconnection`. + If not provided, the connection from the current session will be used. + endpoint (str, optional): + Specifies the Vertex AI endpoint to use for the model. For example `"gemini-2.5-flash"`. You can specify any + generally available or preview Gemini model. If you specify the model name, BigQuery ML automatically identifies and + uses the full endpoint of the model. If you don't specify an ENDPOINT value, BigQuery ML selects a recent stable + version of Gemini to use. + request_type (Literal["dedicated", "shared", "unspecified"]): + Specifies the type of inference request to send to the Gemini model. The request type determines what quota the request uses. + * "dedicated": function only uses Provisioned Throughput quota. The function returns the error Provisioned throughput is not + purchased or is not active if Provisioned Throughput quota isn't available. + * "shared": the function only uses dynamic shared quota (DSQ), even if you have purchased Provisioned Throughput quota. + * "unspecified": If you haven't purchased Provisioned Throughput quota, the function uses DSQ quota. + If you have purchased Provisioned Throughput quota, the function uses the Provisioned Throughput quota first. + If requests exceed the Provisioned Throughput quota, the overflow traffic uses DSQ quota. + model_params (Mapping[Any, Any]): + Provides additional parameters to the model. The MODEL_PARAMS value must conform to the generateContent request body format. + + Returns: + bigframes.series.Series: A new struct Series with the result data. The struct contains these fields: + * "result": an integer (INT64) value containing the model's response to the prompt. The result is None if the request fails or is filtered by responsible AI. 
+ * "full_response": a JSON value containing the response from the projects.locations.endpoints.generateContent call to the model. + The generated text is in the text element. + * "status": a STRING value that contains the API response status for the corresponding row. This value is empty if the operation was successful. + """ + + prompt_context, series_list = _separate_context_and_series(prompt) + assert len(series_list) > 0 + + operator = ai_ops.AIGenerateInt( + prompt_context=tuple(prompt_context), + connection_id=_resolve_connection_id(series_list[0], connection_id), + endpoint=endpoint, + request_type=request_type, + model_params=json.dumps(model_params) if model_params else None, + ) + + return series_list[0]._apply_nary_op(operator, series_list[1:]) + + def _separate_context_and_series( prompt: PROMPT_TYPE, ) -> Tuple[List[str | None], List[series.Series]]: diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index b37c581a4a..878d62bcb5 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -480,6 +480,14 @@ def relational_join( type: typing.Literal["inner", "outer", "left", "right", "cross"] = "inner", propogate_order: Optional[bool] = None, ) -> typing.Tuple[ArrayValue, typing.Tuple[dict[str, str], dict[str, str]]]: + for lcol, rcol in conditions: + ltype = self.get_column_type(lcol) + rtype = other.get_column_type(rcol) + if not bigframes.dtypes.can_compare(ltype, rtype): + raise TypeError( + f"Cannot join with non-comparable join key types: {ltype}, {rtype}" + ) + l_mapping = { # Identity mapping, only rename right side lcol.name: lcol.name for lcol in self.node.ids } diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 6e22baabec..f9896784bb 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -252,6 +252,10 @@ def from_local( pass return block + @property + def has_index(self) -> bool: + return len(self._index_columns) > 0 + @property def index(self) -> BlockIndexProperties: """Row identities for values in the Block.""" @@ -1371,10 +1375,16 @@ def aggregate( ) -> typing.Tuple[Block, typing.Sequence[str]]: """ Apply aggregations to the block. + Arguments: by_column_id: column id of the aggregation key, this is preserved through the transform and used as index. aggregations: input_column_id, operation tuples dropna: whether null keys should be dropped + + Returns: + Tuple[Block, Sequence[str]]: + The first element is the grouped block. The second is the + column IDs corresponding to each applied aggregation. 
""" if column_labels is None: column_labels = pd.Index(range(len(aggregations))) @@ -1780,7 +1790,9 @@ def pivot( else: return result_block.with_column_labels(columns_values) - def stack(self, how="left", levels: int = 1): + def stack( + self, how="left", levels: int = 1, *, override_labels: Optional[pd.Index] = None + ): """Unpivot last column axis level into row axis""" if levels == 0: return self @@ -1788,7 +1800,9 @@ def stack(self, how="left", levels: int = 1): # These are the values that will be turned into rows col_labels, row_labels = utils.split_index(self.column_labels, levels=levels) - row_labels = row_labels.drop_duplicates() + row_labels = ( + row_labels.drop_duplicates() if override_labels is None else override_labels + ) if col_labels is None: result_index: pd.Index = pd.Index([None]) diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py index 3a4695c50d..dde6f3a325 100644 --- a/bigframes/core/compile/api.py +++ b/bigframes/core/compile/api.py @@ -15,19 +15,18 @@ from typing import TYPE_CHECKING -from bigframes.core import rewrite -from bigframes.core.compile.ibis_compiler import ibis_compiler - if TYPE_CHECKING: import bigframes.core.nodes def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode): """Use only for testing paths to ensure ibis inferred schema does not diverge from bigframes inferred schema.""" + from bigframes.core.compile.ibis_compiler import ibis_compiler + import bigframes.core.rewrite import bigframes.core.schema node = ibis_compiler._replace_unsupported_ops(node) - node = rewrite.bake_order(node) + node = bigframes.core.rewrite.bake_order(node) ir = ibis_compiler.compile_node(node) items = tuple( bigframes.core.schema.SchemaItem(name, ir.get_column_type(ibis_id)) diff --git a/bigframes/core/compile/ibis_compiler/aggregate_compiler.py b/bigframes/core/compile/ibis_compiler/aggregate_compiler.py index b101f4e09f..0106b150e2 100644 --- a/bigframes/core/compile/ibis_compiler/aggregate_compiler.py +++ b/bigframes/core/compile/ibis_compiler/aggregate_compiler.py @@ -175,15 +175,11 @@ def _( @compile_unary_agg.register -@numeric_op def _( op: agg_ops.MedianOp, column: ibis_types.NumericColumn, window=None, ) -> ibis_types.NumericValue: - # TODO(swast): Allow switching between exact and approximate median. - # For now, the best we can do is an approximate median when we're doing - # an aggregation, as PERCENTILE_CONT is only an analytic function. 
return cast(ibis_types.NumericValue, column.approx_median()) diff --git a/bigframes/core/compile/default_ordering.py b/bigframes/core/compile/ibis_compiler/default_ordering.py similarity index 95% rename from bigframes/core/compile/default_ordering.py rename to bigframes/core/compile/ibis_compiler/default_ordering.py index 1a1350cfd6..3f2628d10c 100644 --- a/bigframes/core/compile/default_ordering.py +++ b/bigframes/core/compile/ibis_compiler/default_ordering.py @@ -47,10 +47,7 @@ def _convert_to_nonnull_string(column: ibis_types.Value) -> ibis_types.StringVal result = ibis_ops.ToJsonString(column).to_expr() # type: ignore # Escape backslashes and use backslash as delineator escaped = cast( - ibis_types.StringColumn, - result.fill_null(ibis_types.literal("")) - if hasattr(result, "fill_null") - else result.fillna(""), + ibis_types.StringColumn, result.fill_null(ibis_types.literal("")) ).replace( "\\", # type: ignore "\\\\", # type: ignore diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 8ffc556f76..8426a86375 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -28,7 +28,7 @@ import pandas as pd from bigframes.core.compile.constants import UNIT_TO_US_CONVERSION_FACTORS -import bigframes.core.compile.default_ordering +import bigframes.core.compile.ibis_compiler.default_ordering from bigframes.core.compile.ibis_compiler.scalar_op_compiler import ( scalar_op_compiler, # TODO(tswast): avoid import of variables ) @@ -1064,7 +1064,7 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): if op.match_nulls and contains_nulls: return x.isnull() | x.isin(matchable_ibis_values) else: - return x.isin(matchable_ibis_values).fillna(False) + return x.isin(matchable_ibis_values).fill_null(ibis.literal(False)) @scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) @@ -1383,8 +1383,8 @@ def eq_nulls_match_op( left = x.cast(ibis_dtypes.str).fill_null(literal) right = y.cast(ibis_dtypes.str).fill_null(literal) else: - left = x.cast(ibis_dtypes.str).fillna(literal) - right = y.cast(ibis_dtypes.str).fillna(literal) + left = x.cast(ibis_dtypes.str).fill_null(literal) + right = y.cast(ibis_dtypes.str).fill_null(literal) return left == right @@ -1813,7 +1813,7 @@ def fillna_op( if hasattr(x, "fill_null"): return x.fill_null(typing.cast(ibis_types.Scalar, y)) else: - return x.fillna(typing.cast(ibis_types.Scalar, y)) + return x.fill_null(typing.cast(ibis_types.Scalar, y)) @scalar_op_compiler.register_binary_op(ops.round_op) @@ -1975,28 +1975,48 @@ def ai_generate_bool( *values: ibis_types.Value, op: ops.AIGenerateBool ) -> ibis_types.StructValue: + return ai_ops.AIGenerateBool( + _construct_prompt(values, op.prompt_context), # type: ignore + op.connection_id, # type: ignore + op.endpoint, # type: ignore + op.request_type.upper(), # type: ignore + op.model_params, # type: ignore + ).to_expr() + + +@scalar_op_compiler.register_nary_op(ops.AIGenerateInt, pass_op=True) +def ai_generate_int( + *values: ibis_types.Value, op: ops.AIGenerateInt +) -> ibis_types.StructValue: + + return ai_ops.AIGenerateInt( + _construct_prompt(values, op.prompt_context), # type: ignore + op.connection_id, # type: ignore + op.endpoint, # type: ignore + op.request_type.upper(), # type: ignore + op.model_params, # type: ignore + ).to_expr() + + +def _construct_prompt( + col_refs: tuple[ibis_types.Value], prompt_context: tuple[str | None] +) -> 
ibis_types.StructValue: prompt: dict[str, ibis_types.Value | str] = {} column_ref_idx = 0 - for idx, elem in enumerate(op.prompt_context): + for idx, elem in enumerate(prompt_context): if elem is None: - prompt[f"_field_{idx + 1}"] = values[column_ref_idx] + prompt[f"_field_{idx + 1}"] = col_refs[column_ref_idx] column_ref_idx += 1 else: prompt[f"_field_{idx + 1}"] = elem - return ai_ops.AIGenerateBool( - ibis.struct(prompt), # type: ignore - op.connection_id, # type: ignore - op.endpoint, # type: ignore - op.request_type.upper(), # type: ignore - op.model_params, # type: ignore - ).to_expr() + return ibis.struct(prompt) @scalar_op_compiler.register_nary_op(ops.RowKey, pass_op=True) def rowkey_op_impl(*values: ibis_types.Value, op: ops.RowKey) -> ibis_types.Value: - return bigframes.core.compile.default_ordering.gen_row_key(values) + return bigframes.core.compile.ibis_compiler.default_ordering.gen_row_key(values) # Helpers diff --git a/bigframes/core/compile/sqlglot/__init__.py b/bigframes/core/compile/sqlglot/__init__.py index 5fe8099043..1fc22e1af6 100644 --- a/bigframes/core/compile/sqlglot/__init__.py +++ b/bigframes/core/compile/sqlglot/__init__.py @@ -14,8 +14,8 @@ from __future__ import annotations from bigframes.core.compile.sqlglot.compiler import SQLGlotCompiler +import bigframes.core.compile.sqlglot.expressions.ai_ops # noqa: F401 import bigframes.core.compile.sqlglot.expressions.array_ops # noqa: F401 -import bigframes.core.compile.sqlglot.expressions.binary_compiler # noqa: F401 import bigframes.core.compile.sqlglot.expressions.blob_ops # noqa: F401 import bigframes.core.compile.sqlglot.expressions.comparison_ops # noqa: F401 import bigframes.core.compile.sqlglot.expressions.date_ops # noqa: F401 diff --git a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py index c7eb84cba6..598a89e4eb 100644 --- a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py @@ -16,6 +16,7 @@ import typing +import pandas as pd import sqlglot.expressions as sge from bigframes import dtypes @@ -46,18 +47,66 @@ def _( return apply_window_if_present(sge.func("COUNT", column.expr), window) -@UNARY_OP_REGISTRATION.register(agg_ops.SumOp) +@UNARY_OP_REGISTRATION.register(agg_ops.DenseRankOp) def _( - op: agg_ops.SumOp, + op: agg_ops.DenseRankOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + # Ranking functions do not support window framing clauses. + return apply_window_if_present( + sge.func("DENSE_RANK"), window, include_framing_clauses=False + ) + + +@UNARY_OP_REGISTRATION.register(agg_ops.MaxOp) +def _( + op: agg_ops.MaxOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + return apply_window_if_present(sge.func("MAX", column.expr), window) + + +@UNARY_OP_REGISTRATION.register(agg_ops.MeanOp) +def _( + op: agg_ops.MeanOp, column: typed_expr.TypedExpr, window: typing.Optional[window_spec.WindowSpec] = None, ) -> sge.Expression: expr = column.expr if column.dtype == dtypes.BOOL_DTYPE: - expr = sge.Cast(this=column.expr, to="INT64") - # Will be null if all inputs are null. Pandas defaults to zero sum though. 
- expr = apply_window_if_present(sge.func("SUM", expr), window) - return sge.func("IFNULL", expr, ir._literal(0, column.dtype)) + expr = sge.Cast(this=expr, to="INT64") + + expr = sge.func("AVG", expr) + + should_floor_result = ( + op.should_floor_result or column.dtype == dtypes.TIMEDELTA_DTYPE + ) + if should_floor_result: + expr = sge.Cast(this=sge.func("FLOOR", expr), to="INT64") + return apply_window_if_present(expr, window) + + +@UNARY_OP_REGISTRATION.register(agg_ops.MedianOp) +def _( + op: agg_ops.MedianOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + approx_quantiles = sge.func("APPROX_QUANTILES", column.expr, sge.convert(2)) + return sge.Bracket( + this=approx_quantiles, expressions=[sge.func("OFFSET", sge.convert(1))] + ) + + +@UNARY_OP_REGISTRATION.register(agg_ops.MinOp) +def _( + op: agg_ops.MinOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + return apply_window_if_present(sge.func("MIN", column.expr), window) @UNARY_OP_REGISTRATION.register(agg_ops.SizeUnaryOp) @@ -67,3 +116,32 @@ def _( window: typing.Optional[window_spec.WindowSpec] = None, ) -> sge.Expression: return apply_window_if_present(sge.func("COUNT", sge.convert(1)), window) + + +@UNARY_OP_REGISTRATION.register(agg_ops.RankOp) +def _( + op: agg_ops.RankOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + # Ranking functions do not support window framing clauses. + return apply_window_if_present( + sge.func("RANK"), window, include_framing_clauses=False + ) + + +@UNARY_OP_REGISTRATION.register(agg_ops.SumOp) +def _( + op: agg_ops.SumOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + expr = column.expr + if column.dtype == dtypes.BOOL_DTYPE: + expr = sge.Cast(this=column.expr, to="INT64") + + expr = apply_window_if_present(sge.func("SUM", expr), window) + + # Will be null if all inputs are null. Pandas defaults to zero sum though. + zero = pd.to_timedelta(0) if column.dtype == dtypes.TIMEDELTA_DTYPE else 0 + return sge.func("IFNULL", expr, ir._literal(zero, column.dtype)) diff --git a/bigframes/core/compile/sqlglot/aggregations/windows.py b/bigframes/core/compile/sqlglot/aggregations/windows.py index 4d7a3f7406..1bfa72b878 100644 --- a/bigframes/core/compile/sqlglot/aggregations/windows.py +++ b/bigframes/core/compile/sqlglot/aggregations/windows.py @@ -25,6 +25,7 @@ def apply_window_if_present( value: sge.Expression, window: typing.Optional[window_spec.WindowSpec] = None, + include_framing_clauses: bool = True, ) -> sge.Expression: if window is None: return value @@ -64,6 +65,9 @@ def apply_window_if_present( if not window.bounds and not order: return sge.Window(this=value, partition_by=group_by) + if not window.bounds and not include_framing_clauses: + return sge.Window(this=value, partition_by=group_by, order=order) + kind = ( "ROWS" if isinstance(window.bounds, window_spec.RowsWindowBounds) else "RANGE" ) diff --git a/bigframes/core/compile/sqlglot/expressions/ai_ops.py b/bigframes/core/compile/sqlglot/expressions/ai_ops.py new file mode 100644 index 0000000000..50d56611b1 --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/ai_ops.py @@ -0,0 +1,88 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from dataclasses import asdict +import typing + +import sqlglot.expressions as sge + +from bigframes import operations as ops +from bigframes.core.compile.sqlglot import scalar_compiler +from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr + +register_nary_op = scalar_compiler.scalar_op_compiler.register_nary_op + + +@register_nary_op(ops.AIGenerateBool, pass_op=True) +def _(*exprs: TypedExpr, op: ops.AIGenerateBool) -> sge.Expression: + args = [_construct_prompt(exprs, op.prompt_context)] + _construct_named_args(op) + + return sge.func("AI.GENERATE_BOOL", *args) + + +@register_nary_op(ops.AIGenerateInt, pass_op=True) +def _(*exprs: TypedExpr, op: ops.AIGenerateInt) -> sge.Expression: + args = [_construct_prompt(exprs, op.prompt_context)] + _construct_named_args(op) + + return sge.func("AI.GENERATE_INT", *args) + + +def _construct_prompt( + exprs: tuple[TypedExpr, ...], prompt_context: tuple[str | None, ...] +) -> sge.Kwarg: + prompt: list[str | sge.Expression] = [] + column_ref_idx = 0 + + for elem in prompt_context: + if elem is None: + prompt.append(exprs[column_ref_idx].expr) + else: + prompt.append(sge.Literal.string(elem)) + + return sge.Kwarg(this="prompt", expression=sge.Tuple(expressions=prompt)) + + +def _construct_named_args(op: ops.NaryOp) -> list[sge.Kwarg]: + args = [] + + op_args = asdict(op) + + connection_id = typing.cast(str, op_args["connection_id"]) + args.append( + sge.Kwarg(this="connection_id", expression=sge.Literal.string(connection_id)) + ) + + endpoint = typing.cast(str, op_args.get("endpoint", None)) + if endpoint is not None: + args.append(sge.Kwarg(this="endpoint", expression=sge.Literal.string(endpoint))) + + request_type = typing.cast(str, op_args["request_type"]).upper() + args.append( + sge.Kwarg(this="request_type", expression=sge.Literal.string(request_type)) + ) + + model_params = typing.cast(str, op_args.get("model_params", None)) + if model_params is not None: + args.append( + sge.Kwarg( + this="model_params", + # sge.JSON requires the SQLGlot version to be at least 25.18.0 + # PARSE_JSON won't work as the function requires a JSON literal. + expression=sge.JSON(this=sge.Literal.string(model_params)), + ) + ) + + return args diff --git a/bigframes/core/compile/sqlglot/expressions/binary_compiler.py b/bigframes/core/compile/sqlglot/expressions/binary_compiler.py deleted file mode 100644 index b18d15cae6..0000000000 --- a/bigframes/core/compile/sqlglot/expressions/binary_compiler.py +++ /dev/null @@ -1,241 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import annotations - -import bigframes_vendored.constants as bf_constants -import sqlglot.expressions as sge - -from bigframes import dtypes -from bigframes import operations as ops -import bigframes.core.compile.sqlglot.expressions.constants as constants -from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr -import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler - -register_binary_op = scalar_compiler.scalar_op_compiler.register_binary_op - -# TODO: add parenthesize for operators - - -@register_binary_op(ops.add_op) -def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.dtype == dtypes.STRING_DTYPE and right.dtype == dtypes.STRING_DTYPE: - # String addition - return sge.Concat(expressions=[left.expr, right.expr]) - - if dtypes.is_numeric(left.dtype) and dtypes.is_numeric(right.dtype): - left_expr = _coerce_bool_to_int(left) - right_expr = _coerce_bool_to_int(right) - return sge.Add(this=left_expr, expression=right_expr) - - if ( - dtypes.is_time_or_date_like(left.dtype) - and right.dtype == dtypes.TIMEDELTA_DTYPE - ): - left_expr = _coerce_date_to_datetime(left) - return sge.TimestampAdd( - this=left_expr, expression=right.expr, unit=sge.Var(this="MICROSECOND") - ) - if ( - dtypes.is_time_or_date_like(right.dtype) - and left.dtype == dtypes.TIMEDELTA_DTYPE - ): - right_expr = _coerce_date_to_datetime(right) - return sge.TimestampAdd( - this=right_expr, expression=left.expr, unit=sge.Var(this="MICROSECOND") - ) - if left.dtype == dtypes.TIMEDELTA_DTYPE and right.dtype == dtypes.TIMEDELTA_DTYPE: - return sge.Add(this=left.expr, expression=right.expr) - - raise TypeError( - f"Cannot add type {left.dtype} and {right.dtype}. {bf_constants.FEEDBACK_LINK}" - ) - - -@register_binary_op(ops.eq_op) -def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - left_expr = _coerce_bool_to_int(left) - right_expr = _coerce_bool_to_int(right) - return sge.EQ(this=left_expr, expression=right_expr) - - -@register_binary_op(ops.eq_null_match_op) -def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - left_expr = left.expr - if right.dtype != dtypes.BOOL_DTYPE: - left_expr = _coerce_bool_to_int(left) - - right_expr = right.expr - if left.dtype != dtypes.BOOL_DTYPE: - right_expr = _coerce_bool_to_int(right) - - sentinel = sge.convert("$NULL_SENTINEL$") - left_coalesce = sge.Coalesce( - this=sge.Cast(this=left_expr, to="STRING"), expressions=[sentinel] - ) - right_coalesce = sge.Coalesce( - this=sge.Cast(this=right_expr, to="STRING"), expressions=[sentinel] - ) - return sge.EQ(this=left_coalesce, expression=right_coalesce) - - -@register_binary_op(ops.div_op) -def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - left_expr = _coerce_bool_to_int(left) - right_expr = _coerce_bool_to_int(right) - - result = sge.func("IEEE_DIVIDE", left_expr, right_expr) - if left.dtype == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return sge.Cast(this=sge.Floor(this=result), to="INT64") - else: - return result - - -@register_binary_op(ops.floordiv_op) -def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - left_expr = _coerce_bool_to_int(left) - right_expr = _coerce_bool_to_int(right) - - result: sge.Expression = sge.Cast( - this=sge.Floor(this=sge.func("IEEE_DIVIDE", left_expr, right_expr)), to="INT64" - ) - - # DIV(N, 0) will error in bigquery, but needs to return `0` for int, and - # `inf`` for float in BQ so we short-circuit in this case. - # Multiplying left by zero propogates nulls. 
- zero_result = ( - constants._INF - if (left.dtype == dtypes.FLOAT_DTYPE or right.dtype == dtypes.FLOAT_DTYPE) - else constants._ZERO - ) - result = sge.Case( - ifs=[ - sge.If( - this=sge.EQ(this=right_expr, expression=constants._ZERO), - true=zero_result * left_expr, - ) - ], - default=result, - ) - - if dtypes.is_numeric(right.dtype) and left.dtype == dtypes.TIMEDELTA_DTYPE: - result = sge.Cast(this=sge.Floor(this=result), to="INT64") - - return result - - -@register_binary_op(ops.ge_op) -def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - left_expr = _coerce_bool_to_int(left) - right_expr = _coerce_bool_to_int(right) - return sge.GTE(this=left_expr, expression=right_expr) - - -@register_binary_op(ops.gt_op) -def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - left_expr = _coerce_bool_to_int(left) - right_expr = _coerce_bool_to_int(right) - return sge.GT(this=left_expr, expression=right_expr) - - -@register_binary_op(ops.JSONSet, pass_op=True) -def _(left: TypedExpr, right: TypedExpr, op) -> sge.Expression: - return sge.func("JSON_SET", left.expr, sge.convert(op.json_path), right.expr) - - -@register_binary_op(ops.lt_op) -def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - left_expr = _coerce_bool_to_int(left) - right_expr = _coerce_bool_to_int(right) - return sge.LT(this=left_expr, expression=right_expr) - - -@register_binary_op(ops.le_op) -def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - left_expr = _coerce_bool_to_int(left) - right_expr = _coerce_bool_to_int(right) - return sge.LTE(this=left_expr, expression=right_expr) - - -@register_binary_op(ops.mul_op) -def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - left_expr = _coerce_bool_to_int(left) - right_expr = _coerce_bool_to_int(right) - - result = sge.Mul(this=left_expr, expression=right_expr) - - if (dtypes.is_numeric(left.dtype) and right.dtype == dtypes.TIMEDELTA_DTYPE) or ( - left.dtype == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype) - ): - return sge.Cast(this=sge.Floor(this=result), to="INT64") - else: - return result - - -@register_binary_op(ops.ne_op) -def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - left_expr = _coerce_bool_to_int(left) - right_expr = _coerce_bool_to_int(right) - return sge.NEQ(this=left_expr, expression=right_expr) - - -@register_binary_op(ops.obj_make_ref_op) -def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - return sge.func("OBJ.MAKE_REF", left.expr, right.expr) - - -@register_binary_op(ops.sub_op) -def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if dtypes.is_numeric(left.dtype) and dtypes.is_numeric(right.dtype): - left_expr = _coerce_bool_to_int(left) - right_expr = _coerce_bool_to_int(right) - return sge.Sub(this=left_expr, expression=right_expr) - - if ( - dtypes.is_time_or_date_like(left.dtype) - and right.dtype == dtypes.TIMEDELTA_DTYPE - ): - left_expr = _coerce_date_to_datetime(left) - return sge.TimestampSub( - this=left_expr, expression=right.expr, unit=sge.Var(this="MICROSECOND") - ) - if dtypes.is_time_or_date_like(left.dtype) and dtypes.is_time_or_date_like( - right.dtype - ): - left_expr = _coerce_date_to_datetime(left) - right_expr = _coerce_date_to_datetime(right) - return sge.TimestampDiff( - this=left_expr, expression=right_expr, unit=sge.Var(this="MICROSECOND") - ) - - if left.dtype == dtypes.TIMEDELTA_DTYPE and right.dtype == dtypes.TIMEDELTA_DTYPE: - return sge.Sub(this=left.expr, expression=right.expr) - - raise TypeError( - f"Cannot subtract type {left.dtype} and {right.dtype}. 
{bf_constants.FEEDBACK_LINK}" - ) - - -def _coerce_bool_to_int(typed_expr: TypedExpr) -> sge.Expression: - """Coerce boolean expression to integer.""" - if typed_expr.dtype == dtypes.BOOL_DTYPE: - return sge.Cast(this=typed_expr.expr, to="INT64") - return typed_expr.expr - - -def _coerce_date_to_datetime(typed_expr: TypedExpr) -> sge.Expression: - """Coerce date expression to datetime.""" - if typed_expr.dtype == dtypes.DATE_DTYPE: - return sge.Cast(this=typed_expr.expr, to="DATETIME") - return typed_expr.expr diff --git a/bigframes/core/compile/sqlglot/expressions/blob_ops.py b/bigframes/core/compile/sqlglot/expressions/blob_ops.py index 58f905087d..03708f80c6 100644 --- a/bigframes/core/compile/sqlglot/expressions/blob_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/blob_ops.py @@ -21,6 +21,7 @@ import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler register_unary_op = scalar_compiler.scalar_op_compiler.register_unary_op +register_binary_op = scalar_compiler.scalar_op_compiler.register_binary_op @register_unary_op(ops.obj_fetch_metadata_op) @@ -31,3 +32,8 @@ def _(expr: TypedExpr) -> sge.Expression: @register_unary_op(ops.ObjGetAccessUrl) def _(expr: TypedExpr) -> sge.Expression: return sge.func("OBJ.GET_ACCESS_URL", expr.expr) + + +@register_binary_op(ops.obj_make_ref_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + return sge.func("OBJ.MAKE_REF", left.expr, right.expr) diff --git a/bigframes/core/compile/sqlglot/expressions/comparison_ops.py b/bigframes/core/compile/sqlglot/expressions/comparison_ops.py index 3bf94cf8ab..eb08144b8a 100644 --- a/bigframes/core/compile/sqlglot/expressions/comparison_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/comparison_ops.py @@ -19,12 +19,13 @@ import pandas as pd import sqlglot.expressions as sge +from bigframes import dtypes from bigframes import operations as ops from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler -import bigframes.dtypes as dtypes register_unary_op = scalar_compiler.scalar_op_compiler.register_unary_op +register_binary_op = scalar_compiler.scalar_op_compiler.register_binary_op @register_unary_op(ops.IsInOp, pass_op=True) @@ -53,7 +54,76 @@ def _(expr: TypedExpr, op: ops.IsInOp) -> sge.Expression: ) +@register_binary_op(ops.eq_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.EQ(this=left_expr, expression=right_expr) + + +@register_binary_op(ops.eq_null_match_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = left.expr + if right.dtype != dtypes.BOOL_DTYPE: + left_expr = _coerce_bool_to_int(left) + + right_expr = right.expr + if left.dtype != dtypes.BOOL_DTYPE: + right_expr = _coerce_bool_to_int(right) + + sentinel = sge.convert("$NULL_SENTINEL$") + left_coalesce = sge.Coalesce( + this=sge.Cast(this=left_expr, to="STRING"), expressions=[sentinel] + ) + right_coalesce = sge.Coalesce( + this=sge.Cast(this=right_expr, to="STRING"), expressions=[sentinel] + ) + return sge.EQ(this=left_coalesce, expression=right_coalesce) + + +@register_binary_op(ops.ge_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.GTE(this=left_expr, expression=right_expr) + + +@register_binary_op(ops.gt_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + 
left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.GT(this=left_expr, expression=right_expr) + + +@register_binary_op(ops.lt_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.LT(this=left_expr, expression=right_expr) + + +@register_binary_op(ops.le_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.LTE(this=left_expr, expression=right_expr) + + +@register_binary_op(ops.ne_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.NEQ(this=left_expr, expression=right_expr) + + # Helpers def _is_null(value) -> bool: # float NaN/inf should be treated as distinct from 'true' null values return typing.cast(bool, pd.isna(value)) and not isinstance(value, float) + + +def _coerce_bool_to_int(typed_expr: TypedExpr) -> sge.Expression: + """Coerce boolean expression to integer.""" + if typed_expr.dtype == dtypes.BOOL_DTYPE: + return sge.Cast(this=typed_expr.expr, to="INT64") + return typed_expr.expr diff --git a/bigframes/core/compile/sqlglot/expressions/generic_ops.py b/bigframes/core/compile/sqlglot/expressions/generic_ops.py index 5ee4ede94a..8a792c0753 100644 --- a/bigframes/core/compile/sqlglot/expressions/generic_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/generic_ops.py @@ -16,17 +16,54 @@ import sqlglot.expressions as sge +from bigframes import dtypes from bigframes import operations as ops from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler +from bigframes.core.compile.sqlglot.sqlglot_types import SQLGlotType register_unary_op = scalar_compiler.scalar_op_compiler.register_unary_op @register_unary_op(ops.AsTypeOp, pass_op=True) def _(expr: TypedExpr, op: ops.AsTypeOp) -> sge.Expression: - # TODO: Support more types for casting, such as JSON, etc. 
- return sge.Cast(this=expr.expr, to=op.to_type) + from_type = expr.dtype + to_type = op.to_type + sg_to_type = SQLGlotType.from_bigframes_dtype(to_type) + sg_expr = expr.expr + + if to_type == dtypes.JSON_DTYPE: + return _cast_to_json(expr, op) + + if from_type == dtypes.JSON_DTYPE: + return _cast_from_json(expr, op) + + if to_type == dtypes.INT_DTYPE: + result = _cast_to_int(expr, op) + if result is not None: + return result + + if to_type == dtypes.FLOAT_DTYPE and from_type == dtypes.BOOL_DTYPE: + sg_expr = _cast(sg_expr, "INT64", op.safe) + return _cast(sg_expr, sg_to_type, op.safe) + + if to_type == dtypes.BOOL_DTYPE: + if from_type == dtypes.BOOL_DTYPE: + return sg_expr + else: + return sge.NEQ(this=sg_expr, expression=sge.convert(0)) + + if to_type == dtypes.STRING_DTYPE: + sg_expr = _cast(sg_expr, sg_to_type, op.safe) + if from_type == dtypes.BOOL_DTYPE: + sg_expr = sge.func("INITCAP", sg_expr) + return sg_expr + + if dtypes.is_time_like(to_type) and from_type == dtypes.INT_DTYPE: + sg_expr = sge.func("TIMESTAMP_MICROS", sg_expr) + return _cast(sg_expr, sg_to_type, op.safe) + + return _cast(sg_expr, sg_to_type, op.safe) @register_unary_op(ops.hash_op) @@ -53,3 +90,64 @@ def _(expr: TypedExpr, op: ops.MapOp) -> sge.Expression: @register_unary_op(ops.notnull_op) def _(expr: TypedExpr) -> sge.Expression: return sge.Not(this=sge.Is(this=expr.expr, expression=sge.Null())) + + +# Helper functions +def _cast_to_json(expr: TypedExpr, op: ops.AsTypeOp) -> sge.Expression: + from_type = expr.dtype + sg_expr = expr.expr + + if from_type == dtypes.STRING_DTYPE: + func_name = "PARSE_JSON_IN_SAFE" if op.safe else "PARSE_JSON" + return sge.func(func_name, sg_expr) + if from_type in (dtypes.INT_DTYPE, dtypes.BOOL_DTYPE, dtypes.FLOAT_DTYPE): + sg_expr = sge.Cast(this=sg_expr, to="STRING") + return sge.func("PARSE_JSON", sg_expr) + raise TypeError(f"Cannot cast from {from_type} to {dtypes.JSON_DTYPE}") + + +def _cast_from_json(expr: TypedExpr, op: ops.AsTypeOp) -> sge.Expression: + to_type = op.to_type + sg_expr = expr.expr + func_name = "" + if to_type == dtypes.INT_DTYPE: + func_name = "INT64" + elif to_type == dtypes.FLOAT_DTYPE: + func_name = "FLOAT64" + elif to_type == dtypes.BOOL_DTYPE: + func_name = "BOOL" + elif to_type == dtypes.STRING_DTYPE: + func_name = "STRING" + if func_name: + func_name = "SAFE." + func_name if op.safe else func_name + return sge.func(func_name, sg_expr) + raise TypeError(f"Cannot cast from {dtypes.JSON_DTYPE} to {to_type}") + + +def _cast_to_int(expr: TypedExpr, op: ops.AsTypeOp) -> sge.Expression | None: + from_type = expr.dtype + sg_expr = expr.expr + # Cannot cast DATETIME to INT directly so need to convert to TIMESTAMP first. 
+ if from_type == dtypes.DATETIME_DTYPE: + sg_expr = _cast(sg_expr, "TIMESTAMP", op.safe) + return sge.func("UNIX_MICROS", sg_expr) + if from_type == dtypes.TIMESTAMP_DTYPE: + return sge.func("UNIX_MICROS", sg_expr) + if from_type == dtypes.TIME_DTYPE: + return sge.func( + "TIME_DIFF", + _cast(sg_expr, "TIME", op.safe), + sge.convert("00:00:00"), + "MICROSECOND", + ) + if from_type == dtypes.NUMERIC_DTYPE or from_type == dtypes.FLOAT_DTYPE: + sg_expr = sge.func("TRUNC", sg_expr) + return _cast(sg_expr, "INT64", op.safe) + return None + + +def _cast(expr: sge.Expression, to: str, safe: bool): + if safe: + return sge.TryCast(this=expr, to=to) + else: + return sge.Cast(this=expr, to=to) diff --git a/bigframes/core/compile/sqlglot/expressions/json_ops.py b/bigframes/core/compile/sqlglot/expressions/json_ops.py index 754e8d80eb..442eb9fdf5 100644 --- a/bigframes/core/compile/sqlglot/expressions/json_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/json_ops.py @@ -21,6 +21,7 @@ import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler register_unary_op = scalar_compiler.scalar_op_compiler.register_unary_op +register_binary_op = scalar_compiler.scalar_op_compiler.register_binary_op @register_unary_op(ops.JSONExtract, pass_op=True) @@ -66,3 +67,8 @@ def _(expr: TypedExpr) -> sge.Expression: @register_unary_op(ops.ToJSONString) def _(expr: TypedExpr) -> sge.Expression: return sge.func("TO_JSON_STRING", expr.expr) + + +@register_binary_op(ops.JSONSet, pass_op=True) +def _(left: TypedExpr, right: TypedExpr, op) -> sge.Expression: + return sge.func("JSON_SET", left.expr, sge.convert(op.json_path), right.expr) diff --git a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py index 09c08e2095..1a6447ceb7 100644 --- a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py @@ -14,14 +14,17 @@ from __future__ import annotations +import bigframes_vendored.constants as bf_constants import sqlglot.expressions as sge +from bigframes import dtypes from bigframes import operations as ops import bigframes.core.compile.sqlglot.expressions.constants as constants from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler register_unary_op = scalar_compiler.scalar_op_compiler.register_unary_op +register_binary_op = scalar_compiler.scalar_op_compiler.register_binary_op @register_unary_op(ops.abs_op) @@ -238,3 +241,144 @@ def _(expr: TypedExpr) -> sge.Expression: @register_unary_op(ops.tanh_op) def _(expr: TypedExpr) -> sge.Expression: return sge.func("TANH", expr.expr) + + +@register_binary_op(ops.add_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + if left.dtype == dtypes.STRING_DTYPE and right.dtype == dtypes.STRING_DTYPE: + # String addition + return sge.Concat(expressions=[left.expr, right.expr]) + + if dtypes.is_numeric(left.dtype) and dtypes.is_numeric(right.dtype): + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.Add(this=left_expr, expression=right_expr) + + if ( + dtypes.is_time_or_date_like(left.dtype) + and right.dtype == dtypes.TIMEDELTA_DTYPE + ): + left_expr = _coerce_date_to_datetime(left) + return sge.TimestampAdd( + this=left_expr, expression=right.expr, unit=sge.Var(this="MICROSECOND") + ) + if ( + dtypes.is_time_or_date_like(right.dtype) + and left.dtype == dtypes.TIMEDELTA_DTYPE + ): + right_expr 
= _coerce_date_to_datetime(right) + return sge.TimestampAdd( + this=right_expr, expression=left.expr, unit=sge.Var(this="MICROSECOND") + ) + if left.dtype == dtypes.TIMEDELTA_DTYPE and right.dtype == dtypes.TIMEDELTA_DTYPE: + return sge.Add(this=left.expr, expression=right.expr) + + raise TypeError( + f"Cannot add type {left.dtype} and {right.dtype}. {bf_constants.FEEDBACK_LINK}" + ) + + +@register_binary_op(ops.div_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + + result = sge.func("IEEE_DIVIDE", left_expr, right_expr) + if left.dtype == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): + return sge.Cast(this=sge.Floor(this=result), to="INT64") + else: + return result + + +@register_binary_op(ops.floordiv_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + + result: sge.Expression = sge.Cast( + this=sge.Floor(this=sge.func("IEEE_DIVIDE", left_expr, right_expr)), to="INT64" + ) + + # DIV(N, 0) will error in bigquery, but needs to return `0` for int, and + # `inf`` for float in BQ so we short-circuit in this case. + # Multiplying left by zero propogates nulls. + zero_result = ( + constants._INF + if (left.dtype == dtypes.FLOAT_DTYPE or right.dtype == dtypes.FLOAT_DTYPE) + else constants._ZERO + ) + result = sge.Case( + ifs=[ + sge.If( + this=sge.EQ(this=right_expr, expression=constants._ZERO), + true=zero_result * left_expr, + ) + ], + default=result, + ) + + if dtypes.is_numeric(right.dtype) and left.dtype == dtypes.TIMEDELTA_DTYPE: + result = sge.Cast(this=sge.Floor(this=result), to="INT64") + + return result + + +@register_binary_op(ops.mul_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + + result = sge.Mul(this=left_expr, expression=right_expr) + + if (dtypes.is_numeric(left.dtype) and right.dtype == dtypes.TIMEDELTA_DTYPE) or ( + left.dtype == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype) + ): + return sge.Cast(this=sge.Floor(this=result), to="INT64") + else: + return result + + +@register_binary_op(ops.sub_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + if dtypes.is_numeric(left.dtype) and dtypes.is_numeric(right.dtype): + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.Sub(this=left_expr, expression=right_expr) + + if ( + dtypes.is_time_or_date_like(left.dtype) + and right.dtype == dtypes.TIMEDELTA_DTYPE + ): + left_expr = _coerce_date_to_datetime(left) + return sge.TimestampSub( + this=left_expr, expression=right.expr, unit=sge.Var(this="MICROSECOND") + ) + if dtypes.is_time_or_date_like(left.dtype) and dtypes.is_time_or_date_like( + right.dtype + ): + left_expr = _coerce_date_to_datetime(left) + right_expr = _coerce_date_to_datetime(right) + return sge.TimestampDiff( + this=left_expr, expression=right_expr, unit=sge.Var(this="MICROSECOND") + ) + + if left.dtype == dtypes.TIMEDELTA_DTYPE and right.dtype == dtypes.TIMEDELTA_DTYPE: + return sge.Sub(this=left.expr, expression=right.expr) + + raise TypeError( + f"Cannot subtract type {left.dtype} and {right.dtype}. 
{bf_constants.FEEDBACK_LINK}" + ) + + +def _coerce_bool_to_int(typed_expr: TypedExpr) -> sge.Expression: + """Coerce boolean expression to integer.""" + if typed_expr.dtype == dtypes.BOOL_DTYPE: + return sge.Cast(this=typed_expr.expr, to="INT64") + return typed_expr.expr + + +def _coerce_date_to_datetime(typed_expr: TypedExpr) -> sge.Expression: + """Coerce date expression to datetime.""" + if typed_expr.dtype == dtypes.DATE_DTYPE: + return sge.Cast(this=typed_expr.expr, to="DATETIME") + return typed_expr.expr diff --git a/bigframes/core/compile/sqlglot/scalar_compiler.py b/bigframes/core/compile/sqlglot/scalar_compiler.py index 3e12da6d92..8167f40fc3 100644 --- a/bigframes/core/compile/sqlglot/scalar_compiler.py +++ b/bigframes/core/compile/sqlglot/scalar_compiler.py @@ -79,7 +79,7 @@ def register_unary_op( """ key = typing.cast(str, op_ref.name) - def decorator(impl: typing.Callable[..., TypedExpr]): + def decorator(impl: typing.Callable[..., sge.Expression]): def normalized_impl(args: typing.Sequence[TypedExpr], op: ops.RowOp): if pass_op: return impl(args[0], op) @@ -108,7 +108,7 @@ def register_binary_op( """ key = typing.cast(str, op_ref.name) - def decorator(impl: typing.Callable[..., TypedExpr]): + def decorator(impl: typing.Callable[..., sge.Expression]): def normalized_impl(args: typing.Sequence[TypedExpr], op: ops.RowOp): if pass_op: return impl(args[0], args[1], op) @@ -132,7 +132,7 @@ def register_ternary_op( """ key = typing.cast(str, op_ref.name) - def decorator(impl: typing.Callable[..., TypedExpr]): + def decorator(impl: typing.Callable[..., sge.Expression]): def normalized_impl(args: typing.Sequence[TypedExpr], op: ops.RowOp): return impl(args[0], args[1], args[2]) @@ -156,7 +156,7 @@ def register_nary_op( """ key = typing.cast(str, op_ref.name) - def decorator(impl: typing.Callable[..., TypedExpr]): + def decorator(impl: typing.Callable[..., sge.Expression]): def normalized_impl(args: typing.Sequence[TypedExpr], op: ops.RowOp): if pass_op: return impl(*args, op=op) diff --git a/bigframes/core/groupby/dataframe_group_by.py b/bigframes/core/groupby/dataframe_group_by.py index 21f49fe563..40e96f6f42 100644 --- a/bigframes/core/groupby/dataframe_group_by.py +++ b/bigframes/core/groupby/dataframe_group_by.py @@ -16,7 +16,7 @@ import datetime import typing -from typing import Literal, Optional, Sequence, Tuple, Union +from typing import Iterable, Literal, Optional, Sequence, Tuple, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby @@ -29,7 +29,7 @@ from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks -from bigframes.core.groupby import aggs, series_group_by +from bigframes.core.groupby import aggs, group_by, series_group_by import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.core.validations as validations @@ -54,6 +54,7 @@ def __init__( selected_cols: typing.Optional[typing.Sequence[str]] = None, dropna: bool = True, as_index: bool = True, + by_key_is_singular: bool = False, ): # TODO(tbergeron): Support more group-by expression types self._block = block @@ -64,6 +65,9 @@ def __init__( ) } self._by_col_ids = by_col_ids + self._by_key_is_singular = by_key_is_singular + if by_key_is_singular: + assert len(by_col_ids) == 1, "singular key should be exactly one group key" self._dropna = dropna self._as_index = as_index @@ -149,6 +153,30 @@ def head(self, n: int = 5) -> df.DataFrame: ) ) 
+ def describe(self, include: None | Literal["all"] = None): + from bigframes.pandas.core.methods import describe + + return df.DataFrame( + describe._describe( + self._block, + self._selected_cols, + include, + as_index=self._as_index, + by_col_ids=self._by_col_ids, + dropna=self._dropna, + ) + ) + + def __iter__(self) -> Iterable[Tuple[blocks.Label, df.DataFrame]]: + for group_keys, filtered_block in group_by.block_groupby_iter( + self._block, + by_col_ids=self._by_col_ids, + by_key_is_singular=self._by_key_is_singular, + dropna=self._dropna, + ): + filtered_df = df.DataFrame(filtered_block) + yield group_keys, filtered_df + def size(self) -> typing.Union[df.DataFrame, series.Series]: agg_block, _ = self._block.aggregate_size( by_column_ids=self._by_col_ids, diff --git a/bigframes/core/groupby/group_by.py b/bigframes/core/groupby/group_by.py new file mode 100644 index 0000000000..f00ff7c0b0 --- /dev/null +++ b/bigframes/core/groupby/group_by.py @@ -0,0 +1,91 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import functools +from typing import Sequence + +import pandas as pd + +from bigframes.core import blocks +from bigframes.core import expression as ex +import bigframes.enums +import bigframes.operations as ops + + +def block_groupby_iter( + block: blocks.Block, + *, + by_col_ids: Sequence[str], + by_key_is_singular: bool, + dropna: bool, +): + original_index_columns = block._index_columns + original_index_labels = block._index_labels + by_col_ids = by_col_ids + block = block.reset_index( + level=None, + # Keep the original index columns so they can be recovered. + drop=False, + allow_duplicates=True, + replacement=bigframes.enums.DefaultIndexKind.NULL, + ).set_index( + by_col_ids, + # Keep by_col_ids in-place so the ordering doesn't change. + drop=False, + append=False, + ) + block.cached( + force=True, + # All DataFrames will be filtered by by_col_ids, so + # force block.cached() to cluster by the new index by explicitly + # setting `session_aware=False`. This will ensure that the filters + # are more efficient. + session_aware=False, + ) + keys_block, _ = block.aggregate(by_col_ids, dropna=dropna) + for chunk in keys_block.to_pandas_batches(): + # Convert to MultiIndex to make sure we get tuples, + # even for singular keys. + by_keys_index = chunk.index + if not isinstance(by_keys_index, pd.MultiIndex): + by_keys_index = pd.MultiIndex.from_frame(by_keys_index.to_frame()) + + for by_keys in by_keys_index: + filtered_block = ( + # To ensure the cache is used, filter first, then reset the + # index before yielding the DataFrame. + block.filter( + functools.reduce( + ops.and_op.as_expr, + ( + ops.eq_op.as_expr(by_col, ex.const(by_key)) + for by_col, by_key in zip(by_col_ids, by_keys) + ), + ), + ).set_index( + original_index_columns, + # We retained by_col_ids in the set_index call above, + # so it's safe to drop the duplicates now. 
+ drop=True, + append=False, + index_labels=original_index_labels, + ) + ) + + if by_key_is_singular: + yield by_keys[0], filtered_block + else: + yield by_keys, filtered_block diff --git a/bigframes/core/groupby/series_group_by.py b/bigframes/core/groupby/series_group_by.py index 8ab39d27cc..1f2632078d 100644 --- a/bigframes/core/groupby/series_group_by.py +++ b/bigframes/core/groupby/series_group_by.py @@ -16,7 +16,7 @@ import datetime import typing -from typing import Literal, Sequence, Union +from typing import Iterable, Literal, Sequence, Tuple, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby @@ -28,7 +28,7 @@ from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks -from bigframes.core.groupby import aggs +from bigframes.core.groupby import aggs, group_by import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.core.validations as validations @@ -52,6 +52,8 @@ def __init__( by_col_ids: typing.Sequence[str], value_name: blocks.Label = None, dropna=True, + *, + by_key_is_singular: bool = False, ): # TODO(tbergeron): Support more group-by expression types self._block = block @@ -60,6 +62,10 @@ def __init__( self._value_name = value_name self._dropna = dropna # Applies to aggregations but not windowing + self._by_key_is_singular = by_key_is_singular + if by_key_is_singular: + assert len(by_col_ids) == 1, "singular key should be exactly one group key" + @property def _session(self) -> session.Session: return self._block.session @@ -75,6 +81,33 @@ def head(self, n: int = 5) -> series.Series: ) ) + def describe(self, include: None | Literal["all"] = None): + from bigframes.pandas.core.methods import describe + + return df.DataFrame( + describe._describe( + self._block, + columns=[self._value_column], + include=include, + as_index=True, + by_col_ids=self._by_col_ids, + dropna=self._dropna, + ) + ).droplevel(level=0, axis=1) + + def __iter__(self) -> Iterable[Tuple[blocks.Label, series.Series]]: + for group_keys, filtered_block in group_by.block_groupby_iter( + self._block, + by_col_ids=self._by_col_ids, + by_key_is_singular=self._by_key_is_singular, + dropna=self._dropna, + ): + filtered_series = series.Series( + filtered_block.select_column(self._value_column) + ) + filtered_series.name = self._value_name + yield group_keys, filtered_series + def all(self) -> series.Series: return self._aggregate(agg_ops.all_op) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 2a35ab6546..c5e2657629 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -740,6 +740,9 @@ def to_numpy(self, dtype=None, *, allow_large_results=None, **kwargs) -> np.ndar __array__ = to_numpy + def to_list(self, *, allow_large_results: Optional[bool] = None) -> list: + return self.to_pandas(allow_large_results=allow_large_results).to_list() + def __len__(self): return self.shape[0] diff --git a/bigframes/core/rewrite/implicit_align.py b/bigframes/core/rewrite/implicit_align.py index 1989b1a543..a20b698ff4 100644 --- a/bigframes/core/rewrite/implicit_align.py +++ b/bigframes/core/rewrite/implicit_align.py @@ -18,12 +18,8 @@ from typing import cast, Optional, Sequence, Set, Tuple import bigframes.core.expression -import bigframes.core.guid import bigframes.core.identifiers -import bigframes.core.join_def import bigframes.core.nodes -import bigframes.core.window_spec -import 
bigframes.operations.aggregations # Combination of selects and additive nodes can be merged as an explicit keyless "row join" ALIGNABLE_NODES = ( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 371f69e713..eb5ed997a1 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -489,7 +489,6 @@ def memory_usage(self, index: bool = True): column_sizes = pandas.concat([index_size, column_sizes]) return column_sizes - @validations.requires_index def info( self, verbose: Optional[bool] = None, @@ -512,12 +511,17 @@ def info( obuf.write(f"{type(self)}\n") - index_type = "MultiIndex" if self.index.nlevels > 1 else "Index" + if self._block.has_index: + index_type = "MultiIndex" if self.index.nlevels > 1 else "Index" - # These accessses are kind of expensive, maybe should try to skip? - first_indice = self.index[0] - last_indice = self.index[-1] - obuf.write(f"{index_type}: {n_rows} entries, {first_indice} to {last_indice}\n") + # These accessses are kind of expensive, maybe should try to skip? + first_indice = self.index[0] + last_indice = self.index[-1] + obuf.write( + f"{index_type}: {n_rows} entries, {first_indice} to {last_indice}\n" + ) + else: + obuf.write("NullIndex\n") dtype_strings = self.dtypes.astype("string") if show_all_columns: @@ -2002,6 +2006,7 @@ def insert( self._set_block(block) + @overload def drop( self, labels: typing.Any = None, @@ -2010,7 +2015,33 @@ def drop( index: typing.Any = None, columns: Union[blocks.Label, Sequence[blocks.Label]] = None, level: typing.Optional[LevelType] = None, + inplace: Literal[False] = False, ) -> DataFrame: + ... + + @overload + def drop( + self, + labels: typing.Any = None, + *, + axis: typing.Union[int, str] = 0, + index: typing.Any = None, + columns: Union[blocks.Label, Sequence[blocks.Label]] = None, + level: typing.Optional[LevelType] = None, + inplace: Literal[True], + ) -> None: + ... 
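A minimal usage sketch of the `GroupBy.__iter__` support introduced above via `block_groupby_iter` and the `__iter__` methods on the groupby classes; the frame and column names below are illustrative only and not part of this patch. A single group key yields scalar labels, while a list of keys yields tuple labels, matching pandas.

    import bigframes.pandas as bpd

    df = bpd.DataFrame({"animal": ["cat", "dog", "cat"], "legs": [4, 4, 4]})

    # Grouping by a single column: each iteration yields (scalar_key, sub-DataFrame).
    for animal, group in df.groupby("animal"):
        print(animal, group["legs"].sum())

    # Grouping by a list of columns (even a one-element list): keys are tuples.
    for key, group in df.groupby(["animal"]):
        assert isinstance(key, tuple)
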
+ + def drop( + self, + labels: typing.Any = None, + *, + axis: typing.Union[int, str] = 0, + index: typing.Any = None, + columns: Union[blocks.Label, Sequence[blocks.Label]] = None, + level: typing.Optional[LevelType] = None, + inplace: bool = False, + ) -> Optional[DataFrame]: if labels: if index or columns: raise ValueError("Cannot specify both 'labels' and 'index'/'columns") @@ -2052,7 +2083,11 @@ def drop( inverse_condition_id, ops.invert_op ) elif isinstance(index, indexes.Index): - return self._drop_by_index(index) + dropped_block = self._drop_by_index(index)._get_block() + if inplace: + self._set_block(dropped_block) + return None + return DataFrame(dropped_block) else: block, condition_id = block.project_expr( ops.ne_op.as_expr(level_id, ex.const(index)) @@ -2064,7 +2099,12 @@ def drop( block = block.drop_columns(self._sql_names(columns)) if index is None and not columns: raise ValueError("Must specify 'labels' or 'index'/'columns") - return DataFrame(block) + + if inplace: + self._set_block(block) + return None + else: + return DataFrame(block) def _drop_by_index(self, index: indexes.Index) -> DataFrame: block = index._block @@ -3909,11 +3949,17 @@ def _groupby_level( as_index: bool = True, dropna: bool = True, ): + if utils.is_list_like(level): + by_key_is_singular = False + else: + by_key_is_singular = True + return groupby.DataFrameGroupBy( self._block, by_col_ids=self._resolve_levels(level), as_index=as_index, dropna=dropna, + by_key_is_singular=by_key_is_singular, ) def _groupby_series( @@ -3926,10 +3972,14 @@ def _groupby_series( as_index: bool = True, dropna: bool = True, ): + # Pandas makes a distinction between groupby with a list of keys + # versus groupby with a single item in some methods, like __iter__. if not isinstance(by, bigframes.series.Series) and utils.is_list_like(by): by = list(by) + by_key_is_singular = False else: by = [typing.cast(typing.Union[blocks.Label, bigframes.series.Series], by)] + by_key_is_singular = True block = self._block col_ids: typing.Sequence[str] = [] @@ -3959,6 +4009,7 @@ def _groupby_series( by_col_ids=col_ids, as_index=as_index, dropna=dropna, + by_key_is_singular=by_key_is_singular, ) def abs(self) -> DataFrame: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 2c4cccefd2..3695110672 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -358,6 +358,11 @@ def is_comparable(type_: ExpressionType) -> bool: return (type_ is not None) and is_orderable(type_) +def can_compare(type1: ExpressionType, type2: ExpressionType) -> bool: + coerced_type = coerce_to_common(type1, type2) + return is_comparable(coerced_type) + + def get_struct_fields(type_: ExpressionType) -> dict[str, Dtype]: assert isinstance(type_, pd.ArrowDtype) assert isinstance(type_.pyarrow_dtype, pa.StructType) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 46d40d5fc8..92c98695cd 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -29,6 +29,7 @@ from bigframes.core import log_adapter import bigframes.core.compile.googlesql as sql_utils +import bigframes.core.utils as core_utils from bigframes.ml import base, core, globals, impute, preprocessing, utils import bigframes.pandas as bpd @@ -103,13 +104,12 @@ def __init__(self, sql: str, target_column: str = "transformed_{0}"): # TODO: More robust unescaping self._target_column = target_column.replace("`", "") - PLAIN_COLNAME_RX = re.compile("^[a-z][a-z0-9_]*$", re.IGNORECASE) - def _compile_to_sql( self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None ) -> 
List[str]: if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) result = [] for column in columns: current_sql = self._sql.format(sql_utils.identifier(column)) diff --git a/bigframes/ml/impute.py b/bigframes/ml/impute.py index f19c8e2cd3..818151a4f9 100644 --- a/bigframes/ml/impute.py +++ b/bigframes/ml/impute.py @@ -23,6 +23,7 @@ import bigframes_vendored.sklearn.impute._base from bigframes.core import log_adapter +import bigframes.core.utils as core_utils from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd @@ -62,6 +63,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) return [ self._base_sql_generator.ml_imputer( column, self.strategy, f"imputer_{column}" diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 2e8dc64a53..94c61674f6 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -27,6 +27,7 @@ import bigframes_vendored.sklearn.preprocessing._polynomial from bigframes.core import log_adapter +import bigframes.core.utils as core_utils from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd @@ -59,6 +60,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) return [ self._base_sql_generator.ml_standard_scaler( column, f"standard_scaled_{column}" @@ -136,6 +138,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) return [ self._base_sql_generator.ml_max_abs_scaler( column, f"max_abs_scaled_{column}" @@ -214,6 +217,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) return [ self._base_sql_generator.ml_min_max_scaler( column, f"min_max_scaled_{column}" @@ -304,6 +308,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) array_split_points = {} if self.strategy == "uniform": for column in columns: @@ -433,6 +438,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) drop = self.drop if self.drop is not None else "none" # minus one here since BQML's implementation always includes index 0, and top_k is on top of that. top_k = ( @@ -547,6 +553,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) # minus one here since BQML's inplimentation always includes index 0, and top_k is on top of that. 
top_k = ( @@ -644,6 +651,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) output_name = "poly_feat" return [ self._base_sql_generator.ml_polynomial_expand( diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 6239b88e9e..17e1f7534f 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -14,7 +14,7 @@ from __future__ import annotations -from bigframes.operations.ai_ops import AIGenerateBool +from bigframes.operations.ai_ops import AIGenerateBool, AIGenerateInt from bigframes.operations.array_ops import ( ArrayIndexOp, ArrayReduceOp, @@ -413,6 +413,7 @@ "GeoStDistanceOp", # AI ops "AIGenerateBool", + "AIGenerateInt", # Numpy ops mapping "NUMPY_TO_BINOP", "NUMPY_TO_OP", diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 02b475d198..f6e8600d42 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -251,12 +251,7 @@ def name(self): def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: if not dtypes.is_orderable(input_types[0]): raise TypeError(f"Type {input_types[0]} is not orderable") - if pd.api.types.is_bool_dtype(input_types[0]) or pd.api.types.is_integer_dtype( - input_types[0] - ): - return dtypes.FLOAT_DTYPE - else: - return input_types[0] + return input_types[0] @dataclasses.dataclass(frozen=True) @@ -524,6 +519,8 @@ def implicitly_inherits_order(self): @dataclasses.dataclass(frozen=True) class DenseRankOp(UnaryWindowOp): + name: ClassVar[str] = "dense_rank" + @property def skips_nulls(self): return False diff --git a/bigframes/operations/ai_ops.py b/bigframes/operations/ai_ops.py index 680c1585fb..7a8202abd2 100644 --- a/bigframes/operations/ai_ops.py +++ b/bigframes/operations/ai_ops.py @@ -28,7 +28,6 @@ class AIGenerateBool(base_ops.NaryOp): name: ClassVar[str] = "ai_generate_bool" - # None are the placeholders for column references. prompt_context: Tuple[str | None, ...] connection_id: str endpoint: str | None @@ -45,3 +44,25 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) ) ) + + +@dataclasses.dataclass(frozen=True) +class AIGenerateInt(base_ops.NaryOp): + name: ClassVar[str] = "ai_generate_int" + + prompt_context: Tuple[str | None, ...] 
+ connection_id: str + endpoint: str | None + request_type: Literal["dedicated", "shared", "unspecified"] + model_params: str | None + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return pd.ArrowDtype( + pa.struct( + ( + pa.field("result", pa.int64()), + pa.field("full_response", dtypes.JSON_ARROW_TYPE), + pa.field("status", pa.string()), + ) + ) + ) diff --git a/bigframes/operations/type.py b/bigframes/operations/type.py index b4029d74c7..020bd0ea57 100644 --- a/bigframes/operations/type.py +++ b/bigframes/operations/type.py @@ -174,15 +174,7 @@ class CoerceCommon(BinaryTypeSignature): def output_type( self, left_type: ExpressionType, right_type: ExpressionType ) -> ExpressionType: - try: - return bigframes.dtypes.coerce_to_common(left_type, right_type) - except TypeError: - pass - if bigframes.dtypes.can_coerce(left_type, right_type): - return right_type - if bigframes.dtypes.can_coerce(right_type, left_type): - return left_type - raise TypeError(f"Cannot coerce {left_type} and {right_type} to a common type.") + return bigframes.dtypes.coerce_to_common(left_type, right_type) @dataclasses.dataclass @@ -192,8 +184,7 @@ class Comparison(BinaryTypeSignature): def output_type( self, left_type: ExpressionType, right_type: ExpressionType ) -> ExpressionType: - common_type = CoerceCommon().output_type(left_type, right_type) - if not bigframes.dtypes.is_comparable(common_type): + if not bigframes.dtypes.can_compare(left_type, right_type): raise TypeError(f"Types {left_type} and {right_type} are not comparable") return bigframes.dtypes.BOOL_DTYPE diff --git a/bigframes/pandas/core/methods/describe.py b/bigframes/pandas/core/methods/describe.py index 18d2318379..f8a8721cf2 100644 --- a/bigframes/pandas/core/methods/describe.py +++ b/bigframes/pandas/core/methods/describe.py @@ -16,8 +16,15 @@ import typing +import pandas as pd + from bigframes import dataframe, dtypes, series -from bigframes.core.reshape import api as rs +from bigframes.core import agg_expressions, blocks +from bigframes.operations import aggregations + +_DEFAULT_DTYPES = ( + dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES +) def describe( @@ -30,100 +37,88 @@ def describe( elif not isinstance(input, dataframe.DataFrame): raise TypeError(f"Unsupported type: {type(input)}") + block = input._block + + describe_block = _describe(block, columns=block.value_columns, include=include) + # we override default stack behavior, because we want very specific ordering + stack_cols = pd.Index( + [ + "count", + "nunique", + "top", + "freq", + "mean", + "std", + "min", + "25%", + "50%", + "75%", + "max", + ] + ).intersection(describe_block.column_labels.get_level_values(-1)) + describe_block = describe_block.stack(override_labels=stack_cols) + + return dataframe.DataFrame(describe_block).droplevel(level=0) + + +def _describe( + block: blocks.Block, + columns: typing.Sequence[str], + include: None | typing.Literal["all"] = None, + *, + as_index: bool = True, + by_col_ids: typing.Sequence[str] = [], + dropna: bool = False, +) -> blocks.Block: + stats: list[agg_expressions.Aggregation] = [] + column_labels: list[typing.Hashable] = [] + + # include=None behaves like include='all' if no numeric columns present if include is None: - numeric_df = _select_dtypes( - input, - dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE - + dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES, - ) - if len(numeric_df.columns) == 0: - # Describe eligible non-numeric columns - return 
_describe_non_numeric(input) - - # Otherwise, only describe numeric columns - return _describe_numeric(input) - - elif include == "all": - numeric_result = _describe_numeric(input) - non_numeric_result = _describe_non_numeric(input) - - if len(numeric_result.columns) == 0: - return non_numeric_result - elif len(non_numeric_result.columns) == 0: - return numeric_result - else: - # Use reindex after join to preserve the original column order. - return rs.concat( - [non_numeric_result, numeric_result], axis=1 - )._reindex_columns(input.columns) - - else: - raise ValueError(f"Unsupported include type: {include}") - - -def _describe_numeric(df: dataframe.DataFrame) -> dataframe.DataFrame: - number_df_result = typing.cast( - dataframe.DataFrame, - _select_dtypes(df, dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE).agg( - [ - "count", - "mean", - "std", - "min", - "25%", - "50%", - "75%", - "max", - ] - ), - ) - temporal_df_result = typing.cast( - dataframe.DataFrame, - _select_dtypes(df, dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES).agg(["count"]), + if not any( + block.expr.get_column_type(col) in _DEFAULT_DTYPES for col in columns + ): + include = "all" + + for col_id in columns: + label = block.col_id_to_label[col_id] + dtype = block.expr.get_column_type(col_id) + if include != "all" and dtype not in _DEFAULT_DTYPES: + continue + agg_ops = _get_aggs_for_dtype(dtype) + stats.extend(op.as_expr(col_id) for op in agg_ops) + label_tuple = (label,) if block.column_labels.nlevels == 1 else label + column_labels.extend((*label_tuple, op.name) for op in agg_ops) # type: ignore + + agg_block, _ = block.aggregate( + by_column_ids=by_col_ids, + aggregations=stats, + dropna=dropna, + column_labels=pd.Index(column_labels, name=(*block.column_labels.names, None)), ) - - if len(number_df_result.columns) == 0: - return temporal_df_result - elif len(temporal_df_result.columns) == 0: - return number_df_result + return agg_block if as_index else agg_block.reset_index(drop=False) + + +def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]: + if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE: + return [ + aggregations.count_op, + aggregations.mean_op, + aggregations.std_op, + aggregations.min_op, + aggregations.ApproxQuartilesOp(1), + aggregations.ApproxQuartilesOp(2), + aggregations.ApproxQuartilesOp(3), + aggregations.max_op, + ] + elif dtype in dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES: + return [aggregations.count_op] + elif dtype in [ + dtypes.STRING_DTYPE, + dtypes.BOOL_DTYPE, + dtypes.BYTES_DTYPE, + dtypes.TIME_DTYPE, + ]: + return [aggregations.count_op, aggregations.nunique_op] else: - import bigframes.core.reshape.api as rs - - original_columns = _select_dtypes( - df, - dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE - + dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES, - ).columns - - # Use reindex after join to preserve the original column order. 
- return rs.concat( - [number_df_result, temporal_df_result], - axis=1, - )._reindex_columns(original_columns) - - -def _describe_non_numeric(df: dataframe.DataFrame) -> dataframe.DataFrame: - return typing.cast( - dataframe.DataFrame, - _select_dtypes( - df, - [ - dtypes.STRING_DTYPE, - dtypes.BOOL_DTYPE, - dtypes.BYTES_DTYPE, - dtypes.TIME_DTYPE, - ], - ).agg(["count", "nunique"]), - ) - - -def _select_dtypes( - df: dataframe.DataFrame, dtypes: typing.Sequence[dtypes.Dtype] -) -> dataframe.DataFrame: - """Selects columns without considering inheritance relationships.""" - columns = [ - col_id - for col_id, dtype in zip(df._block.value_columns, df._block.dtypes) - if dtype in dtypes - ] - return dataframe.DataFrame(df._block.select_columns(columns)) + return [] diff --git a/bigframes/series.py b/bigframes/series.py index da2f3f07c4..87387a4333 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -25,6 +25,7 @@ import typing from typing import ( Any, + Callable, cast, Iterable, List, @@ -1854,12 +1855,18 @@ def _groupby_level( level: int | str | typing.Sequence[int] | typing.Sequence[str], dropna: bool = True, ) -> bigframes.core.groupby.SeriesGroupBy: + if utils.is_list_like(level): + by_key_is_singular = False + else: + by_key_is_singular = True + return groupby.SeriesGroupBy( self._block, self._value_column, by_col_ids=self._resolve_levels(level), value_name=self.name, dropna=dropna, + by_key_is_singular=by_key_is_singular, ) def _groupby_values( @@ -1871,8 +1878,10 @@ def _groupby_values( ) -> bigframes.core.groupby.SeriesGroupBy: if not isinstance(by, Series) and _is_list_like(by): by = list(by) + by_key_is_singular = False else: by = [typing.cast(typing.Union[blocks.Label, Series], by)] + by_key_is_singular = True block = self._block grouping_cols: typing.Sequence[str] = [] @@ -1904,6 +1913,7 @@ def _groupby_values( by_col_ids=grouping_cols, value_name=self.name, dropna=dropna, + by_key_is_singular=by_key_is_singular, ) def apply( @@ -2330,7 +2340,7 @@ def _throw_if_index_contains_duplicates( def map( self, - arg: typing.Union[Mapping, Series], + arg: typing.Union[Mapping, Series, Callable], na_action: Optional[str] = None, *, verify_integrity: bool = False, @@ -2352,6 +2362,7 @@ def map( ) map_df = map_df.set_index("keys") elif callable(arg): + # This is for remote function and managed funtion. return self.apply(arg) else: # Mirroring pandas, call the uncallable object diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 30a25762eb..00531ce25d 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -27,15 +27,9 @@ import google.api_core.exceptions import google.cloud.bigquery as bigquery -import bigframes.clients -import bigframes.core.compile -import bigframes.core.compile.default_ordering import bigframes.core.sql -import bigframes.dtypes import bigframes.exceptions as bfe import bigframes.session._io.bigquery -import bigframes.session.clients -import bigframes.version # Avoid circular imports. 
if typing.TYPE_CHECKING: diff --git a/bigframes/testing/utils.py b/bigframes/testing/utils.py index d38e323d57..b4daab7aad 100644 --- a/bigframes/testing/utils.py +++ b/bigframes/testing/utils.py @@ -25,9 +25,10 @@ import pyarrow as pa # type: ignore import pytest -from bigframes.core import expression as expr +from bigframes import operations as ops +from bigframes.core import expression as ex import bigframes.functions._utils as bff_utils -import bigframes.pandas +import bigframes.pandas as bpd ML_REGRESSION_METRICS = [ "mean_absolute_error", @@ -67,17 +68,13 @@ # Prefer this function for tests that run in both ordered and unordered mode -def assert_dfs_equivalent( - pd_df: pd.DataFrame, bf_df: bigframes.pandas.DataFrame, **kwargs -): +def assert_dfs_equivalent(pd_df: pd.DataFrame, bf_df: bpd.DataFrame, **kwargs): bf_df_local = bf_df.to_pandas() ignore_order = not bf_df._session._strictly_ordered assert_pandas_df_equal(bf_df_local, pd_df, ignore_order=ignore_order, **kwargs) -def assert_series_equivalent( - pd_series: pd.Series, bf_series: bigframes.pandas.Series, **kwargs -): +def assert_series_equivalent(pd_series: pd.Series, bf_series: bpd.Series, **kwargs): bf_df_local = bf_series.to_pandas() ignore_order = not bf_series._session._strictly_ordered assert_series_equal(bf_df_local, pd_series, ignore_order=ignore_order, **kwargs) @@ -452,12 +449,12 @@ def get_function_name(func, package_requirements=None, is_row_processor=False): def _apply_unary_ops( - obj: bigframes.pandas.DataFrame, - ops_list: Sequence[expr.Expression], + obj: bpd.DataFrame, + ops_list: Sequence[ex.Expression], new_names: Sequence[str], ) -> str: """Applies a list of unary ops to the given DataFrame and returns the SQL - representing the resulting DataFrames.""" + representing the resulting DataFrame.""" array_value = obj._block.expr result, old_names = array_value.compute_values(ops_list) @@ -468,3 +465,23 @@ def _apply_unary_ops( sql = result.session._executor.to_sql(result, enable_cache=False) return sql + + +def _apply_binary_op( + obj: bpd.DataFrame, + op: ops.BinaryOp, + l_arg: str, + r_arg: Union[str, ex.Expression], +) -> str: + """Applies a binary op to the given DataFrame and return the SQL representing + the resulting DataFrame.""" + array_value = obj._block.expr + op_expr = op.as_expr(l_arg, r_arg) + result, col_ids = array_value.compute_values([op_expr]) + + # Rename columns for deterministic golden SQL results. + assert len(col_ids) == 1 + result = result.rename_columns({col_ids[0]: l_arg}).select_columns([l_arg]) + + sql = result.session._executor.to_sql(result, enable_cache=False) + return sql diff --git a/bigframes/version.py b/bigframes/version.py index f8f4376098..5b669176e8 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
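A hypothetical use of the `_apply_binary_op` helper added to `bigframes/testing/utils.py` above, following the snapshot-test pattern used elsewhere in this patch; the test name and the choice of `eq_op` against a constant are assumptions made for illustration.

    from bigframes import operations as ops
    from bigframes.core import expression as ex
    from bigframes.testing import utils
    import bigframes.pandas as bpd


    def test_eq_const(scalar_types_df: bpd.DataFrame, snapshot):
        # Compile `int64_col == 1` to SQL; the helper renames the result column
        # back to the left argument so the golden SQL stays deterministic.
        sql = utils._apply_binary_op(
            scalar_types_df, ops.eq_op, "int64_col", ex.const(1)
        )
        snapshot.assert_match(sql, "out.sql")
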
-__version__ = "2.21.0" +__version__ = "2.22.0" # {x-release-please-start-date} -__release_date__ = "2025-09-17" +__release_date__ = "2025-09-25" # {x-release-please-end} diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index dd08ed17d9..e74bc8579f 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -1245,7 +1245,7 @@ def the_sum(s): cleanup_function_assets(the_sum_mf, session.bqclient, ignore_failures=False) -def test_managed_function_series_where_mask(session, dataset_id, scalars_dfs): +def test_managed_function_series_where_mask_map(session, dataset_id, scalars_dfs): try: # The return type has to be bool type for callable where condition. @@ -1286,6 +1286,13 @@ def _is_positive(s): # Ignore any dtype difference. pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + # Test series.map method. + bf_result = bf_int64_filtered.map(is_positive_mf).to_pandas() + pd_result = pd_int64_filtered.map(_is_positive) + + # Ignore any dtype difference. + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + finally: # Clean up the gcp assets created for the managed function. cleanup_function_assets(is_positive_mf, session.bqclient, ignore_failures=False) diff --git a/tests/system/small/bigquery/test_ai.py b/tests/system/small/bigquery/test_ai.py index be67a0d580..9f6feb0bbc 100644 --- a/tests/system/small/bigquery/test_ai.py +++ b/tests/system/small/bigquery/test_ai.py @@ -12,19 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - +from packaging import version import pandas as pd import pyarrow as pa import pytest +import sqlglot from bigframes import dtypes, series import bigframes.bigquery as bbq import bigframes.pandas as bpd -def test_ai_generate_bool(session): - s1 = bpd.Series(["apple", "bear"], session=session) +def test_ai_function_pandas_input(session): + s1 = pd.Series(["apple", "bear"]) s2 = bpd.Series(["fruit", "tree"], session=session) prompt = (s1, " is a ", s2) @@ -42,12 +42,20 @@ def test_ai_generate_bool(session): ) -def test_ai_generate_bool_with_pandas(session): - s1 = pd.Series(["apple", "bear"]) +def test_ai_function_compile_model_params(session): + if version.Version(sqlglot.__version__) < version.Version("25.18.0"): + pytest.skip( + "Skip test because SQLGLot cannot compile model params to JSON at this version." + ) + + s1 = bpd.Series(["apple", "bear"], session=session) s2 = bpd.Series(["fruit", "tree"], session=session) prompt = (s1, " is a ", s2) + model_params = {"generation_config": {"thinking_config": {"thinking_budget": 0}}} - result = bbq.ai.generate_bool(prompt, endpoint="gemini-2.5-flash") + result = bbq.ai.generate_bool( + prompt, endpoint="gemini-2.5-flash", model_params=model_params + ) assert _contains_no_nulls(result) assert result.dtype == pd.ArrowDtype( @@ -61,20 +69,12 @@ def test_ai_generate_bool_with_pandas(session): ) -def test_ai_generate_bool_with_model_params(session): - if sys.version_info < (3, 12): - pytest.skip( - "Skip test because SQLGLot cannot compile model params to JSON at this env." 
- ) - +def test_ai_generate_bool(session): s1 = bpd.Series(["apple", "bear"], session=session) s2 = bpd.Series(["fruit", "tree"], session=session) prompt = (s1, " is a ", s2) - model_params = {"generation_config": {"thinking_config": {"thinking_budget": 0}}} - result = bbq.ai.generate_bool( - prompt, endpoint="gemini-2.5-flash", model_params=model_params - ) + result = bbq.ai.generate_bool(prompt, endpoint="gemini-2.5-flash") assert _contains_no_nulls(result) assert result.dtype == pd.ArrowDtype( @@ -107,5 +107,44 @@ def test_ai_generate_bool_multi_model(session): ) +def test_ai_generate_int(session): + s = bpd.Series(["Cat"], session=session) + prompt = ("How many legs does a ", s, " have?") + + result = bbq.ai.generate_int(prompt, endpoint="gemini-2.5-flash") + + assert _contains_no_nulls(result) + assert result.dtype == pd.ArrowDtype( + pa.struct( + ( + pa.field("result", pa.int64()), + pa.field("full_response", dtypes.JSON_ARROW_TYPE), + pa.field("status", pa.string()), + ) + ) + ) + + +def test_ai_generate_int_multi_model(session): + df = session.from_glob_path( + "gs://bigframes-dev-testing/a_multimodel/images/*", name="image" + ) + + result = bbq.ai.generate_int( + ("How many animals are there in the picture ", df["image"]) + ) + + assert _contains_no_nulls(result) + assert result.dtype == pd.ArrowDtype( + pa.struct( + ( + pa.field("result", pa.int64()), + pa.field("full_response", dtypes.JSON_ARROW_TYPE), + pa.field("status", pa.string()), + ) + ) + ) + + def _contains_no_nulls(s: series.Series) -> bool: return len(s) == s.count() diff --git a/tests/system/small/engines/test_aggregation.py b/tests/system/small/engines/test_aggregation.py index a4a49c622a..9b4efe8cbe 100644 --- a/tests/system/small/engines/test_aggregation.py +++ b/tests/system/small/engines/test_aggregation.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
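A brief sketch of the `Series.map` callable path exercised by the managed-function test above. `is_positive` is assumed to be a BigQuery remote or managed function created elsewhere (for example with `bpd.remote_function`); creating one needs project-specific connection settings that are omitted here, so this snippet is illustrative rather than runnable as-is.

    import bigframes.pandas as bpd

    s = bpd.Series([-2, 0, 3])

    # Callables passed to map() are now routed through Series.apply(), so the
    # same deployed functions that work with apply()/where()/mask() also work
    # with map().
    result = s.map(is_positive)
    result_local = result.to_pandas()
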
+from google.cloud import bigquery import pytest from bigframes.core import agg_expressions, array_value, expression, identifiers, nodes import bigframes.operations.aggregations as agg_ops -from bigframes.session import polars_executor +from bigframes.session import direct_gbq_execution, polars_executor from bigframes.testing.engine_utils import assert_equivalence_execution pytest.importorskip("polars") @@ -70,7 +71,7 @@ def test_engines_aggregate_size( assert_equivalence_execution(node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) @pytest.mark.parametrize( "op", [agg_ops.min_op, agg_ops.max_op, agg_ops.mean_op, agg_ops.sum_op, agg_ops.count_op], @@ -84,6 +85,21 @@ def test_engines_unary_aggregates( assert_equivalence_execution(node, REFERENCE_ENGINE, engine) +def test_sql_engines_median_op_aggregates( + scalars_array_value: array_value.ArrayValue, + bigquery_client: bigquery.Client, +): + node = apply_agg_to_all_valid( + scalars_array_value, + agg_ops.MedianOp(), + ).node + left_engine = direct_gbq_execution.DirectGbqExecutor(bigquery_client) + right_engine = direct_gbq_execution.DirectGbqExecutor( + bigquery_client, compiler="sqlglot" + ) + assert_equivalence_execution(node, left_engine, right_engine) + + @pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) @pytest.mark.parametrize( "grouping_cols", diff --git a/tests/system/small/engines/test_generic_ops.py b/tests/system/small/engines/test_generic_ops.py index fc40b7e59d..fc491d358b 100644 --- a/tests/system/small/engines/test_generic_ops.py +++ b/tests/system/small/engines/test_generic_ops.py @@ -52,7 +52,7 @@ def apply_op( return new_arr -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_int(scalars_array_value: array_value.ArrayValue, engine): arr = apply_op( scalars_array_value, @@ -63,7 +63,7 @@ def test_engines_astype_int(scalars_array_value: array_value.ArrayValue, engine) assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_string_int(scalars_array_value: array_value.ArrayValue, engine): vals = ["1", "100", "-3"] arr, _ = scalars_array_value.compute_values( @@ -78,7 +78,7 @@ def test_engines_astype_string_int(scalars_array_value: array_value.ArrayValue, assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_float(scalars_array_value: array_value.ArrayValue, engine): arr = apply_op( scalars_array_value, @@ -89,7 +89,7 @@ def test_engines_astype_float(scalars_array_value: array_value.ArrayValue, engin assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_string_float( scalars_array_value: array_value.ArrayValue, engine ): @@ -106,7 +106,7 @@ def test_engines_astype_string_float( assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], 
indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_bool(scalars_array_value: array_value.ArrayValue, engine): arr = apply_op( scalars_array_value, ops.AsTypeOp(to_type=bigframes.dtypes.BOOL_DTYPE) @@ -115,7 +115,7 @@ def test_engines_astype_bool(scalars_array_value: array_value.ArrayValue, engine assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_string(scalars_array_value: array_value.ArrayValue, engine): # floats work slightly different with trailing zeroes rn arr = apply_op( @@ -127,7 +127,7 @@ def test_engines_astype_string(scalars_array_value: array_value.ArrayValue, engi assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_numeric(scalars_array_value: array_value.ArrayValue, engine): arr = apply_op( scalars_array_value, @@ -138,7 +138,7 @@ def test_engines_astype_numeric(scalars_array_value: array_value.ArrayValue, eng assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_string_numeric( scalars_array_value: array_value.ArrayValue, engine ): @@ -155,7 +155,7 @@ def test_engines_astype_string_numeric( assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_date(scalars_array_value: array_value.ArrayValue, engine): arr = apply_op( scalars_array_value, @@ -166,7 +166,7 @@ def test_engines_astype_date(scalars_array_value: array_value.ArrayValue, engine assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_string_date( scalars_array_value: array_value.ArrayValue, engine ): @@ -183,7 +183,7 @@ def test_engines_astype_string_date( assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_datetime(scalars_array_value: array_value.ArrayValue, engine): arr = apply_op( scalars_array_value, @@ -194,7 +194,7 @@ def test_engines_astype_datetime(scalars_array_value: array_value.ArrayValue, en assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_string_datetime( scalars_array_value: array_value.ArrayValue, engine ): @@ -211,7 +211,7 @@ def test_engines_astype_string_datetime( assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_timestamp(scalars_array_value: 
array_value.ArrayValue, engine): arr = apply_op( scalars_array_value, @@ -222,7 +222,7 @@ def test_engines_astype_timestamp(scalars_array_value: array_value.ArrayValue, e assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_string_timestamp( scalars_array_value: array_value.ArrayValue, engine ): @@ -243,7 +243,7 @@ def test_engines_astype_string_timestamp( assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_time(scalars_array_value: array_value.ArrayValue, engine): arr = apply_op( scalars_array_value, @@ -254,7 +254,7 @@ def test_engines_astype_time(scalars_array_value: array_value.ArrayValue, engine assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_from_json(scalars_array_value: array_value.ArrayValue, engine): exprs = [ ops.AsTypeOp(to_type=bigframes.dtypes.INT_DTYPE).as_expr( @@ -275,7 +275,7 @@ def test_engines_astype_from_json(scalars_array_value: array_value.ArrayValue, e assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_to_json(scalars_array_value: array_value.ArrayValue, engine): exprs = [ ops.AsTypeOp(to_type=bigframes.dtypes.JSON_DTYPE).as_expr( @@ -298,7 +298,7 @@ def test_engines_astype_to_json(scalars_array_value: array_value.ArrayValue, eng assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_timedelta(scalars_array_value: array_value.ArrayValue, engine): arr = apply_op( scalars_array_value, diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 65a851efc3..3280b16f42 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -19,6 +19,7 @@ import bigframes.features from bigframes.ml import preprocessing +import bigframes.pandas as bpd from bigframes.testing import utils ONE_HOT_ENCODED_DTYPE = ( @@ -62,7 +63,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): pd.testing.assert_frame_equal(result, expected, rtol=0.1) -def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): +def test_standard_scaler_normalizes_fit_transform(new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. 
scaler = preprocessing.StandardScaler() result = scaler.fit_transform( @@ -114,6 +115,37 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui pd.testing.assert_frame_equal(result, expected, rtol=0.1) +def test_standard_scaler_normalizes_non_standard_column_names( + new_penguins_df: bpd.DataFrame, +): + new_penguins_df = new_penguins_df.rename( + columns={ + "culmen_length_mm": "culmen?metric", + "culmen_depth_mm": "culmen/metric", + } + ) + scaler = preprocessing.StandardScaler() + result = scaler.fit_transform( + new_penguins_df[["culmen?metric", "culmen/metric", "flipper_length_mm"]] + ).to_pandas() + + # If standard-scaled correctly, mean should be 0.0 + for column in result.columns: + assert math.isclose(result[column].mean(), 0.0, abs_tol=1e-3) + + expected = pd.DataFrame( + { + "standard_scaled_culmen_metric": [1.313249, -0.20198, -1.111118], + "standard_scaled_culmen_metric_1": [1.17072, -1.272416, 0.101848], + "standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + + def test_standard_scaler_save_load(new_penguins_df, dataset_id): transformer = preprocessing.StandardScaler() transformer.fit( diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index afd1a74dff..d3e868db59 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -236,7 +236,20 @@ def test_reverse(scalars_dfs): @pytest.mark.parametrize( - ["start", "stop"], [(0, 1), (3, 5), (100, 101), (None, 1), (0, 12), (0, None)] + ["start", "stop"], + [ + (0, 1), + (3, 5), + (100, 101), + (None, 1), + (0, 12), + (0, None), + (None, -1), + (-1, None), + (-5, -1), + (1, -1), + (-10, 10), + ], ) def test_slice(scalars_dfs, start, stop): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/tests/system/small/pandas/test_describe.py b/tests/system/small/pandas/test_describe.py index 5971e47997..6f28811512 100644 --- a/tests/system/small/pandas/test_describe.py +++ b/tests/system/small/pandas/test_describe.py @@ -230,3 +230,125 @@ def test_series_describe_temporal(scalars_dfs): check_dtype=False, check_index_type=False, ) + + +def test_df_groupby_describe(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + numeric_columns = [ + "int64_col", + "float64_col", + ] + non_numeric_columns = ["string_col"] + supported_columns = numeric_columns + non_numeric_columns + + bf_full_result = ( + scalars_df.groupby("bool_col")[supported_columns] + .describe(include="all") + .to_pandas() + ) + + pd_full_result = scalars_pandas_df.groupby("bool_col")[supported_columns].describe( + include="all" + ) + + for col in supported_columns: + pd_result = pd_full_result[col] + bf_result = bf_full_result[col] + + if col in numeric_columns: + # Drop quartiles, as they are approximate + bf_min = bf_result["min"] + bf_p25 = bf_result["25%"] + bf_p50 = bf_result["50%"] + bf_p75 = bf_result["75%"] + bf_max = bf_result["max"] + + # Reindex results with the specified keys and their order, because + # the relative order is not important. 
+ bf_result = bf_result.reindex( + columns=["count", "mean", "std", "min", "max"] + ) + pd_result = pd_result.reindex( + columns=["count", "mean", "std", "min", "max"] + ) + + # Double-check that quantiles are at least plausible. + assert ( + (bf_min <= bf_p25) + & (bf_p25 <= bf_p50) + & (bf_p50 <= bf_p50) + & (bf_p75 <= bf_max) + ).all() + else: + # Reindex results with the specified keys and their order, because + # the relative order is not important. + bf_result = bf_result.reindex(columns=["count", "nunique"]) + pd_result = pd_result.reindex(columns=["count", "unique"]) + pandas.testing.assert_frame_equal( + # BF counter part of "unique" is called "nunique" + pd_result.astype("Float64").rename(columns={"unique": "nunique"}), + bf_result, + check_dtype=False, + check_index_type=False, + ) + + +def test_series_groupby_describe(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + numeric_columns = [ + "int64_col", + "float64_col", + ] + non_numeric_columns = ["string_col"] + supported_columns = numeric_columns + non_numeric_columns + + bf_df = scalars_df.groupby("bool_col") + + pd_df = scalars_pandas_df.groupby("bool_col") + + for col in supported_columns: + pd_result = pd_df[col].describe(include="all") + bf_result = bf_df[col].describe(include="all").to_pandas() + + if col in numeric_columns: + # Drop quartiles, as they are approximate + bf_min = bf_result["min"] + bf_p25 = bf_result["25%"] + bf_p50 = bf_result["50%"] + bf_p75 = bf_result["75%"] + bf_max = bf_result["max"] + + # Reindex results with the specified keys and their order, because + # the relative order is not important. + bf_result = bf_result.reindex( + columns=["count", "mean", "std", "min", "max"] + ) + pd_result = pd_result.reindex( + columns=["count", "mean", "std", "min", "max"] + ) + + # Double-check that quantiles are at least plausible. + assert ( + (bf_min <= bf_p25) + & (bf_p25 <= bf_p50) + & (bf_p50 <= bf_p50) + & (bf_p75 <= bf_max) + ).all() + else: + # Reindex results with the specified keys and their order, because + # the relative order is not important. 
+ bf_result = bf_result.reindex(columns=["count", "nunique"]) + pd_result = pd_result.reindex(columns=["count", "unique"]) + pandas.testing.assert_frame_equal( + # BF counter part of "unique" is called "nunique" + pd_result.astype("Float64").rename(columns={"unique": "nunique"}), + bf_result, + check_dtype=False, + check_index_type=False, + ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index bad90d0562..1a942a023e 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3129,8 +3129,6 @@ def test_series_binop_add_different_table( @all_joins def test_join_same_table(scalars_dfs_maybe_ordered, how): bf_df, pd_df = scalars_dfs_maybe_ordered - if not bf_df._session._strictly_ordered and how == "cross": - pytest.skip("Cross join not supported in partial ordering mode.") bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] bf_df_a = bf_df_a.sort_index() @@ -3153,6 +3151,21 @@ def test_join_same_table(scalars_dfs_maybe_ordered, how): assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) +def test_join_incompatible_key_type_error(scalars_dfs): + bf_df, _ = scalars_dfs + + bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] + bf_df_a = bf_df_a.sort_index() + + bf_df_b = bf_df.set_index("date_col")[["float64_col"]] + bf_df_b = bf_df_b[bf_df_b.float64_col > 0] + bf_df_b = bf_df_b.sort_values("float64_col") + + with pytest.raises(TypeError): + # joining incompatible date, int columns + bf_df_a.join(bf_df_b, how="left") + + @all_joins def test_join_different_table( scalars_df_index, scalars_df_2_index, scalars_pandas_df_index, how diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index a82bdf7635..90986c989a 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -638,6 +638,12 @@ def test_index_item_with_empty(session): bf_idx_empty.item() +def test_index_to_list(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.index.to_list() + pd_result = scalars_pandas_df_index.index.to_list() + assert bf_result == pd_result + + @pytest.mark.parametrize( ("key", "value"), [ diff --git a/tests/system/small/test_null_index.py b/tests/system/small/test_null_index.py index a1c7c0f1a3..4aa7ba8c77 100644 --- a/tests/system/small/test_null_index.py +++ b/tests/system/small/test_null_index.py @@ -13,6 +13,8 @@ # limitations under the License. 
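A short usage sketch of the groupby `describe()` behavior covered by the groupby-describe tests earlier in this patch; the frame below is illustrative. Numeric columns report count/mean/std/min/25%/50%/75%/max (quartiles are approximate), while string-like columns report count and nunique.

    import bigframes.pandas as bpd

    df = bpd.DataFrame(
        {
            "team": ["a", "a", "b"],
            "score": [1.0, 3.0, 2.0],
            "player": ["x", "y", "z"],
        }
    )

    # DataFrameGroupBy: one row per group, stats nested under each column label.
    team_stats = df.groupby("team")[["score", "player"]].describe(include="all")

    # SeriesGroupBy: a flat set of stats for the selected value column.
    score_stats = df.groupby("team")["score"].describe()
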
+import io + import pandas as pd import pytest @@ -44,6 +46,38 @@ def test_null_index_materialize(scalars_df_null_index, scalars_pandas_df_default ) +def test_null_index_info(scalars_df_null_index): + expected = ( + "\n" + "NullIndex\n" + "Data columns (total 14 columns):\n" + " # Column Non-Null Count Dtype\n" + "--- ------------- ---------------- ------------------------------\n" + " 0 bool_col 8 non-null boolean\n" + " 1 bytes_col 6 non-null binary[pyarrow]\n" + " 2 date_col 7 non-null date32[day][pyarrow]\n" + " 3 datetime_col 6 non-null timestamp[us][pyarrow]\n" + " 4 geography_col 4 non-null geometry\n" + " 5 int64_col 8 non-null Int64\n" + " 6 int64_too 9 non-null Int64\n" + " 7 numeric_col 6 non-null decimal128(38, 9)[pyarrow]\n" + " 8 float64_col 7 non-null Float64\n" + " 9 rowindex_2 9 non-null Int64\n" + " 10 string_col 8 non-null string\n" + " 11 time_col 6 non-null time64[us][pyarrow]\n" + " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" + " 13 duration_col 7 non-null duration[us][pyarrow]\n" + "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), duration[us][pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" + "memory usage: 1269 bytes\n" + ) + + bf_result = io.StringIO() + + scalars_df_null_index.drop(columns="rowindex").info(buf=bf_result) + + assert expected == bf_result.getvalue() + + def test_null_index_series_repr(scalars_df_null_index, scalars_pandas_df_default_index): bf_result = scalars_df_null_index["int64_too"].head(5).__repr__() pd_result = ( diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 0a761a3a3a..d1a252f8dc 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1919,10 +1919,22 @@ def test_mean(scalars_dfs): assert math.isclose(pd_result, bf_result) -def test_median(scalars_dfs): +@pytest.mark.parametrize( + ("col_name"), + [ + "int64_col", + # Non-numeric column + "bytes_col", + "date_col", + "datetime_col", + "time_col", + "timestamp_col", + "string_col", + ], +) +def test_median(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_col" - bf_result = scalars_df[col_name].median() + bf_result = scalars_df[col_name].median(exact=False) pd_max = scalars_pandas_df[col_name].max() pd_min = scalars_pandas_df[col_name].min() # Median is approximate, so just check for plausibility. @@ -1932,7 +1944,7 @@ def test_median(scalars_dfs): def test_median_exact(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" - bf_result = scalars_df[col_name].median(exact=True) + bf_result = scalars_df[col_name].median() pd_result = scalars_pandas_df[col_name].median() assert math.isclose(pd_result, bf_result) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py new file mode 100644 index 0000000000..a9b26afeef --- /dev/null +++ b/tests/unit/conftest.py @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + + +@pytest.fixture(scope="session") +def polars_session(): + pytest.importorskip("polars") + + from bigframes.testing import polars_session + + return polars_session.TestSession() diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_count/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_count/out.sql new file mode 100644 index 0000000000..01684b4af6 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_count/out.sql @@ -0,0 +1,12 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + COUNT(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql new file mode 100644 index 0000000000..38b6ed9f5c --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + DENSE_RANK() OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `agg_int64` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_max/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_max/out.sql new file mode 100644 index 0000000000..c88fa58d0f --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_max/out.sql @@ -0,0 +1,12 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + MAX(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_mean/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_mean/out.sql new file mode 100644 index 0000000000..6d4bb6f89a --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_mean/out.sql @@ -0,0 +1,27 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `duration_col` AS `bfcol_2` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_1` AS `bfcol_6`, + `bfcol_0` AS `bfcol_7`, + `bfcol_2` AS `bfcol_8` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + AVG(`bfcol_6`) AS `bfcol_12`, + AVG(CAST(`bfcol_7` AS INT64)) AS `bfcol_13`, + CAST(FLOOR(AVG(`bfcol_8`)) AS INT64) AS `bfcol_14`, + CAST(FLOOR(AVG(`bfcol_6`)) AS INT64) AS `bfcol_15` + FROM `bfcte_1` +) +SELECT + `bfcol_12` AS `int64_col`, + `bfcol_13` AS `bool_col`, + `bfcol_14` AS `duration_col`, + `bfcol_15` AS `int64_col_w_floor` +FROM `bfcte_2` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_median/out.sql 
b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_median/out.sql new file mode 100644 index 0000000000..bf7006ef87 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_median/out.sql @@ -0,0 +1,18 @@ +WITH `bfcte_0` AS ( + SELECT + `date_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `string_col` AS `bfcol_2` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + APPROX_QUANTILES(`bfcol_1`, 2)[OFFSET(1)] AS `bfcol_3`, + APPROX_QUANTILES(`bfcol_0`, 2)[OFFSET(1)] AS `bfcol_4`, + APPROX_QUANTILES(`bfcol_2`, 2)[OFFSET(1)] AS `bfcol_5` + FROM `bfcte_0` +) +SELECT + `bfcol_3` AS `int64_col`, + `bfcol_4` AS `date_col`, + `bfcol_5` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_min/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_min/out.sql new file mode 100644 index 0000000000..b067817218 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_min/out.sql @@ -0,0 +1,12 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + MIN(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_rank/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_rank/out.sql new file mode 100644 index 0000000000..5de2330ef6 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_rank/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + RANK() OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `agg_int64` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_size/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_size_unary/out.sql similarity index 73% rename from tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_size/out.sql rename to tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_size_unary/out.sql index 78104eb578..fffb4831b9 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_size/out.sql +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_size_unary/out.sql @@ -1,6 +1,6 @@ WITH `bfcte_0` AS ( SELECT - `string_col` AS `bfcol_0` + `float64_col` AS `bfcol_0` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT @@ -8,5 +8,5 @@ WITH `bfcte_0` AS ( FROM `bfcte_0` ) SELECT - `bfcol_1` AS `string_col_agg` + `bfcol_1` AS `float64_col` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_sum/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_sum/out.sql index e748f71278..be684f6768 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_sum/out.sql +++ 
b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_sum/out.sql @@ -1,12 +1,15 @@ WITH `bfcte_0` AS ( SELECT - `int64_col` AS `bfcol_0` + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT - COALESCE(SUM(`bfcol_0`), 0) AS `bfcol_1` + COALESCE(SUM(`bfcol_1`), 0) AS `bfcol_4`, + COALESCE(SUM(CAST(`bfcol_0` AS INT64)), 0) AS `bfcol_5` FROM `bfcte_0` ) SELECT - `bfcol_1` AS `int64_col_agg` + `bfcol_4` AS `int64_col`, + `bfcol_5` AS `bool_col` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py index d12b4dda17..bf2523930f 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py @@ -12,40 +12,154 @@ # See the License for the specific language governing permissions and # limitations under the License. +import typing + import pytest -from bigframes.core import agg_expressions, array_value, expression, identifiers, nodes +from bigframes.core import agg_expressions as agg_exprs +from bigframes.core import ( + array_value, + expression, + identifiers, + nodes, + ordering, + window_spec, +) from bigframes.operations import aggregations as agg_ops import bigframes.pandas as bpd pytest.importorskip("pytest_snapshot") -def _apply_unary_op(obj: bpd.DataFrame, op: agg_ops.UnaryWindowOp, arg: str) -> str: - agg_node = nodes.AggregateNode( +def _apply_unary_agg_ops( + obj: bpd.DataFrame, + ops_list: typing.Sequence[agg_exprs.UnaryAggregation], + new_names: typing.Sequence[str], +) -> str: + aggs = [(op, identifiers.ColumnId(name)) for op, name in zip(ops_list, new_names)] + + agg_node = nodes.AggregateNode(obj._block.expr.node, aggregations=tuple(aggs)) + result = array_value.ArrayValue(agg_node) + + sql = result.session._executor.to_sql(result, enable_cache=False) + return sql + + +def _apply_unary_window_op( + obj: bpd.DataFrame, + op: agg_exprs.UnaryAggregation, + window_spec: window_spec.WindowSpec, + new_name: str, +) -> str: + win_node = nodes.WindowOpNode( obj._block.expr.node, - aggregations=( - ( - agg_expressions.UnaryAggregation(op, expression.deref(arg)), - identifiers.ColumnId(arg + "_agg"), - ), - ), + expression=op, + window_spec=window_spec, + output_name=identifiers.ColumnId(new_name), ) - result = array_value.ArrayValue(agg_node) + result = array_value.ArrayValue(win_node).select_columns([new_name]) sql = result.session._executor.to_sql(result, enable_cache=False) return sql -def test_size(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["string_col"]] - sql = _apply_unary_op(bf_df, agg_ops.SizeUnaryOp(), "string_col") +def test_count(scalar_types_df: bpd.DataFrame, snapshot): + col_name = "int64_col" + bf_df = scalar_types_df[[col_name]] + agg_expr = agg_ops.CountOp().as_expr(col_name) + sql = _apply_unary_agg_ops(bf_df, [agg_expr], [col_name]) + + snapshot.assert_match(sql, "out.sql") + + +def test_dense_rank(scalar_types_df: bpd.DataFrame, snapshot): + col_name = "int64_col" + bf_df = scalar_types_df[[col_name]] + agg_expr = agg_exprs.UnaryAggregation( + agg_ops.DenseRankOp(), expression.deref(col_name) + ) + window = window_spec.WindowSpec(ordering=(ordering.ascending_over(col_name),)) + sql = _apply_unary_window_op(bf_df, agg_expr, window, "agg_int64") + + snapshot.assert_match(sql, "out.sql") + + +def 
test_max(scalar_types_df: bpd.DataFrame, snapshot): + col_name = "int64_col" + bf_df = scalar_types_df[[col_name]] + agg_expr = agg_ops.MaxOp().as_expr(col_name) + sql = _apply_unary_agg_ops(bf_df, [agg_expr], [col_name]) + + snapshot.assert_match(sql, "out.sql") + + +def test_mean(scalar_types_df: bpd.DataFrame, snapshot): + col_names = ["int64_col", "bool_col", "duration_col"] + bf_df = scalar_types_df[col_names] + bf_df["duration_col"] = bpd.to_timedelta(bf_df["duration_col"], unit="us") + + # The `to_timedelta` call assigns new column ids, so rebuild the name-to-id mapping. + col_names.insert(0, "rowindex") + name2id = { + col_name: col_id + for col_name, col_id in zip(col_names, bf_df._block.expr.column_ids) + } + + agg_ops_map = { + "int64_col": agg_ops.MeanOp().as_expr(name2id["int64_col"]), + "bool_col": agg_ops.MeanOp().as_expr(name2id["bool_col"]), + "duration_col": agg_ops.MeanOp().as_expr(name2id["duration_col"]), + "int64_col_w_floor": agg_ops.MeanOp(should_floor_result=True).as_expr( + name2id["int64_col"] + ), + } + sql = _apply_unary_agg_ops( + bf_df, list(agg_ops_map.values()), list(agg_ops_map.keys()) + ) + + snapshot.assert_match(sql, "out.sql") + + +def test_median(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df + ops_map = { + "int64_col": agg_ops.MedianOp().as_expr("int64_col"), + "date_col": agg_ops.MedianOp().as_expr("date_col"), + "string_col": agg_ops.MedianOp().as_expr("string_col"), + } + sql = _apply_unary_agg_ops(bf_df, list(ops_map.values()), list(ops_map.keys())) + + snapshot.assert_match(sql, "out.sql") + + +def test_min(scalar_types_df: bpd.DataFrame, snapshot): + col_name = "int64_col" + bf_df = scalar_types_df[[col_name]] + agg_expr = agg_ops.MinOp().as_expr(col_name) + sql = _apply_unary_agg_ops(bf_df, [agg_expr], [col_name]) + + snapshot.assert_match(sql, "out.sql") + + +def test_rank(scalar_types_df: bpd.DataFrame, snapshot): + col_name = "int64_col" + bf_df = scalar_types_df[[col_name]] + agg_expr = agg_exprs.UnaryAggregation(agg_ops.RankOp(), expression.deref(col_name)) + + window = window_spec.WindowSpec(ordering=(ordering.ascending_over(col_name),)) + sql = _apply_unary_window_op(bf_df, agg_expr, window, "agg_int64") snapshot.assert_match(sql, "out.sql") def test_sum(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col"]] - sql = _apply_unary_op(bf_df, agg_ops.SumOp(), "int64_col") + bf_df = scalar_types_df[["int64_col", "bool_col"]] + agg_ops_map = { + "int64_col": agg_ops.SumOp().as_expr("int64_col"), + "bool_col": agg_ops.SumOp().as_expr("bool_col"), + } + sql = _apply_unary_agg_ops( + bf_df, list(agg_ops_map.values()), list(agg_ops_map.keys()) + ) snapshot.assert_match(sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_bool/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_bool/out.sql new file mode 100644 index 0000000000..584ccd9ce1 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_bool/out.sql @@ -0,0 +1,18 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + AI.GENERATE_BOOL( + prompt => (`bfcol_0`, ' is the same as ', `bfcol_0`), + connection_id => 'test_connection_id', + endpoint => 'gemini-2.5-flash', + request_type => 'SHARED' + ) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `result` +FROM `bfcte_1` \ No newline at end of file diff --git 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_bool_with_model_param/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_bool_with_model_param/out.sql new file mode 100644 index 0000000000..fca2b965bf --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_bool_with_model_param/out.sql @@ -0,0 +1,18 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + AI.GENERATE_BOOL( + prompt => (`bfcol_0`, ' is the same as ', `bfcol_0`), + connection_id => 'test_connection_id', + request_type => 'SHARED', + model_params => JSON '{}' + ) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `result` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_int/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_int/out.sql new file mode 100644 index 0000000000..e48b64bead --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_int/out.sql @@ -0,0 +1,18 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + AI.GENERATE_INT( + prompt => (`bfcol_0`, ' is the same as ', `bfcol_0`), + connection_id => 'test_connection_id', + endpoint => 'gemini-2.5-flash', + request_type => 'SHARED' + ) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `result` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_int_with_model_param/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_int_with_model_param/out.sql new file mode 100644 index 0000000000..6f406dea18 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_int_with_model_param/out.sql @@ -0,0 +1,18 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + AI.GENERATE_INT( + prompt => (`bfcol_0`, ' is the same as ', `bfcol_0`), + connection_id => 'test_connection_id', + request_type => 'SHARED', + model_params => JSON '{}' + ) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `result` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_mul_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_mul_timedelta/out.sql deleted file mode 100644 index f8752d0a60..0000000000 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_mul_timedelta/out.sql +++ /dev/null @@ -1,43 +0,0 @@ -WITH `bfcte_0` AS ( - SELECT - `int64_col` AS `bfcol_0`, - `rowindex` AS `bfcol_1`, - `timestamp_col` AS `bfcol_2`, - `duration_col` AS `bfcol_3` - FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` -), `bfcte_1` AS ( - SELECT - *, - `bfcol_1` AS `bfcol_8`, - `bfcol_2` AS `bfcol_9`, - `bfcol_0` AS `bfcol_10`, - `bfcol_3` AS `bfcol_11` - FROM `bfcte_0` -), `bfcte_2` AS ( - SELECT - *, - `bfcol_8` AS `bfcol_16`, - `bfcol_9` AS `bfcol_17`, - `bfcol_10` AS `bfcol_18`, - `bfcol_11` AS `bfcol_19`, - CAST(FLOOR(`bfcol_11` * `bfcol_10`) AS INT64) AS `bfcol_20` - FROM `bfcte_1` -), `bfcte_3` AS 
( - SELECT - *, - `bfcol_16` AS `bfcol_26`, - `bfcol_17` AS `bfcol_27`, - `bfcol_18` AS `bfcol_28`, - `bfcol_19` AS `bfcol_29`, - `bfcol_20` AS `bfcol_30`, - CAST(FLOOR(`bfcol_18` * `bfcol_19`) AS INT64) AS `bfcol_31` - FROM `bfcte_2` -) -SELECT - `bfcol_26` AS `rowindex`, - `bfcol_27` AS `timestamp_col`, - `bfcol_28` AS `int64_col`, - `bfcol_29` AS `duration_col`, - `bfcol_30` AS `timedelta_mul_numeric`, - `bfcol_31` AS `numeric_mul_timedelta` -FROM `bfcte_3` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_obj_make_ref/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_make_ref/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_obj_make_ref/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_make_ref/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_eq_null_match/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_eq_null_match/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_eq_null_match/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_eq_null_match/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_eq_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_eq_numeric/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_eq_numeric/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_eq_numeric/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_ge_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ge_numeric/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_ge_numeric/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ge_numeric/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_gt_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_gt_numeric/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_gt_numeric/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_gt_numeric/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_le_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_le_numeric/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_le_numeric/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_le_numeric/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_lt_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_lt_numeric/out.sql similarity index 100% rename from 
tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_lt_numeric/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_lt_numeric/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_ne_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ne_numeric/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_ne_numeric/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ne_numeric/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_add_timedelta/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_timedelta/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_add_timedelta/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_sub_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_sub_timedelta/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_sub_timedelta/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_sub_timedelta/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_bool/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_bool/out.sql new file mode 100644 index 0000000000..440aea9161 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_bool/out.sql @@ -0,0 +1,18 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `float64_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_0` AS `bfcol_2`, + `bfcol_1` <> 0 AS `bfcol_3`, + `bfcol_1` <> 0 AS `bfcol_4` + FROM `bfcte_0` +) +SELECT + `bfcol_2` AS `bool_col`, + `bfcol_3` AS `float64_col`, + `bfcol_4` AS `float64_w_safe` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_float/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_float/out.sql new file mode 100644 index 0000000000..81a8805f47 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_float/out.sql @@ -0,0 +1,17 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CAST(CAST(`bfcol_0` AS INT64) AS FLOAT64) AS `bfcol_1`, + CAST('1.34235e4' AS FLOAT64) AS `bfcol_2`, + SAFE_CAST(SAFE_CAST(`bfcol_0` AS INT64) AS FLOAT64) AS `bfcol_3` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `bool_col`, + `bfcol_2` AS `str_const`, + `bfcol_3` AS `bool_w_safe` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_from_json/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_from_json/out.sql new file mode 100644 index 0000000000..25d51b26b3 --- /dev/null +++ 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_from_json/out.sql @@ -0,0 +1,21 @@ +WITH `bfcte_0` AS ( + SELECT + `json_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`json_types` +), `bfcte_1` AS ( + SELECT + *, + INT64(`bfcol_0`) AS `bfcol_1`, + FLOAT64(`bfcol_0`) AS `bfcol_2`, + BOOL(`bfcol_0`) AS `bfcol_3`, + STRING(`bfcol_0`) AS `bfcol_4`, + SAFE.INT64(`bfcol_0`) AS `bfcol_5` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `int64_col`, + `bfcol_2` AS `float64_col`, + `bfcol_3` AS `bool_col`, + `bfcol_4` AS `string_col`, + `bfcol_5` AS `int64_w_safe` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_int/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_int/out.sql new file mode 100644 index 0000000000..22aa2cf91a --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_int/out.sql @@ -0,0 +1,33 @@ +WITH `bfcte_0` AS ( + SELECT + `datetime_col` AS `bfcol_0`, + `numeric_col` AS `bfcol_1`, + `float64_col` AS `bfcol_2`, + `time_col` AS `bfcol_3`, + `timestamp_col` AS `bfcol_4` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + UNIX_MICROS(CAST(`bfcol_0` AS TIMESTAMP)) AS `bfcol_5`, + UNIX_MICROS(SAFE_CAST(`bfcol_0` AS TIMESTAMP)) AS `bfcol_6`, + TIME_DIFF(CAST(`bfcol_3` AS TIME), '00:00:00', MICROSECOND) AS `bfcol_7`, + TIME_DIFF(SAFE_CAST(`bfcol_3` AS TIME), '00:00:00', MICROSECOND) AS `bfcol_8`, + UNIX_MICROS(`bfcol_4`) AS `bfcol_9`, + CAST(TRUNC(`bfcol_1`) AS INT64) AS `bfcol_10`, + CAST(TRUNC(`bfcol_2`) AS INT64) AS `bfcol_11`, + SAFE_CAST(TRUNC(`bfcol_2`) AS INT64) AS `bfcol_12`, + CAST('100' AS INT64) AS `bfcol_13` + FROM `bfcte_0` +) +SELECT + `bfcol_5` AS `datetime_col`, + `bfcol_6` AS `datetime_w_safe`, + `bfcol_7` AS `time_col`, + `bfcol_8` AS `time_w_safe`, + `bfcol_9` AS `timestamp_col`, + `bfcol_10` AS `numeric_col`, + `bfcol_11` AS `float64_col`, + `bfcol_12` AS `float64_w_safe`, + `bfcol_13` AS `str_const` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_json/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_json/out.sql new file mode 100644 index 0000000000..8230b4a60b --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_json/out.sql @@ -0,0 +1,26 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `float64_col` AS `bfcol_2`, + `string_col` AS `bfcol_3` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + PARSE_JSON(CAST(`bfcol_1` AS STRING)) AS `bfcol_4`, + PARSE_JSON(CAST(`bfcol_2` AS STRING)) AS `bfcol_5`, + PARSE_JSON(CAST(`bfcol_0` AS STRING)) AS `bfcol_6`, + PARSE_JSON(`bfcol_3`) AS `bfcol_7`, + PARSE_JSON(CAST(`bfcol_0` AS STRING)) AS `bfcol_8`, + PARSE_JSON_IN_SAFE(`bfcol_3`) AS `bfcol_9` + FROM `bfcte_0` +) +SELECT + `bfcol_4` AS `int64_col`, + `bfcol_5` AS `float64_col`, + `bfcol_6` AS `bool_col`, + `bfcol_7` AS `string_col`, + `bfcol_8` AS `bool_w_safe`, + `bfcol_9` AS `string_w_safe` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_string/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_string/out.sql new file mode 100644 
index 0000000000..f230a3799e --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_string/out.sql @@ -0,0 +1,18 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CAST(`bfcol_1` AS STRING) AS `bfcol_2`, + INITCAP(CAST(`bfcol_0` AS STRING)) AS `bfcol_3`, + INITCAP(SAFE_CAST(`bfcol_0` AS STRING)) AS `bfcol_4` + FROM `bfcte_0` +) +SELECT + `bfcol_2` AS `int64_col`, + `bfcol_3` AS `bool_col`, + `bfcol_4` AS `bool_w_safe` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_time_like/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_time_like/out.sql new file mode 100644 index 0000000000..141b7ffa9a --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_time_like/out.sql @@ -0,0 +1,19 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CAST(TIMESTAMP_MICROS(`bfcol_0`) AS DATETIME) AS `bfcol_1`, + CAST(TIMESTAMP_MICROS(`bfcol_0`) AS TIME) AS `bfcol_2`, + CAST(TIMESTAMP_MICROS(`bfcol_0`) AS TIMESTAMP) AS `bfcol_3`, + SAFE_CAST(TIMESTAMP_MICROS(`bfcol_0`) AS TIME) AS `bfcol_4` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `int64_to_datetime`, + `bfcol_2` AS `int64_to_time`, + `bfcol_3` AS `int64_to_timestamp`, + `bfcol_4` AS `int64_to_time_safe` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_json_set/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_json_ops/test_json_set/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_json_set/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_json_ops/test_json_set/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_numeric/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_numeric/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_div_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_numeric/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_div_numeric/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_numeric/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_div_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_timedelta/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_div_timedelta/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_timedelta/out.sql diff --git 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_floordiv_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_floordiv_timedelta/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_floordiv_timedelta/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_floordiv_timedelta/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_mul_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_numeric/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_mul_numeric/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_numeric/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_sub_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sub_numeric/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_sub_numeric/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sub_numeric/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_string/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_string_ops/test_add_string/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_string/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_string_ops/test_add_string/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/test_ai_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_ai_ops.py new file mode 100644 index 0000000000..33a257f9a9 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/test_ai_ops.py @@ -0,0 +1,113 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json + +from packaging import version +import pytest +import sqlglot + +from bigframes import dataframe +from bigframes import operations as ops +from bigframes.testing import utils + +pytest.importorskip("pytest_snapshot") + + +def test_ai_generate_bool(scalar_types_df: dataframe.DataFrame, snapshot): + col_name = "string_col" + + op = ops.AIGenerateBool( + prompt_context=(None, " is the same as ", None), + connection_id="test_connection_id", + endpoint="gemini-2.5-flash", + request_type="shared", + model_params=None, + ) + + sql = utils._apply_unary_ops( + scalar_types_df, [op.as_expr(col_name, col_name)], ["result"] + ) + + snapshot.assert_match(sql, "out.sql") + + +def test_ai_generate_bool_with_model_param( + scalar_types_df: dataframe.DataFrame, snapshot +): + if version.Version(sqlglot.__version__) < version.Version("25.18.0"): + pytest.skip( + "Skip test because SQLGlot cannot compile model params to JSON at this version." + ) + + col_name = "string_col" + + op = ops.AIGenerateBool( + prompt_context=(None, " is the same as ", None), + connection_id="test_connection_id", + endpoint=None, + request_type="shared", + model_params=json.dumps(dict()), + ) + + sql = utils._apply_unary_ops( + scalar_types_df, [op.as_expr(col_name, col_name)], ["result"] + ) + + snapshot.assert_match(sql, "out.sql") + + +def test_ai_generate_int(scalar_types_df: dataframe.DataFrame, snapshot): + col_name = "string_col" + + op = ops.AIGenerateInt( + # The prompt does not make semantic sense, but we only care about syntax correctness. + prompt_context=(None, " is the same as ", None), + connection_id="test_connection_id", + endpoint="gemini-2.5-flash", + request_type="shared", + model_params=None, + ) + + sql = utils._apply_unary_ops( + scalar_types_df, [op.as_expr(col_name, col_name)], ["result"] + ) + + snapshot.assert_match(sql, "out.sql") + + +def test_ai_generate_int_with_model_param( + scalar_types_df: dataframe.DataFrame, snapshot +): + if version.Version(sqlglot.__version__) < version.Version("25.18.0"): + pytest.skip( + "Skip test because SQLGlot cannot compile model params to JSON at this version." + ) + + col_name = "string_col" + + op = ops.AIGenerateInt( + # The prompt does not make semantic sense, but we only care about syntax correctness. + prompt_context=(None, " is the same as ", None), + connection_id="test_connection_id", + endpoint=None, + request_type="shared", + model_params=json.dumps(dict()), + ) + + sql = utils._apply_unary_ops( + scalar_types_df, [op.as_expr(col_name, col_name)], ["result"] + ) + + snapshot.assert_match(sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py b/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py deleted file mode 100644 index a2218d0afa..0000000000 --- a/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
- -import typing - -import pandas as pd -import pytest - -from bigframes import operations as ops -import bigframes.core.expression as ex -import bigframes.pandas as bpd - -pytest.importorskip("pytest_snapshot") - - -def _apply_binary_op( - obj: bpd.DataFrame, - op: ops.BinaryOp, - l_arg: str, - r_arg: typing.Union[str, ex.Expression], -) -> str: - array_value = obj._block.expr - op_expr = op.as_expr(l_arg, r_arg) - result, col_ids = array_value.compute_values([op_expr]) - - # Rename columns for deterministic golden SQL results. - assert len(col_ids) == 1 - result = result.rename_columns({col_ids[0]: l_arg}).select_columns([l_arg]) - - sql = result.session._executor.to_sql(result, enable_cache=False) - return sql - - -def test_add_numeric(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col", "bool_col"]] - - bf_df["int_add_int"] = bf_df["int64_col"] + bf_df["int64_col"] - bf_df["int_add_1"] = bf_df["int64_col"] + 1 - - bf_df["int_add_bool"] = bf_df["int64_col"] + bf_df["bool_col"] - bf_df["bool_add_int"] = bf_df["bool_col"] + bf_df["int64_col"] - - snapshot.assert_match(bf_df.sql, "out.sql") - - -def test_add_string(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["string_col"]] - sql = _apply_binary_op(bf_df, ops.add_op, "string_col", ex.const("a")) - - snapshot.assert_match(sql, "out.sql") - - -def test_add_timedelta(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["timestamp_col", "date_col"]] - timedelta = pd.Timedelta(1, unit="d") - - bf_df["date_add_timedelta"] = bf_df["date_col"] + timedelta - bf_df["timestamp_add_timedelta"] = bf_df["timestamp_col"] + timedelta - bf_df["timedelta_add_date"] = timedelta + bf_df["date_col"] - bf_df["timedelta_add_timestamp"] = timedelta + bf_df["timestamp_col"] - bf_df["timedelta_add_timedelta"] = timedelta + timedelta - - snapshot.assert_match(bf_df.sql, "out.sql") - - -def test_add_unsupported_raises(scalar_types_df: bpd.DataFrame): - with pytest.raises(TypeError): - _apply_binary_op(scalar_types_df, ops.add_op, "timestamp_col", "date_col") - - with pytest.raises(TypeError): - _apply_binary_op(scalar_types_df, ops.add_op, "int64_col", "string_col") - - -def test_div_numeric(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col", "bool_col", "float64_col"]] - - bf_df["int_div_int"] = bf_df["int64_col"] / bf_df["int64_col"] - bf_df["int_div_1"] = bf_df["int64_col"] / 1 - bf_df["int_div_0"] = bf_df["int64_col"] / 0.0 - - bf_df["int_div_float"] = bf_df["int64_col"] / bf_df["float64_col"] - bf_df["float_div_int"] = bf_df["float64_col"] / bf_df["int64_col"] - bf_df["float_div_0"] = bf_df["float64_col"] / 0.0 - - bf_df["int_div_bool"] = bf_df["int64_col"] / bf_df["bool_col"] - bf_df["bool_div_int"] = bf_df["bool_col"] / bf_df["int64_col"] - - snapshot.assert_match(bf_df.sql, "out.sql") - - -def test_div_timedelta(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["timestamp_col", "int64_col"]] - timedelta = pd.Timedelta(1, unit="d") - bf_df["timedelta_div_numeric"] = timedelta / bf_df["int64_col"] - - snapshot.assert_match(bf_df.sql, "out.sql") - - -def test_eq_null_match(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col", "bool_col"]] - sql = _apply_binary_op(bf_df, ops.eq_null_match_op, "int64_col", "bool_col") - snapshot.assert_match(sql, "out.sql") - - -def test_eq_numeric(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col", "bool_col"]] - - bf_df["int_ne_int"] = 
bf_df["int64_col"] == bf_df["int64_col"] - bf_df["int_ne_1"] = bf_df["int64_col"] == 1 - - bf_df["int_ne_bool"] = bf_df["int64_col"] == bf_df["bool_col"] - bf_df["bool_ne_int"] = bf_df["bool_col"] == bf_df["int64_col"] - - snapshot.assert_match(bf_df.sql, "out.sql") - - -def test_floordiv_numeric(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col", "bool_col", "float64_col"]] - - bf_df["int_div_int"] = bf_df["int64_col"] // bf_df["int64_col"] - bf_df["int_div_1"] = bf_df["int64_col"] // 1 - bf_df["int_div_0"] = bf_df["int64_col"] // 0.0 - - bf_df["int_div_float"] = bf_df["int64_col"] // bf_df["float64_col"] - bf_df["float_div_int"] = bf_df["float64_col"] // bf_df["int64_col"] - bf_df["float_div_0"] = bf_df["float64_col"] // 0.0 - - bf_df["int_div_bool"] = bf_df["int64_col"] // bf_df["bool_col"] - bf_df["bool_div_int"] = bf_df["bool_col"] // bf_df["int64_col"] - - -def test_floordiv_timedelta(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["timestamp_col", "date_col"]] - timedelta = pd.Timedelta(1, unit="d") - - bf_df["timedelta_div_numeric"] = timedelta // 2 - - snapshot.assert_match(bf_df.sql, "out.sql") - - -def test_gt_numeric(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col", "bool_col"]] - - bf_df["int_gt_int"] = bf_df["int64_col"] > bf_df["int64_col"] - bf_df["int_gt_1"] = bf_df["int64_col"] > 1 - - bf_df["int_gt_bool"] = bf_df["int64_col"] > bf_df["bool_col"] - bf_df["bool_gt_int"] = bf_df["bool_col"] > bf_df["int64_col"] - - snapshot.assert_match(bf_df.sql, "out.sql") - - -def test_ge_numeric(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col", "bool_col"]] - - bf_df["int_ge_int"] = bf_df["int64_col"] >= bf_df["int64_col"] - bf_df["int_ge_1"] = bf_df["int64_col"] >= 1 - - bf_df["int_ge_bool"] = bf_df["int64_col"] >= bf_df["bool_col"] - bf_df["bool_ge_int"] = bf_df["bool_col"] >= bf_df["int64_col"] - - snapshot.assert_match(bf_df.sql, "out.sql") - - -def test_json_set(json_types_df: bpd.DataFrame, snapshot): - bf_df = json_types_df[["json_col"]] - sql = _apply_binary_op( - bf_df, ops.JSONSet(json_path="$.a"), "json_col", ex.const(100) - ) - - snapshot.assert_match(sql, "out.sql") - - -def test_lt_numeric(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col", "bool_col"]] - - bf_df["int_lt_int"] = bf_df["int64_col"] < bf_df["int64_col"] - bf_df["int_lt_1"] = bf_df["int64_col"] < 1 - - bf_df["int_lt_bool"] = bf_df["int64_col"] < bf_df["bool_col"] - bf_df["bool_lt_int"] = bf_df["bool_col"] < bf_df["int64_col"] - - snapshot.assert_match(bf_df.sql, "out.sql") - - -def test_le_numeric(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col", "bool_col"]] - - bf_df["int_le_int"] = bf_df["int64_col"] <= bf_df["int64_col"] - bf_df["int_le_1"] = bf_df["int64_col"] <= 1 - - bf_df["int_le_bool"] = bf_df["int64_col"] <= bf_df["bool_col"] - bf_df["bool_le_int"] = bf_df["bool_col"] <= bf_df["int64_col"] - - snapshot.assert_match(bf_df.sql, "out.sql") - - -def test_sub_numeric(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col", "bool_col"]] - - bf_df["int_add_int"] = bf_df["int64_col"] - bf_df["int64_col"] - bf_df["int_add_1"] = bf_df["int64_col"] - 1 - - bf_df["int_add_bool"] = bf_df["int64_col"] - bf_df["bool_col"] - bf_df["bool_add_int"] = bf_df["bool_col"] - bf_df["int64_col"] - - snapshot.assert_match(bf_df.sql, "out.sql") - - -def test_sub_timedelta(scalar_types_df: bpd.DataFrame, snapshot): - 
bf_df = scalar_types_df[["timestamp_col", "duration_col", "date_col"]] - bf_df["duration_col"] = bpd.to_timedelta(bf_df["duration_col"], unit="us") - - bf_df["date_sub_timedelta"] = bf_df["date_col"] - bf_df["duration_col"] - bf_df["timestamp_sub_timedelta"] = bf_df["timestamp_col"] - bf_df["duration_col"] - bf_df["timestamp_sub_date"] = bf_df["date_col"] - bf_df["date_col"] - bf_df["date_sub_timestamp"] = bf_df["timestamp_col"] - bf_df["timestamp_col"] - bf_df["timedelta_sub_timedelta"] = bf_df["duration_col"] - bf_df["duration_col"] - - snapshot.assert_match(bf_df.sql, "out.sql") - - -def test_sub_unsupported_raises(scalar_types_df: bpd.DataFrame): - with pytest.raises(TypeError): - _apply_binary_op(scalar_types_df, ops.sub_op, "string_col", "string_col") - - with pytest.raises(TypeError): - _apply_binary_op(scalar_types_df, ops.sub_op, "int64_col", "string_col") - - -def test_mul_numeric(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col", "bool_col"]] - - bf_df["int_mul_int"] = bf_df["int64_col"] * bf_df["int64_col"] - bf_df["int_mul_1"] = bf_df["int64_col"] * 1 - - bf_df["int_mul_bool"] = bf_df["int64_col"] * bf_df["bool_col"] - bf_df["bool_mul_int"] = bf_df["bool_col"] * bf_df["int64_col"] - - snapshot.assert_match(bf_df.sql, "out.sql") - - -def test_mul_timedelta(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["timestamp_col", "int64_col", "duration_col"]] - bf_df["duration_col"] = bpd.to_timedelta(bf_df["duration_col"], unit="us") - - bf_df["timedelta_mul_numeric"] = bf_df["duration_col"] * bf_df["int64_col"] - bf_df["numeric_mul_timedelta"] = bf_df["int64_col"] * bf_df["duration_col"] - - snapshot.assert_match(bf_df.sql, "out.sql") - - -def test_obj_make_ref(scalar_types_df: bpd.DataFrame, snapshot): - blob_df = scalar_types_df["string_col"].str.to_blob() - snapshot.assert_match(blob_df.to_frame().sql, "out.sql") - - -def test_ne_numeric(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col", "bool_col"]] - - bf_df["int_ne_int"] = bf_df["int64_col"] != bf_df["int64_col"] - bf_df["int_ne_1"] = bf_df["int64_col"] != 1 - - bf_df["int_ne_bool"] = bf_df["int64_col"] != bf_df["bool_col"] - bf_df["bool_ne_int"] = bf_df["bool_col"] != bf_df["int64_col"] - - snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_blob_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_blob_ops.py index 7876a754ee..80aa22aaac 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_blob_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_blob_ops.py @@ -29,3 +29,8 @@ def test_obj_get_access_url(scalar_types_df: bpd.DataFrame, snapshot): blob_s = scalar_types_df["string_col"].str.to_blob() sql = blob_s.blob.read_url().to_frame().sql snapshot.assert_match(sql, "out.sql") + + +def test_obj_make_ref(scalar_types_df: bpd.DataFrame, snapshot): + blob_df = scalar_types_df["string_col"].str.to_blob() + snapshot.assert_match(blob_df.to_frame().sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py index 9a901687fa..6c3eb64414 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py @@ -42,3 +42,81 @@ def test_is_in(scalar_types_df: bpd.DataFrame, snapshot): sql = utils._apply_unary_ops(bf_df, list(ops_map.values()), list(ops_map.keys())) snapshot.assert_match(sql, 
"out.sql") + + +def test_eq_null_match(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + sql = utils._apply_binary_op(bf_df, ops.eq_null_match_op, "int64_col", "bool_col") + snapshot.assert_match(sql, "out.sql") + + +def test_eq_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_ne_int"] = bf_df["int64_col"] == bf_df["int64_col"] + bf_df["int_ne_1"] = bf_df["int64_col"] == 1 + + bf_df["int_ne_bool"] = bf_df["int64_col"] == bf_df["bool_col"] + bf_df["bool_ne_int"] = bf_df["bool_col"] == bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_gt_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_gt_int"] = bf_df["int64_col"] > bf_df["int64_col"] + bf_df["int_gt_1"] = bf_df["int64_col"] > 1 + + bf_df["int_gt_bool"] = bf_df["int64_col"] > bf_df["bool_col"] + bf_df["bool_gt_int"] = bf_df["bool_col"] > bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_ge_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_ge_int"] = bf_df["int64_col"] >= bf_df["int64_col"] + bf_df["int_ge_1"] = bf_df["int64_col"] >= 1 + + bf_df["int_ge_bool"] = bf_df["int64_col"] >= bf_df["bool_col"] + bf_df["bool_ge_int"] = bf_df["bool_col"] >= bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_lt_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_lt_int"] = bf_df["int64_col"] < bf_df["int64_col"] + bf_df["int_lt_1"] = bf_df["int64_col"] < 1 + + bf_df["int_lt_bool"] = bf_df["int64_col"] < bf_df["bool_col"] + bf_df["bool_lt_int"] = bf_df["bool_col"] < bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_le_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_le_int"] = bf_df["int64_col"] <= bf_df["int64_col"] + bf_df["int_le_1"] = bf_df["int64_col"] <= 1 + + bf_df["int_le_bool"] = bf_df["int64_col"] <= bf_df["bool_col"] + bf_df["bool_le_int"] = bf_df["bool_col"] <= bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_ne_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_ne_int"] = bf_df["int64_col"] != bf_df["int64_col"] + bf_df["int_ne_1"] = bf_df["int64_col"] != 1 + + bf_df["int_ne_bool"] = bf_df["int64_col"] != bf_df["bool_col"] + bf_df["bool_ne_int"] = bf_df["bool_col"] != bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py index 0a8aa320bb..91926e7bdd 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import pandas as pd import pytest from bigframes import operations as ops @@ -215,3 +216,29 @@ def test_iso_year(scalar_types_df: bpd.DataFrame, snapshot): sql = utils._apply_unary_ops(bf_df, [ops.iso_year_op.as_expr(col_name)], [col_name]) snapshot.assert_match(sql, "out.sql") + + +def test_add_timedelta(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col", "date_col"]] + timedelta = pd.Timedelta(1, unit="d") + + bf_df["date_add_timedelta"] = bf_df["date_col"] + timedelta + bf_df["timestamp_add_timedelta"] = bf_df["timestamp_col"] + timedelta + bf_df["timedelta_add_date"] = timedelta + bf_df["date_col"] + bf_df["timedelta_add_timestamp"] = timedelta + bf_df["timestamp_col"] + bf_df["timedelta_add_timedelta"] = timedelta + timedelta + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_sub_timedelta(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col", "duration_col", "date_col"]] + bf_df["duration_col"] = bpd.to_timedelta(bf_df["duration_col"], unit="us") + + bf_df["date_sub_timedelta"] = bf_df["date_col"] - bf_df["duration_col"] + bf_df["timestamp_sub_timedelta"] = bf_df["timestamp_col"] - bf_df["duration_col"] + bf_df["timestamp_sub_date"] = bf_df["date_col"] - bf_df["date_col"] + bf_df["date_sub_timestamp"] = bf_df["timestamp_col"] - bf_df["timestamp_col"] + bf_df["timedelta_sub_timedelta"] = bf_df["duration_col"] - bf_df["duration_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py index 130d34a2fa..d9ae6ab539 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py @@ -14,13 +14,160 @@ import pytest +from bigframes import dtypes from bigframes import operations as ops +from bigframes.core import expression as ex import bigframes.pandas as bpd from bigframes.testing import utils pytest.importorskip("pytest_snapshot") +def test_astype_int(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df + to_type = dtypes.INT_DTYPE + + ops_map = { + "datetime_col": ops.AsTypeOp(to_type=to_type).as_expr("datetime_col"), + "datetime_w_safe": ops.AsTypeOp(to_type=to_type, safe=True).as_expr( + "datetime_col" + ), + "time_col": ops.AsTypeOp(to_type=to_type).as_expr("time_col"), + "time_w_safe": ops.AsTypeOp(to_type=to_type, safe=True).as_expr("time_col"), + "timestamp_col": ops.AsTypeOp(to_type=to_type).as_expr("timestamp_col"), + "numeric_col": ops.AsTypeOp(to_type=to_type).as_expr("numeric_col"), + "float64_col": ops.AsTypeOp(to_type=to_type).as_expr("float64_col"), + "float64_w_safe": ops.AsTypeOp(to_type=to_type, safe=True).as_expr( + "float64_col" + ), + "str_const": ops.AsTypeOp(to_type=to_type).as_expr(ex.const("100")), + } + + sql = utils._apply_unary_ops(bf_df, list(ops_map.values()), list(ops_map.keys())) + snapshot.assert_match(sql, "out.sql") + + +def test_astype_float(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df + to_type = dtypes.FLOAT_DTYPE + + ops_map = { + "bool_col": ops.AsTypeOp(to_type=to_type).as_expr("bool_col"), + "str_const": ops.AsTypeOp(to_type=to_type).as_expr(ex.const("1.34235e4")), + "bool_w_safe": ops.AsTypeOp(to_type=to_type, safe=True).as_expr("bool_col"), + } + sql = utils._apply_unary_ops(bf_df, list(ops_map.values()), list(ops_map.keys())) + snapshot.assert_match(sql, "out.sql") + + +def test_astype_bool(scalar_types_df: 
bpd.DataFrame, snapshot): + bf_df = scalar_types_df + to_type = dtypes.BOOL_DTYPE + + ops_map = { + "bool_col": ops.AsTypeOp(to_type=to_type).as_expr("bool_col"), + "float64_col": ops.AsTypeOp(to_type=to_type).as_expr("float64_col"), + "float64_w_safe": ops.AsTypeOp(to_type=to_type, safe=True).as_expr( + "float64_col" + ), + } + sql = utils._apply_unary_ops(bf_df, list(ops_map.values()), list(ops_map.keys())) + snapshot.assert_match(sql, "out.sql") + + +def test_astype_time_like(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df + + ops_map = { + "int64_to_datetime": ops.AsTypeOp(to_type=dtypes.DATETIME_DTYPE).as_expr( + "int64_col" + ), + "int64_to_time": ops.AsTypeOp(to_type=dtypes.TIME_DTYPE).as_expr("int64_col"), + "int64_to_timestamp": ops.AsTypeOp(to_type=dtypes.TIMESTAMP_DTYPE).as_expr( + "int64_col" + ), + "int64_to_time_safe": ops.AsTypeOp( + to_type=dtypes.TIME_DTYPE, safe=True + ).as_expr("int64_col"), + } + sql = utils._apply_unary_ops(bf_df, list(ops_map.values()), list(ops_map.keys())) + snapshot.assert_match(sql, "out.sql") + + +def test_astype_string(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df + to_type = dtypes.STRING_DTYPE + + ops_map = { + "int64_col": ops.AsTypeOp(to_type=to_type).as_expr("int64_col"), + "bool_col": ops.AsTypeOp(to_type=to_type).as_expr("bool_col"), + "bool_w_safe": ops.AsTypeOp(to_type=to_type, safe=True).as_expr("bool_col"), + } + sql = utils._apply_unary_ops(bf_df, list(ops_map.values()), list(ops_map.keys())) + snapshot.assert_match(sql, "out.sql") + + +def test_astype_json(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df + + ops_map = { + "int64_col": ops.AsTypeOp(to_type=dtypes.JSON_DTYPE).as_expr("int64_col"), + "float64_col": ops.AsTypeOp(to_type=dtypes.JSON_DTYPE).as_expr("float64_col"), + "bool_col": ops.AsTypeOp(to_type=dtypes.JSON_DTYPE).as_expr("bool_col"), + "string_col": ops.AsTypeOp(to_type=dtypes.JSON_DTYPE).as_expr("string_col"), + "bool_w_safe": ops.AsTypeOp(to_type=dtypes.JSON_DTYPE, safe=True).as_expr( + "bool_col" + ), + "string_w_safe": ops.AsTypeOp(to_type=dtypes.JSON_DTYPE, safe=True).as_expr( + "string_col" + ), + } + sql = utils._apply_unary_ops(bf_df, list(ops_map.values()), list(ops_map.keys())) + snapshot.assert_match(sql, "out.sql") + + +def test_astype_from_json(json_types_df: bpd.DataFrame, snapshot): + bf_df = json_types_df + + ops_map = { + "int64_col": ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr("json_col"), + "float64_col": ops.AsTypeOp(to_type=dtypes.FLOAT_DTYPE).as_expr("json_col"), + "bool_col": ops.AsTypeOp(to_type=dtypes.BOOL_DTYPE).as_expr("json_col"), + "string_col": ops.AsTypeOp(to_type=dtypes.STRING_DTYPE).as_expr("json_col"), + "int64_w_safe": ops.AsTypeOp(to_type=dtypes.INT_DTYPE, safe=True).as_expr( + "json_col" + ), + } + sql = utils._apply_unary_ops(bf_df, list(ops_map.values()), list(ops_map.keys())) + snapshot.assert_match(sql, "out.sql") + + +def test_astype_json_invalid( + scalar_types_df: bpd.DataFrame, json_types_df: bpd.DataFrame +): + # Test invalid cast to JSON + with pytest.raises(TypeError, match="Cannot cast timestamp.* to .*json.*"): + ops_map_to = { + "datetime_to_json": ops.AsTypeOp(to_type=dtypes.JSON_DTYPE).as_expr( + "datetime_col" + ), + } + utils._apply_unary_ops( + scalar_types_df, list(ops_map_to.values()), list(ops_map_to.keys()) + ) + + # Test invalid cast from JSON + with pytest.raises(TypeError, match="Cannot cast .*json.* to timestamp.*"): + ops_map_from = { + "json_to_datetime": 
ops.AsTypeOp(to_type=dtypes.DATETIME_DTYPE).as_expr( + "json_col" + ), + } + utils._apply_unary_ops( + json_types_df, list(ops_map_from.values()), list(ops_map_from.keys()) + ) + + def test_hash(scalar_types_df: bpd.DataFrame, snapshot): col_name = "string_col" bf_df = scalar_types_df[[col_name]] diff --git a/tests/unit/core/compile/sqlglot/expressions/test_json_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_json_ops.py index ecbac10ef2..75206091e0 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_json_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_json_ops.py @@ -15,6 +15,7 @@ import pytest from bigframes import operations as ops +import bigframes.core.expression as ex import bigframes.pandas as bpd from bigframes.testing import utils @@ -97,3 +98,12 @@ def test_to_json_string(json_types_df: bpd.DataFrame, snapshot): ) snapshot.assert_match(sql, "out.sql") + + +def test_json_set(json_types_df: bpd.DataFrame, snapshot): + bf_df = json_types_df[["json_col"]] + sql = utils._apply_binary_op( + bf_df, ops.JSONSet(json_path="$.a"), "json_col", ex.const(100) + ) + + snapshot.assert_match(sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py index 10fd4b2427..e0c41857e9 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pandas as pd import pytest from bigframes import operations as ops @@ -211,3 +212,88 @@ def test_tanh(scalar_types_df: bpd.DataFrame, snapshot): sql = utils._apply_unary_ops(bf_df, [ops.tanh_op.as_expr(col_name)], [col_name]) snapshot.assert_match(sql, "out.sql") + + +def test_add_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_add_int"] = bf_df["int64_col"] + bf_df["int64_col"] + bf_df["int_add_1"] = bf_df["int64_col"] + 1 + + bf_df["int_add_bool"] = bf_df["int64_col"] + bf_df["bool_col"] + bf_df["bool_add_int"] = bf_df["bool_col"] + bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_div_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col", "float64_col"]] + + bf_df["int_div_int"] = bf_df["int64_col"] / bf_df["int64_col"] + bf_df["int_div_1"] = bf_df["int64_col"] / 1 + bf_df["int_div_0"] = bf_df["int64_col"] / 0.0 + + bf_df["int_div_float"] = bf_df["int64_col"] / bf_df["float64_col"] + bf_df["float_div_int"] = bf_df["float64_col"] / bf_df["int64_col"] + bf_df["float_div_0"] = bf_df["float64_col"] / 0.0 + + bf_df["int_div_bool"] = bf_df["int64_col"] / bf_df["bool_col"] + bf_df["bool_div_int"] = bf_df["bool_col"] / bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_div_timedelta(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col", "int64_col"]] + timedelta = pd.Timedelta(1, unit="d") + bf_df["timedelta_div_numeric"] = timedelta / bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_floordiv_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col", "float64_col"]] + + bf_df["int_div_int"] = bf_df["int64_col"] // bf_df["int64_col"] + bf_df["int_div_1"] = bf_df["int64_col"] // 1 + bf_df["int_div_0"] = bf_df["int64_col"] // 0.0 + + 
bf_df["int_div_float"] = bf_df["int64_col"] // bf_df["float64_col"] + bf_df["float_div_int"] = bf_df["float64_col"] // bf_df["int64_col"] + bf_df["float_div_0"] = bf_df["float64_col"] // 0.0 + + bf_df["int_div_bool"] = bf_df["int64_col"] // bf_df["bool_col"] + bf_df["bool_div_int"] = bf_df["bool_col"] // bf_df["int64_col"] + + +def test_floordiv_timedelta(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col", "date_col"]] + timedelta = pd.Timedelta(1, unit="d") + + bf_df["timedelta_div_numeric"] = timedelta // 2 + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_mul_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_mul_int"] = bf_df["int64_col"] * bf_df["int64_col"] + bf_df["int_mul_1"] = bf_df["int64_col"] * 1 + + bf_df["int_mul_bool"] = bf_df["int64_col"] * bf_df["bool_col"] + bf_df["bool_mul_int"] = bf_df["bool_col"] * bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_sub_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_add_int"] = bf_df["int64_col"] - bf_df["int64_col"] + bf_df["int_add_1"] = bf_df["int64_col"] - 1 + + bf_df["int_add_bool"] = bf_df["int64_col"] - bf_df["bool_col"] + bf_df["bool_add_int"] = bf_df["bool_col"] - bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py index 79c67a09ca..9121334811 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py @@ -15,6 +15,7 @@ import pytest from bigframes import operations as ops +import bigframes.core.expression as ex import bigframes.pandas as bpd from bigframes.testing import utils @@ -303,3 +304,10 @@ def test_zfill(scalar_types_df: bpd.DataFrame, snapshot): bf_df, [ops.ZfillOp(width=10).as_expr(col_name)], [col_name] ) snapshot.assert_match(sql, "out.sql") + + +def test_add_string(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + sql = utils._apply_binary_op(bf_df, ops.add_op, "string_col", ex.const("a")) + + snapshot.assert_match(sql, "out.sql") diff --git a/tests/unit/core/test_groupby.py b/tests/unit/core/test_groupby.py new file mode 100644 index 0000000000..f3d9218123 --- /dev/null +++ b/tests/unit/core/test_groupby.py @@ -0,0 +1,263 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pandas as pd +import pandas.testing +import pytest + +import bigframes.core.utils as utils +import bigframes.pandas as bpd + +pytest.importorskip("polars") +pytest.importorskip("pandas", minversion="2.0.0") + + +def test_groupby_df_iter_by_key_singular(polars_session): + pd_df = pd.DataFrame({"colA": ["a", "a", "b", "c", "c"], "colB": [1, 2, 3, 4, 5]}) + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + for bf_group, pd_group in zip(bf_df.groupby("colA"), pd_df.groupby("colA")): # type: ignore + bf_key, bf_group_df = bf_group + bf_result = bf_group_df.to_pandas() + pd_key, pd_result = pd_group + assert bf_key == pd_key + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_groupby_df_iter_by_key_list(polars_session): + pd_df = pd.DataFrame({"colA": ["a", "a", "b", "c", "c"], "colB": [1, 2, 3, 4, 5]}) + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + for bf_group, pd_group in zip(bf_df.groupby(["colA"]), pd_df.groupby(["colA"])): # type: ignore + bf_key, bf_group_df = bf_group + bf_result = bf_group_df.to_pandas() + pd_key, pd_result = pd_group + assert bf_key == pd_key + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_groupby_df_iter_by_key_list_multiple(polars_session): + pd_df = pd.DataFrame( + { + "colA": ["a", "a", "b", "c", "c"], + "colB": [1, 2, 3, 4, 5], + "colC": [True, False, True, False, True], + } + ) + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + for bf_group, pd_group in zip( # type: ignore + bf_df.groupby(["colA", "colB"]), pd_df.groupby(["colA", "colB"]) + ): + bf_key, bf_group_df = bf_group + bf_result = bf_group_df.to_pandas() + pd_key, pd_result = pd_group + assert bf_key == pd_key + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_groupby_df_iter_by_level_singular(polars_session): + pd_df = pd.DataFrame( + {"colA": ["a", "a", "b", "c", "c"], "colB": [1, 2, 3, 4, 5]} + ).set_index("colA") + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + for bf_group, pd_group in zip(bf_df.groupby(level=0), pd_df.groupby(level=0)): # type: ignore + bf_key, bf_group_df = bf_group + bf_result = bf_group_df.to_pandas() + pd_key, pd_result = pd_group + assert bf_key == pd_key + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_groupby_df_iter_by_level_list_one_item(polars_session): + pd_df = pd.DataFrame( + {"colA": ["a", "a", "b", "c", "c"], "colB": [1, 2, 3, 4, 5]} + ).set_index("colA") + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + for bf_group, pd_group in zip(bf_df.groupby(level=[0]), pd_df.groupby(level=[0])): # type: ignore + bf_key, bf_group_df = bf_group + bf_result = bf_group_df.to_pandas() + pd_key, pd_result = pd_group + + # In pandas 2.x, we get a warning from pandas: "Creating a Groupby + # object with a length-1 list-like level parameter will yield indexes + # as tuples in a future version. To keep indexes as scalars, create + # Groupby objects with a scalar level parameter instead. 
+ if utils.is_list_like(pd_key): + assert bf_key == tuple(pd_key) + else: + assert bf_key == (pd_key,) + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_groupby_df_iter_by_level_list_multiple(polars_session): + pd_df = pd.DataFrame( + { + "colA": ["a", "a", "b", "c", "c"], + "colB": [1, 2, 3, 4, 5], + "colC": [True, False, True, False, True], + } + ).set_index(["colA", "colB"]) + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + for bf_group, pd_group in zip( # type: ignore + bf_df.groupby(level=[0, 1]), pd_df.groupby(level=[0, 1]) + ): + bf_key, bf_group_df = bf_group + bf_result = bf_group_df.to_pandas() + pd_key, pd_result = pd_group + assert bf_key == pd_key + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_groupby_series_iter_by_level_singular(polars_session): + series_index = ["a", "a", "b"] + pd_series = pd.Series([1, 2, 3], index=series_index) + bf_series = bpd.Series(pd_series, session=polars_session) + bf_series.name = pd_series.name + + for bf_group, pd_group in zip( # type: ignore + bf_series.groupby(level=0), pd_series.groupby(level=0) + ): + bf_key, bf_group_series = bf_group + bf_result = bf_group_series.to_pandas() + pd_key, pd_result = pd_group + assert bf_key == pd_key + pandas.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_groupby_series_iter_by_level_list_one_item(polars_session): + series_index = ["a", "a", "b"] + pd_series = pd.Series([1, 2, 3], index=series_index) + bf_series = bpd.Series(pd_series, session=polars_session) + bf_series.name = pd_series.name + + for bf_group, pd_group in zip( # type: ignore + bf_series.groupby(level=[0]), pd_series.groupby(level=[0]) + ): + bf_key, bf_group_series = bf_group + bf_result = bf_group_series.to_pandas() + pd_key, pd_result = pd_group + + # In pandas 2.x, we get a warning from pandas: "Creating a Groupby + # object with a length-1 list-like level parameter will yield indexes + # as tuples in a future version. To keep indexes as scalars, create + # Groupby objects with a scalar level parameter instead. 
+ if utils.is_list_like(pd_key): + assert bf_key == tuple(pd_key) + else: + assert bf_key == (pd_key,) + pandas.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_groupby_series_iter_by_level_list_multiple(polars_session): + pd_df = pd.DataFrame( + { + "colA": ["a", "a", "b", "c", "c"], + "colB": [1, 2, 3, 4, 5], + "colC": [True, False, True, False, True], + } + ).set_index(["colA", "colB"]) + pd_series = pd_df["colC"] + bf_df = bpd.DataFrame(pd_df, session=polars_session) + bf_series = bf_df["colC"] + + for bf_group, pd_group in zip( # type: ignore + bf_series.groupby(level=[0, 1]), pd_series.groupby(level=[0, 1]) + ): + bf_key, bf_group_df = bf_group + bf_result = bf_group_df.to_pandas() + pd_key, pd_result = pd_group + assert bf_key == pd_key + pandas.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_groupby_series_iter_by_series(polars_session): + pd_groups = pd.Series(["a", "a", "b"]) + bf_groups = bpd.Series(pd_groups, session=polars_session) + pd_series = pd.Series([1, 2, 3]) + bf_series = bpd.Series(pd_series, session=polars_session) + bf_series.name = pd_series.name + + for bf_group, pd_group in zip( # type: ignore + bf_series.groupby(bf_groups), pd_series.groupby(pd_groups) + ): + bf_key, bf_group_series = bf_group + bf_result = bf_group_series.to_pandas() + pd_key, pd_result = pd_group + assert bf_key == pd_key + pandas.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_groupby_series_iter_by_series_list_one_item(polars_session): + pd_groups = pd.Series(["a", "a", "b"]) + bf_groups = bpd.Series(pd_groups, session=polars_session) + pd_series = pd.Series([1, 2, 3]) + bf_series = bpd.Series(pd_series, session=polars_session) + bf_series.name = pd_series.name + + for bf_group, pd_group in zip( # type: ignore + bf_series.groupby([bf_groups]), pd_series.groupby([pd_groups]) + ): + bf_key, bf_group_series = bf_group + bf_result = bf_group_series.to_pandas() + pd_key, pd_result = pd_group + assert bf_key == pd_key + pandas.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_groupby_series_iter_by_series_list_multiple(polars_session): + pd_group_a = pd.Series(["a", "a", "b", "c", "c"]) + bf_group_a = bpd.Series(pd_group_a, session=polars_session) + pd_group_b = pd.Series([0, 0, 0, 1, 1]) + bf_group_b = bpd.Series(pd_group_b, session=polars_session) + pd_series = pd.Series([1, 2, 3, 4, 5]) + bf_series = bpd.Series(pd_series, session=polars_session) + bf_series.name = pd_series.name + + for bf_group, pd_group in zip( # type: ignore + bf_series.groupby([bf_group_a, bf_group_b]), + pd_series.groupby([pd_group_a, pd_group_b]), + ): + bf_key, bf_group_series = bf_group + bf_result = bf_group_series.to_pandas() + pd_key, pd_result = pd_group + assert bf_key == pd_key + pandas.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index d630380e7a..6aaccd644e 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -13,9 +13,11 @@ # limitations under the License. 
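
The grouper-Series tests above all check the same `GroupBy.__iter__` contract from the pandas side: iteration yields `(key, subset)` pairs, and grouping by a list of keys yields tuple keys. A short pandas-only sketch of that contract (plain in-memory data, no BigFrames session):

import pandas as pd

values = pd.Series([1, 2, 3, 4, 5])
key_a = pd.Series(["a", "a", "b", "c", "c"])
key_b = pd.Series([0, 0, 0, 1, 1])

# A single grouper Series yields scalar keys.
single = [(key, subset.tolist()) for key, subset in values.groupby(key_a)]
assert single == [("a", [1, 2]), ("b", [3]), ("c", [4, 5])]

# A list of grouper Series yields tuple keys, one element per grouper.
multi = [(key, subset.tolist()) for key, subset in values.groupby([key_a, key_b])]
assert multi == [(("a", 0), [1, 2]), (("b", 0), [3]), (("c", 1), [4, 5])]
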
import google.cloud.bigquery +import pandas as pd import pytest import bigframes.dataframe +import bigframes.session from bigframes.testing import mocks @@ -129,6 +131,38 @@ def test_dataframe_rename_axis_inplace_returns_none(monkeypatch: pytest.MonkeyPa assert list(dataframe.index.names) == ["a", "b"] +def test_dataframe_drop_columns_inplace_returns_none(monkeypatch: pytest.MonkeyPatch): + dataframe = mocks.create_dataframe( + monkeypatch, data={"col1": [1], "col2": [2], "col3": [3]} + ) + assert dataframe.columns.to_list() == ["col1", "col2", "col3"] + assert dataframe.drop(columns=["col1", "col3"], inplace=True) is None + assert dataframe.columns.to_list() == ["col2"] + + +def test_dataframe_drop_index_inplace_returns_none( + # Drop index depends on the actual data, not just metadata, so use the + # local engine for more robust testing. + polars_session: bigframes.session.Session, +): + dataframe = polars_session.read_pandas( + pd.DataFrame({"col1": [1, 2, 3], "index_col": [0, 1, 2]}).set_index("index_col") + ) + assert dataframe.index.to_list() == [0, 1, 2] + assert dataframe.drop(index=[0, 2], inplace=True) is None + assert dataframe.index.to_list() == [1] + + +def test_dataframe_drop_columns_returns_new_dataframe(monkeypatch: pytest.MonkeyPatch): + dataframe = mocks.create_dataframe( + monkeypatch, data={"col1": [1], "col2": [2], "col3": [3]} + ) + assert dataframe.columns.to_list() == ["col1", "col2", "col3"] + new_dataframe = dataframe.drop(columns=["col1", "col3"]) + assert dataframe.columns.to_list() == ["col1", "col2", "col3"] + assert new_dataframe.columns.to_list() == ["col2"] + + def test_dataframe_semantics_property_future_warning( monkeypatch: pytest.MonkeyPatch, ): diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py index 97f1e4419e..b875d56e7a 100644 --- a/tests/unit/test_index.py +++ b/tests/unit/test_index.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pandas as pd import pytest from bigframes.testing import mocks @@ -38,3 +39,13 @@ def test_index_rename_inplace_returns_none(monkeypatch: pytest.MonkeyPatch): # Make sure the linked DataFrame is updated, too. assert dataframe.index.name == "my_index_name" assert index.name == "my_index_name" + + +def test_index_to_list(monkeypatch: pytest.MonkeyPatch): + pd_index = pd.Index([1, 2, 3], name="my_index") + df = mocks.create_dataframe( + monkeypatch, + data={"my_index": [1, 2, 3]}, + ).set_index("my_index") + bf_index = df.index + assert bf_index.to_list() == pd_index.to_list() diff --git a/tests/unit/test_local_engine.py b/tests/unit/test_local_engine.py index 509bc6ade2..7d3d532d88 100644 --- a/tests/unit/test_local_engine.py +++ b/tests/unit/test_local_engine.py @@ -24,14 +24,6 @@ pytest.importorskip("pandas", minversion="2.0.0") -# All tests in this file require polars to be installed to pass. -@pytest.fixture(scope="module") -def polars_session(): - from bigframes.testing import polars_session - - return polars_session.TestSession() - - @pytest.fixture(scope="module") def small_inline_frame() -> pd.DataFrame: df = pd.DataFrame( diff --git a/tests/unit/test_notebook.py b/tests/unit/test_notebook.py index a41854fb29..3feacd52b2 100644 --- a/tests/unit/test_notebook.py +++ b/tests/unit/test_notebook.py @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
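
The drop and `to_list` tests above assert the pandas contract that BigFrames now follows: `inplace=True` mutates the object and returns `None`, the default returns a new object and leaves the original untouched, and `Index.to_list()` materializes the index as a plain Python list. A brief pandas-only illustration of that contract (hypothetical column names):

import pandas as pd

df = pd.DataFrame({"col1": [1], "col2": [2], "col3": [3]})

# inplace=True mutates the frame and returns None.
assert df.drop(columns=["col1", "col3"], inplace=True) is None
assert df.columns.to_list() == ["col2"]

# Without inplace, drop returns a new frame and leaves the original as-is.
df2 = pd.DataFrame({"col1": [1], "col2": [2]})
dropped = df2.drop(columns=["col1"])
assert df2.columns.to_list() == ["col1", "col2"]
assert dropped.columns.to_list() == ["col2"]

# Index.to_list returns the index values as a Python list.
assert pd.Index([1, 2, 3], name="my_index").to_list() == [1, 2, 3]
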
+import pathlib -import os.path +REPO_ROOT = pathlib.Path(__file__).parent.parent.parent def test_template_notebook_exists(): # This notebook is meant for being used as a BigFrames usage template and # could be dynamically linked in places such as BQ Studio and IDE extensions. # Let's make sure it exists in the well known path. - assert os.path.exists("notebooks/getting_started/bq_dataframes_template.ipynb") + assert ( + REPO_ROOT / "notebooks" / "getting_started" / "bq_dataframes_template.ipynb" + ).exists() diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py index 6ea11d5215..ef150534ee 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py @@ -1105,9 +1105,14 @@ def visit_StringAgg(self, op, *, arg, sep, order_by, where): return self.agg.string_agg(expr, sep, where=where) def visit_AIGenerateBool(self, op, **kwargs): - func_name = "AI.GENERATE_BOOL" + return sge.func("AI.GENERATE_BOOL", *self._compile_ai_args(**kwargs)) + def visit_AIGenerateInt(self, op, **kwargs): + return sge.func("AI.GENERATE_INT", *self._compile_ai_args(**kwargs)) + + def _compile_ai_args(self, **kwargs): args = [] + for key, val in kwargs.items(): if val is None: continue @@ -1117,7 +1122,7 @@ def visit_AIGenerateBool(self, op, **kwargs): args.append(sge.Kwarg(this=sge.Identifier(this=key), expression=val)) - return sge.func(func_name, *args) + return args def visit_FirstNonNullValue(self, op, *, arg): return sge.IgnoreNulls(this=sge.FirstValue(this=arg)) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py b/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py index 1f8306bad6..4b855f71c0 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py @@ -30,3 +30,22 @@ def dtype(self) -> dt.Struct: return dt.Struct.from_tuples( (("result", dt.bool), ("full_resposne", dt.string), ("status", dt.string)) ) + + +@public +class AIGenerateInt(Value): + """Generate integers based on the prompt""" + + prompt: Value + connection_id: Value[dt.String] + endpoint: Optional[Value[dt.String]] + request_type: Value[dt.String] + model_params: Optional[Value[dt.String]] + + shape = rlz.shape_like("prompt") + + @attribute + def dtype(self) -> dt.Struct: + return dt.Struct.from_tuples( + (("result", dt.int64), ("full_resposne", dt.string), ("status", dt.string)) + ) diff --git a/third_party/bigframes_vendored/ibis/expr/rewrites.py b/third_party/bigframes_vendored/ibis/expr/rewrites.py index b0569846da..779a5081ca 100644 --- a/third_party/bigframes_vendored/ibis/expr/rewrites.py +++ b/third_party/bigframes_vendored/ibis/expr/rewrites.py @@ -206,21 +206,26 @@ def replace_parameter(_, params, **kwargs): @replace(p.StringSlice) def lower_stringslice(_, **kwargs): """Rewrite StringSlice in terms of Substring.""" - if _.end is None: - return ops.Substring(_.arg, start=_.start) if _.start is None: - return ops.Substring(_.arg, start=0, length=_.end) - if ( - isinstance(_.start, ops.Literal) - and isinstance(_.start.value, int) - and isinstance(_.end, ops.Literal) - and isinstance(_.end.value, int) - ): - # optimization for constant values - length = _.end.value - _.start.value + real_start = 0 else: - length = ops.Subtract(_.end, _.start) - return ops.Substring(_.arg, start=_.start, 
length=length) + real_start = ops.IfElse( + ops.GreaterEqual(_.start, 0), + _.start, + ops.Greatest((0, ops.Add(ops.StringLength(_.arg), _.start))), + ) + + if _.end is None: + real_end = ops.StringLength(_.arg) + else: + real_end = ops.IfElse( + ops.GreaterEqual(_.end, 0), + _.end, + ops.Greatest((0, ops.Add(ops.StringLength(_.arg), _.end))), + ) + + length = ops.Greatest((0, ops.Subtract(real_end, real_start))) + return ops.Substring(_.arg, start=real_start, length=length) @replace(p.Analytic) diff --git a/third_party/bigframes_vendored/ibis/expr/types/strings.py b/third_party/bigframes_vendored/ibis/expr/types/strings.py index 85b455e66e..f63cf96e72 100644 --- a/third_party/bigframes_vendored/ibis/expr/types/strings.py +++ b/third_party/bigframes_vendored/ibis/expr/types/strings.py @@ -96,15 +96,6 @@ def __getitem__(self, key: slice | int | ir.IntegerScalar) -> StringValue: if isinstance(step, ir.Expr) or (step is not None and step != 1): raise ValueError("Step can only be 1") - if start is not None and not isinstance(start, ir.Expr) and start < 0: - raise ValueError( - "Negative slicing not yet supported, got start value " - f"of {start:d}" - ) - if stop is not None and not isinstance(stop, ir.Expr) and stop < 0: - raise ValueError( - "Negative slicing not yet supported, got stop value " f"of {stop:d}" - ) if start is None and stop is None: return self return ops.StringSlice(self, start, stop).to_expr() diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 953ece9beb..1d8f5cbace 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4672,10 +4672,10 @@ def join( Another option to join using the key columns is to use the on parameter: - >>> df1.join(df2, on="col1", how="right") + >>> df1.join(df2, on="col2", how="right") col1 col2 col3 col4 - 11 foo 3 - 22 baz 4 + 11 foo 3 + 22 baz 4 [2 rows x 4 columns] diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index b6b91388e3..1e39ec8f94 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -9,6 +9,8 @@ class providing the base-class of operations. """ from __future__ import annotations +from typing import Literal + from bigframes import constants @@ -17,6 +19,64 @@ class GroupBy: Class for grouping and aggregating relational data. """ + def describe(self, include: None | Literal["all"] = None): + """ + Generate descriptive statistics. + + Descriptive statistics include those that summarize the central + tendency, dispersion and shape of a + dataset's distribution, excluding ``NaN`` values. + + Args: + include ("all" or None, optional): + If "all": All columns of the input will be included in the output. + If None: The result will include all numeric columns. + + .. note:: + Percentile values are approximates only. + + .. note:: + For numeric data, the result's index will include ``count``, + ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and + upper percentiles. By default the lower percentile is ``25`` and the + upper percentile is ``75``. The ``50`` percentile is the + same as the median. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 1, 1, 2, 2], "B": [0, 2, 8, 2, 7], "C": ["cat", "cat", "dog", "mouse", "cat"]}) + >>> df + A B C + 0 1 0 cat + 1 1 2 cat + 2 1 8 dog + 3 2 2 mouse + 4 2 7 cat + + [5 rows x 3 columns] + + >>> df.groupby("A").describe(include="all") + B C + count mean std min 25% 50% 75% max count nunique + A + 1 3 3.333333 4.163332 0 0 2 8 8 3 2 + 2 2 4.5 3.535534 2 2 2 7 7 2 2 + + [2 rows x 10 columns] + + Returns: + bigframes.pandas.DataFrame: + Summary statistics of the Series or Dataframe provided. + + Raises: + ValueError: + If unsupported ``include`` type is provided. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def any(self): """ Return True if any value in the group is true, else False. @@ -1199,11 +1259,11 @@ def size(self): **Examples:** - For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None + For SeriesGroupBy: + >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, 3], index=lst) >>> ser @@ -1241,6 +1301,74 @@ def size(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __iter__(self): + r""" + Groupby iterator. + + This method provides an iterator over the groups created by the ``resample`` + or ``groupby`` operation on the object. The method yields tuples where + the first element is the label (group key) corresponding to each group or + resampled bin, and the second element is the subset of the data that falls + within that group or bin. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + For SeriesGroupBy: + + >>> lst = ["a", "a", "b"] + >>> ser = bpd.Series([1, 2, 3], index=lst) + >>> ser + a 1 + a 2 + b 3 + dtype: Int64 + >>> for x, y in ser.groupby(level=0): + ... print(f"{x}\n{y}\n") + a + a 1 + a 2 + dtype: Int64 + b + b 3 + dtype: Int64 + + For DataFrameGroupBy: + + >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] + >>> df = bpd.DataFrame(data, columns=["a", "b", "c"]) + >>> df + a b c + 0 1 2 3 + 1 1 5 6 + 2 7 8 9 + + [3 rows x 3 columns] + >>> for x, y in df.groupby(by=["a"]): + ... print(f'{x}\n{y}\n') + (1,) + a b c + 0 1 2 3 + 1 1 5 6 + + [2 rows x 3 columns] + (7,) + + a b c + 2 7 8 9 + + [1 rows x 3 columns] + + + Returns: + Iterable[Label | Tuple, bigframes.pandas.Series | bigframes.pandas.DataFrame]: + Generator yielding sequence of (name, subsetted object) + for each group. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + class SeriesGroupBy(GroupBy): def agg(self, func): diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index f8f4376098..5b669176e8 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.21.0" +__version__ = "2.22.0" # {x-release-please-start-date} -__release_date__ = "2025-09-17" +__release_date__ = "2025-09-25" # {x-release-please-end}
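
For reference, the `lower_stringslice` rewrite in `third_party/bigframes_vendored/ibis/expr/rewrites.py` above resolves negative slice bounds by clamping `length(arg) + bound` at zero and then clamping the resulting length at zero, so the whole slice can be expressed as a single `Substring(start, length)`. A pure-Python sketch of that arithmetic (hypothetical helper, checked against Python slicing semantics):

def substring_args(s: str, start, end):
    # Mirror the rewrite: resolve negative bounds against len(s), clamped at 0.
    n = len(s)
    real_start = 0 if start is None else (start if start >= 0 else max(0, n + start))
    real_end = n if end is None else (end if end >= 0 else max(0, n + end))
    # The length passed to Substring can never be negative.
    return real_start, max(0, real_end - real_start)

for s, start, end in [("bigframes", -6, -1), ("bigframes", 2, None), ("abc", -10, 2)]:
    real_start, length = substring_args(s, start, end)
    assert s[real_start : real_start + length] == s[start:end]
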