+
+ As of January 1, 2020 this library no longer supports Python 2 on the latest released version.
+ Library versions released prior to that date will continue to be available. For more information please
+ visit Python 2 support on Google Cloud.
+
{% block body %} {% endblock %}
diff --git a/docs/conf.py b/docs/conf.py
index 3ab5be5103..af8c5efda8 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-# Copyright 2021 Google LLC
+# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -25,8 +25,8 @@
# serve to show the default.
import os
+import shlex
import sys
-from typing import Any, Dict
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
@@ -80,8 +80,8 @@
root_doc = "index"
# General information about the project.
-project = "BigQuery DataFrames"
-copyright = "2022-2023 Google LLC"
+project = "bigframes"
+copyright = "2019, Google"
author = "Google APIs"
# The version info for the project you're documenting, acts as replacement for
@@ -98,7 +98,7 @@
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
-language = "en"
+language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
@@ -154,10 +154,10 @@
# further. For a list of options available for each theme, see the
# documentation.
html_theme_options = {
- "description": "BigQuery DataFrames provides DataFrame APIs on the BigQuery engine.",
- # "github_user": "googleapis",
- # "github_repo": "python-bigquery-storage",
- # "github_banner": True,
+ "description": "BigQuery DataFrames provides DataFrame APIs on the BigQuery engine",
+ "github_user": "googleapis",
+ "github_repo": "python-bigquery-dataframes",
+ "github_banner": True,
"font_family": "'Roboto', Georgia, sans",
"head_font_family": "'Roboto', Georgia, serif",
"code_font_family": "'Roboto Mono', 'Consolas', monospace",
@@ -185,7 +185,7 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-# html_static_path = ["_static"]
+html_static_path = ["_static"]
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
@@ -264,13 +264,15 @@
# -- Options for LaTeX output ---------------------------------------------
-latex_elements: Dict[str, Any] = {
- # Avoid "too deeply nested" error by using enumitem package.
- # See: https://stackoverflow.com/a/28454426/101923
- "preamble": r"""
-\usepackage{enumitem}
-\setlistdepth{99}
-"""
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #'papersize': 'letterpaper',
+ # The font size ('10pt', '11pt' or '12pt').
+ #'pointsize': '10pt',
+ # Additional stuff for the LaTeX preamble.
+ #'preamble': '',
+ # Latex figure (float) alignment
+ #'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
@@ -366,6 +368,11 @@
"grpc": ("https://grpc.github.io/grpc/python/", None),
"proto-plus": ("https://proto-plus-python.readthedocs.io/en/latest/", None),
"protobuf": ("https://googleapis.dev/python/protobuf/latest/", None),
+ "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
+ "pydata-google-auth": (
+ "https://pydata-google-auth.readthedocs.io/en/latest/",
+ None,
+ ),
}
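
The two intersphinx entries added above let Sphinx resolve cross-references to the pandas and pydata-google-auth documentation from bigframes docstrings. A minimal sketch of what that enables (the docstring below is illustrative, not copied from the library):

    def to_pandas(self):
        """Execute the query and return the result as a local DataFrame.

        Returns:
            :class:`pandas.DataFrame`: with the mapping above in place,
            Sphinx links this reference to
            https://pandas.pydata.org/pandas-docs/stable/.
        """
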
diff --git a/noxfile.py b/noxfile.py
index 1ceca6831b..033bbfefe4 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -31,7 +31,7 @@
BLACK_VERSION = "black==22.3.0"
ISORT_VERSION = "isort==5.12.0"
SPHINX_VERSION = "sphinx==4.5.0"
-LINT_PATHS = ["docs", "bigframes", "tests", "noxfile.py", "setup.py"]
+LINT_PATHS = ["docs", "bigframes", "tests", "third_party", "noxfile.py", "setup.py"]
DEFAULT_PYTHON_VERSION = "3.10"
@@ -42,6 +42,7 @@
"pytest",
"pytest-cov",
"pytest-asyncio",
+ "pytest-mock",
]
UNIT_TEST_EXTERNAL_DEPENDENCIES: List[str] = []
UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = []
@@ -108,6 +109,7 @@ def lint(session):
"--check",
*LINT_PATHS,
)
+ # TODO(tswast): lint all LINT_PATHS
session.run("flake8", "bigframes", "tests")
diff --git a/owlbot.py b/owlbot.py
index 4ba7d14eb5..be30eea5c2 100644
--- a/owlbot.py
+++ b/owlbot.py
@@ -15,6 +15,7 @@
"""This script is used to synthesize generated parts of this library."""
import pathlib
+import re
from synthtool import gcp
import synthtool as s
@@ -27,11 +28,10 @@
# ----------------------------------------------------------------------------
# Add templated files
# ----------------------------------------------------------------------------
-
templated_files = common.py_library(
unit_test_python_versions=["3.9", "3.10", "3.11"],
system_test_python_versions=["3.9", "3.11"],
- cov_level=40,
+ cov_level=35,
intersphinx_dependencies={
"pandas": "https://pandas.pydata.org/pandas-docs/stable/",
"pydata-google-auth": "https://pydata-google-auth.readthedocs.io/en/latest/",
@@ -40,11 +40,17 @@
s.move(
templated_files,
excludes=[
- # Multi-processing note isn't relevant, as pandas_gbq is responsible for
+ # Multi-processing note isn't relevant, as bigframes is responsible for
# creating clients, not the end user.
"docs/multiprocessing.rst",
"noxfile.py",
+ ".pre-commit-config.yaml",
"README.rst",
+ ".github/release-trigger.yml",
+ # BigQuery DataFrames manages its own Kokoro cluster for presubmit & continuous tests.
+ ".kokoro/build.sh",
+ ".kokoro/continuous/common.cfg",
+ ".kokoro/presubmit/common.cfg",
],
)
@@ -52,6 +58,46 @@
# Fixup files
# ----------------------------------------------------------------------------
+# Make sure build includes all necessary files.
+s.replace(
+ ["MANIFEST.in"],
+ re.escape("recursive-include google"),
+ "recursive-include third_party *\nrecursive-include bigframes",
+)
+
+# Even though BigQuery DataFrames isn't technically a client library, we are
+# opting into Cloud RAD for docs hosting.
+s.replace(
+ [".kokoro/docs/common.cfg"],
+ re.escape('value: "docs-staging-v2-staging"'),
+ 'value: "docs-staging-v2"',
+)
+
+# Use a custom table of contents since the default one isn't organized well
+# enough for the number of classes we have.
+s.replace(
+ [".kokoro/publish-docs.sh"],
+ (
+ re.escape("# upload docs")
+ + "\n"
+ + re.escape(
+ 'python3 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}"'
+ )
+ ),
+ (
+ "# Replace toc.yml template file\n"
+ + "mv docs/templates/toc.yml docs/_build/html/docfx_yaml/toc.yml\n\n"
+ + "# upload docs\n"
+ + 'python3 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}"'
+ ),
+)
+
+# Fixup the documentation.
+s.replace(
+ ["docs/conf.py"],
+ re.escape("Google Cloud Client Libraries for bigframes"),
+ "BigQuery DataFrames provides DataFrame APIs on the BigQuery engine",
+)
# ----------------------------------------------------------------------------
# Samples templates
@@ -63,6 +109,6 @@
# Final cleanup
# ----------------------------------------------------------------------------
-s.shell.run(["nox", "-s", "blacken"], hide_output=False)
+s.shell.run(["nox", "-s", "format"], hide_output=False)
for noxfile in REPO_ROOT.glob("samples/**/noxfile.py"):
s.shell.run(["nox", "-s", "blacken"], cwd=noxfile.parent, hide_output=False)
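
For context, synthtool's `s.replace` used throughout owlbot.py is essentially a per-file regex substitution. The MANIFEST.in fixup above behaves roughly like the following sketch, written against the standard library as an approximation rather than synthtool's actual implementation:

    import pathlib
    import re

    manifest = pathlib.Path("MANIFEST.in")
    text = manifest.read_text()
    # The templated client-library prefix "recursive-include google" becomes
    # two lines covering the directories this repository actually ships.
    text = re.sub(
        re.escape("recursive-include google"),
        "recursive-include third_party *\nrecursive-include bigframes",
        text,
    )
    manifest.write_text(text)
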
diff --git a/renovate.json b/renovate.json
new file mode 100644
index 0000000000..39b2a0ec92
--- /dev/null
+++ b/renovate.json
@@ -0,0 +1,12 @@
+{
+ "extends": [
+ "config:base",
+ "group:all",
+ ":preserveSemverRanges",
+ ":disableDependencyDashboard"
+ ],
+ "ignorePaths": [".pre-commit-config.yaml", ".kokoro/requirements.txt", "setup.py"],
+ "pip_requirements": {
+ "fileMatch": ["requirements-test.txt", "samples/[\\S/]*constraints.txt", "samples/[\\S/]*constraints-test.txt"]
+ }
+}
diff --git a/samples/snippets/remote_function.py b/samples/snippets/remote_function.py
index 37972672c3..9998a23eb2 100644
--- a/samples/snippets/remote_function.py
+++ b/samples/snippets/remote_function.py
@@ -39,11 +39,19 @@ def run_remote_function_and_read_gbq_function(project_id: str):
# already created, BigQuery DataFrames will attempt to create one assuming
# the necessary APIs and IAM permissions are setup in the project. In our
# examples we would be using a pre-created connection named
- # `bigframes-rf-conn`. Let's try a `pandas`-like use case in which we want
- # to apply a user defined scalar function to every value in a `Series`, more
- # specifically bucketize the `body_mass_g` value of the penguins, which is a
- # real number, into a category, which is a string.
- @bpd.remote_function([float], str, bigquery_connection="bigframes-rf-conn")
+ # `bigframes-rf-conn`. We will also set `reuse=False` to make sure we don't
+    # step on someone else creating a remote function in the same project from
+    # the exact same source code at the same time. Let's try a `pandas`-like use
+    # case in which we want to apply a user-defined scalar function to every
+ # value in a `Series`, more specifically bucketize the `body_mass_g` value
+ # of the penguins, which is a real number, into a category, which is a
+ # string.
+ @bpd.remote_function(
+ [float],
+ str,
+ bigquery_connection="bigframes-rf-conn",
+ reuse=False,
+ )
def get_bucket(num):
if not num:
return "NA"
@@ -80,9 +88,11 @@ def get_bucket(num):
# Let's continue trying other potential use cases of remote functions. Let's
# say we consider the `species`, `island` and `sex` of the penguins
# sensitive information and want to redact that by replacing with their hash
- # code instead. Let's define another scalar custom function and decorated it
+ # code instead. Let's define another scalar custom function and decorate it
# as a remote function
- @bpd.remote_function([str], str, bigquery_connection="bigframes-rf-conn")
+ @bpd.remote_function(
+ [str], str, bigquery_connection="bigframes-rf-conn", reuse=False
+ )
def get_hash(input):
import hashlib
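
Once deployed, both remote functions in this snippet are used like ordinary scalar UDFs via `Series.apply`. A minimal sketch, assuming the sample loads the public penguins table with `bpd.read_gbq` (the table path and the exact assignments below are illustrative and may differ from the full snippet):

    import bigframes.pandas as bpd

    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

    # Bucketize the numeric body_mass_g column with the remote function.
    df = df.assign(body_mass_bucket=df["body_mass_g"].apply(get_bucket))

    # Redact a sensitive column by replacing values with their hash.
    df = df.assign(species=df["species"].apply(get_hash))

    print(df[["body_mass_g", "body_mass_bucket", "species"]].head())
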
diff --git a/scripts/decrypt-secrets.sh b/scripts/decrypt-secrets.sh
new file mode 100755
index 0000000000..0018b421dd
--- /dev/null
+++ b/scripts/decrypt-secrets.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright 2023 Google LLC All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+ROOT=$( dirname "$DIR" )
+
+# Work from the project root.
+cd $ROOT
+
+# Prevent it from overriding files.
+# We recommend that sample authors use their own service account files and cloud project.
+# In that case, they are supposed to prepare these files by themselves.
+if [[ -f "testing/test-env.sh" ]] || \
+ [[ -f "testing/service-account.json" ]] || \
+ [[ -f "testing/client-secrets.json" ]]; then
+ echo "One or more target files exist, aborting."
+ exit 1
+fi
+
+# Use SECRET_MANAGER_PROJECT if set, fallback to cloud-devrel-kokoro-resources.
+PROJECT_ID="${SECRET_MANAGER_PROJECT:-cloud-devrel-kokoro-resources}"
+
+gcloud secrets versions access latest --secret="python-docs-samples-test-env" \
+ --project="${PROJECT_ID}" \
+ > testing/test-env.sh
+gcloud secrets versions access latest \
+ --secret="python-docs-samples-service-account" \
+ --project="${PROJECT_ID}" \
+ > testing/service-account.json
+gcloud secrets versions access latest \
+ --secret="python-docs-samples-client-secrets" \
+ --project="${PROJECT_ID}" \
+ > testing/client-secrets.json
diff --git a/scripts/readme-gen/readme_gen.py b/scripts/readme-gen/readme_gen.py
new file mode 100644
index 0000000000..1acc119835
--- /dev/null
+++ b/scripts/readme-gen/readme_gen.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Generates READMEs using configuration defined in yaml."""
+
+import argparse
+import io
+import os
+import subprocess
+
+import jinja2
+import yaml
+
+
+jinja_env = jinja2.Environment(
+ trim_blocks=True,
+ loader=jinja2.FileSystemLoader(
+ os.path.abspath(os.path.join(os.path.dirname(__file__), "templates"))
+ ),
+ autoescape=True,
+)
+
+README_TMPL = jinja_env.get_template("README.tmpl.rst")
+
+
+def get_help(file):
+ return subprocess.check_output(["python", file, "--help"]).decode()
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("source")
+ parser.add_argument("--destination", default="README.rst")
+
+ args = parser.parse_args()
+
+ source = os.path.abspath(args.source)
+ root = os.path.dirname(source)
+ destination = os.path.join(root, args.destination)
+
+ jinja_env.globals["get_help"] = get_help
+
+ with io.open(source, "r") as f:
+        config = yaml.safe_load(f)
+
+ # This allows get_help to execute in the right directory.
+ os.chdir(root)
+
+ output = README_TMPL.render(config)
+
+ with io.open(destination, "w") as f:
+ f.write(output)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/readme-gen/templates/README.tmpl.rst b/scripts/readme-gen/templates/README.tmpl.rst
new file mode 100644
index 0000000000..4fd239765b
--- /dev/null
+++ b/scripts/readme-gen/templates/README.tmpl.rst
@@ -0,0 +1,87 @@
+{# The following line is a lie. BUT! Once jinja2 is done with it, it will
+ become truth! #}
+.. This file is automatically generated. Do not edit this file directly.
+
+{{product.name}} Python Samples
+===============================================================================
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+ :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor={{folder}}/README.rst
+
+
+This directory contains samples for {{product.name}}. {{product.description}}
+
+{{description}}
+
+.. _{{product.name}}: {{product.url}}
+
+{% if required_api_url %}
+To run the sample, you need to enable the API at: {{required_api_url}}
+{% endif %}
+
+{% if required_role %}
+To run the sample, you need to have `{{required_role}}` role.
+{% endif %}
+
+{{other_required_steps}}
+
+{% if setup %}
+Setup
+-------------------------------------------------------------------------------
+
+{% for section in setup %}
+
+{% include section + '.tmpl.rst' %}
+
+{% endfor %}
+{% endif %}
+
+{% if samples %}
+Samples
+-------------------------------------------------------------------------------
+
+{% for sample in samples %}
+{{sample.name}}
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+{% if not sample.hide_cloudshell_button %}
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+ :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor={{folder}}/{{sample.file}},{{folder}}/README.rst
+{% endif %}
+
+
+{{sample.description}}
+
+To run this sample:
+
+.. code-block:: bash
+
+ $ python {{sample.file}}
+{% if sample.show_help %}
+
+ {{get_help(sample.file)|indent}}
+{% endif %}
+
+
+{% endfor %}
+{% endif %}
+
+{% if cloud_client_library %}
+
+The client library
+-------------------------------------------------------------------------------
+
+This sample uses the `Google Cloud Client Library for Python`_.
+You can read the documentation for more details on API usage and use GitHub
+to `browse the source`_ and `report issues`_.
+
+.. _Google Cloud Client Library for Python:
+ https://googlecloudplatform.github.io/google-cloud-python/
+.. _browse the source:
+ https://github.com/GoogleCloudPlatform/google-cloud-python
+.. _report issues:
+ https://github.com/GoogleCloudPlatform/google-cloud-python/issues
+
+{% endif %}
+
+.. _Google Cloud SDK: https://cloud.google.com/sdk/
\ No newline at end of file
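
This template is filled in by `scripts/readme-gen/readme_gen.py` from a YAML config; the variables it reads (`product`, `description`, `setup`, `samples`, `folder`, ...) are the ones referenced above. A rough sketch of rendering it directly, with made-up config values for illustration only:

    import jinja2

    env = jinja2.Environment(
        trim_blocks=True,
        loader=jinja2.FileSystemLoader("scripts/readme-gen/templates"),
        autoescape=True,
    )
    template = env.get_template("README.tmpl.rst")

    # Hypothetical config mirroring the variables the template expects;
    # real configs are YAML files passed to readme_gen.py on the command line.
    config = {
        "product": {
            "name": "BigQuery DataFrames",
            "description": "DataFrame APIs on the BigQuery engine.",
            "url": "https://cloud.google.com/bigquery",
        },
        "description": "These samples show basic BigQuery DataFrames usage.",
        "setup": ["auth", "install_deps"],
        "samples": [{"name": "Remote functions", "file": "remote_function.py"}],
        "folder": "samples/snippets",
    }
    print(template.render(config))
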
diff --git a/scripts/readme-gen/templates/auth.tmpl.rst b/scripts/readme-gen/templates/auth.tmpl.rst
new file mode 100644
index 0000000000..1446b94a5e
--- /dev/null
+++ b/scripts/readme-gen/templates/auth.tmpl.rst
@@ -0,0 +1,9 @@
+Authentication
+++++++++++++++
+
+This sample requires you to have authentication setup. Refer to the
+`Authentication Getting Started Guide`_ for instructions on setting up
+credentials for applications.
+
+.. _Authentication Getting Started Guide:
+ https://cloud.google.com/docs/authentication/getting-started
diff --git a/scripts/readme-gen/templates/auth_api_key.tmpl.rst b/scripts/readme-gen/templates/auth_api_key.tmpl.rst
new file mode 100644
index 0000000000..11957ce271
--- /dev/null
+++ b/scripts/readme-gen/templates/auth_api_key.tmpl.rst
@@ -0,0 +1,14 @@
+Authentication
+++++++++++++++
+
+Authentication for this service is done via an `API Key`_. To obtain an API
+Key:
+
+1. Open the `Cloud Platform Console`_
+2. Make sure that billing is enabled for your project.
+3. From the **Credentials** page, create a new **API Key** or use an existing
+ one for your project.
+
+.. _API Key:
+ https://developers.google.com/api-client-library/python/guide/aaa_apikeys
+.. _Cloud Platform Console: https://console.cloud.google.com/project?_
diff --git a/scripts/readme-gen/templates/install_deps.tmpl.rst b/scripts/readme-gen/templates/install_deps.tmpl.rst
new file mode 100644
index 0000000000..6f069c6c87
--- /dev/null
+++ b/scripts/readme-gen/templates/install_deps.tmpl.rst
@@ -0,0 +1,29 @@
+Install Dependencies
+++++++++++++++++++++
+
+#. Clone python-docs-samples and change directory to the sample directory you want to use.
+
+ .. code-block:: bash
+
+ $ git clone https://github.com/GoogleCloudPlatform/python-docs-samples.git
+
+#. Install `pip`_ and `virtualenv`_ if you do not already have them. You may want to refer to the `Python Development Environment Setup Guide`_ for Google Cloud Platform for instructions.
+
+ .. _Python Development Environment Setup Guide:
+ https://cloud.google.com/python/setup
+
+#. Create a virtualenv. Samples are compatible with Python 3.7+.
+
+ .. code-block:: bash
+
+ $ virtualenv env
+ $ source env/bin/activate
+
+#. Install the dependencies needed to run the samples.
+
+ .. code-block:: bash
+
+ $ pip install -r requirements.txt
+
+.. _pip: https://pip.pypa.io/
+.. _virtualenv: https://virtualenv.pypa.io/
diff --git a/scripts/readme-gen/templates/install_portaudio.tmpl.rst b/scripts/readme-gen/templates/install_portaudio.tmpl.rst
new file mode 100644
index 0000000000..5ea33d18c0
--- /dev/null
+++ b/scripts/readme-gen/templates/install_portaudio.tmpl.rst
@@ -0,0 +1,35 @@
+Install PortAudio
++++++++++++++++++
+
+Install `PortAudio`_. This is required by the `PyAudio`_ library to stream
+audio from your computer's microphone. PyAudio depends on PortAudio for cross-platform compatibility, and is installed differently depending on the
+platform.
+
+* For Mac OS X, you can use `Homebrew`_::
+
+ brew install portaudio
+
+ **Note**: if you encounter an error when running `pip install` that indicates
+ it can't find `portaudio.h`, try running `pip install` with the following
+ flags::
+
+ pip install --global-option='build_ext' \
+ --global-option='-I/usr/local/include' \
+ --global-option='-L/usr/local/lib' \
+ pyaudio
+
+* For Debian / Ubuntu Linux::
+
+ apt-get install portaudio19-dev python-all-dev
+
+* Windows may work without having to install PortAudio explicitly (it will get
+ installed with PyAudio).
+
+For more details, see the `PyAudio installation`_ page.
+
+
+.. _PyAudio: https://people.csail.mit.edu/hubert/pyaudio/
+.. _PortAudio: http://www.portaudio.com/
+.. _PyAudio installation:
+ https://people.csail.mit.edu/hubert/pyaudio/#downloads
+.. _Homebrew: http://brew.sh
diff --git a/scripts/upload_to_google_drive.py b/scripts/upload_to_google_drive.py
index e579151359..dcdc9168ba 100644
--- a/scripts/upload_to_google_drive.py
+++ b/scripts/upload_to_google_drive.py
@@ -41,12 +41,9 @@
wheel_id = "15fZ1DkrFDk4ibMNTzms4akpxmf2pzeAR"
wheel_path = next(iter((repo_root / "dist").glob("bigframes-*.whl")))
-pdf_id = "1agYjxmPLrxelsaHI-lc41QHcgnQYemcX"
-pdf_path = repo_root / "docs" / "_build" / "latex" / "bigframes-latest.pdf"
-
uploads = (
(wheel_id, wheel_path, "application/octet-stream"),
- (pdf_id, pdf_path, "application/pdf"),
+ # (pdf_id, pdf_path, "application/pdf"),
)
upload_template = (
diff --git a/setup.cfg b/setup.cfg
index 8bd749387e..0523500895 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright 2020 Google LLC
+# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,17 +17,3 @@
# Generated by synthtool. DO NOT EDIT!
[bdist_wheel]
universal = 1
-
-[pytype]
-python_version = 3.9
-inputs =
- google/cloud/
-exclude =
- tests/
- google/cloud/bigquery_v2/ # Legacy proto-based types.
-output = .pytype/
-disable =
- # There's some issue with finding some pyi files, thus disabling.
- # The issue https://github.com/google/pytype/issues/150 is closed, but the
- # error still occurs for some reason.
- pyi-error
diff --git a/setup.py b/setup.py
index 139873e6fc..69b71c88f1 100644
--- a/setup.py
+++ b/setup.py
@@ -47,6 +47,7 @@
"ibis-framework[bigquery] >=6.0.0,<=6.1.0",
"pandas >=1.5.0",
"pydata-google-auth >=1.8.2",
+ "requests >=2.27.1",
"scikit-learn >=1.2.2",
"sqlalchemy >=1.4,<3.0",
"ipywidgets >=7.7.1",
@@ -58,7 +59,7 @@
"pandas-gbq >=0.19.0",
],
# Packages required for basic development flow.
- "dev": ["pytest", "pre-commit", "nox", "google-cloud-testutils"],
+ "dev": ["pytest", "pytest-mock", "pre-commit", "nox", "google-cloud-testutils"],
}
extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values()))))
diff --git a/testing/.gitignore b/testing/.gitignore
new file mode 100644
index 0000000000..b05fbd6308
--- /dev/null
+++ b/testing/.gitignore
@@ -0,0 +1,3 @@
+test-env.sh
+service-account.json
+client-secrets.json
\ No newline at end of file
diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt
index 523256ee83..cd69d45dc9 100644
--- a/testing/constraints-3.9.txt
+++ b/testing/constraints-3.9.txt
@@ -9,7 +9,7 @@ cachetools==5.3.0
certifi==2022.12.7
cffi==1.15.1
cfgv==3.3.1
-charset-normalizer==3.1.0
+charset-normalizer==2.0.0
click==8.1.3
cloudpickle==2.0.0
colorlog==6.7.0
@@ -90,13 +90,14 @@ pyperclip==1.8.2
pytest==7.2.2
pytest-asyncio==0.21.0
pytest-cov==4.0.0
+pytest-mock==3.11.1
pytest-retry==1.1.0
pytest-xdist==3.2.1
python-dateutil==2.8.2
pytz==2023.3
PyYAML==6.0
readme-renderer==37.3
-requests==2.28.2
+requests==2.27.1
requests-oauthlib==1.3.1
requests-toolbelt==0.10.1
rfc3986==2.0.0
diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py
index 9b2872d673..a8613dfeb9 100644
--- a/tests/system/large/ml/test_ensemble.py
+++ b/tests/system/large/ml/test_ensemble.py
@@ -70,7 +70,7 @@ def test_xgbregressor_dart_booster_multiple_params(
):
model = bigframes.ml.ensemble.XGBRegressor(
booster="dart",
- tree_method="AUTO",
+ tree_method="auto",
min_tree_child_weight=2,
colsample_bytree=0.95,
colsample_bylevel=0.95,
@@ -121,7 +121,7 @@ def test_xgbregressor_dart_booster_multiple_params(
in reloaded_model._bqml_model.model_name
)
assert reloaded_model.booster == "DART"
- assert reloaded_model.dart_normalized_type == "TREE"
+ assert reloaded_model.dart_normalized_type == "tree"
assert reloaded_model.tree_method == "AUTO"
assert reloaded_model.colsample_bytree == 0.95
assert reloaded_model.colsample_bylevel == 0.95
@@ -185,7 +185,7 @@ def test_xgbclassifier_dart_booster_multiple_params(
):
model = bigframes.ml.ensemble.XGBClassifier(
booster="dart",
- tree_method="AUTO",
+ tree_method="auto",
min_tree_child_weight=2,
colsample_bytree=0.95,
colsample_bylevel=0.95,
@@ -235,7 +235,7 @@ def test_xgbclassifier_dart_booster_multiple_params(
in reloaded_model._bqml_model.model_name
)
assert reloaded_model.booster == "DART"
- assert reloaded_model.dart_normalized_type == "TREE"
+ assert reloaded_model.dart_normalized_type == "tree"
assert reloaded_model.tree_method == "AUTO"
assert reloaded_model.colsample_bytree == 0.95
assert reloaded_model.colsample_bylevel == 0.95
@@ -297,7 +297,7 @@ def test_randomforestregressor_default_params(penguins_df_default_index, dataset
@pytest.mark.flaky(retries=2, delay=120)
def test_randomforestregressor_multiple_params(penguins_df_default_index, dataset_id):
model = bigframes.ml.ensemble.RandomForestRegressor(
- tree_method="AUTO",
+ tree_method="auto",
min_tree_child_weight=2,
colsample_bytree=0.95,
colsample_bylevel=0.95,
diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py
index 8033f79c47..2f231f40c9 100644
--- a/tests/system/large/test_remote_function.py
+++ b/tests/system/large/test_remote_function.py
@@ -61,16 +61,32 @@ def get_remote_function_endpoints(bigquery_client, dataset_id):
return endpoints
-def get_cloud_functions(functions_client, project, location, name_prefix="bigframes-"):
+def get_cloud_functions(
+ functions_client, project, location, name=None, name_prefix=None
+):
"""Get the cloud functions in the given project and location."""
+
+ assert (
+ not name or not name_prefix
+    ), "At most one of name or name_prefix can be passed."
+
_, location = get_remote_function_locations(location)
parent = f"projects/{project}/locations/{location}"
request = functions_v2.ListFunctionsRequest(parent=parent)
page_result = functions_client.list_functions(request=request)
- full_name_prefix = parent + f"/functions/{name_prefix}"
for response in page_result:
- if not name_prefix or response.name.startswith(full_name_prefix):
- yield response
+ # If name is provided and it does not match then skip
+ if bool(name):
+ full_name = parent + f"/functions/{name}"
+ if response.name != full_name:
+ continue
+ # If name prefix is provided and it does not match then skip
+ elif bool(name_prefix):
+ full_name_prefix = parent + f"/functions/{name_prefix}"
+ if not response.name.startswith(full_name_prefix):
+ continue
+
+ yield response
def delete_cloud_function(functions_client, full_name):
@@ -84,8 +100,17 @@ def cleanup_remote_function_assets(
bigquery_client, functions_client, remote_udf, ignore_failures=True
):
"""Clean up the GCP assets behind a bigframes remote function."""
+
+ # Clean up BQ remote function
try:
bigquery_client.delete_routine(remote_udf.bigframes_remote_function)
+ except Exception:
+ # By default don't raise exception in cleanup
+ if not ignore_failures:
+ raise
+
+ # Clean up cloud function
+ try:
delete_cloud_function(functions_client, remote_udf.bigframes_cloud_function)
except Exception:
# By default don't raise exception in cleanup
@@ -94,7 +119,15 @@ def cleanup_remote_function_assets(
def make_uniq_udf(udf):
- """Transform a udf to another with same behavior but a unique name."""
+ """Transform a udf to another with same behavior but a unique name.
+    Use this to test remote functions with reuse=True, in which case parallel
+    instances of the same tests may evaluate same-named cloud functions and BQ
+    remote functions, thereby interacting with each other and causing unwanted
+    failures. With this method one can transform a udf into another with the
+    same behavior but a different name, which will remain unique for the
+    lifetime of one test instance.
+ """
+
prefixer = test_utils.prefixer.Prefixer(udf.__name__, "")
udf_uniq_name = prefixer.create_prefix()
udf_file_name = f"{udf_uniq_name}.py"
@@ -111,7 +144,18 @@ def make_uniq_udf(udf):
target_code = source_code.replace(source_key, target_key, 1)
f.write(target_code)
spec = importlib.util.spec_from_file_location(udf_file_name, udf_file_path)
- return getattr(spec.loader.load_module(), udf_uniq_name), tmpdir
+ udf_uniq = getattr(spec.loader.load_module(), udf_uniq_name)
+
+ # This is a bit of a hack but we need to remove the reference to a foreign
+ # module, otherwise the serialization would keep the foreign module
+ # reference and deserialization would fail with error like following:
+ # ModuleNotFoundError: No module named 'add_one_2nxcmd9j'
+ # TODO(shobs): Figure out if there is a better way of generating the unique
+ # function object, but for now let's just set it to same module as the
+ # original udf.
+ udf_uniq.__module__ = udf.__module__
+
+ return udf_uniq, tmpdir
@pytest.fixture(scope="module")
@@ -136,7 +180,10 @@ def cleanup_cloud_functions(session, functions_client, dataset_id_permanent):
)
delete_count = 0
for cloud_function in get_cloud_functions(
- functions_client, session.bqclient.project, session.bqclient.location
+ functions_client,
+ session.bqclient.project,
+ session.bqclient.location,
+ name_prefix="bigframes-",
):
# Ignore bigframes cloud functions referred by the remote functions in
# the permanent dataset
@@ -524,15 +571,6 @@ def add_one(x):
# Make a unique udf
add_one_uniq, add_one_uniq_dir = make_uniq_udf(add_one)
- # This is a bit of a hack but we need to remove the reference to a foreign
- # module, otherwise the serialization would keep the foreign module
- # reference and deserialization would fail with error like following:
- # ModuleNotFoundError: No module named 'add_one_2nxcmd9j'
- # TODO(shobs): Figure out if there is a better way of generating the unique
- # function object, but for now let's just set it to same module as the
- # original udf.
- add_one_uniq.__module__ = add_one.__module__
-
# Expected cloud function name for the unique udf
add_one_uniq_cf_name = get_cloud_function_name(add_one_uniq)
@@ -542,7 +580,7 @@ def add_one(x):
functions_client,
session.bqclient.project,
session.bqclient.location,
- name_prefix=add_one_uniq_cf_name,
+ name=add_one_uniq_cf_name,
)
)
assert len(cloud_functions) == 0
@@ -563,7 +601,7 @@ def add_one(x):
functions_client,
session.bqclient.project,
session.bqclient.location,
- name_prefix=add_one_uniq_cf_name,
+ name=add_one_uniq_cf_name,
)
)
assert len(cloud_functions) == 1
@@ -611,7 +649,7 @@ def inner_test():
functions_client,
session.bqclient.project,
session.bqclient.location,
- name_prefix=add_one_uniq_cf_name,
+ name=add_one_uniq_cf_name,
)
)
assert len(cloud_functions) == 0
@@ -633,7 +671,7 @@ def inner_test():
functions_client,
session.bqclient.project,
session.bqclient.location,
- name_prefix=add_one_uniq_cf_name,
+ name=add_one_uniq_cf_name,
)
)
assert len(cloud_functions) == 1
@@ -776,3 +814,221 @@ def test_remote_udf_lambda(
cleanup_remote_function_assets(
session.bqclient, functions_client, add_one_lambda_remote
)
+
+
+@pytest.mark.flaky(retries=2, delay=120)
+def test_remote_function_with_explicit_name(
+ session, scalars_dfs, dataset_id, bq_cf_connection, functions_client
+):
+ try:
+
+ def square(x):
+ return x * x
+
+ prefixer = test_utils.prefixer.Prefixer(square.__name__, "")
+ rf_name = prefixer.create_prefix()
+ expected_remote_function = f"{dataset_id}.{rf_name}"
+
+ # Initially the expected BQ remote function should not exist
+ with pytest.raises(NotFound):
+ session.bqclient.get_routine(expected_remote_function)
+
+ # Create the remote function with the name provided explicitly
+ square_remote = session.remote_function(
+ [int],
+ int,
+ dataset_id,
+ bq_cf_connection,
+ reuse=False,
+ name=rf_name,
+ )(square)
+
+ # The remote function should reflect the explicitly provided name
+ assert square_remote.bigframes_remote_function == expected_remote_function
+
+ # Now the expected BQ remote function should exist
+ session.bqclient.get_routine(expected_remote_function)
+
+ # The behavior of the created remote function should be as expected
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ bf_int64_col = scalars_df["int64_too"]
+ bf_result_col = bf_int64_col.apply(square_remote)
+ bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas()
+
+ pd_int64_col = scalars_pandas_df["int64_too"]
+ pd_result_col = pd_int64_col.apply(square)
+        # TODO(shobs): Figure out why pandas .apply() changes the dtype, i.e.
+        # pd_int64_col.dtype is Int64Dtype()
+        # pd_int64_col.apply(square).dtype is int64.
+        # For this test let's force the pandas dtype to be the same as bigframes' dtype.
+ pd_result_col = pd_result_col.astype(pandas.Int64Dtype())
+ pd_result = pd_int64_col.to_frame().assign(result=pd_result_col)
+
+ assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ finally:
+ # clean up the gcp assets created for the remote function
+ cleanup_remote_function_assets(
+ session.bqclient, functions_client, square_remote
+ )
+
+
+@pytest.mark.flaky(retries=2, delay=120)
+def test_remote_function_with_explicit_name_reuse(
+ session, scalars_dfs, dataset_id, bq_cf_connection, functions_client
+):
+ try:
+
+ dirs_to_cleanup = []
+
+        # Define some user code
+ def square(x):
+ return x * x
+
+ # Make it a unique udf
+ square_uniq, square_uniq_dir = make_uniq_udf(square)
+ dirs_to_cleanup.append(square_uniq_dir)
+
+ # Define a common routine which accepts a remote function and the
+        # corresponding user-defined function and tests that bigframes behavior
+        # on the former is in parity with the pandas behavior on the latter
+ def test_internal(rf, udf):
+ # The behavior of the created remote function should be as expected
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ bf_int64_col = scalars_df["int64_too"]
+ bf_result_col = bf_int64_col.apply(rf)
+ bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas()
+
+ pd_int64_col = scalars_pandas_df["int64_too"]
+ pd_result_col = pd_int64_col.apply(udf)
+            # TODO(shobs): Figure out why pandas .apply() changes the dtype, i.e.
+            # pd_int64_col.dtype is Int64Dtype()
+            # pd_int64_col.apply(square).dtype is int64.
+            # For this test let's force the pandas dtype to be the same as bigframes' dtype.
+ pd_result_col = pd_result_col.astype(pandas.Int64Dtype())
+ pd_result = pd_int64_col.to_frame().assign(result=pd_result_col)
+
+ assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+
+ # Create an explicit name for the remote function
+ prefixer = test_utils.prefixer.Prefixer("foo", "")
+ rf_name = prefixer.create_prefix()
+ expected_remote_function = f"{dataset_id}.{rf_name}"
+
+ # Initially the expected BQ remote function should not exist
+ with pytest.raises(NotFound):
+ session.bqclient.get_routine(expected_remote_function)
+
+ # Create a new remote function with the name provided explicitly
+ square_remote1 = session.remote_function(
+ [int],
+ int,
+ dataset_id,
+ bq_cf_connection,
+ name=rf_name,
+ )(square_uniq)
+
+ # The remote function should reflect the explicitly provided name
+ assert square_remote1.bigframes_remote_function == expected_remote_function
+
+ # Now the expected BQ remote function should exist
+ routine = session.bqclient.get_routine(expected_remote_function)
+ square_remote1_created = routine.created
+ square_remote1_cf_updated = session.cloudfunctionsclient.get_function(
+ name=square_remote1.bigframes_cloud_function
+ ).update_time
+
+ # Test pandas parity with square udf
+ test_internal(square_remote1, square)
+
+        # Now create another remote function with the same name provided
+ # explicitly. Since reuse is True by default, the previously created
+ # remote function with the same name will be reused.
+ square_remote2 = session.remote_function(
+ [int],
+ int,
+ dataset_id,
+ bq_cf_connection,
+ name=rf_name,
+ )(square_uniq)
+
+ # The new remote function should still reflect the explicitly provided name
+ assert square_remote2.bigframes_remote_function == expected_remote_function
+
+ # The expected BQ remote function should still exist
+ routine = session.bqclient.get_routine(expected_remote_function)
+ square_remote2_created = routine.created
+ square_remote2_cf_updated = session.cloudfunctionsclient.get_function(
+ name=square_remote2.bigframes_cloud_function
+ ).update_time
+
+ # The new remote function should reflect that the previous BQ remote
+        # function and the cloud function were reused instead of being created anew
+ assert square_remote2_created == square_remote1_created
+ assert (
+ square_remote2.bigframes_cloud_function
+ == square_remote1.bigframes_cloud_function
+ )
+ assert square_remote2_cf_updated == square_remote1_cf_updated
+
+ # Test again that the new remote function is actually same as the
+ # previous remote function
+ test_internal(square_remote2, square)
+
+        # Now define different user code
+ def plusone(x):
+ return x + 1
+
+ # Make it a unique udf
+ plusone_uniq, plusone_uniq_dir = make_uniq_udf(plusone)
+ dirs_to_cleanup.append(plusone_uniq_dir)
+
+        # Now create a third remote function with the same name provided
+        # explicitly. Even though reuse is True by default, the previously
+        # created remote function with the same name should not be reused since
+        # this time it is different user code.
+ plusone_remote = session.remote_function(
+ [int],
+ int,
+ dataset_id,
+ bq_cf_connection,
+ name=rf_name,
+ )(plusone_uniq)
+
+ # The new remote function should still reflect the explicitly provided name
+ assert plusone_remote.bigframes_remote_function == expected_remote_function
+
+ # The expected BQ remote function should still exist
+ routine = session.bqclient.get_routine(expected_remote_function)
+ plusone_remote_created = routine.created
+ plusone_remote_cf_updated = session.cloudfunctionsclient.get_function(
+ name=plusone_remote.bigframes_cloud_function
+ ).update_time
+
+ # The new remote function should reflect that the previous BQ remote
+        # function and the cloud function were NOT reused, but were created
+        # anew
+ assert plusone_remote_created > square_remote2_created
+ assert (
+ plusone_remote.bigframes_cloud_function
+ != square_remote2.bigframes_cloud_function
+ )
+ assert plusone_remote_cf_updated > square_remote2_cf_updated
+
+ # Test again that the new remote function is equivalent to the new user
+ # defined function
+ test_internal(plusone_remote, plusone)
+ finally:
+ # clean up the gcp assets created for the remote function
+ cleanup_remote_function_assets(
+ session.bqclient, functions_client, square_remote1
+ )
+ cleanup_remote_function_assets(
+ session.bqclient, functions_client, square_remote2
+ )
+ cleanup_remote_function_assets(
+ session.bqclient, functions_client, plusone_remote
+ )
+ for dir_ in dirs_to_cleanup:
+ shutil.rmtree(dir_)
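
For reference, the user-facing surface these new tests exercise is the `name` parameter of `session.remote_function`, which pins the BigQuery remote function name instead of letting bigframes generate one. A condensed sketch of the decorator usage the tests rely on (dataset, connection, and function names are placeholders):

    @session.remote_function(
        [int],
        int,
        dataset_id,        # e.g. "my_dataset"
        bq_cf_connection,  # e.g. "my-bq-to-cloud-functions-connection"
        name="my_square",  # explicit BQ remote function name
        reuse=False,       # deploy fresh instead of reusing a matching function
    )
    def square(x):
        return x * x
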
diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py
index 6c3e8e06f5..ace943956f 100644
--- a/tests/system/small/ml/test_core.py
+++ b/tests/system/small/ml/test_core.py
@@ -18,6 +18,7 @@
import pandas as pd
import pyarrow as pa
+import pytest
import pytz
import bigframes
@@ -278,6 +279,7 @@ def test_model_predict_with_unnamed_index(
)
+@pytest.mark.flaky(retries=2, delay=120)
def test_model_generate_text(
bqml_palm2_text_generator_model: core.BqmlModel, llm_text_df
):
diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py
index 8df4145fcf..c71bbbe3b0 100644
--- a/tests/system/small/ml/test_decomposition.py
+++ b/tests/system/small/ml/test_decomposition.py
@@ -16,33 +16,14 @@
from bigframes.ml import decomposition
-_PD_NEW_PENGUINS = pd.DataFrame(
- {
- "tag_number": [1633, 1672, 1690],
- "species": [
- "Adelie Penguin (Pygoscelis adeliae)",
- "Gentoo penguin (Pygoscelis papua)",
- "Adelie Penguin (Pygoscelis adeliae)",
- ],
- "island": ["Dream", "Biscoe", "Torgersen"],
- "culmen_length_mm": [37.8, 46.5, 41.1],
- "culmen_depth_mm": [18.1, 14.8, 18.6],
- "flipper_length_mm": [193.0, 217.0, 189.0],
- "body_mass_g": [3750.0, 5200.0, 3325.0],
- "sex": ["MALE", "FEMALE", "MALE"],
- }
-).set_index("tag_number")
-
-def test_pca_predict(session, penguins_pca_model: decomposition.PCA):
- new_penguins = session.read_pandas(_PD_NEW_PENGUINS)
-
- predictions = penguins_pca_model.predict(new_penguins).to_pandas()
+def test_pca_predict(penguins_pca_model, new_penguins_df):
+ predictions = penguins_pca_model.predict(new_penguins_df).to_pandas()
expected = pd.DataFrame(
{
- "principal_component_1": [-1.459, 2.258, -1.685],
- "principal_component_2": [-1.120, -1.351, -0.874],
- "principal_component_3": [-0.646, 0.443, -0.704],
+ "principal_component_1": [-1.314041, -0.855813, -1.848786],
+ "principal_component_2": [-0.889106, -1.259753, -0.983304],
+ "principal_component_3": [-0.704345, 0.322555, -0.095759],
},
dtype="Float64",
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 85c3cce1d7..a85777c59d 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -13,6 +13,7 @@
# limitations under the License.
import operator
+import tempfile
import typing
from typing import Tuple
@@ -137,6 +138,46 @@ def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_inde
pandas.testing.assert_frame_equal(bf_result, pd_result)
+@pytest.mark.parametrize(
+ ("keep",),
+ [
+ ("first",),
+ ("last",),
+ ("all",),
+ ],
+)
+def test_df_nlargest(scalars_df_index, scalars_pandas_df_index, keep):
+ bf_result = scalars_df_index.nlargest(
+ 3, ["bool_col", "int64_too"], keep=keep
+ ).to_pandas()
+ pd_result = scalars_pandas_df_index.nlargest(
+ 3, ["bool_col", "int64_too"], keep=keep
+ )
+
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+@pytest.mark.parametrize(
+ ("keep",),
+ [
+ ("first",),
+ ("last",),
+ ("all",),
+ ],
+)
+def test_df_nsmallest(scalars_df_index, scalars_pandas_df_index, keep):
+ bf_result = scalars_df_index.nsmallest(6, ["bool_col"], keep=keep).to_pandas()
+ pd_result = scalars_pandas_df_index.nsmallest(6, ["bool_col"], keep=keep)
+
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
def test_get_column_by_attr(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
series = scalars_df.int64_col
@@ -582,6 +623,22 @@ def test_df_fillna(scalars_dfs):
pandas.testing.assert_frame_equal(bf_result, pd_result)
+def test_df_ffill(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas()
+ pd_result = scalars_pandas_df[["int64_col", "float64_col"]].ffill(limit=1)
+
+ pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_df_bfill(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ bf_result = scalars_df[["int64_col", "float64_col"]].bfill().to_pandas()
+ pd_result = scalars_pandas_df[["int64_col", "float64_col"]].bfill()
+
+ pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
def test_df_isin_list(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
values = ["Hello, World!", 55555, 2.51, pd.NA, True]
@@ -1027,6 +1084,88 @@ def test_df_notnull(scalars_dfs):
assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+@pytest.mark.parametrize(
+ ("left_labels", "right_labels", "overwrite", "fill_value"),
+ [
+ (["a", "b", "c"], ["c", "a", "b"], True, None),
+ (["a", "b", "c"], ["c", "a", "b"], False, None),
+ (["a", "b", "c"], ["a", "b", "c"], False, 2),
+ ],
+ ids=[
+ "one_one_match_overwrite",
+ "one_one_match_no_overwrite",
+ "exact_match",
+ ],
+)
+def test_combine(
+ scalars_df_index,
+ scalars_df_2_index,
+ scalars_pandas_df_index,
+ left_labels,
+ right_labels,
+ overwrite,
+ fill_value,
+):
+ if pd.__version__.startswith("1."):
+ pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.")
+ columns = ["int64_too", "int64_col", "float64_col"]
+
+ bf_df_a = scalars_df_index[columns]
+ bf_df_a.columns = left_labels
+ bf_df_b = scalars_df_2_index[columns]
+ bf_df_b.columns = right_labels
+ bf_result = bf_df_a.combine(
+ bf_df_b,
+ lambda x, y: x**2 + 2 * x * y + y**2,
+ overwrite=overwrite,
+ fill_value=fill_value,
+ ).to_pandas()
+
+ pd_df_a = scalars_pandas_df_index[columns]
+ pd_df_a.columns = left_labels
+ pd_df_b = scalars_pandas_df_index[columns]
+ pd_df_b.columns = right_labels
+ pd_result = pd_df_a.combine(
+ pd_df_b,
+ lambda x, y: x**2 + 2 * x * y + y**2,
+ overwrite=overwrite,
+ fill_value=fill_value,
+ )
+
+ # Some dtype inconsistency for all-NULL columns
+ pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
+
+
+def test_combine_first(
+ scalars_df_index,
+ scalars_df_2_index,
+ scalars_pandas_df_index,
+):
+ if pd.__version__.startswith("1."):
+ pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.")
+ columns = ["int64_too", "int64_col", "float64_col"]
+
+ bf_df_a = scalars_df_index[columns].iloc[0:6]
+ bf_df_a.columns = ["a", "b", "c"]
+ bf_df_b = scalars_df_2_index[columns].iloc[2:8]
+ bf_df_b.columns = ["b", "a", "d"]
+ bf_result = bf_df_a.combine_first(bf_df_b).to_pandas()
+
+ pd_df_a = scalars_pandas_df_index[columns].iloc[0:6]
+ pd_df_a.columns = ["a", "b", "c"]
+ pd_df_b = scalars_pandas_df_index[columns].iloc[2:8]
+ pd_df_b.columns = ["b", "a", "d"]
+ pd_result = pd_df_a.combine_first(pd_df_b)
+
+ print("pandas")
+ print(pd_result.to_string())
+ print("bigframes")
+ print(bf_result.to_string())
+
+ # Some dtype inconsistency for all-NULL columns
+ pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
+
+
@pytest.mark.parametrize(
("op"),
[
@@ -1145,11 +1284,13 @@ def test_series_binop_axis_index(
(["a", "a", "b"], ["c", "c", "d"]),
(["a", "b", "c"], ["c", "a", "b"]),
(["a", "c", "c"], ["c", "a", "c"]),
+ (["a", "b", "c"], ["a", "b", "c"]),
],
ids=[
"no_overlap",
"one_one_match",
"multi_match",
+ "exact_match",
],
)
def test_binop_df_df_binary_op(
@@ -1361,6 +1502,42 @@ def test_dataframe_general_analytic_op(
)
+@pytest.mark.parametrize(
+ ("periods",),
+ [
+ (1,),
+ (2,),
+ (-1,),
+ ],
+)
+def test_dataframe_diff(scalars_df_index, scalars_pandas_df_index, periods):
+ col_names = ["int64_too", "float64_col", "int64_col"]
+ bf_result = scalars_df_index[col_names].diff(periods=periods).to_pandas()
+ pd_result = scalars_pandas_df_index[col_names].diff(periods=periods)
+ pd.testing.assert_frame_equal(
+ pd_result,
+ bf_result,
+ )
+
+
+@pytest.mark.parametrize(
+ ("periods",),
+ [
+ (1,),
+ (2,),
+ (-1,),
+ ],
+)
+def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods):
+ col_names = ["int64_too", "float64_col", "int64_col"]
+ bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas()
+ pd_result = scalars_pandas_df_index[col_names].pct_change(periods=periods)
+ pd.testing.assert_frame_equal(
+ pd_result,
+ bf_result,
+ )
+
+
def test_dataframe_agg_single_string(scalars_dfs):
numeric_cols = ["int64_col", "int64_too", "float64_col"]
scalars_df, scalars_pandas_df = scalars_dfs
@@ -1675,6 +1852,52 @@ def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index
)
+def test_loc_setitem_bool_series_scalar_new_col(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ bf_df = scalars_df.copy()
+ pd_df = scalars_pandas_df.copy()
+ bf_df.loc[bf_df["int64_too"] == 0, "new_col"] = 99
+ pd_df.loc[pd_df["int64_too"] == 0, "new_col"] = 99
+
+ # pandas type difference
+ pd_df["new_col"] = pd_df["new_col"].astype("Float64")
+
+ pd.testing.assert_frame_equal(
+ bf_df.to_pandas(),
+ pd_df,
+ )
+
+
+def test_loc_setitem_bool_series_scalar_existing_col(scalars_dfs):
+ if pd.__version__.startswith("1."):
+ pytest.skip("this loc overload not supported in pandas 1.x.")
+
+ scalars_df, scalars_pandas_df = scalars_dfs
+ bf_df = scalars_df.copy()
+ pd_df = scalars_pandas_df.copy()
+ bf_df.loc[bf_df["int64_too"] == 1, "string_col"] = "hello"
+ pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = "hello"
+
+ pd.testing.assert_frame_equal(
+ bf_df.to_pandas(),
+ pd_df,
+ )
+
+
+def test_loc_setitem_bool_series_scalar_type_error(scalars_dfs):
+ if pd.__version__.startswith("1."):
+ pytest.skip("this loc overload not supported in pandas 1.x.")
+
+ scalars_df, scalars_pandas_df = scalars_dfs
+ bf_df = scalars_df.copy()
+ pd_df = scalars_pandas_df.copy()
+
+ with pytest.raises(TypeError):
+ bf_df.loc[bf_df["int64_too"] == 1, "string_col"] = 99
+ with pytest.raises(TypeError):
+ pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99
+
+
@pytest.mark.parametrize(
("op"),
[
@@ -1749,6 +1972,30 @@ def test_dataframe_prod(scalars_df_index, scalars_pandas_df_index):
pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False)
+def test_df_skew_too_few_values(scalars_dfs):
+ columns = ["float64_col", "int64_col"]
+ scalars_df, scalars_pandas_df = scalars_dfs
+ bf_result = scalars_df[columns].head(2).skew().to_pandas()
+ pd_result = scalars_pandas_df[columns].head(2).skew()
+
+ # Pandas may produce narrower numeric types, but bigframes always produces Float64
+ pd_result = pd_result.astype("Float64")
+
+ pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False)
+
+
+def test_df_skew(scalars_dfs):
+ columns = ["float64_col", "int64_col"]
+ scalars_df, scalars_pandas_df = scalars_dfs
+ bf_result = scalars_df[columns].skew().to_pandas()
+ pd_result = scalars_pandas_df[columns].skew()
+
+ # Pandas may produce narrower numeric types, but bigframes always produces Float64
+ pd_result = pd_result.astype("Float64")
+
+ pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False)
+
+
@pytest.mark.parametrize(
("frac", "n", "random_state"),
[
@@ -1828,6 +2075,161 @@ def test_df_add_suffix(scalars_df_index, scalars_pandas_df_index, axis):
)
+def test_df_columns_filter_items(scalars_df_index, scalars_pandas_df_index):
+ if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."):
+ pytest.skip("pandas filter items behavior different pre-2.1")
+ bf_result = scalars_df_index.filter(items=["string_col", "int64_col"]).to_pandas()
+
+ pd_result = scalars_pandas_df_index.filter(items=["string_col", "int64_col"])
+
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_df_columns_filter_like(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index.filter(like="64_col").to_pandas()
+
+ pd_result = scalars_pandas_df_index.filter(like="64_col")
+
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_df_columns_filter_regex(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index.filter(regex="^[^_]+$").to_pandas()
+
+ pd_result = scalars_pandas_df_index.filter(regex="^[^_]+$")
+
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_df_rows_filter_items(scalars_df_index, scalars_pandas_df_index):
+ if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."):
+ pytest.skip("pandas filter items behavior different pre-2.1")
+ bf_result = scalars_df_index.filter(items=[5, 1, 3], axis=0).to_pandas()
+
+ pd_result = scalars_pandas_df_index.filter(items=[5, 1, 3], axis=0)
+
+ # Pandas uses int64 instead of Int64 (nullable) dtype.
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype())
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_df_rows_filter_like(scalars_df_index, scalars_pandas_df_index):
+ scalars_df_index = scalars_df_index.copy().set_index("string_col")
+ scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col")
+
+ bf_result = scalars_df_index.filter(like="ello", axis=0).to_pandas()
+
+ pd_result = scalars_pandas_df_index.filter(like="ello", axis=0)
+
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_df_rows_filter_regex(scalars_df_index, scalars_pandas_df_index):
+ scalars_df_index = scalars_df_index.copy().set_index("string_col")
+ scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col")
+
+ bf_result = scalars_df_index.filter(regex="^[GH].*", axis=0).to_pandas()
+
+ pd_result = scalars_pandas_df_index.filter(regex="^[GH].*", axis=0)
+
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_df_reindex_rows_list(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index.reindex(index=[5, 1, 3, 99, 1]).to_pandas()
+
+ pd_result = scalars_pandas_df_index.reindex(index=[5, 1, 3, 99, 1])
+
+ # Pandas uses int64 instead of Int64 (nullable) dtype.
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype())
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_df_reindex_rows_index(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index.reindex(
+ index=pd.Index([5, 1, 3, 99, 1], name="newname")
+ ).to_pandas()
+
+ pd_result = scalars_pandas_df_index.reindex(
+ index=pd.Index([5, 1, 3, 99, 1], name="newname")
+ )
+
+ # Pandas uses int64 instead of Int64 (nullable) dtype.
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype())
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_df_reindex_nonunique(scalars_df_index):
+ with pytest.raises(ValueError):
+ # int64_too is non-unique
+ scalars_df_index.set_index("int64_too").reindex(
+ index=[5, 1, 3, 99, 1], validate=True
+ )
+
+
+def test_df_reindex_columns(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index.reindex(
+ columns=["not_a_col", "int64_col", "int64_too"]
+ ).to_pandas()
+
+ pd_result = scalars_pandas_df_index.reindex(
+ columns=["not_a_col", "int64_col", "int64_too"]
+ )
+
+ # Pandas uses float64 as default for newly created empty column, bf uses Float64
+ pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype())
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_df_reindex_like(scalars_df_index, scalars_pandas_df_index):
+ reindex_target_bf = scalars_df_index.reindex(
+ columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1]
+ )
+ bf_result = scalars_df_index.reindex_like(reindex_target_bf).to_pandas()
+
+ reindex_target_pd = scalars_pandas_df_index.reindex(
+ columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1]
+ )
+ pd_result = scalars_pandas_df_index.reindex_like(reindex_target_pd)
+
+ # Pandas uses float64 as default for newly created empty column, bf uses Float64
+ # Pandas uses int64 instead of Int64 (nullable) dtype.
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype())
+ # Pandas uses float64 as default for newly created empty column, bf uses Float64
+ pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype())
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
def test_df_values(scalars_df_index, scalars_pandas_df_index):
bf_result = scalars_df_index.values
@@ -2035,6 +2437,93 @@ def test_df_duplicated(scalars_df_index, scalars_pandas_df_index, keep, subset):
pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False)
+def test_df_to_dict(scalars_df_index, scalars_pandas_df_index):
+ unsupported = ["numeric_col"] # formatted differently
+ bf_result = scalars_df_index.drop(columns=unsupported).to_dict()
+ pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_dict()
+
+ assert bf_result == pd_result
+
+
+def test_df_to_excel(scalars_df_index, scalars_pandas_df_index):
+ unsupported = ["timestamp_col"]
+ with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+ scalars_df_index.drop(columns=unsupported).to_excel(bf_result_file)
+ scalars_pandas_df_index.drop(columns=unsupported).to_excel(pd_result_file)
+ bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+ assert bf_result == pd_result
+
+
+def test_df_to_latex(scalars_df_index, scalars_pandas_df_index):
+ unsupported = ["numeric_col"] # formatted differently
+ bf_result = scalars_df_index.drop(columns=unsupported).to_latex()
+ pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_latex()
+
+ assert bf_result == pd_result
+
+
+def test_df_to_records(scalars_df_index, scalars_pandas_df_index):
+ unsupported = ["numeric_col"]
+ bf_result = scalars_df_index.drop(columns=unsupported).to_records()
+ pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_records()
+
+    for bfi, pdi in zip(bf_result, pd_result):
+        for bfj, pdj in zip(bfi, pdi):
+            # Treat a pair of NAs as equal, since NA == NA evaluates to False.
+            assert (pd.isna(bfj) and pd.isna(pdj)) or bfj == pdj
+
+
+def test_df_to_string(scalars_df_index, scalars_pandas_df_index):
+ unsupported = ["numeric_col"] # formatted differently
+
+ bf_result = scalars_df_index.drop(columns=unsupported).to_string()
+ pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_string()
+
+ assert bf_result == pd_result
+
+
+def test_df_to_markdown(scalars_df_index, scalars_pandas_df_index):
+    # Drop nulls to work around a tabulate formatting bug:
+    # https://github.com/astanin/python-tabulate/issues/231
+ bf_result = scalars_df_index.dropna().to_markdown()
+ pd_result = scalars_pandas_df_index.dropna().to_markdown()
+
+ assert bf_result == pd_result
+
+
+def test_df_to_pickle(scalars_df_index, scalars_pandas_df_index):
+ with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+ scalars_df_index.to_pickle(bf_result_file)
+ scalars_pandas_df_index.to_pickle(pd_result_file)
+ bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+ assert bf_result == pd_result
+
+
+def test_df_to_orc(scalars_df_index, scalars_pandas_df_index):
+ unsupported = [
+ "numeric_col",
+ "bytes_col",
+ "date_col",
+ "datetime_col",
+ "time_col",
+ "timestamp_col",
+ "geography_col",
+ ]
+
+ bf_result_file = tempfile.TemporaryFile()
+ pd_result_file = tempfile.TemporaryFile()
+ scalars_df_index.drop(columns=unsupported).to_orc(bf_result_file)
+ scalars_pandas_df_index.drop(columns=unsupported).reset_index().to_orc(
+ pd_result_file
+ )
+ bf_result = bf_result_file.read()
+    pd_result = pd_result_file.read()
+
+ assert bf_result == pd_result
+
+
@pytest.mark.parametrize(
("subset", "normalize", "ascending", "dropna"),
[
diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py
index 987368ce77..18741468c5 100644
--- a/tests/system/small/test_groupby.py
+++ b/tests/system/small/test_groupby.py
@@ -210,12 +210,14 @@ def test_dataframe_groupby_multi_sum(
(lambda x: x.cummax(numeric_only=True)),
(lambda x: x.cummin(numeric_only=True)),
(lambda x: x.cumprod()),
+ (lambda x: x.shift(periods=2)),
],
ids=[
"cumsum",
"cummax",
"cummin",
"cumprod",
+ "shift",
],
)
def test_dataframe_groupby_analytic(
@@ -229,6 +231,30 @@ def test_dataframe_groupby_analytic(
pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False)
+def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index.groupby("bool_col")["int64_too"].skew().to_pandas()
+ pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].skew()
+
+ pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
+
+
+def test_dataframe_groupby_skew(scalars_df_index, scalars_pandas_df_index):
+ col_names = ["float64_col", "int64_col", "bool_col"]
+ bf_result = scalars_df_index[col_names].groupby("bool_col").skew().to_pandas()
+ pd_result = scalars_pandas_df_index[col_names].groupby("bool_col").skew()
+
+ pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)
+
+
+def test_dataframe_groupby_diff(scalars_df_index, scalars_pandas_df_index):
+ col_names = ["float64_col", "int64_col", "string_col"]
+ bf_result = scalars_df_index[col_names].groupby("string_col").diff(-1)
+ pd_result = scalars_pandas_df_index[col_names].groupby("string_col").diff(-1)
+ bf_result_computed = bf_result.to_pandas()
+
+ pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False)
+
+
def test_dataframe_groupby_getitem(
scalars_df_index,
scalars_pandas_df_index,
diff --git a/tests/system/small/test_ipython.py b/tests/system/small/test_ipython.py
index 6725805d9a..be98ce0067 100644
--- a/tests/system/small/test_ipython.py
+++ b/tests/system/small/test_ipython.py
@@ -22,7 +22,8 @@ def test_repr_cache(scalars_df_index):
# Make sure the df has a new block that the method return value
# is not already cached.
test_df = scalars_df_index.head()
+ test_df._block.retrieve_repr_request_results.cache_clear()
results = display_formatter.format(test_df)
assert results[0].keys() == {"text/plain", "text/html"}
- assert test_df._block.retrieve_repr_request_results.cache_info().misses == 1
- assert test_df._block.retrieve_repr_request_results.cache_info().hits == 1
+ assert test_df._block.retrieve_repr_request_results.cache_info().misses >= 1
+ assert test_df._block.retrieve_repr_request_results.cache_info().hits >= 1
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
index 25d1e2ad49..1e38b47b4c 100644
--- a/tests/system/small/test_multiindex.py
+++ b/tests/system/small/test_multiindex.py
@@ -157,7 +157,7 @@ def test_multi_index_getitem_bool(scalars_df_index, scalars_pandas_df_index):
],
ids=["level_num", "level_name", "list", "mixed_list"],
)
-def test_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index, level):
+def test_df_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index, level):
bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"])
pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col", "int64_col"])
@@ -167,6 +167,26 @@ def test_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index, level)
pandas.testing.assert_frame_equal(bf_result, pd_result)
+@pytest.mark.parametrize(
+ ("level"),
+ [
+ (1),
+ ("int64_too"),
+ ([0, 2]),
+ ([2, "bool_col"]),
+ ],
+ ids=["level_num", "level_name", "list", "mixed_list"],
+)
+def test_series_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index, level):
+ bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"])
+ pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col", "int64_col"])
+
+ bf_result = bf_frame["string_col"].droplevel(level).to_pandas()
+ pd_result = pd_frame["string_col"].droplevel(level)
+
+ pandas.testing.assert_series_equal(bf_result, pd_result)
+
+
@pytest.mark.parametrize(
("labels", "level"),
[
@@ -198,7 +218,9 @@ def test_multi_index_drop(scalars_df_index, scalars_pandas_df_index, labels, lev
"num_names_mixed",
],
)
-def test_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_index, order):
+def test_df_multi_index_reorder_levels(
+ scalars_df_index, scalars_pandas_df_index, order
+):
bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"])
pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col", "int64_col"])
@@ -208,6 +230,51 @@ def test_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_index, o
pandas.testing.assert_frame_equal(bf_result, pd_result)
+@pytest.mark.parametrize(
+ ("order"),
+ [
+ (1, 0, 2),
+ (["int64_col", "bool_col", "int64_too"]),
+ (["int64_col", "bool_col", 0]),
+ ],
+ ids=[
+ "level_nums",
+ "level_names",
+ "num_names_mixed",
+ ],
+)
+def test_series_multi_index_reorder_levels(
+ scalars_df_index, scalars_pandas_df_index, order
+):
+ bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"])
+ pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col", "int64_col"])
+
+ bf_result = bf_frame["string_col"].reorder_levels(order).to_pandas()
+ pd_result = pd_frame["string_col"].reorder_levels(order)
+
+ pandas.testing.assert_series_equal(bf_result, pd_result)
+
+
+def test_df_multi_index_swaplevel(scalars_df_index, scalars_pandas_df_index):
+ bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"])
+ pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col", "int64_col"])
+
+ bf_result = bf_frame.swaplevel().to_pandas()
+ pd_result = pd_frame.swaplevel()
+
+ pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_series_multi_index_swaplevel(scalars_df_index, scalars_pandas_df_index):
+ bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"])
+ pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col", "int64_col"])
+
+ bf_result = bf_frame["string_col"].swaplevel(0, 2).to_pandas()
+ pd_result = pd_frame["string_col"].swaplevel(0, 2)
+
+ pandas.testing.assert_series_equal(bf_result, pd_result)
+
+
def test_multi_index_series_groupby(scalars_df_index, scalars_pandas_df_index):
bf_frame = scalars_df_index.set_index(["int64_too", "bool_col"])
bf_result = (
@@ -446,6 +513,24 @@ def test_multi_index_series_rename_dict_same_type(
)
+def test_multi_index_df_reindex(scalars_df_index, scalars_pandas_df_index):
+ new_index = pandas.MultiIndex.from_tuples(
+ [(4, "Hello, World!"), (99, "some_new_string")],
+ names=["new_index1", "new_index2"],
+ )
+ bf_result = (
+ scalars_df_index.set_index(["rowindex_2", "string_col"])
+ .reindex(index=new_index)
+ .to_pandas()
+ )
+ pd_result = scalars_pandas_df_index.set_index(["rowindex_2", "string_col"]).reindex(
+ index=new_index
+ )
+ pandas.testing.assert_frame_equal(
+ bf_result, pd_result, check_dtype=False, check_index_type=False
+ )
+
+
# Column Multi-index tests
@@ -722,3 +807,76 @@ def test_is_monotonic_decreasing_extra(indexes):
bf_result.index.is_monotonic_decreasing
== pd_result.index.is_monotonic_decreasing
)
+
+
+def test_column_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "string_col", "bool_col"]
+ multi_columns = pandas.MultiIndex.from_tuples(
+ zip(["a", "b", "a"], ["c", "d", "e"], ["f", "g", "f"])
+ )
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ bf_result = bf_df.droplevel(1, axis=1).to_pandas()
+ pd_result = pd_df.droplevel(1, axis=1)
+
+ pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_df_column_multi_index_reindex(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "int64_col", "rowindex_2"]
+ multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"]))
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ new_index = pandas.MultiIndex.from_tuples(
+ [("z", "a"), ("a", "a")], names=["newname1", "newname2"]
+ )
+
+ bf_result = bf_df.reindex(columns=new_index).to_pandas()
+
+ pd_result = pd_df.reindex(columns=new_index)
+
+ # Pandas uses float64 as default for newly created empty column, bf uses Float64
+ pd_result[("z", "a")] = pd_result[("z", "a")].astype(pandas.Float64Dtype())
+
+ pandas.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_column_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "string_col", "bool_col"]
+ multi_columns = pandas.MultiIndex.from_tuples(
+ zip(["a", "b", "a"], ["c", "d", "e"], ["f", "g", "f"])
+ )
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ bf_result = bf_df.reorder_levels([-2, -1, 0], axis=1).to_pandas()
+ pd_result = pd_df.reorder_levels([-2, -1, 0], axis=1)
+
+ pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_column_multi_index_swaplevel(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "string_col", "bool_col"]
+ multi_columns = pandas.MultiIndex.from_tuples(
+ zip(["a", "b", "a"], ["c", "d", "e"], ["f", "g", "f"])
+ )
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ bf_result = bf_df.swaplevel(-3, -1, axis=1).to_pandas()
+ pd_result = pd_df.swaplevel(-3, -1, axis=1)
+
+ pandas.testing.assert_frame_equal(bf_result, pd_result)
diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py
index e451d5c3a2..a429c6551d 100644
--- a/tests/system/small/test_pandas.py
+++ b/tests/system/small/test_pandas.py
@@ -209,3 +209,17 @@ def test_merge_series(scalars_dfs, merge_how):
)
assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+
+
+def test_cut(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=False)
+ bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=False)
+
+ # make sure the result is a supported dtype
+ assert bf_result.dtype == bpd.Int64Dtype()
+
+ bf_result = bf_result.to_pandas()
+ pd_result = pd_result.astype("Int64")
+ pd.testing.assert_series_equal(bf_result, pd_result)
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 07dc892ddc..d3560540cc 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -186,6 +186,54 @@ def test_fillna(scalars_dfs):
)
+def test_series_replace_scalar_scalar(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ col_name = "string_col"
+ bf_result = (
+ scalars_df[col_name].replace("Hello, World!", "Howdy, Planet!").to_pandas()
+ )
+ pd_result = scalars_pandas_df[col_name].replace("Hello, World!", "Howdy, Planet!")
+
+ pd.testing.assert_series_equal(
+ pd_result,
+ bf_result,
+ )
+
+
+def test_series_replace_regex_scalar(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ col_name = "string_col"
+ bf_result = (
+ scalars_df[col_name].replace("^H.l", "Howdy, Planet!", regex=True).to_pandas()
+ )
+ pd_result = scalars_pandas_df[col_name].replace(
+ "^H.l", "Howdy, Planet!", regex=True
+ )
+
+ pd.testing.assert_series_equal(
+ pd_result,
+ bf_result,
+ )
+
+
+def test_series_replace_list_scalar(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ col_name = "string_col"
+ bf_result = (
+ scalars_df[col_name]
+ .replace(["Hello, World!", "T"], "Howdy, Planet!")
+ .to_pandas()
+ )
+ pd_result = scalars_pandas_df[col_name].replace(
+ ["Hello, World!", "T"], "Howdy, Planet!"
+ )
+
+ pd.testing.assert_series_equal(
+ pd_result,
+ bf_result,
+ )
+
+
@pytest.mark.parametrize(
("ignore_index",),
(
@@ -759,7 +807,6 @@ def test_isin_raise_error(scalars_df_index, scalars_pandas_df_index):
)
def test_isin(scalars_dfs, col_name, test_set):
scalars_df, scalars_pandas_df = scalars_dfs
- print(type(scalars_pandas_df["datetime_col"].iloc[0]))
bf_result = scalars_df[col_name].isin(test_set).to_pandas()
pd_result = scalars_pandas_df[col_name].isin(test_set).astype("boolean")
pd.testing.assert_series_equal(
@@ -1506,6 +1553,28 @@ def test_shift(scalars_df_index, scalars_pandas_df_index):
)
+def test_series_ffill(scalars_df_index, scalars_pandas_df_index):
+ col_name = "numeric_col"
+ bf_result = scalars_df_index[col_name].ffill(limit=1).to_pandas()
+ pd_result = scalars_pandas_df_index[col_name].ffill(limit=1)
+
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_series_bfill(scalars_df_index, scalars_pandas_df_index):
+ col_name = "numeric_col"
+ bf_result = scalars_df_index[col_name].bfill(limit=2).to_pandas()
+ pd_result = scalars_pandas_df_index[col_name].bfill(limit=2)
+
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result,
+ )
+
+
def test_cumsum_int(scalars_df_index, scalars_pandas_df_index):
if pd.__version__.startswith("1."):
pytest.skip("Series.cumsum NA mask are different in pandas 1.x.")
@@ -1588,7 +1657,7 @@ def test_rank_with_nulls(scalars_df_index, scalars_pandas_df_index, na_option, m
("all",),
],
)
-def test_nlargest(scalars_df_index, scalars_pandas_df_index, keep):
+def test_series_nlargest(scalars_df_index, scalars_pandas_df_index, keep):
col_name = "bool_col"
bf_result = scalars_df_index[col_name].nlargest(4, keep=keep).to_pandas()
pd_result = scalars_pandas_df_index[col_name].nlargest(4, keep=keep)
@@ -1622,6 +1691,25 @@ def test_diff(scalars_df_index, scalars_pandas_df_index, periods):
)
+@pytest.mark.parametrize(
+ ("periods",),
+ [
+ (1,),
+ (2,),
+ (-1,),
+ ],
+)
+def test_series_pct_change(scalars_df_index, scalars_pandas_df_index, periods):
+ bf_result = scalars_df_index["int64_col"].pct_change(periods=periods).to_pandas()
+ pd_result = scalars_pandas_df_index["int64_col"].pct_change(periods=periods)
+
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result,
+ )
+
+
@pytest.mark.parametrize(
("keep",),
[
@@ -1630,7 +1718,7 @@ def test_diff(scalars_df_index, scalars_pandas_df_index, periods):
("all",),
],
)
-def test_nsmallest(scalars_df_index, scalars_pandas_df_index, keep):
+def test_series_nsmallest(scalars_df_index, scalars_pandas_df_index, keep):
col_name = "bool_col"
bf_result = scalars_df_index[col_name].nsmallest(2, keep=keep).to_pandas()
pd_result = scalars_pandas_df_index[col_name].nsmallest(2, keep=keep)
@@ -1853,6 +1941,91 @@ def test_series_add_suffix(scalars_df_index, scalars_pandas_df_index):
)
+def test_series_filter_items(scalars_df_index, scalars_pandas_df_index):
+ if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."):
+ pytest.skip("pandas filter items behavior different pre-2.1")
+ bf_result = scalars_df_index["float64_col"].filter(items=[5, 1, 3]).to_pandas()
+
+ pd_result = scalars_pandas_df_index["float64_col"].filter(items=[5, 1, 3])
+
+ # Pandas uses int64 instead of Int64 (nullable) dtype.
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype())
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_series_filter_like(scalars_df_index, scalars_pandas_df_index):
+ scalars_df_index = scalars_df_index.copy().set_index("string_col")
+ scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col")
+
+ bf_result = scalars_df_index["float64_col"].filter(like="ello").to_pandas()
+
+ pd_result = scalars_pandas_df_index["float64_col"].filter(like="ello")
+
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_series_filter_regex(scalars_df_index, scalars_pandas_df_index):
+ scalars_df_index = scalars_df_index.copy().set_index("string_col")
+ scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col")
+
+ bf_result = scalars_df_index["float64_col"].filter(regex="^[GH].*").to_pandas()
+
+ pd_result = scalars_pandas_df_index["float64_col"].filter(regex="^[GH].*")
+
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_series_reindex(scalars_df_index, scalars_pandas_df_index):
+ bf_result = (
+ scalars_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1]).to_pandas()
+ )
+
+ pd_result = scalars_pandas_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1])
+
+ # Pandas uses int64 instead of Int64 (nullable) dtype.
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype())
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_series_reindex_nonunique(scalars_df_index):
+ with pytest.raises(ValueError):
+ # int64_too is non-unique
+ scalars_df_index.set_index("int64_too")["float64_col"].reindex(
+ index=[5, 1, 3, 99, 1], validate=True
+ )
+
+
+def test_series_reindex_like(scalars_df_index, scalars_pandas_df_index):
+ bf_reindex_target = scalars_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1])
+ bf_result = (
+ scalars_df_index["int64_too"].reindex_like(bf_reindex_target).to_pandas()
+ )
+
+ pd_reindex_target = scalars_pandas_df_index["float64_col"].reindex(
+ index=[5, 1, 3, 99, 1]
+ )
+ pd_result = scalars_pandas_df_index["int64_too"].reindex_like(pd_reindex_target)
+
+ # Pandas uses int64 instead of Int64 (nullable) dtype.
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype())
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result,
+ )
+
+
def test_where_with_series(scalars_df_index, scalars_pandas_df_index):
bf_result = (
scalars_df_index["int64_col"]
diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py
new file mode 100644
index 0000000000..8d4932a3c3
--- /dev/null
+++ b/tests/unit/ml/test_golden_sql.py
@@ -0,0 +1,47 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from unittest import mock
+
+import pytest_mock
+
+import bigframes
+from bigframes.ml import linear_model
+import bigframes.pandas as bpd
+
+
+def test_linear_regression_default_fit(mocker: pytest_mock.MockerFixture):
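+    # Stub out the session and the input DataFrames so that fit() assembles its
+    # BQML statement without contacting BigQuery; the assertion below then checks
+    # the exact CREATE TEMP MODEL SQL handed to _start_query.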
+ mock_session = mock.create_autospec(spec=bigframes.Session)
+
+ mock_X = mock.create_autospec(spec=bpd.DataFrame)
+ mock_X._get_block().expr._session = mock_session
+
+ mock_y = mock.create_autospec(spec=bpd.DataFrame)
+ mock_y.columns.tolist.return_value = ["input_label_column"]
+
+ mock_X.join(mock_y).sql = "input_dataframe_sql"
+
+    # Return values the test never inspects, but which fit() needs in order to run.
+ mock_session._start_query.return_value = (None, mock.MagicMock())
+
+ mocker.patch(
+ "bigframes.ml.core._create_temp_model_name", return_value="temp_model_name"
+ )
+
+ model = linear_model.LinearRegression()
+ model.fit(mock_X, mock_y)
+
+ mock_session._start_query.assert_called_once_with(
+ 'CREATE TEMP MODEL `temp_model_name`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=True,\n INPUT_LABEL_COLS=["input_label_column"])\nAS input_dataframe_sql'
+ )
diff --git a/tests/unit/ml/test_pipeline.py b/tests/unit/ml/test_pipeline.py
index 27706a1a07..ed5c621b1d 100644
--- a/tests/unit/ml/test_pipeline.py
+++ b/tests/unit/ml/test_pipeline.py
@@ -18,38 +18,35 @@
import sklearn.pipeline as sklearn_pipeline # type: ignore
import sklearn.preprocessing as sklearn_preprocessing # type: ignore
-import bigframes.ml.compose
-import bigframes.ml.linear_model
-import bigframes.ml.pipeline
-import bigframes.ml.preprocessing
+from bigframes.ml import compose, forecasting, linear_model, pipeline, preprocessing
def test_pipeline_repr():
- pipeline = bigframes.ml.pipeline.Pipeline(
+ pl = pipeline.Pipeline(
[
(
"preproc",
- bigframes.ml.compose.ColumnTransformer(
+ compose.ColumnTransformer(
[
(
"onehot",
- bigframes.ml.preprocessing.OneHotEncoder(),
+ preprocessing.OneHotEncoder(),
"species",
),
(
"scale",
- bigframes.ml.preprocessing.StandardScaler(),
+ preprocessing.StandardScaler(),
["culmen_length_mm", "flipper_length_mm"],
),
]
),
),
- ("linreg", bigframes.ml.linear_model.LinearRegression()),
+ ("linreg", linear_model.LinearRegression()),
]
)
assert (
- pipeline.__repr__()
+ pl.__repr__()
== """Pipeline(steps=[('preproc',
ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
'species'),
@@ -62,29 +59,29 @@ def test_pipeline_repr():
@pytest.mark.skipif(sklearn_pipeline is None, reason="requires sklearn")
def test_pipeline_repr_matches_sklearn():
- bf_pipeline = bigframes.ml.pipeline.Pipeline(
+ bf_pl = pipeline.Pipeline(
[
(
"preproc",
- bigframes.ml.compose.ColumnTransformer(
+ compose.ColumnTransformer(
[
(
"onehot",
- bigframes.ml.preprocessing.OneHotEncoder(),
+ preprocessing.OneHotEncoder(),
"species",
),
(
"scale",
- bigframes.ml.preprocessing.StandardScaler(),
+ preprocessing.StandardScaler(),
["culmen_length_mm", "flipper_length_mm"],
),
]
),
),
- ("linreg", bigframes.ml.linear_model.LinearRegression()),
+ ("linreg", linear_model.LinearRegression()),
]
)
- sk_pipeline = sklearn_pipeline.Pipeline(
+ sk_pl = sklearn_pipeline.Pipeline(
[
(
"preproc",
@@ -107,4 +104,17 @@ def test_pipeline_repr_matches_sklearn():
]
)
- assert bf_pipeline.__repr__() == sk_pipeline.__repr__()
+ assert bf_pl.__repr__() == sk_pl.__repr__()
+
+
+def test_pipeline_arima_plus_not_implemented():
+ with pytest.raises(NotImplementedError):
+ pipeline.Pipeline(
+ [
+ (
+ "transform",
+ preprocessing.StandardScaler(),
+ ),
+ ("estimator", forecasting.ARIMAPlus()),
+ ]
+ )
diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py
index e01638e22e..8f3e0beb0e 100644
--- a/tests/unit/test_core.py
+++ b/tests/unit/test_core.py
@@ -13,9 +13,12 @@
# limitations under the License.
import ibis
+import ibis.expr.types as ibis_types
import pandas
import bigframes.core as core
+import bigframes.operations as ops
+import bigframes.operations.aggregations as agg_ops
from . import resources
@@ -46,6 +49,42 @@ def test_arrayvalue_constructor_from_ibis_table_adds_all_columns():
assert len(actual.columns) == 3
+def test_arrayvalue_with_get_column_type():
+ value = resources.create_arrayvalue(
+ pandas.DataFrame(
+ {
+ "col1": [1, 2, 3],
+ "col2": ["a", "b", "c"],
+ "col3": [0.1, 0.2, 0.3],
+ }
+ ),
+ total_ordering_columns=["col1"],
+ )
+ col1_type = value.get_column_type("col1")
+ col2_type = value.get_column_type("col2")
+ col3_type = value.get_column_type("col3")
+ assert isinstance(col1_type, pandas.Int64Dtype)
+ assert isinstance(col2_type, pandas.StringDtype)
+ assert isinstance(col3_type, pandas.Float64Dtype)
+
+
+def test_arrayvalue_with_get_column():
+ value = resources.create_arrayvalue(
+ pandas.DataFrame(
+ {
+ "col1": [1, 2, 3],
+ "col2": ["a", "b", "c"],
+ "col3": [0.1, 0.2, 0.3],
+ }
+ ),
+ total_ordering_columns=["col1"],
+ )
+ col1 = value.get_column("col1")
+ assert isinstance(col1, ibis_types.Value)
+ assert col1.get_name() == "col1"
+ assert col1.type().is_int64()
+
+
def test_arrayvalue_to_ibis_expr_with_projection():
value = resources.create_arrayvalue(
pandas.DataFrame(
@@ -69,3 +108,133 @@ def test_arrayvalue_to_ibis_expr_with_projection():
assert actual.columns[0] == "int64_col"
assert actual.columns[1] == "literals"
assert actual.columns[2] == "string_col"
+
+
+def test_arrayvalues_to_ibis_expr_with_get_column():
+ value = resources.create_arrayvalue(
+ pandas.DataFrame(
+ {
+ "col1": [1, 2, 3],
+ "col2": ["a", "b", "c"],
+ "col3": [0.1, 0.2, 0.3],
+ }
+ ),
+ total_ordering_columns=["col1"],
+ )
+ expr = value.get_column("col1")
+ assert expr.get_name() == "col1"
+ assert expr.type().is_int64()
+
+
+def test_arrayvalues_to_ibis_expr_with_concat():
+ value = resources.create_arrayvalue(
+ pandas.DataFrame(
+ {
+ "col1": [1, 2, 3],
+ "col2": ["a", "b", "c"],
+ "col3": [0.1, 0.2, 0.3],
+ }
+ ),
+ total_ordering_columns=["col1"],
+ )
+ expr = value.concat([value])
+ actual = expr.to_ibis_expr()
+ assert len(actual.columns) == 3
+ # TODO(ashleyxu, b/299631930): test out the union expression
+ assert actual.columns[0] == "column_0"
+ assert actual.columns[1] == "column_1"
+ assert actual.columns[2] == "column_2"
+
+
+def test_arrayvalues_to_ibis_expr_with_project_unary_op():
+ value = resources.create_arrayvalue(
+ pandas.DataFrame(
+ {
+ "col1": [1, 2, 3],
+ "col2": ["a", "b", "c"],
+ "col3": [0.1, 0.2, 0.3],
+ }
+ ),
+ total_ordering_columns=["col1"],
+ )
+ expr = value.project_unary_op("col1", ops.AsTypeOp("string"))
+ assert value.columns[0].type().is_int64()
+ assert expr.columns[0].type().is_string()
+
+
+def test_arrayvalues_to_ibis_expr_with_project_binary_op():
+ value = resources.create_arrayvalue(
+ pandas.DataFrame(
+ {
+ "col1": [1, 2, 3],
+ "col2": [0.2, 0.3, 0.4],
+ "col3": [0.1, 0.2, 0.3],
+ }
+ ),
+ total_ordering_columns=["col1"],
+ )
+ expr = value.project_binary_op("col2", "col3", ops.add_op, "col4")
+ assert expr.columns[3].type().is_float64()
+ actual = expr.to_ibis_expr()
+ assert len(expr.columns) == 4
+ assert actual.columns[3] == "col4"
+
+
+def test_arrayvalues_to_ibis_expr_with_project_ternary_op():
+ value = resources.create_arrayvalue(
+ pandas.DataFrame(
+ {
+ "col1": [1, 2, 3],
+ "col2": [0.2, 0.3, 0.4],
+ "col3": [True, False, False],
+ "col4": [0.1, 0.2, 0.3],
+ }
+ ),
+ total_ordering_columns=["col1"],
+ )
+ expr = value.project_ternary_op("col2", "col3", "col4", ops.where_op, "col5")
+ assert expr.columns[4].type().is_float64()
+ actual = expr.to_ibis_expr()
+ assert len(expr.columns) == 5
+ assert actual.columns[4] == "col5"
+
+
+def test_arrayvalue_to_ibis_expr_with_aggregate():
+ value = resources.create_arrayvalue(
+ pandas.DataFrame(
+ {
+ "col1": [1, 2, 3],
+ "col2": ["a", "b", "c"],
+ "col3": [0.1, 0.2, 0.3],
+ }
+ ),
+ total_ordering_columns=["col1"],
+ )
+ expr = value.aggregate(
+ aggregations=(("col1", agg_ops.sum_op, "col4"),),
+ by_column_ids=["col1"],
+ dropna=False,
+ )
+ actual = expr.to_ibis_expr()
+ assert len(expr.columns) == 2
+ assert actual.columns[0] == "col1"
+ assert actual.columns[1] == "col4"
+ assert expr.columns[1].type().is_int64()
+
+
+def test_arrayvalue_to_ibis_expr_with_corr_aggregate():
+ value = resources.create_arrayvalue(
+ pandas.DataFrame(
+ {
+ "col1": [1, 2, 3],
+ "col2": ["a", "b", "c"],
+ "col3": [0.1, 0.2, 0.3],
+ }
+ ),
+ total_ordering_columns=["col1"],
+ )
+ expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")])
+ actual = expr.to_ibis_expr()
+ assert len(expr.columns) == 1
+ assert actual.columns[0] == "col4"
+ assert expr.columns[0].type().is_float64()
diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py
index d209284ab7..a4e61ca0f9 100644
--- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py
+++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py
@@ -12,8 +12,20 @@ def _approx_quantiles(translator, op: vendored_ibis_ops.ApproximateMultiQuantile
return f"APPROX_QUANTILES({arg}, {num_bins})"
+def _first_non_null_value(translator, op: vendored_ibis_ops.FirstNonNullValue):
+ arg = translator.translate(op.arg)
+ return f"FIRST_VALUE({arg} IGNORE NULLS)"
+
+
+def _last_non_null_value(translator, op: vendored_ibis_ops.LastNonNullValue):
+ arg = translator.translate(op.arg)
+ return f"LAST_VALUE({arg} IGNORE NULLS)"
+
+
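+# Unlike plain FIRST_VALUE/LAST_VALUE, these translations emit an IGNORE NULLS
+# clause, so the window yields the first/last non-null value in the frame rather
+# than the raw boundary row (the behaviour that ffill()/bfill()-style fills need).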
patched_ops = {
vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles,
+ vendored_ibis_ops.FirstNonNullValue: _first_non_null_value,
+ vendored_ibis_ops.LastNonNullValue: _last_non_null_value,
}
OPERATION_REGISTRY.update(patched_ops)
diff --git a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py
index f3ab753a3b..1612d9c12e 100644
--- a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py
+++ b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py
@@ -1,4 +1,5 @@
# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/__init__.py
from __future__ import annotations
+from third_party.bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F403
from third_party.bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F403
diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py
new file mode 100644
index 0000000000..038987cac9
--- /dev/null
+++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py
@@ -0,0 +1,26 @@
+# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/analytic.py
+
+from __future__ import annotations
+
+from ibis.expr.operations.analytic import Analytic
+import ibis.expr.rules as rlz
+
+
+class FirstNonNullValue(Analytic):
+ """Retrieve the first element."""
+
+ arg = rlz.column(rlz.any)
+ output_dtype = rlz.dtype_like("arg")
+
+
+class LastNonNullValue(Analytic):
+ """Retrieve the last element."""
+
+ arg = rlz.column(rlz.any)
+ output_dtype = rlz.dtype_like("arg")
+
+
+__all__ = [
+ "FirstNonNullValue",
+ "LastNonNullValue",
+]
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 8c81b23b6c..113c6547a0 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -13,7 +13,7 @@
from typing import Iterable, Literal, Mapping, Optional, Sequence, Union
-import numpy
+import numpy as np
from bigframes import constants
from third_party.bigframes_vendored.pandas.core.generic import NDFrame
@@ -56,7 +56,7 @@ def axes(self) -> list:
return [self.index, self.columns]
@property
- def values(self) -> numpy.ndarray:
+ def values(self) -> np.ndarray:
"""Return the values of DataFrame in the form of a NumPy array.
Args:
@@ -72,9 +72,7 @@ def values(self) -> numpy.ndarray:
# ----------------------------------------------------------------------
# IO methods (to / from other formats)
- def to_numpy(
- self, dtype=None, copy=False, na_value=None, **kwargs
- ) -> numpy.ndarray:
+ def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray:
"""
Convert the DataFrame to a NumPy array.
@@ -154,6 +152,250 @@ def to_parquet(
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+ def to_dict(
+ self,
+ orient: Literal[
+ "dict", "list", "series", "split", "tight", "records", "index"
+ ] = "dict",
+ into: type[dict] = dict,
+ **kwargs,
+ ) -> dict | list[dict]:
+ """
+ Convert the DataFrame to a dictionary.
+
+ The type of the key-value pairs can be customized with the parameters
+ (see below).
+
+ Args:
+ orient (str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}):
+ Determines the type of the values of the dictionary.
+ 'dict' (default) : dict like {column -> {index -> value}}.
+ 'list' : dict like {column -> [values]}.
+ 'series' : dict like {column -> Series(values)}.
+                'split' : dict like {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}.
+ 'tight' : dict like {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
+ 'index_names' -> [index.names], 'column_names' -> [column.names]}.
+ 'records' : list like [{column -> value}, ... , {column -> value}].
+ 'index' : dict like {index -> {column -> value}}.
+ into (class, default dict):
+ The collections.abc.Mapping subclass used for all Mappings
+ in the return value. Can be the actual class or an empty
+ instance of the mapping type you want. If you want a
+ collections.defaultdict, you must pass it initialized.
+
+ index (bool, default True):
+ Whether to include the index item (and index_names item if `orient`
+ is 'tight') in the returned dictionary. Can only be ``False``
+ when `orient` is 'split' or 'tight'.
+
+ Returns:
+ dict or list of dict: Return a collections.abc.Mapping object representing the DataFrame.
+ The resulting transformation depends on the `orient` parameter.
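+
+        Examples:
+            A minimal sketch of the intended usage; the frame contents and the
+            values shown are illustrative only:
+
+                import bigframes.pandas as bpd
+
+                df = bpd.DataFrame({"col1": [1, 2], "col2": [0.5, 0.75]})
+                df.to_dict()
+                # roughly: {'col1': {0: 1, 1: 2}, 'col2': {0: 0.5, 1: 0.75}}
+                df.to_dict(orient="records")
+                # roughly: [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]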
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def to_excel(self, excel_writer, sheet_name: str = "Sheet1", **kwargs) -> None:
+ """
+ Write DataFrame to an Excel sheet.
+
+ To write a single DataFrame to an Excel .xlsx file it is only necessary to
+ specify a target file name. To write to multiple sheets it is necessary to
+ create an `ExcelWriter` object with a target file name, and specify a sheet
+ in the file to write to.
+
+ Multiple sheets may be written to by specifying unique `sheet_name`.
+ With all data written to the file it is necessary to save the changes.
+ Note that creating an `ExcelWriter` object with a file name that already
+ exists will result in the contents of the existing file being erased.
+
+ Args:
+ excel_writer (path-like, file-like, or ExcelWriter object):
+ File path or existing ExcelWriter.
+ sheet_name (str, default 'Sheet1'):
+ Name of sheet which will contain DataFrame.
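+
+        Examples:
+            A minimal sketch, assuming an in-memory buffer as the target and an
+            installed Excel engine such as ``openpyxl``:
+
+                import io
+
+                import bigframes.pandas as bpd
+
+                df = bpd.DataFrame({"col1": [1, 2], "col2": [0.5, 0.75]})
+                buffer = io.BytesIO()
+                df.to_excel(buffer, sheet_name="data")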
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def to_latex(
+ self, buf=None, columns=None, header=True, index=True, **kwargs
+ ) -> str | None:
+ r"""
+ Render object to a LaTeX tabular, longtable, or nested table.
+
+        Requires ``\usepackage{booktabs}``. The output can be copy/pasted
+        into a main LaTeX document or read from an external file
+        with ``\input{table.tex}``.
+
+ Args:
+ buf (str, Path or StringIO-like, optional, default None):
+ Buffer to write to. If None, the output is returned as a string.
+ columns (list of label, optional):
+ The subset of columns to write. Writes all columns by default.
+ header (bool or list of str, default True):
+ Write out the column names. If a list of strings is given,
+ it is assumed to be aliases for the column names.
+ index (bool, default True):
+ Write row names (index).
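+
+        Examples:
+            Illustrative usage; rendering the returned string requires the
+            ``booktabs`` LaTeX package:
+
+                import bigframes.pandas as bpd
+
+                df = bpd.DataFrame({"col1": [1, 2], "col2": [0.5, 0.75]})
+                latex_table = df.to_latex(index=False)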
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def to_records(
+ self, index: bool = True, column_dtypes=None, index_dtypes=None
+ ) -> np.recarray:
+ """
+ Convert DataFrame to a NumPy record array.
+
+ Index will be included as the first field of the record array if
+ requested.
+
+ Args:
+ index (bool, default True):
+ Include index in resulting record array, stored in 'index'
+ field or using the index label, if set.
+ column_dtypes (str, type, dict, default None):
+ If a string or type, the data type to store all columns. If
+ a dictionary, a mapping of column names and indices (zero-indexed)
+ to specific data types.
+ index_dtypes (str, type, dict, default None):
+ If a string or type, the data type to store all index levels. If
+ a dictionary, a mapping of index level names and indices
+ (zero-indexed) to specific data types.
+
+ This mapping is applied only if `index=True`.
+
+ Returns:
+ np.recarray: NumPy ndarray with the DataFrame labels as fields and each row
+ of the DataFrame as entries.
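+
+        Examples:
+            A sketch of typical usage; the record field names depend on the
+            frame's index and column labels:
+
+                import bigframes.pandas as bpd
+
+                df = bpd.DataFrame({"col1": [1, 2], "col2": [0.5, 0.75]})
+                rec = df.to_records(index=True)
+                rec.dtype.names  # e.g. ('index', 'col1', 'col2')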
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def to_string(
+ self,
+ buf=None,
+ columns: Sequence[str] | None = None,
+ col_space=None,
+ header: bool | Sequence[str] = True,
+ index: bool = True,
+ na_rep: str = "NaN",
+ formatters=None,
+ float_format=None,
+ sparsify: bool | None = None,
+ index_names: bool = True,
+ justify: str | None = None,
+ max_rows: int | None = None,
+ max_cols: int | None = None,
+ show_dimensions: bool = False,
+ decimal: str = ".",
+ line_width: int | None = None,
+ min_rows: int | None = None,
+ max_colwidth: int | None = None,
+ encoding: str | None = None,
+ ):
+ """Render a DataFrame to a console-friendly tabular output.
+
+ Args:
+ buf (str, Path or StringIO-like, optional, default None):
+ Buffer to write to. If None, the output is returned as a string.
+ columns (sequence, optional, default None):
+ The subset of columns to write. Writes all columns by default.
+ col_space (int, list or dict of int, optional):
+ The minimum width of each column.
+ header (bool or sequence, optional):
+ Write out the column names. If a list of strings is given, it is assumed to be aliases for the column names.
+ index (bool, optional, default True):
+ Whether to print index (row) labels.
+ na_rep (str, optional, default 'NaN'):
+                String representation of NaN to use.
+ formatters (list, tuple or dict of one-param. functions, optional):
+ Formatter functions to apply to columns' elements by position or
+ name.
+ The result of each function must be a unicode string.
+ List/tuple must be of length equal to the number of columns.
+ float_format (one-parameter function, optional, default None):
+ Formatter function to apply to columns' elements if they are
+ floats. The result of this function must be a unicode string.
+ sparsify (bool, optional, default True):
+ Set to False for a DataFrame with a hierarchical index to print
+ every multiindex key at each row.
+ index_names (bool, optional, default True):
+ Prints the names of the indexes.
+ justify (str, default None):
+ How to justify the column labels. If None uses the option from
+ the print configuration (controlled by set_option), 'right' out
+                of the box. Valid values are 'left', 'right', 'center', 'justify',
+ 'justify-all', 'start', 'end', 'inherit', 'match-parent', 'initial',
+ 'unset'.
+ max_rows (int, optional):
+ Maximum number of rows to display in the console.
+ min_rows (int, optional):
+ The number of rows to display in the console in a truncated repr
+ (when number of rows is above `max_rows`).
+ max_cols (int, optional):
+ Maximum number of columns to display in the console.
+ show_dimensions (bool, default False):
+ Display DataFrame dimensions (number of rows by number of columns).
+ decimal (str, default '.'):
+ Character recognized as decimal separator, e.g. ',' in Europe.
+ line_width (int, optional):
+ Width to wrap a line in characters.
+ max_colwidth (int, optional):
+ Max width to truncate each column in characters. By default, no limit.
+ encoding (str, default "utf-8"):
+ Set character encoding.
+
+ Returns:
+ str or None: If buf is None, returns the result as a string. Otherwise returns
+ None.
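+
+        Examples:
+            A sketch of typical usage; the options shown are illustrative:
+
+                import bigframes.pandas as bpd
+
+                df = bpd.DataFrame({"col1": [1, 2], "col2": [0.5, 0.75]})
+                print(df.to_string(max_rows=10, na_rep="<NA>"))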
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def to_markdown(
+ self,
+ buf=None,
+ mode: str = "wt",
+ index: bool = True,
+ **kwargs,
+ ):
+ """Print DataFrame in Markdown-friendly format.
+
+ Args:
+ buf (str, Path or StringIO-like, optional, default None):
+ Buffer to write to. If None, the output is returned as a string.
+ mode (str, optional):
+ Mode in which file is opened.
+ index (bool, optional, default True):
+ Add index (row) labels.
+ **kwargs
+ These parameters will be passed to `tabulate