From 59bc7be9891ac12926ac44531d9dbd4a88bcfc44 Mon Sep 17 00:00:00 2001 From: "gcf-owl-bot[bot]" <78513119+gcf-owl-bot[bot]@users.noreply.github.com> Date: Thu, 19 Oct 2023 17:02:14 +0000 Subject: [PATCH 01/15] chore: rename rst files to avoid conflict with service names (#120) Source-Link: https://togithub.com/googleapis/synthtool/commit/d52e638b37b091054c869bfa6f5a9fedaba9e0dd Post-Processor: gcr.io/cloud-devrel-public-resources/owlbot-python:latest@sha256:4f9b3b106ad0beafc2c8a415e3f62c1a0cc23cabea115dbe841b848f581cfe99 --- .github/.OwlBot.lock.yaml | 4 ++-- .kokoro/requirements.txt | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index dd98abbdee..7f291dbd5f 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:08e34975760f002746b1d8c86fdc90660be45945ee6d9db914d1508acdf9a547 -# created: 2023-10-09T14:06:13.397766266Z + digest: sha256:4f9b3b106ad0beafc2c8a415e3f62c1a0cc23cabea115dbe841b848f581cfe99 +# created: 2023-10-18T20:26:37.410353675Z diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index 0332d3267e..16170d0ca7 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -467,9 +467,9 @@ typing-extensions==4.4.0 \ --hash=sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa \ --hash=sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e # via -r requirements.in -urllib3==1.26.17 \ - --hash=sha256:24d6a242c28d29af46c3fae832c36db3bbebcc533dd1bb549172cd739c82df21 \ - --hash=sha256:94a757d178c9be92ef5539b8840d48dc9cf1b2709c9d6b588232a055c524458b +urllib3==1.26.18 \ + --hash=sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07 \ + --hash=sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0 # via # requests # twine From c9c46d45eca536d1fc2dfed0cce25bfb41a41554 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 19 Oct 2023 13:08:14 -0500 Subject: [PATCH 02/15] chore: remove unneeded scripts, no longer need THIRD_PARTY_NOTICES (#121) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 306233684 🦕 --- .kokoro/release-nightly.sh | 44 +--- scripts/generate_third_party_notices.py | 332 ------------------------ scripts/update_firebase_docs_site.sh | 105 -------- scripts/update_x20_docs_site.sh | 106 -------- 4 files changed, 1 insertion(+), 586 deletions(-) delete mode 100644 scripts/generate_third_party_notices.py delete mode 100644 scripts/update_firebase_docs_site.sh delete mode 100644 scripts/update_x20_docs_site.sh diff --git a/.kokoro/release-nightly.sh b/.kokoro/release-nightly.sh index 488dbb9e13..5cc1275308 100755 --- a/.kokoro/release-nightly.sh +++ b/.kokoro/release-nightly.sh @@ -63,26 +63,6 @@ export PYTHONUNBUFFERED=1 # Install dependencies, as the following steps depend on it python3.10 -m pip install -e .[all] -# Generate third party notices and include it in the licenses in setup.cfg -# TODO(shobs): Don't include it in the package once vertex colab can pick it -# from elsewhere -THIRD_PARTY_NOTICES_FILE=THIRD_PARTY_NOTICES -python3.10 -m pip install pip-licenses -python3.10 scripts/generate_third_party_notices.py --output-file ${THIRD_PARTY_NOTICES_FILE} -if ! [ -s ${THIRD_PARTY_NOTICES_FILE} ]; then - echo "${THIRD_PARTY_NOTICES_FILE} was generated with zero size" - exit -1 -fi -SETUP_CFG_BKP=`mktemp` -cp -f setup.cfg ${SETUP_CFG_BKP} -cat >> setup.cfg << EOF - -[metadata] -license_files = - LICENSE - ${THIRD_PARTY_NOTICES_FILE} -EOF - # Update version string to include git hash and date CURRENT_DATE=$(date '+%Y%m%d') GIT_HASH=$(git rev-parse --short HEAD) @@ -101,33 +81,13 @@ if [ $num_wheel_files -ne 1 ] ; then exit -1 fi -# Make sure the wheel file has the third party notices included -# TODO(shobs): An utimate validation would be to create a virtual environment -# and install the wheel file, then verify that -# site-packages/bigframes-*.dist-info/ includes third party notices -python3.10 -c " -from zipfile import ZipFile -with ZipFile('$VERSION_WHEEL') as myzip: - third_party_licenses_info = [ - info - for info in myzip.infolist() - if info.filename.endswith('.dist-info/${THIRD_PARTY_NOTICES_FILE}') - ] - assert ( - len(third_party_licenses_info) == 1 - ), f'Found {len(third_party_licenses_info)} third party licenses' - assert ( - third_party_licenses_info[0].file_size > 0 - ), 'Package contains third party license of size 0' -" - # Create a copy of the wheel with a well known, version agnostic name LATEST_WHEEL=dist/bigframes-latest-py2.py3-none-any.whl cp $VERSION_WHEEL $LATEST_WHEEL cp dist/bigframes-*.tar.gz dist/bigframes-latest.tar.gz if ! [ ${DRY_RUN} ]; then - for gcs_path in gs://vertex_sdk_private_releases/bigframe/ \ +for gcs_path in gs://vertex_sdk_private_releases/bigframe/ \ gs://dl-platform-colab/bigframes/ \ gs://bigframes-wheels/; do @@ -155,8 +115,6 @@ fi # the changes were made but before this cleanup, because the script would # terminate with the failure itself. See if we can ensure the cleanup. sed -i -e "s/$RELEASE_VERSION/$BIGFRAMES_VERSION/g" bigframes/version.py -mv -f ${SETUP_CFG_BKP} setup.cfg -rm -f ${THIRD_PARTY_NOTICES_FILE} if ! 
[ ${DRY_RUN} ]; then # Copy docs and wheels to Google Drive diff --git a/scripts/generate_third_party_notices.py b/scripts/generate_third_party_notices.py deleted file mode 100644 index 7040bb2e5f..0000000000 --- a/scripts/generate_third_party_notices.py +++ /dev/null @@ -1,332 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import glob -import importlib.metadata -import json -import os.path -import re -import sys - -import piplicenses -import requests - -DEPENDENCY_INFO_SEPARATOR = "*" * 80 + "\n" -PACKAGE_NAME_EXTRACTOR = re.compile("^[a-zA-Z0-9._-]+") - -# These packages don't have LICENSE files distributed in their packages, -# but we have manually confirmed they have a compatible license and -# included it manually in our `third_party` directory. -# -# TODO(swast): We can remove this workaround once these packages bundle the -# license file. -# -# ipython-genutils and recommonmark are both in an archived state with no likely updates in the future -# -# Tracking issues: -# * https://github.com/grpc/grpc/issues/33557 -# * https://github.com/gsnedders/python-webencodings/issues/33 -# * https://github.com/pickleshare/pickleshare/issues/34 -DIRECT_LICENSE_MAPPINGS = { - "grpcio-status": "https://raw.githubusercontent.com/grpc/grpc/master/LICENSE", - "webencodings": "https://raw.githubusercontent.com/gsnedders/python-webencodings/master/LICENSE", - "ipython-genutils": "https://raw.githubusercontent.com/ipython/ipython_genutils/master/COPYING.md", - "pickleshare": "https://raw.githubusercontent.com/pickleshare/pickleshare/master/LICENSE", - "recommonmark": "https://raw.githubusercontent.com/readthedocs/recommonmark/master/license.md", -} - - -def get_package_dependencies(pkg_name): - """Get all package dependencies for a given package, both required and optional.""" - packages = set() - requirements = importlib.metadata.requires(pkg_name) - if requirements: - for req in requirements: - match = PACKAGE_NAME_EXTRACTOR.match(req) - assert match, f"Could not parse {req} for package name" - packages.add(match.group(0)) - return packages - - -# Inspired by third_party/colab/cleanup_filesets.py -def find_dependencies( - roots: set[str], ignore_missing_metadata=False -) -> dict[str, dict[str, set[str]]]: - """Return the transitive dependencies of a set of packages. - Args: - roots: List of package names, e.g. ["pkg1", "pkg2"] - Returns: - A dictionary of dependencies, e.g. - { - "pkg3" : { - "Requires" : set(["pkg4", "pkg5", "pkg6"]), - "RequiredBy": set(["pkg1"]) - }, - "pkg4" : { - "Requires" : set([]), - "RequiredBy": set(["pkg3"]) - }, - ... 
- } - """ - hops = set() - visited = set() - deps: dict[str, dict[str, set[str]]] = dict() - - # Initialize the start of the graph walk - for root in roots: - # Get the normalized package name - try: - pkg = importlib.metadata.metadata(root) - except importlib.metadata.PackageNotFoundError: - if not ignore_missing_metadata: - raise - continue - hops.add(pkg["Name"]) - - # Start the graph walk - while True: - if not hops: - break - hop = hops.pop() - if hop in visited: - continue - visited.add(hop) - - for dep in get_package_dependencies(hop): - # Get the normalized package name - try: - req_pkg = importlib.metadata.metadata(dep) - except importlib.metadata.PackageNotFoundError: - if not ignore_missing_metadata: - raise - continue - dep = req_pkg["Name"] - - # Create outgoing edge only for non root packages, for which an - # entry must have been created in the deps dictionary when we - # saw the package for the first time during the graph walk - if hop in deps: - deps[hop]["Requires"].add(dep) - - if dep in deps: - # We have already seen this requirement in the graph walk. - # Just update the incoming dependency and carry on. - deps[dep]["RequiredBy"].add(hop) - else: - # This is the first time we came across this requirement. - # Create a new entry with the incoming dependency. - deps[dep] = {"RequiredBy": {hop}, "Requires": set()} - - # Put it in the next hops for further graph traversal - hops.add(dep) - - return deps - - -def get_metadata_and_filename( - package_name: str, - metadata_name: str, - metadata_file: str, - metadata_text: str, - ignore_missing=True, -) -> tuple[str, str] | None: - """Get package metadata and corresponsing file name.""" - - # Check metadata file - metadata_filepath_known = metadata_file != piplicenses.LICENSE_UNKNOWN - if not metadata_filepath_known and not ignore_missing: - raise ValueError(f"No {metadata_name} file found for {package_name}") - - # Check metadata text - if metadata_text != piplicenses.LICENSE_UNKNOWN: - output_filename = metadata_name - if metadata_filepath_known: - output_filename = os.path.basename(metadata_file) - if not output_filename: - raise ValueError( - f"Need a file name to write {metadata_name} text for {package_name}." - ) - return metadata_text, output_filename - elif not ignore_missing: - raise ValueError(f"No {metadata_name} text found for {package_name}") - - return None - - -def fetch_license_and_notice_metadata(packages: list[str]): - """Fetch metadata including license and notice for given packages. - Returns a json object. - """ - parser = piplicenses.create_parser() - args = parser.parse_args( - [ - "--format", - "json", - "--with-license-file", - "--with-notice-file", - "--with-urls", - "--with-description", - "--packages", - *packages, - ] - ) - output_str = piplicenses.create_output_string(args) - metadatas = json.loads(output_str) - return metadatas - - -def write_lines_without_trailing_spaces(file, text: str, key: str): - """Write text lines to a file without the trailing spaces. 
- This will stop complaints by the trailing-whitespace pre-commit hook.""" - text = "\n".join([line.rstrip() for line in text.split("\n")]) - file.write(f"{key}:\n{text}\n") - - -def write_metadata_to_file( - file, metadata, with_version=False, requires_packages=[], packages_required_by=[] -): - """Write package metadata to a file object.""" - file.write(DEPENDENCY_INFO_SEPARATOR) - - info_keys = ["Name"] - if with_version: - info_keys.append("Version") - info_keys.extend(["License", "URL"]) - file.writelines([f"{key}: {metadata[key]}\n" for key in info_keys]) - - if requires_packages: - file.write(f"Requires: {', '.join(sorted(requires_packages))}\n") - - if packages_required_by: - file.write(f"Required By: {', '.join(sorted(packages_required_by))}\n") - - # Try to generate third party license - - license_info = get_metadata_and_filename( - metadata["Name"], - "LICENSE", - metadata["LicenseFile"], - metadata["LicenseText"], - ignore_missing=metadata["Name"] in DIRECT_LICENSE_MAPPINGS, - ) - - license_text = "" - if license_info: - license_text = license_info[0] - else: - license_text_response = requests.get(DIRECT_LICENSE_MAPPINGS[metadata["Name"]]) - license_text = license_text_response.text - - write_lines_without_trailing_spaces(file, license_text, "License") - - # Try to generate third party notice - notice_info = get_metadata_and_filename( - metadata["Name"], - "NOTICE", - metadata["NoticeFile"], - metadata["NoticeText"], - ignore_missing=True, - ) - - if notice_info: - write_lines_without_trailing_spaces(file, notice_info[0], "Notice") - - file.write(DEPENDENCY_INFO_SEPARATOR) - - -def write_third_party_vendored_license(file, path): - """Write license of a vendored third party library to notices file.""" - file.write(DEPENDENCY_INFO_SEPARATOR) - file.write(f"Vendored Code: {os.path.dirname(path)}\n") - notice_key = f"Notice ({os.path.basename(path)})" - write_lines_without_trailing_spaces(file, open(path).read(), notice_key) - file.write(DEPENDENCY_INFO_SEPARATOR) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Generate third party notices for bigframes dependencies." - ) - parser.add_argument( - "--with-version", - action="store_true", - default=False, - help="Include the version information for each package.", - ) - parser.add_argument( - "--with-requires", - action="store_true", - default=False, - help="Include for each package the packages it requires.", - ) - parser.add_argument( - "--with-required-by", - action="store_true", - default=False, - help="Include for each package the packages that require it.", - ) - parser.add_argument( - "--output-file", - action="store", - default="THIRD_PARTY_NOTICES", - help="The output file to write third party notices in.", - ) - args = parser.parse_args(sys.argv[1:]) - - # Initialize the root package - roots = {"bigframes"} - - # Find dependencies - # Let's ignore the packages that are not installed assuming they are - # just the optional dependencies that bigframes does not require. 
- # One example is the dependency path bigframes -> SQLAlchemy -> pg8000, - # where pg8000 is only an optional dependency for SQLAlchemy which bigframes - # is not depending on - # https://github.com/sqlalchemy/sqlalchemy/blob/7bc81947e22dc32368b0c49a41c398cd251d94af/setup.cfg#LL62C21-L62C27 - deps = find_dependencies(roots, ignore_missing_metadata=True) - - # Use third party solution to fetch dependency metadata - deps_metadata = fetch_license_and_notice_metadata(list(deps)) - deps_metadata = sorted(deps_metadata, key=lambda m: m["Name"]) - - # Write the file - with open(args.output_file, "w") as f: - # Generate third party metadata for each dependency - for metadata in deps_metadata: - dep = deps[metadata["Name"]] - write_metadata_to_file( - f, - metadata, - args.with_version, - dep["Requires"] if args.with_requires else [], - dep["RequiredBy"] if args.with_required_by else [], - ) - - # Generate third party vendored notices - notices = set() - for filename in [ - "LICENCE", - "LICENCE.txt", - "LICENSE", - "LICENSE.txt", - "NOTICE", - "NOTICE.txt", - "COPYING", - "COPYING.txt", - ]: - notices.update(glob.glob(f"third_party/bigframes_vendored/*/{filename}")) - for path in sorted(notices): - write_third_party_vendored_license(f, path) diff --git a/scripts/update_firebase_docs_site.sh b/scripts/update_firebase_docs_site.sh deleted file mode 100644 index f0ef866c90..0000000000 --- a/scripts/update_firebase_docs_site.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e -o pipefail -set -x - -# Replace the docs version by the desired commit -BIGFRAMES_DOCS_VERSION=8ce2a3e - -BIGFRAMES_FIREBASE_PROJECT=bigframes-dev-d4d9a -BIGFRAMES_DOCS_GCS_BUCKET=bigframes-docs - -BIGFRAMES_DOCS_DIR=`mktemp -d` - - -# Install firebase if not already installed -if ! which firebase; then - npm install -g firebase-tools -fi - -# Prepare a working directory for firebase -mkdir -p ${BIGFRAMES_DOCS_DIR} -pushd ${BIGFRAMES_DOCS_DIR} - -# Copy the bigframes version -if [ ! -d ${BIGFRAMES_DOCS_VERSION} ]; then - gsutil -m cp -r gs://${BIGFRAMES_DOCS_GCS_BUCKET}/${BIGFRAMES_DOCS_VERSION} . -fi - -rm -f latest -ln -s ${BIGFRAMES_DOCS_VERSION} latest - -# Set up firebase -firebase login --no-localhost -firebase init hosting - -versions="${BIGFRAMES_DOCS_VERSION} latest" -for version in ${versions}; do - site_name=bigframes-docs-${version} - if ! 
firebase hosting:sites:list | grep ${site_name}; then - firebase hosting:sites:create ${site_name} - fi - - firebase target:apply hosting ${version} ${site_name} -done - -# Make sure the firebase json config is consistent with ${versions} -# TODO(shobs): Come up with a better way of updating the config than -# a hard overwrite -cat > firebase.json << EOF -{ - "hosting": [ - { - "target": "latest", - "public": "latest", - "ignore": [ - "firebase.json", - "**/.*", - "**/node_modules/**" - ] - } - , - { - "target": "${BIGFRAMES_DOCS_VERSION}", - "public": "${BIGFRAMES_DOCS_VERSION}", - "ignore": [ - "firebase.json", - "**/.*", - "**/node_modules/**" - ] - } - ] -} -EOF - -# Verify that the intended sites look good -for version in ${versions}; do - echo "Preview the local hosting of the docs site \"${version}\" before actually deploying (Press Ctrl+C to stop)" - firebase serve --only hosting:${version} -done - -echo -n "Go ahead and deploy? [y/N]: " -read deploy_consent - -# Deploy the sites -if [ "$deploy_consent" = y ]; then - echo "Deploying ..." - firebase deploy --only hosting -else - echo "Not Deploying anything." -fi - -popd diff --git a/scripts/update_x20_docs_site.sh b/scripts/update_x20_docs_site.sh deleted file mode 100644 index 31da116bdd..0000000000 --- a/scripts/update_x20_docs_site.sh +++ /dev/null @@ -1,106 +0,0 @@ -#!/bin/bash -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -################################################################################ -# This script uses corp credentials to sync the files to x20. -# Make sure you: -# 1. Have write permission to /x20/teams/bigframes-swe -# 2. Have run `gcert` before running the script -################################################################################ - -set -e -o pipefail -set -x - -GIT_DOCS_DIR=docs/_build/html/ -X20_BIGFRAMES_DIR=/x20/teams/bigframes-swe/bigframes/docs -MAX_BACKFILL=10 -REQ_PYTHON_VERSION=3.9 - -# Create a temporary directory -tempdir=`mktemp --directory --tmpdir bigframes.XXXXXXXXXX` - -# Clone the repository -git clone sso://team/bigquery-query-swe/bigframes ${tempdir} - -# Enter the temporary bigframes directory -pushd ${tempdir} - -# Python version 3.9 is required to build bigframes docs, install if not present -if ! 
python3 --version | grep ${REQ_PYTHON_VERSION}; then - # Install pyenv to install the required python version - ## https://github.com/pyenv/pyenv#basic-github-checkout - git clone https://github.com/pyenv/pyenv.git .pyenv - pushd .pyenv && src/configure && make -C src && popd - - ## https://github.com/pyenv/pyenv#set-up-your-shell-environment-for-pyenv - export PYENV_ROOT=${PWD}/.pyenv - PATH=${PYENV_ROOT}/bin:${PATH} - eval "$(pyenv init -)" - - ## Install the required python version - pyenv install ${REQ_PYTHON_VERSION} - - ## Make the required python version available - pyenv global ${REQ_PYTHON_VERSION} -fi - -# Create a virtual environment with nox installed -python3 -m venv venv -source venv/bin/activate -pip install nox - -# i = 0 means docs for the latest version, and i = 1 onwards means backfill -for i in `seq 0 ${MAX_BACKFILL}`; do - # If it is backfill turn, back off the version by 1 - if [ ${i} -ne 0 ]; then - git reset --hard HEAD~1 - - # Clean up any old docs - rm -rf ${GIT_DOCS_DIR} - fi - - # Construct a docs path in x20 - commit_hash=`git rev-parse --short HEAD` - x20_docs_dir_commit=${X20_BIGFRAMES_DIR}/${commit_hash} - - # If the x20 docs path already exists, let's assume that it was created - # properly in the previous attempt - if fileutil test -d ${x20_docs_dir_commit}; then - echo ${x20_docs_dir_commit} exists, skipping rebuilding it.. - continue - fi - - # Build the docs - echo Building docs for commit ${commit_hash}.. - nox -s docs - - # TODO(shobs): Check if a symlink can be created instead of another copy of - # the latest commit's docs, using fileutil CLI or otherwise - x20_docs_dirs=${x20_docs_dir_commit} - if [ ${i} -eq 0 ]; then - x20_docs_dirs="${x20_docs_dirs} ${X20_BIGFRAMES_DIR}/latest" - fi - - for x20_docs_dir in ${x20_docs_dirs}; do - fileutil mirror -parallelism=4 -force ${GIT_DOCS_DIR} ${x20_docs_dir} - x20_own request_change --recursive --path=${x20_docs_dir} --new_mode=a+r - done -done - -# Exit the temporary bigframes directory -popd - -# Clean up the temporary bigframes directory -rm -rf ${tempdir} From 694a85a0ef90d838700014a204d72b23362db1d8 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 20 Oct 2023 10:52:49 -0500 Subject: [PATCH 03/15] feat: add back `reset_session` as an alias for `close_session` (#124) --- bigframes/pandas/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 1b9144fb62..24b19fa70a 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -452,6 +452,7 @@ def read_gbq_function(function_name: str): # Session management APIs get_global_session = global_session.get_global_session close_session = global_session.close_session +reset_session = global_session.close_session # Use __all__ to let type checkers know what is part of the public API. From f9bb3c4bc88c5ba2be6f17e12a0ec4f482ce161f Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Fri, 20 Oct 2023 13:48:13 -0700 Subject: [PATCH 04/15] feat: change `query` parameter to `query_or_table` in `read_gbq` (#127) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! 
Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/pandas/__init__.py | 6 +++--- bigframes/session/__init__.py | 8 ++++---- third_party/bigframes_vendored/pandas/io/gbq.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 24b19fa70a..971d40f801 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -290,16 +290,16 @@ def read_json( def read_gbq( - query: str, + query_or_table: str, *, index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, ) -> bigframes.dataframe.DataFrame: - _set_default_session_location_if_possible(query) + _set_default_session_location_if_possible(query_or_table) return global_session.with_default_session( bigframes.session.Session.read_gbq, - query, + query_or_table, index_col=index_col, col_order=col_order, max_results=max_results, diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 1031fde9b5..2f001d7d49 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -262,7 +262,7 @@ def close(self): def read_gbq( self, - query: str, + query_or_table: str, *, index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), @@ -270,9 +270,9 @@ def read_gbq( # Add a verify index argument that fails if the index is not unique. ) -> dataframe.DataFrame: # TODO(b/281571214): Generate prompt to show the progress of read_gbq. - if _is_query(query): + if _is_query(query_or_table): return self._read_gbq_query( - query, + query_or_table, index_col=index_col, col_order=col_order, max_results=max_results, @@ -283,7 +283,7 @@ def read_gbq( # deterministic query so we can avoid serializing if we have a # unique index. return self._read_gbq_table( - query, + query_or_table, index_col=index_col, col_order=col_order, max_results=max_results, diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 95531ff5e8..8919f4ed16 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -11,7 +11,7 @@ class GBQIOMixin: def read_gbq( self, - query: str, + query_or_table: str, *, index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), @@ -69,7 +69,7 @@ def read_gbq( [5 rows x 3 columns] Args: - query (str): + query_or_table (str): A SQL string to be executed or a BigQuery table to be read. The table must be specified in the format of `project.dataset.tablename` or `dataset.tablename`. 
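
Usage sketch for the `reset_session` alias restored in patch 03/15 above. The alias is a pure re-binding (both public names point at the same `global_session.close_session` function), so code written against the old name keeps working. A minimal illustration, assuming the conventional `import bigframes.pandas as bpd` idiom:

```python
import bigframes.pandas as bpd

# Per the one-line diff above (`reset_session = global_session.close_session`),
# both public names are bound to the same function object.
assert bpd.reset_session is bpd.close_session

# Closing ("resetting") the global session releases its BigQuery resources;
# a subsequent bigframes call starts a fresh session on demand.
bpd.reset_session()
```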
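Similarly, a sketch of the `read_gbq` signature change in patch 04/15: the first parameter is renamed from `query` to `query_or_table` because the same entry point accepts either a SQL string or a table ID (`project.dataset.tablename` or `dataset.tablename`), routing through `_read_gbq_query` or `_read_gbq_table` via the `_is_query` check shown above. The table ID below is assumed to be the public penguins dataset that the notebooks later in this series exercise:

```python
import bigframes.pandas as bpd

# A table ID: _is_query() does not match, so Session.read_gbq reads the
# table directly through _read_gbq_table.
df_table = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

# A SQL string: _is_query() matches, so the text is executed through
# _read_gbq_query; max_results is part of the signature shown above.
df_query = bpd.read_gbq(
    "SELECT species, island, body_mass_g"
    " FROM `bigquery-public-data.ml_datasets.penguins`",
    max_results=100,
)

# The rename is breaking for keyword callers: read_gbq(query="...") must
# become read_gbq(query_or_table="..."), or pass the value positionally.
```
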
From ba2e824f3abbea30010d845e53d645db52504361 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Sat, 21 Oct 2023 01:18:20 +0000 Subject: [PATCH 05/15] refactor: Tweak notebooks to test minimal IAM permissions (#123) * refactor: Tweak notebooks to test minimal IAM permissions * Make IAM permission comment more helpful --- notebooks/dataframes/dataframe.ipynb | 313 ++- .../getting_started/ml_fundamentals.ipynb | 2005 +++++++++-------- .../regression/easy_linear_regression.ipynb | 190 +- 3 files changed, 1425 insertions(+), 1083 deletions(-) diff --git a/notebooks/dataframes/dataframe.ipynb b/notebooks/dataframes/dataframe.ipynb index 85ea61d281..c6b276af87 100644 --- a/notebooks/dataframes/dataframe.ipynb +++ b/notebooks/dataframes/dataframe.ipynb @@ -35,12 +35,26 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "944f0e4417154e81b6496302fe756465", + "model_id": "11c27813da5c4d2e8108bf4bd9e7e55d", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HTML(value='Query job ac4d1f2b-e9f3-4d95-b78d-57e40eee93fa is RUNNING. \n", " \n", " \n", - " 241\n", + " 156\n", " Biscoe\n", " 46.2\n", - " 14.9\n", - " 221.0\n", - " MALE\n", - " Gentoo penguin (Pygoscelis papua)\n", - " \n", - " \n", - " 121\n", - " Dream\n", - " 48.1\n", - " 16.4\n", - " 199.0\n", + " 14.5\n", + " 209.0\n", " FEMALE\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 209\n", + " 189\n", " Biscoe\n", - " 42.7\n", - " 18.3\n", - " 196.0\n", - " MALE\n", + " 35.3\n", + " 18.9\n", + " 187.0\n", + " FEMALE\n", " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 270\n", + " 279\n", " Biscoe\n", - " 37.7\n", - " 16.0\n", - " 183.0\n", + " 45.1\n", + " 14.5\n", + " 215.0\n", " FEMALE\n", - " Adelie Penguin (Pygoscelis adeliae)\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 187\n", + " 245\n", " Biscoe\n", - " 43.4\n", - " 14.4\n", - " 218.0\n", - " FEMALE\n", + " 49.5\n", + " 16.2\n", + " 229.0\n", + " MALE\n", " Gentoo penguin (Pygoscelis papua)\n", " \n", + " \n", + " 343\n", + " Torgersen\n", + " 37.3\n", + " 20.5\n", + " 199.0\n", + " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " \n", " \n", "\n", "

5 rows × 6 columns

\n", "[5 rows x 6 columns in total]" ], "text/plain": [ - " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", - "penguin_id \n", - "241 Biscoe 46.2 14.9 221.0 \n", - "121 Dream 48.1 16.4 199.0 \n", - "209 Biscoe 42.7 18.3 196.0 \n", - "270 Biscoe 37.7 16.0 183.0 \n", - "187 Biscoe 43.4 14.4 218.0 \n", + " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", + "penguin_id \n", + "156 Biscoe 46.2 14.5 209.0 \n", + "189 Biscoe 35.3 18.9 187.0 \n", + "279 Biscoe 45.1 14.5 215.0 \n", + "245 Biscoe 49.5 16.2 229.0 \n", + "343 Torgersen 37.3 20.5 199.0 \n", "\n", - " sex species \n", - "penguin_id \n", - "241 MALE Gentoo penguin (Pygoscelis papua) \n", - "121 FEMALE Chinstrap penguin (Pygoscelis antarctica) \n", - "209 MALE Adelie Penguin (Pygoscelis adeliae) \n", - "270 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", - "187 FEMALE Gentoo penguin (Pygoscelis papua) \n", + " sex species \n", + "penguin_id \n", + "156 FEMALE Gentoo penguin (Pygoscelis papua) \n", + "189 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", + "279 FEMALE Gentoo penguin (Pygoscelis papua) \n", + "245 MALE Gentoo penguin (Pygoscelis papua) \n", + "343 MALE Adelie Penguin (Pygoscelis adeliae) \n", "\n", "[5 rows x 6 columns]" ] }, - "execution_count": 3, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -763,18 +749,18 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f09b707c5c9540cdae59af3765339b6e", + "model_id": "d6dd794f89724099950dcc927d63d0f5", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HTML(value='Query job 9f0cf977-a895-41fa-9f33-8b5346c62786 is DONE. 31.7 kB processed.
\n", " \n", " \n", - " 241\n", - " 5300.0\n", + " 156\n", + " 4800.0\n", " \n", " \n", - " 121\n", - " 3325.0\n", + " 189\n", + " 3800.0\n", " \n", " \n", - " 209\n", - " 4075.0\n", + " 279\n", + " 5000.0\n", " \n", " \n", - " 270\n", - " 3075.0\n", + " 245\n", + " 5800.0\n", " \n", " \n", - " 187\n", - " 4600.0\n", + " 343\n", + " 3775.0\n", " \n", " \n", "\n", @@ -865,16 +837,16 @@ "text/plain": [ " body_mass_g\n", "penguin_id \n", - "241 5300.0\n", - "121 3325.0\n", - "209 4075.0\n", - "270 3075.0\n", - "187 4600.0\n", + "156 4800.0\n", + "189 3800.0\n", + "279 5000.0\n", + "245 5800.0\n", + "343 3775.0\n", "\n", "[5 rows x 1 columns]" ] }, - "execution_count": 4, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -908,18 +880,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "299c0c895e3d4a83a8495924a1966ce0", + "model_id": "380c57dc3fe54fbd8ad2fb23f1e66e37", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HTML(value='Query job f54865db-fdb4-4022-af30-7f282a6b81c0 is DONE. 31.7 kB processed. \n", " \n", " \n", - " scaled_culmen_length_mm\n", - " scaled_culmen_depth_mm\n", - " scaled_flipper_length_mm\n", + " standard_scaled_culmen_length_mm\n", + " standard_scaled_culmen_depth_mm\n", + " standard_scaled_flipper_length_mm\n", " \n", " \n", " penguin_id\n", @@ -1016,153 +1002,153 @@ " \n", " \n", " 0\n", - " -1.364965\n", - " 0.629892\n", - " -1.226537\n", + " -1.344188\n", + " 0.642519\n", + " -1.193942\n", " \n", " \n", " 1\n", - " -0.771824\n", - " 0.984275\n", - " -1.226537\n", + " -0.750047\n", + " 1.005876\n", + " -1.193942\n", " \n", " \n", " 2\n", - " -0.567932\n", - " 0.883023\n", - " -1.226537\n", - " \n", - " \n", - " 3\n", - " 0.470064\n", - " 0.376761\n", - " -0.652517\n", + " -0.545811\n", + " 0.90206\n", + " -1.193942\n", " \n", " \n", " 4\n", - " -1.235216\n", - " -0.180128\n", - " -0.652517\n", + " -1.214219\n", + " -0.188011\n", + " -0.619171\n", " \n", " \n", " 5\n", - " -0.141612\n", - " 0.680518\n", - " -0.652517\n", + " -0.118772\n", + " 0.694427\n", + " -0.619171\n", " \n", " \n", " 6\n", - " 0.544207\n", - " -0.281381\n", - " -0.652517\n", + " 0.568203\n", + " -0.291828\n", + " -0.619171\n", " \n", " \n", " 7\n", - " 1.21149\n", - " 0.629892\n", - " -0.078497\n", + " 1.236611\n", + " 0.642519\n", + " -0.044401\n", " \n", " \n", - " 8\n", - " 1.026133\n", - " 0.933649\n", - " -0.078497\n", + " 9\n", + " -0.675779\n", + " 1.524957\n", + " -0.044401\n", " \n", " \n", " 10\n", - " -0.586468\n", - " 0.883023\n", - " 0.495523\n", + " -0.564378\n", + " 0.90206\n", + " 0.530369\n", " \n", " \n", " 11\n", - " -0.92011\n", - " 0.781771\n", - " -1.154784\n", + " -0.898582\n", + " 0.798243\n", + " -1.122096\n", + " \n", + " \n", + " 12\n", + " -1.26992\n", + " -0.136103\n", + " -1.122096\n", + " \n", + " \n", + " 13\n", + " 0.58677\n", + " 0.071529\n", + " -1.122096\n", " \n", " \n", " 14\n", - " -1.846892\n", - " -0.028249\n", - " -1.154784\n", + " -1.826927\n", + " -0.032287\n", + " -1.122096\n", " \n", " \n", " 15\n", - " -1.290822\n", - " -0.332007\n", - " -1.154784\n", + " -1.26992\n", + " -0.343736\n", + " -1.122096\n", " \n", " \n", " 16\n", - " 0.321779\n", - " 0.073003\n", - " -0.580765\n", + " 0.3454\n", + " 0.071529\n", + " -0.547325\n", " \n", " \n", - " 17\n", - " 1.230026\n", - " 1.136154\n", - " -0.580765\n", + " 18\n", + " -0.768614\n", + " 0.382978\n", + " -0.547325\n", " \n", " \n", - " 
18\n", - " -0.79036\n", - " 0.376761\n", - " -0.580765\n", + " 19\n", + " -1.121385\n", + " 0.486795\n", + " -0.547325\n", " \n", " \n", " 20\n", - " 0.4886\n", - " 0.326134\n", - " -0.580765\n", + " 0.512502\n", + " 0.33107\n", + " -0.547325\n", " \n", " \n", " 21\n", - " 1.359775\n", - " 1.034902\n", - " -0.580765\n", + " 1.385146\n", + " 1.057784\n", + " -0.547325\n", + " \n", + " \n", + " 22\n", + " -0.675779\n", + " -0.032287\n", + " -0.547325\n", " \n", " \n", " 24\n", - " 1.044669\n", - " 0.528639\n", - " -0.580765\n", + " 1.069509\n", + " 0.538703\n", + " -0.547325\n", " \n", " \n", " 26\n", - " -0.456718\n", - " 0.680518\n", - " -0.006745\n", + " -0.43441\n", + " 0.694427\n", + " 0.027445\n", " \n", " \n", - " 27\n", - " 1.21149\n", - " 1.237407\n", - " -0.006745\n", + " 28\n", + " 1.923586\n", + " 1.888314\n", + " 0.027445\n", " \n", " \n", - " 29\n", - " 1.378311\n", - " 0.933649\n", - " -0.006745\n", + " 30\n", + " 1.292312\n", + " 0.694427\n", + " 0.027445\n", " \n", " \n", " 31\n", - " -2.013713\n", - " -0.534512\n", - " -1.657052\n", - " \n", - " \n", - " 32\n", - " -1.253751\n", - " 0.478013\n", - " -1.657052\n", - " \n", - " \n", - " 33\n", - " -0.827431\n", - " -0.230754\n", - " -1.657052\n", + " -1.994029\n", + " -0.551368\n", + " -1.62502\n", " \n", " \n", "\n", @@ -1170,67 +1156,67 @@ "[267 rows x 3 columns in total]" ], "text/plain": [ - " scaled_culmen_length_mm scaled_culmen_depth_mm \\\n", - "penguin_id \n", - "0 -1.364965 0.629892 \n", - "1 -0.771824 0.984275 \n", - "2 -0.567932 0.883023 \n", - "3 0.470064 0.376761 \n", - "4 -1.235216 -0.180128 \n", - "5 -0.141612 0.680518 \n", - "6 0.544207 -0.281381 \n", - "7 1.21149 0.629892 \n", - "8 1.026133 0.933649 \n", - "10 -0.586468 0.883023 \n", - "11 -0.92011 0.781771 \n", - "14 -1.846892 -0.028249 \n", - "15 -1.290822 -0.332007 \n", - "16 0.321779 0.073003 \n", - "17 1.230026 1.136154 \n", - "18 -0.79036 0.376761 \n", - "20 0.4886 0.326134 \n", - "21 1.359775 1.034902 \n", - "24 1.044669 0.528639 \n", - "26 -0.456718 0.680518 \n", - "27 1.21149 1.237407 \n", - "29 1.378311 0.933649 \n", - "31 -2.013713 -0.534512 \n", - "32 -1.253751 0.478013 \n", - "33 -0.827431 -0.230754 \n", + " standard_scaled_culmen_length_mm standard_scaled_culmen_depth_mm \\\n", + "penguin_id \n", + "0 -1.344188 0.642519 \n", + "1 -0.750047 1.005876 \n", + "2 -0.545811 0.90206 \n", + "4 -1.214219 -0.188011 \n", + "5 -0.118772 0.694427 \n", + "6 0.568203 -0.291828 \n", + "7 1.236611 0.642519 \n", + "9 -0.675779 1.524957 \n", + "10 -0.564378 0.90206 \n", + "11 -0.898582 0.798243 \n", + "12 -1.26992 -0.136103 \n", + "13 0.58677 0.071529 \n", + "14 -1.826927 -0.032287 \n", + "15 -1.26992 -0.343736 \n", + "16 0.3454 0.071529 \n", + "18 -0.768614 0.382978 \n", + "19 -1.121385 0.486795 \n", + "20 0.512502 0.33107 \n", + "21 1.385146 1.057784 \n", + "22 -0.675779 -0.032287 \n", + "24 1.069509 0.538703 \n", + "26 -0.43441 0.694427 \n", + "28 1.923586 1.888314 \n", + "30 1.292312 0.694427 \n", + "31 -1.994029 -0.551368 \n", "\n", - " scaled_flipper_length_mm \n", - "penguin_id \n", - "0 -1.226537 \n", - "1 -1.226537 \n", - "2 -1.226537 \n", - "3 -0.652517 \n", - "4 -0.652517 \n", - "5 -0.652517 \n", - "6 -0.652517 \n", - "7 -0.078497 \n", - "8 -0.078497 \n", - "10 0.495523 \n", - "11 -1.154784 \n", - "14 -1.154784 \n", - "15 -1.154784 \n", - "16 -0.580765 \n", - "17 -0.580765 \n", - "18 -0.580765 \n", - "20 -0.580765 \n", - "21 -0.580765 \n", - "24 -0.580765 \n", - "26 -0.006745 \n", - "27 -0.006745 \n", - "29 -0.006745 \n", - "31 -1.657052 \n", - "32 
-1.657052 \n", - "33 -1.657052 \n", + " standard_scaled_flipper_length_mm \n", + "penguin_id \n", + "0 -1.193942 \n", + "1 -1.193942 \n", + "2 -1.193942 \n", + "4 -0.619171 \n", + "5 -0.619171 \n", + "6 -0.619171 \n", + "7 -0.044401 \n", + "9 -0.044401 \n", + "10 0.530369 \n", + "11 -1.122096 \n", + "12 -1.122096 \n", + "13 -1.122096 \n", + "14 -1.122096 \n", + "15 -1.122096 \n", + "16 -0.547325 \n", + "18 -0.547325 \n", + "19 -0.547325 \n", + "20 -0.547325 \n", + "21 -0.547325 \n", + "22 -0.547325 \n", + "24 -0.547325 \n", + "26 0.027445 \n", + "28 0.027445 \n", + "30 0.027445 \n", + "31 -1.62502 \n", "...\n", "\n", "[267 rows x 3 columns]" ] }, - "execution_count": 5, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1251,18 +1237,18 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6ec4ab3e60844e989dbebad89e7665ca", + "model_id": "74f3c24c0a434e12bf6a56dc4809b501", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HTML(value='Query job a8029b51-2ef1-4acd-9759-d808db954298 is DONE. 31.7 kB processed. \n", - " -1.364965\n", - " 0.629892\n", - " -1.226537\n", + " -1.344188\n", + " 0.642519\n", + " -1.193942\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 1\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.771824\n", - " 0.984275\n", - " -1.226537\n", + " -0.750047\n", + " 1.005876\n", + " -1.193942\n", " [{'index': 3, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 2\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.567932\n", - " 0.883023\n", - " -1.226537\n", + " -0.545811\n", + " 0.90206\n", + " -1.193942\n", " [{'index': 3, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 3\n", - " [{'index': 2, 'value': 1.0}]\n", - " 0.470064\n", - " 0.376761\n", - " -0.652517\n", - " [{'index': 2, 'value': 1.0}]\n", - " [{'index': 2, 'value': 1.0}]\n", - " \n", - " \n", " 4\n", " [{'index': 2, 'value': 1.0}]\n", - " -1.235216\n", - " -0.180128\n", - " -0.652517\n", + " -1.214219\n", + " -0.188011\n", + " -0.619171\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 5\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.141612\n", - " 0.680518\n", - " -0.652517\n", + " -0.118772\n", + " 0.694427\n", + " -0.619171\n", " [{'index': 3, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 6\n", " [{'index': 2, 'value': 1.0}]\n", - " 0.544207\n", - " -0.281381\n", - " -0.652517\n", + " 0.568203\n", + " -0.291828\n", + " -0.619171\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", " 7\n", " [{'index': 2, 'value': 1.0}]\n", - " 1.21149\n", - " 0.629892\n", - " -0.078497\n", + " 1.236611\n", + " 0.642519\n", + " -0.044401\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 8\n", + " 9\n", " [{'index': 2, 'value': 1.0}]\n", - " 1.026133\n", - " 0.933649\n", - " -0.078497\n", + " -0.675779\n", + " 1.524957\n", + " -0.044401\n", " [{'index': 3, 'value': 1.0}]\n", - " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 10\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.586468\n", - " 0.883023\n", - " 0.495523\n", + " -0.564378\n", + " 0.90206\n", + " 0.530369\n", " [{'index': 3, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 11\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.92011\n", - " 
0.781771\n", - " -1.154784\n", + " -0.898582\n", + " 0.798243\n", + " -1.122096\n", " [{'index': 3, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", + " 12\n", + " [{'index': 2, 'value': 1.0}]\n", + " -1.26992\n", + " -0.136103\n", + " -1.122096\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", + " \n", + " \n", + " 13\n", + " [{'index': 2, 'value': 1.0}]\n", + " 0.58677\n", + " 0.071529\n", + " -1.122096\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 2, 'value': 1.0}]\n", + " \n", + " \n", " 14\n", " [{'index': 2, 'value': 1.0}]\n", - " -1.846892\n", - " -0.028249\n", - " -1.154784\n", + " -1.826927\n", + " -0.032287\n", + " -1.122096\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 15\n", " [{'index': 2, 'value': 1.0}]\n", - " -1.290822\n", - " -0.332007\n", - " -1.154784\n", + " -1.26992\n", + " -0.343736\n", + " -1.122096\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 16\n", " [{'index': 2, 'value': 1.0}]\n", - " 0.321779\n", - " 0.073003\n", - " -0.580765\n", + " 0.3454\n", + " 0.071529\n", + " -0.547325\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 17\n", + " 18\n", " [{'index': 2, 'value': 1.0}]\n", - " 1.230026\n", - " 1.136154\n", - " -0.580765\n", + " -0.768614\n", + " 0.382978\n", + " -0.547325\n", " [{'index': 3, 'value': 1.0}]\n", - " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 18\n", + " 19\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.79036\n", - " 0.376761\n", - " -0.580765\n", + " -1.121385\n", + " 0.486795\n", + " -0.547325\n", " [{'index': 3, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 20\n", " [{'index': 2, 'value': 1.0}]\n", - " 0.4886\n", - " 0.326134\n", - " -0.580765\n", + " 0.512502\n", + " 0.33107\n", + " -0.547325\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", " 21\n", " [{'index': 2, 'value': 1.0}]\n", - " 1.359775\n", - " 1.034902\n", - " -0.580765\n", + " 1.385146\n", + " 1.057784\n", + " -0.547325\n", " [{'index': 3, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", + " 22\n", + " [{'index': 2, 'value': 1.0}]\n", + " -0.675779\n", + " -0.032287\n", + " -0.547325\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", + " \n", + " \n", " 24\n", " [{'index': 2, 'value': 1.0}]\n", - " 1.044669\n", - " 0.528639\n", - " -0.580765\n", + " 1.069509\n", + " 0.538703\n", + " -0.547325\n", " [{'index': 3, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", " 26\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.456718\n", - " 0.680518\n", - " -0.006745\n", + " -0.43441\n", + " 0.694427\n", + " 0.027445\n", " [{'index': 3, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 27\n", + " 28\n", " [{'index': 2, 'value': 1.0}]\n", - " 1.21149\n", - " 1.237407\n", - " -0.006745\n", + " 1.923586\n", + " 1.888314\n", + " 0.027445\n", " [{'index': 3, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 29\n", + " 30\n", " [{'index': 2, 'value': 1.0}]\n", - " 1.378311\n", - " 0.933649\n", - " -0.006745\n", + " 1.292312\n", + " 0.694427\n", + " 0.027445\n", " [{'index': 3, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", " 31\n", " [{'index': 2, 'value': 1.0}]\n", - " -2.013713\n", - " -0.534512\n", - " -1.657052\n", - " [{'index': 2, 'value': 1.0}]\n", - " [{'index': 1, 'value': 
1.0}]\n", - " \n", - " \n", - " 32\n", - " [{'index': 2, 'value': 1.0}]\n", - " -1.253751\n", - " 0.478013\n", - " -1.657052\n", - " [{'index': 3, 'value': 1.0}]\n", - " [{'index': 1, 'value': 1.0}]\n", - " \n", - " \n", - " 33\n", - " [{'index': 2, 'value': 1.0}]\n", - " -0.827431\n", - " -0.230754\n", - " -1.657052\n", + " -1.994029\n", + " -0.551368\n", + " -1.62502\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", @@ -1966,95 +1980,123 @@ "[267 rows x 6 columns in total]" ], "text/plain": [ - " onehotencoded_island scaled_culmen_length_mm \\\n", - "penguin_id \n", - "0 [{'index': 2, 'value': 1.0}] -1.364965 \n", - "1 [{'index': 2, 'value': 1.0}] -0.771824 \n", - "2 [{'index': 2, 'value': 1.0}] -0.567932 \n", - "3 [{'index': 2, 'value': 1.0}] 0.470064 \n", - "4 [{'index': 2, 'value': 1.0}] -1.235216 \n", - "5 [{'index': 2, 'value': 1.0}] -0.141612 \n", - "6 [{'index': 2, 'value': 1.0}] 0.544207 \n", - "7 [{'index': 2, 'value': 1.0}] 1.21149 \n", - "8 [{'index': 2, 'value': 1.0}] 1.026133 \n", - "10 [{'index': 2, 'value': 1.0}] -0.586468 \n", - "11 [{'index': 2, 'value': 1.0}] -0.92011 \n", - "14 [{'index': 2, 'value': 1.0}] -1.846892 \n", - "15 [{'index': 2, 'value': 1.0}] -1.290822 \n", - "16 [{'index': 2, 'value': 1.0}] 0.321779 \n", - "17 [{'index': 2, 'value': 1.0}] 1.230026 \n", - "18 [{'index': 2, 'value': 1.0}] -0.79036 \n", - "20 [{'index': 2, 'value': 1.0}] 0.4886 \n", - "21 [{'index': 2, 'value': 1.0}] 1.359775 \n", - "24 [{'index': 2, 'value': 1.0}] 1.044669 \n", - "26 [{'index': 2, 'value': 1.0}] -0.456718 \n", - "27 [{'index': 2, 'value': 1.0}] 1.21149 \n", - "29 [{'index': 2, 'value': 1.0}] 1.378311 \n", - "31 [{'index': 2, 'value': 1.0}] -2.013713 \n", - "32 [{'index': 2, 'value': 1.0}] -1.253751 \n", - "33 [{'index': 2, 'value': 1.0}] -0.827431 \n", + " onehotencoded_island standard_scaled_culmen_length_mm \\\n", + "penguin_id \n", + "0 [{'index': 2, 'value': 1.0}] -1.344188 \n", + "1 [{'index': 2, 'value': 1.0}] -0.750047 \n", + "2 [{'index': 2, 'value': 1.0}] -0.545811 \n", + "4 [{'index': 2, 'value': 1.0}] -1.214219 \n", + "5 [{'index': 2, 'value': 1.0}] -0.118772 \n", + "6 [{'index': 2, 'value': 1.0}] 0.568203 \n", + "7 [{'index': 2, 'value': 1.0}] 1.236611 \n", + "9 [{'index': 2, 'value': 1.0}] -0.675779 \n", + "10 [{'index': 2, 'value': 1.0}] -0.564378 \n", + "11 [{'index': 2, 'value': 1.0}] -0.898582 \n", + "12 [{'index': 2, 'value': 1.0}] -1.26992 \n", + "13 [{'index': 2, 'value': 1.0}] 0.58677 \n", + "14 [{'index': 2, 'value': 1.0}] -1.826927 \n", + "15 [{'index': 2, 'value': 1.0}] -1.26992 \n", + "16 [{'index': 2, 'value': 1.0}] 0.3454 \n", + "18 [{'index': 2, 'value': 1.0}] -0.768614 \n", + "19 [{'index': 2, 'value': 1.0}] -1.121385 \n", + "20 [{'index': 2, 'value': 1.0}] 0.512502 \n", + "21 [{'index': 2, 'value': 1.0}] 1.385146 \n", + "22 [{'index': 2, 'value': 1.0}] -0.675779 \n", + "24 [{'index': 2, 'value': 1.0}] 1.069509 \n", + "26 [{'index': 2, 'value': 1.0}] -0.43441 \n", + "28 [{'index': 2, 'value': 1.0}] 1.923586 \n", + "30 [{'index': 2, 'value': 1.0}] 1.292312 \n", + "31 [{'index': 2, 'value': 1.0}] -1.994029 \n", "\n", - " scaled_culmen_depth_mm scaled_flipper_length_mm \\\n", - "penguin_id \n", - "0 0.629892 -1.226537 \n", - "1 0.984275 -1.226537 \n", - "2 0.883023 -1.226537 \n", - "3 0.376761 -0.652517 \n", - "4 -0.180128 -0.652517 \n", - "5 0.680518 -0.652517 \n", - "6 -0.281381 -0.652517 \n", - "7 0.629892 -0.078497 \n", - "8 0.933649 -0.078497 \n", - "10 0.883023 0.495523 \n", - "11 0.781771 -1.154784 \n", - "14 
-0.028249 -1.154784 \n", - "15 -0.332007 -1.154784 \n", - "16 0.073003 -0.580765 \n", - "17 1.136154 -0.580765 \n", - "18 0.376761 -0.580765 \n", - "20 0.326134 -0.580765 \n", - "21 1.034902 -0.580765 \n", - "24 0.528639 -0.580765 \n", - "26 0.680518 -0.006745 \n", - "27 1.237407 -0.006745 \n", - "29 0.933649 -0.006745 \n", - "31 -0.534512 -1.657052 \n", - "32 0.478013 -1.657052 \n", - "33 -0.230754 -1.657052 \n", + " standard_scaled_culmen_depth_mm \\\n", + "penguin_id \n", + "0 0.642519 \n", + "1 1.005876 \n", + "2 0.90206 \n", + "4 -0.188011 \n", + "5 0.694427 \n", + "6 -0.291828 \n", + "7 0.642519 \n", + "9 1.524957 \n", + "10 0.90206 \n", + "11 0.798243 \n", + "12 -0.136103 \n", + "13 0.071529 \n", + "14 -0.032287 \n", + "15 -0.343736 \n", + "16 0.071529 \n", + "18 0.382978 \n", + "19 0.486795 \n", + "20 0.33107 \n", + "21 1.057784 \n", + "22 -0.032287 \n", + "24 0.538703 \n", + "26 0.694427 \n", + "28 1.888314 \n", + "30 0.694427 \n", + "31 -0.551368 \n", "\n", - " onehotencoded_sex onehotencoded_species \n", - "penguin_id \n", - "0 [{'index': 2, 'value': 1.0}] [{'index': 1, 'value': 1.0}] \n", - "1 [{'index': 3, 'value': 1.0}] [{'index': 1, 'value': 1.0}] \n", - "2 [{'index': 3, 'value': 1.0}] [{'index': 1, 'value': 1.0}] \n", - "3 [{'index': 2, 'value': 1.0}] [{'index': 2, 'value': 1.0}] \n", - "4 [{'index': 2, 'value': 1.0}] [{'index': 1, 'value': 1.0}] \n", - "5 [{'index': 3, 'value': 1.0}] [{'index': 1, 'value': 1.0}] \n", - "6 [{'index': 2, 'value': 1.0}] [{'index': 2, 'value': 1.0}] \n", - "7 [{'index': 2, 'value': 1.0}] [{'index': 2, 'value': 1.0}] \n", - "8 [{'index': 3, 'value': 1.0}] [{'index': 2, 'value': 1.0}] \n", - "10 [{'index': 3, 'value': 1.0}] [{'index': 1, 'value': 1.0}] \n", - "11 [{'index': 3, 'value': 1.0}] [{'index': 1, 'value': 1.0}] \n", - "14 [{'index': 2, 'value': 1.0}] [{'index': 1, 'value': 1.0}] \n", - "15 [{'index': 2, 'value': 1.0}] [{'index': 1, 'value': 1.0}] \n", - "16 [{'index': 2, 'value': 1.0}] [{'index': 2, 'value': 1.0}] \n", - "17 [{'index': 3, 'value': 1.0}] [{'index': 2, 'value': 1.0}] \n", - "18 [{'index': 3, 'value': 1.0}] [{'index': 1, 'value': 1.0}] \n", - "20 [{'index': 2, 'value': 1.0}] [{'index': 2, 'value': 1.0}] \n", - "21 [{'index': 3, 'value': 1.0}] [{'index': 2, 'value': 1.0}] \n", - "24 [{'index': 3, 'value': 1.0}] [{'index': 2, 'value': 1.0}] \n", - "26 [{'index': 3, 'value': 1.0}] [{'index': 1, 'value': 1.0}] \n", - "27 [{'index': 3, 'value': 1.0}] [{'index': 2, 'value': 1.0}] \n", - "29 [{'index': 3, 'value': 1.0}] [{'index': 2, 'value': 1.0}] \n", - "31 [{'index': 2, 'value': 1.0}] [{'index': 1, 'value': 1.0}] \n", - "32 [{'index': 3, 'value': 1.0}] [{'index': 1, 'value': 1.0}] \n", - "33 [{'index': 2, 'value': 1.0}] [{'index': 1, 'value': 1.0}] \n", + " standard_scaled_flipper_length_mm onehotencoded_sex \\\n", + "penguin_id \n", + "0 -1.193942 [{'index': 2, 'value': 1.0}] \n", + "1 -1.193942 [{'index': 3, 'value': 1.0}] \n", + "2 -1.193942 [{'index': 3, 'value': 1.0}] \n", + "4 -0.619171 [{'index': 2, 'value': 1.0}] \n", + "5 -0.619171 [{'index': 3, 'value': 1.0}] \n", + "6 -0.619171 [{'index': 2, 'value': 1.0}] \n", + "7 -0.044401 [{'index': 2, 'value': 1.0}] \n", + "9 -0.044401 [{'index': 3, 'value': 1.0}] \n", + "10 0.530369 [{'index': 3, 'value': 1.0}] \n", + "11 -1.122096 [{'index': 3, 'value': 1.0}] \n", + "12 -1.122096 [{'index': 2, 'value': 1.0}] \n", + "13 -1.122096 [{'index': 2, 'value': 1.0}] \n", + "14 -1.122096 [{'index': 2, 'value': 1.0}] \n", + "15 -1.122096 [{'index': 2, 'value': 1.0}] \n", + "16 -0.547325 
[{'index': 2, 'value': 1.0}] \n", + "18 -0.547325 [{'index': 3, 'value': 1.0}] \n", + "19 -0.547325 [{'index': 3, 'value': 1.0}] \n", + "20 -0.547325 [{'index': 2, 'value': 1.0}] \n", + "21 -0.547325 [{'index': 3, 'value': 1.0}] \n", + "22 -0.547325 [{'index': 2, 'value': 1.0}] \n", + "24 -0.547325 [{'index': 3, 'value': 1.0}] \n", + "26 0.027445 [{'index': 3, 'value': 1.0}] \n", + "28 0.027445 [{'index': 3, 'value': 1.0}] \n", + "30 0.027445 [{'index': 3, 'value': 1.0}] \n", + "31 -1.62502 [{'index': 2, 'value': 1.0}] \n", + "\n", + " onehotencoded_species \n", + "penguin_id \n", + "0 [{'index': 1, 'value': 1.0}] \n", + "1 [{'index': 1, 'value': 1.0}] \n", + "2 [{'index': 1, 'value': 1.0}] \n", + "4 [{'index': 1, 'value': 1.0}] \n", + "5 [{'index': 1, 'value': 1.0}] \n", + "6 [{'index': 2, 'value': 1.0}] \n", + "7 [{'index': 2, 'value': 1.0}] \n", + "9 [{'index': 1, 'value': 1.0}] \n", + "10 [{'index': 1, 'value': 1.0}] \n", + "11 [{'index': 1, 'value': 1.0}] \n", + "12 [{'index': 1, 'value': 1.0}] \n", + "13 [{'index': 2, 'value': 1.0}] \n", + "14 [{'index': 1, 'value': 1.0}] \n", + "15 [{'index': 1, 'value': 1.0}] \n", + "16 [{'index': 2, 'value': 1.0}] \n", + "18 [{'index': 1, 'value': 1.0}] \n", + "19 [{'index': 1, 'value': 1.0}] \n", + "20 [{'index': 2, 'value': 1.0}] \n", + "21 [{'index': 2, 'value': 1.0}] \n", + "22 [{'index': 1, 'value': 1.0}] \n", + "24 [{'index': 2, 'value': 1.0}] \n", + "26 [{'index': 1, 'value': 1.0}] \n", + "28 [{'index': 2, 'value': 1.0}] \n", + "30 [{'index': 2, 'value': 1.0}] \n", + "31 [{'index': 1, 'value': 1.0}] \n", "...\n", "\n", "[267 rows x 6 columns]" ] }, - "execution_count": 7, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -2096,18 +2138,32 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b7c22c0858914b46951adde174b43e25", + "model_id": "5db4c5c80ba4417db151aa561dab5ee7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job ceced0cc-13a7-4b14-b42c-4d5f69e7e49a is RUNNING. 
\n", " \n", " \n", - " 9\n", - " 4295.335461\n", + " 3\n", + " 3394.118128\n", " \n", " \n", - " 12\n", - " 3338.44131\n", + " 8\n", + " 4048.685642\n", " \n", " \n", - " 13\n", - " 3201.820204\n", + " 17\n", + " 3976.454093\n", " \n", " \n", - " 19\n", - " 3982.814079\n", + " 23\n", + " 3541.582194\n", " \n", " \n", - " 22\n", - " 3538.610664\n", + " 25\n", + " 4032.844186\n", " \n", " \n", - " 23\n", - " 3613.50305\n", + " 27\n", + " 4118.351772\n", " \n", " \n", - " 25\n", - " 4009.759444\n", + " 29\n", + " 4087.767826\n", " \n", " \n", - " 28\n", - " 4240.515635\n", + " 34\n", + " 3183.755249\n", " \n", " \n", - " 30\n", - " 4028.904195\n", + " 35\n", + " 3418.802274\n", " \n", " \n", - " 38\n", - " 4206.810346\n", + " 39\n", + " 3519.186468\n", " \n", " \n", - " 41\n", - " 3736.225488\n", + " 51\n", + " 3398.135365\n", " \n", " \n", - " 45\n", - " 4383.368544\n", + " 52\n", + " 3223.615957\n", " \n", " \n", - " 49\n", - " 3125.772789\n", + " 60\n", + " 3445.014718\n", " \n", " \n", - " 50\n", - " 3149.28765\n", + " 61\n", + " 3505.638864\n", " \n", " \n", - " 62\n", - " 3531.69488\n", + " 64\n", + " 3515.905786\n", " \n", " \n", " 65\n", - " 4073.900616\n", + " 4028.363185\n", " \n", " \n", - " 66\n", - " 4160.810162\n", + " 67\n", + " 4159.993943\n", " \n", " \n", " 83\n", - " 3272.396279\n", + " 3348.16883\n", " \n", " \n", - " 87\n", - " 3989.364493\n", + " 85\n", + " 3485.050273\n", " \n", " \n", - " 92\n", - " 4240.495294\n", + " 93\n", + " 4172.874548\n", " \n", " \n", - " 98\n", - " 3911.455384\n", + " 104\n", + " 3299.302424\n", " \n", " \n", - " 104\n", - " 3271.202866\n", + " 105\n", + " 3515.687917\n", " \n", " \n", - " 114\n", - " 3244.728549\n", + " 108\n", + " 3405.224618\n", " \n", " \n", - " 115\n", - " 3737.374636\n", + " 113\n", + " 4209.140425\n", " \n", " \n", - " 118\n", - " 3485.95604\n", + " 130\n", + " 4197.905737\n", " \n", " \n", "\n", @@ -2306,37 +2362,37 @@ "text/plain": [ " predicted_body_mass_g\n", "penguin_id \n", - "9 4295.335461\n", - "12 3338.44131\n", - "13 3201.820204\n", - "19 3982.814079\n", - "22 3538.610664\n", - "23 3613.50305\n", - "25 4009.759444\n", - "28 4240.515635\n", - "30 4028.904195\n", - "38 4206.810346\n", - "41 3736.225488\n", - "45 4383.368544\n", - "49 3125.772789\n", - "50 3149.28765\n", - "62 3531.69488\n", - "65 4073.900616\n", - "66 4160.810162\n", - "83 3272.396279\n", - "87 3989.364493\n", - "92 4240.495294\n", - "98 3911.455384\n", - "104 3271.202866\n", - "114 3244.728549\n", - "115 3737.374636\n", - "118 3485.95604\n", + "3 3394.118128\n", + "8 4048.685642\n", + "17 3976.454093\n", + "23 3541.582194\n", + "25 4032.844186\n", + "27 4118.351772\n", + "29 4087.767826\n", + "34 3183.755249\n", + "35 3418.802274\n", + "39 3519.186468\n", + "51 3398.135365\n", + "52 3223.615957\n", + "60 3445.014718\n", + "61 3505.638864\n", + "64 3515.905786\n", + "65 4028.363185\n", + "67 4159.993943\n", + "83 3348.16883\n", + "85 3485.050273\n", + "93 4172.874548\n", + "104 3299.302424\n", + "105 3515.687917\n", + "108 3405.224618\n", + "113 4209.140425\n", + "130 4197.905737\n", "...\n", "\n", "[67 rows x 1 columns]" ] }, - "execution_count": 8, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -2367,18 +2423,32 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "fa6bd24b44cb42ec946e262ac2f25d09", + "model_id": "d7a16e04253a42b7a5ce247d8f63b656", + "version_major": 2, + "version_minor": 
0 + }, + "text/plain": [ + "HTML(value='Query job 6f19614c-82c0-4f8b-b74b-9d91a894efdd is RUNNING. \n", " \n", " \n", - " 9\n", - " 4\n", + " 3\n", + " 3\n", " \n", " \n", - " 12\n", - " 4\n", + " 8\n", + " 3\n", " \n", " \n", - " 13\n", - " 2\n", + " 17\n", + " 3\n", " \n", " \n", - " 19\n", - " 4\n", + " 23\n", + " 1\n", " \n", " \n", - " 22\n", - " 4\n", + " 25\n", + " 3\n", " \n", " \n", - " 23\n", - " 4\n", + " 27\n", + " 3\n", " \n", " \n", - " 25\n", - " 2\n", + " 29\n", + " 3\n", " \n", " \n", - " 28\n", - " 2\n", + " 34\n", + " 3\n", " \n", " \n", - " 30\n", - " 2\n", + " 35\n", + " 1\n", " \n", " \n", - " 38\n", - " 4\n", + " 39\n", + " 3\n", " \n", " \n", - " 41\n", - " 4\n", + " 51\n", + " 1\n", " \n", " \n", - " 45\n", - " 2\n", + " 52\n", + " 3\n", " \n", " \n", - " 49\n", - " 4\n", + " 60\n", + " 3\n", " \n", " \n", - " 50\n", - " 2\n", + " 61\n", + " 3\n", " \n", " \n", - " 62\n", - " 4\n", + " 64\n", + " 1\n", " \n", " \n", " 65\n", - " 4\n", + " 1\n", " \n", " \n", - " 66\n", - " 2\n", + " 67\n", + " 3\n", " \n", " \n", " 83\n", - " 2\n", + " 3\n", " \n", " \n", - " 87\n", - " 2\n", + " 85\n", + " 1\n", " \n", " \n", - " 92\n", - " 2\n", + " 93\n", + " 1\n", " \n", " \n", - " 98\n", - " 4\n", + " 104\n", + " 3\n", " \n", " \n", - " 104\n", - " 2\n", + " 105\n", + " 1\n", " \n", " \n", - " 114\n", - " 2\n", + " 108\n", + " 3\n", " \n", " \n", - " 115\n", - " 2\n", + " 113\n", + " 3\n", " \n", " \n", - " 118\n", - " 4\n", + " 130\n", + " 1\n", " \n", " \n", "\n", @@ -2577,37 +2647,37 @@ "text/plain": [ " CENTROID_ID\n", "penguin_id \n", - "9 4\n", - "12 4\n", - "13 2\n", - "19 4\n", - "22 4\n", - "23 4\n", - "25 2\n", - "28 2\n", - "30 2\n", - "38 4\n", - "41 4\n", - "45 2\n", - "49 4\n", - "50 2\n", - "62 4\n", - "65 4\n", - "66 2\n", - "83 2\n", - "87 2\n", - "92 2\n", - "98 4\n", - "104 2\n", - "114 2\n", - "115 2\n", - "118 4\n", + "3 3\n", + "8 3\n", + "17 3\n", + "23 1\n", + "25 3\n", + "27 3\n", + "29 3\n", + "34 3\n", + "35 1\n", + "39 3\n", + "51 1\n", + "52 3\n", + "60 3\n", + "61 3\n", + "64 1\n", + "65 1\n", + "67 3\n", + "83 3\n", + "85 1\n", + "93 1\n", + "104 3\n", + "105 1\n", + "108 3\n", + "113 3\n", + "130 1\n", "...\n", "\n", "[67 rows x 1 columns]" ] }, - "execution_count": 9, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -2634,7 +2704,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -2651,7 +2721,7 @@ " ('linreg', LinearRegression())])" ] }, - "execution_count": 10, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -2678,18 +2748,18 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f0465986682742af92759c3f5fce96e0", + "model_id": "887bf58cebf14bdba95db828390fd33d", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HTML(value='Query job c7d094cb-cc51-4f11-8887-b169c23aceb2 is DONE. 32.3 kB processed. 
\n", " \n", " \n", - " 9\n", - " 4295.328991\n", + " 3\n", + " 3394.116212\n", " \n", " \n", - " 12\n", - " 3338.434943\n", + " 8\n", + " 4048.683645\n", " \n", " \n", - " 13\n", - " 3201.813783\n", + " 17\n", + " 3976.452358\n", " \n", " \n", - " 19\n", - " 3982.807707\n", + " 23\n", + " 3541.580346\n", " \n", " \n", - " 22\n", - " 3538.604385\n", + " 25\n", + " 4032.842027\n", " \n", " \n", - " 23\n", - " 3613.496641\n", + " 27\n", + " 4118.34983\n", " \n", " \n", - " 25\n", - " 4009.753161\n", + " 29\n", + " 4087.765797\n", " \n", " \n", - " 28\n", - " 4240.509087\n", + " 34\n", + " 3183.75379\n", " \n", " \n", - " 30\n", - " 4028.897875\n", + " 35\n", + " 3418.800633\n", " \n", " \n", - " 38\n", - " 4206.80377\n", + " 39\n", + " 3519.18471\n", " \n", " \n", - " 41\n", - " 3736.219256\n", + " 51\n", + " 3398.133564\n", " \n", " \n", - " 45\n", - " 4383.362136\n", + " 52\n", + " 3223.614107\n", " \n", " \n", - " 49\n", - " 3125.766474\n", + " 60\n", + " 3445.012713\n", " \n", " \n", - " 50\n", - " 3149.281322\n", + " 61\n", + " 3505.637004\n", " \n", " \n", - " 62\n", - " 3531.688645\n", + " 64\n", + " 3515.903779\n", " \n", " \n", " 65\n", - " 4073.894238\n", + " 4028.361259\n", " \n", " \n", - " 66\n", - " 4160.803738\n", + " 67\n", + " 4159.991956\n", " \n", " \n", " 83\n", - " 3272.389735\n", + " 3348.167212\n", " \n", " \n", - " 87\n", - " 3989.358086\n", + " 85\n", + " 3485.048557\n", " \n", " \n", - " 92\n", - " 4240.488891\n", + " 93\n", + " 4172.872284\n", " \n", " \n", - " 98\n", - " 3911.449023\n", + " 104\n", + " 3299.300454\n", " \n", " \n", - " 104\n", - " 3271.196535\n", + " 105\n", + " 3515.68617\n", " \n", " \n", - " 114\n", - " 3244.722283\n", + " 108\n", + " 3405.222757\n", " \n", " \n", - " 115\n", - " 3737.368277\n", + " 113\n", + " 4209.13832\n", " \n", " \n", - " 118\n", - " 3485.949702\n", + " 130\n", + " 4197.90382\n", " \n", " \n", "\n", @@ -2888,37 +2972,37 @@ "text/plain": [ " predicted_body_mass_g\n", "penguin_id \n", - "9 4295.328991\n", - "12 3338.434943\n", - "13 3201.813783\n", - "19 3982.807707\n", - "22 3538.604385\n", - "23 3613.496641\n", - "25 4009.753161\n", - "28 4240.509087\n", - "30 4028.897875\n", - "38 4206.80377\n", - "41 3736.219256\n", - "45 4383.362136\n", - "49 3125.766474\n", - "50 3149.281322\n", - "62 3531.688645\n", - "65 4073.894238\n", - "66 4160.803738\n", - "83 3272.389735\n", - "87 3989.358086\n", - "92 4240.488891\n", - "98 3911.449023\n", - "104 3271.196535\n", - "114 3244.722283\n", - "115 3737.368277\n", - "118 3485.949702\n", + "3 3394.116212\n", + "8 4048.683645\n", + "17 3976.452358\n", + "23 3541.580346\n", + "25 4032.842027\n", + "27 4118.34983\n", + "29 4087.765797\n", + "34 3183.75379\n", + "35 3418.800633\n", + "39 3519.18471\n", + "51 3398.133564\n", + "52 3223.614107\n", + "60 3445.012713\n", + "61 3505.637004\n", + "64 3515.903779\n", + "65 4028.361259\n", + "67 4159.991956\n", + "83 3348.167212\n", + "85 3485.048557\n", + "93 4172.872284\n", + "104 3299.300454\n", + "105 3515.68617\n", + "108 3405.222757\n", + "113 4209.13832\n", + "130 4197.90382\n", "...\n", "\n", "[67 rows x 1 columns]" ] }, - "execution_count": 11, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -2950,18 +3034,18 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e6409bce719940f4ae326a8b18871d9b", + "model_id": "2d32081be31f44abb8de67e2209d76cd", "version_major": 2, "version_minor": 0 }, 
"text/plain": [ - "HTML(value='Query job a427bad2-9875-453f-ad2a-1eefaf085657 is DONE. 32.3 kB processed. \n", " \n", " 0\n", - " 241.640738\n", - " 90117.84266\n", - " 0.005652\n", - " 200.718678\n", - " 0.8727\n", - " 0.878359\n", + " 229.48269\n", + " 82962.794947\n", + " 0.004248\n", + " 206.728384\n", + " 0.88633\n", + " 0.892953\n", " \n", " \n", "\n", @@ -3069,15 +3167,15 @@ ], "text/plain": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 241.640738 90117.84266 0.005652 \n", + "0 229.48269 82962.794947 0.004248 \n", "\n", " median_absolute_error r2_score explained_variance \n", - "0 200.718678 0.8727 0.878359 \n", + "0 206.728384 0.88633 0.892953 \n", "\n", "[1 rows x 6 columns]" ] }, - "execution_count": 12, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -3097,18 +3195,18 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e56e2cc197894ee9acc448a8c12e8a30", + "model_id": "f32692d89f00406499f4ea5aa55268fb", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HTML(value='Query job 929c826c-1051-47fc-9256-546f4ef11c32 is DONE. 31.7 kB processed. Date: Fri, 20 Oct 2023 22:27:22 -0700 Subject: [PATCH 06/15] docs: add runnable code samples for reading methods (#125) * docs: add runnable and testable I/O code samples * docs: add runnable and testable reading methods code snippets * fix: assign a df and show the first 2 rows * address comments --- bigframes/session/__init__.py | 100 ++++++++++++++++++ .../bigframes_vendored/pandas/io/gbq.py | 24 +++-- .../bigframes_vendored/pandas/io/parquet.py | 14 +++ .../pandas/io/parsers/readers.py | 30 +++++- .../bigframes_vendored/pandas/io/pickle.py | 18 ++++ 5 files changed, 179 insertions(+), 7 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 2f001d7d49..5ec3da1a5a 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -342,6 +342,51 @@ def read_gbq_query( ``row_number() over ()`` if there is no natural unique index or you want to preserve ordering. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Simple query input: + + >>> df = bpd.read_gbq_query(''' + ... SELECT + ... pitcherFirstName, + ... pitcherLastName, + ... pitchSpeed, + ... FROM `bigquery-public-data.baseball.games_wide` + ... ''') + >>> df.head(2) + pitcherFirstName pitcherLastName pitchSpeed + 0 0 + 1 0 + + [2 rows x 3 columns] + + Preserve ordering in a query input. + + >>> df = bpd.read_gbq_query(''' + ... SELECT + ... -- Instead of an ORDER BY clause on the query, use + ... -- ROW_NUMBER() to create an ordered DataFrame. + ... ROW_NUMBER() OVER (ORDER BY AVG(pitchSpeed) DESC) + ... AS rowindex, + ... + ... pitcherFirstName, + ... pitcherLastName, + ... AVG(pitchSpeed) AS averagePitchSpeed + ... FROM `bigquery-public-data.baseball.games_wide` + ... WHERE year = 2016 + ... GROUP BY pitcherFirstName, pitcherLastName + ... ''', index_col="rowindex") + >>> df.head(2) + pitcherFirstName pitcherLastName averagePitchSpeed + rowindex + 1 Albertin Chapman 96.514113 + 2 Zachary Britton 94.591039 + + [2 rows x 3 columns] + See also: :meth:`Session.read_gbq`. """ # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so @@ -405,6 +450,25 @@ def read_gbq_table( ) -> dataframe.DataFrame: """Turn a BigQuery table into a DataFrame. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Read a whole table, with arbitrary ordering or ordering corresponding to the primary key(s). + + >>> df = bpd.read_gbq_table("bigquery-public-data.ml_datasets.penguins") + >>> df.head(2) + species island culmen_length_mm \\ + 0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 + 1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 + + culmen_depth_mm flipper_length_mm body_mass_g sex + 0 18.4 184.0 3475.0 FEMALE + 1 19.1 184.0 4650.0 MALE + + [2 rows x 7 columns] + See also: :meth:`Session.read_gbq`. """ # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so @@ -792,6 +856,16 @@ def _read_ibis( def read_gbq_model(self, model_name: str): """Loads a BigQuery ML model from BigQuery. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Read an existing BigQuery ML model. + + >>> model_name = "bigframes-dev.bqml_tutorial.penguins_model" + >>> model = bpd.read_gbq_model(model_name) + Args: model_name (str): the model's name in BigQuery in the format @@ -815,6 +889,22 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame The pandas DataFrame will be persisted as a temporary BigQuery table, which can be automatically recycled after the Session is closed. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + + >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> pandas_df = pd.DataFrame(data=d) + >>> df = bpd.read_pandas(pandas_df) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + [2 rows x 2 columns] + Args: pandas_dataframe (pandas.DataFrame): a pandas DataFrame object to be loaded. @@ -1365,6 +1455,16 @@ def read_gbq_function( The return type of the function must be explicitly specified in the function's original definition even if not otherwise required. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> function_name = "bqutil.fn.cw_lower_case_ascii_only" + >>> func = bpd.read_gbq_function(function_name=function_name) + >>> func.bigframes_remote_function + 'bqutil.fn.cw_lower_case_ascii_only' + Args: function_name (str): the function's name in BigQuery in the format diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 8919f4ed16..575c501618 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -42,9 +42,23 @@ def read_gbq( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None + If the input is a table ID: + + >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + >>> df.head(2) + species island culmen_length_mm \\ + 0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 + 1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 + + culmen_depth_mm flipper_length_mm body_mass_g sex + 0 18.4 184.0 3475.0 FEMALE + 1 19.1 184.0 4650.0 MALE + + [2 rows x 7 columns] + Preserve ordering in a query input. - >>> bpd.read_gbq(''' + >>> df = bpd.read_gbq(''' ... SELECT ... -- Instead of an ORDER BY clause on the query, use ... -- ROW_NUMBER() to create an ordered DataFrame. @@ -57,16 +71,14 @@ def read_gbq( ... FROM `bigquery-public-data.baseball.games_wide` ... WHERE year = 2016 ... GROUP BY pitcherFirstName, pitcherLastName - ... ''', index_col="rowindex").head(n=5) + ... 
''', index_col="rowindex") + >>> df.head(2) pitcherFirstName pitcherLastName averagePitchSpeed rowindex 1 Albertin Chapman 96.514113 2 Zachary Britton 94.591039 - 3 Trevor Rosenthal 94.213953 - 4 Jose Torres 94.103448 - 5 Tayron Guerrero 93.863636 - [5 rows x 3 columns] + [2 rows x 3 columns] Args: query_or_table (str): diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py index 9aed9af5a8..f97bd386a4 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -17,6 +17,20 @@ def read_parquet( Instead, set a serialized index column as the index and sort by that in the resulting DataFrame. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet" + >>> df = bpd.read_parquet(path=gcs_path) + >>> df.head(2) + name post_abbr + 0 Alabama AL + 1 Alaska AK + + [2 rows x 2 columns] + Args: path (str): Local or Cloud Storage path to Parquet file. diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index d19a92ecdf..e8ed6182a6 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -55,6 +55,20 @@ def read_csv( file. Instead, set a serialized index column as the index and sort by that in the resulting DataFrame. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.csv" + >>> df = bpd.read_csv(filepath_or_buffer=gcs_path) + >>> df.head(2) + name post_abbr + 0 Alabama AL + 1 Alaska AK + + [2 rows x 2 columns] + Args: filepath_or_buffer (str): A local or Google Cloud Storage (`gs://`) path with `engine="bigquery"` @@ -64,7 +78,7 @@ def read_csv( can be any ISO-8859-1 single-byte character. To use a character in the range 128-255, you must encode the character as UTF-8. Both engines support `sep="\t"` to specify tab character as separator. Default engine supports - having any number of spaces as separator by specifying `sep="\s+"`. Separators + having any number of spaces as separator by specifying `sep="\\s+"`. Separators longer than 1 character are interpreted as regular expressions by the default engine. BigQuery engine only supports single character separators. header (Optional[int], default 0): @@ -146,6 +160,20 @@ def read_json( file. Instead, set a serialized index column as the index and sort by that in the resulting DataFrame. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> gcs_path = "gs://bigframes-dev-testing/sample1.json" + >>> df = bpd.read_json(path_or_buf=gcs_path, lines=True, orient="records") + >>> df.head(2) + id name + 0 1 Alice + 1 2 Bob + + [2 rows x 2 columns] + Args: path_or_buf (a valid JSON str, path object or file-like object): A local or Google Cloud Storage (`gs://`) path with `engine="bigquery"` diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py index 71b31956a0..053ba4871c 100644 --- a/third_party/bigframes_vendored/pandas/io/pickle.py +++ b/third_party/bigframes_vendored/pandas/io/pickle.py @@ -25,6 +25,24 @@ def read_pickle( If the content of the pickle file is a Series and its name attribute is None, the name will be set to '0' by default. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> gcs_path = "gs://bigframes-dev-testing/test_pickle.pkl" + >>> df = bpd.read_pickle(filepath_or_buffer=gcs_path) + >>> df.head(2) + species island culmen_length_mm \\ + 0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 + 1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 + + culmen_depth_mm flipper_length_mm body_mass_g sex + 0 18.4 184.0 3475.0 FEMALE + 1 19.1 184.0 4650.0 MALE + + [2 rows x 7 columns] + Args: filepath_or_buffer (str, path object, or file-like object): String, path object (implementing os.PathLike[str]), or file-like object From 02984a4530d63e00dd628705149a1fe788cc263e Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Sat, 21 Oct 2023 10:33:03 -0700 Subject: [PATCH 07/15] test: add sample code for use BigFrames developer guide (#118) * test: add sample code for use BigFrames developer guide --- .../snippets/load_data_from_bigquery_test.py | 24 +++++++++++++ samples/snippets/load_data_from_csv_test.py | 25 ++++++++++++++ samples/snippets/pandas_methods_test.py | 34 +++++++++++++++++++ samples/snippets/set_options_test.py | 34 +++++++++++++++++++ 4 files changed, 117 insertions(+) create mode 100644 samples/snippets/load_data_from_bigquery_test.py create mode 100644 samples/snippets/load_data_from_csv_test.py create mode 100644 samples/snippets/pandas_methods_test.py create mode 100644 samples/snippets/set_options_test.py diff --git a/samples/snippets/load_data_from_bigquery_test.py b/samples/snippets/load_data_from_bigquery_test.py new file mode 100644 index 0000000000..e4c65688bd --- /dev/null +++ b/samples/snippets/load_data_from_bigquery_test.py @@ -0,0 +1,24 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def test_bigquery_dataframes_load_data_from_bigquery(): + # [START bigquery_dataframes_load_data_from_bigquery] + # Create a DataFrame from a BigQuery table: + import bigframes.pandas as bpd + + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + # [END bigquery_dataframes_load_data_from_bigquery] + assert bq_df is not None diff --git a/samples/snippets/load_data_from_csv_test.py b/samples/snippets/load_data_from_csv_test.py new file mode 100644 index 0000000000..31ab9255bf --- /dev/null +++ b/samples/snippets/load_data_from_csv_test.py @@ -0,0 +1,25 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_bigquery_dataframes_load_data_from_csv(): + # [START bigquery_dataframes_load_data_from_csv] + import bigframes.pandas as bpd + + filepath_or_buffer = "gs://cloud-samples-data/bigquery/us-states/us-states.csv" + df_from_gcs = bpd.read_csv(filepath_or_buffer) + # Display the first few rows of the DataFrame: + df_from_gcs.head() + # [END bigquery_dataframes_load_data_from_csv] + assert df_from_gcs is not None diff --git a/samples/snippets/pandas_methods_test.py b/samples/snippets/pandas_methods_test.py new file mode 100644 index 0000000000..1f472d6346 --- /dev/null +++ b/samples/snippets/pandas_methods_test.py @@ -0,0 +1,34 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def test_bigquery_dataframes_pandas_methods(): + # [START bigquery_dataframes_pandas_methods] + import bigframes.pandas as bpd + + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + + # Inspect one of the columns (or series) of the DataFrame: + bq_df["body_mass_g"].head(10) + + # Compute the mean of this series: + average_body_mass = bq_df["body_mass_g"].mean() + print(f"average_body_mass: {average_body_mass}") + + # Calculate the mean body_mass_g by species using the groupby operation: + bq_df["body_mass_g"].groupby(by=bq_df["species"]).mean().head() + # [END bigquery_dataframes_pandas_methods] + assert average_body_mass is not None diff --git a/samples/snippets/set_options_test.py b/samples/snippets/set_options_test.py new file mode 100644 index 0000000000..ef6f41ce54 --- /dev/null +++ b/samples/snippets/set_options_test.py @@ -0,0 +1,34 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_bigquery_dataframes_set_options(): + # Close the session before resetting the options + import bigframes.pandas as bpd + + bpd.close_session() + + # [START bigquery_dataframes_set_options] + import bigframes.pandas as bpd + + PROJECT_ID = "bigframes-dec" # @param {type:"string"} + REGION = "US" # @param {type:"string"} + + # Set BigQuery DataFrames options + bpd.options.bigquery.project = PROJECT_ID + bpd.options.bigquery.location = REGION + + # [END bigquery_dataframes_set_options] + assert bpd.options.bigquery.project == PROJECT_ID + assert bpd.options.bigquery.location == REGION From b17e1f43cd0f7567bc5b59b0e916cd20528312b3 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 23 Oct 2023 12:05:58 -0500 Subject: [PATCH 08/15] fix: expose `bigframes.pandas.reset_session` as a public API (#128) --- bigframes/pandas/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 971d40f801..5c1928e6f0 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -484,4 +484,5 @@ def read_gbq_function(function_name: str): # Session management APIs "get_global_session", "close_session", + "reset_session", ] From f9ba28c6a6ab1ceaeecd70f7b5a87ec7c404ed13 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 23 Oct 2023 15:30:43 -0500 Subject: [PATCH 09/15] chore: remove unused reference to `THIRD_PARTY_NOTICES` from "nightly" (#130) --- .kokoro/release-nightly.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.kokoro/release-nightly.sh b/.kokoro/release-nightly.sh index 5cc1275308..0751cf2502 100755 --- a/.kokoro/release-nightly.sh +++ b/.kokoro/release-nightly.sh @@ -93,7 +93,6 @@ for gcs_path in gs://vertex_sdk_private_releases/bigframe/ \ do gsutil cp -v dist/* ${gcs_path} gsutil cp -v LICENSE ${gcs_path} - gsutil cp -v ${THIRD_PARTY_NOTICES_FILE} ${gcs_path} gsutil -m cp -r -v "notebooks/" ${gcs_path}notebooks/ done From 386f35d2840ab677bfb83f9a0b6f3c8de06e78e8 Mon Sep 17 
00:00:00 2001 From: Tim Swast Date: Tue, 24 Oct 2023 10:44:13 -0500 Subject: [PATCH 10/15] refactor: move DDL gen to `bigframes.session._io`, add missing test (#131) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Towards internal issue 280662868 🦕 --- bigframes/dataframe.py | 14 +++++++------- bigframes/session/__init__.py | 2 +- bigframes/session/_io/__init__.py | 13 +++++++++++++ bigframes/{core/io.py => session/_io/bigquery.py} | 0 .../test_io.py => session/test_io_bigquery.py} | 9 +++++---- 5 files changed, 26 insertions(+), 12 deletions(-) create mode 100644 bigframes/session/_io/__init__.py rename bigframes/{core/io.py => session/_io/bigquery.py} (100%) rename tests/unit/{core/test_io.py => session/test_io_bigquery.py} (93%) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 32a2908a42..5c0d9b78e1 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -46,7 +46,6 @@ import bigframes.core.guid import bigframes.core.indexers as indexers import bigframes.core.indexes as indexes -import bigframes.core.io import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.core.window @@ -56,6 +55,7 @@ import bigframes.operations.aggregations as agg_ops import bigframes.series import bigframes.series as bf_series +import bigframes.session._io.bigquery import third_party.bigframes_vendored.pandas.core.frame as vendored_pandas_frame import third_party.bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing @@ -2201,9 +2201,9 @@ def to_csv( raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD) result_table = self._run_io_query( - index=index, ordering_id=bigframes.core.io.IO_ORDERING_ID + index=index, ordering_id=bigframes.session._io.bigquery.IO_ORDERING_ID ) - export_data_statement = bigframes.core.io.create_export_csv_statement( + export_data_statement = bigframes.session._io.bigquery.create_export_csv_statement( f"{result_table.project}.{result_table.dataset_id}.{result_table.table_id}", uri=path_or_buf, field_delimiter=sep, @@ -2243,9 +2243,9 @@ def to_json( ) result_table = self._run_io_query( - index=index, ordering_id=bigframes.core.io.IO_ORDERING_ID + index=index, ordering_id=bigframes.session._io.bigquery.IO_ORDERING_ID ) - export_data_statement = bigframes.core.io.create_export_data_statement( + export_data_statement = bigframes.session._io.bigquery.create_export_data_statement( f"{result_table.project}.{result_table.dataset_id}.{result_table.table_id}", uri=path_or_buf, format="JSON", @@ -2319,9 +2319,9 @@ def to_parquet( export_options["compression"] = compression.upper() result_table = self._run_io_query( - index=index, ordering_id=bigframes.core.io.IO_ORDERING_ID + index=index, ordering_id=bigframes.session._io.bigquery.IO_ORDERING_ID ) - export_data_statement = bigframes.core.io.create_export_data_statement( + export_data_statement = bigframes.session._io.bigquery.create_export_data_statement( f"{result_table.project}.{result_table.dataset_id}.{result_table.table_id}", uri=path, format="PARQUET", diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 5ec3da1a5a..db9c5a353c 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -67,13 +67,13 @@ import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.guid as guid -import bigframes.core.io as bigframes_io from bigframes.core.ordering import IntegerEncoding, OrderingColumnReference import bigframes.core.utils as utils import 
bigframes.dataframe as dataframe import bigframes.formatting_helpers as formatting_helpers from bigframes.remote_function import read_gbq_function as bigframes_rgf from bigframes.remote_function import remote_function as bigframes_rf +import bigframes.session._io.bigquery as bigframes_io import bigframes.session.clients import bigframes.version diff --git a/bigframes/session/_io/__init__.py b/bigframes/session/_io/__init__.py new file mode 100644 index 0000000000..1dc90d1848 --- /dev/null +++ b/bigframes/session/_io/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/core/io.py b/bigframes/session/_io/bigquery.py similarity index 100% rename from bigframes/core/io.py rename to bigframes/session/_io/bigquery.py diff --git a/tests/unit/core/test_io.py b/tests/unit/session/test_io_bigquery.py similarity index 93% rename from tests/unit/core/test_io.py rename to tests/unit/session/test_io_bigquery.py index afb38a5f75..d2255d5edf 100644 --- a/tests/unit/core/test_io.py +++ b/tests/unit/session/test_io_bigquery.py @@ -18,7 +18,7 @@ import google.cloud.bigquery as bigquery import pytest -import bigframes.core.io +import bigframes.session._io.bigquery def test_create_snapshot_sql_doesnt_timetravel_anonymous_datasets(): @@ -26,7 +26,7 @@ def test_create_snapshot_sql_doesnt_timetravel_anonymous_datasets(): "my-test-project._e8166e0cdb.anonbb92cd" ) - sql = bigframes.core.io.create_snapshot_sql( + sql = bigframes.session._io.bigquery.create_snapshot_sql( table_ref, datetime.datetime.now(datetime.timezone.utc) ) @@ -40,7 +40,7 @@ def test_create_snapshot_sql_doesnt_timetravel_anonymous_datasets(): def test_create_snapshot_sql_doesnt_timetravel_session_datasets(): table_ref = bigquery.TableReference.from_string("my-test-project._session.abcdefg") - sql = bigframes.core.io.create_snapshot_sql( + sql = bigframes.session._io.bigquery.create_snapshot_sql( table_ref, datetime.datetime.now(datetime.timezone.utc) ) @@ -101,4 +101,5 @@ def test_create_snapshot_sql_doesnt_timetravel_session_datasets(): ), ) def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str): - pass + sql = bigframes.session._io.bigquery.bq_schema_to_sql(schema) + assert sql == expected From 95bff3f1902bc09dc3310798a42df8ffd31ed8ee Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Tue, 24 Oct 2023 15:08:14 -0700 Subject: [PATCH 11/15] fix: use series's own session in series.reindex listlike case (#135) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes # 🦕
---
 bigframes/series.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/bigframes/series.py b/bigframes/series.py
index 84ca2a578f..49df8ab61e 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -1242,7 +1242,9 @@ def reindex(self, index=None, *, validate: typing.Optional[bool] = None):
             raise NotImplementedError(
                 "Cannot reindex with index with different nlevels"
             )
-        new_indexer = bigframes.dataframe.DataFrame(index=index)[[]]
+        new_indexer = bigframes.dataframe.DataFrame(
+            index=index, session=self._get_block().expr._session
+        )[[]]
         # multiindex join is sensitive to index names, so we will set all these
         result = new_indexer.rename_axis(range(new_indexer.index.nlevels)).join(
             self.to_frame().rename_axis(range(self.index.nlevels)),

From 6fea8efac35871985677ebeb948a576e64a1ffa4 Mon Sep 17 00:00:00 2001
From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com>
Date: Tue, 24 Oct 2023 21:07:15 -0700
Subject: [PATCH 12/15] docs: add runnable code samples for DataFrames I/O methods and property (#129)

* docs: add runnable code samples for DataFrames I/O methods and property

* fix: expose `bigframes.pandas.reset_session` as a public API (#128)

* fix: address the comment

* Empty commit

* fix: address comments for better visualization of the output

* Empty commit

---------

Co-authored-by: Tim Swast
---
 .../bigframes_vendored/pandas/core/frame.py | 172 +++++++++++++++++-
 1 file changed, 164 insertions(+), 8 deletions(-)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index a5c12d7b32..13a81b4645 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -33,7 +33,19 @@ class DataFrame(NDFrame):
 
     @property
     def shape(self) -> tuple[int, int]:
-        """Return a tuple representing the dimensionality of the DataFrame."""
+        """
+        Return a tuple representing the dimensionality of the DataFrame.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'col1': [1, 2, 3],
+            ...                     'col2': [4, 5, 6]})
+            >>> df.shape
+            (3, 2)
+        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
     @property
@@ -44,14 +56,14 @@ def axes(self) -> list:
         It has the row axis labels and column axis labels as the only members.
         They are returned in that order.
 
-        Examples
-
-        .. code-block::
-
-            df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
-            df.axes
-            [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
-            dtype='object')]
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+            >>> df.axes[1:]
+            [Index(['col1', 'col2'], dtype='object')]
         """
         return [self.index, self.columns]
 
@@ -59,6 +71,16 @@ def values(self) -> np.ndarray:
         """Return the values of DataFrame in the form of a NumPy array.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+            >>> df.values
+            array([[1, 3],
+                   [2, 4]], dtype=object)
+
         Args:
             dtype (default None):
                 The dtype to pass to `numpy.asarray()`.
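The `values` property documented above and the `to_numpy` method in the next hunk produce the same local NumPy array for the same frame, as their docstring examples show: both execute the underlying query and pull the result into client memory. A minimal sketch of that equivalence, assuming an authenticated BigQuery session (the frame literal is illustrative, not part of the patch):

    import numpy as np

    import bigframes.pandas as bpd

    bpd.options.display.progress_bar = None

    # Constructed from local data, as in the docstring examples above.
    df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]})

    # Both calls materialize the same local array.
    arr = df.to_numpy()
    assert isinstance(arr, np.ndarray)
    assert (arr == df.values).all()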
@@ -76,6 +98,16 @@ def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarra """ Convert the DataFrame to a NumPy array. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.to_numpy() + array([[1, 3], + [2, 4]], dtype=object) + Args: dtype (None): The dtype to pass to `numpy.asarray()`. @@ -101,6 +133,15 @@ def to_gbq( ) -> None: """Write a DataFrame to a BigQuery table. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> # destination_table = PROJECT_ID + "." + DATASET_ID + "." + TABLE_NAME + >>> df.to_gbq("bigframes-dev.birds.test-numbers", if_exists="replace") + Args: destination_table (str): Name of table to be written, in the form ``dataset.tablename`` @@ -137,6 +178,15 @@ def to_parquet( This function writes the dataframe as a `parquet file `_ to Cloud Storage. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> gcs_bucket = "gs://bigframes-dev-testing/sample_parquet*.parquet" + >>> df.to_parquet(path=gcs_bucket) + Args: path (str): Destination URI(s) of Cloud Storage files(s) to store the extracted dataframe @@ -171,6 +221,35 @@ def to_dict( The type of the key-value pairs can be customized with the parameters (see below). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.to_dict() + {'col1': {0: 1, 1: 2}, 'col2': {0: 3, 1: 4}} + + You can specify the return orientation. + + >>> df.to_dict('series') + {'col1': 0 1 + 1 2 + Name: col1, dtype: Int64, + 'col2': 0 3 + 1 4 + Name: col2, dtype: Int64} + + >>> df.to_dict('split') + {'index': [0, 1], 'columns': ['col1', 'col2'], 'data': [[1, 3], [2, 4]]} + + >>> df.to_dict("tight") + {'index': [0, 1], + 'columns': ['col1', 'col2'], + 'data': [[1, 3], [2, 4]], + 'index_names': [None], + 'column_names': [None]} + Args: orient (str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}): Determines the type of the values of the dictionary. @@ -213,6 +292,15 @@ def to_excel(self, excel_writer, sheet_name: str = "Sheet1", **kwargs) -> None: Note that creating an `ExcelWriter` object with a file name that already exists will result in the contents of the existing file being erased. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import tempfile + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.to_excel(tempfile.TemporaryFile()) + Args: excel_writer (path-like, file-like, or ExcelWriter object): File path or existing ExcelWriter. @@ -231,6 +319,23 @@ def to_latex( into a main LaTeX document or read from an external file with ``\input{{table.tex}}``. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> print(df.to_latex()) + \begin{tabular}{lrr} + \toprule + & col1 & col2 \\ + \midrule + 0 & 1 & 3 \\ + 1 & 2 & 4 \\ + \bottomrule + \end{tabular} + + Args: buf (str, Path or StringIO-like, optional, default None): Buffer to write to. If None, the output is returned as a string. 
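The `to_dict` and `to_latex` examples above both return plain local Python objects (a dict and a string, respectively), so their outputs compose directly with ordinary file I/O. A short usage sketch under the same session assumption as before (the output path is illustrative, not from the patch):

    import bigframes.pandas as bpd

    bpd.options.display.progress_bar = None

    df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]})

    # orient="records" yields one dict per row.
    records = df.to_dict(orient="records")
    # [{'col1': 1, 'col2': 3}, {'col1': 2, 'col2': 4}]

    # With buf=None, to_latex returns the rendered table as a string.
    with open("/tmp/table.tex", "w") as f:
        f.write(df.to_latex())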
@@ -253,6 +358,16 @@ def to_records( Index will be included as the first field of the record array if requested. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.to_records() + rec.array([(0, 1, 3), (1, 2, 4)], + dtype=[('index', 'O'), ('col1', 'O'), ('col2', 'O')]) + Args: index (bool, default True): Include index in resulting record array, stored in 'index' @@ -298,6 +413,17 @@ def to_string( ): """Render a DataFrame to a console-friendly tabular output. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> print(df.to_string()) + col1 col2 + 0 1 3 + 1 2 4 + Args: buf (str, Path or StringIO-like, optional, default None): Buffer to write to. If None, the output is returned as a string. @@ -363,6 +489,18 @@ def to_markdown( ): """Print DataFrame in Markdown-friendly format. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> print(df.to_markdown()) + | | col1 | col2 | + |---:|-------:|-------:| + | 0 | 1 | 3 | + | 1 | 2 | 4 | + Args: buf (str, Path or StringIO-like, optional, default None): Buffer to write to. If None, the output is returned as a string. @@ -371,7 +509,7 @@ def to_markdown( index (bool, optional, default True): Add index (row) labels. **kwargs - These parameters will be passed to `tabulate `_. + These parameters will be passed to `tabulate `_. Returns: DataFrame in Markdown-friendly format. @@ -381,6 +519,15 @@ def to_markdown( def to_pickle(self, path, **kwargs) -> None: """Pickle (serialize) object to file. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> gcs_bucket = "gs://bigframes-dev-testing/sample_pickle_gcs.pkl" + >>> df.to_pickle(path=gcs_bucket) + Args: path (str): File path where the pickled object will be stored. @@ -391,6 +538,15 @@ def to_orc(self, path=None, **kwargs) -> bytes | None: """ Write a DataFrame to the ORC format. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> import tempfile + >>> df.to_orc(tempfile.TemporaryFile()) + Args: path (str, file-like object or None, default None): If a string, it will be used as Root Directory path From 05d7618c50acf7b7f9d73e02a8870c8eac910aab Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Wed, 25 Oct 2023 15:52:16 -0700 Subject: [PATCH 13/15] test: allow for alternative PCA solutions in tests (#143) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- tests/system/large/ml/test_decomposition.py | 4 ++-- tests/system/large/ml/test_pipeline.py | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 460f07b816..a7049d4c18 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -67,8 +67,8 @@ def test_decomposition_configure_fit_score_predict( index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) pd.testing.assert_frame_equal( - result.sort_index(), - expected, + abs(result.sort_index()), # results may differ by a minus sign + abs(expected), check_exact=False, rtol=0.1, ) diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 9294740dd6..6874a9f301 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -431,10 +431,16 @@ def test_pipeline_PCA_fit_score_predict(session, penguins_df_default_index): index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) pd.testing.assert_frame_equal( - predictions[ - ["principal_component_1", "principal_component_2", "principal_component_3"] - ], - expected, + abs( # results may differ by a minus sign + predictions[ + [ + "principal_component_1", + "principal_component_2", + "principal_component_3", + ] + ] + ), + abs(expected), check_exact=False, rtol=0.1, ) From 1641aff37d601b47e0bc4f25ff148be4f718bd1a Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 26 Oct 2023 00:16:15 +0000 Subject: [PATCH 14/15] ci: Disable presubmit LLM tests temporarily (#144) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- tests/system/small/ml/test_llm.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index b7257dde1b..a801c36c83 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -26,6 +26,9 @@ def test_create_text_generator_model(palm2_text_generator_model): assert palm2_text_generator_model._bqml_model is not None +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." 
+) @pytest.mark.flaky(retries=2, delay=120) def test_create_text_generator_model_default_session(bq_connection, llm_text_pandas_df): import bigframes.pandas as bpd @@ -48,6 +51,9 @@ def test_create_text_generator_model_default_session(bq_connection, llm_text_pan assert all(series.str.len() > 20) +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." +) @pytest.mark.flaky(retries=2, delay=120) def test_create_text_generator_model_default_connection(llm_text_pandas_df): from bigframes import _config @@ -74,6 +80,9 @@ def test_create_text_generator_model_default_connection(llm_text_pandas_df): # Marked as flaky only because BQML LLM is in preview, the service only has limited capacity, not stable enough. +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." +) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_default_params_success( palm2_text_generator_model, llm_text_df @@ -85,6 +94,9 @@ def test_text_generator_predict_default_params_success( assert all(series.str.len() > 20) +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." +) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_series_default_params_success( palm2_text_generator_model, llm_text_df @@ -96,6 +108,9 @@ def test_text_generator_predict_series_default_params_success( assert all(series.str.len() > 20) +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." +) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_arbitrary_col_label_success( palm2_text_generator_model, llm_text_df @@ -108,6 +123,9 @@ def test_text_generator_predict_arbitrary_col_label_success( assert all(series.str.len() > 20) +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." +) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_with_params_success( palm2_text_generator_model, llm_text_df @@ -139,6 +157,9 @@ def test_create_text_embedding_generator_model_defaults(bq_connection): assert model._bqml_model is not None +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." +) @pytest.mark.flaky(retries=2, delay=120) def test_embedding_generator_predict_success( palm2_embedding_generator_model, llm_text_df @@ -152,6 +173,9 @@ def test_embedding_generator_predict_success( assert value.size == 768 +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." 
+) @pytest.mark.flaky(retries=2, delay=120) def test_embedding_generator_predict_series_success( palm2_embedding_generator_model, llm_text_df From c3b24b59cd02eeef4fab46761faf1699daa90252 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Wed, 25 Oct 2023 19:39:22 -0700 Subject: [PATCH 15/15] chore(main): release 0.11.0 (#126) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 20 ++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d9f63d4c6..93ebadb56f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,26 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.11.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.10.0...v0.11.0) (2023-10-26) + + +### Features + +* Add back `reset_session` as an alias for `close_session` ([#124](https://github.com/googleapis/python-bigquery-dataframes/issues/124)) ([694a85a](https://github.com/googleapis/python-bigquery-dataframes/commit/694a85a0ef90d838700014a204d72b23362db1d8)) +* Change `query` parameter to `query_or_table` in `read_gbq` ([#127](https://github.com/googleapis/python-bigquery-dataframes/issues/127)) ([f9bb3c4](https://github.com/googleapis/python-bigquery-dataframes/commit/f9bb3c4bc88c5ba2be6f17e12a0ec4f482ce161f)) + + +### Bug Fixes + +* Expose `bigframes.pandas.reset_session` as a public API ([#128](https://github.com/googleapis/python-bigquery-dataframes/issues/128)) ([b17e1f4](https://github.com/googleapis/python-bigquery-dataframes/commit/b17e1f43cd0f7567bc5b59b0e916cd20528312b3)) +* Use series's own session in series.reindex listlike case ([#135](https://github.com/googleapis/python-bigquery-dataframes/issues/135)) ([95bff3f](https://github.com/googleapis/python-bigquery-dataframes/commit/95bff3f1902bc09dc3310798a42df8ffd31ed8ee)) + + +### Documentation + +* Add runnable code samples for DataFrames I/O methods and property ([#129](https://github.com/googleapis/python-bigquery-dataframes/issues/129)) ([6fea8ef](https://github.com/googleapis/python-bigquery-dataframes/commit/6fea8efac35871985677ebeb948a576e64a1ffa4)) +* Add runnable code samples for reading methods ([#125](https://github.com/googleapis/python-bigquery-dataframes/issues/125)) ([a669919](https://github.com/googleapis/python-bigquery-dataframes/commit/a669919ff25b56156bd70ccd816a0bf19adb48aa)) + ## [0.10.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.9.0...v0.10.0) (2023-10-19) diff --git a/bigframes/version.py b/bigframes/version.py index 7a37ebd220..18edfa5615 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.10.0" +__version__ = "0.11.0"
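
With the version bump above, the 0.11.0 release can be sanity-checked after installation. A small sketch, assuming `bigframes==0.11.0` is installed and a session can be created:

    import bigframes.pandas as bpd
    from bigframes.version import __version__

    assert __version__ == "0.11.0"

    # reset_session is public again as an alias for close_session,
    # per the changelog entry above.
    bpd.reset_session()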