chore: add experimental blob.image_normalize function (#1388)

GarrettWu · web-flow · commit d6fbb5b6ccda · 2025-02-12T10:48:25.000-08:00
diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py
@@ -174,6 +174,54 @@ def image_resize_func(
 )
 
 
+def image_normalize_func(
+    src_obj_ref_rt: str, dst_obj_ref_rt: str, alpha: float, beta: float, norm_type: str
+) -> str:
+    """Normalize one image with OpenCV and upload the result as a JPEG.
+
+    Reads ``access_urls.read_url`` of ``src_obj_ref_rt``, applies ``cv.normalize``
+    with ``alpha``, ``beta`` and ``norm_type`` ("inf", "l1", "l2" or "minmax"),
+    PUTs the JPEG to ``access_urls.write_url`` of ``dst_obj_ref_rt`` and returns
+    ``dst_obj_ref_rt`` unchanged. Raises ``requests.HTTPError`` on a failed
+    download or upload, ``ValueError`` if the bytes are not a decodable image.
+    """
+    import json
+
+    import cv2 as cv  # type: ignore
+    import numpy as np
+    import requests
+
+    norm_type_mapping = {
+        "inf": cv.NORM_INF,
+        "l1": cv.NORM_L1,
+        "l2": cv.NORM_L2,
+        "minmax": cv.NORM_MINMAX,
+    }
+    src_url = json.loads(src_obj_ref_rt)["access_urls"]["read_url"]
+    dst_url = json.loads(dst_obj_ref_rt)["access_urls"]["write_url"]
+
+    response = requests.get(src_url, timeout=30)  # no default timeout in requests
+    response.raise_for_status()  # never decode an HTTP error body as an image
+    img = cv.imdecode(np.frombuffer(response.content, np.uint8), cv.IMREAD_UNCHANGED)
+    if img is None:  # imdecode returns None for undecodable input
+        raise ValueError("Source object could not be decoded as an image.")
+    img_normalized = cv.normalize(
+        img, None, alpha=alpha, beta=beta, norm_type=norm_type_mapping[norm_type]
+    )
+    bts = cv.imencode(".jpeg", img_normalized)[1].tobytes()
+
+    upload = requests.put(
+        url=dst_url, data=bts, headers={"Content-Type": "image/jpeg"}, timeout=30
+    )
+    upload.raise_for_status()  # a rejected upload must not look like success
+    return dst_obj_ref_rt
+
+
+image_normalize_def = FunctionDef(  # func + package names matching its imports
+    image_normalize_func, ["opencv-python", "numpy", "requests"]
+)
+
+
 # Extracts all text from a PDF url
 def pdf_extract_func(src_obj_ref_rt: str) -> str:
     import io
diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py
@@ -384,6 +384,62 @@ def image_resize(
 
         return dst
 
+    def image_normalize(
+        self,
+        *,
+        alpha: float = 1.0,
+        beta: float = 0.0,
+        norm_type: str = "l2",
+        dst: Union[str, bigframes.series.Series],
+        connection: Optional[str] = None,
+    ) -> bigframes.series.Series:
+        """Normalize images.
+
+        .. note::
+            BigFrames Blob is still experimental. It may not work and is subject to change in the future.
+
+        Args:
+            alpha (float, default 1.0): Norm value to normalize to or the lower range boundary in case of the range normalization.
+            beta (float, default 0.0): Upper range boundary in case of the range normalization; it is not used for the norm normalization.
+            norm_type (str, default "l2"): Normalization type. Accepted values are "inf", "l1", "l2" and "minmax".
+            dst (str or bigframes.series.Series): Destination GCS folder str or blob series.
+            connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session.
+
+        Returns:
+            BigFrames Blob Series
+
+        Raises:
+            ValueError: If norm_type is not one of the accepted values.
+        """
+        import bigframes.blob._functions as blob_func
+
+        # Fail fast: the UDF's dict lookup would otherwise fail only at execution time.
+        if norm_type not in ("inf", "l1", "l2", "minmax"):
+            raise ValueError(f"Unsupported norm_type: {norm_type!r}.")
+
+        connection = self._resolve_connection(connection)
+        if isinstance(dst, str):
+            dst = os.path.join(dst, "")
+            src_uri = bigframes.series.Series(self._block).struct.explode()["uri"]
+            # Replace src folder with dst folder, keep the file names.
+            dst_uri = src_uri.str.replace(r"^.*\/(.*)$", rf"{dst}\1", regex=True)
+            dst_blob = dst_uri.str.to_blob(connection=connection)
+            dst = cast(bigframes.series.Series, dst_blob)
+
+        image_normalize_udf = blob_func.TransformFunction(
+            blob_func.image_normalize_def,
+            session=self._block.session,
+            connection=connection,
+        ).udf()
+        src_rt = self._get_runtime_json_str(mode="R")
+        dst_rt = dst.blob._get_runtime_json_str(mode="RW")
+        df = src_rt.to_frame().join(dst_rt.to_frame(), how="outer")
+        df["alpha"] = alpha
+        df["beta"] = beta
+        df["norm_type"] = norm_type
+        df.apply(image_normalize_udf, axis=1).cache()  # cache() executes the udf
+        return dst
+
     def pdf_extract(
         self, *, connection: Optional[str] = None
     ) -> bigframes.series.Series: