chore: add a new metric (#1345)

Genesis929 · web-flow · commit e86a4da6b802 · 2025-01-30T17:40:19.000-08:00
* chore: add a new metric

* update print
diff --git a/bigframes/session/metrics.py b/bigframes/session/metrics.py
@@ -30,28 +30,31 @@ class ExecutionMetrics:
     slot_millis: int = 0
     bytes_processed: int = 0
     execution_secs: float = 0
+    query_char_count: int = 0
 
     def count_job_stats(self, query_job: bq_job.QueryJob):
         stats = get_performance_stats(query_job)
         if stats is not None:
-            bytes_processed, slot_millis, execution_secs = stats
+            bytes_processed, slot_millis, execution_secs, query_char_count = stats
             self.execution_count += 1
             self.bytes_processed += bytes_processed
             self.slot_millis += slot_millis
             self.execution_secs += execution_secs
+            self.query_char_count += query_char_count
             if LOGGING_NAME_ENV_VAR in os.environ:
                 # when running notebooks via pytest nbmake
-                write_stats_to_disk(bytes_processed, slot_millis, execution_secs)
+                write_stats_to_disk(
+                    bytes_processed, slot_millis, execution_secs, query_char_count
+                )
 
 
 def get_performance_stats(
     query_job: bigquery.QueryJob,
-) -> Optional[Tuple[int, int, float]]:
+) -> Optional[Tuple[int, int, float, int]]:
     """Parse the query job for performance stats.
 
     Return None if the stats do not reflect real work done in bigquery.
     """
-
     if (
         query_job.configuration.dry_run
         or query_job.created is None
@@ -68,12 +71,13 @@ def get_performance_stats(
         return None  # filter out mocks
 
     execution_secs = (query_job.ended - query_job.created).total_seconds()
+    query_char_count = len(query_job.query)
 
-    return bytes_processed, slot_millis, execution_secs
+    return bytes_processed, slot_millis, execution_secs, query_char_count
 
 
 def write_stats_to_disk(
-    bytes_processed: int, slot_millis: int, exec_seconds: Optional[float]
+    bytes_processed: int, slot_millis: int, exec_seconds: float, query_char_count: int
 ):
     """For pytest runs only, log information about the query job
     to a file in order to create a performance report.
@@ -103,3 +107,10 @@ def write_stats_to_disk(
     )
     with open(exec_time_file, "a") as f:
         f.write(str(exec_seconds) + "\n")
+
+    # store length of query
+    query_char_count_file = os.path.join(
+        current_directory, test_name + ".query_char_count"
+    )
+    with open(query_char_count_file, "a") as f:
+        f.write(str(query_char_count) + "\n")
diff --git a/scripts/run_and_publish_benchmark.py b/scripts/run_and_publish_benchmark.py
@@ -88,22 +88,27 @@ def collect_benchmark_result(
         millis_files = sorted(path.rglob("*.slotmillis"))
         bq_seconds_files = sorted(path.rglob("*.bq_exec_time_seconds"))
         local_seconds_files = sorted(path.rglob("*.local_exec_time_seconds"))
+        query_char_count_files = sorted(path.rglob("*.query_char_count"))
+
         error_files = sorted(path.rglob("*.error"))
 
         if not (
             len(bytes_files)
             == len(millis_files)
             == len(local_seconds_files)
             == len(bq_seconds_files)
+            == len(query_char_count_files)
         ):
             raise ValueError(
-                "Mismatch in the number of report files for bytes, millis, and seconds."
+                "Mismatch in the number of report files for bytes, millis, seconds and query char count."
             )
 
         for idx in range(len(bytes_files)):
             bytes_file = bytes_files[idx]
             millis_file = millis_files[idx]
             bq_seconds_file = bq_seconds_files[idx]
+            query_char_count_file = query_char_count_files[idx]
+
             filename = bytes_file.relative_to(path).with_suffix("")
 
             if filename != millis_file.relative_to(path).with_suffix(
@@ -136,19 +141,25 @@ def collect_benchmark_result(
                 lines = file.read().splitlines()
                 bq_seconds = sum(float(line) for line in lines) / iterations
 
+            with open(query_char_count_file, "r") as file:
+                lines = file.read().splitlines()
+                query_char_count = sum(int(line) for line in lines) / iterations
+
             results_dict[str(filename)] = [
                 query_count,
                 total_bytes,
                 total_slot_millis,
                 local_seconds,
                 bq_seconds,
+                query_char_count,
             ]
     finally:
         for files_to_remove in (
             path.rglob("*.bytesprocessed"),
             path.rglob("*.slotmillis"),
             path.rglob("*.local_exec_time_seconds"),
             path.rglob("*.bq_exec_time_seconds"),
+            path.rglob("*.query_char_count"),
             path.rglob("*.error"),
         ):
             for log_file in files_to_remove:
@@ -160,6 +171,7 @@ def collect_benchmark_result(
         "Slot_Millis",
         "Local_Execution_Time_Sec",
         "BigQuery_Execution_Time_Sec",
+        "Query_Char_Count",
     ]
 
     benchmark_metrics = pd.DataFrame.from_dict(
@@ -182,15 +194,19 @@ def collect_benchmark_result(
         )
         print(
             f"{index} - query count: {row['Query_Count']},"
+            f" query char count: {row['Query_Char_Count']},",
             f" bytes processed sum: {row['Bytes_Processed']},"
             f" slot millis sum: {row['Slot_Millis']},"
             f" local execution time: {formatted_local_exec_time} seconds,"
-            f" bigquery execution time: {round(row['BigQuery_Execution_Time_Sec'], 1)} seconds"
+            f" bigquery execution time: {round(row['BigQuery_Execution_Time_Sec'], 1)} seconds",
         )
 
     geometric_mean_queries = geometric_mean_excluding_zeros(
         benchmark_metrics["Query_Count"]
     )
+    geometric_mean_query_char_count = geometric_mean_excluding_zeros(
+        benchmark_metrics["Query_Char_Count"]
+    )
     geometric_mean_bytes = geometric_mean_excluding_zeros(
         benchmark_metrics["Bytes_Processed"]
     )
@@ -206,6 +222,7 @@ def collect_benchmark_result(
 
     print(
         f"---Geometric mean of queries: {geometric_mean_queries}, "
+        f"Geometric mean of queries char counts: {geometric_mean_query_char_count}, "
         f"Geometric mean of bytes processed: {geometric_mean_bytes}, "
         f"Geometric mean of slot millis: {geometric_mean_slot_millis}, "
         f"Geometric mean of local execution time: {geometric_mean_local_seconds} seconds, "