Pass lower-resolution image to libaom AV1 encoder.

The VideoFrameBuffer implementation might have a more efficient way to scale
images than the encoder. It is therefore beneficial to scale the image down to
the resolution of the top active layer before passing it to libaom, instead of
letting the encoder downscale it internally.
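
A minimal sketch of the pre-scaling step (illustration only, not part of this
patch), assuming the VideoFrameBuffer API used in the change; the helper name
and its parameters are hypothetical:

  #include "api/scoped_refptr.h"
  #include "api/video/video_frame_buffer.h"

  namespace webrtc {

  // Hypothetical helper: downscale the input to the top active layer's
  // resolution before it is wrapped for libaom. Scale() dispatches to the
  // buffer implementation, which may scale more efficiently (e.g. a native
  // buffer backed by a hardware scaler) than the encoder's internal path.
  scoped_refptr<VideoFrameBuffer> ScaleToTopActiveLayer(
      scoped_refptr<VideoFrameBuffer> buffer,
      int top_active_width,
      int top_active_height) {
    if (buffer->width() == top_active_width &&
        buffer->height() == top_active_height) {
      // Already at the target resolution; nothing to do.
      return buffer;
    }
    return buffer->Scale(top_active_width, top_active_height);
  }

  }  // namespace webrtc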

Bug: chromium:397485312
Change-Id: I782bc12d78e5bcab75345dbb145bce68100ba570
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/379680
Reviewed-by: Erik Språng <[email protected]>
Commit-Queue: Ilya Nikolaevskiy <[email protected]>
Cr-Commit-Position: refs/heads/main@{#44531}
diff --git a/modules/video_coding/codecs/av1/libaom_av1_encoder.cc b/modules/video_coding/codecs/av1/libaom_av1_encoder.cc
index d0c513e..339a508 100644
--- a/modules/video_coding/codecs/av1/libaom_av1_encoder.cc
+++ b/modules/video_coding/codecs/av1/libaom_av1_encoder.cc
@@ -13,6 +13,7 @@
 #include <stdint.h>
 
 #include <memory>
+#include <numeric>
 #include <optional>
 #include <utility>
 #include <vector>
@@ -124,10 +125,22 @@
   void SetSvcRefFrameConfig(
       const ScalableVideoController::LayerFrameConfig& layer_frame);
   // If pixel format doesn't match, then reallocate.
-  void MaybeRewrapImgWithFormat(const aom_img_fmt_t fmt);
+  void MaybeRewrapImgWithFormat(const aom_img_fmt_t fmt,
+                                unsigned int width,
+                                unsigned int height);
+
+  // Adjust scaling factors assuming that the input resolution is that of
+  // the top active SVC layer.
+  void AdjustScalingFactorsForTopActiveLayer();
 
   std::unique_ptr<ScalableVideoController> svc_controller_;
   std::optional<ScalabilityMode> scalability_mode_;
+  // Original scaling factors for all configured layers, both active and
+  // inactive. `svc_params_` stores factors ignoring top inactive layers.
+  std::vector<int> scaling_factors_num_;
+  std::vector<int> scaling_factors_den_;
+  int last_active_layer_ = 0;
+
   bool inited_;
   bool rates_configured_;
   std::optional<aom_svc_params_t> svc_params_;
@@ -469,9 +482,19 @@
         1 << (svc_config.num_temporal_layers - tid - 1);
   }
 
+  scaling_factors_den_.resize(svc_config.num_spatial_layers);
+  scaling_factors_num_.resize(svc_config.num_spatial_layers);
   for (int sid = 0; sid < svc_config.num_spatial_layers; ++sid) {
+    scaling_factors_num_[sid] = svc_config.scaling_factor_num[sid];
     svc_params.scaling_factor_num[sid] = svc_config.scaling_factor_num[sid];
+    scaling_factors_den_[sid] = svc_config.scaling_factor_den[sid];
     svc_params.scaling_factor_den[sid] = svc_config.scaling_factor_den[sid];
+    encoder_settings_.spatialLayers[sid].width = encoder_settings_.width *
+                                                 scaling_factors_num_[sid] /
+                                                 scaling_factors_den_[sid];
+    encoder_settings_.spatialLayers[sid].height = encoder_settings_.height *
+                                                  scaling_factors_num_[sid] /
+                                                  scaling_factors_den_[sid];
   }
 
   // svc_params.layer_target_bitrate is set in SetRates() before svc_params is
@@ -540,21 +563,61 @@
   return WEBRTC_VIDEO_CODEC_OK;
 }
 
-void LibaomAv1Encoder::MaybeRewrapImgWithFormat(const aom_img_fmt_t fmt) {
+void LibaomAv1Encoder::MaybeRewrapImgWithFormat(const aom_img_fmt_t fmt,
+                                                unsigned int width,
+                                                unsigned int height) {
   if (!frame_for_encode_) {
-    frame_for_encode_ =
-        aom_img_wrap(nullptr, fmt, cfg_.g_w, cfg_.g_h, 1, nullptr);
-
-  } else if (frame_for_encode_->fmt != fmt) {
+    RTC_LOG(LS_INFO) << "Configuring AV1 encoder pixel format to "
+                     << (fmt == AOM_IMG_FMT_NV12 ? "NV12" : "I420") << " "
+                     << width << "x" << height;
+    frame_for_encode_ = aom_img_wrap(nullptr, fmt, width, height, 1, nullptr);
+  } else if (frame_for_encode_->fmt != fmt || frame_for_encode_->d_w != width ||
+             frame_for_encode_->d_h != height) {
     RTC_LOG(LS_INFO) << "Switching AV1 encoder pixel format to "
-                     << (fmt == AOM_IMG_FMT_NV12 ? "NV12" : "I420");
+                     << (fmt == AOM_IMG_FMT_NV12 ? "NV12" : "I420") << " "
+                     << width << "x" << height;
     aom_img_free(frame_for_encode_);
-    frame_for_encode_ =
-        aom_img_wrap(nullptr, fmt, cfg_.g_w, cfg_.g_h, 1, nullptr);
+    frame_for_encode_ = aom_img_wrap(nullptr, fmt, width, height, 1, nullptr);
   }
   // else no-op since the image is already in the right format.
 }
 
+void LibaomAv1Encoder::AdjustScalingFactorsForTopActiveLayer() {
+  if (!SvcEnabled())
+    return;
+  last_active_layer_ = svc_params_->number_spatial_layers - 1;
+  for (int sid = 0; sid < svc_params_->number_spatial_layers; ++sid) {
+    for (int tid = 0; tid < svc_params_->number_temporal_layers; ++tid) {
+      int layer_index = sid * svc_params_->number_temporal_layers + tid;
+      if (svc_params_->layer_target_bitrate[layer_index] > 0) {
+        last_active_layer_ = sid;
+      }
+    }
+  }
+  if (static_cast<int>(cfg_.g_w) ==
+      encoder_settings_.spatialLayers[last_active_layer_].width) {
+    return;
+  }
+
+  cfg_.g_w = encoder_settings_.spatialLayers[last_active_layer_].width;
+  cfg_.g_h = encoder_settings_.spatialLayers[last_active_layer_].height;
+
+  // Recalculate scaling factors ignoring top inactive layers by dividing
+  // them all by the scaling factor of the last active layer.
+  for (int i = 0; i <= last_active_layer_; ++i) {
+    int n = scaling_factors_num_[i] * scaling_factors_den_[last_active_layer_];
+    int d = scaling_factors_den_[i] * scaling_factors_num_[last_active_layer_];
+    int gcd = std::gcd(n, d);
+    svc_params_->scaling_factor_num[i] = n / gcd;
+    svc_params_->scaling_factor_den[i] = d / gcd;
+  }
+  for (int i = last_active_layer_ + 1; i < svc_params_->number_spatial_layers;
+       ++i) {
+    svc_params_->scaling_factor_num[i] = 1;
+    svc_params_->scaling_factor_den[i] = 1;
+  }
+}
+
 int32_t LibaomAv1Encoder::Encode(
     const VideoFrame& frame,
     const std::vector<VideoFrameType>* frame_types) {
@@ -578,13 +641,24 @@
   absl::InlinedVector<VideoFrameBuffer::Type, kMaxPreferredPixelFormats>
       supported_formats = {VideoFrameBuffer::Type::kI420,
                            VideoFrameBuffer::Type::kNV12};
+
+  scoped_refptr<VideoFrameBuffer> scaled_image;
+  if (!SvcEnabled() ||
+      last_active_layer_ + 1 == svc_params_->number_spatial_layers) {
+    scaled_image = buffer;
+  } else {
+    scaled_image = buffer->Scale(
+        encoder_settings_.spatialLayers[last_active_layer_].width,
+        encoder_settings_.spatialLayers[last_active_layer_].height);
+  }
+
   scoped_refptr<VideoFrameBuffer> mapped_buffer;
-  if (buffer->type() != VideoFrameBuffer::Type::kNative) {
+  if (scaled_image->type() != VideoFrameBuffer::Type::kNative) {
     // `buffer` is already mapped.
-    mapped_buffer = buffer;
+    mapped_buffer = scaled_image;
   } else {
     // Attempt to map to one of the supported formats.
-    mapped_buffer = buffer->GetMappedFrameBuffer(supported_formats);
+    mapped_buffer = scaled_image->GetMappedFrameBuffer(supported_formats);
   }
 
   // Convert input frame to I420, if needed.
@@ -610,7 +684,8 @@
     case VideoFrameBuffer::Type::kI420:
     case VideoFrameBuffer::Type::kI420A: {
       // Set frame_for_encode_ data pointers and strides.
-      MaybeRewrapImgWithFormat(AOM_IMG_FMT_I420);
+      MaybeRewrapImgWithFormat(AOM_IMG_FMT_I420, mapped_buffer->width(),
+                               mapped_buffer->height());
       auto i420_buffer = mapped_buffer->GetI420();
       RTC_DCHECK(i420_buffer);
       RTC_CHECK_EQ(i420_buffer->width(), frame_for_encode_->d_w);
@@ -627,7 +702,8 @@
       break;
     }
     case VideoFrameBuffer::Type::kNV12: {
-      MaybeRewrapImgWithFormat(AOM_IMG_FMT_NV12);
+      MaybeRewrapImgWithFormat(AOM_IMG_FMT_NV12, mapped_buffer->width(),
+                               mapped_buffer->height());
       const NV12BufferInterface* nv12_buffer = mapped_buffer->GetNV12();
       RTC_DCHECK(nv12_buffer);
       RTC_CHECK_EQ(nv12_buffer->width(), frame_for_encode_->d_w);
@@ -725,10 +801,10 @@
         // If encoded image width/height info are added to aom_codec_cx_pkt_t,
         // use those values in lieu of the values in frame.
         if (svc_params_) {
-          int n = svc_params_->scaling_factor_num[layer_frame->SpatialId()];
-          int d = svc_params_->scaling_factor_den[layer_frame->SpatialId()];
-          encoded_image._encodedWidth = cfg_.g_w * n / d;
-          encoded_image._encodedHeight = cfg_.g_h * n / d;
+          int n = scaling_factors_num_[layer_frame->SpatialId()];
+          int d = scaling_factors_den_[layer_frame->SpatialId()];
+          encoded_image._encodedWidth = encoder_settings_.width * n / d;
+          encoded_image._encodedHeight = encoder_settings_.height * n / d;
           encoded_image.SetSpatialIndex(layer_frame->SpatialId());
           encoded_image.SetTemporalIndex(layer_frame->TemporalId());
         } else {
@@ -762,10 +838,11 @@
         if (SvcEnabled()) {
           resolutions.resize(svc_params_->number_spatial_layers);
           for (int sid = 0; sid < svc_params_->number_spatial_layers; ++sid) {
-            int n = svc_params_->scaling_factor_num[sid];
-            int d = svc_params_->scaling_factor_den[sid];
+            int n = scaling_factors_num_[sid];
+            int d = scaling_factors_den_[sid];
             resolutions[sid] =
-                RenderResolution(cfg_.g_w * n / d, cfg_.g_h * n / d);
+                RenderResolution(encoder_settings_.width * n / d,
+                                 encoder_settings_.height * n / d);
           }
         } else {
           resolutions = {RenderResolution(cfg_.g_w, cfg_.g_h)};
@@ -807,11 +884,6 @@
   // total target bitrate is not updated first a division by zero could happen.
   svc_controller_->OnRatesUpdated(parameters.bitrate);
   cfg_.rc_target_bitrate = parameters.bitrate.get_sum_kbps();
-  aom_codec_err_t error_code = aom_codec_enc_config_set(&ctx_, &cfg_);
-  if (error_code != AOM_CODEC_OK) {
-    RTC_LOG(LS_WARNING) << "Error configuring encoder, error code: "
-                        << error_code;
-  }
 
   if (SvcEnabled()) {
     for (int sid = 0; sid < svc_params_->number_spatial_layers; ++sid) {
@@ -824,9 +896,17 @@
             parameters.bitrate.GetTemporalLayerSum(sid, tid) / 1000;
       }
     }
+    AdjustScalingFactorsForTopActiveLayer();
     SetEncoderControlParameters(AV1E_SET_SVC_PARAMS, &*svc_params_);
   }
 
+  // AdjustScalingFactorsForTopActiveLayer() may update `cfg_`.
+  aom_codec_err_t error_code = aom_codec_enc_config_set(&ctx_, &cfg_);
+  if (error_code != AOM_CODEC_OK) {
+    RTC_LOG(LS_WARNING) << "Error configuring encoder, error code: "
+                        << error_code;
+  }
+
   framerate_fps_ = parameters.framerate_fps;
 
   rates_configured_ = true;
diff --git a/modules/video_coding/codecs/av1/libaom_av1_encoder_unittest.cc b/modules/video_coding/codecs/av1/libaom_av1_encoder_unittest.cc
index 33b750c..0bbf6a2 100644
--- a/modules/video_coding/codecs/av1/libaom_av1_encoder_unittest.cc
+++ b/modules/video_coding/codecs/av1/libaom_av1_encoder_unittest.cc
@@ -37,6 +37,7 @@
 #include "api/video_codecs/video_codec.h"
 #include "api/video_codecs/video_encoder.h"
 #include "modules/rtp_rtcp/include/rtp_rtcp_defines.h"
+#include "modules/video_coding/codecs/av1/av1_svc_config.h"
 #include "modules/video_coding/codecs/test/encoded_video_frame_producer.h"
 #include "modules/video_coding/include/video_error_codes.h"
 #include "rtc_base/checks.h"
@@ -68,6 +69,18 @@
   return codec_settings;
 }
 
+VideoCodec HDCodecSettings() {
+  VideoCodec codec_settings;
+  codec_settings.codecType = kVideoCodecAV1;
+  codec_settings.width = 1280;
+  codec_settings.height = 720;
+  codec_settings.maxFramerate = 30;
+  codec_settings.startBitrate = 2048;
+  codec_settings.maxBitrate = 2048;
+  codec_settings.qpMax = 63;
+  return codec_settings;
+}
+
 VideoEncoder::Settings DefaultEncoderSettings() {
   return VideoEncoder::Settings(
       VideoEncoder::Capabilities(/*loss_notification=*/false),
@@ -592,5 +605,87 @@
   RTC_CHECK_EQ(callback.frames_encoded(), 1);
 }
 
+TEST(LibaomAv1EncoderTest, EnableDisableSpatialLayersWithSvcController) {
+  constexpr int kNumSpatialLayers = 3;
+  constexpr int kNumTemporalLayers = 1;
+  constexpr size_t kWidth = 1280;
+  constexpr size_t kHeight = 720;
+
+  // Configure the encoder to produce 3 spatial layers with all layers active.
+  // Disable spatial layers one by one starting from the top, encoding frames
+  // after each change, then enable them back one by one.
+  // Note: bitrate allocation is high to avoid frame dropping due to rate
+  // control; the encoder should always produce a frame. A dropped frame
+  // indicates a problem and the test will fail.
+  std::unique_ptr<VideoEncoder> encoder =
+      CreateLibaomAv1Encoder(CreateEnvironment());
+  VideoCodec codec_settings = HDCodecSettings();
+  SetAv1SvcConfig(codec_settings, kNumTemporalLayers, kNumSpatialLayers);
+  codec_settings.SetFrameDropEnabled(true);
+  EXPECT_EQ(encoder->InitEncode(&codec_settings, DefaultEncoderSettings()),
+            WEBRTC_VIDEO_CODEC_OK);
+
+  EncodedVideoFrameProducer producer(*encoder);
+  producer.SetResolution({kWidth, kHeight});
+
+  VideoBitrateAllocation bitrate_allocation;
+
+  // Set all layers active for initial allocation.
+  for (size_t sl_idx = 0; sl_idx < kNumSpatialLayers; ++sl_idx) {
+    // Allocate high bit rate to avoid frame dropping due to rate control.
+    bitrate_allocation.SetBitrate(
+        sl_idx, 0,
+        codec_settings.spatialLayers[sl_idx].targetBitrate * 1000 * 2);
+  }
+
+  encoder->SetRates(VideoEncoder::RateControlParameters(
+      bitrate_allocation, codec_settings.maxFramerate));
+
+  // Encode a key frame to validate all other frames are delta frames.
+  std::vector<EncodedVideoFrameProducer::EncodedFrame> frames =
+      producer.SetNumInputFrames(1).Encode();
+  ASSERT_THAT(frames, Not(IsEmpty()));
+  EXPECT_TRUE(frames[0].codec_specific_info.template_structure);
+
+  constexpr size_t kNumFramesToEncode = 5;
+
+  // Disable layers one by one.
+  for (int sl_idx = kNumSpatialLayers - 1; sl_idx > 0; --sl_idx) {
+    bitrate_allocation.SetBitrate(sl_idx, 0, 0);
+    encoder->SetRates(VideoEncoder::RateControlParameters(
+        bitrate_allocation, codec_settings.maxFramerate));
+
+    frames = producer.SetNumInputFrames(kNumFramesToEncode).Encode();
+    // With spatial layer `sl_idx` disabled, `sl_idx` spatial layers remain
+    // active.
+    ASSERT_THAT(frames, SizeIs(kNumFramesToEncode * sl_idx));
+    for (size_t i = 0; i < frames.size(); ++i) {
+      EXPECT_TRUE(frames[i].codec_specific_info.generic_frame_info);
+      EXPECT_FALSE(frames[i].codec_specific_info.template_structure);
+    }
+  }
+
+  // Enable layers back one by one.
+  for (size_t sl_idx = 1; sl_idx < kNumSpatialLayers; ++sl_idx) {
+    // Allocate high bit rate to avoid frame dropping due to rate control.
+    bitrate_allocation.SetBitrate(
+        sl_idx, 0,
+        codec_settings.spatialLayers[sl_idx].targetBitrate * 1000 * 2);
+    encoder->SetRates(VideoEncoder::RateControlParameters(
+        bitrate_allocation, codec_settings.maxFramerate));
+
+    frames = producer.SetNumInputFrames(kNumFramesToEncode).Encode();
+    // With (sl_idx+1) spatial layers, expect (sl_idx+1) frames per input frame.
+    ASSERT_THAT(frames, SizeIs(kNumFramesToEncode * (sl_idx + 1)));
+    // Only the first frame after enabling the layer must be a keyframe.
+    EXPECT_TRUE(frames[0].codec_specific_info.generic_frame_info);
+    EXPECT_TRUE(frames[0].codec_specific_info.template_structure);
+    for (size_t i = 1; i < frames.size(); ++i) {
+      EXPECT_TRUE(frames[i].codec_specific_info.generic_frame_info);
+      EXPECT_FALSE(frames[i].codec_specific_info.template_structure);
+    }
+  }
+}
+
 }  // namespace
 }  // namespace webrtc