Introduction
The pre- and post-processing for text detection on the OpenVINO deployment side differ somewhat from those used inside PaddleOCR.
1. Preprocessing
After the text detection model is converted to ONNX (with fixed input and output sizes) and deployed on OpenVINO, the preprocessed input image must match the model's fixed size of 960*960. On the PaddleOCR deployment side, by contrast, the input image size is dynamic, so running PaddleOCR's preprocessing unchanged on OpenVINO makes inference fail and crash. The preprocessing here therefore scales the image while preserving its aspect ratio and pads the background out to 960*960; the normalization step itself is exactly the same as in PaddleOCR.
The original image is shown below (219*178):
The result after preprocessing is shown below (960*960):
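In code, this scale-and-pad step looks roughly like the following minimal sketch (function name padToSquareAndResize is illustrative; the complete preprocessing used in this article, including normalization and the NCHW permute, is the paddleOCRPreprocess function in section 3):

#include <algorithm>
#include <opencv2/opencv.hpp>

// Sketch: pad the image to a square with a black background, then resize to target*target.
// The returned ratio is used later to map detection boxes back to the original image.
cv::Mat padToSquareAndResize(const cv::Mat& image, int target, float& ratio) {
    int side = std::max(image.cols, image.rows);
    cv::Mat square = cv::Mat::zeros(side, side, image.type());
    image.copyTo(square(cv::Rect(0, 0, image.cols, image.rows)));
    ratio = static_cast<float>(target) / static_cast<float>(side);
    cv::Mat out;
    cv::resize(square, out, cv::Size(target, target), 0, 0, cv::INTER_LINEAR);
    return out;
}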
2. Postprocessing
OCR text detection algorithm: DBNet (reference: https://blog.csdn.net/weixin_42148389/article/details/133685422)
DBNet is a segmentation-based text detection algorithm. Segmentation-based methods generally work as follows: the network first outputs a probability map for the text regions, a preset threshold converts the probability map into a binary map, and postprocessing then turns the binary map into the detection results (text box coordinates). The weakness is that the choice of threshold is critical.
DBNet addresses this with differentiable binarization: each pixel is binarized adaptively, with its binarization threshold learned by the network. Folding the binarization step into training produces a more robust binary map and simplifies the postprocessing.
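For reference, the approximate (differentiable) binarization proposed in the DBNet paper is

\[ \hat{B}_{i,j} = \frac{1}{1 + e^{-k\,(P_{i,j} - T_{i,j})}} \]

where P is the probability map, T is the threshold map predicted by the network, and k is an amplifying factor (50 in the paper). At inference time the threshold branch can be dropped and a fixed score threshold applied to the probability map instead, which is essentially what the code below does.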
After OpenVINO inference, the output is likewise a 960*960 single-channel mask whose pixel values are scores. Thresholding the scores yields a binary image; contours are then extracted from it, the bounding rectangle of each contour is taken as a text detection box, and finally the boxes are scaled back to the original image by the preprocessing ratio.
The postprocessing here is a simplified version of my own rather than the real DB postprocessing; since only bounding-rectangle detection boxes are needed, this simple approach is enough for now, and curved-text detection and similar cases will be handled later.
PaddleOCR's DB postprocessing also operates on the mask image and ultimately outputs text boxes. The principle is essentially the same; its implementation is just more elaborate and its results more accurate. For more on the postprocessing, see:
https://gitee.com/paddlepaddle/PaddleOCR/blob/release/2.6/deploy/cpp_infer/src/postprocess_op.cpp
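As a rough sketch, the simplified postprocessing described above comes down to the following (function name simplePostProcess is illustrative; the actual version used here is the paddleOCRPostProcess function in section 3, which additionally computes a mean score per region):

#include <opencv2/opencv.hpp>
#include <vector>

// Sketch: threshold the 960*960 score map, find contours, take minimum-area
// rectangles and scale them back to the original image by 1/ratio.
std::vector<cv::RotatedRect> simplePostProcess(const cv::Mat& scoreMap, float ratio,
                                               float scoreThresh = 0.3f, double minArea = 100.0) {
    cv::Mat bin;
    cv::threshold(scoreMap, bin, scoreThresh, 255, cv::THRESH_BINARY);
    bin.convertTo(bin, CV_8U); // findContours requires an 8-bit single-channel image
    std::vector<std::vector<cv::Point>> contours;
    cv::findContours(bin, contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE);
    std::vector<cv::RotatedRect> boxes;
    for (const auto& c : contours) {
        if (cv::contourArea(c) < minArea) continue; // drop tiny noise regions
        cv::RotatedRect box = cv::minAreaRect(c);
        box.center.x /= ratio;   box.center.y /= ratio;   // map back to the original image
        box.size.width /= ratio; box.size.height /= ratio;
        boxes.push_back(box);
    }
    return boxes;
}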
3. C++ deployment code
#include <iostream>
#include <string>
#include <vector>
#include <iterator>
#include <fstream>
#include <chrono>   // for std::chrono::system_clock
#include <cstdint>
using namespace std;
using namespace chrono; // bring in chrono after std, otherwise it fails to compile
#include <openvino/openvino.hpp> // OpenVINO runtime header
#include <opencv2/opencv.hpp>    // OpenCV header
/* --------- Please modify the paths of the text detection model and the test image -----------*/
std::string model_file = "model.onnx"; // path to the exported ONNX text detection model (or the IR .xml)
//std::string image_file = "E:/tupian/ocr2/fangxiangfenlei/zheng/Image_20231219110654464_1242_1072_389_164_0.jpg";
std::string image_file = "1.jpg";
// Per-channel normalization, matching PaddleOCR: x = (x / 255 - mean) * scale
void Normalize(cv::Mat* im, const std::vector<float>& mean, const std::vector<float>& scale, const bool is_scale) {
    double e = 1.0;
    if (is_scale) {
        e /= 255.0;
    }
    (*im).convertTo(*im, CV_32FC3, e); // to float and optionally scale into [0, 1]
    std::vector<cv::Mat> bgr_channels(3);
    cv::split(*im, bgr_channels);
    for (size_t i = 0; i < bgr_channels.size(); i++) {
        // (x - mean[i]) * scale[i], applied per channel
        bgr_channels[i].convertTo(bgr_channels[i], CV_32FC1, 1.0 * scale[i],
                                  (0.0 - mean[i]) * scale[i]);
    }
    cv::merge(bgr_channels, *im);
}
// HWC -> NCHW: copy each image's channels into the contiguous input buffer.
void PermuteBatch(const std::vector<cv::Mat>& imgs, float* data) {
    for (size_t j = 0; j < imgs.size(); j++) {
        int rh = imgs[j].rows;
        int rw = imgs[j].cols;
        int rc = imgs[j].channels();
        for (int i = 0; i < rc; ++i) {
            // extractChannel writes channel i directly into the destination plane
            cv::extractChannel(
                imgs[j], cv::Mat(rh, rw, CV_32FC1, data + (j * rc + i) * rh * rw), i);
        }
    }
}
void paddleOCRPreprocess(const cv::Mat& image, std::vector<float>& input, float& ratio,
                         const int batch, const int channel, const int targetHeight, const int targetWidth,
                         const std::vector<float>& mean, const std::vector<float>& scale)
{
    cv::Mat out;
    if (image.empty())
        throw "paddleOCRPreprocess: input image is empty\n";
    if (targetHeight <= 0 || targetWidth <= 0)
        throw "paddleOCRPreprocess: target size error, targetHeight <= 0 || targetWidth <= 0";
    // Resize and pad:
    // keep the aspect ratio by first padding the image to a square with a black background
    int col = image.cols;
    int row = image.rows;
    int _max = MAX(col, row);
    cv::Mat result = cv::Mat::zeros(_max, _max, CV_8UC3);
    image.copyTo(result(cv::Rect(0, 0, col, row)));
    ratio = float(targetHeight) / float(_max); // scaling ratio, used later to map boxes back
    // then resize the square to 960*960 (cv::Size takes width first)
    cv::resize(result, out, cv::Size(targetWidth, targetHeight), 0.f, 0.f, cv::INTER_LINEAR);
    // Normalization (identical to PaddleOCR)
    Normalize(&out, mean, scale, true);
    // Layout change to NCHW (identical to PaddleOCR)
    std::vector<cv::Mat> norm_img_batch;
    norm_img_batch.push_back(out);
    input.assign(batch * channel * out.rows * out.cols, 0.0f);
    PermuteBatch(norm_img_batch, input.data());
}
void paddleOCRPostProcess(cv::Mat& detect_buffer, float& ratio, std::vector<cv::RotatedRect>& boxes, std::vector<float>& scores)
{
    // The network output is a score map (one score per pixel); threshold it to get a binary mask.
    cv::Mat thresh_img;
    cv::threshold(detect_buffer, thresh_img, 0.3, 255, cv::THRESH_BINARY);
    // Convert to 8-bit, as required by findContours
    thresh_img.convertTo(thresh_img, CV_8U);
    // Simplified postprocess (not the full DB postprocess): find contours and take their
    // minimum-area bounding rectangles as the text detection boxes.
    std::vector<std::vector<cv::Point>> contours;
    std::vector<cv::Vec4i> hierarchy;
    findContours(thresh_img, contours, hierarchy, cv::RETR_TREE, cv::CHAIN_APPROX_NONE);
    for (size_t i = 0; i < contours.size(); i++) {
        // Filter out small regions first (area < 100) so that scores and boxes stay aligned
        double area = contourArea(contours[i]);
        std::cout << "area:" << area << std::endl;
        if (area < 100) {
            continue;
        }
        // Build a mask for this contour so its region can be cut out of the score map
        cv::Mat scoremask = cv::Mat::zeros(thresh_img.size(), thresh_img.type());
        drawContours(scoremask, contours, static_cast<int>(i), 255, -1);
        // Compute the region's mean score over its non-zero pixels
        cv::Mat scoreImg = cv::Mat::zeros(detect_buffer.size(), detect_buffer.type());
        detect_buffer.copyTo(scoreImg, scoremask); // copy only the masked region of the score map
        int no_zero_count = cv::countNonZero(scoreImg);
        if (no_zero_count > 0) {
            double score_sum = cv::sum(scoreImg)[0];
            scores.push_back(score_sum / no_zero_count);
        }
        else {
            scores.push_back(0);
        }
        // Minimum-area bounding rectangle of the contour
        cv::RotatedRect box = minAreaRect(contours[i]);
        box.size.width = box.size.width + 60;   // expand the box a little
        box.size.height = box.size.height + 60;
        // Map the box back to the original image coordinates
        box.center.x = box.center.x / ratio;
        box.center.y = box.center.y / ratio;
        box.size.width = box.size.width / ratio;
        box.size.height = box.size.height / ratio;
        boxes.push_back(box);
    }
}
int main(int argc, char* argv[]) {
// -------- Get OpenVINO runtime version --------
std::cout << ov::get_openvino_version().description << ':' << ov::get_openvino_version().buildNumber << std::endl;
    // -------- Step 1. Initialize the OpenVINO Runtime Core --------
    ov::Core core;
    // -------- Step 2. Compile the model (load it and pick a device/plugin) --------
    auto compiled_model = core.compile_model(model_file, "AUTO"); // AUTO / GPU / CPU
    // -------- Step 3. Create an inference request --------
    ov::InferRequest infer_request = compiled_model.create_infer_request();
for (auto input : compiled_model.inputs()) {
std::cout << input.get_any_name() << std::endl;
std::cout << input.get_element_type() << std::endl;
std::cout << input.get_partial_shape() << std::endl;
}
    // Timing
    auto start = std::chrono::system_clock::now();
    // -------- Step 4. Read the image file and run the preprocessing --------
cv::Mat img = cv::imread(image_file); //Load a picture into memory
std::vector<float> input;
int batch = 1;
int channel = 3;
int h = 960;
int w = 960;
std::vector<float> mean = { 0.485f, 0.456f, 0.406f };
std::vector<float> scale = { 1 / 0.229f, 1 / 0.224f, 1 / 0.225f };
bool is_scale_ = true;
float ratio;
paddleOCRPreprocess(img, input, ratio, batch, channel, h, w, mean, scale);
//cv::Mat input = cv::dnn::blobFromImage(img, 1 / 255.0, cv::Size(960, 960), cv::Scalar(0, 0, 0), true);
    // -------- Step 5. Feed the blob into the input node of the text detection model --------
    // Get the input port of the (single-input) model
auto input_port = compiled_model.input();
// Create tensor from external memory
ov::Tensor input_tensor(input_port.get_element_type(), input_port.get_shape(), input.data());
// Set input tensor for model with one input
infer_request.set_input_tensor(input_tensor);
// -------- Step 6. Start inference --------
infer_request.infer();
// -------- Step 7. Get the inference result --------
auto detect = infer_request.get_output_tensor(0);
auto detect_shape = detect.get_shape();
std::cout << "The shape of Detection tensor:" << detect_shape << std::endl;
cv::Mat detect_buffer(detect_shape[2], detect_shape[3], CV_32F, detect.data());
    // The block below mirrors the map preparation in PaddleOCR's DB postprocess
    // (pred/bit maps); it is kept for reference but not used by the simplified
    // postprocess further down, which works on detect_buffer directly.
    int n2 = static_cast<int>(detect_shape[2]);
    int n3 = static_cast<int>(detect_shape[3]);
    int n = n2 * n3;
    std::vector<float> pred(n, 0.0f);
    std::vector<unsigned char> cbuf(n, ' ');
    // Layout change (single-channel output, so this is effectively a copy)
    std::vector<cv::Mat> norm_img_batch;
    norm_img_batch.push_back(detect_buffer);
    std::vector<float> out_data(n, 0.0f);
    PermuteBatch(norm_img_batch, out_data.data());
    for (int i = 0; i < n; i++) {
        pred[i] = out_data[i];
        cbuf[i] = (unsigned char)(out_data[i] * 255);
    }
    cv::Mat cbuf_map(n2, n3, CV_8UC1, (unsigned char*)cbuf.data());
    cv::Mat pred_map(n2, n3, CV_32F, (float*)pred.data());
    const double threshold = 0.3 * 255;
    const double maxvalue = 255;
    cv::Mat bit_map;
    cv::threshold(cbuf_map, bit_map, threshold, maxvalue, cv::THRESH_BINARY);
    bool use_dilation = false; // optionally dilate the binary map, as in PaddleOCR
    if (use_dilation) {
        cv::Mat dila_ele = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
        cv::dilate(bit_map, bit_map, dila_ele);
    }
std::vector<cv::RotatedRect> boxes;
std::vector<float> scores;
paddleOCRPostProcess(detect_buffer, ratio, boxes, scores);
    for (size_t i = 0; i < boxes.size(); i++) {
        // Get the four vertices of the rotated rectangle
        cv::Point2f vertices[4];
        boxes[i].points(vertices);
        // Draw the minimum-area bounding rectangle edge by edge
        for (int j = 0; j < 4; j++)
        {
            cv::line(img, vertices[j], vertices[(j + 1) % 4], cv::Scalar(0, 255, 0), 2, 8);
        }
        cv::putText(img, "score:" + std::to_string(scores[i]), cv::Point(boxes[i].center.x, boxes[i].center.y), cv::FONT_HERSHEY_COMPLEX, 0.5, cv::Scalar(12, 23, 200), 1, 8);
        std::cout << "score: " << scores[i] << " \n";
    }
auto end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed = end - start;
std::cout << "Elapsed time: " << elapsed.count() * 1000 << " ms\n";
cv::imshow("demo", img);
cv::waitKey(0);
return 0;
}