diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 14cf00fb0..d298a5c40 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -98,10 +98,12 @@ set(OPENSHOT_CV_SOURCES
     CVStabilization.cpp
     ClipProcessingJobs.cpp
     CVObjectDetection.cpp
+    CVObjectMask.cpp
     TrackedObjectBBox.cpp
     effects/Stabilizer.cpp
     effects/Tracker.cpp
     effects/ObjectDetection.cpp
+    effects/ObjectMask.cpp
     effects/Outline.cpp
     ./sort_filter/sort.cpp
     ./sort_filter/Hungarian.cpp
diff --git a/src/CVObjectDetection.cpp b/src/CVObjectDetection.cpp
index f1802bc91..7fe03387b 100644
--- a/src/CVObjectDetection.cpp
+++ b/src/CVObjectDetection.cpp
@@ -18,7 +18,13 @@
 
 #include "CVObjectDetection.h"
 #include "Exceptions.h"
+#include "ZmqLogger.h"
 
+#define int64 int64_t
+#define uint64 uint64_t
+#include <opencv2/core/ocl.hpp>
+#undef uint64
+#undef int64
 #include "objdetectdata.pb.h"
 #include <google/protobuf/util/time_util.h>
 
@@ -256,23 +262,45 @@ std::string CVObjectDetection::ValidateONNXModel(std::string modelPath)
 }
 
 void CVObjectDetection::setProcessingDevice(){
-    if(processingDevice == "GPU"){
+    const std::string requestedDevice = processingDevice;
+    if (processingDevice == "CPU") {
+        net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
+        net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
+        ZmqLogger::Instance()->Log("Object Detection DNN device: requested CPU, selected CPU");
+        return;
+    }
+
+    if(processingDevice == "GPU" || processingDevice == "GPU_AUTO" || processingDevice == "GPU_CUDA"){
         try {
             const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_CUDA);
             if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_CUDA) != targets.end()) {
                 net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
                 net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
+                ZmqLogger::Instance()->Log("Object Detection DNN device: requested " + requestedDevice + ", selected CUDA");
                 return;
             }
         } catch (const cv::Exception&) {
         }
-        processingDevice = "CPU";
     }
 
-    if(processingDevice == "CPU"){
-        net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
-        net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
+    if(processingDevice == "GPU_OPENCL"){
+        try {
+            const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_OPENCV);
+            if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_OPENCL) != targets.end()) {
+                cv::ocl::setUseOpenCL(true);
+                net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
+                net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);
+                ZmqLogger::Instance()->Log("Object Detection DNN device: requested " + requestedDevice + ", selected OpenCL");
+                return;
+            }
+        } catch (const cv::Exception&) {
+        }
     }
+
+    processingDevice = "CPU";
+    net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
+    net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
+    ZmqLogger::Instance()->Log("Object Detection DNN device: requested " + requestedDevice + ", selected CPU");
 }
 
 void CVObjectDetection::detectObjectsClip(openshot::Clip &video, size_t _start, size_t _end, bool process_interval)
diff --git a/src/CVObjectMask.cpp b/src/CVObjectMask.cpp
new file mode 100644
index 000000000..d977ec4fd
--- /dev/null
+++ b/src/CVObjectMask.cpp
@@ -0,0 +1,1242 @@
+/**
+ * @file
+ * @brief Source file for CVObjectMask class
+ * @author Jonathan Thomas <jonathan@openshot.org>
+ *
+ * @ref License
+ */
+
+// Copyright (c) 2026 OpenShot Studios, LLC
+//
+// SPDX-License-Identifier: LGPL-3.0-or-later
+
+#include "CVObjectMask.h"
+
+#include "Exceptions.h"
+#include "ZmqLogger.h"
+#include "objdetectdata.pb.h"
+
+// NOTE(review): the angle-bracket header names in this file were missing from the
+// reviewed copy of the patch; the names below are reconstructed from the symbols the
+// code uses (cv::ocl, std::deque, std::memcpy, ...) -- confirm against the commit.
+#define int64 int64_t
+#define uint64 uint64_t
+#include <opencv2/core/ocl.hpp>
+#undef uint64
+#undef int64
+
+#include <algorithm>
+#include <cctype>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <deque>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include <google/protobuf/util/time_util.h>
+
+using namespace openshot;
+using google::protobuf::util::TimeUtil;
+
+namespace {
+
+// Load an ONNX model through OpenCV DNN.
+// When `net` is non-null the loaded network is assigned to it; passing nullptr only
+// checks that the file can be read. Returns "" on success, otherwise an error message.
+std::string LoadONNXModel(const std::string& modelPath, cv::dnn::Net* net)
+{
+    try {
+        cv::dnn::Net loadedNet = cv::dnn::readNetFromONNX(modelPath);
+        if (net)
+            *net = loadedNet;
+        return "";
+    } catch (const std::exception& e) {
+        // cv::Exception derives from std::exception, so one handler covers both.
+        return std::string("Failed to load ONNX model: ") + e.what();
+    }
+}
+
+// Run-length encode a single-channel 8-bit mask in row-major order.
+// The first count is always the length of the leading run of zero pixels (it is 0 when
+// the mask starts with a non-zero pixel); counts then alternate non-zero / zero runs.
+// An empty mask yields an empty vector.
+std::vector<uint32_t> EncodeBinaryMaskRLE(const cv::Mat& mask)
+{
+    std::vector<uint32_t> rle;
+    if (mask.empty())
+        return rle;
+
+    uint8_t current = 0;
+    uint32_t count = 0;
+    for (int y = 0; y < mask.rows; ++y) {
+        const uint8_t* row = mask.ptr<uint8_t>(y);
+        for (int x = 0; x < mask.cols; ++x) {
+            const uint8_t value = row[x] ? 1 : 0;
+            if (value == current) {
+                ++count;
+            } else {
+                rle.push_back(count);
+                current = value;
+                count = 1;
+            }
+        }
+    }
+    rle.push_back(count);
+    return rle;
+}
+
+// Network input blob plus the factors that map frame pixels to model pixels.
+struct EfficientSamPreprocessResult {
+    cv::Mat blob;
+    float scaleX = 1.0f;
+    float scaleY = 1.0f;
+};
+
+// Resize an 8-bit, 3-channel BGR image to modelSize x modelSize (aspect ratio is not
+// preserved) and pack it as a 1x3xHxW float blob in RGB order scaled to [0, 1].
+EfficientSamPreprocessResult MakeEfficientSamBlob(const cv::Mat& bgr, int modelSize)
+{
+    EfficientSamPreprocessResult result;
+    result.scaleX = static_cast<float>(modelSize) / static_cast<float>(bgr.cols);
+    result.scaleY = static_cast<float>(modelSize) / static_cast<float>(bgr.rows);
+
+    cv::Mat resized;
+    cv::resize(bgr, resized, cv::Size(modelSize, modelSize), 0, 0, cv::INTER_LINEAR);
+
+    const int shape[] = {1, 3, modelSize, modelSize};
+    result.blob = cv::Mat(4, shape, CV_32F);
+    float* dst = result.blob.ptr<float>();
+
+    for (int y = 0; y < resized.rows; ++y) {
+        const cv::Vec3b* row = resized.ptr<cv::Vec3b>(y);
+        for (int x = 0; x < resized.cols; ++x) {
+            // Source pixels are B,G,R; the blob planes are written as R,G,B.
+            const float rgb[] = {
+                static_cast<float>(row[x][2]) / 255.0f,
+                static_cast<float>(row[x][1]) / 255.0f,
+                static_cast<float>(row[x][0]) / 255.0f,
+            };
+            for (int c = 0; c < 3; ++c)
+                dst[(c * modelSize + y) * modelSize + x] = rgb[c];
+        }
+    }
+
+    return result;
+}
+
+// Bounding box of the non-zero pixels of `mask`, expressed as fractions of the mask
+// width/height. Returns a default (all-zero) rectangle when the mask has no such pixel.
+cv::Rect_<float> NormalizedBoundingBox(const cv::Mat& mask)
+{
+    std::vector<cv::Point> points;
+    cv::findNonZero(mask, points);
+    if (points.empty())
+        return {};
+
+    cv::Rect rect = cv::boundingRect(points);
+    return cv::Rect_<float>(
+        rect.x / static_cast<float>(mask.cols),
+        rect.y / static_cast<float>(mask.rows),
+        rect.width / static_cast<float>(mask.cols),
+        rect.height / static_cast<float>(mask.rows));
+}
+
+// Resize a float model mask to `frameSize` and binarize it into an 8-bit 0/255 mask.
+// If nothing exceeds `maskThreshold`, half of the maximum value is used as the
+// threshold instead (only when that maximum is positive).
+cv::Mat EfficientSamMaskToFrameMask(const cv::Mat& modelMask, const cv::Size& frameSize, float maskThreshold)
+{
+    cv::Mat fullSize;
+    cv::resize(modelMask, fullSize, frameSize, 0, 0, cv::INTER_LINEAR);
+
+    cv::Mat binary;
+    cv::threshold(fullSize, binary, maskThreshold, 255.0, cv::THRESH_BINARY);
+    if (cv::countNonZero(binary) == 0) {
+        double maxValue = 0.0;
+        cv::minMaxLoc(fullSize, nullptr, &maxValue);
+        if (maxValue > 0.0) {
+            cv::threshold(fullSize, binary, maxValue * 0.5, 255.0, cv::THRESH_BINARY);
+        }
+    }
+    binary.convertTo(binary, CV_8U);
+    return binary;
+}
+
+// Build the 1x1xpromptSlotsx2 coordinate blob for the positive prompts, scaled into
+// model space. Each positive rectangle takes two slots (top-left, bottom-right) and
+// each positive point one slot; prompts that no longer fit are dropped.
+// Negative points/rectangles are not sent to the network: they are converted to model
+// space and appended to `backgroundPoints` / `backgroundRects` for later scoring.
+cv::Mat MakeEfficientSamPromptBlob(
+    const CVObjectMaskPromptSet& prompts,
+    const EfficientSamPreprocessResult& prep,
+    int promptSlots,
+    std::vector<cv::Point>& backgroundPoints,
+    std::vector<cv::Rect>& backgroundRects)
+{
+    const int coordsShape[] = {1, 1, promptSlots, 2};
+    cv::Mat pointCoords(4, coordsShape, CV_32F, cv::Scalar(0.0f));
+
+    float* coords = pointCoords.ptr<float>();
+    int promptIndex = 0;
+    for (const auto& rect : prompts.positiveRects) {
+        // A rectangle needs two free slots.
+        if (promptIndex + 1 >= promptSlots)
+            break;
+        coords[promptIndex * 2] = rect.x * prep.scaleX;
+        coords[promptIndex * 2 + 1] = rect.y * prep.scaleY;
+        ++promptIndex;
+        coords[promptIndex * 2] = (rect.x + rect.width) * prep.scaleX;
+        coords[promptIndex * 2 + 1] = (rect.y + rect.height) * prep.scaleY;
+        ++promptIndex;
+    }
+    for (const auto& point : prompts.positivePoints) {
+        if (promptIndex >= promptSlots)
+            break;
+        coords[promptIndex * 2] = point.x * prep.scaleX;
+        coords[promptIndex * 2 + 1] = point.y * prep.scaleY;
+        ++promptIndex;
+    }
+    for (const auto& point : prompts.negativePoints) {
+        backgroundPoints.emplace_back(
+            static_cast<int>(std::lround(point.x * prep.scaleX)),
+            static_cast<int>(std::lround(point.y * prep.scaleY)));
+    }
+    for (const auto& rect : prompts.negativeRects) {
+        const int x1 = static_cast<int>(std::floor(rect.x * prep.scaleX));
+        const int y1 = static_cast<int>(std::floor(rect.y * prep.scaleY));
+        const int x2 = static_cast<int>(std::ceil((rect.x + rect.width) * prep.scaleX));
+        const int y2 = static_cast<int>(std::ceil((rect.y + rect.height) * prep.scaleY));
+        const int modelWidth = prep.blob.size[3];
+        const int modelHeight = prep.blob.size[2];
+        // Clamp into the model image and keep the rectangle at least 1x1.
+        const int left = std::max(0, std::min(modelWidth - 1, x1));
+        const int top = std::max(0, std::min(modelHeight - 1, y1));
+        const int right = std::max(left + 1, std::min(modelWidth, x2));
+        const int bottom = std::max(top + 1, std::min(modelHeight, y2));
+        backgroundRects.emplace_back(left, top, right - left, bottom - top);
+    }
+
+    return pointCoords;
+}
+
+// Build the 1x1xpromptSlotsx1 label blob matching MakeEfficientSamPromptBlob's slot
+// layout: 2 and 3 for the two corners of each positive rectangle, 1 for each positive
+// point, and -1 for every unused slot.
+cv::Mat MakeEfficientSamLabelBlob(const CVObjectMaskPromptSet& prompts, int promptSlots)
+{
+    const int labelsShape[] = {1, 1, promptSlots, 1};
+    cv::Mat pointLabels(4, labelsShape, CV_32F, cv::Scalar(-1.0f));
+
+    float* labels = pointLabels.ptr<float>();
+    int promptIndex = 0;
+    for (size_t i = 0; i < prompts.positiveRects.size() && promptIndex + 1 < promptSlots; ++i) {
+        labels[promptIndex++] = 2.0f;
+        labels[promptIndex++] = 3.0f;
+    }
+    for (size_t i = 0; i < prompts.positivePoints.size() && promptIndex < promptSlots; ++i, ++promptIndex)
+        labels[promptIndex] = 1.0f;
+
+    return pointLabels;
+}
+
+// Pick the best candidate mask from a 5-D `outputMasks` blob (candidates on axis 2,
+// height on axis 3, width on axis 4). Each candidate is scored as its predicted IoU
+// minus a penalty for covering negative points and a penalty for overlapping negative
+// rectangles. Returns a copy of the winning float mask, or an empty Mat when the
+// inputs do not have the expected layout.
+cv::Mat SelectEfficientSamMask(const cv::Mat& outputMasks, const cv::Mat& iouPredictions,
+                               const std::vector<cv::Point>& backgroundPoints,
+                               const std::vector<cv::Rect>& backgroundRects,
+                               float maskThreshold)
+{
+    // Weights of the two background penalties in the candidate score.
+    constexpr float pointPenaltyWeight = 0.35f;
+    constexpr float rectPenaltyWeight = 0.75f;
+
+    if (outputMasks.dims != 5 || iouPredictions.empty())
+        return cv::Mat();
+
+    const int candidateCount = outputMasks.size[2];
+    const int maskHeight = outputMasks.size[3];
+    const int maskWidth = outputMasks.size[4];
+    // Every candidate needs an IoU value; without this check ious[candidate] below
+    // could read past the end of the prediction buffer.
+    if (candidateCount <= 0 || iouPredictions.total() < static_cast<size_t>(candidateCount))
+        return cv::Mat();
+    const float* ious = iouPredictions.ptr<float>();
+
+    const float* masks = outputMasks.ptr<float>();
+    const size_t candidatePixels = static_cast<size_t>(maskHeight) * static_cast<size_t>(maskWidth);
+    cv::Mat bestMask;
+    float bestScore = -std::numeric_limits<float>::infinity();
+    for (int candidate = 0; candidate < candidateCount; ++candidate) {
+        // Non-owning view onto this candidate's plane; it is only read.
+        cv::Mat mask(maskHeight, maskWidth, CV_32F,
+                     const_cast<float*>(masks + static_cast<size_t>(candidate) * candidatePixels));
+
+        int backgroundHits = 0;
+        for (const cv::Point& point : backgroundPoints) {
+            const int x = std::max(0, std::min(maskWidth - 1, point.x));
+            const int y = std::max(0, std::min(maskHeight - 1, point.y));
+            if (mask.at<float>(y, x) >= maskThreshold)
+                ++backgroundHits;
+        }
+
+        float rectOverlapPenalty = 0.0f;
+        for (const cv::Rect& rect : backgroundRects) {
+            const cv::Rect clipped = rect & cv::Rect(0, 0, maskWidth, maskHeight);
+            const int area = clipped.area();
+            if (area <= 0)
+                continue;
+            int overlap = 0;
+            for (int y = clipped.y; y < clipped.y + clipped.height; ++y) {
+                const float* row = mask.ptr<float>(y);
+                for (int x = clipped.x; x < clipped.x + clipped.width; ++x) {
+                    if (row[x] >= maskThreshold)
+                        ++overlap;
+                }
+            }
+            rectOverlapPenalty += static_cast<float>(overlap) / static_cast<float>(area);
+        }
+
+        const float pointPenalty = backgroundPoints.empty()
+            ? 0.0f
+            : static_cast<float>(backgroundHits) / static_cast<float>(backgroundPoints.size());
+        if (!backgroundRects.empty())
+            rectOverlapPenalty /= static_cast<float>(backgroundRects.size());
+
+        const float score = ious[candidate] - (pointPenaltyWeight * pointPenalty) - (rectPenaltyWeight * rectOverlapPenalty);
+        if (bestMask.empty() || score > bestScore) {
+            bestScore = score;
+            bestMask = mask.clone();
+        }
+    }
+    return bestMask;
+}
+
+// Package a binary mask as per-frame data (object id 1). For an empty mask only the
+// frame id and object id are filled in.
+CVObjectMaskFrameData FrameDataFromMask(const cv::Mat& mask, size_t frameId, float score)
+{
+    CVObjectMaskFrameData frameData;
+    frameData.frameId = frameId;
+    frameData.objectId = 1;
+    if (mask.empty())
+        return frameData;
+
+    frameData.score = score;
+    frameData.width = mask.cols;
+    frameData.height = mask.rows;
+    frameData.rle = EncodeBinaryMaskRLE(mask);
+    frameData.box = NormalizedBoundingBox(mask);
+    return frameData;
+}
+
+// Read {"x": .., "y": ..} as a point; returns (-1, -1) when the value is not an object
+// or either coordinate is missing.
+cv::Point2f JsonPoint(const Json::Value& value)
+{
+    if (!value.isObject() || value["x"].isNull() || value["y"].isNull())
+        return cv::Point2f(-1.0f, -1.0f);
+    return cv::Point2f(value["x"].asFloat(), value["y"].asFloat());
+}
+
+// A point is valid when both coordinates are non-negative.
+bool IsValidPoint(const cv::Point2f& point)
+{
+    return point.x >= 0.0f && point.y >= 0.0f;
+}
+
+// Append every valid point of a JSON array to `points`; non-arrays are ignored.
+void AppendJsonPoints(const Json::Value& values, std::vector<cv::Point2f>& points)
+{
+    if (!values.isArray())
+        return;
+    for (const auto& value : values) {
+        cv::Point2f point = JsonPoint(value);
+        if (IsValidPoint(point))
+            points.push_back(point);
+    }
+}
+
+// Parse a frame number from a JSON key. Negative values and anything std::stoi cannot
+// parse map to 0.
+size_t JsonFrameNumber(const std::string& frameName)
+{
+    try {
+        return static_cast<size_t>(std::max(0, std::stoi(frameName)));
+    } catch (...) {
+        return 0;
+    }
+}
+
+// Read {"x1","y1","x2","y2"} into `output`, normalising the corner order.
+// Returns false (leaving `output` untouched) when a key is missing, a coordinate is
+// negative, or the rectangle has no area.
+bool RectFromJson(const Json::Value& rect, cv::Rect_<float>& output)
+{
+    if (!rect.isObject() || rect["x1"].isNull() || rect["y1"].isNull() ||
+        rect["x2"].isNull() || rect["y2"].isNull()) {
+        return false;
+    }
+
+    const float x1 = std::min(rect["x1"].asFloat(), rect["x2"].asFloat());
+    const float y1 = std::min(rect["y1"].asFloat(), rect["y2"].asFloat());
+    const float x2 = std::max(rect["x1"].asFloat(), rect["x2"].asFloat());
+    const float y2 = std::max(rect["y1"].asFloat(), rect["y2"].asFloat());
+    cv::Point2f topLeft(x1, y1);
+    cv::Point2f bottomRight(x2, y2);
+    if (!IsValidPoint(topLeft) || !IsValidPoint(bottomRight) || x2 <= x1 || y2 <= y1)
+        return false;
+
+    output = cv::Rect_<float>(x1, y1, x2 - x1, y2 - y1);
+    return true;
+}
+
+// Append every valid rectangle of a JSON array to `rects`; non-arrays are ignored.
+void AppendJsonRects(const Json::Value& values, std::vector<cv::Rect_<float>>& rects)
+{
+    if (!values.isArray())
+        return;
+    for (const auto& rect : values) {
+        cv::Rect_<float> parsed;
+        if (RectFromJson(rect, parsed))
+            rects.push_back(parsed);
+    }
+}
+
+// Collect the positive/negative points and rectangles of one frame payload.
+CVObjectMaskPromptSet PromptSetFromJson(const Json::Value& framePayload)
+{
+    CVObjectMaskPromptSet prompts;
+    AppendJsonPoints(framePayload["positive_points"], prompts.positivePoints);
+    AppendJsonPoints(framePayload["negative_points"], prompts.negativePoints);
+    AppendJsonRects(framePayload["positive_rects"], prompts.positiveRects);
+    AppendJsonRects(framePayload["negative_rects"], prompts.negativeRects);
+    return prompts;
+}
+
+// Allocate an N-dimensional float blob of the given shape filled with `value`.
+cv::Mat MakeBlob(const std::vector<int>& shape, float value = 0.0f)
+{
+    cv::Mat output(static_cast<int>(shape.size()), shape.data(), CV_32F);
+    output.setTo(value);
+    return output;
+}
+
+// Select the DNN backend/target for `net` from a device name and return the device
+// that was actually chosen: "CUDA", "OpenCL" or "CPU".
+// "GPU", "GPU_AUTO" and "GPU_CUDA" try CUDA; "GPU_OPENCL" tries OpenCL; anything that
+// is unavailable (or unknown) falls back to the OpenCV CPU backend.
+std::string SetNetDevice(cv::dnn::Net& net, const std::string& processingDevice)
+{
+    if (processingDevice == "CPU") {
+        net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
+        net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
+        return "CPU";
+    }
+
+    if (processingDevice == "GPU" || processingDevice == "GPU_AUTO" || processingDevice == "GPU_CUDA") {
+        try {
+            const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_CUDA);
+            if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_CUDA) != targets.end()) {
+                net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
+                net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
+                return "CUDA";
+            }
+        } catch (const cv::Exception&) {
+            // Backend query failed: treat CUDA as unavailable and fall through to CPU.
+        }
+    }
+
+    if (processingDevice == "GPU_OPENCL") {
+        try {
+            const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_OPENCV);
+            if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_OPENCL) != targets.end()) {
+                cv::ocl::setUseOpenCL(true);
+                net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
+                net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);
+                return "OpenCL";
+            }
+        } catch (const cv::Exception&) {
+            // Backend query failed: treat OpenCL as unavailable and fall through to CPU.
+        }
+    }
+
+    net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
+    net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
+    return "CPU";
+}
+
+// Frame-to-frame mask propagation built from four Cutie ONNX networks (key encoder,
+// value encoder, memory readout, decoder). A frame given to Step() with a seed mask
+// becomes the permanent memory entry; later frames are segmented from memory, and
+// every `memEvery` frames the result is added to a bounded working memory.
+class CutiePropagator {
+private:
+    // Number of memory slots in the readout network's memory inputs.
+    static constexpr int memorySlots = 6;
+    // Model input size; replaced by the WxH found in the key-encoder path, if any.
+    int modelWidth = 640;
+    int modelHeight = 368;
+    // Size of the stride-16 feature maps.
+    int stride16Width = modelWidth / 16;
+    int stride16Height = modelHeight / 16;
+
+    // Features stored for one memorised frame.
+    struct MemoryFrame {
+        cv::Mat key;
+        cv::Mat shrinkage;
+        cv::Mat value;
+        cv::Mat valid;
+    };
+
+    // Placement of the aspect-preserving resized frame inside the model canvas.
+    struct LetterboxTransform {
+        cv::Size originalSize;
+        cv::Rect contentRect;
+    };
+
+    cv::dnn::Net encodeKey;
+    cv::dnn::Net encodeValue;
+    cv::dnn::Net memoryReadout;
+    cv::dnn::Net decode;
+    cv::Mat sensory;
+    cv::Mat lastMask;
+    cv::Mat objectMemory;
+    MemoryFrame permanentMemory;
+    bool hasPermanentMemory = false;
+    std::deque<MemoryFrame> workingMemoryFrames;
+    int frameIndex = 0;
+    // Far in the past so the first propagated frame qualifies as a memory frame.
+    int lastMemoryFrame = -1000000;
+    int memEvery = 5;
+    int maxMemoryFrames = memorySlots;
+
+    // Find the first "<digits>x<digits>" in `modelPath` whose two numbers are positive
+    // multiples of 16 and return them in `width` / `height`. Returns false, leaving the
+    // outputs without a usable size, when no such pair exists.
+    static bool ParseModelSize(const std::string& modelPath, int& width, int& height)
+    {
+        size_t xPos = modelPath.find('x');
+        while (xPos != std::string::npos) {
+            size_t widthStart = xPos;
+            while (widthStart > 0 && std::isdigit(static_cast<unsigned char>(modelPath[widthStart - 1])))
+                --widthStart;
+
+            size_t heightEnd = xPos + 1;
+            while (heightEnd < modelPath.size() && std::isdigit(static_cast<unsigned char>(modelPath[heightEnd])))
+                ++heightEnd;
+
+            if (widthStart != xPos && heightEnd != xPos + 1) {
+                try {
+                    width = std::stoi(modelPath.substr(widthStart, xPos - widthStart));
+                    height = std::stoi(modelPath.substr(xPos + 1, heightEnd - xPos - 1));
+                    if (width > 0 && height > 0 && width % 16 == 0 && height % 16 == 0)
+                        return true;
+                } catch (const std::exception&) {
+                    // A digit run too long for int is not a model size; skip this
+                    // candidate instead of letting the exception abort the model load.
+                }
+            }
+            xPos = modelPath.find('x', xPos + 1);
+        }
+        return false;
+    }
+
+    // Adopt the size encoded in the model path, keeping the defaults when none is found.
+    void ConfigureModelSize(const std::string& modelPath)
+    {
+        int width = modelWidth;
+        int height = modelHeight;
+        if (!ParseModelSize(modelPath, width, height))
+            return;
+        modelWidth = width;
+        modelHeight = height;
+        stride16Width = modelWidth / 16;
+        stride16Height = modelHeight / 16;
+    }
+
+    // Compute where a frame of `sourceSize` lands in the model canvas when scaled to
+    // fit with its aspect ratio preserved and centred. A degenerate source size maps
+    // to the whole canvas.
+    LetterboxTransform ComputeLetterbox(const cv::Size& sourceSize) const
+    {
+        LetterboxTransform transform;
+        transform.originalSize = sourceSize;
+        if (sourceSize.width <= 0 || sourceSize.height <= 0) {
+            transform.contentRect = cv::Rect(0, 0, modelWidth, modelHeight);
+            return transform;
+        }
+
+        const float scaleX = static_cast<float>(modelWidth) / static_cast<float>(sourceSize.width);
+        const float scaleY = static_cast<float>(modelHeight) / static_cast<float>(sourceSize.height);
+        const float scale = std::min(scaleX, scaleY);
+
+        const int resizedWidth = std::max(1, std::min(
+            modelWidth, static_cast<int>(std::lround(sourceSize.width * scale))));
+        const int resizedHeight = std::max(1, std::min(
+            modelHeight, static_cast<int>(std::lround(sourceSize.height * scale))));
+        const int offsetX = (modelWidth - resizedWidth) / 2;
+        const int offsetY = (modelHeight - resizedHeight) / 2;
+        transform.contentRect = cv::Rect(offsetX, offsetY, resizedWidth, resizedHeight);
+        return transform;
+    }
+
+    // Letterbox an 8-bit, 3-channel BGR frame onto a black canvas and pack it as a
+    // 1x3xHxW float blob in RGB order scaled to [0, 1].
+    cv::Mat MakeImageBlob(const cv::Mat& bgr, const LetterboxTransform& transform) const
+    {
+        cv::Mat resized;
+        cv::resize(bgr, resized, transform.contentRect.size(), 0, 0, cv::INTER_LINEAR);
+        cv::Mat canvas(modelHeight, modelWidth, bgr.type(), cv::Scalar::all(0));
+        resized.copyTo(canvas(transform.contentRect));
+
+        const int shape[] = {1, 3, modelHeight, modelWidth};
+        cv::Mat blob(4, shape, CV_32F);
+        float* dst = blob.ptr<float>();
+        for (int y = 0; y < canvas.rows; ++y) {
+            const cv::Vec3b* row = canvas.ptr<cv::Vec3b>(y);
+            for (int x = 0; x < canvas.cols; ++x) {
+                dst[(0 * modelHeight + y) * modelWidth + x] = static_cast<float>(row[x][2]) / 255.0f;
+                dst[(1 * modelHeight + y) * modelWidth + x] = static_cast<float>(row[x][1]) / 255.0f;
+                dst[(2 * modelHeight + y) * modelWidth + x] = static_cast<float>(row[x][0]) / 255.0f;
+            }
+        }
+        return blob;
+    }
+
+    // Letterbox an 8-bit mask and pack it as a 1x1xHxW float blob of 0.0 / 1.0.
+    cv::Mat MakeMaskBlob(const cv::Mat& mask, const LetterboxTransform& transform) const
+    {
+        cv::Mat resized;
+        cv::resize(mask, resized, transform.contentRect.size(), 0, 0, cv::INTER_NEAREST);
+        cv::Mat canvas(modelHeight, modelWidth, CV_8U, cv::Scalar(0));
+        resized.copyTo(canvas(transform.contentRect));
+
+        const int shape[] = {1, 1, modelHeight, modelWidth};
+        cv::Mat blob(4, shape, CV_32F, cv::Scalar(0.0f));
+        float* dst = blob.ptr<float>();
+        for (int y = 0; y < canvas.rows; ++y) {
+            const uint8_t* row = canvas.ptr<uint8_t>(y);
+            for (int x = 0; x < canvas.cols; ++x)
+                dst[y * modelWidth + x] = row[x] ? 1.0f : 0.0f;
+        }
+        return blob;
+    }
+
+    // Copy the second HxW plane of the decoder's `prob` output into a 1x1xHxW blob.
+    // NOTE(review): assumes plane 0 is background and plane 1 the object -- confirm
+    // against the exported decoder model.
+    cv::Mat ForegroundFromProb(const cv::Mat& prob) const
+    {
+        const int shape[] = {1, 1, modelHeight, modelWidth};
+        cv::Mat foreground(4, shape, CV_32F);
+        const float* src = prob.ptr<float>();
+        float* dst = foreground.ptr<float>();
+        const int plane = modelWidth * modelHeight;
+        std::memcpy(dst, src + plane, sizeof(float) * plane);
+        return foreground;
+    }
+
+    // Threshold the foreground blob at 0.5, crop away the letterbox padding and resize
+    // the result back to the original frame size as an 8-bit 0/255 mask.
+    cv::Mat BinaryMaskFromForeground(const cv::Mat& foreground, const LetterboxTransform& transform) const
+    {
+        cv::Mat modelMask(modelHeight, modelWidth, CV_8U, cv::Scalar(0));
+        const float* src = foreground.ptr<float>();
+        for (int y = 0; y < modelMask.rows; ++y) {
+            uint8_t* row = modelMask.ptr<uint8_t>(y);
+            for (int x = 0; x < modelMask.cols; ++x)
+                row[x] = src[y * modelWidth + x] >= 0.5f ? 255 : 0;
+        }
+
+        cv::Mat cropped = modelMask(transform.contentRect);
+        cv::Mat restored;
+        cv::resize(cropped, restored, transform.originalSize, 0, 0, cv::INTER_NEAREST);
+        return restored;
+    }
+
+    // Build a 1x1x(H/16)x(W/16) blob that is 1.0 for every stride-16 cell whose centre
+    // lies inside the letterboxed content and 0.0 for cells in the padding.
+    cv::Mat ValidMaskFromLetterbox(const LetterboxTransform& transform) const
+    {
+        cv::Mat valid(stride16Height, stride16Width, CV_32F, cv::Scalar(0.0f));
+        for (int y = 0; y < stride16Height; ++y) {
+            float* row = valid.ptr<float>(y);
+            const int centerY = y * 16 + 8;
+            for (int x = 0; x < stride16Width; ++x) {
+                const int centerX = x * 16 + 8;
+                if (transform.contentRect.contains(cv::Point(centerX, centerY)))
+                    row[x] = 1.0f;
+            }
+        }
+
+        const int shape[] = {1, 1, stride16Height, stride16Width};
+        cv::Mat blob(4, shape, CV_32F);
+        std::memcpy(blob.ptr<float>(), valid.ptr<float>(), sizeof(float) * valid.total());
+        return blob;
+    }
+
+    // Copy a [channels x plane] feature map into memory slot `slot` of a
+    // [channels x memorySlots x plane] destination.
+    void CopyKeySlot(const cv::Mat& src, cv::Mat& dst, int slot, int channels) const
+    {
+        const float* in = src.ptr<float>();
+        float* out = dst.ptr<float>();
+        const int plane = stride16Width * stride16Height;
+        for (int c = 0; c < channels; ++c) {
+            std::memcpy(out + (c * memorySlots + slot) * plane,
+                        in + c * plane,
+                        sizeof(float) * plane);
+        }
+    }
+
+    // Same as CopyKeySlot for the 256-channel value features.
+    void CopyValueSlot(const cv::Mat& src, cv::Mat& dst, int slot) const
+    {
+        CopyKeySlot(src, dst, slot, 256);
+    }
+
+    // The four blobs below share one slot layout: the permanent entry (if any) goes in
+    // slot 0, followed by the working-memory frames oldest first; unused slots stay 0.
+
+    cv::Mat MemoryKeyBlob() const
+    {
+        cv::Mat output = MakeBlob({1, 64, memorySlots, stride16Height, stride16Width});
+        int slot = 0;
+        if (hasPermanentMemory)
+            CopyKeySlot(permanentMemory.key, output, slot++, 64);
+        for (int index = 0;
+             index < static_cast<int>(workingMemoryFrames.size()) && slot < memorySlots;
+             ++index, ++slot)
+            CopyKeySlot(workingMemoryFrames[index].key, output, slot, 64);
+        return output;
+    }
+
+    cv::Mat MemoryShrinkageBlob() const
+    {
+        cv::Mat output = MakeBlob({1, 1, memorySlots, stride16Height, stride16Width});
+        int slot = 0;
+        if (hasPermanentMemory)
+            CopyKeySlot(permanentMemory.shrinkage, output, slot++, 1);
+        for (int index = 0;
+             index < static_cast<int>(workingMemoryFrames.size()) && slot < memorySlots;
+             ++index, ++slot)
+            CopyKeySlot(workingMemoryFrames[index].shrinkage, output, slot, 1);
+        return output;
+    }
+
+    cv::Mat MemoryValueBlob() const
+    {
+        cv::Mat output = MakeBlob({1, 1, 256, memorySlots, stride16Height, stride16Width});
+        int slot = 0;
+        if (hasPermanentMemory)
+            CopyValueSlot(permanentMemory.value, output, slot++);
+        for (int index = 0;
+             index < static_cast<int>(workingMemoryFrames.size()) && slot < memorySlots;
+             ++index, ++slot)
+            CopyValueSlot(workingMemoryFrames[index].value, output, slot);
+        return output;
+    }
+
+    cv::Mat MemoryValidBlob() const
+    {
+        cv::Mat output = MakeBlob({1, 1, memorySlots, stride16Height, stride16Width});
+        float* data = output.ptr<float>();
+        const int plane = stride16Width * stride16Height;
+        auto copyValidSlot = [&](const cv::Mat& valid, int slot) {
+            std::memcpy(data + slot * plane, valid.ptr<float>(), sizeof(float) * plane);
+        };
+
+        int slot = 0;
+        if (hasPermanentMemory)
+            copyValidSlot(permanentMemory.valid, slot++);
+        for (int index = 0;
+             index < static_cast<int>(workingMemoryFrames.size()) && slot < memorySlots;
+             ++index, ++slot)
+            copyValidSlot(workingMemoryFrames[index].valid, slot);
+        return output;
+    }
+
+    // Store one frame's features. The frame becomes the permanent entry when
+    // `asPermanent` is set or no permanent entry exists yet; otherwise it is appended
+    // to the working memory, evicting the oldest frames beyond maxMemoryFrames - 1.
+    void AddMemory(const cv::Mat& key, const cv::Mat& shrinkage, const cv::Mat& value,
+                   const cv::Mat& valid, bool asPermanent)
+    {
+        MemoryFrame frame;
+        frame.key = key.clone();
+        frame.shrinkage = shrinkage.clone();
+        frame.value = value.clone();
+        frame.valid = valid.clone();
+
+        if (asPermanent || !hasPermanentMemory) {
+            permanentMemory = frame;
+            hasPermanentMemory = true;
+            return;
+        }
+
+        workingMemoryFrames.push_back(frame);
+        const int workingCapacity = std::max(0, maxMemoryFrames - 1);
+        while (static_cast<int>(workingMemoryFrames.size()) > workingCapacity)
+            workingMemoryFrames.pop_front();
+    }
+
+    // Accumulate the value encoder's object-memory output: the first call stores it in
+    // a 1x1x1x16x257 blob, later calls add element-wise.
+    void AddObjectMemory(const cv::Mat& value)
+    {
+        const bool firstValue = objectMemory.empty();
+        if (firstValue)
+            objectMemory = MakeBlob({1, 1, 1, 16, 257});
+
+        // Never touch more elements than the fixed-size accumulator holds; an
+        // unexpectedly large network output would otherwise overflow the buffer.
+        const size_t count = std::min(value.total(), objectMemory.total());
+        float* dst = objectMemory.ptr<float>();
+        const float* src = value.ptr<float>();
+        if (firstValue) {
+            std::memcpy(dst, src, sizeof(float) * count);
+            return;
+        }
+        for (size_t i = 0; i < count; ++i)
+            dst[i] += src[i];
+    }
+
+public:
+    // Load the four ONNX networks. The model size is taken from the key-encoder path.
+    // Errors from cv::dnn::readNetFromONNX propagate to the caller.
+    void Load(const std::string& encodeKeyPath, const std::string& encodeValuePath,
+              const std::string& memoryReadoutPath, const std::string& decodePath)
+    {
+        ConfigureModelSize(encodeKeyPath);
+        encodeKey = cv::dnn::readNetFromONNX(encodeKeyPath);
+        encodeValue = cv::dnn::readNetFromONNX(encodeValuePath);
+        memoryReadout = cv::dnn::readNetFromONNX(memoryReadoutPath);
+        decode = cv::dnn::readNetFromONNX(decodePath);
+        sensory = MakeBlob({1, 1, 256, stride16Height, stride16Width});
+    }
+
+    // Apply the requested device to all four networks. Returns the selected device
+    // name, or "Mixed" when the networks did not all end up on the same device.
+    std::string SetDevice(const std::string& processingDevice)
+    {
+        std::string selected = SetNetDevice(encodeKey, processingDevice);
+        const std::string valueDevice = SetNetDevice(encodeValue, processingDevice);
+        const std::string readoutDevice = SetNetDevice(memoryReadout, processingDevice);
+        const std::string decodeDevice = SetNetDevice(decode, processingDevice);
+        if (selected != valueDevice || selected != readoutDevice || selected != decodeDevice)
+            return "Mixed";
+        return selected;
+    }
+
+    // Forget all memory and per-sequence state; the loaded networks are kept.
+    void Reset()
+    {
+        sensory = MakeBlob({1, 1, 256, stride16Height, stride16Width});
+        lastMask.release();
+        objectMemory.release();
+        permanentMemory = MemoryFrame();
+        hasPermanentMemory = false;
+        workingMemoryFrames.clear();
+        frameIndex = 0;
+        lastMemoryFrame = -1000000;
+    }
+
+    // True once at least one frame has been memorised.
+    bool HasMemory() const
+    {
+        return hasPermanentMemory || !workingMemoryFrames.empty();
+    }
+
+    // Process one frame. With a non-empty `seedMask` the mask is taken as ground truth
+    // and memorised as the permanent entry; otherwise the mask is predicted from
+    // memory. Returns an 8-bit 0/255 mask at the frame's size, or an empty Mat when
+    // there is neither a seed nor any memory. Throws cv::Exception on inference
+    // failures or when a network returns fewer outputs than requested.
+    cv::Mat Step(const cv::Mat& frame, const cv::Mat& seedMask = cv::Mat())
+    {
+        const LetterboxTransform transform = ComputeLetterbox(frame.size());
+        const cv::Mat validMask = ValidMaskFromLetterbox(transform);
+        cv::Mat image = MakeImageBlob(frame, transform);
+
+        encodeKey.setInput(image, "image");
+        std::vector<cv::Mat> keyOutputs;
+        encodeKey.forward(keyOutputs, std::vector<std::string>{"f16", "f8", "f4", "pix_feat", "key", "shrinkage", "selection"});
+        CV_Assert(keyOutputs.size() >= 7);
+        cv::Mat f8 = keyOutputs[1];
+        cv::Mat f4 = keyOutputs[2];
+        cv::Mat pixFeat = keyOutputs[3];
+        cv::Mat key = keyOutputs[4];
+        cv::Mat shrinkage = keyOutputs[5];
+        cv::Mat selection = keyOutputs[6];
+
+        cv::Mat foreground;
+        if (!seedMask.empty()) {
+            foreground = MakeMaskBlob(seedMask, transform);
+        } else if (HasMemory()) {
+            memoryReadout.setInput(key, "query_key");
+            memoryReadout.setInput(selection, "query_selection");
+            memoryReadout.setInput(MemoryKeyBlob(), "memory_key");
+            memoryReadout.setInput(MemoryShrinkageBlob(), "memory_shrinkage");
+            memoryReadout.setInput(MemoryValueBlob(), "memory_value");
+            memoryReadout.setInput(MemoryValidBlob(), "memory_valid");
+            memoryReadout.setInput(objectMemory, "object_memory");
+            memoryReadout.setInput(pixFeat, "pix_feat");
+            memoryReadout.setInput(sensory, "sensory");
+            memoryReadout.setInput(lastMask, "last_mask");
+            std::vector<cv::Mat> readoutOutputs;
+            memoryReadout.forward(readoutOutputs, std::vector<std::string>{"memory_readout"});
+            CV_Assert(!readoutOutputs.empty());
+
+            decode.setInput(f8, "f8");
+            decode.setInput(f4, "f4");
+            decode.setInput(readoutOutputs[0], "memory_readout");
+            decode.setInput(sensory, "sensory");
+            std::vector<cv::Mat> decodeOutputs;
+            decode.forward(decodeOutputs, std::vector<std::string>{"new_sensory", "logits", "prob"});
+            CV_Assert(decodeOutputs.size() >= 3);
+            sensory = decodeOutputs[0].clone();
+            foreground = ForegroundFromProb(decodeOutputs[2]);
+        } else {
+            // Nothing to propagate from yet.
+            ++frameIndex;
+            return cv::Mat();
+        }
+
+        const bool isMemoryFrame = !seedMask.empty() || frameIndex - lastMemoryFrame >= memEvery;
+        if (isMemoryFrame) {
+            encodeValue.setInput(image, "image");
+            encodeValue.setInput(pixFeat, "pix_feat");
+            encodeValue.setInput(sensory, "sensory");
+            encodeValue.setInput(foreground, "mask");
+            std::vector<cv::Mat> valueOutputs;
+            encodeValue.forward(valueOutputs, std::vector<std::string>{"mask_value", "new_sensory", "object_memory"});
+            CV_Assert(valueOutputs.size() >= 3);
+            sensory = valueOutputs[1].clone();
+            AddObjectMemory(valueOutputs[2]);
+            AddMemory(key, shrinkage, valueOutputs[0], validMask, !seedMask.empty());
+            lastMemoryFrame = frameIndex;
+        }
+
+        lastMask = foreground.clone();
+        cv::Mat outputMask = BinaryMaskFromForeground(foreground, transform);
+        ++frameIndex;
+        return outputMask;
+    }
+};
+
+}
+
+// Construct from a JSON description of the job and the controller used to report
+// progress and errors.
+CVObjectMask::CVObjectMask(std::string processInfoJson, ProcessingController& controller)
+    : processingController(&controller)
+{
+    SetJson(processInfoJson);
+}
+
+// Check that an ONNX file can be loaded; returns "" on success or an error message.
+std::string CVObjectMask::ValidateONNXModel(std::string modelPath)
+{
+    return LoadONNXModel(modelPath, nullptr);
+}
+
+// Run EfficientSAM on `frame` with the first prompt keyframe and return the seed mask
+// as a frame whose image is opaque white inside the mask and transparent elsewhere.
+// Returns a null pointer when the frame, model path or prompts are missing, the model
+// cannot be loaded, or no mask is produced.
+std::shared_ptr<Frame> CVObjectMask::PreviewSeedMask(std::shared_ptr<Frame> frame)
+{
+    if (!frame || efficientSamModelPath.empty() || promptKeyframes.empty())
+        return std::shared_ptr<Frame>();
+
+    std::string loadError = LoadONNXModel(efficientSamModelPath, &efficientSam);
+    if (!loadError.empty())
+        return std::shared_ptr<Frame>();
+    SetProcessingDevice();
+
+    CVObjectMaskPromptSet prompts = promptKeyframes.begin()->second;
+    cv::Mat frameImage = frame->GetImageCV();
+    cv::Mat seedMask = CreateEfficientSAMSeedMask(frameImage, prompts);
+    if (seedMask.empty())
+        return std::shared_ptr<Frame>();
+
+    auto maskImage = std::make_shared<QImage>(
+        seedMask.cols, seedMask.rows, QImage::Format_RGBA8888_Premultiplied);
+    maskImage->fill(Qt::transparent);
+    for (int y = 0; y < seedMask.rows; ++y) {
+        const uint8_t* src = seedMask.ptr<uint8_t>(y);
+        QRgb* dst = reinterpret_cast<QRgb*>(maskImage->scanLine(y));
+        // Only all-0xFF and all-0x00 pixels are written, so the byte order of QRgb
+        // versus RGBA8888 does not matter here.
+        for (int x = 0; x < seedMask.cols; ++x)
+            dst[x] = src[x] ? qRgba(255, 255, 255, 255) : qRgba(0, 0, 0, 0);
+    }
+
+    auto result = std::make_shared<Frame>(frame->number, seedMask.cols, seedMask.rows, "#000000");
+    result->AddImage(maskImage);
+    return result;
+}
+
+// Apply `processingDevice` to the EfficientSAM network and log the outcome. When the
+// selection ends up on the CPU, `processingDevice` is set to "CPU" so later users of
+// the member see the device actually in use.
+void CVObjectMask::SetProcessingDevice()
+{
+    const std::string requestedDevice = processingDevice;
+    const std::string selectedDevice = SetNetDevice(efficientSam, requestedDevice);
+    if (selectedDevice == "CPU")
+        processingDevice = "CPU";
+    ZmqLogger::Instance()->Log("Object Mask EfficientSAM DNN device: requested " + requestedDevice + ", selected " + selectedDevice);
+}
+
+// Compute a mask for every frame in [start, end] of `video` and store it in masksData.
+// Prompt keyframes are segmented with EfficientSAM and restart the Cutie propagation;
+// the frames in between are propagated by Cutie. When `process_interval` is false (or
+// the interval is degenerate) the clip's own start/end are used. Failures are reported
+// through the processing controller and set `error`.
+void CVObjectMask::maskClip(openshot::Clip& video, size_t _start, size_t _end, bool process_interval)
+{
+    start = _start;
+    end = _end;
+
+    video.Open();
+    processingController->SetError(false, "");
+
+    if (efficientSamModelPath.empty()) {
+        processingController->SetError(true, "Missing path to EfficientSAM ONNX model file");
+        error = true;
+        return;
+    }
+    if (protobufDataPath.empty()) {
+        processingController->SetError(true, "Missing path to object mask protobuf data file");
+        error = true;
+        return;
+    }
+    if (promptKeyframes.empty()) {
+        processingController->SetError(true, "Missing positive prompt point for Object Mask preprocessing");
+        error = true;
+        return;
+    }
+
+    std::string loadError = LoadONNXModel(efficientSamModelPath, &efficientSam);
+    if (!loadError.empty()) {
+        processingController->SetError(true, loadError);
+        error = true;
+        return;
+    }
+    SetProcessingDevice();
+
+    CutiePropagator cutie;
+    // Any Cutie model path that was not given explicitly defaults to the standard
+    // file name inside cutieModelDir.
+    if (cutieEncodeKeyModelPath.empty() && !cutieModelDir.empty())
+        cutieEncodeKeyModelPath = cutieModelDir + "/cutie-encode-key-640x368.onnx";
+    if (cutieEncodeValueModelPath.empty() && !cutieModelDir.empty())
+        cutieEncodeValueModelPath = cutieModelDir + "/cutie-encode-value-640x368.onnx";
+    if (cutieMemoryReadoutModelPath.empty() && !cutieModelDir.empty())
+        cutieMemoryReadoutModelPath = cutieModelDir + "/cutie-memory-readout-floatmask-valid-640x368-m6-topk30-opencv.onnx";
+    if (cutieDecodeModelPath.empty() && !cutieModelDir.empty())
+        cutieDecodeModelPath = cutieModelDir + "/cutie-decode-640x368.onnx";
+    if (cutieEncodeKeyModelPath.empty() || cutieEncodeValueModelPath.empty() ||
+        cutieMemoryReadoutModelPath.empty() || cutieDecodeModelPath.empty()) {
+        processingController->SetError(true, "Missing path to Cutie ONNX model files");
+        error = true;
+        return;
+    }
+    try {
+        cutie.Load(cutieEncodeKeyModelPath, cutieEncodeValueModelPath, cutieMemoryReadoutModelPath, cutieDecodeModelPath);
+        const std::string cutieDevice = cutie.SetDevice(processingDevice);
+        ZmqLogger::Instance()->Log("Object Mask Cutie DNN device: requested " + processingDevice + ", selected " + cutieDevice);
+    } catch (const std::exception& e) {
+        // cv::Exception derives from std::exception, so this covers both.
+        processingController->SetError(true, std::string("Failed to load Cutie ONNX models: ") + e.what());
+        error = true;
+        return;
+    }
+
+    if (!process_interval || end <= 1 || end - start == 0) {
+        start = static_cast<size_t>(video.Start() * video.Reader()->info.fps.ToFloat());
+        end = static_cast<size_t>(video.End() * video.Reader()->info.fps.ToFloat());
+    }
+    if (end < start)
+        end = start;
+
+    // Start with the most recent prompt keyframe at or before `start`, if there is one.
+    CVObjectMaskPromptSet activePrompts;
+    auto promptBeforeStart = promptKeyframes.upper_bound(start);
+    if (promptBeforeStart != promptKeyframes.begin()) {
+        --promptBeforeStart;
+        activePrompts = promptBeforeStart->second;
+    }
+    auto firstPromptAtOrAfterStart = promptKeyframes.lower_bound(start);
+
+    for (size_t frameNumber = start; frameNumber <= end; ++frameNumber) {
+        if (processingController->ShouldStop())
+            return;
+
+        std::shared_ptr<openshot::Frame> frame = video.GetFrame(frameNumber);
+        if (!frame)
+            continue;
+
+        auto promptIt = promptKeyframes.find(frameNumber);
+        bool isPromptKeyframe = promptIt != promptKeyframes.end();
+        if (promptIt != promptKeyframes.end()) {
+            activePrompts = promptIt->second;
+            cutie.Reset();
+        } else if (!activePrompts.HasPositivePrompt()) {
+            if (firstPromptAtOrAfterStart != promptKeyframes.end() && frameNumber >= firstPromptAtOrAfterStart->first) {
+                activePrompts = firstPromptAtOrAfterStart->second;
+                isPromptKeyframe = true;
+                cutie.Reset();
+            } else {
+                // No usable prompt yet: record an empty mask for this frame.
+                CVObjectMaskFrameData emptyFrame;
+                emptyFrame.frameId = frameNumber;
+                masksData[frameNumber] = emptyFrame;
+                continue;
+            }
+        }
+
+        const cv::Mat frameImage = frame->GetImageCV();
+        cv::Mat seedMask;
+        if (isPromptKeyframe || !cutie.HasMemory()) {
+            seedMask = CreateEfficientSAMSeedMask(frameImage, activePrompts);
+            if (seedMask.empty()) {
+                CVObjectMaskFrameData emptyFrame;
+                emptyFrame.frameId = frameNumber;
+                masksData[frameNumber] = emptyFrame;
+                continue;
+            }
+            if (!isPromptKeyframe)
+                cutie.Reset();
+        }
+
+        cv::Mat propagatedMask;
+        try {
+            propagatedMask = cutie.Step(frameImage, seedMask);
+        } catch (const std::exception& e) {
+            processingController->SetError(true, std::string("Failed to propagate Object Mask with Cutie: ") + e.what());
+            error = true;
+            return;
+        }
+
+        cv::Mat outputMask;
+        if (!seedMask.empty()) {
+            outputMask = seedMask;
+        } else if (!propagatedMask.empty()) {
+            cv::resize(propagatedMask, outputMask, frameImage.size(), 0, 0, cv::INTER_NEAREST);
+        }
+        masksData[frameNumber] = FrameDataFromMask(outputMask, frameNumber, 1.0f);
+
+        const size_t range = std::max<size_t>(1, end - start);
+        processingController->SetProgress(uint(100 * (frameNumber - start) / range));
+    }
+}
+
+cv::Mat CVObjectMask::CreateEfficientSAMSeedMask(const cv::Mat& frame, const CVObjectMaskPromptSet& prompts)
+{
+    EfficientSamPreprocessResult prep = MakeEfficientSamBlob(frame, modelSize);
+
+    auto runPromptSet = [&](const CVObjectMaskPromptSet& promptSet) -> cv::Mat {
+        std::vector<cv::Point> backgroundPoints;
+        std::vector<cv::Rect> backgroundRects;
+        cv::Mat pointCoords = MakeEfficientSamPromptBlob(promptSet, prep, promptSlots, backgroundPoints, backgroundRects);
+        cv::Mat pointLabels = MakeEfficientSamLabelBlob(promptSet, promptSlots);
+
+        efficientSam.setInput(prep.blob, "batched_images");
+        efficientSam.setInput(pointCoords, "batched_point_coords");
+        efficientSam.setInput(pointLabels, 
"batched_point_labels"); + + std::vector outputs; + efficientSam.forward(outputs, std::vector{"output_masks", "iou_predictions"}); + if (outputs.size() != 2) + return cv::Mat(); + + cv::Mat modelMask = SelectEfficientSamMask(outputs[0], outputs[1], backgroundPoints, backgroundRects, maskThreshold); + if (modelMask.empty()) + return cv::Mat(); + return EfficientSamMaskToFrameMask(modelMask, frame.size(), maskThreshold); + }; + + if (prompts.positiveRects.size() <= 1) + return runPromptSet(prompts); + + cv::Mat combinedMask(frame.rows, frame.cols, CV_8U, cv::Scalar(0)); + bool hasMask = false; + for (const auto& rect : prompts.positiveRects) { + CVObjectMaskPromptSet rectPrompt; + rectPrompt.positiveRects.push_back(rect); + rectPrompt.negativePoints = prompts.negativePoints; + rectPrompt.negativeRects = prompts.negativeRects; + cv::Mat rectMask = runPromptSet(rectPrompt); + if (rectMask.empty()) + continue; + cv::bitwise_or(combinedMask, rectMask, combinedMask); + hasMask = true; + } + + if (!prompts.positivePoints.empty()) { + CVObjectMaskPromptSet pointPrompt; + pointPrompt.positivePoints = prompts.positivePoints; + pointPrompt.negativePoints = prompts.negativePoints; + pointPrompt.negativeRects = prompts.negativeRects; + cv::Mat pointMask = runPromptSet(pointPrompt); + if (!pointMask.empty()) { + cv::bitwise_or(combinedMask, pointMask, combinedMask); + hasMask = true; + } + } + + return hasMask ? combinedMask : cv::Mat(); +} + +bool CVObjectMask::SaveObjMaskData() +{ + if (protobufDataPath.empty()) { + std::cerr << "Missing path to object mask protobuf data file." 
<< std::endl; + return false; + } + if (error) + return false; + + pb_objdetect::ObjDetect objMessage; + objMessage.add_classnames()->assign("object mask"); + + for (const auto& frameData : masksData) + AddFrameDataToProto(objMessage.add_frame(), frameData.second); + + *objMessage.mutable_last_updated() = TimeUtil::SecondsToTimestamp(time(NULL)); + + std::fstream output(protobufDataPath, std::ios::out | std::ios::trunc | std::ios::binary); + if (!objMessage.SerializeToOstream(&output)) { + std::cerr << "Failed to write object mask protobuf message." << std::endl; + return false; + } + + return true; +} + +void CVObjectMask::AddFrameDataToProto(pb_objdetect::Frame* pbFrameData, const CVObjectMaskFrameData& frameData) +{ + pbFrameData->set_id(frameData.frameId); + if (!frameData.HasMask()) + return; + + pb_objdetect::Frame_Box* box = pbFrameData->add_bounding_box(); + box->set_x(frameData.box.x); + box->set_y(frameData.box.y); + box->set_w(frameData.box.width); + box->set_h(frameData.box.height); + box->set_classid(0); + box->set_confidence(frameData.score); + box->set_objectid(frameData.objectId); + + pb_objdetect::Frame_Box_Mask* mask = box->mutable_mask(); + mask->set_width(frameData.width); + mask->set_height(frameData.height); + for (uint32_t count : frameData.rle) + mask->add_rle(count); +} + +void CVObjectMask::SetJson(const std::string value) +{ + try { + SetJsonValue(openshot::stringToJson(value)); + } catch (const std::exception&) { + std::cout << "JSON is invalid (missing keys or invalid data types)" << std::endl; + } +} + +void CVObjectMask::SetJsonValue(const Json::Value root) +{ + if (!root["protobuf_data_path"].isNull()) + protobufDataPath = root["protobuf_data_path"].asString(); + if (!root["efficient_sam_model"].isNull()) + efficientSamModelPath = root["efficient_sam_model"].asString(); + if (!root["efficient_sam_model_path"].isNull()) + efficientSamModelPath = root["efficient_sam_model_path"].asString(); + if (!root["sam_model"].isNull()) + 
efficientSamModelPath = root["sam_model"].asString(); + if (!root["sam_model_path"].isNull()) + efficientSamModelPath = root["sam_model_path"].asString(); + if (!root["encoder_model"].isNull()) + efficientSamModelPath = root["encoder_model"].asString(); + if (!root["encoder_model_path"].isNull()) + efficientSamModelPath = root["encoder_model_path"].asString(); + if (!root["cutie_model_dir"].isNull()) + cutieModelDir = root["cutie_model_dir"].asString(); + if (!root["cutie_encode_key_model"].isNull()) + cutieEncodeKeyModelPath = root["cutie_encode_key_model"].asString(); + if (!root["cutie_encode_key_model_path"].isNull()) + cutieEncodeKeyModelPath = root["cutie_encode_key_model_path"].asString(); + if (!root["cutie_encode_value_model"].isNull()) + cutieEncodeValueModelPath = root["cutie_encode_value_model"].asString(); + if (!root["cutie_encode_value_model_path"].isNull()) + cutieEncodeValueModelPath = root["cutie_encode_value_model_path"].asString(); + if (!root["cutie_memory_readout_model"].isNull()) + cutieMemoryReadoutModelPath = root["cutie_memory_readout_model"].asString(); + if (!root["cutie_memory_readout_model_path"].isNull()) + cutieMemoryReadoutModelPath = root["cutie_memory_readout_model_path"].asString(); + if (!root["cutie_decode_model"].isNull()) + cutieDecodeModelPath = root["cutie_decode_model"].asString(); + if (!root["cutie_decode_model_path"].isNull()) + cutieDecodeModelPath = root["cutie_decode_model_path"].asString(); + if (!root["processing-device"].isNull()) + processingDevice = root["processing-device"].asString(); + if (!root["processing_device"].isNull()) + processingDevice = root["processing_device"].asString(); + if (!root["prompt_slots"].isNull()) + promptSlots = std::max(1, std::min(6, root["prompt_slots"].asInt())); + if (!root["mask_threshold"].isNull()) + maskThreshold = root["mask_threshold"].asFloat(); + if (!root["model_size"].isNull()) + modelSize = root["model_size"].asInt(); + promptKeyframes.clear(); + if 
(!root["object_mask_selection"].isNull()) { + const Json::Value& selection = root["object_mask_selection"]; + const Json::Value& frames = selection["frames"]; + if (frames.isObject()) { + for (const auto& frameName : frames.getMemberNames()) { + const size_t frameNumber = JsonFrameNumber(frameName); + if (frameNumber == 0) + continue; + CVObjectMaskPromptSet prompts = PromptSetFromJson(frames[frameName]); + if (prompts.HasPositivePrompt()) + promptKeyframes[frameNumber] = prompts; + } + } + } + + CVObjectMaskPromptSet legacyPrompts; + if (!root["positive_points"].isNull()) + AppendJsonPoints(root["positive_points"], legacyPrompts.positivePoints); + if (!root["negative_points"].isNull()) + AppendJsonPoints(root["negative_points"], legacyPrompts.negativePoints); + + if (!root["positive_x"].isNull() && !root["positive_y"].isNull()) { + cv::Point2f point(root["positive_x"].asFloat(), root["positive_y"].asFloat()); + if (IsValidPoint(point) && legacyPrompts.positivePoints.empty()) + legacyPrompts.positivePoints.push_back(point); + } + if (!root["negative_x"].isNull() && !root["negative_y"].isNull()) { + cv::Point2f point(root["negative_x"].asFloat(), root["negative_y"].asFloat()); + if (IsValidPoint(point) && legacyPrompts.negativePoints.empty()) + legacyPrompts.negativePoints.push_back(point); + } + if (!root["rect_x1"].isNull() && !root["rect_y1"].isNull() && + !root["rect_x2"].isNull() && !root["rect_y2"].isNull()) { + Json::Value rect; + rect["x1"] = root["rect_x1"]; + rect["y1"] = root["rect_y1"]; + rect["x2"] = root["rect_x2"]; + rect["y2"] = root["rect_y2"]; + cv::Rect_ parsed; + if (RectFromJson(rect, parsed)) + legacyPrompts.positiveRects.push_back(parsed); + } + if (legacyPrompts.HasPositivePrompt() && promptKeyframes.empty()) + promptKeyframes[1] = legacyPrompts; +} diff --git a/src/CVObjectMask.h b/src/CVObjectMask.h new file mode 100644 index 000000000..cf1309fdf --- /dev/null +++ b/src/CVObjectMask.h @@ -0,0 +1,100 @@ +/** + * @file + * @brief Header file 
for CVObjectMask class + * @author Jonathan Thomas + * + * @ref License + */ + +// Copyright (c) 2026 OpenShot Studios, LLC +// +// SPDX-License-Identifier: LGPL-3.0-or-later + +#pragma once + +#define int64 opencv_broken_int +#define uint64 opencv_broken_uint +#include +#include +#include +#undef uint64 +#undef int64 + +#include "Clip.h" +#include "Json.h" +#include "ProcessingController.h" + +namespace pb_objdetect { + class Frame; +} + +namespace openshot +{ + struct CVObjectMaskFrameData { + size_t frameId = 0; + cv::Rect_ box; + float score = 0.0f; + int objectId = 1; + int width = 0; + int height = 0; + std::vector rle; + + bool HasMask() const { return width > 0 && height > 0 && !rle.empty(); } + }; + + struct CVObjectMaskPromptSet { + std::vector positivePoints; + std::vector negativePoints; + std::vector> positiveRects; + std::vector> negativeRects; + + bool HasPositivePrompt() const { return !positiveRects.empty() || !positivePoints.empty(); } + }; + + /** + * @brief Preprocess a clip into EfficientSAM/Cutie object masks stored in the object-detection protobuf format. 
+ */ + class CVObjectMask + { + private: + cv::dnn::Net efficientSam; + + std::string efficientSamModelPath; + std::string cutieModelDir; + std::string cutieEncodeKeyModelPath; + std::string cutieEncodeValueModelPath; + std::string cutieMemoryReadoutModelPath; + std::string cutieDecodeModelPath; + std::string protobufDataPath; + std::string processingDevice = "CPU"; + + std::map promptKeyframes; + int promptSlots = 6; + float maskThreshold = 0.0f; + int modelSize = 1024; + + size_t start = 0; + size_t end = 0; + bool error = false; + + ProcessingController* processingController; + + void SetProcessingDevice(); + cv::Mat CreateEfficientSAMSeedMask(const cv::Mat& frame, const CVObjectMaskPromptSet& prompts); + void AddFrameDataToProto(pb_objdetect::Frame* pbFrameData, const CVObjectMaskFrameData& frameData); + + public: + std::map masksData; + + CVObjectMask(std::string processInfoJson, ProcessingController& processingController); + + static std::string ValidateONNXModel(std::string modelPath); + std::shared_ptr PreviewSeedMask(std::shared_ptr frame); + + void maskClip(openshot::Clip& video, size_t start = 0, size_t end = 0, bool process_interval = false); + bool SaveObjMaskData(); + + void SetJson(const std::string value); + void SetJsonValue(const Json::Value root); + }; +} diff --git a/src/ClipProcessingJobs.cpp b/src/ClipProcessingJobs.cpp index 19a237ab1..894245f28 100644 --- a/src/ClipProcessingJobs.cpp +++ b/src/ClipProcessingJobs.cpp @@ -28,6 +28,18 @@ std::string ClipProcessingJobs::ValidateONNXModel(std::string modelPath){ #endif } +std::shared_ptr ClipProcessingJobs::PreviewObjectMask(std::string processInfoJson, std::shared_ptr frame){ +#ifdef USE_OPENCV + ProcessingController controller; + CVObjectMask objectMask(processInfoJson, controller); + return objectMask.PreviewSeedMask(frame); +#else + (void)processInfoJson; + (void)frame; + return std::shared_ptr(); +#endif +} + void ClipProcessingJobs::processClip(Clip& clip, std::string json){ processInfoJson 
= json; @@ -41,6 +53,9 @@ void ClipProcessingJobs::processClip(Clip& clip, std::string json){ if(processingType == "ObjectDetection"){ t = std::thread(&ClipProcessingJobs::detectObjectsClip, this, std::ref(clip), std::ref(this->processingController)); } + if(processingType == "ObjectMask"){ + t = std::thread(&ClipProcessingJobs::maskObjectClip, this, std::ref(clip), std::ref(this->processingController)); + } } // Apply object tracking to clip @@ -87,6 +102,21 @@ void ClipProcessingJobs::detectObjectsClip(Clip& clip, ProcessingController& con } } +// Apply object segmentation mask to clip +void ClipProcessingJobs::maskObjectClip(Clip& clip, ProcessingController& controller){ + CVObjectMask objectMask(processInfoJson, controller); + objectMask.maskClip(clip); + + if(controller.ShouldStop()){ + controller.SetFinished(true); + return; + } + else{ + objectMask.SaveObjMaskData(); + controller.SetFinished(true); + } +} + void ClipProcessingJobs::stabilizeClip(Clip& clip, ProcessingController& controller){ // create CVStabilization object CVStabilization stabilizer(processInfoJson, controller); diff --git a/src/ClipProcessingJobs.h b/src/ClipProcessingJobs.h index dff27b265..16b644f17 100644 --- a/src/ClipProcessingJobs.h +++ b/src/ClipProcessingJobs.h @@ -22,6 +22,7 @@ #include "CVStabilization.h" #include "CVTracker.h" #include "CVObjectDetection.h" + #include "CVObjectMask.h" #endif #include @@ -51,12 +52,15 @@ class ClipProcessingJobs{ void stabilizeClip(Clip& clip, ProcessingController& controller); // Apply object detection to clip void detectObjectsClip(Clip& clip, ProcessingController& controller); + // Apply object segmentation mask to clip + void maskObjectClip(Clip& clip, ProcessingController& controller); public: // Constructor ClipProcessingJobs(std::string processingType, std::string processInfoJson); static std::string ValidateONNXModel(std::string modelPath); + static std::shared_ptr PreviewObjectMask(std::string processInfoJson, std::shared_ptr frame); // 
Process clip accordingly to processingType void processClip(Clip& clip, std::string json); diff --git a/src/EffectInfo.cpp b/src/EffectInfo.cpp index 6281812b3..87e417dca 100644 --- a/src/EffectInfo.cpp +++ b/src/EffectInfo.cpp @@ -149,6 +149,9 @@ EffectBase* EffectInfo::CreateEffect(std::string effect_type) { else if(effect_type == "ObjectDetection") return new ObjectDetection(); + + else if(effect_type == "ObjectMask") + return new ObjectMask(); #endif return NULL; @@ -205,6 +208,7 @@ Json::Value EffectInfo::JsonValue() { root.append(Stabilizer().JsonInfo()); root.append(Tracker().JsonInfo()); root.append(ObjectDetection().JsonInfo()); + root.append(ObjectMask().JsonInfo()); #endif // return JsonValue diff --git a/src/Effects.h b/src/Effects.h index e3f8e7eb8..af19ed7ea 100644 --- a/src/Effects.h +++ b/src/Effects.h @@ -58,6 +58,7 @@ #ifdef USE_OPENCV #include "effects/Outline.h" #include "effects/ObjectDetection.h" +#include "effects/ObjectMask.h" #include "effects/Tracker.h" #include "effects/Stabilizer.h" #endif diff --git a/src/effects/ObjectMask.cpp b/src/effects/ObjectMask.cpp new file mode 100644 index 000000000..477ce2e8b --- /dev/null +++ b/src/effects/ObjectMask.cpp @@ -0,0 +1,327 @@ +/** + * @file + * @brief Source file for Object Mask effect class + * @author Jonathan Thomas + * + * @ref License + */ + +// Copyright (c) 2026 OpenShot Studios, LLC +// +// SPDX-License-Identifier: LGPL-3.0-or-later + +#include "effects/ObjectMask.h" + +#include "Exceptions.h" +#include "Frame.h" +#include "objdetectdata.pb.h" + +#define int64 opencv_broken_int +#define uint64 opencv_broken_uint +#include +#include +#undef uint64 +#undef int64 + +#include +#include +#include + +#include +#include + +using namespace openshot; + +namespace { + +QImage AlphaMaskImageFromRLE(const ObjectMaskFrameData& mask) +{ + QImage image(mask.width, mask.height, QImage::Format_ARGB32_Premultiplied); + image.fill(Qt::transparent); + if (!mask.HasData()) + return image; + + QRgb* data = 
reinterpret_cast(image.bits()); + const int total = mask.width * mask.height; + int offset = 0; + bool value = false; + for (uint32_t count : mask.rle) { + const int end = std::min(total, offset + static_cast(count)); + if (value) + std::fill(data + offset, data + end, qRgba(255, 255, 255, 255)); + offset = end; + value = !value; + if (offset >= total) + break; + } + return image; +} + +cv::Mat BinaryMaskFromImage(const QImage& image) +{ + QImage rgba = image.convertToFormat(QImage::Format_RGBA8888); + cv::Mat binary(rgba.height(), rgba.width(), CV_8UC1, cv::Scalar(0)); + for (int y = 0; y < rgba.height(); ++y) { + const uchar* source = rgba.constScanLine(y); + uchar* target = binary.ptr(y); + for (int x = 0; x < rgba.width(); ++x) + target[x] = source[x * 4 + 3] > 0 ? 255 : 0; + } + return binary; +} + +QImage StrokeImageFromMask(const QImage& alphaMask, int width) +{ + QImage result(alphaMask.size(), QImage::Format_ARGB32_Premultiplied); + result.fill(Qt::transparent); + if (width <= 0) + return result; + + cv::Mat binary = BinaryMaskFromImage(alphaMask); + cv::Mat dilated; + const int kernelSize = std::max(1, width * 2 + 1); + cv::Mat kernel = cv::getStructuringElement(cv::MORPH_ELLIPSE, cv::Size(kernelSize, kernelSize)); + cv::dilate(binary, dilated, kernel); + cv::Mat edge = dilated - binary; + + for (int y = 0; y < edge.rows; ++y) { + const uchar* edgeRow = edge.ptr(y); + QRgb* target = reinterpret_cast(result.scanLine(y)); + for (int x = 0; x < edge.cols; ++x) { + if (edgeRow[x]) + target[x] = qRgba(255, 255, 255, 255); + } + } + return result; +} + +} + +ObjectMask::ObjectMask() + : draw_mask(1.0) + , mask_color(83, 160, 237, 255) + , mask_alpha(120.0 / 255.0) + , stroke_color(255, 255, 255, 255) + , stroke_alpha(1.0) + , stroke_width(3.0) +{ + init_effect_details(); +} + +void ObjectMask::init_effect_details() +{ + InitEffectInfo(); + info.class_name = "ObjectMask"; + info.name = "Object Mask"; + info.description = "Create and draw a segmentation mask for 
a prompted object."; + info.has_audio = false; + info.has_video = true; + info.has_tracked_object = true; +} + +std::shared_ptr ObjectMask::GetFrame(std::shared_ptr frame, int64_t frame_number) +{ + std::shared_ptr frame_image = frame->GetImage(); + if (!frame_image || frame_image->isNull() || draw_mask.GetValue(frame_number) != 1) + return frame; + + auto mask_it = masksData.find(frame_number); + if (mask_it == masksData.end() || !mask_it->second.HasData()) + return frame; + + QImage alpha_mask = AlphaMaskImageFromRLE(mask_it->second) + .scaled(frame_image->size(), Qt::IgnoreAspectRatio, Qt::SmoothTransformation); + std::vector mask_rgba = mask_color.GetColorRGBA(frame_number); + QColor overlay_color(mask_rgba[0], mask_rgba[1], mask_rgba[2], 255 * mask_alpha.GetValue(frame_number)); + + QImage overlay(frame_image->size(), QImage::Format_ARGB32_Premultiplied); + overlay.fill(Qt::transparent); + QPainter overlay_painter(&overlay); + overlay_painter.setCompositionMode(QPainter::CompositionMode_Source); + overlay_painter.fillRect(overlay.rect(), overlay_color); + overlay_painter.setCompositionMode(QPainter::CompositionMode_DestinationIn); + overlay_painter.drawImage(0, 0, alpha_mask); + overlay_painter.end(); + + QPainter painter(frame_image.get()); + painter.drawImage(0, 0, overlay); + + const int strokeWidth = static_cast(std::round(stroke_width.GetValue(frame_number))); + if (strokeWidth > 0 && stroke_alpha.GetValue(frame_number) > 0.0) { + QImage stroke_mask = StrokeImageFromMask(alpha_mask, strokeWidth); + std::vector stroke_rgba = stroke_color.GetColorRGBA(frame_number); + QColor stroke_qcolor(stroke_rgba[0], stroke_rgba[1], stroke_rgba[2], 255 * stroke_alpha.GetValue(frame_number)); + + QImage stroke_overlay(frame_image->size(), QImage::Format_ARGB32_Premultiplied); + stroke_overlay.fill(Qt::transparent); + QPainter stroke_painter(&stroke_overlay); + stroke_painter.setCompositionMode(QPainter::CompositionMode_Source); + 
stroke_painter.fillRect(stroke_overlay.rect(), stroke_qcolor); + stroke_painter.setCompositionMode(QPainter::CompositionMode_DestinationIn); + stroke_painter.drawImage(0, 0, stroke_mask); + stroke_painter.end(); + painter.drawImage(0, 0, stroke_overlay); + } + painter.end(); + + return frame; +} + +bool ObjectMask::LoadObjMaskData(std::string inputFilePath) +{ + pb_objdetect::ObjDetect objMessage; + std::fstream input(inputFilePath, std::ios::in | std::ios::binary); + if (!objMessage.ParseFromIstream(&input)) + return false; + + masksData.clear(); + trackedObjects.clear(); + + auto trackedObject = std::make_shared(83, 160, 237, 255); + trackedObject->Id(Id().empty() ? "Object Mask" : Id() + "-1"); + trackedObject->ParentClip(this->ParentClip()); + trackedObject->draw_box = Keyframe(0.0); + trackedObject->draw_text = Keyframe(0.0); + trackedObject->draw_mask = draw_mask; + trackedObject->mask_alpha = mask_alpha; + trackedObject->mask_color = mask_color; + trackedObject->stroke = stroke_color; + trackedObject->stroke_alpha = stroke_alpha; + trackedObject->stroke_width = stroke_width; + + for (int frameIndex = 0; frameIndex < objMessage.frame_size(); ++frameIndex) { + const auto& pbFrame = objMessage.frame(frameIndex); + if (pbFrame.bounding_box_size() <= 0) + continue; + + const auto& box = pbFrame.bounding_box(0); + ObjectMaskFrameData mask; + mask.box = BBox(box.x() + box.w() / 2.0f, box.y() + box.h() / 2.0f, box.w(), box.h(), 0.0f); + mask.score = box.confidence(); + if (box.has_mask()) { + mask.width = box.mask().width(); + mask.height = box.mask().height(); + for (int rleIndex = 0; rleIndex < box.mask().rle_size(); ++rleIndex) + mask.rle.push_back(box.mask().rle(rleIndex)); + } + masksData[pbFrame.id()] = mask; + + trackedObject->AddBox(pbFrame.id(), mask.box.cx, mask.box.cy, mask.box.width, mask.box.height, 0.0f); + if (mask.HasData()) { + ObjectMaskData trackedMask; + trackedMask.width = mask.width; + trackedMask.height = mask.height; + trackedMask.rle = 
mask.rle; + trackedObject->AddMask(pbFrame.id(), trackedMask); + } + } + + if (!masksData.empty()) + trackedObjects[1] = trackedObject; + + google::protobuf::ShutdownProtobufLibrary(); + return true; +} + +std::shared_ptr ObjectMask::TrackedObjectMask(std::shared_ptr target_image, int64_t frame_number) const +{ + if (!target_image || target_image->isNull()) + return {}; + + auto mask_it = masksData.find(frame_number); + if (mask_it == masksData.end() || !mask_it->second.HasData()) + return {}; + + QImage alpha_mask = AlphaMaskImageFromRLE(mask_it->second) + .scaled(target_image->size(), Qt::IgnoreAspectRatio, Qt::SmoothTransformation); + + auto mask_image = std::make_shared( + target_image->width(), target_image->height(), QImage::Format_RGBA8888_Premultiplied); + mask_image->fill(QColor(0, 0, 0, 255)); + QPainter painter(mask_image.get()); + painter.drawImage(0, 0, alpha_mask); + painter.end(); + return mask_image; +} + +std::string ObjectMask::Json() const +{ + return JsonValue().toStyledString(); +} + +Json::Value ObjectMask::JsonValue() const +{ + Json::Value root = EffectBase::JsonValue(); + root["type"] = info.class_name; + root["protobuf_data_path"] = protobuf_data_path; + root["draw_mask"] = draw_mask.JsonValue(); + root["mask_color"] = mask_color.JsonValue(); + root["mask_alpha"] = mask_alpha.JsonValue(); + root["stroke_color"] = stroke_color.JsonValue(); + root["stroke_alpha"] = stroke_alpha.JsonValue(); + root["stroke_width"] = stroke_width.JsonValue(); + return root; +} + +void ObjectMask::SetJson(const std::string value) +{ + try { + SetJsonValue(openshot::stringToJson(value)); + } catch (const std::exception&) { + throw InvalidJSON("JSON is invalid (missing keys or invalid data types)"); + } +} + +void ObjectMask::SetJsonValue(const Json::Value root) +{ + EffectBase::SetJsonValue(root); + + if (!root["draw_mask"].isNull()) + draw_mask.SetJsonValue(root["draw_mask"]); + if (!root["mask_color"].isNull()) + mask_color.SetJsonValue(root["mask_color"]); + 
if (!root["mask_alpha"].isNull()) + mask_alpha.SetJsonValue(root["mask_alpha"]); + if (!root["stroke_color"].isNull()) + stroke_color.SetJsonValue(root["stroke_color"]); + if (!root["stroke"].isNull()) + stroke_color.SetJsonValue(root["stroke"]); + if (!root["stroke_alpha"].isNull()) + stroke_alpha.SetJsonValue(root["stroke_alpha"]); + if (!root["stroke_width"].isNull()) + stroke_width.SetJsonValue(root["stroke_width"]); + + if (!root["protobuf_data_path"].isNull()) { + std::string new_path = root["protobuf_data_path"].asString(); + if (protobuf_data_path != new_path || masksData.empty()) { + protobuf_data_path = new_path; + if (!LoadObjMaskData(protobuf_data_path)) + throw InvalidFile("Invalid object mask protobuf data path", ""); + } + } +} + +std::string ObjectMask::PropertiesJSON(int64_t requested_frame) const +{ + Json::Value root = BasePropertiesJSON(requested_frame); + root["protobuf_data_path"] = add_property_json("Object Mask Data", 0.0, "string", protobuf_data_path, NULL, -1, -1, false, requested_frame); + + root["draw_mask"] = add_property_json("Draw Mask", draw_mask.GetValue(requested_frame), "int", "", &draw_mask, 0, 1, false, requested_frame); + root["draw_mask"]["choices"].append(add_property_choice_json("Yes", true, draw_mask.GetValue(requested_frame))); + root["draw_mask"]["choices"].append(add_property_choice_json("No", false, draw_mask.GetValue(requested_frame))); + + root["mask_color"] = add_property_json("Mask Color", 0.0, "color", "", NULL, 0, 255, false, requested_frame); + root["mask_color"]["red"] = add_property_json("Red", mask_color.red.GetValue(requested_frame), "float", "", &mask_color.red, 0, 255, false, requested_frame); + root["mask_color"]["blue"] = add_property_json("Blue", mask_color.blue.GetValue(requested_frame), "float", "", &mask_color.blue, 0, 255, false, requested_frame); + root["mask_color"]["green"] = add_property_json("Green", mask_color.green.GetValue(requested_frame), "float", "", &mask_color.green, 0, 255, false, 
requested_frame); + root["mask_alpha"] = add_property_json("Mask Alpha", mask_alpha.GetValue(requested_frame), "float", "", &mask_alpha, 0.0, 1.0, false, requested_frame); + + root["stroke_color"] = add_property_json("Stroke Color", 0.0, "color", "", NULL, 0, 255, false, requested_frame); + root["stroke_color"]["red"] = add_property_json("Red", stroke_color.red.GetValue(requested_frame), "float", "", &stroke_color.red, 0, 255, false, requested_frame); + root["stroke_color"]["blue"] = add_property_json("Blue", stroke_color.blue.GetValue(requested_frame), "float", "", &stroke_color.blue, 0, 255, false, requested_frame); + root["stroke_color"]["green"] = add_property_json("Green", stroke_color.green.GetValue(requested_frame), "float", "", &stroke_color.green, 0, 255, false, requested_frame); + root["stroke_alpha"] = add_property_json("Stroke Alpha", stroke_alpha.GetValue(requested_frame), "float", "", &stroke_alpha, 0.0, 1.0, false, requested_frame); + root["stroke_width"] = add_property_json("Stroke Width", stroke_width.GetValue(requested_frame), "int", "", &stroke_width, 0, 50, false, requested_frame); + + return root.toStyledString(); +} diff --git a/src/effects/ObjectMask.h b/src/effects/ObjectMask.h new file mode 100644 index 000000000..3114af407 --- /dev/null +++ b/src/effects/ObjectMask.h @@ -0,0 +1,73 @@ +/** + * @file + * @brief Header file for Object Mask effect class + * @author Jonathan Thomas + * + * @ref License + */ + +// Copyright (c) 2026 OpenShot Studios, LLC +// +// SPDX-License-Identifier: LGPL-3.0-or-later + +#ifndef OPENSHOT_OBJECT_MASK_EFFECT_H +#define OPENSHOT_OBJECT_MASK_EFFECT_H + +#include "Color.h" +#include "EffectBase.h" +#include "KeyFrame.h" +#include "TrackedObjectBBox.h" + +#include + +namespace openshot +{ + class Frame; + + /* Per-frame mask record: mask width and height, a run-length list (rle), a bounding box and a score. */ struct ObjectMaskFrameData { + int width = 0; + int height = 0; + std::vector rle; + BBox box; + float score = 0.0f; + + /* True only when both dimensions are positive and rle is non-empty. */ bool HasData() const { return width > 0 && height > 0 && !rle.empty(); } + }; + + /** + * 
@brief Display and expose a preprocessed segmentation mask for an object. + */ + class ObjectMask : public EffectBase + { + private: + std::string protobuf_data_path; + /* NOTE(review): presumably the per-frame masks filled by LoadObjMaskData; confirm the key type and the loader in ObjectMask.cpp. */ std::map masksData; + + void init_effect_details(); + + public: + /* Style controls. PropertiesJSON passes 0.0 and 1.0 (presumably min and max) for mask_alpha and stroke_alpha, 0 and 50 with type int for stroke_width, and 0 and 255 for each stroke color channel. */ Keyframe draw_mask; + Color mask_color; + Keyframe mask_alpha; + Color stroke_color; + Keyframe stroke_alpha; + Keyframe stroke_width; + + ObjectMask(); + + std::shared_ptr GetFrame(std::shared_ptr frame, int64_t frame_number) override; + /* Convenience overload: forwards a freshly created object from std::make_shared to the two-argument GetFrame. */ std::shared_ptr GetFrame(int64_t frame_number) override { return GetFrame(std::make_shared(), frame_number); } + + bool LoadObjMaskData(std::string inputFilePath); + std::shared_ptr TrackedObjectMask(std::shared_ptr target_image, int64_t frame_number) const override; + + std::string Json() const override; + void SetJson(const std::string value) override; + Json::Value JsonValue() const override; + void SetJsonValue(const Json::Value root) override; + + std::string PropertiesJSON(int64_t requested_frame) const override; + }; +} + +#endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0167c9fab..4f915b307 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -91,6 +91,7 @@ if($CACHE{HAVE_OPENCV}) CVTracker CVStabilizer CVOutline + ObjectMask # CVObjectDetection ) endif() diff --git a/tests/ObjectMask.cpp b/tests/ObjectMask.cpp new file mode 100644 index 000000000..8b6432e5a --- /dev/null +++ b/tests/ObjectMask.cpp @@ -0,0 +1,163 @@ +/** + * @file + * @brief Unit tests for Object Mask effect + * @author Jonathan Thomas + * + * @ref License + */ + +// Copyright (c) 2026 OpenShot Studios, LLC +// +// SPDX-License-Identifier: LGPL-3.0-or-later + +#include "openshot_catch.h" + +#include "EffectInfo.h" +#include "Frame.h" +#include "Json.h" +#include "effects/ObjectMask.h" +#ifdef USE_OPENCV +#include "CVObjectMask.h" +#endif + +#include +#include + +#include +#include +#include +#include +#include + +using namespace openshot; + +/* Test helper: makes a frame of the given number and size, then fills its image with opaque gray (64, 64, 64, 255). */ static std::shared_ptr make_object_mask_frame(int64_t 
number, int width, int height) { + auto frame = std::make_shared(number, width, height, "#000000"); + frame->GetImage()->fill(QColor(64, 64, 64, 255)); + return frame; +} + +/* Test helper: derives a unique temp-file name. mkstemp creates and opens a file, which is then closed and removed, and the returned string is that name with a .data suffix appended. NOTE(review): the returned path itself is never reserved on disk, and the /tmp template, mkstemp and close are POSIX-only; confirm this is acceptable on every test platform. */ static std::string temp_object_mask_path() { + char path[] = "/tmp/libopenshot_object_mask_XXXXXX"; + int fd = mkstemp(path); + REQUIRE(fd != -1); + close(fd); + std::remove(path); + return std::string(path) + ".data"; +} + +/* Appends value to output as a base-128 varint: seven bits per byte, lowest bits first, with 0x80 set on every byte except the last. */ static void append_varint(std::string& output, uint64_t value) { + while (value >= 0x80) { + output.push_back(static_cast((value & 0x7f) | 0x80)); + value >>= 7; + } + output.push_back(static_cast(value)); +} + +/* Copies the bit pattern of value into a uint32_t and appends its four bytes, least-significant byte first. */ static void append_fixed32_float(std::string& output, float value) { + uint32_t bits; + std::memcpy(&bits, &value, sizeof(float)); + for (int i = 0; i < 4; ++i) + output.push_back(static_cast((bits >> (8 * i)) & 0xff)); +} + +/* Appends a varint tag of (field_number << 3) | 2, then the payload size as a varint, then the payload bytes. */ static void append_length_delimited(std::string& output, uint32_t field_number, const std::string& value) { + append_varint(output, (field_number << 3) | 2); + append_varint(output, static_cast(value.size())); + output.append(value); +} + +/* Test helper: hand-encodes the fixture bytes (a mask nested in a box, nested in a frame, nested in the top-level message), writes them to a temp file and returns the file path. */ static std::string create_object_mask_data() { + const std::string path = temp_object_mask_path(); + + /* Tags 8, 16 and 24 are varint fields 1, 2 and 3 (tag = field number << 3, wire type 0). The counts 0, 6 and 10 sum to the 16 pixels of a 4x4 mask. NOTE(review): presumably width, height and repeated run lengths; confirm against the .proto definition. */ std::string mask; + append_varint(mask, 8); + append_varint(mask, 4); + append_varint(mask, 16); + append_varint(mask, 4); + for (uint32_t count : {0u, 6u, 10u}) { + append_varint(mask, 24); + append_varint(mask, count); + } + + /* Tags 13, 21, 29, 37 and 53 are fixed32 fields 1 to 4 and 6 (wire type 5); tags 40 and 56 are varint fields 5 and 7; field 8 carries the nested mask bytes. */ std::string box; + append_varint(box, 13); + append_fixed32_float(box, 0.0f); + append_varint(box, 21); + append_fixed32_float(box, 0.0f); + append_varint(box, 29); + append_fixed32_float(box, 1.0f); + append_varint(box, 37); + append_fixed32_float(box, 1.0f); + append_varint(box, 40); + append_varint(box, 0); + append_varint(box, 53); + append_fixed32_float(box, 0.95f); + append_varint(box, 56); + append_varint(box, 1); + append_length_delimited(box, 8, mask); + + /* Tag 8 is varint field 1 with value 1; field 2 carries the nested box bytes. */ std::string frame; + append_varint(frame, 8); + append_varint(frame, 1); + append_length_delimited(frame, 2, box); + + 
/* Top-level message: field 1 carries the frame bytes and field 3 a text string. */ std::string data; + append_length_delimited(data, 1, frame); + append_length_delimited(data, 3, "object mask"); + + std::ofstream output(path, std::ios::out | std::ios::binary); + output.write(data.data(), static_cast(data.size())); + /* NOTE(review): the stream state is checked before any explicit flush or close; the file is only closed when output is destroyed on return, so a failed flush would go unnoticed here. */ REQUIRE(output.good()); + return path; +} + +/* Checks that EffectInfo creates an effect for the name ObjectMask and that its info reports the display name plus video and tracked-object support. */ TEST_CASE("ObjectMask effect is registered", "[effect][object_mask]") { + std::unique_ptr effect(EffectInfo().CreateEffect("ObjectMask")); + REQUIRE(effect != nullptr); + CHECK(effect->info.name == "Object Mask"); + CHECK(effect->info.has_video); + CHECK(effect->info.has_tracked_object); +} + +/* Loads the generated fixture through SetJsonValue, then checks the published properties, the mask returned by TrackedObjectMask and the pixels drawn by GetFrame. NOTE(review): the temp file is removed only by the last statement, so a failing REQUIRE earlier in the test leaves it behind. */ TEST_CASE("ObjectMask loads protobuf masks and exposes style controls", "[effect][object_mask]") { + const std::string protobuf_path = create_object_mask_data(); + + ObjectMask effect; + Json::Value config; + config["protobuf_data_path"] = protobuf_path; + config["mask_alpha"] = Keyframe(0.5).JsonValue(); + config["stroke_width"] = Keyframe(2.0).JsonValue(); + effect.SetJsonValue(config); + + Json::Value properties = stringToJson(effect.PropertiesJSON(1)); + CHECK(properties["draw_mask"]["name"].asString() == "Draw Mask"); + CHECK(properties["mask_color"]["name"].asString() == "Mask Color"); + CHECK(properties["mask_alpha"]["value"].asDouble() == Approx(0.5)); + CHECK(properties["stroke_color"]["name"].asString() == "Stroke Color"); + CHECK(properties["stroke_alpha"]["name"].asString() == "Stroke Alpha"); + CHECK(properties["stroke_width"]["value"].asDouble() == Approx(2.0)); + + /* The generated mask is expected to be set at pixel (0, 0) and clear at pixel (3, 3). */ auto generated_mask = effect.TrackedObjectMask(std::make_shared(4, 4, QImage::Format_RGBA8888_Premultiplied), 1); + REQUIRE(generated_mask != nullptr); + CHECK(generated_mask->pixelColor(0, 0).red() == 255); + CHECK(generated_mask->pixelColor(3, 3).red() == 0); + + /* Stroke width is set to zero before rendering; the checks below expect pixel (0, 0) to change from the gray fill and pixel (3, 3) to keep it. */ Json::Value no_stroke; + no_stroke["stroke_width"] = Keyframe(0.0).JsonValue(); + effect.SetJsonValue(no_stroke); + + auto frame = make_object_mask_frame(1, 4, 4); + auto output = effect.GetFrame(frame, 1)->GetImage(); + CHECK(output->pixelColor(0, 0) != QColor(64, 
64, 64, 255)); + CHECK(output->pixelColor(3, 3) == QColor(64, 64, 64, 255)); + + std::remove(protobuf_path.c_str()); +} + +#ifdef USE_OPENCV +/* Checks that validating a model path under /tmp that the test assumes is absent returns an error text containing the load-failure message. */ TEST_CASE("CVObjectMask validates a single EfficientSAM ONNX model path", "[effect][object_mask][opencv]") { + const std::string error = CVObjectMask::ValidateONNXModel("/tmp/libopenshot_missing_efficientsam.onnx"); + CHECK(error.find("Failed to load ONNX model") != std::string::npos); +} +#endif