From faafef0df5f12cd6d47ed9ed97e919c209b068ae Mon Sep 17 00:00:00 2001 From: Nikolaj Olsson Date: Mon, 15 Jun 2026 04:53:29 +0200 Subject: [PATCH] seconv: VobSub OCR + --time-codes-only for image-based subtitles Add a --time-codes-only flag to seconv that extracts time codes from image-based subtitles into a text format without OCR: each entry keeps its timing with empty text and no OCR engine is created, so it works without Tesseract/Paddle/etc. installed. Verified that SE re-opens the resulting empty-text SRT/ASSA files (timing preserved). Wire VobSub into the text/OCR pipeline (previously "use the UI"): - .sub + .idx pairs (text target) - VobSub-in-MKV (S_VOBSUB) - VobSub-in-MP4 (handler subp) Both full OCR and --time-codes-only are supported for all of these, reusing the existing VobSub bitmap decoder. Fix .sub routing: a binary VobSub .sub with no .idx companion is now detected (MPEG pack header) and read directly (stream PTS timing + default palette, with a note) instead of falling through to the MicroDVD text loader; a genuine text MicroDVD .sub still routes to the text loader. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- change-log.txt | 7 + docs/features/seconv.md | 3 +- docs/reference/command-line.md | 10 +- src/seconv/Commands/ConvertCommand.cs | 5 + src/seconv/Core/BitmapSubtitleLoader.cs | 127 ++++++++++++-- src/seconv/Core/ContainerSubtitleLoader.cs | 62 ++++++- src/seconv/Core/ImageOcrLoader.cs | 184 +++++++++++++++++---- src/seconv/Core/SubtitleConverter.cs | 27 ++- src/seconv/Helpers/HelpDisplay.cs | 4 + src/seconv/README.md | 10 +- tests/seconv/Core/ContainerLoaderTest.cs | 37 +++-- tests/seconv/Core/TimeCodesOnlyTest.cs | 62 +++++++ tests/seconv/Core/VobSubRoutingTest.cs | 73 ++++++++ 13 files changed, 539 insertions(+), 72 deletions(-) create mode 100644 tests/seconv/Core/TimeCodesOnlyTest.cs create mode 100644 tests/seconv/Core/VobSubRoutingTest.cs diff --git a/change-log.txt b/change-log.txt index 0609e6c08ec..75493f6a88f 100644 --- a/change-log.txt +++ b/change-log.txt @@ -23,6 +23,13 @@ Highlights since the 4.x line: ----------------------------------------------------------------------------------------------------- +v5.0.0-rc5 (TBD) + +* seconv: OCR VobSub subtitles (.sub/.idx, VobSub-in-MKV, VobSub-in-MP4) to text formats +* seconv: add --time-codes-only to extract time codes from image-based subtitles (.sup/VobSub/PGS/DVB-sub) without OCR + +----------------------------------------------------------------------------------------------------- + v5.0.0-rc4 (11th of June 2026) * Add Zonos TTS text-to-speech engine (CrispASR) diff --git a/docs/features/seconv.md b/docs/features/seconv.md index 47091cbdf2e..64d524b8467 100644 --- a/docs/features/seconv.md +++ b/docs/features/seconv.md @@ -7,7 +7,8 @@ seconv *.srt webvtt seconv movie.srt subrip --encoding:source --FixCommonErrors seconv movie.mkv subrip --track-number:3 seconv movie.sup subrip --ocr-engine:tesseract --ocr-language:eng -seconv movie.sup subrip --ocr-engine:binaryocr --ocr-db:Latin.db +seconv movie.sub subrip --ocr-engine:binaryocr --ocr-db:Latin.db # 
VobSub (.idx auto-detected) +seconv movie.sup subrip --time-codes-only ``` For full usage, options, OCR setup, operations pipeline, examples, and exit codes, see the canonical reference: diff --git a/docs/reference/command-line.md b/docs/reference/command-line.md index 0a20b0e2dce..8ecdc0829da 100644 --- a/docs/reference/command-line.md +++ b/docs/reference/command-line.md @@ -164,6 +164,7 @@ If two tracks share a language, the track number is added: `movie.#3.eng.srt`. | `--ocr-db:` | OCR database file: `.nocr` for `nocr`, `.db` for `binaryocr` (required for both) | | `--ollama-url:` | Default `http://localhost:11434/api/chat` | | `--ollama-model:` | Default `llama3.2-vision` | +| `--time-codes-only` | Image sources (`.sup`, VobSub `.sub`/`.idx`, MKV PGS/VobSub, MP4 VobSub, TS DVB-sub) → text format with time codes only and empty text. **Skips OCR entirely** — no OCR engine required. Ignored for text inputs and image output targets. | > **OCR database files are not bundled with `seconv`.** The `nocr` and `binaryocr` engines need a `.nocr` or `.db` file passed via `--ocr-db`. 
Sources: > @@ -183,11 +184,18 @@ seconv movie.sup subrip --ocr-engine:nocr --ocr-db:"C:\Users\me\AppData\Roaming\ # BinaryOCR seconv movie.sup subrip --ocr-engine:binaryocr --ocr-db:"C:\Users\me\AppData\Roaming\Subtitle Edit\Ocr\Latin.db" -# MKV with image (PGS) tracks — OCR runs automatically +# MKV with image (PGS or VobSub) tracks — OCR runs automatically seconv movie.mkv subrip --ocr-engine:tesseract --ocr-language:eng +# VobSub .sub + .idx pair — the .idx companion is auto-detected +seconv movie.sub subrip --ocr-engine:tesseract --ocr-language:eng + # Transport-stream teletext (no OCR needed) seconv broadcast.ts subrip + +# Time codes only — extract timing with no OCR (empty text); works for any image source +seconv movie.sup subrip --time-codes-only +seconv movie.sub subrip --time-codes-only ``` ### Templates / replacements diff --git a/src/seconv/Commands/ConvertCommand.cs b/src/seconv/Commands/ConvertCommand.cs index 307fb03ad19..b5f12770f43 100644 --- a/src/seconv/Commands/ConvertCommand.cs +++ b/src/seconv/Commands/ConvertCommand.cs @@ -86,6 +86,10 @@ public sealed class Settings : CommandSettings [Description("Path to a .nocr file (--ocr-engine=nocr) or .db file (--ocr-engine=binaryocr)")] public string? OcrDb { get; init; } + [CommandOption("--time-codes-only|--timecodesonly")] + [Description("For image-based sources (.sup, VobSub .sub/.idx, MKV PGS/VobSub, MP4 VobSub, TS DVB-sub): output time codes only with empty text; skips OCR (no OCR engine required)")] + public bool TimeCodesOnly { get; init; } + [CommandOption("--ollama-url")] [Description("Ollama API endpoint (default: http://localhost:11434/api/chat)")] public string? OllamaUrl { get; init; } @@ -449,6 +453,7 @@ protected override async Task ExecuteAsync(CommandContext context, Settings OcrEngine = string.IsNullOrWhiteSpace(settings.OcrEngine) ? "tesseract" : settings.OcrEngine, OcrLanguage = settings.OcrLanguage ?? 
"eng", OcrDb = settings.OcrDb, + TimeCodesOnly = settings.TimeCodesOnly, OllamaUrl = settings.OllamaUrl, OllamaModel = settings.OllamaModel, TeletextOnly = settings.TeletextOnly, diff --git a/src/seconv/Core/BitmapSubtitleLoader.cs b/src/seconv/Core/BitmapSubtitleLoader.cs index dae97106e2e..0ed68977590 100644 --- a/src/seconv/Core/BitmapSubtitleLoader.cs +++ b/src/seconv/Core/BitmapSubtitleLoader.cs @@ -1,6 +1,7 @@ using Nikse.SubtitleEdit.Core.BluRaySup; using Nikse.SubtitleEdit.Core.Common; using Nikse.SubtitleEdit.Core.ContainerFormats.Matroska; +using Nikse.SubtitleEdit.Core.ContainerFormats.Mp4.Boxes; using Nikse.SubtitleEdit.Core.ContainerFormats.TransportStream; using Nikse.SubtitleEdit.Core.VobSub; using SkiaSharp; @@ -63,22 +64,19 @@ public static IReadOnlyList LoadMatroskaPgs(MatroskaFile mat } /// - /// VobSub .sub + .idx pair → bitmap events. Uses - /// so the .idx provides timing + palette - /// and the .sub provides the subpicture stream payload. The VobSub spec doesn't - /// store a screen size in the index file, so we bake in the DVD-standard frame - /// sizes (720x576 PAL, 720x480 NTSC) — otherwise the output writer would fall - /// back to --resolution / 1920x1080, which is wrong metadata for DVD - /// sources. + /// VobSub .sub (+ optional .idx) → bitmap events. Uses + /// , which uses the .idx for timing + palette when + /// present and otherwise parses the .sub's MPEG-PS stream directly (stream PTS timing + + /// a default palette). The VobSub spec doesn't store a screen size in the index file, so + /// we bake in the DVD-standard frame sizes (720x576 PAL, 720x480 NTSC) — otherwise the + /// output writer would fall back to --resolution / 1920x1080, which is wrong + /// metadata for DVD sources. 
/// public static IReadOnlyList LoadVobSub(string subPath, string idxPath, bool isPal) { - if (!File.Exists(idxPath)) - { - throw new InvalidOperationException($"VobSub .idx companion not found at: {idxPath}"); - } - var parser = new VobSubParser(isPal); + // OpenSubIdx falls back to parsing the .sub stream directly when the .idx is missing, + // so an absent companion is not fatal — see IsBinaryVobSub for the caller's gate. parser.OpenSubIdx(subPath, idxPath); var packs = parser.MergeVobSubPacks(); if (packs.Count == 0) @@ -102,6 +100,111 @@ public static IReadOnlyList LoadVobSub(string subPath, strin return items; } + /// + /// True if the file begins with an MPEG-2 pack header (00 00 01 BA), i.e. it's a + /// binary VobSub subpicture stream rather than a text MicroDVD .sub. Used to decide + /// whether a .sub without an .idx companion is a VobSub (read it directly, + /// with a warning) or a plain text subtitle (fall through to the text loader). + /// + public static bool IsBinaryVobSub(string filePath) + { + try + { + var header = new byte[4]; + using var fs = File.OpenRead(filePath); + return fs.Read(header, 0, 4) == 4 && VobSubParser.IsMpeg2PackHeader(header); + } + catch + { + // I/O race / permissions — let the text loader try rather than hard-failing here. + return false; + } + } + + /// + /// VobSub track inside an MKV (S_VOBSUB) → bitmap events. The subpicture packets + /// live in the Matroska blocks; the per-pack timing comes from the block Start/End (not + /// the SubPicture's own delay, which only applies to standalone .sub+.idx). + /// Mirrors the desktop batch converter's LoadVobSubFromMatroska; the palette is + /// left to 's default, matching the GUI's OCR path. 
+ /// + public static IReadOnlyList LoadMatroskaVobSub(MatroskaFile matroska, MatroskaTrackInfo track) + { + if (track.ContentEncodingType == 1) + { + throw new InvalidOperationException( + $"VobSub MKV track #{track.TrackNumber} is compressed (content encoding 1), which isn't supported."); + } + + var sub = matroska.GetSubtitle(track.TrackNumber, null); + var packs = new List(sub.Count); + foreach (var p in sub) + { + packs.Add(new VobSubMergedPack(p.GetData(track), TimeSpan.FromMilliseconds(p.Start), 32, null) + { + EndTime = TimeSpan.FromMilliseconds(p.End), + }); + + // Fix overlapping time codes (some Handbrake versions emit them) by clamping the + // previous pack's end to just before this one's start. + if (packs.Count > 1 && packs[^2].EndTime > packs[^1].StartTime) + { + packs[^2].EndTime = TimeSpan.FromMilliseconds(packs[^1].StartTime.TotalMilliseconds - 1); + } + } + + var items = new List(packs.Count); + foreach (var pack in packs) + { + var bmp = pack.GetBitmap(); + if (bmp is null) + { + continue; + } + // Use the block-derived Start/End (TimeSpan), not StartTimeCode/EndTimeCode which + // are based on the SubPicture delay and only correct for .sub+.idx sources. + items.Add(new BitmapSubtitleItem( + new TimeCode(pack.StartTime.TotalMilliseconds), + new TimeCode(pack.EndTime.TotalMilliseconds), + bmp)); + } + if (items.Count == 0) + { + throw new InvalidOperationException($"No VobSub subtitles in MKV track #{track.TrackNumber}."); + } + return items; + } + + /// + /// VobSub track inside an MP4 (handler type subp, e.g. produced by MP4Box) → + /// bitmap events. The decoded subpictures and their timing are parsed by libse's + /// ; index i of SubPictures lines up with paragraph + /// i. Mirrors the desktop OcrSubtitleMp4VobSub (palette left to default). 
+ /// + public static IReadOnlyList LoadMp4VobSub(Trak track) + { + var paragraphs = track.Mdia.Minf.Stbl.GetParagraphs(); + var subPictures = track.Mdia.Minf.Stbl.SubPictures; + var count = Math.Min(paragraphs.Count, subPictures.Count); + + var items = new List(count); + for (var i = 0; i < count; i++) + { + var bmp = subPictures[i].GetBitmap( + null, SKColors.Transparent, SKColors.Black, SKColors.White, SKColors.Black, false); + if (bmp is null) + { + continue; + } + items.Add(new BitmapSubtitleItem(paragraphs[i].StartTime, paragraphs[i].EndTime, bmp)); + } + if (items.Count == 0) + { + throw new InvalidOperationException("No VobSub subpictures found in MP4 track."); + } + return items; + } + /// /// Transport stream DVB-sub → one bitmap list per packet ID. Caller is responsible /// for routing each PID to its own output file (multiple subtitle streams = diff --git a/src/seconv/Core/ContainerSubtitleLoader.cs b/src/seconv/Core/ContainerSubtitleLoader.cs index c4c96f04459..0b846421c15 100644 --- a/src/seconv/Core/ContainerSubtitleLoader.cs +++ b/src/seconv/Core/ContainerSubtitleLoader.cs @@ -61,6 +61,31 @@ public sealed record LoadedTrack( return LoadBluRaySup(filePath, options); } + if (ext == ".sub") + { + var idxPath = Path.ChangeExtension(filePath, ".idx"); + if (File.Exists(idxPath)) + { + return LoadVobSub(filePath, idxPath, options); + } + + // No .idx companion. A binary VobSub .sub can still be read — the MPEG-PS packets + // carry their own PTS timing and a default palette is used — so read it (with a + // note) rather than letting it fall through to the MicroDVD text loader, which + // would misparse the binary and surface a confusing "no subtitles found" error. A + // genuine text MicroDVD .sub starts with text, not the MPEG pack header, so it + // returns null here and is handled by the text loader. 
+ if (BitmapSubtitleLoader.IsBinaryVobSub(filePath)) + { + AnsiConsole.MarkupLine( + $"[yellow]Note: VobSub '.sub' has no '.idx' companion ({Path.GetFileName(idxPath).EscapeMarkup()}); " + + "reading timing from the stream and using a default color palette.[/]"); + return LoadVobSub(filePath, idxPath, options); + } + + return null; + } + if (ext is ".ts" or ".m2ts" or ".mts") { return LoadTransportStream(filePath, options); @@ -223,7 +248,18 @@ private static List LoadMatroska(string filePath, ConversionOptions if (track.CodecId.Equals("S_VOBSUB", StringComparison.OrdinalIgnoreCase)) { - AnsiConsole.MarkupLine($"[yellow]Warning: skipping VobSub MKV track #{track.TrackNumber} — VobSub OCR not yet supported in seconv. Use Subtitle Edit (UI) for now.[/]"); + try + { + var vobSub = ImageOcrLoader.LoadMatroskaVobSub(matroska, track, options); + if (vobSub.Paragraphs.Count > 0) + { + tracks.Add(new LoadedTrack(vobSub, new SubRip(), SanitizeLang(track.Language), track.TrackNumber)); + } + } + catch (Exception ex) + { + AnsiConsole.MarkupLine($"[yellow]Warning: VobSub OCR failed on MKV track #{track.TrackNumber}: {ex.Message.EscapeMarkup()}[/]"); + } continue; } @@ -269,7 +305,19 @@ private static List LoadMp4(string filePath, ConversionOptions opti } if (trak.Mdia.IsVobSubSubtitle) { - AnsiConsole.MarkupLine($"[yellow]Warning: skipping VobSub MP4 track #{trackId} — OCR is not yet supported.[/]"); + try + { + var vobSub = ImageOcrLoader.LoadMp4VobSub(trak, options); + if (vobSub.Paragraphs.Count > 0) + { + var vobLang = LanguageAutoDetect.AutoDetectGoogleLanguageOrNull(vobSub) ?? 
string.Empty; + tracks.Add(new LoadedTrack(vobSub, new SubRip(), vobLang, trackId)); + } + } + catch (Exception ex) + { + AnsiConsole.MarkupLine($"[yellow]Warning: VobSub OCR failed on MP4 track #{trackId}: {ex.Message.EscapeMarkup()}[/]"); + } continue; } @@ -315,6 +363,16 @@ private static List LoadBluRaySup(string filePath, ConversionOption return [new LoadedTrack(subtitle, new SubRip(), string.Empty, null)]; } + private static List LoadVobSub(string subPath, string idxPath, ConversionOptions options) + { + var subtitle = ImageOcrLoader.LoadVobSub(subPath, idxPath, options); + if (subtitle.Paragraphs.Count == 0) + { + throw new InvalidOperationException($"No subtitles recognised in VobSub file: {subPath}"); + } + return [new LoadedTrack(subtitle, new SubRip(), string.Empty, null)]; + } + private static List LoadTransportStream(string filePath, ConversionOptions options) { var tracks = new List(); diff --git a/src/seconv/Core/ImageOcrLoader.cs b/src/seconv/Core/ImageOcrLoader.cs index 636e1971c87..7878bc94235 100644 --- a/src/seconv/Core/ImageOcrLoader.cs +++ b/src/seconv/Core/ImageOcrLoader.cs @@ -1,6 +1,7 @@ using Nikse.SubtitleEdit.Core.BluRaySup; using Nikse.SubtitleEdit.Core.Common; using Nikse.SubtitleEdit.Core.ContainerFormats.Matroska; +using Nikse.SubtitleEdit.Core.ContainerFormats.Mp4.Boxes; using Nikse.SubtitleEdit.Core.ContainerFormats.TransportStream; using SkiaSharp; using Spectre.Console; @@ -16,7 +17,9 @@ namespace SeConv.Core; internal static class ImageOcrLoader { /// - /// Blu-Ray .sup → text via Tesseract. + /// Blu-Ray .sup → text via the configured OCR engine. When + /// is set, OCR is skipped entirely and + /// each entry keeps its timing with empty text — no OCR engine is even created. 
/// public static Subtitle LoadBluRaySup(string filePath, ConversionOptions options) { @@ -27,13 +30,20 @@ public static Subtitle LoadBluRaySup(string filePath, ConversionOptions options) throw new InvalidOperationException($"No Blu-Ray sup subtitles found in: {filePath}"); } + if (options.TimeCodesOnly) + { + AnsiConsole.MarkupLine($"[dim]Extracting time codes from {pcsList.Count} Blu-Ray sup image(s) (no OCR)...[/]"); + return PcsListToSubtitle(pcsList, null); + } + using var ocr = OcrEngineFactory.Create(options); AnsiConsole.MarkupLine($"[dim]Running {ocr.Name} OCR on {pcsList.Count} Blu-Ray sup image(s)...[/]"); - return OcrPcsList(pcsList, ocr); + return PcsListToSubtitle(pcsList, ocr); } /// - /// MKV PGS track (S_HDMV/PGS) → text via the configured OCR engine. + /// MKV PGS track (S_HDMV/PGS) → text via the configured OCR engine, or time codes only + /// when ConversionOptions.TimeCodesOnly is set. /// public static Subtitle LoadMatroskaPgs(MatroskaFile matroska, MatroskaTrackInfo track, ConversionOptions options) { @@ -42,15 +52,23 @@ public static Subtitle LoadMatroskaPgs(MatroskaFile matroska, MatroskaTrackInfo { throw new InvalidOperationException($"No PGS subtitles in MKV track #{track.TrackNumber}."); } + + if (options.TimeCodesOnly) + { + AnsiConsole.MarkupLine($"[dim]Extracting time codes from {pcsList.Count} MKV PGS image(s) (track #{track.TrackNumber}, no OCR)...[/]"); + return PcsListToSubtitle(pcsList, null); + } + using var ocr = OcrEngineFactory.Create(options); AnsiConsole.MarkupLine($"[dim]Running {ocr.Name} OCR on {pcsList.Count} MKV PGS image(s) (track #{track.TrackNumber})...[/]"); - return OcrPcsList(pcsList, ocr); + return PcsListToSubtitle(pcsList, ocr); } /// - /// Transport stream DVB-sub → text via Tesseract. Returns one Subtitle per packet ID - /// that has subtitles. Teletext PIDs are not handled here (they're already text; - /// see ). + /// Transport stream DVB-sub → text via the configured OCR engine, or time codes only + /// when ConversionOptions.TimeCodesOnly is set. 
Returns one Subtitle per + /// packet ID that has subtitles. Teletext PIDs are not handled here (they're already + /// text; see ). /// public static List<(Subtitle Subtitle, int PacketId)> LoadTransportStreamDvbSub(string filePath, ConversionOptions options) { @@ -63,48 +81,146 @@ public static Subtitle LoadMatroskaPgs(MatroskaFile matroska, MatroskaTrackInfo return results; } - using var ocr = OcrEngineFactory.Create(options); - - foreach (var pid in parser.SubtitlePacketIds) + // Time-codes-only needs no recognition, so don't create (or require) an OCR engine. + IOcrEngine? ocr = options.TimeCodesOnly ? null : OcrEngineFactory.Create(options); + try { - var dvbSubtitles = parser.GetDvbSubtitles(pid); - if (dvbSubtitles.Count == 0) - { - continue; - } - - AnsiConsole.MarkupLine($"[dim]Running {ocr.Name} OCR on {dvbSubtitles.Count} DVB-sub image(s) (PID {pid})...[/]"); - var subtitle = new Subtitle(); - foreach (var dvb in dvbSubtitles) + foreach (var pid in parser.SubtitlePacketIds) { - var bitmap = dvb.GetBitmap(); - if (bitmap is null) + var dvbSubtitles = parser.GetDvbSubtitles(pid); + if (dvbSubtitles.Count == 0) { continue; } - try + + AnsiConsole.MarkupLine(ocr is null + ? $"[dim]Extracting time codes from {dvbSubtitles.Count} DVB-sub image(s) (PID {pid}, no OCR)...[/]" + : $"[dim]Running {ocr.Name} OCR on {dvbSubtitles.Count} DVB-sub image(s) (PID {pid})...[/]"); + var subtitle = new Subtitle(); + foreach (var dvb in dvbSubtitles) { - var text = ocr.Recognize(bitmap); - if (!string.IsNullOrWhiteSpace(text)) + var bitmap = dvb.GetBitmap(); + if (bitmap is null) + { + continue; + } + try { - subtitle.Paragraphs.Add(new LibSeParagraph(text, (double)dvb.StartMilliseconds, (double)dvb.EndMilliseconds)); + // ocr == null → time-codes-only: keep the entry with empty text. + var text = ocr is null ? 
string.Empty : ocr.Recognize(bitmap); + if (ocr is null || !string.IsNullOrWhiteSpace(text)) + { + subtitle.Paragraphs.Add(new LibSeParagraph(text, (double)dvb.StartMilliseconds, (double)dvb.EndMilliseconds)); + } + } + finally + { + bitmap.Dispose(); } } - finally + subtitle.Renumber(); + if (subtitle.Paragraphs.Count > 0) { - bitmap.Dispose(); + results.Add((subtitle, pid)); } } - subtitle.Renumber(); - if (subtitle.Paragraphs.Count > 0) + } + finally + { + ocr?.Dispose(); + } + return results; + } + + /// + /// VobSub .sub (+ optional .idx) → text via the configured OCR engine, or time + /// codes only when ConversionOptions.TimeCodesOnly is set. + /// + public static Subtitle LoadVobSub(string subPath, string idxPath, ConversionOptions options) + { + // IsPal default mirrors BitmapSubtitleLoader / VobSubExtractor — a wrong guess only + // affects timing scale, not bitmap content, so OCR output is unaffected. + // NOTE(review): timing scale does matter for --time-codes-only; confirm the PAL default + // is acceptable there, especially when no .idx supplies the timing. + var items = BitmapSubtitleLoader.LoadVobSub(subPath, idxPath, isPal: true); + return OcrBitmapItems(items, options, $"{items.Count} VobSub image(s)"); + } + + /// + /// VobSub MKV track (S_VOBSUB) → text via the configured OCR engine, or time + /// codes only when ConversionOptions.TimeCodesOnly is set. + /// + public static Subtitle LoadMatroskaVobSub(MatroskaFile matroska, MatroskaTrackInfo track, ConversionOptions options) + { + var items = BitmapSubtitleLoader.LoadMatroskaVobSub(matroska, track); + return OcrBitmapItems(items, options, $"{items.Count} MKV VobSub image(s) (track #{track.TrackNumber})"); + } + + /// + /// VobSub MP4 track (handler subp) → text via the configured OCR engine, or time + /// codes only when ConversionOptions.TimeCodesOnly is set. 
+ /// + public static Subtitle LoadMp4VobSub(Trak track, ConversionOptions options) + { + var items = BitmapSubtitleLoader.LoadMp4VobSub(track); + return OcrBitmapItems(items, options, $"{items.Count} MP4 VobSub image(s)"); + } + + /// + /// Shared driver for the VobSub sources: recognises each pre-decoded bitmap to text + /// (or keeps timing with empty text in time-codes-only mode), disposing the bitmaps + /// afterwards. The OCR engine is only created when recognition is actually needed. + /// + private static Subtitle OcrBitmapItems( + IReadOnlyList items, ConversionOptions options, string what) + { + try + { + if (options.TimeCodesOnly) { - results.Add((subtitle, pid)); + AnsiConsole.MarkupLine($"[dim]Extracting time codes from {what} (no OCR)...[/]"); + return BitmapItemsToSubtitle(items, null); + } + + using var ocr = OcrEngineFactory.Create(options); + AnsiConsole.MarkupLine($"[dim]Running {ocr.Name} OCR on {what}...[/]"); + return BitmapItemsToSubtitle(items, ocr); + } + finally + { + foreach (var item in items) + { + item.Dispose(); } } - return results; } - private static Subtitle OcrPcsList(List pcsList, IOcrEngine ocr) + /// + /// Turns pre-decoded bitmap events into a Subtitle. null = + /// time-codes-only (empty text kept); non-null = recognise each bitmap and drop blanks. + /// + private static Subtitle BitmapItemsToSubtitle( + IReadOnlyList items, IOcrEngine? ocr) + { + var subtitle = new Subtitle(); + foreach (var item in items) + { + var text = ocr is null ? string.Empty : ocr.Recognize(item.Bitmap); + if (ocr is null || !string.IsNullOrWhiteSpace(text)) + { + subtitle.Paragraphs.Add(new LibSeParagraph( + text, item.StartTime.TotalMilliseconds, item.EndTime.TotalMilliseconds)); + } + } + subtitle.Renumber(); + return subtitle; + } + + /// + /// Turns a PCS list into a Subtitle. 
When is non-null each + /// bitmap is recognised to text; when it's null (time-codes-only mode) every entry is + /// kept with empty text so the output carries timing but no recognised characters. + /// Entries whose bitmap is null (e.g. clear-screen commands) are skipped in both modes. + /// + private static Subtitle PcsListToSubtitle(List pcsList, IOcrEngine? ocr) { var subtitle = new Subtitle(); @@ -117,8 +233,8 @@ private static Subtitle OcrPcsList(List pcsList, IOcrEn } try { - var text = ocr.Recognize(bitmap); - if (!string.IsNullOrWhiteSpace(text)) + var text = ocr is null ? string.Empty : ocr.Recognize(bitmap); + if (ocr is null || !string.IsNullOrWhiteSpace(text)) { subtitle.Paragraphs.Add(new LibSeParagraph(text, pcs.StartTime / 90.0, pcs.EndTime / 90.0)); } diff --git a/src/seconv/Core/SubtitleConverter.cs b/src/seconv/Core/SubtitleConverter.cs index 56c0e6e6814..5da0e2fa03c 100644 --- a/src/seconv/Core/SubtitleConverter.cs +++ b/src/seconv/Core/SubtitleConverter.cs @@ -143,7 +143,7 @@ private async Task ConvertVobBatchAsync(List vobFiles, { result.Errors.Add( "VOB input is currently only supported with target format 'VobSub'. " - + "Re-run with --format VobSub to extract subtitles to .sub + .idx; the GUI is still required for OCR to text."); + + "Re-run with --format VobSub to extract subtitles to .sub + .idx, then convert that .sub to a text format (e.g. seconv movie.sub subrip) to OCR it."); result.FailedFiles = vobFiles.Count; return result; } @@ -275,11 +275,21 @@ private async Task TryConvertImageToImageAsync(string inputFile, Conversio if (ext == ".sub") { - // Treat .sub as VobSub input only when an .idx companion exists — otherwise - // (e.g. a text MicroDVD .sub) leave it for the text loader to detect via IsMine. + // Treat .sub as VobSub input when an .idx companion exists, or when the .sub is a + // binary VobSub stream even without one (read directly, default palette). 
A text + // MicroDVD .sub starts with text, not the MPEG pack header, so it's left for the + // text loader to detect via IsMine. var idxPath = Path.ChangeExtension(inputFile, ".idx"); - if (File.Exists(idxPath)) + var hasIdx = File.Exists(idxPath); + if (hasIdx || BitmapSubtitleLoader.IsBinaryVobSub(inputFile)) { + if (!hasIdx && !options.Quiet) + { + AnsiConsole.MarkupLine( + $"[yellow]Note: VobSub '.sub' has no '.idx' companion ({Path.GetFileName(idxPath).EscapeMarkup()}); " + + "reading timing from the stream and using a default color palette.[/]"); + } + // IsPal: default to PAL to match VobSubExtractor. The .idx "size:" field // could disambiguate per-file, but a wrong guess only affects timing scale, // not bitmap content. @@ -838,6 +848,15 @@ internal record class ConversionOptions /// Path to a .nocr database file (required when OcrEngine == "nocr"). public string? OcrDb { get; init; } + /// + /// When true, image-based sources (Blu-Ray .sup, VobSub .sub+.idx, + /// MKV PGS/VobSub, MP4 VobSub, TS DVB-sub) are converted to a text format keeping only + /// their time codes — OCR is skipped entirely and each entry's text is left empty. No + /// OCR engine is created, so this works without Tesseract/Paddle/etc. installed. Ignored + /// for text inputs and image output targets. + /// + public bool TimeCodesOnly { get; init; } + /// Ollama API endpoint (default http://localhost:11434/api/chat). public string? OllamaUrl { get; init; } diff --git a/src/seconv/Helpers/HelpDisplay.cs b/src/seconv/Helpers/HelpDisplay.cs index 3bf6ad64559..b730ac28624 100644 --- a/src/seconv/Helpers/HelpDisplay.cs +++ b/src/seconv/Helpers/HelpDisplay.cs @@ -42,6 +42,7 @@ public static void ShowHelp() ShowParameter("--ocr-engine:", "OCR engine: tesseract | nocr | binaryocr | ollama | paddle"); ShowParameter("--ocr-language:", "Language for OCR (e.g. 
eng, deu, spa)"); ShowParameter("--ocr-db:", ".nocr (--ocr-engine=nocr) or .db (--ocr-engine=binaryocr)"); + ShowParameter("--time-codes-only", "Image sources (.sup/VobSub/PGS/DVB) -> text with time codes only; skips OCR"); ShowParameter("--ollama-url:", "Ollama API endpoint (default: http://localhost:11434/api/chat)"); ShowParameter("--ollama-model:", "Ollama vision model (default: llama3.2-vision)"); ShowParameter("--multiple-replace:", "SE MultipleSearchAndReplaceGroups XML applied per paragraph"); @@ -110,6 +111,9 @@ public static void ShowHelp() ShowExample( "seconv movie.sup subrip --ocr-engine:nocr --ocr-db:Latin.nocr", "OCR a Blu-Ray .sup using nOCR"); + ShowExample( + "seconv movie.sup subrip --time-codes-only", + "Extract only the time codes from a .sup (no OCR; empty text)"); ShowExample( "seconv subs.srt customtext --custom-format:my-template.xml", "Render via a custom text format template"); diff --git a/src/seconv/README.md b/src/seconv/README.md index 286a65fadf9..8a053951271 100644 --- a/src/seconv/README.md +++ b/src/seconv/README.md @@ -8,7 +8,8 @@ operations, and OCR engines as the desktop UI — without an Avalonia / GUI depe - 380+ subtitle formats (text, binary, image-based) - Container input: Matroska (.mkv/.mks), MP4, MCC, MXF, transport stream teletext -- OCR pipelines for image-based sources (Blu-Ray .sup, MKV PGS, DVB-sub) +- OCR pipelines for image-based sources (Blu-Ray .sup, VobSub .sub/.idx, MKV PGS/VobSub, MP4 VobSub, TS DVB-sub) +- Time-codes-only extraction (`--time-codes-only`): timing without OCR for any of the above - Five OCR engines: Tesseract subprocess, nOCR (built-in), BinaryOCR (built-in), Ollama (HTTP), PaddleOCR subprocess - Image-based output: Blu-Ray sup, BDN-XML, DOST, FCP, D-Cinema interop / SMPTE 2014, images-with-time-code, WebVTT thumbnails - Image-to-image conversion (preserve source bitmaps, no OCR): Blu-Ray .sup, VobSub .sub/.idx, MKV PGS, TS DVB-sub → any image output format @@ -45,6 +46,10 @@ seconv movie.sup 
subrip --ocr-engine:tesseract --ocr-language:eng # OCR a Blu-R seconv movie.sup subrip --ocr-engine:nocr --ocr-db:Latin.nocr # OCR via nOCR seconv movie.sup subrip --ocr-engine:binaryocr --ocr-db:Latin.db # OCR via BinaryOCR seconv movie.sup subrip --ocr-engine:ollama --ollama-model:llama3.2-vision +seconv movie.sub subrip --ocr-engine:tesseract --ocr-language:eng # OCR a VobSub .sub (.idx auto-detected) +seconv movie.mkv subrip --ocr-engine:nocr --ocr-db:Latin.nocr # OCR MKV PGS/VobSub tracks +seconv movie.sup subrip --time-codes-only # time codes only, no OCR +seconv movie.sub subrip --time-codes-only # VobSub time codes only, no OCR seconv subs.srt bluraysup --resolution:1920x1080 # render text → Blu-Ray sup seconv subs.srt bdnxml --resolution:1920x1080 # render text → BDN-XML @@ -151,6 +156,7 @@ already has and only repair true overlaps. | `--ocr-db:` | OCR database file: `.nocr` for `nocr`, `.db` for `binaryocr` (required for both) | | `--ollama-url:` | Default `http://localhost:11434/api/chat` | | `--ollama-model:` | Default `llama3.2-vision` | +| `--time-codes-only` | Image sources (`.sup`, VobSub `.sub`/`.idx`, MKV PGS/VobSub, MP4 VobSub, TS DVB-sub) → text with time codes only and empty text; **skips OCR** (no OCR engine required) | > **OCR database files are not bundled with `seconv`.** The `nocr` and `binaryocr` engines > need a `.nocr` or `.db` file passed via `--ocr-db`. Sources: @@ -169,7 +175,7 @@ and re-rasterising at the CLI's default font. 
Routing is automatic — no flag n | Input | Detection | Outputs | |---|---|---| | `.sup` | extension | any image format | -| `.sub` (VobSub) | `.idx` companion next to it | any image format | +| `.sub` (VobSub) | `.idx` companion next to it, or a binary VobSub stream (read with a default palette) | any image format | | `.mkv` / `.mks` | `S_HDMV/PGS` track present | any image format (one output per track) | | `.ts` / `.m2ts` / `.mts` | DVB-sub PID present | any image format (one output per PID) | diff --git a/tests/seconv/Core/ContainerLoaderTest.cs b/tests/seconv/Core/ContainerLoaderTest.cs index 4429e45e85d..35d5176fed3 100644 --- a/tests/seconv/Core/ContainerLoaderTest.cs +++ b/tests/seconv/Core/ContainerLoaderTest.cs @@ -48,38 +48,43 @@ public async Task ConvertAsync_MkvWithTextTrack_ProducesSrtWithLanguageSuffix() } [Fact] - public async Task ConvertAsync_MkvWithImageTracks_OcrsPgsAndSkipsVobSub() + public async Task ConvertAsync_MkvWithImageTracks_TimeCodesOnly_ProducesBothTracks() { // container_image.mkv has both a VobSub (S_VOBSUB) and a PGS (S_HDMV/PGS) - // image-subtitle track. Behaviour split: - // - PGS: OCR'd via Tesseract to text → one .srt produced - // - VobSub: warned-and-skipped (seconv has no VobSub OCR path yet) - // - // PGS OCR shells out to the Tesseract binary, which seconv does not bundle. - // Skip (don't fail) when it isn't installed so the suite stays green on - // machines/CI without Tesseract on PATH. - Assert.SkipWhen( - TesseractOcrEngine.Detect() is null, - "Tesseract is not installed on PATH; PGS OCR cannot run."); - + // image-subtitle track. With --time-codes-only both are decoded for timing and + // emitted as text (empty text), no OCR engine required — so this runs everywhere, + // and proves the VobSub-in-MKV path is wired (it used to be warned-and-skipped). 
 var input = Fixtures.Path("container_image.mkv"); Assert.True(File.Exists(input), $"Fixture missing: {input}"); var outputFolder = Path.Combine(_tempRoot, "out"); Directory.CreateDirectory(outputFolder); var converter = new SubtitleConverter(); + // Overwrite=false so the two same-language ("und") tracks get distinct names via the + // track-number disambiguation (movie.und.srt + movie.#N.und.srt) instead of one + // clobbering the other. var result = await converter.ConvertAsync(new ConversionOptions { Patterns = [input], Format = "SubRip", OutputFolder = outputFolder, - Overwrite = true, + Overwrite = false, + TimeCodesOnly = true, }); - // The PGS track converts; the VobSub track is skipped (one success, no error). + // Both image tracks (PGS + VobSub) convert: two successes, two .srt outputs. Assert.True(result.Success, string.Join("; ", result.Errors)); - Assert.Equal(1, result.SuccessfulFiles); - Assert.Single(Directory.GetFiles(outputFolder, "*.srt")); + Assert.Equal(2, result.SuccessfulFiles); + var outputs = Directory.GetFiles(outputFolder, "*.srt"); + Assert.Equal(2, outputs.Length); + + // Every output carries time codes but no recognised text (no letters leaked in). + foreach (var f in outputs) + { + var content = await File.ReadAllTextAsync(f, TestContext.Current.CancellationToken); + Assert.Contains("-->", content); + Assert.DoesNotContain(content, c => char.IsLetter(c)); + } } [Fact] diff --git a/tests/seconv/Core/TimeCodesOnlyTest.cs b/tests/seconv/Core/TimeCodesOnlyTest.cs new file mode 100644 index 00000000000..1b5704b537e --- /dev/null +++ b/tests/seconv/Core/TimeCodesOnlyTest.cs @@ -0,0 +1,62 @@ +using SeConv.Core; +using Xunit; + +namespace SeConvTests.Core; + +/// <summary> +/// Verifies the --time-codes-only path: an image-based source (Blu-Ray .sup) converts to a +/// text format keeping its timing but with empty text, without running OCR. This must work +/// even when no OCR engine is installed, since recognition is skipped entirely. 
+/// </summary> +public class TimeCodesOnlyTest : IDisposable +{ + private readonly string _tempRoot; + + public TimeCodesOnlyTest() + { + _tempRoot = Path.Combine(Path.GetTempPath(), "TimeCodesOnly_" + Guid.NewGuid().ToString("N")); + Directory.CreateDirectory(_tempRoot); + } + + public void Dispose() + { + if (Directory.Exists(_tempRoot)) + { + Directory.Delete(_tempRoot, recursive: true); + } + } + + [Fact] + public async Task ConvertAsync_BluRaySupToSrt_TimeCodesOnly_EmitsTimingWithoutOcr() + { + var input = Fixtures.Path("sample.sup"); + Assert.True(File.Exists(input), $"Fixture missing: {input}"); + var outputFolder = Path.Combine(_tempRoot, "out"); + Directory.CreateDirectory(outputFolder); + + var converter = new SubtitleConverter(); + var result = await converter.ConvertAsync(new ConversionOptions + { + Patterns = [input], + Format = "SubRip", + OutputFolder = outputFolder, + Overwrite = true, + TimeCodesOnly = true, + }); + + // Succeeds with no OCR engine present — recognition is skipped. + Assert.True(result.Success, string.Join("; ", result.Errors)); + + var srt = Directory.GetFiles(outputFolder, "*.srt"); + Assert.Single(srt); + var content = await File.ReadAllTextAsync(srt[0], TestContext.Current.CancellationToken); + + // Every cue keeps its time code (the "-->" separator) ... + var cueCount = content.Split("-->").Length - 1; + Assert.True(cueCount > 0, "expected at least one time-coded cue"); + + // ... and carries no recognised text: only digits, time codes, arrows and + // whitespace should appear. Any letter would mean OCR text leaked in. 
+ Assert.DoesNotContain(content, c => char.IsLetter(c)); + } +} diff --git a/tests/seconv/Core/VobSubRoutingTest.cs b/tests/seconv/Core/VobSubRoutingTest.cs new file mode 100644 index 00000000000..c7e24046c68 --- /dev/null +++ b/tests/seconv/Core/VobSubRoutingTest.cs @@ -0,0 +1,73 @@ +using SeConv.Core; +using Xunit; + +namespace SeConvTests.Core; + +/// <summary> +/// Routing for .sub inputs without an .idx companion: a binary VobSub stream +/// must be recognised as VobSub (read directly, with a default palette) and a text MicroDVD +/// .sub must still fall through to the text loader. +/// </summary> +public class VobSubRoutingTest : IDisposable +{ + private readonly string _tempRoot; + + public VobSubRoutingTest() + { + _tempRoot = Path.Combine(Path.GetTempPath(), "VobRoute_" + Guid.NewGuid().ToString("N")); + Directory.CreateDirectory(_tempRoot); + } + + public void Dispose() + { + if (Directory.Exists(_tempRoot)) + { + Directory.Delete(_tempRoot, recursive: true); + } + } + + [Fact] + public void IsBinaryVobSub_TrueForMpegPackHeader_FalseForText() + { + // Binary VobSub starts with the MPEG-2 pack header 00 00 01 BA. + var binary = Path.Combine(_tempRoot, "binary.sub"); + File.WriteAllBytes(binary, [0x00, 0x00, 0x01, 0xBA, 0x44, 0x00, 0x04, 0x00]); + Assert.True(BitmapSubtitleLoader.IsBinaryVobSub(binary)); + + // Text MicroDVD starts with '{'. + var text = Path.Combine(_tempRoot, "text.sub"); + File.WriteAllText(text, "{0}{25}Hello world"); + Assert.False(BitmapSubtitleLoader.IsBinaryVobSub(text)); + } + + [Fact] + public async Task TextMicroDvdSub_WithoutIdx_StillConvertsAsText() + { + // A MicroDVD .sub with no .idx must not be mistaken for VobSub — it should convert + // as text, preserving its content. 
+ var input = Path.Combine(_tempRoot, "movie.sub"); + await File.WriteAllTextAsync( + input, + "{0}{25}Hello world" + Environment.NewLine + "{26}{50}Second cue", + TestContext.Current.CancellationToken); + + var outputFolder = Path.Combine(_tempRoot, "out"); + Directory.CreateDirectory(outputFolder); + + var result = await new SubtitleConverter().ConvertAsync(new ConversionOptions + { + Patterns = [input], + Format = "SubRip", + Fps = 25, + OutputFolder = outputFolder, + Overwrite = true, + }); + + Assert.True(result.Success, string.Join("; ", result.Errors)); + var srt = Directory.GetFiles(outputFolder, "*.srt"); + Assert.Single(srt); + var content = await File.ReadAllTextAsync(srt[0], TestContext.Current.CancellationToken); + Assert.Contains("Hello world", content); + Assert.Contains("Second cue", content); + } +}