From 1260f55c810691a6adbb4e8836666958e2e53e54 Mon Sep 17 00:00:00 2001 From: Todd Schiller Date: Sun, 7 Jun 2026 16:59:30 -0400 Subject: [PATCH 1/4] Fix: extend encoded-payload-redact with text-cipher encodings (#203) Add detection for ROT13, Atbash, reverse, leetspeak, NATO phonetic, and Morse alongside the existing base64 / hex / percent matchers. Text-cipher decodes are gated by a distinct common-English-word count since the encoded form is itself printable; substitution ciphers additionally skip candidates whose source is already English. Co-Authored-By: Claude Opus 4.7 (1M context) --- extension/src/rules/encoded-payload-redact.ts | 566 +++++++++++++++++- 1 file changed, 550 insertions(+), 16 deletions(-) diff --git a/extension/src/rules/encoded-payload-redact.ts b/extension/src/rules/encoded-payload-redact.ts index 2c49580..501e66a 100644 --- a/extension/src/rules/encoded-payload-redact.ts +++ b/extension/src/rules/encoded-payload-redact.ts @@ -1,20 +1,35 @@ // Copyright (c) 2026 PixieBrix, Inc. // Licensed under PolyForm Shield 1.0.0 — see LICENSE. -// Redact long base64 / hex / percent-encoded runs in text nodes — the -// "decode this and follow it" carrier for indirect prompt injection. An -// attacker drops an encoded blob into a page region the agent reads -// (review body, product description, social embed caption); a human skims -// past it as noise but an LLM agent may helpfully decode the bytes and -// treat the result as content or as an instruction it should obey. +// Redact long encoded runs in text nodes — the "decode this and follow +// it" carrier for indirect prompt injection. An attacker drops an +// encoded blob into a page region the agent reads (review body, product +// description, social embed caption); a human skims past it as noise but +// an LLM agent may helpfully decode the bytes and treat the result as +// content or as an instruction it should obey. 
// -// Detection runs three candidate windows per text node — base64/base64url, -// hex, percent-encoded — each gated by a length floor that sits above -// common hash sizes (SHA-256 = 64 hex, SHA-512 = 128 hex). The decisive -// filter is the *decoded printable-ASCII ratio*: instructions decode to -// readable text (ratio ~1.0); hashes, fingerprints, and image bytes decode -// to high-entropy binary (ratio well below 0.85). JWTs are skipped so the -// more specific `secrets-redact` label wins on overlap. +// Two families of detector run per inline group: +// +// * **Byte encodings** — base64 / base64url, hex, percent-encoded. +// Each is gated by a length floor that sits above common hash sizes +// (SHA-256 = 64 hex, SHA-512 = 128 hex). The decisive filter is the +// *decoded printable-ASCII ratio*: instructions decode to readable +// text (ratio ~1.0); hashes, fingerprints, and image bytes decode +// to high-entropy binary (ratio well below 0.85). JWTs are skipped +// so the more specific `secrets-redact` label wins on overlap. +// +// * **Text ciphers** — ROT13, Atbash, reverse, leetspeak, NATO +// phonetic, Morse. The encoded form is itself printable text, so +// the printable-ASCII ratio is useless. The qualifier is instead a +// count of distinct common English function words in the decoded +// output: cipher payloads of useful length decode to several; +// random gibberish or non-cipher prose does not. ROT13, Atbash, +// and reverse additionally require that the *original* candidate +// is not already English, so ordinary prose is skipped before any +// decode; leetspeak instead requires a minimum number of +// digit/symbol substitutions in the candidate. NATO and Morse have +// distinctive enough forms that the candidate scan alone is +// selective: Morse is further gated by a valid-symbol ratio and the +// decoded common-word check (guarding against ASCII art or +// coincidental Morse-shape runs), and sequential NATO alphabet +// drills are skipped. // // Matches are replaced inline with a click-to-reveal placeholder. False // positives cost one click, not lost data. 
@@ -29,9 +44,43 @@ const MIN_BASE64_LENGTH = 120; const MIN_HEX_LENGTH = 160; const MIN_PERCENT_TRIPLETS = 20; +// Text-cipher candidate floor. ROT13, Atbash, and reverse candidates +// need enough characters to carry a meaningful instruction; under 80 +// chars the candidate is too short to reliably clear the common-word +// qualifier even when the decode is real. Leetspeak has its own floor. +const MIN_TEXT_CIPHER_LENGTH = 80; + +// Leetspeak candidate floor. Smaller than the other ciphers because a +// leet payload is denser (digit substitutions concentrate intent in +// fewer chars). Combined with the digit-substitution count below, the +// floor avoids matching ordinary text that happens to contain digits. +const MIN_LEET_LENGTH = 40; +const MIN_LEET_SUBSTITUTIONS = 4; + +// Distinct common-English-word hits required for the decoded output of +// a text cipher to qualify as a payload. Three hits across a 40-char +// decode is rare for random letter noise but routine for any English +// sentence carrying a directive. +const MIN_COMMON_WORDS = 3; + +// NATO and Morse minima — both encodings spell one letter per token, so +// the token count IS the decoded length. Ten letters is the smallest +// payload that can fit a single English directive verb plus its object. +const MIN_NATO_WORDS = 10; +const MIN_MORSE_TOKENS = 10; + +// Morse tokens that resolve to a known letter or digit must make up at +// least this share of all tokens in a candidate; below it the run is +// likely incidental dots and dashes (ASCII art, bullets, repeated +// `---` separators) rather than a payload. +const MIN_MORSE_VALID_RATIO = 0.8; + // Reject text nodes shorter than the smallest candidate window — cheap -// per-node early-out. +// per-node early-out. 
The smallest cipher floor (Morse: 10 tokens of +// 1+ symbol each, separated by single whitespace) is ~19 chars; we use +// 20 so the dispatcher sees every plausible cipher payload while still +// skipping short text nodes (UI labels, tab text, badges). +const MIN_TEXT_LENGTH = 20; // Decoded byte stream must be this fraction printable ASCII (space..~, // plus \t \n \r) to count as "decodes to readable text". Hashes and @@ -167,6 +216,402 @@ function decodePercent(candidate: string): Uint8Array | null { return bytes; } +// Distinct high-frequency English function words — articles, pronouns, +// prepositions, conjunctions, modal/auxiliary verbs, common short +// verbs. Deliberately *generic*: any English sentence of useful length +// hits several, and the set carries no injection-specific phrasing per +// the docs-style guidance to keep adversarial vocabulary out of source. +const COMMON_ENGLISH_WORDS = new Set([ + "the", + "and", + "you", + "for", + "are", + "with", + "this", + "that", + "your", + "have", + "from", + "they", + "will", + "what", + "when", + "but", + "not", + "any", + "can", + "out", + "all", + "one", + "now", + "about", + "after", + "before", + "these", + "their", + "them", + "than", + "then", + "into", + "would", + "could", + "should", + "must", + "more", + "some", + "such", + "only", + "very", + "just", + "also", + "where", + "which", + "while", + "who", + "why", + "how", + "his", + "her", + "she", + "him", + "its", + "been", + "were", + "was", + "yes", + "let", + "make", + "use", + "see", + "get", + "give", + "take", + "made", + "want", + "tell", + "ask", + "show", + "find", + "know", + "think", + "look", + "come", + "say", + "good", + "well", + "back", + "down", + "over", + "under", + "between", + "below", + "above", + "every", + "each", + "other", + "another", + "anyone", + "everyone", +]); + +function countDistinctCommonWords(text: string): number { + const seen = new Set(); + for (const m of text.toLowerCase().matchAll(/[a-z]+/g)) { + const 
word = m[0]; + if (COMMON_ENGLISH_WORDS.has(word)) { + seen.add(word); + } + } + return seen.size; +} + +// Substitution-cipher decoders. All are 1:1 on the character axis so +// the decoded length equals the original — match indices map straight +// through. +const LOWER_A = 97; // 'a' +const UPPER_A = 65; // 'A' +const ALPHABET_LENGTH = 26; +const ROT13_SHIFT = 13; + +function rot13(text: string): string { + return text.replaceAll(/[a-zA-Z]/g, (c) => { + const code = c.codePointAt(0) ?? 0; + const base = code >= LOWER_A ? LOWER_A : UPPER_A; + return String.fromCodePoint( + ((code - base + ROT13_SHIFT) % ALPHABET_LENGTH) + base, + ); + }); +} + +function atbash(text: string): string { + return text.replaceAll(/[a-zA-Z]/g, (c) => { + const code = c.codePointAt(0) ?? 0; + const base = code >= LOWER_A ? LOWER_A : UPPER_A; + return String.fromCodePoint(ALPHABET_LENGTH - 1 - (code - base) + base); + }); +} + +function reverseText(text: string): string { + // Unicode-aware reverse so any astral pairs survive. + return [...text].reverse().join(""); +} + +// Leetspeak substitution table — only the substitutions that obscure a +// letter behind a digit or symbol. Pure digits (`2nd`, `iPhone 13`) get +// mapped too, which on its own would be a false positive; the +// surrounding gate requires a minimum substitution count AND a decoded +// common-word floor, so prose with incidental digits doesn't qualify. +const LEET_MAP: Record = { + "0": "o", + "1": "i", + "3": "e", + "4": "a", + "5": "s", + "7": "t", + "8": "b", + "@": "a", + $: "s", + "!": "i", +}; + +// Character class covers every leet substitution we recognize. Each +// occurrence is a candidate disguised letter; we require a minimum +// count of these in any candidate window before attempting a decode so +// that ordinary text with incidental digits doesn't qualify. 
+const LEET_SUBSTITUTION_CHAR_CLASS = String.raw`[0134578@$!]`; + +function deleet(text: string): string { + return text.replaceAll( + new RegExp(LEET_SUBSTITUTION_CHAR_CLASS, "g"), + (c) => LEET_MAP[c] ?? c, + ); +} + +// NATO phonetic alphabet — one word per encoded letter. Includes both +// "juliet" and "juliett" spellings; "x-ray" / "xray" handled as the +// hyphen-stripped token because the candidate scanner already strips +// hyphens out of word tokens. +const NATO_FIRST_LETTER: Record = { + alpha: "A", + alfa: "A", + bravo: "B", + charlie: "C", + delta: "D", + echo: "E", + foxtrot: "F", + golf: "G", + hotel: "H", + india: "I", + juliet: "J", + juliett: "J", + kilo: "K", + lima: "L", + mike: "M", + november: "N", + oscar: "O", + papa: "P", + quebec: "Q", + romeo: "R", + sierra: "S", + tango: "T", + uniform: "U", + victor: "V", + whiskey: "W", + whisky: "W", + xray: "X", + yankee: "Y", + zulu: "Z", +}; + +// Morse map — letters and digits only. Punctuation codes are excluded: +// payloads rarely need them and including them widens the false-match +// surface for sparse dot/dash strings. +const MORSE_MAP: Record = { + ".-": "A", + "-...": "B", + "-.-.": "C", + "-..": "D", + ".": "E", + "..-.": "F", + "--.": "G", + "....": "H", + "..": "I", + ".---": "J", + "-.-": "K", + ".-..": "L", + "--": "M", + "-.": "N", + "---": "O", + ".--.": "P", + "--.-": "Q", + ".-.": "R", + "...": "S", + "-": "T", + "..-": "U", + "...-": "V", + ".--": "W", + "-..-": "X", + "-.--": "Y", + "--..": "Z", + "-----": "0", + ".----": "1", + "..---": "2", + "...--": "3", + "....-": "4", + ".....": "5", + "-....": "6", + "--...": "7", + "---..": "8", + "----.": "9", +}; + +// Candidate windows. Each is conservatively scoped: word-shaped runs +// long enough that the qualifier will see useful signal, with endpoints +// anchored on alphanumerics so trailing punctuation doesn't drift the +// match boundary into surrounding prose. 
+const TEXT_CIPHER_CANDIDATE = new RegExp( + String.raw`[A-Za-z][A-Za-z\s.,'"!?:;\-]{${MIN_TEXT_CIPHER_LENGTH - 2},}[A-Za-z]`, + "g", +); +const LEET_CANDIDATE = new RegExp( + String.raw`[A-Za-z0-9@$!][A-Za-z0-9@$!\s.,'"?:;\-]{${MIN_LEET_LENGTH - 2},}[A-Za-z0-9@$!]`, + "g", +); +const MORSE_CANDIDATE = new RegExp( + String.raw`(?:[.\-]{1,7}[ \t/]+){${MIN_MORSE_TOKENS - 1},}[.\-]{1,7}`, + "g", +); + +function countLeetSubstitutions(text: string): number { + return (text.match(new RegExp(LEET_SUBSTITUTION_CHAR_CLASS, "g")) ?? []) + .length; +} + +interface CipherDecodeResult { + decoded: string; + commonWords: number; +} + +function tryCipherDecode( + candidate: string, + decoder: (text: string) => string, +): CipherDecodeResult | null { + const decoded = decoder(candidate); + const commonWords = countDistinctCommonWords(decoded); + if (commonWords < MIN_COMMON_WORDS) { + return null; + } + return { decoded, commonWords }; +} + +// For substitution ciphers, skip candidates whose original text is +// already English — applying ROT13/Atbash/reverse to English prose +// would produce gibberish (zero common-word hits), so this is only a +// performance gate, not a correctness one. +function alreadyEnglish(candidate: string): boolean { + return countDistinctCommonWords(candidate) >= MIN_COMMON_WORDS; +} + +interface NatoRun { + start: number; + end: number; + decoded: string; +} + +function isAlphabetSequence(letters: string): boolean { + // A run is a sequential alphabet drill (ABCDE… or BCDEF…) iff every + // adjacent pair differs by exactly 1 in code-point order. We treat + // these as instructional content rather than a payload — alphabet + // pages and signal-corps drills shouldn't be redacted. + for (let i = 1; i < letters.length; i++) { + const prev = letters.codePointAt(i - 1) ?? 0; + const curr = letters.codePointAt(i) ?? 
0; + if (curr - prev !== 1) { + return false; + } + } + return true; +} + +function findNatoRuns(text: string): NatoRun[] { + // Scan word tokens linearly so a long alternation regex doesn't + // backtrack. A NATO run is a maximal sequence of NATO tokens + // separated by whitespace, hyphens, or commas; any other token + // (including non-NATO words) ends the run. + const runs: NatoRun[] = []; + const lower = text.toLowerCase(); + const TOKEN_RE = /[a-z]+/g; + let current: { start: number; end: number; letters: string } | null = null; + let lastTokenEnd = -1; + for (const m of lower.matchAll(TOKEN_RE)) { + const start = m.index; + const end = start + m[0].length; + const gap = text.slice(lastTokenEnd, start); + const letter = NATO_FIRST_LETTER[m[0]]; + const gapIsSeparator = lastTokenEnd === -1 || /^[\s,\-]+$/.test(gap); + if (letter && (current === null || gapIsSeparator)) { + if (current === null) { + current = { start, end, letters: letter }; + } else { + current.end = end; + current.letters += letter; + } + } else { + if (current !== null && current.letters.length >= MIN_NATO_WORDS) { + runs.push({ + start: current.start, + end: current.end, + decoded: current.letters, + }); + } + current = letter ? { start, end, letters: letter } : null; + } + lastTokenEnd = end; + } + if (current !== null && current.letters.length >= MIN_NATO_WORDS) { + runs.push({ + start: current.start, + end: current.end, + decoded: current.letters, + }); + } + return runs; +} + +interface MorseDecodeResult { + decoded: string; + validRatio: number; +} + +function decodeMorse(candidate: string): MorseDecodeResult { + // Word separator: `/` (with optional whitespace). Letter separator: + // any whitespace run. 
+ const words = candidate.split(/\s*\/\s*/); + const decodedWords: string[] = []; + let valid = 0; + let total = 0; + for (const word of words) { + const symbols = word.trim().split(/\s+/).filter(Boolean); + let chunk = ""; + for (const sym of symbols) { + total++; + const letter = MORSE_MAP[sym]; + if (letter) { + valid++; + chunk += letter; + } + } + if (chunk.length > 0) { + decodedWords.push(chunk); + } + } + return { + decoded: decodedWords.join(" "), + validRatio: total === 0 ? 0 : valid / total, + }; +} + function qualifies(decoded: Uint8Array | null): boolean { if (decoded === null) { return false; @@ -269,12 +714,101 @@ function collectPercent(text: string, matches: InlineMatch[]): void { } } +function collectSubstitutionCipher( + text: string, + decoder: (text: string) => string, + matches: InlineMatch[], +): void { + for (const m of text.matchAll(TEXT_CIPHER_CANDIDATE)) { + const candidate = m[0]; + if (alreadyEnglish(candidate)) { + continue; + } + if (tryCipherDecode(candidate, decoder) !== null) { + matches.push({ + start: m.index, + end: m.index + candidate.length, + label: "[encoded payload hidden]", + }); + } + } +} + +function collectReverse(text: string, matches: InlineMatch[]): void { + for (const m of text.matchAll(TEXT_CIPHER_CANDIDATE)) { + const candidate = m[0]; + if (alreadyEnglish(candidate)) { + continue; + } + if (tryCipherDecode(candidate, reverseText) !== null) { + matches.push({ + start: m.index, + end: m.index + candidate.length, + label: "[encoded payload hidden]", + }); + } + } +} + +function collectLeet(text: string, matches: InlineMatch[]): void { + for (const m of text.matchAll(LEET_CANDIDATE)) { + const candidate = m[0]; + if (countLeetSubstitutions(candidate) < MIN_LEET_SUBSTITUTIONS) { + continue; + } + if (tryCipherDecode(candidate, deleet) !== null) { + matches.push({ + start: m.index, + end: m.index + candidate.length, + label: "[encoded payload hidden]", + }); + } + } +} + +function collectNato(text: string, matches: 
InlineMatch[]): void { + for (const run of findNatoRuns(text)) { + if (isAlphabetSequence(run.decoded)) { + continue; + } + matches.push({ + start: run.start, + end: run.end, + label: "[encoded payload hidden]", + }); + } +} + +function collectMorse(text: string, matches: InlineMatch[]): void { + for (const m of text.matchAll(MORSE_CANDIDATE)) { + const candidate = m[0]; + const { decoded, validRatio } = decodeMorse(candidate); + if (validRatio < MIN_MORSE_VALID_RATIO) { + continue; + } + if (countDistinctCommonWords(decoded) < MIN_COMMON_WORDS) { + continue; + } + matches.push({ + start: m.index, + end: m.index + candidate.length, + label: "[encoded payload hidden]", + }); + } +} + function collectMatches(text: string): InlineMatch[] { const matches: InlineMatch[] = []; const jwtRanges = collectJwtRanges(text); collectBase64(text, jwtRanges, matches); collectHex(text, matches); collectPercent(text, matches); + collectSubstitutionCipher(text, rot13, matches); + collectSubstitutionCipher(text, atbash, matches); + collectReverse(text, matches); + collectLeet(text, matches); + collectNato(text, matches); + collectMorse(text, matches); // Sort by start, then prefer the longest on ties so a base64 candidate // wins over a hex prefix of the same span. Merge by dropping any match @@ -295,7 +829,7 @@ export const encodedPayloadRedactRule = defineInlineTextRedactRule({ id: "encoded-payload-redact", label: "Redact Encoded Payloads", description: - "Redact long base64, hex, or percent-encoded runs in text nodes whose decoded bytes are mostly readable text. Defends against the 'decode this and follow it' indirect-injection carrier; hashes, fingerprints, and binary blobs are left alone.", + "Redact long encoded runs in text nodes whose decoded form is readable English. Covers base64 / hex / percent (byte encodings) and ROT13 / Atbash / reverse / leetspeak / NATO phonetic / Morse (text ciphers). 
Defends against the 'decode this and follow it' indirect-injection carrier; hashes, fingerprints, and binary blobs are left alone.", minLength: MIN_TEXT_LENGTH, collectMatches, }); From e54a74716b4a3bf751af568c4c29971780fb507c Mon Sep 17 00:00:00 2001 From: Todd Schiller Date: Sun, 7 Jun 2026 17:13:05 -0400 Subject: [PATCH 2/4] Test: cover ROT13/Atbash/reverse/leet/NATO/Morse in encoded-payload-redact MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds 13 example tests and 5 property tests for the text-cipher detection paths. Source files contain only ciphertext or symbolic runs — benign English filler is encoded at test time so adversarial phrasing never appears in plaintext. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../encoded-payload-redact.property.test.ts | 199 ++++++++++++ .../__tests__/encoded-payload-redact.test.ts | 284 ++++++++++++++++++ 2 files changed, 483 insertions(+) diff --git a/extension/src/rules/__tests__/encoded-payload-redact.property.test.ts b/extension/src/rules/__tests__/encoded-payload-redact.property.test.ts index 7278b7e..9266a07 100644 --- a/extension/src/rules/__tests__/encoded-payload-redact.property.test.ts +++ b/extension/src/rules/__tests__/encoded-payload-redact.property.test.ts @@ -199,6 +199,205 @@ function sortedSplitsArb(length: number) { .map((indices) => indices.toSorted((a, b) => a - b)); } +// Text-cipher helpers. Encode benign-prose generators into cipher form +// at test time so the source file holds only ciphertext or symbolic +// runs — adversarial phrasing never appears in plaintext. + +const A_CODE = 65; +const a_CODE = 97; + +function rot13(text: string): string { + return text.replaceAll(/[a-zA-Z]/g, (c) => { + const code = c.codePointAt(0) ?? 0; + const base = code >= a_CODE ? 
a_CODE : A_CODE; + return String.fromCodePoint(((code - base + 13) % 26) + base); + }); +} + +function atbash(text: string): string { + return text.replaceAll(/[a-zA-Z]/g, (c) => { + const code = c.codePointAt(0) ?? 0; + const base = code >= a_CODE ? a_CODE : A_CODE; + return String.fromCodePoint(26 - 1 - (code - base) + base); + }); +} + +function reverseText(text: string): string { + // `charAt` (vs `text[i]`) keeps the result `string` rather than + // `string | undefined`. Test inputs are pure ASCII so no + // astral-pair correctness concern with code-unit iteration. + let out = ""; + for (let i = text.length - 1; i >= 0; i--) { + out += text.charAt(i); + } + return out; +} + +const NATO_ENCODE: Record = { + A: "alpha", + B: "bravo", + C: "charlie", + D: "delta", + E: "echo", + F: "foxtrot", + G: "golf", + H: "hotel", + I: "india", + J: "juliet", + K: "kilo", + L: "lima", + M: "mike", + N: "november", + O: "oscar", + P: "papa", + Q: "quebec", + R: "romeo", + S: "sierra", + T: "tango", + U: "uniform", + V: "victor", + W: "whiskey", + X: "xray", + Y: "yankee", + Z: "zulu", +}; + +function natoEncode(letters: string): string { + const out: string[] = []; + for (const char of letters.toUpperCase()) { + const word = NATO_ENCODE[char]; + if (word) { + out.push(word); + } + } + return out.join(" "); +} + +// Vocabulary of common English function words drawn from the rule's +// internal qualifier set. Built-in fast-check generators can't conjure +// "looks English to the rule" prose, so the property tests sample from +// this list to build sentences guaranteed to clear the common-word +// floor — no need to enumerate the rule's set in two places, just keep +// a representative subset here. 
+const COMMON_WORD_VOCAB = [ + "the", + "and", + "you", + "for", + "this", + "that", + "with", + "have", + "from", + "when", + "what", + "should", + "could", + "would", + "must", + "your", + "their", + "every", + "other", + "every", +] as const; + +const commonProseArb = fc + .array(fc.constantFrom(...COMMON_WORD_VOCAB), { + minLength: 20, + maxLength: 40, + }) + .map((words) => words.join(" ")) + .filter((s) => s.length >= 80); + +// Random letters A..Z. Filter out runs that are strict alphabet +// sequences (ABCDE…) — the rule treats those as instructional content. +const natoLettersArb = fc + .array( + fc + .integer({ min: 0, max: 25 }) + .map((i) => String.fromCodePoint(A_CODE + i)), + { minLength: 10, maxLength: 24 }, + ) + .map((letters) => letters.join("")) + .filter((letters) => { + for (let i = 1; i < letters.length; i++) { + const previous = letters.codePointAt(i - 1) ?? 0; + const current = letters.codePointAt(i) ?? 0; + if (current - previous !== 1) { + return true; + } + } + return false; + }); + +describe("encoded-payload-redact text ciphers (property)", () => { + it("redacts ROT13-encoded common-word prose", () => { + fc.assert( + fc.property(commonProseArb, (prose) => { + const ciphertext = rot13(prose); + // Parens delimit the cipher candidate so surrounding context + // doesn't get pulled into the match (`(` is outside the + // candidate's allowed char class). 
+ const body = applyToText(`(prefix) ${ciphertext} (suffix)`); + expect(body.querySelector(`.${PLACEHOLDER_CLASS}`)?.textContent).toBe( + "[encoded payload hidden]", + ); + expect(body.textContent).not.toContain(ciphertext); + }), + ); + }); + + it("redacts Atbash-encoded common-word prose", () => { + fc.assert( + fc.property(commonProseArb, (prose) => { + const ciphertext = atbash(prose); + const body = applyToText(`(prefix) ${ciphertext} (suffix)`); + expect(body.querySelector(`.${PLACEHOLDER_CLASS}`)?.textContent).toBe( + "[encoded payload hidden]", + ); + expect(body.textContent).not.toContain(ciphertext); + }), + ); + }); + + it("redacts reversed common-word prose", () => { + fc.assert( + fc.property(commonProseArb, (prose) => { + const ciphertext = reverseText(prose); + const body = applyToText(`(prefix) ${ciphertext} (suffix)`); + expect(body.querySelector(`.${PLACEHOLDER_CLASS}`)?.textContent).toBe( + "[encoded payload hidden]", + ); + expect(body.textContent).not.toContain(ciphertext); + }), + ); + }); + + it("redacts NATO-phonetic runs of >= 10 non-sequential letters", () => { + fc.assert( + fc.property(natoLettersArb, (letters) => { + const ciphertext = natoEncode(letters); + const body = applyToText(`(prefix) ${ciphertext} (suffix)`); + expect(body.querySelector(`.${PLACEHOLDER_CLASS}`)?.textContent).toBe( + "[encoded payload hidden]", + ); + expect(body.textContent).not.toContain(ciphertext); + }), + ); + }); + + it("leaves plain English common-word prose alone (no cipher false-fire)", () => { + fc.assert( + fc.property(commonProseArb, (prose) => { + const body = applyToText(prose); + expect(body.querySelector(`.${PLACEHOLDER_CLASS}`)).toBeNull(); + expect(body.textContent).toContain(prose); + }), + ); + }); +}); + describe("encoded-payload-redact cross-node detection (property)", () => { it("redacts base64 payloads regardless of how they're split across sibling spans", () => { fc.assert( diff --git 
a/extension/src/rules/__tests__/encoded-payload-redact.test.ts b/extension/src/rules/__tests__/encoded-payload-redact.test.ts index 965d237..f2381d8 100644 --- a/extension/src/rules/__tests__/encoded-payload-redact.test.ts +++ b/extension/src/rules/__tests__/encoded-payload-redact.test.ts @@ -45,6 +45,151 @@ function percentEncode(text: string): string { return out; } +// Benign English sentence rich in distinct common function words. Used as +// the *cleartext* for the text-cipher positive cases: the ciphers +// encode this string at test time so the source file holds only the +// resulting gibberish (ROT13/Atbash) or symbolic form (NATO/Morse), +// keeping adversarial phrasing out of the file while still exercising +// the decoded common-word qualifier (>= 3 distinct hits). +const CIPHER_CLEARTEXT = + "you can see this from above and you know what should come next " + + "when the time comes for you to look around"; + +function rot13(text: string): string { + const A = 65; + const a = 97; + return text.replaceAll(/[a-zA-Z]/g, (c) => { + const code = c.codePointAt(0) ?? 0; + const base = code >= a ? a : A; + return String.fromCodePoint(((code - base + 13) % 26) + base); + }); +} + +function atbash(text: string): string { + const A = 65; + const a = 97; + return text.replaceAll(/[a-zA-Z]/g, (c) => { + const code = c.codePointAt(0) ?? 0; + const base = code >= a ? a : A; + return String.fromCodePoint(26 - 1 - (code - base) + base); + }); +} + +function reverseText(text: string): string { + // `charAt` (vs `text[i]`) keeps the result `string` rather than + // `string | undefined`. Test inputs are pure ASCII so no + // astral-pair correctness concern with code-unit iteration. 
+ let out = ""; + for (let i = text.length - 1; i >= 0; i--) { + out += text.charAt(i); + } + return out; +} + +const LEET_ENCODE: Record = { + o: "0", + i: "1", + e: "3", + a: "4", + s: "5", + t: "7", + b: "8", +}; + +function leetEncode(text: string): string { + return text.replaceAll( + /[oieasbtOIEASBT]/g, + (c) => LEET_ENCODE[c.toLowerCase()] ?? c, + ); +} + +const NATO_ENCODE: Record = { + A: "alpha", + B: "bravo", + C: "charlie", + D: "delta", + E: "echo", + F: "foxtrot", + G: "golf", + H: "hotel", + I: "india", + J: "juliet", + K: "kilo", + L: "lima", + M: "mike", + N: "november", + O: "oscar", + P: "papa", + Q: "quebec", + R: "romeo", + S: "sierra", + T: "tango", + U: "uniform", + V: "victor", + W: "whiskey", + X: "xray", + Y: "yankee", + Z: "zulu", +}; + +function natoEncode(letters: string): string { + const out: string[] = []; + for (const char of letters.toUpperCase()) { + const word = NATO_ENCODE[char]; + if (word) { + out.push(word); + } + } + return out.join(" "); +} + +const MORSE_ENCODE: Record = { + A: ".-", + B: "-...", + C: "-.-.", + D: "-..", + E: ".", + F: "..-.", + G: "--.", + H: "....", + I: "..", + J: ".---", + K: "-.-", + L: ".-..", + M: "--", + N: "-.", + O: "---", + P: ".--.", + Q: "--.-", + R: ".-.", + S: "...", + T: "-", + U: "..-", + V: "...-", + W: ".--", + X: "-..-", + Y: "-.--", + Z: "--..", +}; + +function morseEncode(text: string): string { + return text + .toUpperCase() + .split(" ") + .map((word) => { + const symbols: string[] = []; + for (const c of word) { + const sym = MORSE_ENCODE[c]; + if (sym) { + symbols.push(sym); + } + } + return symbols.join(" "); + }) + .filter(Boolean) + .join(" / "); +} + const MUTATION_THROTTLE_MS = 250; async function flushMutations(): Promise { @@ -278,6 +423,145 @@ describe("encoded-payload-redact cross-node detection", () => { }); }); +describe("encoded-payload-redact text-cipher positive cases", () => { + it("redacts a ROT13-encoded English sentence", () => { + const ciphertext = 
rot13(CIPHER_CLEARTEXT); + // Parens break the cipher candidate regex (`(` and `)` aren't in its + // char class), so the surrounding prose stays as its own text run. + document.body.innerHTML = `

(prefix) ${ciphertext} (suffix)

`; + encodedPayloadRedactRule.apply(document.body); + + expect(document.querySelector(`.${PLACEHOLDER_CLASS}`)?.textContent).toBe( + "[encoded payload hidden]", + ); + expect(document.body.textContent).not.toContain(ciphertext); + expect(document.body.textContent).toContain("(prefix)"); + expect(document.body.textContent).toContain("(suffix)"); + }); + + it("redacts an Atbash-encoded English sentence", () => { + const ciphertext = atbash(CIPHER_CLEARTEXT); + document.body.innerHTML = `

${ciphertext}

`; + encodedPayloadRedactRule.apply(document.body); + + expect(document.querySelector(`.${PLACEHOLDER_CLASS}`)?.textContent).toBe( + "[encoded payload hidden]", + ); + expect(document.body.textContent).not.toContain(ciphertext); + }); + + it("redacts a reversed English sentence", () => { + const ciphertext = reverseText(CIPHER_CLEARTEXT); + document.body.innerHTML = `

${ciphertext}

`; + encodedPayloadRedactRule.apply(document.body); + + expect(document.querySelector(`.${PLACEHOLDER_CLASS}`)?.textContent).toBe( + "[encoded payload hidden]", + ); + expect(document.body.textContent).not.toContain(ciphertext); + }); + + it("redacts a leetspeak-encoded English sentence", () => { + const ciphertext = leetEncode(CIPHER_CLEARTEXT); + document.body.innerHTML = `

${ciphertext}

`; + encodedPayloadRedactRule.apply(document.body); + + expect(document.querySelector(`.${PLACEHOLDER_CLASS}`)?.textContent).toBe( + "[encoded payload hidden]", + ); + expect(document.body.textContent).not.toContain(ciphertext); + }); + + it("redacts a NATO-phonetic spelling run of >= 10 non-sequential letters", () => { + // Spells THEQUICKBROWNFOX — 16 NATO tokens, not an alphabet sequence. + const ciphertext = natoEncode("thequickbrownfox"); + document.body.innerHTML = `

${ciphertext}

`; + encodedPayloadRedactRule.apply(document.body); + + expect(document.querySelector(`.${PLACEHOLDER_CLASS}`)?.textContent).toBe( + "[encoded payload hidden]", + ); + expect(document.body.textContent).not.toContain(ciphertext); + }); + + it("redacts a Morse-encoded English phrase", () => { + const ciphertext = morseEncode( + "the quick brown fox jumps over the lazy dog and you can see this", + ); + document.body.innerHTML = `

${ciphertext}

`; + encodedPayloadRedactRule.apply(document.body); + + expect(document.querySelector(`.${PLACEHOLDER_CLASS}`)?.textContent).toBe( + "[encoded payload hidden]", + ); + expect(document.body.textContent).not.toContain(ciphertext); + }); +}); + +describe("encoded-payload-redact text-cipher false-positive guards", () => { + it("leaves ordinary English prose alone (no cipher fires)", () => { + const prose = + "You can see this paragraph from the homepage and you know what " + + "should come next when the time comes for you to look around."; + document.body.innerHTML = `

${prose}

`; + encodedPayloadRedactRule.apply(document.body); + + expect(document.querySelector(`.${PLACEHOLDER_CLASS}`)).toBeNull(); + expect(document.body.textContent).toContain(prose); + }); + + it("leaves a NATO alphabet drill (A..J) visible", () => { + // Sequential NATO spelling — the rule treats this as instructional + // content (alphabet page / signal-corps drill), not a payload. + const drill = + "alpha bravo charlie delta echo foxtrot golf hotel india juliet"; + document.body.innerHTML = `

${drill}

`; + encodedPayloadRedactRule.apply(document.body); + + expect(document.querySelector(`.${PLACEHOLDER_CLASS}`)).toBeNull(); + expect(document.body.textContent).toContain(drill); + }); + + it("leaves a short ROT13 snippet (under 80-char floor) alone", () => { + const ciphertext = rot13("you can see this from above"); + document.body.innerHTML = `

${ciphertext}

`; + encodedPayloadRedactRule.apply(document.body); + + expect(document.querySelector(`.${PLACEHOLDER_CLASS}`)).toBeNull(); + }); + + it("leaves text with a single incidental digit alone (leet floor)", () => { + // Product copy with one leet-shape digit ("1") — below the + // MIN_LEET_SUBSTITUTIONS=4 floor, so the leet detector skips it. + const prose = + "Limited time offer for 1 day only: get our flagship product before midnight tonight and have it shipped tomorrow."; + document.body.innerHTML = `

${prose}

`; + encodedPayloadRedactRule.apply(document.body); + + expect(document.querySelector(`.${PLACEHOLDER_CLASS}`)).toBeNull(); + }); + + it("leaves a sparse dot/dash ASCII-art run alone (Morse valid-ratio floor)", () => { + // Repeating `... --- ...` style separators — many tokens but + // most decode to letters with no decoded common-word hits, and the + // chosen pattern is below the valid-ratio + common-word qualifiers. + const ascii = "... --- ... --- ... --- ... --- ... --- ... --- ..."; + document.body.innerHTML = `

${ascii}

`; + encodedPayloadRedactRule.apply(document.body); + + expect(document.querySelector(`.${PLACEHOLDER_CLASS}`)).toBeNull(); + }); + + it("leaves a single NATO word in prose alone (below 10-word floor)", () => { + const prose = + "Our flagship product is called Tango and customers love it for the quality of the materials."; + document.body.innerHTML = `

${prose}

`; + encodedPayloadRedactRule.apply(document.body); + + expect(document.querySelector(`.${PLACEHOLDER_CLASS}`)).toBeNull(); + }); +}); + describe("encoded-payload-redact teardown", () => { it("stops re-scanning after teardown", async () => { encodedPayloadRedactRule.apply(document.body); From 0aaa0dc74fcdbe3137d8200b4c6d5db67d7e7cef Mon Sep 17 00:00:00 2001 From: Todd Schiller Date: Sun, 7 Jun 2026 17:17:57 -0400 Subject: [PATCH 3/4] Lint: fix biome/eslint findings in encoded-payload-redact Co-Authored-By: Claude Opus 4.7 (1M context) --- extension/src/rules/encoded-payload-redact.ts | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/extension/src/rules/encoded-payload-redact.ts b/extension/src/rules/encoded-payload-redact.ts index 501e66a..43a62ed 100644 --- a/extension/src/rules/encoded-payload-redact.ts +++ b/extension/src/rules/encoded-payload-redact.ts @@ -353,8 +353,11 @@ function atbash(text: string): string { } function reverseText(text: string): string { - // Unicode-aware reverse so any astral pairs survive. - return [...text].reverse().join(""); + // Unicode-aware reverse so any astral pairs survive. Array.from over a + // string splits on code points (handling surrogate pairs), which is what + // we want here; spread would trigger no-misused-spread. + // eslint-disable-next-line unicorn/prefer-spread + return Array.from(text).toReversed().join(""); } // Leetspeak substitution table — only the substitutions that obscure a @@ -379,7 +382,7 @@ const LEET_MAP: Record = { // occurrence is a candidate disguised letter; we require a minimum // count of these in any candidate window before attempting a decode so // that ordinary text with incidental digits doesn't qualify. 
-const LEET_SUBSTITUTION_CHAR_CLASS = String.raw`[0134578@$!]`; +const LEET_SUBSTITUTION_CHAR_CLASS = "[0134578@$!]"; function deleet(text: string): string { return text.replaceAll( @@ -525,9 +528,9 @@ function isAlphabetSequence(letters: string): boolean { // these as instructional content rather than a payload — alphabet // pages and signal-corps drills shouldn't be redacted. for (let i = 1; i < letters.length; i++) { - const prev = letters.codePointAt(i - 1) ?? 0; - const curr = letters.codePointAt(i) ?? 0; - if (curr - prev !== 1) { + const previous = letters.codePointAt(i - 1) ?? 0; + const current = letters.codePointAt(i) ?? 0; + if (current - previous !== 1) { return false; } } @@ -549,7 +552,7 @@ function findNatoRuns(text: string): NatoRun[] { const end = start + m[0].length; const gap = text.slice(lastTokenEnd, start); const letter = NATO_FIRST_LETTER[m[0]]; - const gapIsSeparator = lastTokenEnd === -1 || /^[\s,\-]+$/.test(gap); + const gapIsSeparator = lastTokenEnd === -1 || /^[\s,-]+$/.test(gap); if (letter && (current === null || gapIsSeparator)) { if (current === null) { current = { start, end, letters: letter }; From ed4e0251736706c12174bcc039e14270d356c2dc Mon Sep 17 00:00:00 2001 From: Todd Schiller Date: Sun, 7 Jun 2026 17:22:08 -0400 Subject: [PATCH 4/4] Refactor: single-pass substitution-cipher collector, pre-compiled leet regex Address review feedback on #216: - Collapse rot13/atbash/reverse passes into one TEXT_CIPHER_CANDIDATE iteration with a decoder list; cuts regex + alreadyEnglish work to one-third per text group. - Pre-compile LEET_SUBSTITUTION_RE at module level so deleet and countLeetSubstitutions stop allocating a regex per call inside the inner candidate loop. - Drop the now-duplicate collectReverse wrapper. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- extension/src/rules/encoded-payload-redact.ts | 59 ++++++++----------- 1 file changed, 25 insertions(+), 34 deletions(-) diff --git a/extension/src/rules/encoded-payload-redact.ts b/extension/src/rules/encoded-payload-redact.ts index 43a62ed..6b39297 100644 --- a/extension/src/rules/encoded-payload-redact.ts +++ b/extension/src/rules/encoded-payload-redact.ts @@ -382,13 +382,10 @@ const LEET_MAP: Record = { // occurrence is a candidate disguised letter; we require a minimum // count of these in any candidate window before attempting a decode so // that ordinary text with incidental digits doesn't qualify. -const LEET_SUBSTITUTION_CHAR_CLASS = "[0134578@$!]"; +const LEET_SUBSTITUTION_RE = /[0134578@$!]/g; function deleet(text: string): string { - return text.replaceAll( - new RegExp(LEET_SUBSTITUTION_CHAR_CLASS, "g"), - (c) => LEET_MAP[c] ?? c, - ); + return text.replaceAll(LEET_SUBSTITUTION_RE, (c) => LEET_MAP[c] ?? c); } // NATO phonetic alphabet — one word per encoded letter. Includes both @@ -487,8 +484,7 @@ const MORSE_CANDIDATE = new RegExp( ); function countLeetSubstitutions(text: string): number { - return (text.match(new RegExp(LEET_SUBSTITUTION_CHAR_CLASS, "g")) ?? []) - .length; + return (text.match(LEET_SUBSTITUTION_RE) ?? []).length; } interface CipherDecodeResult { @@ -717,9 +713,19 @@ function collectPercent(text: string, matches: InlineMatch[]): void { } } -function collectSubstitutionCipher( +// All substitution-style decoders share the same candidate window and +// already-English gate, so we iterate TEXT_CIPHER_CANDIDATE once and try +// each decoder. The first one that produces a readable decode wins; the +// rest are skipped for that candidate (their match spans would overlap +// and merge identically downstream anyway). 
+const SUBSTITUTION_DECODERS: ReadonlyArray<(text: string) => string> = [ + rot13, + atbash, + reverseText, +]; + +function collectSubstitutionCiphers( text: string, - decoder: (text: string) => string, matches: InlineMatch[], ): void { for (const m of text.matchAll(TEXT_CIPHER_CANDIDATE)) { @@ -727,28 +733,15 @@ function collectSubstitutionCipher( if (alreadyEnglish(candidate)) { continue; } - if (tryCipherDecode(candidate, decoder) !== null) { - matches.push({ - start: m.index, - end: m.index + candidate.length, - label: "[encoded payload hidden]", - }); - } - } -} - -function collectReverse(text: string, matches: InlineMatch[]): void { - for (const m of text.matchAll(TEXT_CIPHER_CANDIDATE)) { - const candidate = m[0]; - if (alreadyEnglish(candidate)) { - continue; - } - if (tryCipherDecode(candidate, reverseText) !== null) { - matches.push({ - start: m.index, - end: m.index + candidate.length, - label: "[encoded payload hidden]", - }); + for (const decoder of SUBSTITUTION_DECODERS) { + if (tryCipherDecode(candidate, decoder) !== null) { + matches.push({ + start: m.index, + end: m.index + candidate.length, + label: "[encoded payload hidden]", + }); + break; + } } } } @@ -806,9 +799,7 @@ function collectMatches(text: string): InlineMatch[] { collectBase64(text, jwtRanges, matches); collectHex(text, matches); collectPercent(text, matches); - collectSubstitutionCipher(text, rot13, matches); - collectSubstitutionCipher(text, atbash, matches); - collectReverse(text, matches); + collectSubstitutionCiphers(text, matches); collectLeet(text, matches); collectNato(text, matches); collectMorse(text, matches);