Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 47 additions & 2 deletions clearwing/analysis/source_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from dataclasses import dataclass, field
from pathlib import Path

import pathspec


@dataclass
class AnalyzerFinding:
Expand Down Expand Up @@ -490,9 +492,16 @@ class SourceAnalyzer:
SKIP_FILES = {".min.js", ".min.css", ".map", ".lock"}
MAX_FILE_SIZE = 1_000_000 # 1MB — class default; overridable per-instance

def __init__(self, repo_path: str | None = None, *, max_file_size: int | None = None):
def __init__(
self,
repo_path: str | None = None,
*,
max_file_size: int | None = None,
respect_gitignore: bool = False,
):
self.repo_path = repo_path
self._temp_dir: tempfile.TemporaryDirectory | None = None
self.respect_gitignore = respect_gitignore
if max_file_size is not None:
self.MAX_FILE_SIZE = max_file_size

Expand Down Expand Up @@ -582,14 +591,22 @@ def analyze(self, path: str | None = None) -> AnalysisResult:

def _iter_source_files(self, root: str):
"""Yield source file paths, skipping irrelevant directories."""
gitignore = _GitignoreMatcher.from_repo(root) if self.respect_gitignore else None
for dirpath, dirnames, filenames in os.walk(root):
# Prune skip directories
dirnames[:] = [d for d in dirnames if d not in self.SKIP_DIRS]
dirnames[:] = [
d
for d in dirnames
if d not in self.SKIP_DIRS
and not (gitignore and gitignore.matches_dir(os.path.join(dirpath, d)))
]

for fname in filenames:
if any(fname.endswith(skip) for skip in self.SKIP_FILES):
continue
full_path = os.path.join(dirpath, fname)
if gitignore and gitignore.matches_file(full_path):
continue
try:
if os.path.getsize(full_path) > self.MAX_FILE_SIZE:
continue
Expand Down Expand Up @@ -733,3 +750,31 @@ def __enter__(self):

def __exit__(self, *args):
    """Leave the ``with`` block by calling ``self.cleanup()``.

    The exception details received in ``*args`` are ignored, and because the
    method implicitly returns ``None`` an in-flight exception is not suppressed.
    """
    self.cleanup()


class _GitignoreMatcher:
    """Repo-root .gitignore matcher for source enumeration.

    Only the ``.gitignore`` located directly in ``root`` is read. Paths are
    converted to POSIX-style paths relative to ``root`` before they are handed
    to the compiled spec. The root itself and anything outside it never match.
    """

    def __init__(self, root: str, spec: pathspec.PathSpec):
        """Store the absolute repo root and the compiled ignore spec."""
        self.root = os.path.abspath(root)
        self.spec = spec

    @classmethod
    def from_repo(cls, root: str) -> _GitignoreMatcher | None:
        """Build a matcher from ``<root>/.gitignore``.

        Returns:
            A matcher, or ``None`` when the repository has no root ``.gitignore``.
        """
        gitignore_path = os.path.join(root, ".gitignore")
        try:
            # A stray non-UTF-8 byte in .gitignore must not abort the whole scan.
            # surrogateescape keeps such bytes round-trippable instead of raising
            # UnicodeDecodeError.
            with open(gitignore_path, encoding="utf-8", errors="surrogateescape") as handle:
                spec = pathspec.PathSpec.from_lines("gitignore", handle)
        except FileNotFoundError:
            return None
        return cls(root, spec)

    def _rel(self, path: str) -> str:
        """Return ``path`` relative to the repo root, using ``/`` separators."""
        return Path(os.path.relpath(path, self.root)).as_posix()

    def _rel_inside_root(self, path: str) -> str | None:
        """Return the root-relative path, or ``None`` if it is not strictly inside the root."""
        try:
            rel = self._rel(path)
        except ValueError:
            # os.path.relpath raises ValueError on Windows when the two paths
            # live on different drives; such a path cannot be inside the repo.
            return None
        # "." is the root itself and "../…" escapes it; the root's ignore rules
        # apply to neither, so do not let a broad pattern (e.g. "*") match them.
        if rel in (".", "..") or rel.startswith("../"):
            return None
        return rel

    def matches_file(self, path: str) -> bool:
        """Return whether the file at ``path`` is matched by the ignore spec."""
        rel = self._rel_inside_root(path)
        return rel is not None and self.spec.match_file(rel)

    def matches_dir(self, path: str) -> bool:
        """Return whether the directory at ``path`` is matched by the ignore spec.

        The trailing-slash form is tried first so directory-only patterns such
        as ``build/`` apply; the bare path is tried as a fallback.
        """
        rel = self._rel_inside_root(path)
        if rel is None:
            return False
        return self.spec.match_file(f"{rel}/") or self.spec.match_file(rel)
29 changes: 24 additions & 5 deletions clearwing/sourcehunt/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from clearwing.analysis import SourceAnalyzer
from clearwing.analysis.source_analyzer import AnalyzerFinding as StaticFinding
from clearwing.analysis.source_analyzer import _GitignoreMatcher

from .callgraph import CallGraph, CallGraphBuilder
from .semgrep_sidecar import SemgrepSidecar
Expand Down Expand Up @@ -155,7 +156,12 @@ def _file_defines_constants(content_sample: str, language: str) -> bool:
return False


def _count_imports_by(repo_path: str, file_path: str, language: str) -> int:
def _count_imports_by(
repo_path: str,
file_path: str,
language: str,
gitignore: _GitignoreMatcher | None = None,
) -> int:
"""Cheap heuristic for `imports_by`: grep the repo for references to this
file's basename. Used as the v0.1 influence signal until v0.2's tree-sitter
callgraph lands.
Expand All @@ -180,11 +186,18 @@ def _count_imports_by(repo_path: str, file_path: str, language: str) -> int:

count = 0
for dirpath, dirnames, filenames in os.walk(repo_path):
dirnames[:] = [d for d in dirnames if d not in SourceAnalyzer.SKIP_DIRS]
dirnames[:] = [
d
for d in dirnames
if d not in SourceAnalyzer.SKIP_DIRS
and not (gitignore and gitignore.matches_dir(os.path.join(dirpath, d)))
]
for fname in filenames:
other = os.path.join(dirpath, fname)
if other == file_path:
continue
if gitignore and gitignore.matches_file(other):
continue
try:
if os.path.getsize(other) > SourceAnalyzer.MAX_FILE_SIZE:
continue
Expand Down Expand Up @@ -226,6 +239,7 @@ def __init__(
ingest_fuzz_corpora: bool = False, # v0.2 seam
run_taint: bool = False, # v0.4: tree-sitter taint analysis
max_imports_by_files: int = 1000, # cap the imports_by walk
respect_gitignore: bool = False,
):
self.repo_url = repo_url
self.branch = branch
Expand All @@ -237,6 +251,7 @@ def __init__(
self.ingest_fuzz_corpora = ingest_fuzz_corpora
self.run_taint = run_taint
self.max_imports_by_files = max_imports_by_files
self.respect_gitignore = respect_gitignore
self._analyzer: SourceAnalyzer | None = None

def run(self) -> PreprocessResult:
Expand All @@ -245,7 +260,11 @@ def run(self) -> PreprocessResult:

# Pre-scan for static findings — also gives us the file iterator
logger.info("Preprocessor: running static analyzer")
self._analyzer = SourceAnalyzer(repo_path=repo_path)
self._analyzer = SourceAnalyzer(
repo_path=repo_path,
respect_gitignore=self.respect_gitignore,
)
gitignore = _GitignoreMatcher.from_repo(repo_path) if self.respect_gitignore else None
analysis_result = self._analyzer.analyze()
static_findings = analysis_result.findings
logger.info(
Expand Down Expand Up @@ -313,7 +332,7 @@ def run(self) -> PreprocessResult:
# v0.1 imports_by — capped to keep large repos snappy
imports_by = 0
if len(file_targets) < imports_by_budget:
imports_by = _count_imports_by(repo_path, abs_path, language)
imports_by = _count_imports_by(repo_path, abs_path, language, gitignore)

target: FileTarget = {
"path": rel_path,
Expand Down Expand Up @@ -364,7 +383,7 @@ def run(self) -> PreprocessResult:

if self.run_semgrep:
try:
sidecar = SemgrepSidecar()
sidecar = SemgrepSidecar(respect_gitignore=self.respect_gitignore)
if sidecar.available:
semgrep_findings_objs = sidecar.run_scan(repo_path)
semgrep_findings = [_semgrep_finding_to_dict(f) for f in semgrep_findings_objs]
Expand Down
3 changes: 3 additions & 0 deletions clearwing/sourcehunt/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ def __init__(
gvisor_runtime: str | None = None,
preprocessing: bool = True,
seed_harness_crashes: bool = False,
respect_gitignore: bool = False,
*,
config: SourceHuntConfig | None = None,
):
Expand Down Expand Up @@ -439,6 +440,7 @@ def __init__(
self._gvisor_runtime = gvisor_runtime
self._preprocessing = preprocessing
self._seed_harness_crashes = seed_harness_crashes
self._respect_gitignore = respect_gitignore

def _inject_campaign_pool(
self,
Expand Down Expand Up @@ -1636,6 +1638,7 @@ def _preprocess(self) -> PreprocessResult:
propagate_reachability=(self.depth != "quick" and self._preprocessing),
run_semgrep=(self.depth != "quick" and self._preprocessing),
run_taint=(self.depth != "quick" and self._preprocessing),
respect_gitignore=self._respect_gitignore,
)
return pp.run()

Expand Down
28 changes: 14 additions & 14 deletions clearwing/sourcehunt/semgrep_sidecar.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,13 @@ def __init__(
extra_args: list[str] | None = None,
binary: str = "semgrep",
timeout_seconds: int | None = None,
respect_gitignore: bool = False,
):
self.config = config
self.extra_args = extra_args or []
self.binary = binary
self.timeout_seconds = timeout_seconds or SEMGREP_TIMEOUT_SECONDS
self.respect_gitignore = respect_gitignore

@property
def available(self) -> bool:
Expand All @@ -72,20 +74,18 @@ def run_scan(self, repo_path: str) -> list[SemgrepFinding]:
logger.debug("Semgrep binary not found; skipping")
return []

cmd = (
[
self.binary,
"scan",
"--json",
"--config",
self.config,
"--quiet",
"--no-git-ignore", # also scan ignored files — v0.1 choice
"--skip-unknown-extensions",
]
+ self.extra_args
+ [repo_path]
)
cmd = [
self.binary,
"scan",
"--json",
"--config",
self.config,
"--quiet",
"--skip-unknown-extensions",
]
if not self.respect_gitignore:
cmd.append("--no-git-ignore") # also scan ignored files — v0.1 choice
cmd = cmd + self.extra_args + [repo_path]

try:
proc = subprocess.run(
Expand Down
7 changes: 7 additions & 0 deletions clearwing/ui/commands/sourcehunt.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@ def add_parser(subparsers):
dest="seed_cves",
help="Auto-extract CVE history from git log as seed context",
)
parser.add_argument(
"--respect-gitignore",
action="store_true",
default=False,
help="Exclude files and directories matched by the target repo's root .gitignore",
)
parser.add_argument(
"--budget",
type=float,
Expand Down Expand Up @@ -901,6 +907,7 @@ def handle(cli, args):
enable_behavior_monitor=not getattr(args, "no_behavior_monitor", False),
enable_artifact_store=getattr(args, "encrypt_artifacts", False),
gvisor_runtime="runsc" if getattr(args, "gvisor", False) else None,
respect_gitignore=args.respect_gitignore,
)

cli.console.print(
Expand Down
1 change: 1 addition & 0 deletions docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ runs, else 1.
[--min-shard-rank N] # minimum file rank for sharding (default: 4)
[--seed-corpus PATH] # local seed corpus directory
[--seed-cves] # extract CVE history from git log as seed context
[--respect-gitignore] # exclude files matched by the repo root .gitignore
```

For high-ranked files in large projects, shards agents by function-level
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ dependencies = [
"reportlab>=4.0.0",
"genai-pyo3>=0.1.16",
"pydantic>=2.0.0",
"pathspec>=0.10.0",
"docker>=7.0.0",
# TUI
"textual>=0.47.0",
Expand Down
17 changes: 17 additions & 0 deletions tests/test_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,23 @@ def test_skips_git_directory(self):
result = analyzer.analyze()
assert result.files_analyzed == 0

def test_respect_gitignore_skips_ignored_files(self):
    """With respect_gitignore=True only the non-ignored file is analyzed and reported."""
    with tempfile.TemporaryDirectory() as tmp:
        repo = Path(tmp)
        (repo / ".gitignore").write_text(".next/\npublic/sw.js\n")

        # An ignored directory tree, an ignored single file, and one kept file,
        # all containing the same flagged snippet.
        ignored_tree = repo / ".next" / "server"
        ignored_tree.mkdir(parents=True)
        (ignored_tree / "webpack.js").write_text("eval(userCode);")

        public = repo / "public"
        public.mkdir()
        (public / "sw.js").write_text("eval(userCode);")

        (repo / "src.js").write_text("eval(userCode);")

        result = SourceAnalyzer(repo_path=tmp, respect_gitignore=True).analyze()

        assert result.files_analyzed == 1
        reported_names = [Path(finding.file_path).name for finding in result.findings]
        assert reported_names == ["src.js"]

def test_deduplication(self):
# Same pattern matched twice at same location should be deduped
result = self._analyze_code(
Expand Down
16 changes: 16 additions & 0 deletions tests/test_sourcehunt_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,22 @@ def test_file_targets_have_required_fields(self):
assert ft["transitive_callers"] == 0
assert ft["fuzz_harness_path"] is None

def test_respect_gitignore_filters_file_targets_and_static_findings(self, tmp_path):
    """Ignored paths show up neither in file_targets nor in static_findings."""
    ignored_tree = tmp_path / ".next" / "server"
    (tmp_path / ".gitignore").write_text(".next/\n")
    ignored_tree.mkdir(parents=True)
    (ignored_tree / "webpack.js").write_text("eval(userCode);\n")
    (tmp_path / "src.js").write_text("eval(userCode);\n")

    repo = str(tmp_path)
    preprocessor = Preprocessor(repo_url=repo, local_path=repo, respect_gitignore=True)
    result = preprocessor.run()

    target_paths = [target["path"] for target in result.file_targets]
    assert target_paths == ["src.js"]
    finding_names = [Path(finding.file_path).name for finding in result.static_findings]
    assert finding_names == ["src.js"]

def test_codec_limits_h_tagged_memory_unsafe(self):
pp = Preprocessor(
repo_url=str(FIXTURE_C_PROPAGATION),
Expand Down
Loading