diff --git a/CHANGELOG.md b/CHANGELOG.md index 626fac385..949be565b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Full release notes with details on each version: [GitHub Releases](https://githu ## Unreleased +- Fix: an edited Office source now re-enters `--update`, and unchanged PDFs/Office files are no longer re-parsed on every run (#1649 / #1656, thanks @Ns2384-star). A modified `.docx`/`.xlsx` reused a byte-identical, deterministically-named sidecar whose content was never refreshed, so `graphify update` never re-extracted it (#1649); meanwhile every incremental run re-unzipped each Office file and re-parsed each PDF just to recount words (#1656). The `.docx`/`.xlsx` sidecar header now records a raw-byte `source-md5` fingerprint of the source — reading the bytes doesn't unzip the OOXML container, so it's cheap. A matching fingerprint returns the existing sidecar without re-parsing or rewriting it (preserving the #1226 unchanged-mtime no-churn guarantee), while a differing or legacy-missing fingerprint re-parses and rewrites so the new sidecar mtime/hash flows the edit through `detect_incremental`. Per-file word counts are also cached in the manifest and reused for any file whose mtime is unchanged, so an unchanged PDF/Office file is parsed once rather than on every run. - Fix: a malformed semantic chunk no longer crashes `extract` and discards every successful chunk (#1631, thanks @ssazy). When an LLM returned a well-formed object whose `edges` (or `nodes`/`hyperedges`) array carried a stray non-dict entry — a nested list where an edge object belongs — the AST+semantic merge and the semantic-cache write both called `.get()` per entry and raised `AttributeError: 'list' object has no attribute 'get'`. On a 34-chunk run where 33 succeeded, that meant no `graph.json` was written and the cache write failed too, so a re-run re-extracted everything. `_parse_llm_json` now sanitizes each fragment at the single parse chokepoint (keeping only dict entries and coercing a non-list value to `[]`), so the cache writer, the adaptive-retry merge, and the CLI merge are all protected in one place. - Fix: an unresolved bare npm import no longer aliases onto an unrelated same-named local file (#1638, thanks @EveX1). `import colors from "tailwindcss/colors"` in a `.tsx` file emitted an `imports_from` edge to the bare id `colors`, and build.py's pre-migration alias index (which registers every local file's bare stem) then remapped it onto an unrelated `backend/utils/colors.py` — a confident (`EXTRACTED`) cross-language phantom edge, and one per `.tsx` file sharing the import. In a real monorepo eight unrelated `.tsx` files all landed on a single Python module. Common package subpaths (`colors`, `utils`, `types`, `config`, `client`) collide this way constantly. The external-import fallback now namespaces its target with the `ref` prefix (the same J-4 convention used for tsconfig `extends`/`$ref` externals), so it can never collapse to a local file/symbol id; the ref-namespaced target has no node, so build drops it as an external reference — the correct outcome for a third-party import. - Fix: `graph.json` node/edge ordering is now stable run-to-run for document/semantic corpora (#1632, thanks @umeshpsatwe). With a parallel LLM backend, `extract_corpus_parallel` merged chunk results in completion order, so which network call happened to return first reordered the nodes and edges even when the model returned identical content — churning `graph.json` between otherwise-identical runs. Chunks are now merged in deterministic submission order after the pool drains (matching the serial path); the progress callback still fires in completion order so long local runs aren't silent. Note: the semantic content the LLM extracts is itself nondeterministic run-to-run — this fix removes the pipeline's own ordering churn, not the model's variance. diff --git a/graphify/detect.py b/graphify/detect.py index 0e1c4ba30..ffc1d2512 100644 --- a/graphify/detect.py +++ b/graphify/detect.py @@ -601,6 +601,30 @@ def _edge(src: str, tgt: str, relation: str) -> None: return {"nodes": nodes, "edges": edges} +# Sidecar header records the source's raw-byte fingerprint so a later run can +# tell whether the Office file changed without re-parsing it (#1649, #1656). +# Anchor to the ` | source-md5: -->` delimiter/terminator at the END of the +# header line so a source *filename* that happens to contain a "source-md5: ..." +# substring can't be captured as the fingerprint — otherwise the real fingerprint +# would never match and the file would re-parse+rewrite (and re-queue) every run. +_SIDECAR_SOURCE_FP_RE = re.compile(r"\| source-md5:\s*([0-9a-f]+)\s*-->\s*$") + + +def _read_sidecar_source_fingerprint(out_path: Path) -> str | None: + """Read the source fingerprint stored in an existing sidecar's header. + + Returns ``None`` for legacy sidecars written before the fingerprint was + added, so they are treated as needing a refresh on the next run. + """ + try: + with out_path.open("r", encoding="utf-8") as fh: + first_line = fh.readline() + except OSError: + return None + m = _SIDECAR_SOURCE_FP_RE.search(first_line) + return m.group(1) if m else None + + def convert_office_file(path: Path, out_dir: Path) -> Path | None: """Convert a .docx or .xlsx to a markdown sidecar in out_dir. @@ -608,14 +632,7 @@ def convert_office_file(path: Path, out_dir: Path) -> Path | None: or the required library is not installed. """ ext = path.suffix.lower() - if ext == ".docx": - text = docx_to_markdown(path) - elif ext == ".xlsx": - text = xlsx_to_markdown(path) - else: - return None - - if not text.strip(): + if ext not in (".docx", ".xlsx"): return None out_dir.mkdir(parents=True, exist_ok=True) @@ -630,13 +647,31 @@ def convert_office_file(path: Path, out_dir: Path) -> Path | None: normalized_path = unicodedata.normalize("NFC", str(path.resolve())) name_hash = hashlib.sha256(normalized_path.encode()).hexdigest()[:8] out_path = out_dir / f"{path.stem}_{name_hash}.md" - # Once the hash is stable the sidecar name is deterministic; skip re-writing - # an existing sidecar so an unchanged source never churns its mtime (which - # would still flag it as changed in detect_incremental). + + # Fingerprint the SOURCE by its raw bytes (md5). Reading the bytes does NOT + # unzip/parse the OOXML container, so this is cheap — that is the whole + # point. If a sidecar already exists and records the same fingerprint the + # source is unchanged: return it WITHOUT parsing (avoids the per-run + # re-parse of #1656) and WITHOUT rewriting (so an unchanged source never + # churns its mtime, which detect_incremental would otherwise flag as + # changed — #1226). If the fingerprint differs (or is missing, e.g. a legacy + # sidecar) the source was edited: re-parse and rewrite so the change flows + # through detect_incremental via the sidecar's new mtime/md5 (#1649). + source_fp = _md5_file(path) if out_path.exists(): - return out_path + if not source_fp or _read_sidecar_source_fingerprint(out_path) == source_fp: + return out_path + + if ext == ".docx": + text = docx_to_markdown(path) + else: + text = xlsx_to_markdown(path) + + if not text.strip(): + return None + out_path.write_text( - f"\n\n{text}", + f"\n\n{text}", encoding="utf-8", ) return out_path @@ -1015,7 +1050,14 @@ def _resolves_under_root(path: Path, root: Path) -> bool: return True -def detect(root: Path, *, follow_symlinks: bool | None = None, google_workspace: bool | None = None, extra_excludes: list[str] | None = None) -> dict: +def detect( + root: Path, + *, + follow_symlinks: bool | None = None, + google_workspace: bool | None = None, + extra_excludes: list[str] | None = None, + prior_word_counts: dict[str, tuple[float, int]] | None = None, +) -> dict: root = root.resolve() if follow_symlinks is None: follow_symlinks = False @@ -1029,6 +1071,21 @@ def detect(root: Path, *, follow_symlinks: bool | None = None, google_workspace: } total_words = 0 + # Word counting parses PDFs (extract_pdf_text) and reads sidecars on every + # file; re-doing it for unchanged files on each incremental run is pure waste + # (#1656). When the caller supplies the previous run's per-file counts, reuse + # the cached count for any file whose mtime is unchanged instead of parsing. + def _count_words(fp: Path) -> int: + if prior_word_counts is not None: + cached = prior_word_counts.get(str(fp)) + if cached is not None: + try: + if fp.stat().st_mtime == cached[0]: + return cached[1] + except OSError: + pass + return count_words(fp) + skipped_sensitive: list[str] = [] ignore_patterns = _load_graphifyignore(root) ignore_cache: dict[Path, bool] = {} # shared across all _is_ignored calls in this scan @@ -1133,7 +1190,7 @@ def detect(root: Path, *, follow_symlinks: bool | None = None, google_workspace: if _is_ignored(md_path, root, ignore_patterns, _cache=ignore_cache): continue files[ftype].append(str(md_path)) - total_words += count_words(md_path) + total_words += _count_words(md_path) else: skipped_sensitive.append(str(p) + " [Google Workspace export produced no readable text]") continue @@ -1144,14 +1201,14 @@ def detect(root: Path, *, follow_symlinks: bool | None = None, google_workspace: if _is_ignored(md_path, root, ignore_patterns, _cache=ignore_cache): continue files[ftype].append(str(md_path)) - total_words += count_words(md_path) + total_words += _count_words(md_path) else: # Conversion failed (library not installed) - skip with note skipped_sensitive.append(str(p) + " [office conversion failed - pip install graphifyy[office]]") continue files[ftype].append(str(p)) if ftype != FileType.VIDEO: - total_words += count_words(p) + total_words += _count_words(p) for ftype in files: files[ftype].sort() @@ -1323,6 +1380,9 @@ def _normalise_entry(entry): continue all_files = [f for file_list in files.values() for f in file_list] + # Video files are never word-counted (mirrors detect(), and avoids reading + # large media in as text). + video_set = set(files.get("video", [])) with ThreadPoolExecutor() as pool: raw = pool.map(_stat_and_hash, all_files) hashed: dict[str, tuple[float, str]] = { @@ -1334,6 +1394,14 @@ def _normalise_entry(entry): continue # file deleted between detect() and manifest write mtime, h = hashed[f] prev = _normalise_entry(existing.get(f, {})) or {} + # Compare the file's content hash (h) against the prior hash of the SAME + # kind so an unchanged file is recognised regardless of which pipeline + # wrote the manifest. ast/both key off the prior ast_hash (unchanged + # behaviour); a semantic-only manifest never populates ast_hash, so key + # off the prior semantic_hash there — otherwise content_unchanged would + # always be False and the word_count cache below could never be reused. + prior_hash = prev.get("semantic_hash", "") if kind == "semantic" else prev.get("ast_hash", "") + content_unchanged = h == prior_hash entry: dict = {"mtime": mtime} if kind in ("ast", "both"): entry["ast_hash"] = h @@ -1343,7 +1411,17 @@ def _normalise_entry(entry): entry["semantic_hash"] = h else: # Preserve semantic_hash only when content is unchanged - entry["semantic_hash"] = prev.get("semantic_hash", "") if h == prev.get("ast_hash", "") else "" + entry["semantic_hash"] = prev.get("semantic_hash", "") if content_unchanged else "" + # Cache the word count so detect() can reuse it for unchanged files + # rather than re-parsing (esp. PDFs) on every incremental run (#1656). + # Reuse the previous count whenever the content hash is unchanged; only + # (re)parse a genuinely new or changed file. + if f in video_set: + entry["word_count"] = 0 + elif content_unchanged and isinstance(prev.get("word_count"), int): + entry["word_count"] = prev["word_count"] + else: + entry["word_count"] = count_words(Path(f)) manifest[f] = entry if root is not None: # Persist in portable form: forward-slash relative paths. Keys outside @@ -1386,11 +1464,27 @@ def detect_incremental( runs. ``None`` (default) does not follow symlinked directories; callers must opt in explicitly, and resolved targets outside the scan root are skipped. """ - full = detect(root, follow_symlinks=follow_symlinks, google_workspace=google_workspace, extra_excludes=extra_excludes) # Pass ``root`` so a manifest written with relative keys (post-#777) is # re-anchored to the absolute form the rest of this function compares - # against. Legacy absolute-keyed manifests pass through unchanged. + # against. Legacy absolute-keyed manifests pass through unchanged. Loaded + # before detect() so the cached per-file word counts can be handed down and + # reused for unchanged files (avoids re-parsing every PDF each run — #1656). manifest = load_manifest(manifest_path, root=root) + prior_word_counts: dict[str, tuple[float, int]] = {} + for key, entry in manifest.items(): + if isinstance(entry, dict): + wc = entry.get("word_count") + mt = entry.get("mtime") + if isinstance(wc, int) and isinstance(mt, (int, float)): + prior_word_counts[key] = (mt, wc) + + full = detect( + root, + follow_symlinks=follow_symlinks, + google_workspace=google_workspace, + extra_excludes=extra_excludes, + prior_word_counts=prior_word_counts, + ) if not manifest: # No previous run - treat everything as new diff --git a/tests/test_detect.py b/tests/test_detect.py index 458bb6a84..7e71e5566 100644 --- a/tests/test_detect.py +++ b/tests/test_detect.py @@ -1538,3 +1538,230 @@ def test_convert_office_file_does_not_rewrite_existing_sidecar(tmp_path, monkeyp second = detect_mod.convert_office_file(src, out_dir) assert second == first assert second.stat().st_mtime_ns == mtime_before + + +def _flat(files: dict) -> list: + return [p for bucket in files.values() for p in bucket] + + +def test_convert_office_file_reparses_when_source_changes(tmp_path, monkeypatch): + """A source fingerprint in the sidecar header lets convert_office_file skip + re-parsing an unchanged .docx (#1656) yet re-parse + rewrite an edited one so + the change flows through detect_incremental (#1649).""" + calls = {"n": 0} + + def fake_docx(path): + calls["n"] += 1 + return path.read_bytes().decode("latin-1", "ignore") + + monkeypatch.setattr(detect_mod, "docx_to_markdown", fake_docx) + + out_dir = tmp_path / "converted" + src = tmp_path / "report.docx" + src.write_bytes(b"version one") + + first = detect_mod.convert_office_file(src, out_dir) + assert first is not None + assert calls["n"] == 1 + assert "version one" in first.read_text(encoding="utf-8") + mtime_before = first.stat().st_mtime_ns + + # Source unchanged: fingerprint matches -> no re-parse, no rewrite (#1656/#1226). + second = detect_mod.convert_office_file(src, out_dir) + assert second == first + assert calls["n"] == 1 + assert second.stat().st_mtime_ns == mtime_before + + # Source edited: fingerprint differs -> re-parse + rewrite the sidecar (#1649). + src.write_bytes(b"version two, now considerably longer") + third = detect_mod.convert_office_file(src, out_dir) + assert third == first # deterministic sidecar name is unchanged + assert calls["n"] == 2 + assert "version two" in third.read_text(encoding="utf-8") + + +def test_convert_office_file_reparses_xlsx_on_edit(tmp_path, monkeypatch): + """Same source-change detection for .xlsx sources (#1649/#1656).""" + calls = {"n": 0} + + def fake_xlsx(path): + calls["n"] += 1 + return "sheet: " + path.read_bytes().decode("latin-1", "ignore") + + monkeypatch.setattr(detect_mod, "xlsx_to_markdown", fake_xlsx) + + out_dir = tmp_path / "converted" + src = tmp_path / "budget.xlsx" + src.write_bytes(b"q1") + + first = detect_mod.convert_office_file(src, out_dir) + assert first is not None and calls["n"] == 1 + + detect_mod.convert_office_file(src, out_dir) # unchanged + assert calls["n"] == 1 + + src.write_bytes(b"q1 q2 q3 q4") # edited + detect_mod.convert_office_file(src, out_dir) + assert calls["n"] == 2 + + +def test_convert_office_file_upgrades_legacy_sidecar(tmp_path, monkeypatch): + """A sidecar written before source fingerprints existed (no source-md5 in the + header) is treated as stale and rewritten once, then reused thereafter.""" + monkeypatch.setattr(detect_mod, "docx_to_markdown", lambda p: "hello world") + + out_dir = tmp_path / "converted" + src = tmp_path / "legacy.docx" + src.write_bytes(b"body") + + # First conversion writes a fingerprinted sidecar. + sidecar = detect_mod.convert_office_file(src, out_dir) + assert sidecar is not None + assert detect_mod._read_sidecar_source_fingerprint(sidecar) is not None + + # Simulate an old (pre-fingerprint) sidecar header and confirm a run refreshes it. + sidecar.write_text("\n\nhello world", encoding="utf-8") + assert detect_mod._read_sidecar_source_fingerprint(sidecar) is None + refreshed = detect_mod.convert_office_file(src, out_dir) + assert refreshed == sidecar + assert detect_mod._read_sidecar_source_fingerprint(sidecar) is not None + + +def test_detect_incremental_reenters_edited_office_file(tmp_path, monkeypatch): + """End-to-end: after an Office source is edited, detect_incremental must + surface its sidecar as changed so `--update` re-extracts it (#1649). An + unchanged source stays out of the changed set (#1226/#1656).""" + monkeypatch.setattr( + detect_mod, + "docx_to_markdown", + lambda p: p.read_bytes().decode("latin-1", "ignore"), + ) + + manifest_path = str(tmp_path / "graphify-out" / "manifest.json") + src = tmp_path / "spec.docx" + src.write_bytes(b"first draft") + + r1 = detect_incremental(tmp_path, manifest_path) + sidecars = r1["files"]["document"] + assert sidecars, "office sidecar should be classified as a document" + save_manifest(r1["files"], manifest_path, root=tmp_path) + + # Unchanged run: sidecar is not re-queued. + r2 = detect_incremental(tmp_path, manifest_path) + assert not r2["new_files"]["document"] + assert r2["unchanged_files"]["document"] == sidecars + save_manifest(r2["files"], manifest_path, root=tmp_path) + + # Edit the source: the sidecar is rewritten and re-queued as changed. + src.write_bytes(b"second draft, substantially revised and longer") + r3 = detect_incremental(tmp_path, manifest_path) + assert r3["new_files"]["document"] == sidecars + + +def test_detect_incremental_pdf_not_reparsed_when_unchanged(tmp_path, monkeypatch): + """An unchanged PDF must not be re-parsed for its word count on a later + incremental run — the count is cached in the manifest and reused (#1656).""" + calls = {"n": 0} + + def fake_pdf(path): + calls["n"] += 1 + return "alpha beta gamma delta" + + monkeypatch.setattr(detect_mod, "extract_pdf_text", fake_pdf) + + manifest_path = str(tmp_path / "graphify-out" / "manifest.json") + pdf = tmp_path / "paper.pdf" + pdf.write_bytes(b"%PDF-1.4 stub content") + + r1 = detect_incremental(tmp_path, manifest_path) + assert any(p.endswith("paper.pdf") for p in _flat(r1["files"])) + save_manifest(r1["files"], manifest_path, root=tmp_path) + after_run1 = calls["n"] + assert after_run1 >= 1 # parsed at least once to seed the cached word count + + r2 = detect_incremental(tmp_path, manifest_path) + save_manifest(r2["files"], manifest_path, root=tmp_path) + assert calls["n"] == after_run1 # #1656: unchanged PDF is not re-parsed + assert not any(p.endswith("paper.pdf") for p in _flat(r2["new_files"])) + assert any(p.endswith("paper.pdf") for p in _flat(r2["unchanged_files"])) + + +def test_convert_office_file_pathological_source_filename_fingerprint(tmp_path, monkeypatch): + """A source *filename* that itself contains a ``source-md5: `` substring + must not fool the sidecar fingerprint reader. The header interpolates the + filename before the real trailing fingerprint, so a naive un-anchored regex + would capture the filename's fake hash — the real fingerprint would then + never match and the file would re-parse + rewrite (and re-queue) on EVERY + run. The reader anchors to the trailing delimiter so only the real + fingerprint matches and an unchanged source is parsed exactly once (#1656).""" + calls = {"n": 0} + + def fake_docx(path): + calls["n"] += 1 + return path.read_bytes().decode("latin-1", "ignore") + + monkeypatch.setattr(detect_mod, "docx_to_markdown", fake_docx) + + out_dir = tmp_path / "converted" + # Filename embeds a decoy "source-md5: deadbeef" substring. + src = tmp_path / "report source-md5: deadbeef.docx" + src.write_bytes(b"payload one") + + first = detect_mod.convert_office_file(src, out_dir) + assert first is not None + assert calls["n"] == 1 + # The stored fingerprint is the real source md5, not the filename's decoy. + stored = detect_mod._read_sidecar_source_fingerprint(first) + assert stored is not None and stored != "deadbeef" + + # Unchanged source: must NOT re-parse despite the misleading filename. + second = detect_mod.convert_office_file(src, out_dir) + assert second == first + assert calls["n"] == 1 + + # An actual edit still re-parses. + src.write_bytes(b"payload two, now different") + third = detect_mod.convert_office_file(src, out_dir) + assert third == first + assert calls["n"] == 2 + + +def test_save_manifest_semantic_kind_reuses_cached_word_count(tmp_path, monkeypatch): + """For a semantic-only manifest ``ast_hash`` is never populated, so keying the + word_count cache off it would always miss. save_manifest keys the reuse check + off the prior hash of the matching kind, so an unchanged file's cached count + is reused rather than recomputed on the next kind='semantic' write (#1656).""" + from graphify.detect import load_manifest + + calls = {"n": 0} + real_count_words = detect_mod.count_words + + def counting(path): + calls["n"] += 1 + return real_count_words(path) + + monkeypatch.setattr(detect_mod, "count_words", counting) + + manifest_path = str(tmp_path / "graphify-out" / "manifest.json") + doc = tmp_path / "notes.md" + doc.write_text("alpha beta gamma delta epsilon", encoding="utf-8") + files = {"document": [str(doc)]} + + save_manifest(files, manifest_path, kind="semantic", root=tmp_path) + assert calls["n"] == 1 # parsed once to seed the cache + stored = load_manifest(manifest_path, root=tmp_path) + (entry,) = stored.values() + assert entry["word_count"] == 5 + assert entry.get("ast_hash", "") == "" # semantic-only: ast_hash stays empty + + # Second semantic write, content unchanged: cached count is reused. + save_manifest(files, manifest_path, kind="semantic", root=tmp_path) + assert calls["n"] == 1 + + # A real content change must still recompute. + doc.write_text("one two three", encoding="utf-8") + save_manifest(files, manifest_path, kind="semantic", root=tmp_path) + assert calls["n"] == 2 + stored = load_manifest(manifest_path, root=tmp_path) + (entry,) = stored.values() + assert entry["word_count"] == 3