diff --git a/CHANGELOG.md b/CHANGELOG.md
index 626fac385..949be565b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@ Full release notes with details on each version: [GitHub Releases](https://githu
 
 ## Unreleased
 
+- Fix: an edited Office source now re-enters `--update`, and unchanged PDFs/Office files are no longer re-parsed on every run (#1649 / #1656, thanks @Ns2384-star). A modified `.docx`/`.xlsx` reused a byte-identical, deterministically-named sidecar whose content was never refreshed, so `graphify update` never re-extracted it (#1649); meanwhile every incremental run re-unzipped each Office file and re-parsed each PDF just to recount words (#1656). The `.docx`/`.xlsx` sidecar header now records a raw-byte `source-md5` fingerprint of the source — reading the bytes doesn't unzip the OOXML container, so it's cheap. A matching fingerprint returns the existing sidecar without re-parsing or rewriting it (preserving the #1226 unchanged-mtime no-churn guarantee), while a differing or legacy-missing fingerprint re-parses and rewrites so the new sidecar mtime/hash flows the edit through `detect_incremental`. Per-file word counts are also cached in the manifest and reused for any file whose mtime is unchanged, so an unchanged PDF/Office file is parsed once rather than on every run.
 - Fix: a malformed semantic chunk no longer crashes `extract` and discards every successful chunk (#1631, thanks @ssazy). When an LLM returned a well-formed object whose `edges` (or `nodes`/`hyperedges`) array carried a stray non-dict entry — a nested list where an edge object belongs — the AST+semantic merge and the semantic-cache write both called `.get()` per entry and raised `AttributeError: 'list' object has no attribute 'get'`. On a 34-chunk run where 33 succeeded, that meant no `graph.json` was written and the cache write failed too, so a re-run re-extracted everything. `_parse_llm_json` now sanitizes each fragment at the single parse chokepoint (keeping only dict entries and coercing a non-list value to `[]`), so the cache writer, the adaptive-retry merge, and the CLI merge are all protected in one place.
 - Fix: an unresolved bare npm import no longer aliases onto an unrelated same-named local file (#1638, thanks @EveX1). `import colors from "tailwindcss/colors"` in a `.tsx` file emitted an `imports_from` edge to the bare id `colors`, and build.py's pre-migration alias index (which registers every local file's bare stem) then remapped it onto an unrelated `backend/utils/colors.py` — a confident (`EXTRACTED`) cross-language phantom edge, and one per `.tsx` file sharing the import. In a real monorepo eight unrelated `.tsx` files all landed on a single Python module. Common package subpaths (`colors`, `utils`, `types`, `config`, `client`) collide this way constantly. The external-import fallback now namespaces its target with the `ref` prefix (the same J-4 convention used for tsconfig `extends`/`$ref` externals), so it can never collapse to a local file/symbol id; the ref-namespaced target has no node, so build drops it as an external reference — the correct outcome for a third-party import.
 - Fix: `graph.json` node/edge ordering is now stable run-to-run for document/semantic corpora (#1632, thanks @umeshpsatwe). With a parallel LLM backend, `extract_corpus_parallel` merged chunk results in completion order, so which network call happened to return first reordered the nodes and edges even when the model returned identical content — churning `graph.json` between otherwise-identical runs. Chunks are now merged in deterministic submission order after the pool drains (matching the serial path); the progress callback still fires in completion order so long local runs aren't silent. Note: the semantic content the LLM extracts is itself nondeterministic run-to-run — this fix removes the pipeline's own ordering churn, not the model's variance.
diff --git a/graphify/detect.py b/graphify/detect.py
index 0e1c4ba30..ffc1d2512 100644
--- a/graphify/detect.py
+++ b/graphify/detect.py
@@ -601,6 +601,30 @@ def _edge(src: str, tgt: str, relation: str) -> None:
     return {"nodes": nodes, "edges": edges}
 
 
+# Sidecar header records the source's raw-byte fingerprint so a later run can
+# tell whether the Office file changed without re-parsing it (#1649, #1656).
+# Anchor to the ` | source-md5: <fp> -->` delimiter/terminator at the END of the
+# header line so a source *filename* that happens to contain a "source-md5: ..."
+# substring can't be captured as the fingerprint — otherwise the real fingerprint
+# would never match and the file would re-parse+rewrite (and re-queue) every run.
+_SIDECAR_SOURCE_FP_RE = re.compile(r"\| source-md5:\s*([0-9a-f]+)\s*-->\s*$")
+
+
+def _read_sidecar_source_fingerprint(out_path: Path) -> str | None:
+    """Read the source fingerprint stored in an existing sidecar's header.
+
+    Returns ``None`` for legacy sidecars written before the fingerprint was
+    added, so they are treated as needing a refresh on the next run.
+    """
+    try:
+        with out_path.open("r", encoding="utf-8") as fh:
+            first_line = fh.readline()
+    except OSError:
+        return None
+    m = _SIDECAR_SOURCE_FP_RE.search(first_line)
+    return m.group(1) if m else None
+
+
 def convert_office_file(path: Path, out_dir: Path) -> Path | None:
     """Convert a .docx or .xlsx to a markdown sidecar in out_dir.
 
@@ -608,14 +632,7 @@ def convert_office_file(path: Path, out_dir: Path) -> Path | None:
     or the required library is not installed.
     """
     ext = path.suffix.lower()
-    if ext == ".docx":
-        text = docx_to_markdown(path)
-    elif ext == ".xlsx":
-        text = xlsx_to_markdown(path)
-    else:
-        return None
-
-    if not text.strip():
+    if ext not in (".docx", ".xlsx"):
         return None
 
     out_dir.mkdir(parents=True, exist_ok=True)
@@ -630,13 +647,31 @@ def convert_office_file(path: Path, out_dir: Path) -> Path | None:
     normalized_path = unicodedata.normalize("NFC", str(path.resolve()))
     name_hash = hashlib.sha256(normalized_path.encode()).hexdigest()[:8]
     out_path = out_dir / f"{path.stem}_{name_hash}.md"
-    # Once the hash is stable the sidecar name is deterministic; skip re-writing
-    # an existing sidecar so an unchanged source never churns its mtime (which
-    # would still flag it as changed in detect_incremental).
+
+    # Fingerprint the SOURCE by its raw bytes (md5). Reading the bytes does NOT
+    # unzip/parse the OOXML container, so this is cheap — that is the whole
+    # point. If a sidecar already exists and records the same fingerprint the
+    # source is unchanged: return it WITHOUT parsing (avoids the per-run
+    # re-parse of #1656) and WITHOUT rewriting (so an unchanged source never
+    # churns its mtime, which detect_incremental would otherwise flag as
+    # changed — #1226). If the fingerprint differs (or is missing, e.g. a legacy
+    # sidecar) the source was edited: re-parse and rewrite so the change flows
+    # through detect_incremental via the sidecar's new mtime/md5 (#1649).
+    source_fp = _md5_file(path)
     if out_path.exists():
-        return out_path
+        if not source_fp or _read_sidecar_source_fingerprint(out_path) == source_fp:
+            return out_path
+
+    if ext == ".docx":
+        text = docx_to_markdown(path)
+    else:
+        text = xlsx_to_markdown(path)
+
+    if not text.strip():
+        return None
+
     out_path.write_text(
-        f"<!-- converted from {path.name} -->\n\n{text}",
+        f"<!-- converted from {path.name} | source-md5: {source_fp} -->\n\n{text}",
         encoding="utf-8",
     )
     return out_path
@@ -1015,7 +1050,14 @@ def _resolves_under_root(path: Path, root: Path) -> bool:
     return True
 
 
-def detect(root: Path, *, follow_symlinks: bool | None = None, google_workspace: bool | None = None, extra_excludes: list[str] | None = None) -> dict:
+def detect(
+    root: Path,
+    *,
+    follow_symlinks: bool | None = None,
+    google_workspace: bool | None = None,
+    extra_excludes: list[str] | None = None,
+    prior_word_counts: dict[str, tuple[float, int]] | None = None,
+) -> dict:
     root = root.resolve()
     if follow_symlinks is None:
         follow_symlinks = False
@@ -1029,6 +1071,21 @@ def detect(root: Path, *, follow_symlinks: bool | None = None, google_workspace:
     }
     total_words = 0
 
+    # Word counting parses PDFs (extract_pdf_text) and reads sidecars on every
+    # file; re-doing it for unchanged files on each incremental run is pure waste
+    # (#1656). When the caller supplies the previous run's per-file counts, reuse
+    # the cached count for any file whose mtime is unchanged instead of parsing.
+    def _count_words(fp: Path) -> int:
+        if prior_word_counts is not None:
+            cached = prior_word_counts.get(str(fp))
+            if cached is not None:
+                try:
+                    if fp.stat().st_mtime == cached[0]:
+                        return cached[1]
+                except OSError:
+                    pass
+        return count_words(fp)
+
     skipped_sensitive: list[str] = []
     ignore_patterns = _load_graphifyignore(root)
     ignore_cache: dict[Path, bool] = {}  # shared across all _is_ignored calls in this scan
@@ -1133,7 +1190,7 @@ def detect(root: Path, *, follow_symlinks: bool | None = None, google_workspace:
                     if _is_ignored(md_path, root, ignore_patterns, _cache=ignore_cache):
                         continue
                     files[ftype].append(str(md_path))
-                    total_words += count_words(md_path)
+                    total_words += _count_words(md_path)
                 else:
                     skipped_sensitive.append(str(p) + " [Google Workspace export produced no readable text]")
                 continue
@@ -1144,14 +1201,14 @@ def detect(root: Path, *, follow_symlinks: bool | None = None, google_workspace:
                     if _is_ignored(md_path, root, ignore_patterns, _cache=ignore_cache):
                         continue
                     files[ftype].append(str(md_path))
-                    total_words += count_words(md_path)
+                    total_words += _count_words(md_path)
                 else:
                     # Conversion failed (library not installed) - skip with note
                     skipped_sensitive.append(str(p) + " [office conversion failed - pip install graphifyy[office]]")
                 continue
             files[ftype].append(str(p))
             if ftype != FileType.VIDEO:
-                total_words += count_words(p)
+                total_words += _count_words(p)
 
     for ftype in files:
         files[ftype].sort()
@@ -1323,6 +1380,9 @@ def _normalise_entry(entry):
             continue
 
     all_files = [f for file_list in files.values() for f in file_list]
+    # Video files are never word-counted (mirrors detect(), and avoids reading
+    # large media in as text).
+    video_set = set(files.get("video", []))
     with ThreadPoolExecutor() as pool:
         raw = pool.map(_stat_and_hash, all_files)
     hashed: dict[str, tuple[float, str]] = {
@@ -1334,6 +1394,14 @@ def _normalise_entry(entry):
             continue  # file deleted between detect() and manifest write
         mtime, h = hashed[f]
         prev = _normalise_entry(existing.get(f, {})) or {}
+        # Compare the file's content hash (h) against the prior hash of the SAME
+        # kind so an unchanged file is recognised regardless of which pipeline
+        # wrote the manifest. ast/both key off the prior ast_hash (unchanged
+        # behaviour); a semantic-only manifest never populates ast_hash, so key
+        # off the prior semantic_hash there — otherwise content_unchanged would
+        # always be False and the word_count cache below could never be reused.
+        prior_hash = prev.get("semantic_hash", "") if kind == "semantic" else prev.get("ast_hash", "")
+        content_unchanged = h == prior_hash
         entry: dict = {"mtime": mtime}
         if kind in ("ast", "both"):
             entry["ast_hash"] = h
@@ -1343,7 +1411,17 @@ def _normalise_entry(entry):
             entry["semantic_hash"] = h
         else:
             # Preserve semantic_hash only when content is unchanged
-            entry["semantic_hash"] = prev.get("semantic_hash", "") if h == prev.get("ast_hash", "") else ""
+            entry["semantic_hash"] = prev.get("semantic_hash", "") if content_unchanged else ""
+        # Cache the word count so detect() can reuse it for unchanged files
+        # rather than re-parsing (esp. PDFs) on every incremental run (#1656).
+        # Reuse the previous count whenever the content hash is unchanged; only
+        # (re)parse a genuinely new or changed file.
+        if f in video_set:
+            entry["word_count"] = 0
+        elif content_unchanged and isinstance(prev.get("word_count"), int):
+            entry["word_count"] = prev["word_count"]
+        else:
+            entry["word_count"] = count_words(Path(f))
         manifest[f] = entry
     if root is not None:
         # Persist in portable form: forward-slash relative paths. Keys outside
@@ -1386,11 +1464,27 @@ def detect_incremental(
     runs. ``None`` (default) does not follow symlinked directories; callers must
     opt in explicitly, and resolved targets outside the scan root are skipped.
     """
-    full = detect(root, follow_symlinks=follow_symlinks, google_workspace=google_workspace, extra_excludes=extra_excludes)
     # Pass ``root`` so a manifest written with relative keys (post-#777) is
     # re-anchored to the absolute form the rest of this function compares
-    # against. Legacy absolute-keyed manifests pass through unchanged.
+    # against. Legacy absolute-keyed manifests pass through unchanged. Loaded
+    # before detect() so the cached per-file word counts can be handed down and
+    # reused for unchanged files (avoids re-parsing every PDF each run — #1656).
     manifest = load_manifest(manifest_path, root=root)
+    prior_word_counts: dict[str, tuple[float, int]] = {}
+    for key, entry in manifest.items():
+        if isinstance(entry, dict):
+            wc = entry.get("word_count")
+            mt = entry.get("mtime")
+            if isinstance(wc, int) and isinstance(mt, (int, float)):
+                prior_word_counts[key] = (mt, wc)
+
+    full = detect(
+        root,
+        follow_symlinks=follow_symlinks,
+        google_workspace=google_workspace,
+        extra_excludes=extra_excludes,
+        prior_word_counts=prior_word_counts,
+    )
 
     if not manifest:
         # No previous run - treat everything as new
diff --git a/tests/test_detect.py b/tests/test_detect.py
index 458bb6a84..7e71e5566 100644
--- a/tests/test_detect.py
+++ b/tests/test_detect.py
@@ -1538,3 +1538,230 @@ def test_convert_office_file_does_not_rewrite_existing_sidecar(tmp_path, monkeyp
     second = detect_mod.convert_office_file(src, out_dir)
     assert second == first
     assert second.stat().st_mtime_ns == mtime_before
+
+
+def _flat(files: dict) -> list:
+    return [p for bucket in files.values() for p in bucket]
+
+
+def test_convert_office_file_reparses_when_source_changes(tmp_path, monkeypatch):
+    """A source fingerprint in the sidecar header lets convert_office_file skip
+    re-parsing an unchanged .docx (#1656) yet re-parse + rewrite an edited one so
+    the change flows through detect_incremental (#1649)."""
+    calls = {"n": 0}
+
+    def fake_docx(path):
+        calls["n"] += 1
+        return path.read_bytes().decode("latin-1", "ignore")
+
+    monkeypatch.setattr(detect_mod, "docx_to_markdown", fake_docx)
+
+    out_dir = tmp_path / "converted"
+    src = tmp_path / "report.docx"
+    src.write_bytes(b"version one")
+
+    first = detect_mod.convert_office_file(src, out_dir)
+    assert first is not None
+    assert calls["n"] == 1
+    assert "version one" in first.read_text(encoding="utf-8")
+    mtime_before = first.stat().st_mtime_ns
+
+    # Source unchanged: fingerprint matches -> no re-parse, no rewrite (#1656/#1226).
+    second = detect_mod.convert_office_file(src, out_dir)
+    assert second == first
+    assert calls["n"] == 1
+    assert second.stat().st_mtime_ns == mtime_before
+
+    # Source edited: fingerprint differs -> re-parse + rewrite the sidecar (#1649).
+    src.write_bytes(b"version two, now considerably longer")
+    third = detect_mod.convert_office_file(src, out_dir)
+    assert third == first  # deterministic sidecar name is unchanged
+    assert calls["n"] == 2
+    assert "version two" in third.read_text(encoding="utf-8")
+
+
+def test_convert_office_file_reparses_xlsx_on_edit(tmp_path, monkeypatch):
+    """Same source-change detection for .xlsx sources (#1649/#1656)."""
+    calls = {"n": 0}
+
+    def fake_xlsx(path):
+        calls["n"] += 1
+        return "sheet: " + path.read_bytes().decode("latin-1", "ignore")
+
+    monkeypatch.setattr(detect_mod, "xlsx_to_markdown", fake_xlsx)
+
+    out_dir = tmp_path / "converted"
+    src = tmp_path / "budget.xlsx"
+    src.write_bytes(b"q1")
+
+    first = detect_mod.convert_office_file(src, out_dir)
+    assert first is not None and calls["n"] == 1
+
+    detect_mod.convert_office_file(src, out_dir)  # unchanged
+    assert calls["n"] == 1
+
+    src.write_bytes(b"q1 q2 q3 q4")  # edited
+    detect_mod.convert_office_file(src, out_dir)
+    assert calls["n"] == 2
+
+
+def test_convert_office_file_upgrades_legacy_sidecar(tmp_path, monkeypatch):
+    """A sidecar written before source fingerprints existed (no source-md5 in the
+    header) is treated as stale and rewritten once, then reused thereafter."""
+    monkeypatch.setattr(detect_mod, "docx_to_markdown", lambda p: "hello world")
+
+    out_dir = tmp_path / "converted"
+    src = tmp_path / "legacy.docx"
+    src.write_bytes(b"body")
+
+    # First conversion writes a fingerprinted sidecar.
+    sidecar = detect_mod.convert_office_file(src, out_dir)
+    assert sidecar is not None
+    assert detect_mod._read_sidecar_source_fingerprint(sidecar) is not None
+
+    # Simulate an old (pre-fingerprint) sidecar header and confirm a run refreshes it.
+    sidecar.write_text("<!-- converted from legacy.docx -->\n\nhello world", encoding="utf-8")
+    assert detect_mod._read_sidecar_source_fingerprint(sidecar) is None
+    refreshed = detect_mod.convert_office_file(src, out_dir)
+    assert refreshed == sidecar
+    assert detect_mod._read_sidecar_source_fingerprint(sidecar) is not None
+
+
+def test_detect_incremental_reenters_edited_office_file(tmp_path, monkeypatch):
+    """End-to-end: after an Office source is edited, detect_incremental must
+    surface its sidecar as changed so `--update` re-extracts it (#1649). An
+    unchanged source stays out of the changed set (#1226/#1656)."""
+    monkeypatch.setattr(
+        detect_mod,
+        "docx_to_markdown",
+        lambda p: p.read_bytes().decode("latin-1", "ignore"),
+    )
+
+    manifest_path = str(tmp_path / "graphify-out" / "manifest.json")
+    src = tmp_path / "spec.docx"
+    src.write_bytes(b"first draft")
+
+    r1 = detect_incremental(tmp_path, manifest_path)
+    sidecars = r1["files"]["document"]
+    assert sidecars, "office sidecar should be classified as a document"
+    save_manifest(r1["files"], manifest_path, root=tmp_path)
+
+    # Unchanged run: sidecar is not re-queued.
+    r2 = detect_incremental(tmp_path, manifest_path)
+    assert not r2["new_files"]["document"]
+    assert r2["unchanged_files"]["document"] == sidecars
+    save_manifest(r2["files"], manifest_path, root=tmp_path)
+
+    # Edit the source: the sidecar is rewritten and re-queued as changed.
+    src.write_bytes(b"second draft, substantially revised and longer")
+    r3 = detect_incremental(tmp_path, manifest_path)
+    assert r3["new_files"]["document"] == sidecars
+
+
+def test_detect_incremental_pdf_not_reparsed_when_unchanged(tmp_path, monkeypatch):
+    """An unchanged PDF must not be re-parsed for its word count on a later
+    incremental run — the count is cached in the manifest and reused (#1656)."""
+    calls = {"n": 0}
+
+    def fake_pdf(path):
+        calls["n"] += 1
+        return "alpha beta gamma delta"
+
+    monkeypatch.setattr(detect_mod, "extract_pdf_text", fake_pdf)
+
+    manifest_path = str(tmp_path / "graphify-out" / "manifest.json")
+    pdf = tmp_path / "paper.pdf"
+    pdf.write_bytes(b"%PDF-1.4 stub content")
+
+    r1 = detect_incremental(tmp_path, manifest_path)
+    assert any(p.endswith("paper.pdf") for p in _flat(r1["files"]))
+    save_manifest(r1["files"], manifest_path, root=tmp_path)
+    after_run1 = calls["n"]
+    assert after_run1 >= 1  # parsed at least once to seed the cached word count
+
+    r2 = detect_incremental(tmp_path, manifest_path)
+    save_manifest(r2["files"], manifest_path, root=tmp_path)
+    assert calls["n"] == after_run1  # #1656: unchanged PDF is not re-parsed
+    assert not any(p.endswith("paper.pdf") for p in _flat(r2["new_files"]))
+    assert any(p.endswith("paper.pdf") for p in _flat(r2["unchanged_files"]))
+
+
+def test_convert_office_file_pathological_source_filename_fingerprint(tmp_path, monkeypatch):
+    """A source *filename* that itself contains a ``source-md5: <hex>`` substring
+    must not fool the sidecar fingerprint reader. The header interpolates the
+    filename before the real trailing fingerprint, so a naive un-anchored regex
+    would capture the filename's fake hash — the real fingerprint would then
+    never match and the file would re-parse + rewrite (and re-queue) on EVERY
+    run. The reader anchors to the trailing delimiter so only the real
+    fingerprint matches and an unchanged source is parsed exactly once (#1656)."""
+    calls = {"n": 0}
+
+    def fake_docx(path):
+        calls["n"] += 1
+        return path.read_bytes().decode("latin-1", "ignore")
+
+    monkeypatch.setattr(detect_mod, "docx_to_markdown", fake_docx)
+
+    out_dir = tmp_path / "converted"
+    # Filename embeds a decoy "source-md5: deadbeef" substring.
+    src = tmp_path / "report source-md5: deadbeef.docx"
+    src.write_bytes(b"payload one")
+
+    first = detect_mod.convert_office_file(src, out_dir)
+    assert first is not None
+    assert calls["n"] == 1
+    # The stored fingerprint is the real source md5, not the filename's decoy.
+    stored = detect_mod._read_sidecar_source_fingerprint(first)
+    assert stored is not None and stored != "deadbeef"
+
+    # Unchanged source: must NOT re-parse despite the misleading filename.
+    second = detect_mod.convert_office_file(src, out_dir)
+    assert second == first
+    assert calls["n"] == 1
+
+    # An actual edit still re-parses.
+    src.write_bytes(b"payload two, now different")
+    third = detect_mod.convert_office_file(src, out_dir)
+    assert third == first
+    assert calls["n"] == 2
+
+
+def test_save_manifest_semantic_kind_reuses_cached_word_count(tmp_path, monkeypatch):
+    """For a semantic-only manifest ``ast_hash`` is never populated, so keying the
+    word_count cache off it would always miss. save_manifest keys the reuse check
+    off the prior hash of the matching kind, so an unchanged file's cached count
+    is reused rather than recomputed on the next kind='semantic' write (#1656)."""
+    from graphify.detect import load_manifest
+
+    calls = {"n": 0}
+    real_count_words = detect_mod.count_words
+
+    def counting(path):
+        calls["n"] += 1
+        return real_count_words(path)
+
+    monkeypatch.setattr(detect_mod, "count_words", counting)
+
+    manifest_path = str(tmp_path / "graphify-out" / "manifest.json")
+    doc = tmp_path / "notes.md"
+    doc.write_text("alpha beta gamma delta epsilon", encoding="utf-8")
+    files = {"document": [str(doc)]}
+
+    save_manifest(files, manifest_path, kind="semantic", root=tmp_path)
+    assert calls["n"] == 1  # parsed once to seed the cache
+    stored = load_manifest(manifest_path, root=tmp_path)
+    (entry,) = stored.values()
+    assert entry["word_count"] == 5
+    assert entry.get("ast_hash", "") == ""  # semantic-only: ast_hash stays empty
+
+    # Second semantic write, content unchanged: cached count is reused.
+    save_manifest(files, manifest_path, kind="semantic", root=tmp_path)
+    assert calls["n"] == 1
+
+    # A real content change must still recompute.
+    doc.write_text("one two three", encoding="utf-8")
+    save_manifest(files, manifest_path, kind="semantic", root=tmp_path)
+    assert calls["n"] == 2
+    stored = load_manifest(manifest_path, root=tmp_path)
+    (entry,) = stored.values()
+    assert entry["word_count"] == 3