diff --git a/Cargo.lock b/Cargo.lock index fc8e18a..89900f2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,16 @@ version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" +[[package]] +name = "aead" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0" +dependencies = [ + "crypto-common", + "generic-array", +] + [[package]] name = "ahash" version = "0.8.12" @@ -1975,6 +1985,30 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chacha20" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3613f74bd2eac03dad61bd53dbe620703d4371614fe0bc3b9f04dd36fe4e818" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + +[[package]] +name = "chacha20poly1305" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10cd79432192d1c0f4e1a0fef9527696cc039165d729fb41b3f4f4f354c2dc35" +dependencies = [ + "aead", + "chacha20", + "cipher", + "poly1305", + "zeroize", +] + [[package]] name = "chrono" version = "0.4.44" @@ -2001,6 +2035,17 @@ dependencies = [ "unsigned-varint 0.8.0", ] +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", + "zeroize", +] + [[package]] name = "clap" version = "4.5.60" @@ -2283,9 +2328,41 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" dependencies = [ "generic-array", + "rand_core 0.6.4", "typenum", ] +[[package]] +name = 
"crypto_box" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16182b4f39a82ec8a6851155cc4c0cda3065bb1db33651726a29e1951de0f009" +dependencies = [ + "aead", + "chacha20", + "crypto_secretbox", + "curve25519-dalek", + "salsa20", + "subtle", + "zeroize", +] + +[[package]] +name = "crypto_secretbox" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d6cf87adf719ddf43a805e92c6870a531aedda35ff640442cbaf8674e141e1" +dependencies = [ + "aead", + "chacha20", + "cipher", + "generic-array", + "poly1305", + "salsa20", + "subtle", + "zeroize", +] + [[package]] name = "curve25519-dalek" version = "4.1.3" @@ -3219,8 +3296,11 @@ version = "0.3.9" dependencies = [ "anyhow", "base64", + "chacha20poly1305", "chrono", "cid", + "crypto_box", + "curve25519-dalek", "ed25519-dalek", "hex", "multibase", @@ -3254,6 +3334,7 @@ dependencies = [ "cid", "clap", "dirs-next", + "ed25519-dalek", "futures", "gitlawb-core", "hex", @@ -3836,6 +3917,15 @@ dependencies = [ "serde_core", ] +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "generic-array", +] + [[package]] name = "ipnet" version = "2.12.0" @@ -4678,6 +4768,12 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "opaque-debug" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" + [[package]] name = "openssl-probe" version = "0.2.1" @@ -4927,6 +5023,17 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "poly1305" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8159bd90725d2df49889a078b54f4f79e87f1f8a8444194cdca81d38f5393abf" +dependencies = [ + "cpufeatures", + "opaque-debug", + "universal-hash", +] + [[package]] name = "potential_utf" version = "0.1.4" @@ -5642,6 +5749,15 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6518fc26bced4d53678a22d6e423e9d8716377def84545fe328236e3af070e7f" +[[package]] +name = "salsa20" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97a22f5af31f73a954c10289c93e8a50cc23d971e80ee446f1f6f7137a088213" +dependencies = [ + "cipher", +] + [[package]] name = "schannel" version = "0.1.29" @@ -6884,6 +7000,16 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "universal-hash" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea" +dependencies = [ + "crypto-common", + "subtle", +] + [[package]] name = "unsigned-varint" version = "0.7.2" diff --git a/crates/gitlawb-core/Cargo.toml b/crates/gitlawb-core/Cargo.toml index 486a5aa..4468d0c 100644 --- a/crates/gitlawb-core/Cargo.toml +++ b/crates/gitlawb-core/Cargo.toml @@ -23,6 +23,9 @@ chrono = { workspace = true } uuid = { workspace = true } zeroize = { version = "1", features = ["derive"] } pkcs8 = { version = "0.10", features = ["pem", "std"] } +curve25519-dalek = "4" +crypto_box = { version = "0.9", features = ["std", "chacha20"] } +chacha20poly1305 = "0.10" [dev-dependencies] tokio = { workspace = true } diff --git a/crates/gitlawb-core/src/encrypt.rs b/crates/gitlawb-core/src/encrypt.rs new file mode 100644 index 0000000..0336270 --- /dev/null +++ b/crates/gitlawb-core/src/encrypt.rs @@ -0,0 +1,291 @@ +//! Envelope encryption for withheld blobs (Option B). A random content key +//! 
encrypts the blob (XChaCha20-Poly1305); the content key is wrapped to each +//! recipient via an X25519 box keyed from their Ed25519 `did:key`. The node +//! seals with public keys only; readers open with their own private key. + +use crate::identity::Keypair; +use anyhow::{Context, Result}; +use ed25519_dalek::VerifyingKey; + +/// X25519 public key (Montgomery u) for an Ed25519 verifying key. +fn x25519_public(vk: &VerifyingKey) -> Result<[u8; 32]> { + use curve25519_dalek::edwards::CompressedEdwardsY; + let edwards = CompressedEdwardsY::from_slice(vk.as_bytes()) + .ok() + .and_then(|c| c.decompress()) + .context("verifying key is not a valid edwards point")?; + Ok(edwards.to_montgomery().to_bytes()) +} + +/// X25519 secret scalar for an Ed25519 seed (SHA-512 of seed, lower 32, clamped). +fn x25519_secret_from_seed(seed: &[u8; 32]) -> [u8; 32] { + use sha2::{Digest, Sha512}; + let h = Sha512::digest(seed); + let mut s = [0u8; 32]; + s.copy_from_slice(&h[..32]); + s[0] &= 248; + s[31] &= 127; + s[31] |= 64; + s +} + +use base64::{engine::general_purpose::STANDARD as B64, Engine}; +use chacha20poly1305::{ + aead::{Aead, KeyInit}, + XChaCha20Poly1305, XNonce, +}; +use crypto_box::{ + aead::{AeadCore, OsRng}, + ChaChaBox, PublicKey as XPublic, SecretKey as XSecret, +}; +use rand::RngCore; +use serde::{Deserialize, Serialize}; + +const MAGIC: &[u8] = b"GLENC"; +const VERSION: u8 = 2; + +#[derive(Serialize, Deserialize)] +struct Recipient { + eph: String, // base64 ephemeral x25519 pubkey (32B) + nonce: String, // base64 box nonce (24B) + wrap: String, // base64 wrapped content key +} + +#[derive(Serialize, Deserialize)] +struct Header { + alg: String, + nonce: String, // base64 body nonce (24B) + recipients: Vec, +} + +/// Encrypt `plaintext` so any of `recipients` (Ed25519 keys) can decrypt. 
+///
+/// Envelope layout: `MAGIC | version (1B) | header_len (4B LE) | header JSON | body`.
+/// A fresh random content key encrypts the body; that key is wrapped once per
+/// recipient under a fresh ephemeral X25519 key, so the header holds only the
+/// ephemeral public key, the box nonce and the wrapped key for each entry.
+///
+/// # Errors
+/// Fails if `recipients` is empty, if a verifying key does not decompress to a
+/// valid Edwards point, if an AEAD operation fails, or if the encoded header
+/// does not fit the 32-bit length prefix.
+pub fn seal_blob(plaintext: &[u8], recipients: &[VerifyingKey]) -> Result<Vec<u8>> {
+    if recipients.is_empty() {
+        return Err(anyhow::anyhow!("seal_blob: no recipients"));
+    }
+    // Hold the content key in a zeroizing wrapper so it is wiped on every exit
+    // path, including the early `?` returns below.
+    let mut content_key = zeroize::Zeroizing::new([0u8; 32]);
+    OsRng.fill_bytes(&mut content_key[..]);
+    let body_cipher = XChaCha20Poly1305::new_from_slice(&content_key[..])
+        .map_err(|e| anyhow::anyhow!("content key: {e}"))?;
+    let mut body_nonce = [0u8; 24];
+    OsRng.fill_bytes(&mut body_nonce);
+    let body = body_cipher
+        .encrypt(XNonce::from_slice(&body_nonce), plaintext)
+        .map_err(|e| anyhow::anyhow!("body encrypt: {e}"))?;
+
+    // Wrap the content key to each recipient with its own ephemeral key pair
+    // and nonce.
+    let mut wrapped = Vec::with_capacity(recipients.len());
+    for vk in recipients {
+        let recip_x = XPublic::from(x25519_public(vk)?);
+        let eph = XSecret::generate(&mut OsRng);
+        let abox = ChaChaBox::new(&recip_x, &eph);
+        let nonce = ChaChaBox::generate_nonce(&mut OsRng);
+        let ct = abox
+            .encrypt(&nonce, &content_key[..])
+            .map_err(|e| anyhow::anyhow!("wrap: {e}"))?;
+        wrapped.push(Recipient {
+            eph: B64.encode(eph.public_key().as_bytes()),
+            nonce: B64.encode(nonce),
+            wrap: B64.encode(ct),
+        });
+    }
+
+    let header = Header {
+        alg: "xchacha20poly1305".into(),
+        nonce: B64.encode(body_nonce),
+        recipients: wrapped,
+    };
+    let header_json = serde_json::to_vec(&header).context("encode header")?;
+    // The length prefix is a u32: reject an oversized header instead of
+    // truncating the length with an `as` cast and emitting a corrupt envelope.
+    let header_len = u32::try_from(header_json.len()).context("header too large")?;
+
+    let mut out = Vec::with_capacity(MAGIC.len() + 1 + 4 + header_json.len() + body.len());
+    out.extend_from_slice(MAGIC);
+    out.push(VERSION);
+    out.extend_from_slice(&header_len.to_le_bytes());
+    out.extend_from_slice(&header_json);
+    out.extend_from_slice(&body);
+    Ok(out)
+}
+
+/// Decrypt an envelope with `keypair`. Errors if not a recipient or on auth fail.
+pub fn open_blob(envelope: &[u8], keypair: &Keypair) -> Result> { + let mut p = 0; + if envelope.len() < MAGIC.len() + 1 + 4 || &envelope[..MAGIC.len()] != MAGIC { + return Err(anyhow::anyhow!("bad envelope magic")); + } + p += MAGIC.len(); + if envelope[p] != VERSION { + return Err(anyhow::anyhow!("unsupported envelope version")); + } + p += 1; + let hlen = u32::from_le_bytes(envelope[p..p + 4].try_into().unwrap()) as usize; + p += 4; + let header: Header = + serde_json::from_slice(envelope.get(p..p + hlen).context("truncated header")?) + .context("decode header")?; + let body = &envelope[p + hlen..]; + + let my_x = XSecret::from(x25519_secret_from_seed(&keypair.seed_bytes())); + + // Identities are blinded: no entry says which recipient it belongs to, so + // try each one. The ChaChaBox AEAD tag authenticates, so exactly the + // reader's own entry unwraps; every other entry fails cleanly. + let mut content_key: Option> = None; + for entry in &header.recipients { + let eph = match B64 + .decode(&entry.eph) + .ok() + .and_then(|b| <[u8; 32]>::try_from(b.as_slice()).ok()) + { + Some(b) => XPublic::from(b), + None => continue, + }; + // from_slice panics on a wrong length, and the envelope is attacker + // controlled, so validate the 24-byte box nonce before using it. 
+ let nonce = match B64 + .decode(&entry.nonce) + .ok() + .and_then(|n| <[u8; 24]>::try_from(n.as_slice()).ok()) + { + Some(n) => n, + None => continue, + }; + let wrap = match B64.decode(&entry.wrap) { + Ok(w) => w, + Err(_) => continue, + }; + let abox = ChaChaBox::new(&eph, &my_x); + if let Ok(ck) = abox.decrypt( + crypto_box::aead::generic_array::GenericArray::from_slice(&nonce), + wrap.as_slice(), + ) { + content_key = Some(ck); + break; + } + } + let content_key = content_key.context("not a recipient of this envelope")?; + + let body_cipher = XChaCha20Poly1305::new_from_slice(&content_key) + .map_err(|e| anyhow::anyhow!("content key: {e}"))?; + let body_nonce = B64 + .decode(&header.nonce) + .ok() + .and_then(|n| <[u8; 24]>::try_from(n.as_slice()).ok()) + .context("invalid body nonce")?; + body_cipher + .decrypt(XNonce::from_slice(&body_nonce), body) + .map_err(|_| anyhow::anyhow!("body decrypt failed")) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::identity::Keypair; + + #[test] + fn ed25519_to_x25519_keypair_agrees() { + // The X25519 public derived from the Ed25519 public must equal the + // X25519 public of the X25519 secret derived from the same seed. 
+ let kp = Keypair::generate(); + let seed = kp.seed_bytes(); + let xpub_from_public = x25519_public(&kp.verifying_key()).unwrap(); + let xsec = x25519_secret_from_seed(&seed); + let xpub_from_secret = crypto_box::SecretKey::from(xsec).public_key().to_bytes(); + assert_eq!(xpub_from_public, xpub_from_secret); + } + + #[test] + fn seal_open_round_trip_for_recipients() { + let owner = Keypair::generate(); + let reader_a = Keypair::generate(); + let reader_b = Keypair::generate(); + let msg = b"private blob contents"; + + let env = seal_blob(msg, &[owner.verifying_key(), reader_a.verifying_key()]).unwrap(); + + assert_eq!(open_blob(&env, &owner).unwrap(), msg); + assert_eq!(open_blob(&env, &reader_a).unwrap(), msg); + assert!( + open_blob(&env, &reader_b).is_err(), + "non-recipient must fail" + ); + } + + #[test] + fn tampered_envelope_fails() { + let owner = Keypair::generate(); + let mut env = seal_blob(b"hi", &[owner.verifying_key()]).unwrap(); + let last = env.len() - 1; + env[last] ^= 0x01; + assert!(open_blob(&env, &owner).is_err()); + } + + #[test] + fn v2_header_contains_no_recipient_pubkey() { + // The blinded envelope header must not carry any recipient's public key. 
+ let reader = Keypair::generate(); + let env = seal_blob(b"private blob contents", &[reader.verifying_key()]).unwrap(); + + // Slice out the header bytes using the envelope framing: + // MAGIC | version(1B) | header_len(4B LE) | header_json | body + let mut p = MAGIC.len() + 1; // skip MAGIC + version byte + let hlen = u32::from_le_bytes(env[p..p + 4].try_into().unwrap()) as usize; + p += 4; + let header = &env[p..p + hlen]; + let header_str = String::from_utf8_lossy(header); + + let pubkey_b64 = B64.encode(reader.verifying_key().as_bytes()); + assert!( + !header_str.contains(&pubkey_b64), + "recipient public key must not appear in the blinded header" + ); + } + + #[test] + fn v1_envelope_is_rejected() { + let reader = Keypair::generate(); + let mut env = seal_blob(b"hi", &[reader.verifying_key()]).unwrap(); + // Flip the version byte (immediately after MAGIC) from 2 to 1. + env[MAGIC.len()] = 1; + let err = open_blob(&env, &reader).unwrap_err(); + assert!( + err.to_string().contains("unsupported envelope version"), + "expected version-rejection error, got: {err}" + ); + } + + #[test] + fn malformed_nonce_returns_err_not_panic() { + // from_slice panics on wrong-length input; a crafted envelope on the + // public recovery path must surface an error, never panic. + let reader = Keypair::generate(); + let env = seal_blob(b"private blob contents", &[reader.verifying_key()]).unwrap(); + + // Split the envelope framing into header JSON and body. 
+ let mut p = MAGIC.len() + 1; + let hlen = u32::from_le_bytes(env[p..p + 4].try_into().unwrap()) as usize; + p += 4; + let header_bytes = &env[p..p + hlen]; + let body = &env[p + hlen..]; + + let reframe = |header: &serde_json::Value| -> Vec { + let hj = serde_json::to_vec(header).unwrap(); + let mut out = Vec::new(); + out.extend_from_slice(MAGIC); + out.push(VERSION); + out.extend_from_slice(&(hj.len() as u32).to_le_bytes()); + out.extend_from_slice(&hj); + out.extend_from_slice(body); + out + }; + let bad_nonce = serde_json::Value::String(B64.encode([0u8; 12])); + + // Corrupted per-recipient nonce: entry is skipped, no match. + let mut header: serde_json::Value = serde_json::from_slice(header_bytes).unwrap(); + header["recipients"][0]["nonce"] = bad_nonce.clone(); + assert!(open_blob(&reframe(&header), &reader).is_err()); + + // Corrupted body nonce: unwrap succeeds, body nonce is rejected. + let mut header: serde_json::Value = serde_json::from_slice(header_bytes).unwrap(); + header["nonce"] = bad_nonce; + assert!(open_blob(&reframe(&header), &reader).is_err()); + } +} diff --git a/crates/gitlawb-core/src/identity.rs b/crates/gitlawb-core/src/identity.rs index 96d50b9..9d3fea1 100644 --- a/crates/gitlawb-core/src/identity.rs +++ b/crates/gitlawb-core/src/identity.rs @@ -52,6 +52,12 @@ impl Keypair { URL_SAFE_NO_PAD.encode(sig.to_bytes()) } + /// The raw 32-byte Ed25519 seed. Used to derive the X25519 secret for + /// envelope decryption (see `crate::encrypt`). + pub fn seed_bytes(&self) -> [u8; 32] { + self.signing_key.to_bytes() + } + /// Export the signing key as raw 32-byte seed (wrapped in Zeroizing). 
pub fn to_seed(&self) -> Zeroizing<[u8; 32]> { Zeroizing::new(self.signing_key.to_bytes()) diff --git a/crates/gitlawb-core/src/lib.rs b/crates/gitlawb-core/src/lib.rs index a608be1..a9e91f6 100644 --- a/crates/gitlawb-core/src/lib.rs +++ b/crates/gitlawb-core/src/lib.rs @@ -1,6 +1,7 @@ pub mod cert; pub mod cid; pub mod did; +pub mod encrypt; pub mod error; pub mod http_sig; pub mod identity; diff --git a/crates/gitlawb-node/Cargo.toml b/crates/gitlawb-node/Cargo.toml index 9cc3ba1..61f63b0 100644 --- a/crates/gitlawb-node/Cargo.toml +++ b/crates/gitlawb-node/Cargo.toml @@ -11,6 +11,7 @@ path = "src/main.rs" [dependencies] gitlawb-core = { path = "../gitlawb-core" } +ed25519-dalek = { workspace = true } tokio = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/crates/gitlawb-node/src/api/encrypted.rs b/crates/gitlawb-node/src/api/encrypted.rs new file mode 100644 index 0000000..d9fa52a --- /dev/null +++ b/crates/gitlawb-node/src/api/encrypted.rs @@ -0,0 +1,123 @@ +//! Authenticated discovery + fetch for encrypted withheld blobs (Option B1). + +use axum::extract::{Extension, Path, State}; +use axum::Json; + +use crate::auth::AuthenticatedDid; +use crate::error::{AppError, Result}; +use crate::state::AppState; +use crate::visibility::{visibility_check, Decision}; + +/// GET /api/v1/repos/{owner}/{repo}/encrypted-blobs +/// Returns [{oid, cid}] for every encrypted blob in the repo, to any caller who +/// can read the repo. Not recipient-scoped: recipient identities are not stored, +/// so access control here is repo readability and decryption is gated by the +/// envelope crypto (only a real recipient can open an envelope). +pub async fn list_encrypted_blobs( + State(state): State, + auth: Option>, + Path((owner, repo)): Path<(String, String)>, +) -> Result> { + let record = state + .db + .get_repo(&owner, &repo) + .await? 
+        .ok_or_else(|| AppError::RepoNotFound(format!("{owner}/{repo}")))?;
+    let caller = auth.as_ref().map(|e| e.0 .0.as_str());
+    let rules = state.db.list_visibility_rules(&record.id).await?;
+    if visibility_check(&rules, record.is_public, &record.owner_did, caller, "/") == Decision::Deny
+    {
+        return Err(AppError::RepoNotFound(format!("{owner}/{repo}")));
+    }
+    let rows = state.db.list_all_encrypted_blobs(&record.id).await?;
+    let blobs: Vec<_> = rows
+        .into_iter()
+        .map(|(oid, cid)| serde_json::json!({ "oid": oid, "cid": cid }))
+        .collect();
+    Ok(Json(serde_json::json!({ "blobs": blobs })))
+}
+
+/// GET /api/v1/repos/{owner}/{repo}/encrypted-blob/{oid}
+/// Returns raw envelope bytes to callers who can read the repo; the envelope
+/// crypto still ensures only true recipients can decrypt.
+///
+/// A caller who cannot read the repo root gets the same `RepoNotFound` payload
+/// as for a repo that does not exist. An OID with no recorded CID is reported
+/// as not found, and a failed IPFS fetch is mapped to `AppError::Git`.
+pub async fn get_encrypted_blob(
+    State(state): State<AppState>,
+    auth: Option<Extension<AuthenticatedDid>>,
+    Path((owner, repo, oid)): Path<(String, String, String)>,
+) -> Result<Vec<u8>> {
+    let record = state
+        .db
+        .get_repo(&owner, &repo)
+        .await?
+        .ok_or_else(|| AppError::RepoNotFound(format!("{owner}/{repo}")))?;
+    let caller = auth.as_ref().map(|e| e.0 .0.as_str());
+    let rules = state.db.list_visibility_rules(&record.id).await?;
+    if visibility_check(&rules, record.is_public, &record.owner_did, caller, "/") == Decision::Deny
+    {
+        // Use the same payload as the missing-repo branch above: a different
+        // message here would let an unauthorized caller tell "repo exists but
+        // is private" apart from "repo does not exist".
+        return Err(AppError::RepoNotFound(format!("{owner}/{repo}")));
+    }
+    let cid = state
+        .db
+        .encrypted_blob_cid(&record.id, &oid)
+        .await?
+        .ok_or_else(|| AppError::RepoNotFound(format!("{owner}/{repo}/{oid}")))?;
+    let bytes = crate::ipfs_pin::cat(&state.config.ipfs_api, &cid)
+        .await
+        .map_err(|e| AppError::Git(e.to_string()))?;
+    Ok(bytes)
+}
+
+/// GET /api/v1/repos/{owner}/{repo}/encrypted-blobs/replicate
+/// Returns [{oid, cid}] for every encrypted blob in the repo, for peer-mirror
+/// replication (Option B2).
Gated by repo readability, like discovery, so a +/// non-readable repo does not expose its blob index; for the intended case (a +/// public repo with withheld subtrees) the public root keeps this open to peers. +/// Recipient identities are deliberately withheld: the v2 envelopes no longer +/// carry recipient public keys, so peers must not learn the reader set either. A +/// mirror detects a re-seal by the CID changing (the OID is stable across +/// re-seals). Ciphertext metadata only, never plaintext. +pub async fn replicate_encrypted_blobs( + State(state): State, + auth: Option>, + Path((owner, repo)): Path<(String, String)>, +) -> Result> { + let record = state + .db + .get_repo(&owner, &repo) + .await? + .ok_or_else(|| AppError::RepoNotFound(format!("{owner}/{repo}")))?; + let caller = auth.as_ref().map(|e| e.0 .0.as_str()); + let rules = state.db.list_visibility_rules(&record.id).await?; + if visibility_check(&rules, record.is_public, &record.owner_did, caller, "/") == Decision::Deny + { + return Err(AppError::RepoNotFound(format!("{owner}/{repo}"))); + } + let rows = state.db.list_all_encrypted_blobs(&record.id).await?; + let blobs: Vec<_> = rows + .into_iter() + .map(|(oid, cid)| replicate_blob_json(oid, cid)) + .collect(); + Ok(Json(serde_json::json!({ "blobs": blobs }))) +} + +/// Serialize one blob for the replication wire. Recipient identities are +/// intentionally absent so a mirror never learns the reader set. 
+fn replicate_blob_json(oid: String, cid: String) -> serde_json::Value { + serde_json::json!({ "oid": oid, "cid": cid }) +} + +#[cfg(test)] +mod tests { + use super::replicate_blob_json; + + #[test] + fn replicate_blob_json_omits_recipients() { + let v = replicate_blob_json("oid1".into(), "cidA".into()); + assert_eq!(v["oid"], "oid1"); + assert_eq!(v["cid"], "cidA"); + assert!( + v.get("recipients").is_none(), + "replication wire must not carry recipient identities" + ); + } +} diff --git a/crates/gitlawb-node/src/api/mod.rs b/crates/gitlawb-node/src/api/mod.rs index 2595c48..7f01365 100644 --- a/crates/gitlawb-node/src/api/mod.rs +++ b/crates/gitlawb-node/src/api/mod.rs @@ -3,6 +3,7 @@ pub mod arweave; pub mod bounties; pub mod certs; pub mod changelog; +pub mod encrypted; pub mod events; pub mod ipfs; pub mod issues; diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 0993d4b..d23782f 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -12,7 +12,7 @@ use uuid::Uuid; use crate::cert; use crate::error::{AppError, Result}; -use crate::git::{smart_http, store}; +use crate::git::{smart_http, store, visibility_pack}; use crate::state::AppState; use crate::visibility::{visibility_check, Decision}; use crate::webhooks; @@ -330,6 +330,8 @@ pub async fn git_info_refs( if service == "git-upload-pack" { let rules = state.db.list_visibility_rules(&record.id).await?; let caller = auth.as_ref().map(|e| e.0 .0.as_str()); + // Subtree (mode B) rules do not gate the advertisement: refs expose commit + // tips only, and blob withholding happens in the upload-pack pack build. 
if visibility_check(&rules, record.is_public, &record.owner_did, caller, "/") == Decision::Deny { @@ -392,18 +394,45 @@ pub async fn git_upload_pack( .await .map_err(|e| AppError::Git(e.to_string()))?; let body_len = body.len(); - let resp = smart_http::upload_pack(&disk_path, body) + + // withheld_blob_oids walks every ref with blocking `git ls-tree`; keep that + // off the async worker thread. + let withheld = { + let path = disk_path.clone(); + let rules = rules.clone(); + let owner_did = record.owner_did.clone(); + let caller_owned = caller.map(str::to_string); + let is_public = record.is_public; + tokio::task::spawn_blocking(move || { + visibility_pack::withheld_blob_oids( + &path, + &rules, + is_public, + &owner_did, + caller_owned.as_deref(), + ) + }) .await - .map_err(|e| { - let msg = e.to_string(); - if msg.contains("bad line length") || msg.contains("protocol error") { - tracing::warn!(repo = %name, err = %msg, "git-upload-pack: bad client request"); - AppError::BadRequest(msg) - } else { - tracing::error!(repo = %name, err = %msg, "git-upload-pack failed"); - AppError::Git(msg) - } - })?; + .map_err(|e| AppError::Git(e.to_string()))? + .map_err(|e| AppError::Git(e.to_string()))? 
+ }; + + let resp = if withheld.is_empty() { + smart_http::upload_pack(&disk_path, body).await + } else { + tracing::info!(repo = %name, caller = ?caller, withheld = withheld.len(), "serving filtered pack"); + smart_http::upload_pack_excluding(&disk_path, body, &withheld).await + } + .map_err(|e| { + let msg = e.to_string(); + if msg.contains("bad line length") || msg.contains("protocol error") { + tracing::warn!(repo = %name, err = %msg, "git-upload-pack: bad client request"); + AppError::BadRequest(msg) + } else { + tracing::error!(repo = %name, err = %msg, "git-upload-pack failed"); + AppError::Git(msg) + } + })?; crate::metrics::record_fetch(&format!("{owner}/{name}")); crate::metrics::observe_pack_size(body_len as f64); Ok(resp) @@ -579,20 +608,142 @@ pub async fn git_receive_pack( } } - // Pin new git objects to the local IPFS node (no-op if ipfs_api is empty) - { + // Replication enforcement (Phase 2): decide once per push whether the public + // may read this repo at all and, if so, which blob OIDs must not leave the + // node. `withheld == None` means replicate nothing (private / mode A / + // undetermined): skip every pin so even commit and tree objects (which + // withheld_blob_oids never lists) stay local. `announce` gates the + // network-facing announcements. Fail closed: a private or undetermined repo + // never leaks. + let rules_opt = state.db.list_visibility_rules(&record.id).await.ok(); + let announce = match &rules_opt { + Some(rules) => { + visibility_check(rules, record.is_public, &record.owner_did, None, "/") + == Decision::Allow + } + None => false, + }; + let withheld: Option> = if !announce { + None + } else { + match &rules_opt { + Some(rules) if rules.is_empty() => Some(std::collections::HashSet::new()), + // withheld_blob_oids walks every ref with blocking `git ls-tree`; + // keep that off the async worker thread. 
+ Some(rules) => { + let path = disk_path.clone(); + let rules = rules.clone(); + let owner_did = record.owner_did.clone(); + let is_public = record.is_public; + tokio::task::spawn_blocking(move || { + crate::git::visibility_pack::withheld_blob_oids( + &path, &rules, is_public, &owner_did, None, + ) + }) + .await + .map_err(|e| { + tracing::warn!(err = %e, "withheld_blob_oids task panicked; skipping replication for this push") + }) + .ok() + .and_then(|r| { + r.map_err(|e| { + tracing::warn!(err = %e, "withheld_blob_oids failed; skipping replication for this push") + }) + .ok() + }) + } + None => None, + } + }; + + // Pin new git objects to the local IPFS node (no-op if ipfs_api is empty). + // Skipped entirely when the public cannot read the repo (withheld == None). + if let Some(withheld_ipfs) = withheld.clone() { let ipfs_api = state.config.ipfs_api.clone(); let repo_path_clone = disk_path.clone(); let db_clone = state.db.clone(); + let rules_for_enc = rules_opt.clone(); + let repo_id = record.id.clone(); + let owner_did = record.owner_did.clone(); + let is_public = record.is_public; + let irys_url = state.config.irys_url.clone(); + let http_client = std::sync::Arc::clone(&state.http_client); + let node_did_str = state.node_did.to_string(); + let node_seed = state.node_keypair.seed_bytes(); + let repo_name = record.name.clone(); tokio::spawn(async move { - let pinned = - crate::ipfs_pin::pin_new_objects(&ipfs_api, &repo_path_clone, &db_clone).await; + let pinned = crate::ipfs_pin::pin_new_objects( + &ipfs_api, + &repo_path_clone, + &db_clone, + &withheld_ipfs, + ) + .await; if !pinned.is_empty() { tracing::info!(count = pinned.len(), "pinned git objects to IPFS"); for (sha, cid) in &pinned { tracing::info!(sha = %sha, %cid, "pinned"); } } + + // Option B1: encrypt-then-pin the withheld blobs so authorized + // readers can recover them when the origin cannot serve them. 
+ if let Some(rules) = rules_for_enc.filter(|r| !r.is_empty()) { + let p = repo_path_clone.clone(); + let owner = owner_did.clone(); + let recip = tokio::task::spawn_blocking(move || { + crate::git::visibility_pack::withheld_blob_recipients( + &p, &rules, is_public, &owner, + ) + }) + .await; + if let Ok(Ok(recipients)) = recip { + let delta = crate::encrypted_pin::encrypt_and_pin( + &ipfs_api, + &repo_path_clone, + &db_clone, + &repo_id, + &node_seed, + &recipients, + ) + .await; + + // Option B3: anchor a per-push manifest of the blobs sealed + // this push to Arweave, so the oid->cid index survives total + // node loss. Best-effort; never fails the push. + if !delta.is_empty() && !irys_url.is_empty() { + let owner_short = owner_did.split(':').next_back().unwrap_or(&owner_did); + let repo_slug = format!("{owner_short}/{repo_name}"); + let ts = chrono::Utc::now().to_rfc3339(); + let manifest = crate::arweave::EncryptedManifest { + repo: &repo_slug, + owner_did: &owner_did, + node_did: &node_did_str, + timestamp: &ts, + blobs: &delta, + }; + match crate::arweave::anchor_encrypted_manifest( + &http_client, + &irys_url, + &manifest, + ) + .await + { + Ok(tx) if !tx.is_empty() => tracing::info!( + repo = %repo_slug, + tx_id = %tx, + "anchored encrypted manifest to Arweave" + ), + Ok(_) => {} + Err(e) => tracing::warn!( + repo = %repo_slug, + err = %e, + "encrypted manifest anchor failed" + ), + } + } + } + } }); } @@ -625,15 +776,22 @@ pub async fn git_receive_pack( let owner_did_for_arweave = record.owner_did.clone(); let self_public_url = state.config.public_url.clone(); let node_keypair = Arc::clone(&state.node_keypair); + let withheld_pinata = withheld; tokio::spawn(async move { - let pinned = crate::pinata::pin_new_objects( - &http_client, - &pinata_upload_url, - &pinata_jwt, - &repo_path_clone, - &db_clone, - ) - .await; + let pinned = match &withheld_pinata { + Some(withheld) => { + crate::pinata::pin_new_objects( + &http_client, + &pinata_upload_url, + 
&pinata_jwt, + &repo_path_clone, + &db_clone, + withheld, + ) + .await + } + None => Vec::new(), + }; if !pinned.is_empty() { tracing::info!(count = pinned.len(), "pinned git objects to Pinata"); @@ -652,77 +810,82 @@ pub async fn git_receive_pack( .await; } - if let Some(p2p) = &p2p_handle { - p2p.publish_ref_update(crate::p2p::RefUpdateEvent { - node_did: node_did_str.clone(), - pusher_did: pusher_did_clone.clone(), - repo: repo_slug.clone(), - ref_name: ref_name.clone(), - old_sha: "".to_string(), - new_sha: new_sha.clone(), - timestamp: chrono::Utc::now().to_rfc3339(), - cert_id: None, - cid: cid.map(|s| s.to_string()), - }) - .await; + if announce { + if let Some(p2p) = &p2p_handle { + p2p.publish_ref_update(crate::p2p::RefUpdateEvent { + node_did: node_did_str.clone(), + pusher_did: pusher_did_clone.clone(), + repo: repo_slug.clone(), + ref_name: ref_name.clone(), + old_sha: "".to_string(), + new_sha: new_sha.clone(), + timestamp: chrono::Utc::now().to_rfc3339(), + cert_id: None, + cid: cid.map(|s| s.to_string()), + }) + .await; + } } } // HTTP peer notification — notify all known peers to pull from us. // This is the reliable fallback when Gossipsub p2p is not yet connected. - if let Ok(peers) = db_for_peers.list_peers().await { - for peer in peers { - if peer.http_url.is_empty() { - continue; - } - let peer_url = peer.http_url.trim_end_matches('/'); - if let Some(self_url) = self_public_url.as_deref() { - if peer_url == self_url.trim_end_matches('/') { + // Suppressed for repos the public cannot read. 
+ if announce { + if let Ok(peers) = db_for_peers.list_peers().await { + for peer in peers { + if peer.http_url.is_empty() { continue; } - } - let path = "/api/v1/sync/notify"; - let notify_url = format!("{peer_url}{path}"); - let body = serde_json::json!({ - "repo": repo_slug.clone(), - "ref_name": ref_updates_clone.first().map(|(r, _)| r).unwrap_or(&String::new()), - "new_sha": ref_updates_clone.first().map(|(_, s)| s).unwrap_or(&String::new()), - "node_did": node_did_str.clone(), - "pusher_did": pusher_did_clone.clone(), - "old_sha": "0000000000000000000000000000000000000000", - "timestamp": chrono::Utc::now().to_rfc3339(), - }); - let body_bytes = match serde_json::to_vec(&body) { - Ok(bytes) => bytes, - Err(e) => { - tracing::warn!(peer = %peer.did, err = %e, "failed to serialize peer sync notify"); - continue; - } - }; - let signed = gitlawb_core::http_sig::sign_request( - node_keypair.as_ref(), - "POST", - path, - &body_bytes, - ); - match http_client - .post(¬ify_url) - .header("Content-Type", "application/json") - .header("Content-Digest", signed.content_digest) - .header("Signature-Input", signed.signature_input) - .header("Signature", signed.signature) - .body(body_bytes) - .send() - .await - { - Ok(r) if r.status().is_success() => { - tracing::info!(peer = %peer.did, repo = %repo_slug, "notified peer to sync") - } - Ok(r) => { - tracing::warn!(peer = %peer.did, status = %r.status(), "peer sync notify returned error") + let peer_url = peer.http_url.trim_end_matches('/'); + if let Some(self_url) = self_public_url.as_deref() { + if peer_url == self_url.trim_end_matches('/') { + continue; + } } - Err(e) => { - tracing::warn!(peer = %peer.did, err = %e, "failed to notify peer") + let path = "/api/v1/sync/notify"; + let notify_url = format!("{peer_url}{path}"); + let body = serde_json::json!({ + "repo": repo_slug.clone(), + "ref_name": ref_updates_clone.first().map(|(r, _)| r).unwrap_or(&String::new()), + "new_sha": ref_updates_clone.first().map(|(_, s)| 
s).unwrap_or(&String::new()), + "node_did": node_did_str.clone(), + "pusher_did": pusher_did_clone.clone(), + "old_sha": "0000000000000000000000000000000000000000", + "timestamp": chrono::Utc::now().to_rfc3339(), + }); + let body_bytes = match serde_json::to_vec(&body) { + Ok(bytes) => bytes, + Err(e) => { + tracing::warn!(peer = %peer.did, err = %e, "failed to serialize peer sync notify"); + continue; + } + }; + let signed = gitlawb_core::http_sig::sign_request( + node_keypair.as_ref(), + "POST", + path, + &body_bytes, + ); + match http_client + .post(¬ify_url) + .header("Content-Type", "application/json") + .header("Content-Digest", signed.content_digest) + .header("Signature-Input", signed.signature_input) + .header("Signature", signed.signature) + .body(body_bytes) + .send() + .await + { + Ok(r) if r.status().is_success() => { + tracing::info!(peer = %peer.did, repo = %repo_slug, "notified peer to sync") + } + Ok(r) => { + tracing::warn!(peer = %peer.did, status = %r.status(), "peer sync notify returned error") + } + Err(e) => { + tracing::warn!(peer = %peer.did, err = %e, "failed to notify peer") + } } } } @@ -746,8 +909,9 @@ pub async fn git_receive_pack( timestamp: now_ts.clone(), }); - // Arweave permanent anchoring — fire for each ref update - if !irys_url.is_empty() { + // Arweave permanent anchoring — fire for each ref update. + // Suppressed for repos the public cannot read (public permanent ledger). 
+ if announce && !irys_url.is_empty() { for (ref_name, new_sha) in &ref_updates_clone { let cid = cid_map.get(new_sha).cloned(); let anchor = crate::arweave::RefAnchor { diff --git a/crates/gitlawb-node/src/api/visibility.rs b/crates/gitlawb-node/src/api/visibility.rs index 531c724..6665b9e 100644 --- a/crates/gitlawb-node/src/api/visibility.rs +++ b/crates/gitlawb-node/src/api/visibility.rs @@ -185,6 +185,48 @@ pub async fn list_visibility( }))) } +/// GET /api/v1/repos/{owner}/{repo}/withheld-paths +/// +/// Returns the path globs the (optionally authenticated) caller is denied +/// (`withheld`) plus any more-specific globs that are allowed underneath a +/// denied one (`reinclude`), so a clean-clone client can sparse-exclude the +/// denied subtrees while re-including the allowed nested paths. Unlike +/// `list_visibility` this is not owner-gated and never exposes reader_dids. +pub async fn withheld_paths( + State(state): State, + auth: Option>, + Path((owner, repo)): Path<(String, String)>, +) -> Result> { + let record = state + .db + .get_repo(&owner, &repo) + .await? + .ok_or_else(|| AppError::RepoNotFound(format!("{owner}/{repo}")))?; + + let rules = state.db.list_visibility_rules(&record.id).await?; + let caller = auth.as_ref().map(|e| e.0 .0.as_str()); + + // Whole-repo read gate: a caller who cannot read "/" gets repo-not-found, + // matching the git read endpoints, so this never discloses a private repo's + // existence or its path layout to an unauthorized caller. 
+ if crate::visibility::visibility_check(&rules, record.is_public, &record.owner_did, caller, "/") + == crate::visibility::Decision::Deny + { + return Err(AppError::RepoNotFound(format!("{owner}/{repo}"))); + } + + let withheld = + crate::visibility::withheld_globs(&rules, record.is_public, &record.owner_did, caller); + let reinclude = + crate::visibility::reincluded_globs(&rules, record.is_public, &record.owner_did, caller); + + Ok(Json(serde_json::json!({ + "repo": format!("{owner}/{repo}"), + "withheld": withheld, + "reinclude": reinclude, + }))) +} + #[cfg(test)] mod tests { use super::validate_path_glob; diff --git a/crates/gitlawb-node/src/arweave.rs b/crates/gitlawb-node/src/arweave.rs index a88f31f..43f35a0 100644 --- a/crates/gitlawb-node/src/arweave.rs +++ b/crates/gitlawb-node/src/arweave.rs @@ -103,6 +103,108 @@ pub async fn anchor_ref_update( Ok(tx_id) } +/// A per-push manifest of the blobs encrypted this push (Option B3). The +/// `blobs` slice is `(oid, cid)` tuples. Anchored directly to Arweave as its JSON +/// body so the discovery index survives total node loss. Recipient identities are +/// never part of the manifest. +pub struct EncryptedManifest<'a> { + pub repo: &'a str, + pub owner_did: &'a str, + pub node_did: &'a str, + pub timestamp: &'a str, + pub blobs: &'a [(String, String)], +} + +/// Anchor a per-push encrypted-blob manifest to Arweave via Irys. The manifest +/// JSON body is the payload (not a CID pointer to IPFS), so the index is +/// permanent and self-contained. Recipient identities are deliberately omitted: +/// the anchor is permanent and public, and the v2 envelopes no longer expose +/// recipients, so the reader set must not be written to Arweave either. +/// +/// Returns the Irys/Arweave transaction ID, or `Ok("")` when `irys_url` is empty +/// (anchoring disabled) or there are no blobs to anchor. 
+pub async fn anchor_encrypted_manifest( + client: &reqwest::Client, + irys_url: &str, + manifest: &EncryptedManifest<'_>, +) -> Result { + if irys_url.is_empty() || manifest.blobs.is_empty() { + return Ok(String::new()); + } + + let blobs_json: Vec = manifest + .blobs + .iter() + .map(|(oid, cid)| manifest_blob_json(oid, cid)) + .collect(); + + let payload = json!({ + "schema": "gitlawb/encrypted-manifest/v1", + "repo": manifest.repo, + "owner_did": manifest.owner_did, + "node_did": manifest.node_did, + "timestamp": manifest.timestamp, + "blobs": blobs_json, + }); + + let body = serde_json::to_vec(&payload)?; + let url = format!("{}/upload", irys_url.trim_end_matches('/')); + + let resp = client + .post(&url) + .header("Content-Type", "application/json") + .header("x-irys-tags", build_manifest_tags_header(manifest)) + .body(body) + .send() + .await + .map_err(|e| anyhow::anyhow!("Irys upload failed: {e}"))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + return Err(anyhow::anyhow!("Irys returned {status}: {body}")); + } + + let json: serde_json::Value = resp + .json() + .await + .map_err(|e| anyhow::anyhow!("failed to parse Irys response: {e}"))?; + + let tx_id = json["id"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("no 'id' in Irys response: {json}"))? + .to_string(); + + tracing::info!( + repo = %manifest.repo, + tx_id = %tx_id, + blobs = manifest.blobs.len(), + "anchored encrypted manifest to Arweave" + ); + + Ok(tx_id) +} + +/// Serialize one blob for the Arweave manifest. Recipient identities are +/// intentionally absent so the permanent public anchor never records who can +/// read a blob. +fn manifest_blob_json(oid: &str, cid: &str) -> serde_json::Value { + json!({ "oid": oid, "cid": cid }) +} + +/// Build the Irys tag header for an encrypted-blob manifest. `Repo` and `Schema` +/// are the tags the `gl` recovery query filters on. 
+fn build_manifest_tags_header(manifest: &EncryptedManifest<'_>) -> String { + [ + "App-Name:gitlawb".to_string(), + "Schema:gitlawb/encrypted-manifest/v1".to_string(), + format!("Repo:{}", sanitize_tag(manifest.repo)), + format!("Owner-DID:{}", sanitize_tag(manifest.owner_did)), + format!("Node-DID:{}", sanitize_tag(manifest.node_did)), + ] + .join(",") +} + /// Arweave permanent URL for a given Irys transaction ID. pub fn arweave_url(tx_id: &str) -> String { format!("https://arweave.net/{tx_id}") @@ -193,6 +295,79 @@ mod tests { ); } + #[tokio::test] + async fn test_manifest_anchor_noop_when_url_empty() { + let client = reqwest::Client::new(); + let blobs = vec![("oid1".to_string(), "cid1".to_string())]; + let m = EncryptedManifest { + repo: "alice/r", + owner_did: "did:key:zO", + node_did: "did:key:zN", + timestamp: "2026-06-11T00:00:00Z", + blobs: &blobs, + }; + assert_eq!( + anchor_encrypted_manifest(&client, "", &m).await.unwrap(), + "" + ); + } + + #[tokio::test] + async fn test_manifest_anchor_noop_when_no_blobs() { + let client = reqwest::Client::new(); + let blobs: Vec<(String, String)> = vec![]; + let m = EncryptedManifest { + repo: "alice/r", + owner_did: "did:key:zO", + node_did: "did:key:zN", + timestamp: "2026-06-11T00:00:00Z", + blobs: &blobs, + }; + // Non-empty URL, but no blobs: still a no-op. 
+ assert_eq!( + anchor_encrypted_manifest(&client, "https://example.invalid", &m) + .await + .unwrap(), + "" + ); + } + + #[tokio::test] + async fn test_manifest_anchor_success() { + let mut server = mockito::Server::new_async().await; + let _mock = server + .mock("POST", "/upload") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(r#"{"id":"MANIFESTTX123","timestamp":1710000000000,"version":"1.0.0"}"#) + .create_async() + .await; + + let client = reqwest::Client::new(); + let blobs = vec![("oid1".to_string(), "cid1".to_string())]; + let m = EncryptedManifest { + repo: "alice/r", + owner_did: "did:key:zO", + node_did: "did:key:zN", + timestamp: "2026-06-11T00:00:00Z", + blobs: &blobs, + }; + let r = anchor_encrypted_manifest(&client, &server.url(), &m).await; + assert_eq!(r.unwrap(), "MANIFESTTX123"); + _mock.assert_async().await; + } + + #[test] + fn manifest_blob_json_omits_recipients() { + let v = manifest_blob_json("oid1", "cidA"); + assert_eq!(v["oid"], "oid1"); + assert_eq!(v["cid"], "cidA"); + assert!( + v.get("recipients").is_none(), + "Arweave manifest must not anchor recipient identities" + ); + } + #[test] fn test_sanitize_tag() { assert_eq!(sanitize_tag("alice/myrepo"), "alice/myrepo"); diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index 6af5ee8..be83ead 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -720,6 +720,32 @@ const MIGRATIONS: &[Migration] = &[ "CREATE INDEX IF NOT EXISTS idx_visibility_rules_repo ON visibility_rules(repo_id)", ], }, + Migration { + version: 4, + name: "encrypted_blobs", + stmts: &[ + r#"CREATE TABLE IF NOT EXISTS encrypted_blobs ( + repo_id TEXT NOT NULL, + oid TEXT NOT NULL, + cid TEXT NOT NULL, + recipients TEXT NOT NULL, + created_at TEXT NOT NULL, + PRIMARY KEY (repo_id, oid) + )"#, + "CREATE INDEX IF NOT EXISTS idx_encrypted_blobs_repo ON encrypted_blobs(repo_id)", + ], + }, + Migration { + version: 5, + name: 
"encrypted_blobs_blind_recipients", + stmts: &[ + // Replace the cleartext recipient DID list with an opaque, node-keyed + // tag used only to detect a recipient-set change. Existing rows get an + // empty tag and are re-sealed on the next push. + "ALTER TABLE encrypted_blobs DROP COLUMN IF EXISTS recipients", + "ALTER TABLE encrypted_blobs ADD COLUMN IF NOT EXISTS recipients_tag TEXT NOT NULL DEFAULT ''", + ], + }, ]; // ── Repos ───────────────────────────────────────────────────────────────────── @@ -1621,6 +1647,76 @@ impl Db { Ok(()) } + pub async fn record_encrypted_blob( + &self, + repo_id: &str, + oid: &str, + cid: &str, + recipients_tag: &str, + ) -> Result<()> { + sqlx::query( + "INSERT INTO encrypted_blobs (repo_id, oid, cid, recipients_tag, created_at) + VALUES ($1, $2, $3, $4, $5) + ON CONFLICT (repo_id, oid) DO UPDATE SET cid = EXCLUDED.cid, recipients_tag = EXCLUDED.recipients_tag", + ) + .bind(repo_id) + .bind(oid) + .bind(cid) + .bind(recipients_tag) + .bind(Utc::now().to_rfc3339()) + .execute(&self.pool) + .await?; + Ok(()) + } + + /// (oid, cid) for every encrypted blob in the repo, unscoped by caller. Used + /// by both the B2 replication view and B1 discovery. Recipient identities are + /// not stored, so authorization is the caller's repo readability, not a per + /// recipient check. Ciphertext metadata only. + pub async fn list_all_encrypted_blobs(&self, repo_id: &str) -> Result> { + let rows = sqlx::query("SELECT oid, cid FROM encrypted_blobs WHERE repo_id = $1") + .bind(repo_id) + .fetch_all(&self.pool) + .await?; + let mut out = Vec::new(); + for row in rows { + let oid: String = row.get("oid"); + let cid: String = row.get("cid"); + out.push((oid, cid)); + } + Ok(out) + } + + /// The CID of one encrypted blob, or None if there is no such row. Recipient + /// authorization is not enforced here: the handler checks repo readability and + /// the envelope crypto gates decryption. 
+ pub async fn encrypted_blob_cid(&self, repo_id: &str, oid: &str) -> Result> { + let row = sqlx::query("SELECT cid FROM encrypted_blobs WHERE repo_id = $1 AND oid = $2") + .bind(repo_id) + .bind(oid) + .fetch_optional(&self.pool) + .await?; + Ok(row.map(|r| r.get("cid"))) + } + + /// The opaque recipients tag stored for an encrypted blob, or None if there is + /// no row. Used only to decide whether a re-seal is needed (the recipient set + /// changed); the tag is a node-keyed fingerprint, not the DID list. + pub async fn encrypted_blob_recipients_tag( + &self, + repo_id: &str, + oid: &str, + ) -> Result> { + let row = sqlx::query( + "SELECT recipients_tag FROM encrypted_blobs WHERE repo_id = $1 AND oid = $2", + ) + .bind(repo_id) + .bind(oid) + .fetch_optional(&self.pool) + .await?; + Ok(row.map(|r| r.get("recipients_tag"))) + } + pub async fn list_pinned_cids(&self) -> Result> { let rows = sqlx::query( "SELECT sha256_hex, cid, pinned_at, pinata_cid FROM pinned_cids ORDER BY pinned_at DESC", diff --git a/crates/gitlawb-node/src/encrypted_pin.rs b/crates/gitlawb-node/src/encrypted_pin.rs new file mode 100644 index 0000000..25439ee --- /dev/null +++ b/crates/gitlawb-node/src/encrypted_pin.rs @@ -0,0 +1,136 @@ +//! Encrypt-then-pin for withheld blobs (Option B1). Each withheld blob is sealed +//! to its recipient DIDs and the envelope pinned to IPFS, recorded in +//! `encrypted_blobs`. Best-effort per blob: a failure is logged and skipped, +//! never pinned in plaintext. + +use std::collections::{BTreeSet, HashMap}; +use std::path::Path; +use std::str::FromStr; + +use ed25519_dalek::VerifyingKey; +use gitlawb_core::did::Did; +use gitlawb_core::encrypt::seal_blob; + +use crate::db::Db; + +use hmac::{Hmac, Mac}; +use sha2::Sha256; + +type HmacSha256 = Hmac; + +/// Opaque, node-keyed fingerprint of a blob's recipient set. 
Stored in place of +/// the cleartext DID list so a DB compromise cannot reveal the reader set; used +/// only to detect a recipient-set change so an unchanged blob is not re-sealed. +/// Order-insensitive (the input `BTreeSet` is already sorted). +pub fn recipients_tag(node_seed: &[u8; 32], dids: &BTreeSet) -> String { + let mut mac = HmacSha256::new_from_slice(node_seed).expect("HMAC accepts any key length"); + mac.update(b"gitlawb/recipients-tag/v1"); + for did in dids { + mac.update(b"\n"); + mac.update(did.as_bytes()); + } + hex::encode(mac.finalize().into_bytes()) +} + +/// Resolve a DID string to its Ed25519 verifying key, or None if it carries no +/// inline key (e.g. did:web / did:gitlawb). +fn did_to_key(did: &str) -> Option { + Did::from_str(did).ok()?.to_verifying_key().ok() +} + +/// Encrypt and pin every withheld blob. `recipients` maps blob oid -> DID set; +/// `node_seed` keys the opaque recipients tag. Returns `(oid, cid)` for each blob +/// actually sealed and recorded this call (the per-push delta), used by Option B3 +/// to anchor a manifest. Recipient identities are never stored or returned. +pub async fn encrypt_and_pin( + ipfs_api: &str, + repo_path: &Path, + db: &Db, + repo_id: &str, + node_seed: &[u8; 32], + recipients: &HashMap>, +) -> Vec<(String, String)> { + let mut sealed = Vec::new(); + for (oid, dids) in recipients { + // Skip only if an existing envelope already covers exactly these + // recipients. If the recipient set changed (e.g. a reader was added to + // the rule), re-seal so the new reader can recover the blob. Reader + // removal is not retroactive: the old envelope is already public. The + // comparison is on the opaque node-keyed tag, never the DID list. 
+ let tag = recipients_tag(node_seed, dids); + match db.encrypted_blob_recipients_tag(repo_id, oid).await { + Ok(Some(stored_tag)) if stored_tag == tag => continue, + Ok(_) => {} + Err(e) => { + // A DB read failure is not a cache miss: re-sealing here would do + // an avoidable IPFS write during a partial outage. Skip and retry + // on the next push. + tracing::warn!(oid = %oid, err = %e, "recipients_tag lookup failed; skipping reseal"); + continue; + } + } + let keys: Vec = dids.iter().filter_map(|d| did_to_key(d)).collect(); + if keys.is_empty() { + tracing::warn!(oid = %oid, "no resolvable recipient keys; skipping encrypted pin"); + continue; + } + let data = match crate::git::store::read_object(repo_path, oid) { + Ok(Some((_t, bytes))) => bytes, + _ => continue, + }; + let envelope = match seal_blob(&data, &keys) { + Ok(e) => e, + Err(e) => { + tracing::warn!(oid = %oid, err = %e, "seal_blob failed; skipping"); + continue; + } + }; + let cid = match crate::ipfs_pin::pin_git_object(ipfs_api, oid, &envelope).await { + Ok(c) if !c.is_empty() => c, + _ => continue, + }; + if let Err(e) = db.record_encrypted_blob(repo_id, oid, &cid, &tag).await { + tracing::warn!(oid = %oid, err = %e, "record_encrypted_blob failed"); + continue; + } + sealed.push((oid.clone(), cid.clone())); + } + sealed +} + +#[cfg(test)] +mod tests { + use super::recipients_tag; + use std::collections::BTreeSet; + + fn set(dids: &[&str]) -> BTreeSet { + dids.iter().map(|s| s.to_string()).collect() + } + + #[test] + fn tag_is_order_insensitive() { + let seed = [7u8; 32]; + let a = recipients_tag(&seed, &set(&["did:key:zA", "did:key:zB"])); + let b = recipients_tag(&seed, &set(&["did:key:zB", "did:key:zA"])); + assert_eq!(a, b); + } + + #[test] + fn tag_differs_for_different_sets() { + let seed = [7u8; 32]; + let a = recipients_tag(&seed, &set(&["did:key:zA"])); + let b = recipients_tag(&seed, &set(&["did:key:zA", "did:key:zB"])); + assert_ne!(a, b); + } + + #[test] + fn tag_is_keyed_by_node_seed() 
{ + let dids = set(&["did:key:zA", "did:key:zB"]); + let a = recipients_tag(&[1u8; 32], &dids); + let b = recipients_tag(&[2u8; 32], &dids); + assert_ne!( + a, b, + "tag must depend on the node seed, not be a plain hash" + ); + } +} diff --git a/crates/gitlawb-node/src/git/mod.rs b/crates/gitlawb-node/src/git/mod.rs index 4dcd233..49259d5 100644 --- a/crates/gitlawb-node/src/git/mod.rs +++ b/crates/gitlawb-node/src/git/mod.rs @@ -3,3 +3,4 @@ pub mod repo_store; pub mod smart_http; pub mod store; pub mod tigris; +pub mod visibility_pack; diff --git a/crates/gitlawb-node/src/git/smart_http.rs b/crates/gitlawb-node/src/git/smart_http.rs index 6a00107..80374fb 100644 --- a/crates/gitlawb-node/src/git/smart_http.rs +++ b/crates/gitlawb-node/src/git/smart_http.rs @@ -1,8 +1,9 @@ -use anyhow::{bail, Result}; +use anyhow::{bail, Context, Result}; use axum::body::Body; use axum::http::StatusCode; use axum::response::Response; use bytes::Bytes; +use std::collections::HashSet; use std::path::Path; use std::process::Stdio; use tokio::io::AsyncWriteExt; @@ -120,3 +121,571 @@ fn pkt_line(data: &str) -> Vec { let len = data.len() + 4; format!("{len:04x}{data}").into_bytes() } + +/// Build a packfile containing every object reachable from all refs EXCEPT the +/// given blob OIDs. Commits and trees are always included, so SHAs stay intact; +/// only the named blobs are dropped. +pub fn build_filtered_pack(repo_path: &Path, withheld: &HashSet) -> Result> { + // All reachable objects as "oid [path]" lines. 
+ let rev = std::process::Command::new("git") + .args(["rev-list", "--objects", "--all"]) + .current_dir(repo_path) + .output()?; + if !rev.status.success() { + bail!( + "git rev-list failed: {}", + String::from_utf8_lossy(&rev.stderr) + ); + } + let mut keep = Vec::new(); + for line in String::from_utf8_lossy(&rev.stdout).lines() { + let oid = line.split_whitespace().next().unwrap_or(""); + if oid.is_empty() || withheld.contains(oid) { + continue; + } + keep.push(oid.to_string()); + } + let mut child = std::process::Command::new("git") + .args(["pack-objects", "--stdout"]) + .current_dir(repo_path) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn()?; + { + use std::io::Write as _; + let mut stdin = child.stdin.take().expect("stdin"); + stdin.write_all(keep.join("\n").as_bytes())?; + stdin.write_all(b"\n")?; + } + let out = child.wait_with_output()?; + if !out.status.success() { + bail!( + "git pack-objects failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + } + Ok(out.stdout) +} + +/// Serve a clone/fetch with the withheld blobs removed from the response pack. +/// +/// The framing is git protocol v0 (`NAK` then the pack), matching the v0 ref +/// advertisement that `info_refs` emits (it runs `git upload-pack +/// --advertise-refs` without `GIT_PROTOCOL=version=2`, so clients negotiate v0). +/// If `info_refs` ever advertises v2, this serve path must learn v2 framing too. +/// +/// Because the pack deliberately omits blobs that the sent trees still +/// reference, the pack is not closed under reachability. A stock full clone +/// rejects it at fetch time ("remote did not send all necessary objects"); only +/// a partial clone (the client passes `--filter`, marking a promisor remote) +/// accepts the pack with the private blobs absent. Tree and commit SHAs stay +/// intact either way. 
The clean partial-clone client UX is a separate follow-up +/// (git-remote-gitlawb); the security guarantee (private bytes never leave the +/// node) holds regardless of client. +/// +/// Negotiation is intentionally ignored: rather than honoring the client's +/// `want`/`have` lines, this always sends a self-contained pack of every object +/// across all refs minus the withheld blobs, and replies `NAK`. A fresh clone +/// and an incremental fetch are both correct (the client de-duplicates objects +/// it already has); the cost is that a fetch re-sends the full object set +/// instead of a thin delta. Honoring negotiation for smaller fetch packs is an +/// optimization follow-up, not a correctness requirement. +pub async fn upload_pack_excluding( + repo_path: &Path, + request_body: Bytes, + withheld: &HashSet, +) -> Result { + // build_filtered_pack shells out to git (rev-list, pack-objects) with + // blocking std::process I/O; run it off the async worker so a large repo's + // pack build does not stall the tokio runtime. + let pack = { + let repo_path = repo_path.to_path_buf(); + let withheld = withheld.clone(); + tokio::task::spawn_blocking(move || build_filtered_pack(&repo_path, &withheld)) + .await + .context("filtered-pack build task panicked")?? + }; + + // The client lists its capabilities on the first `want` line. Honor + // side-band-64k when offered (every modern smart-HTTP client offers it); + // otherwise stream the raw pack after NAK. + let sideband = memmem(&request_body, b"side-band-64k"); + + let mut body = Vec::new(); + body.extend_from_slice(&pkt_line("NAK\n")); + if sideband { + // Band 1 carries pack data, chunked under the pkt-line size limit. 
+ for chunk in pack.chunks(65515) { + let mut framed = Vec::with_capacity(chunk.len() + 1); + framed.push(0x01); + framed.extend_from_slice(chunk); + let len = framed.len() + 4; + body.extend_from_slice(format!("{len:04x}").as_bytes()); + body.extend_from_slice(&framed); + } + body.extend_from_slice(b"0000"); + } else { + body.extend_from_slice(&pack); + } + + Ok(Response::builder() + .status(StatusCode::OK) + .header("Content-Type", "application/x-git-upload-pack-result") + .header("Cache-Control", "no-cache") + .body(Body::from(body))?) +} + +/// True if `needle` occurs anywhere in `haystack`. Small substring scan used to +/// detect a client capability token in the upload-pack request body. +fn memmem(haystack: &[u8], needle: &[u8]) -> bool { + if needle.is_empty() || haystack.len() < needle.len() { + return needle.is_empty(); + } + haystack + .windows(needle.len()) + .any(|window| window == needle) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::process::Command; + use tempfile::TempDir; + + /// List OIDs in a pack by writing it to a temp dir and running verify-pack. + pub(super) fn pack_object_ids(pack: &[u8]) -> std::collections::HashSet { + let dir = TempDir::new().unwrap(); + let path = dir.path().join("test.pack"); + std::fs::write(&path, pack).unwrap(); + // index-pack creates the matching .idx next to the pack. + let ok = Command::new("git") + .args(["index-pack", path.to_str().unwrap()]) + .status() + .unwrap() + .success(); + assert!(ok, "index-pack failed"); + let out = Command::new("git") + .args(["verify-pack", "-v", path.to_str().unwrap()]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout) + .lines() + .filter_map(|l| l.split_whitespace().next()) + .filter(|t| t.len() == 40 && t.chars().all(|c| c.is_ascii_hexdigit())) + .map(|s| s.to_string()) + .collect() + } + + #[tokio::test] + async fn filtered_serve_excludes_withheld_blob() { + // Build a bare repo, capture the secret + public blob OIDs. 
+ let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + let g = |args: &[&str], dir: &std::path::Path| { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + }; + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"pub\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"SECRET\n").unwrap(); + g(&["init", "-q"], &work); + g(&["config", "user.email", "t@t"], &work); + g(&["config", "user.name", "t"], &work); + g(&["add", "."], &work); + g(&["commit", "-qm", "init"], &work); + let oid = |p: &str| { + let o = Command::new("git") + .args(["rev-parse", &format!("HEAD:{p}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + let secret = oid("secret/b.txt"); + let public = oid("public/a.txt"); + g( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + let mut withheld = std::collections::HashSet::new(); + withheld.insert(secret.clone()); + + let pack = build_filtered_pack(&bare, &withheld).unwrap(); + let ids = pack_object_ids(&pack); + assert!(ids.contains(&public), "public blob must be in the pack"); + assert!( + !ids.contains(&secret), + "secret blob must NOT be in the pack" + ); + } + + #[tokio::test] + async fn client_clone_lacks_withheld_blob_bytes() { + use axum::body::to_bytes; + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + let g = |args: &[&str], dir: &std::path::Path| { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + }; + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"pub\n").unwrap(); + 
std::fs::write(work.join("secret/b.txt"), b"SECRET\n").unwrap(); + g(&["init", "-q"], &work); + g(&["config", "user.email", "t@t"], &work); + g(&["config", "user.name", "t"], &work); + g(&["add", "."], &work); + g(&["commit", "-qm", "init"], &work); + let oid = |p: &str| { + let o = Command::new("git") + .args(["rev-parse", &format!("HEAD:{p}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + let secret_oid = oid("secret/b.txt"); + let public_oid = oid("public/a.txt"); + g( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + let mut withheld = std::collections::HashSet::new(); + withheld.insert(secret_oid.clone()); + + // A realistic v0 request advertises side-band-64k, so the serve frames + // the pack in band 1 (the path real clients exercise). + let req = Bytes::from_static( + b"0098want 0000000000000000000000000000000000000000 \ + side-band-64k ofs-delta agent=git/2\n00000009done\n", + ); + let resp = upload_pack_excluding(&bare, req, &withheld).await.unwrap(); + let body = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let ids = pack_object_ids(&extract_pack(&body)); + assert!( + ids.contains(&public_oid), + "public blob must be present in served pack" + ); + assert!( + !ids.contains(&secret_oid), + "withheld blob must be absent from served pack" + ); + } + + /// Strip the v0 upload-pack framing (NAK line + sideband-64k bands), + /// returning the raw pack. Mirrors how a client de-frames the band-1 stream. + fn extract_pack(body: &[u8]) -> Vec { + let mut out = Vec::new(); + let mut i = 0; + while i + 4 <= body.len() { + let len = + usize::from_str_radix(std::str::from_utf8(&body[i..i + 4]).unwrap_or("0000"), 16) + .unwrap_or(0); + if len == 0 { + i += 4; + continue; + } + let chunk = &body[i + 4..i + len]; + // band 1 = pack data; skip the NAK line and any other bands. 
+ if chunk.first() == Some(&0x01) { + out.extend_from_slice(&chunk[1..]); + } + i += len; + } + out + } + + // Shared harness for the real-git server tests: a minimal smart-HTTP server + // backed by the real info_refs + upload_pack_excluding. + + #[derive(Clone)] + struct FilterState { + repo: std::path::PathBuf, + withheld: HashSet, + } + + async fn refs_handler( + axum::extract::State(st): axum::extract::State>, + axum::extract::Query(q): axum::extract::Query>, + ) -> Response { + let service = q.get("service").cloned().unwrap_or_default(); + info_refs(&st.repo, &service).await.unwrap() + } + + async fn pack_handler( + axum::extract::State(st): axum::extract::State>, + body: Bytes, + ) -> Response { + upload_pack_excluding(&st.repo, body, &st.withheld) + .await + .unwrap() + } + + /// Spawn the server for `bare`, withholding `withheld`. Returns the clone URL + /// and the server task (abort it when done). + async fn spawn_filter_server( + bare: std::path::PathBuf, + withheld: HashSet, + ) -> (String, tokio::task::JoinHandle<()>) { + use axum::routing::{get, post}; + let state = std::sync::Arc::new(FilterState { + repo: bare, + withheld, + }); + let app = axum::Router::new() + .route("/repo.git/info/refs", get(refs_handler)) + .route("/repo.git/git-upload-pack", post(pack_handler)) + .with_state(state); + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); + let port = listener.local_addr().unwrap().port(); + let handle = tokio::spawn(async move { + axum::serve(listener, app).await.unwrap(); + }); + (format!("http://127.0.0.1:{port}/repo.git"), handle) + } + + fn run_git(args: &[&str], dir: &std::path::Path) { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + } + + /// Build a work repo (public/a.txt, secret/b.txt) and a bare clone of it. + /// Returns (work, bare, secret_blob_oid, public_blob_oid). 
+ fn fixture_with_secret( + td: &TempDir, + ) -> (std::path::PathBuf, std::path::PathBuf, String, String) { + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"pub\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"SECRET\n").unwrap(); + run_git(&["init", "-q"], &work); + run_git(&["config", "user.email", "t@t"], &work); + run_git(&["config", "user.name", "t"], &work); + run_git(&["add", "."], &work); + run_git(&["commit", "-qm", "init"], &work); + let oid = |p: &str| { + let o = Command::new("git") + .args(["rev-parse", &format!("HEAD:{p}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + let secret_oid = oid("secret/b.txt"); + let public_oid = oid("public/a.txt"); + run_git( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + (work, bare, secret_oid, public_oid) + } + + /// Enumerate exactly the objects a repo physically has (no promisor lazy + /// fetch), so tests assert on what bytes actually crossed the wire. + fn local_object_ids(repo: &std::path::Path) -> String { + let out = Command::new("git") + .args(["cat-file", "--batch-all-objects", "--batch-check"]) + .current_dir(repo) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).into_owned() + } + + /// End-to-end: a real `git` client clones through `info_refs` + + /// `upload_pack_excluding` and ends up without the withheld blob's bytes + /// while still seeing its tree entry (SHA). Uses a partial clone + /// (`--filter`) because a pack that omits a referenced blob is only + /// accepted by a promisor-aware client; a stock full clone is refused at + /// fetch time by the connectivity check. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn real_git_partial_clone_omits_withheld_blob() { + let td = TempDir::new().unwrap(); + let (_work, bare, secret_oid, public_oid) = fixture_with_secret(&td); + + let (url, server) = spawn_filter_server(bare, HashSet::from([secret_oid.clone()])).await; + + let dest = td.path().join("clone"); + let dest_s = dest.to_str().unwrap().to_string(); + let out = tokio::task::spawn_blocking(move || { + Command::new("git") + .args([ + "-c", + "protocol.version=2", + "clone", + "--filter=blob:none", + "--no-checkout", + "-q", + &url, + &dest_s, + ]) + .output() + .unwrap() + }) + .await + .unwrap(); + + assert!( + out.status.success(), + "clone failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + + // The public blob is present in the clone, the withheld blob is not. + let local = local_object_ids(&dest); + assert!( + local.contains(&public_oid), + "public blob should be present in the clone" + ); + assert!( + !local.contains(&secret_oid), + "withheld blob bytes must be absent from the clone" + ); + + // The tree entry (and SHA) for the private file is still visible. + let tree = Command::new("git") + .args(["ls-tree", "-r", "HEAD"]) + .current_dir(&dest) + .output() + .unwrap(); + let tree = String::from_utf8_lossy(&tree.stdout); + assert!( + tree.contains(&secret_oid) && tree.contains("secret/b.txt"), + "the private path and its blob SHA must remain visible: {tree}" + ); + + server.abort(); + } + + /// End-to-end: an incremental `git fetch` after a partial clone still works + /// and still withholds the private blob. The serve path ignores the client's + /// have/want negotiation and always sends a self-contained pack of all refs + /// minus the withheld blobs (it replies NAK, so the client treats it as "no + /// common commits" and accepts the full set). This is correct, just not + /// bandwidth-optimal; thin-pack/negotiation is an optimization follow-up. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn real_git_fetch_after_partial_clone_still_withholds() { + let td = TempDir::new().unwrap(); + let (work, bare, secret_oid, _public_oid) = fixture_with_secret(&td); + let branch = { + let o = Command::new("git") + .args(["symbolic-ref", "--short", "HEAD"]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + + let (url, server) = + spawn_filter_server(bare.clone(), HashSet::from([secret_oid.clone()])).await; + + // Partial-clone the initial state. + let dest = td.path().join("clone"); + let dest_s = dest.to_str().unwrap().to_string(); + let url_c = url.clone(); + let out = tokio::task::spawn_blocking(move || { + Command::new("git") + .args([ + "-c", + "protocol.version=2", + "clone", + "--filter=blob:none", + "--no-checkout", + "-q", + &url_c, + &dest_s, + ]) + .output() + .unwrap() + }) + .await + .unwrap(); + assert!( + out.status.success(), + "clone failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + + // Add a new public commit on the server side. + std::fs::write(work.join("public/c.txt"), b"v2\n").unwrap(); + run_git(&["add", "."], &work); + run_git(&["commit", "-qm", "c2"], &work); + let new_oid = { + let o = Command::new("git") + .args(["rev-parse", "HEAD:public/c.txt"]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + run_git(&["push", "-q", bare.to_str().unwrap(), &branch], &work); + + // Incremental fetch: the client has c1 and asks for the update. + let dest_f = dest.clone(); + let out = tokio::task::spawn_blocking(move || { + Command::new("git") + .args(["-c", "protocol.version=2", "fetch", "-q", "origin"]) + .current_dir(&dest_f) + .output() + .unwrap() + }) + .await + .unwrap(); + assert!( + out.status.success(), + "fetch failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + + // The new commit's blob arrived; the withheld blob is still absent. 
+ let local = local_object_ids(&dest); + assert!( + local.contains(&new_oid), + "the new commit's blob must be fetched" + ); + assert!( + !local.contains(&secret_oid), + "withheld blob must remain absent after fetch" + ); + + server.abort(); + } +} diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs new file mode 100644 index 0000000..90ca772 --- /dev/null +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -0,0 +1,298 @@ +//! Resolve which blob OIDs must be withheld from a caller because every path +//! at which the blob appears is denied by the repo's visibility rules. Trees +//! and commits are never withheld (mode B keeps SHAs intact); only blob +//! content is held back. + +use crate::db::VisibilityRule; +use crate::git::store; +use crate::visibility::{visibility_check, Decision}; +use anyhow::{Context, Result}; +use std::collections::{BTreeSet, HashMap, HashSet}; +use std::path::Path; + +/// List every (blob_oid, "/repo/relative/path") pair reachable from any branch +/// ref in `repo_path`. Uses `git ls-tree -r` per ref so each path a blob lives +/// at is represented (the same blob content can appear at several paths). Paths +/// are returned with a leading "/" to match the glob form used by visibility +/// rules ("/secret/**"). 
+fn blob_paths(repo_path: &Path) -> Result> { + let refs = store::list_refs(repo_path).context("list_refs failed")?; + let mut out = Vec::new(); + for (refname, _oid) in refs { + if !refname.starts_with("refs/heads/") && !refname.starts_with("refs/tags/") { + continue; + } + let listing = std::process::Command::new("git") + .args(["ls-tree", "-r", &refname]) + .current_dir(repo_path) + .output() + .context("git ls-tree -r failed")?; + if !listing.status.success() { + continue; + } + for line in String::from_utf8_lossy(&listing.stdout).lines() { + // " blob \t" + let Some((meta, path)) = line.split_once('\t') else { + continue; + }; + let mut parts = meta.split_whitespace(); + let _mode = parts.next(); + let kind = parts.next(); + let oid = parts.next(); + if kind == Some("blob") { + if let Some(oid) = oid { + out.push((oid.to_string(), format!("/{path}"))); + } + } + } + } + Ok(out) +} + +/// Blob OIDs the caller may not read. A blob is withheld only if visibility +/// denies the caller at *every* path the blob appears at; a blob that is also +/// reachable through an allowed path is sent (its content is public elsewhere). +/// +/// The whole-repo "/" gate is handled by the caller before this function runs: +/// if "/" denies, the caller gets a 404 and never reaches the filtered serve. +pub fn withheld_blob_oids( + repo_path: &Path, + rules: &[VisibilityRule], + is_public: bool, + owner_did: &str, + caller: Option<&str>, +) -> Result> { + let mut denied: HashSet = HashSet::new(); + let mut allowed: HashSet = HashSet::new(); + for (oid, path) in blob_paths(repo_path)? { + match visibility_check(rules, is_public, owner_did, caller, &path) { + Decision::Deny => { + denied.insert(oid); + } + Decision::Allow => { + allowed.insert(oid); + } + } + } + Ok(denied.difference(&allowed).cloned().collect()) +} + +/// Objects that may replicate to the public: everything not in `withheld`. +/// Order-preserving. 
The single seam every replication site (IPFS, Pinata) +/// passes its object list through; option B would later reroute the withheld +/// ones through encrypt-then-pin instead of dropping them. +pub fn replicable_objects(all: Vec, withheld: &HashSet) -> Vec { + all.into_iter() + .filter(|oid| !withheld.contains(oid)) + .collect() +} + +/// For every blob withheld from anonymous, the DIDs allowed to read it: the +/// owner plus any reader DID that `visibility_check` Allows at some path the +/// blob appears at. Least-privilege: a reader of one private subtree is not a +/// recipient of a blob that only lives in another. +pub fn withheld_blob_recipients( + repo_path: &Path, + rules: &[VisibilityRule], + is_public: bool, + owner_did: &str, +) -> Result>> { + let withheld = withheld_blob_oids(repo_path, rules, is_public, owner_did, None)?; + if withheld.is_empty() { + return Ok(HashMap::new()); + } + let mut candidates: BTreeSet = BTreeSet::new(); + for r in rules { + for d in &r.reader_dids { + candidates.insert(d.clone()); + } + } + let mut out: HashMap> = HashMap::new(); + for (oid, path) in blob_paths(repo_path)? 
{ + if !withheld.contains(&oid) { + continue; + } + let entry = out.entry(oid).or_default(); + entry.insert(owner_did.to_string()); + for did in &candidates { + if visibility_check(rules, is_public, owner_did, Some(did), &path) == Decision::Allow { + entry.insert(did.clone()); + } + } + } + Ok(out) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::db::VisibilityMode; + use chrono::Utc; + use std::process::Command; + use tempfile::TempDir; + + fn rule(path_glob: &str, readers: &[&str]) -> VisibilityRule { + VisibilityRule { + id: "x".into(), + repo_id: "r1".into(), + path_glob: path_glob.into(), + mode: VisibilityMode::B, + reader_dids: readers.iter().map(|s| s.to_string()).collect(), + created_by: "did:key:zOwner".into(), + created_at: Utc::now(), + } + } + + const OWNER: &str = "did:key:zOwner"; + + /// Build a bare repo with public/a.txt and secret/b.txt at one commit. + /// Returns (tempdir, bare_path, secret_blob_oid, public_blob_oid). + fn fixture() -> (TempDir, std::path::PathBuf, String, String) { + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + let run = |args: &[&str], dir: &Path| { + let ok = Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success(); + assert!(ok, "git {args:?} failed"); + }; + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"public bytes\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"TOP SECRET\n").unwrap(); + run(&["init", "-q"], &work); + run(&["config", "user.email", "t@t"], &work); + run(&["config", "user.name", "t"], &work); + run(&["add", "."], &work); + run(&["commit", "-qm", "init"], &work); + let oid = |path: &str| { + let out = Command::new("git") + .args(["rev-parse", &format!("HEAD:{path}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + 
}; + let secret = oid("secret/b.txt"); + let public = oid("public/a.txt"); + run( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + (td, bare, secret, public) + } + + #[test] + fn anonymous_caller_withholds_only_private_blob() { + let (_td, bare, secret_oid, public_oid) = fixture(); + let rules = [rule("/secret/**", &[])]; + // caller = None models the public / any peer: what must not replicate. + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, None).unwrap(); + assert!( + withheld.contains(&secret_oid), + "secret blob must be withheld" + ); + assert!( + !withheld.contains(&public_oid), + "public blob must replicate" + ); + // Trees and commits are never withheld; the set holds only the secret blob. + assert_eq!(withheld.len(), 1, "only the secret blob OID is withheld"); + } + + #[test] + fn non_reader_withholds_only_the_private_blob() { + let (_td, bare, secret, public) = fixture(); + let rules = [rule("/secret/**", &["did:key:zFriend"])]; + let withheld = + withheld_blob_oids(&bare, &rules, true, OWNER, Some("did:key:zStranger")).unwrap(); + assert!(withheld.contains(&secret), "secret blob must be withheld"); + assert!( + !withheld.contains(&public), + "public blob must NOT be withheld" + ); + } + + #[test] + fn owner_withholds_nothing() { + let (_td, bare, secret, public) = fixture(); + let rules = [rule("/secret/**", &["did:key:zFriend"])]; + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, Some(OWNER)).unwrap(); + assert!(withheld.is_empty(), "owner sees everything"); + let _ = (secret, public); + } + + #[test] + fn listed_reader_withholds_nothing() { + let (_td, bare, _secret, _public) = fixture(); + let rules = [rule("/secret/**", &["did:key:zFriend"])]; + let withheld = + withheld_blob_oids(&bare, &rules, true, OWNER, Some("did:key:zFriend")).unwrap(); + assert!(withheld.is_empty(), "listed reader sees the subtree"); + } + + #[test] + fn 
no_subtree_rules_withholds_nothing() { + let (_td, bare, _secret, _public) = fixture(); + let withheld = withheld_blob_oids(&bare, &[], true, OWNER, None).unwrap(); + assert!( + withheld.is_empty(), + "public repo, no rules, nothing withheld" + ); + } + + #[test] + fn replicable_objects_drops_withheld_keeps_rest() { + let all = vec!["aaa".to_string(), "bbb".to_string(), "ccc".to_string()]; + let withheld: HashSet = ["bbb".to_string()].into_iter().collect(); + let got = replicable_objects(all, &withheld); + assert_eq!(got, vec!["aaa".to_string(), "ccc".to_string()]); + } + + #[test] + fn replicable_objects_empty_withheld_keeps_all() { + let all = vec!["aaa".to_string(), "bbb".to_string()]; + let withheld: HashSet = HashSet::new(); + let got = replicable_objects(all.clone(), &withheld); + assert_eq!(got, all); + } + + #[test] + fn recipients_are_owner_plus_allowed_readers_only() { + let (_td, repo, secret_oid, public_oid) = fixture(); + let reader = "did:key:zReader"; + let rules = vec![rule("/secret/**", &[reader])]; + let map = withheld_blob_recipients(&repo, &rules, true, OWNER).unwrap(); + + let recips = map.get(&secret_oid).expect("secret blob has recipients"); + assert!(recips.contains(OWNER)); + assert!(recips.contains(reader)); + assert!( + !map.contains_key(&public_oid), + "public blob is not encrypted" + ); + } + + #[test] + fn node_seal_open_round_trip() { + use gitlawb_core::encrypt::{open_blob, seal_blob}; + use gitlawb_core::identity::Keypair; + let (_td, repo, secret_oid, _public) = fixture(); + let (_t, bytes) = crate::git::store::read_object(&repo, &secret_oid) + .unwrap() + .unwrap(); + let reader = Keypair::generate(); + let env = seal_blob(&bytes, &[reader.verifying_key()]).unwrap(); + assert_eq!(open_blob(&env, &reader).unwrap(), bytes); + } +} diff --git a/crates/gitlawb-node/src/ipfs_pin.rs b/crates/gitlawb-node/src/ipfs_pin.rs index 831f1ad..9bdaade 100644 --- a/crates/gitlawb-node/src/ipfs_pin.rs +++ b/crates/gitlawb-node/src/ipfs_pin.rs @@ 
-7,6 +7,8 @@ //! If `ipfs_api` is empty the functions are no-ops, so the node works fine //! without a local IPFS daemon. +use std::collections::HashSet; + use anyhow::Result; use gitlawb_core::cid::Cid; @@ -70,6 +72,19 @@ pub async fn pin_git_object(ipfs_api: &str, sha256_hex: &str, data: &[u8]) -> Re Ok(cid) } +/// Fetch raw bytes for a CID from the local Kubo node (`/api/v0/cat`). +pub async fn cat(ipfs_api: &str, cid: &str) -> Result> { + if ipfs_api.is_empty() { + return Err(anyhow::anyhow!("IPFS not configured")); + } + let url = format!("{}/api/v0/cat?arg={}", ipfs_api.trim_end_matches('/'), cid); + let resp = reqwest::Client::new().post(&url).send().await?; + if !resp.status().is_success() { + return Err(anyhow::anyhow!("ipfs cat {cid}: {}", resp.status())); + } + Ok(resp.bytes().await?.to_vec()) +} + /// List all git objects in the given bare repo and pin any that are not yet /// recorded in `pinned_cids`. /// @@ -78,6 +93,7 @@ pub async fn pin_new_objects( ipfs_api: &str, repo_path: &std::path::Path, db: &crate::db::Db, + withheld: &HashSet, ) -> Vec<(String, String)> { if ipfs_api.is_empty() { return vec![]; @@ -92,6 +108,8 @@ pub async fn pin_new_objects( } }; + let object_list = crate::git::visibility_pack::replicable_objects(object_list, withheld); + let mut pinned = Vec::new(); for sha in object_list { diff --git a/crates/gitlawb-node/src/main.rs b/crates/gitlawb-node/src/main.rs index 1946637..a49ed41 100644 --- a/crates/gitlawb-node/src/main.rs +++ b/crates/gitlawb-node/src/main.rs @@ -5,6 +5,7 @@ mod bootstrap; mod cert; mod config; mod db; +mod encrypted_pin; mod error; mod git; mod graphql; diff --git a/crates/gitlawb-node/src/pinata.rs b/crates/gitlawb-node/src/pinata.rs index ee9d416..90bddad 100644 --- a/crates/gitlawb-node/src/pinata.rs +++ b/crates/gitlawb-node/src/pinata.rs @@ -7,6 +7,7 @@ //! no-op, so nodes without Pinata backing work fine. 
use anyhow::Result; +use std::collections::HashSet; /// Pin a single git object's raw bytes on Pinata (v3 API). /// @@ -76,6 +77,7 @@ pub async fn pin_new_objects( jwt: &str, repo_path: &std::path::Path, db: &crate::db::Db, + withheld: &HashSet, ) -> Vec<(String, String)> { if jwt.is_empty() { return vec![]; @@ -92,6 +94,7 @@ pub async fn pin_new_objects( return vec![]; } }; + let object_list = crate::git::visibility_pack::replicable_objects(object_list, withheld); let mut pinned = Vec::new(); diff --git a/crates/gitlawb-node/src/server.rs b/crates/gitlawb-node/src/server.rs index 4a8ec37..31ce4b4 100644 --- a/crates/gitlawb-node/src/server.rs +++ b/crates/gitlawb-node/src/server.rs @@ -352,6 +352,22 @@ pub fn build_router(state: AppState) -> Router { "/{owner}/{repo}/git-upload-pack", post(repos::git_upload_pack), ) + .route( + "/api/v1/repos/{owner}/{repo}/withheld-paths", + axum::routing::get(visibility::withheld_paths), + ) + .route( + "/api/v1/repos/{owner}/{repo}/encrypted-blobs", + axum::routing::get(crate::api::encrypted::list_encrypted_blobs), + ) + .route( + "/api/v1/repos/{owner}/{repo}/encrypted-blob/{oid}", + axum::routing::get(crate::api::encrypted::get_encrypted_blob), + ) + .route( + "/api/v1/repos/{owner}/{repo}/encrypted-blobs/replicate", + axum::routing::get(crate::api::encrypted::replicate_encrypted_blobs), + ) .layer(DefaultBodyLimit::disable()) .layer(RequestBodyLimitLayer::new(pack_limit)) .layer(middleware::from_fn(auth::optional_signature)); diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index cdcfd3e..9639c9c 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -10,6 +10,7 @@ //! 3. If it exists → `git fetch --prune` from the origin. //! 4. Mark done or failed. 
+use std::collections::HashMap; use std::path::Path; use std::sync::Arc; @@ -18,6 +19,70 @@ use tracing::{info, warn}; use crate::config::Config; use crate::db::Db; +/// How to mirror a repo, decided from the origin's `withheld-paths` answer. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum MirrorMode { + /// No withheld content: a normal full mirror. + Plain, + /// Withheld content present: a promisor mirror that tolerates the blobs the + /// origin omits for an anonymous caller. + Promisor, +} + +/// Decide the mirror mode from the origin's `withheld-paths` response. +/// +/// `Some(non-empty)` → the repo has a private subtree → `Promisor`. +/// `Some(empty)` → fully public → `Plain`. +/// `None` → the lookup 404'd or failed. Attempt a `Plain` mirror; a +/// mode-A repo also 404s the git read endpoint, so the clone +/// fails and nothing is mirrored (fail-closed at the git +/// layer), while a public repo on a peer that predates the +/// `withheld-paths` route still gets mirrored. +fn classify_mirror(withheld: Option>) -> MirrorMode { + match withheld { + Some(globs) if !globs.is_empty() => MirrorMode::Promisor, + _ => MirrorMode::Plain, + } +} + +/// One encrypted blob as advertised by an origin's `encrypted-blobs/replicate` +/// endpoint (Option B2). Ciphertext metadata only; recipient identities are +/// withheld from peers, so a re-seal is detected by the CID changing. +#[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize)] +struct ReplicaBlob { + oid: String, + cid: String, +} + +/// The shape of the `encrypted-blobs/replicate` JSON response. +#[derive(Debug, serde::Deserialize)] +struct ReplicateResponse { + #[serde(default)] + blobs: Vec, +} + +/// Decide which of the origin's encrypted blobs this mirror must (re)replicate. +/// +/// `have` maps each already-stored blob's oid to the CID the mirror pinned. A +/// remote blob is returned when the mirror has no row for that oid, or when the +/// stored CID differs from the advertised one. 
A re-seal regenerates the +/// envelope (new content key, nonce, and per-recipient wraps), so the CID +/// changes while the OID stays stable; comparing CIDs detects a re-seal without +/// the mirror ever holding recipient identities. +fn blobs_needing_replication( + remote: &[ReplicaBlob], + have: &HashMap, +) -> Vec { + remote + .iter() + .filter(|b| match have.get(&b.oid) { + None => true, + Some(stored_cid) => stored_cid != &b.cid, + }) + .cloned() + .collect() +} + /// Start the background sync worker. Returns immediately; the worker runs /// as a detached tokio task that exits cleanly when `shutdown_rx` flips /// to `true`. @@ -37,12 +102,17 @@ async fn run( shutdown_rx: &mut tokio::sync::watch::Receiver, ) { let machine_id = std::env::var("FLY_MACHINE_ID").ok(); + // Bound each withheld-paths lookup so a stalled peer cannot hang the worker. + let client = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(30)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()); info!("sync worker started (auto_sync=true)"); let mut interval = tokio::time::interval(std::time::Duration::from_secs(30)); loop { tokio::select! 
{ _ = interval.tick() => { - process_batch(&db, &config, machine_id.as_deref()).await; + process_batch(&db, &config, machine_id.as_deref(), &client).await; } _ = shutdown_rx.changed() => { if *shutdown_rx.borrow() { @@ -54,7 +124,12 @@ async fn run( } } -async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { +async fn process_batch( + db: &Db, + config: &Config, + machine_id: Option<&str>, + client: &reqwest::Client, +) { let items = match db.dequeue_pending_syncs(10).await { Ok(v) => v, Err(e) => { @@ -103,10 +178,13 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { // (no .git suffix — the server routes don't include it) let remote_url = format!("{}/{}", origin_url, item.repo); + let withheld = fetch_withheld(client, &origin_url, owner_short, repo_name).await; + let mode = classify_mirror(withheld); + let result = if local_path.exists() { - fetch_repo(&local_path, &remote_url).await + fetch_repo(&local_path, &remote_url, mode).await } else { - clone_repo(&remote_url, &local_path).await + clone_repo(&remote_url, &local_path, mode).await }; match result { @@ -121,6 +199,20 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { machine_id, ) .await; + // Option B2: carry the encrypted withheld-blob envelopes too, so an + // authorized reader can recover private content from this mirror if + // the origin dies. `item.repo` is the slug "{owner_short}/{name}", + // which is the id upsert_mirror_repo wrote (the local repo_id). + replicate_encrypted_blobs( + client, + &origin_url, + owner_short, + repo_name, + db, + &item.repo, + &config.ipfs_api, + ) + .await; let _ = db.mark_sync_done(&item.id).await; crate::metrics::record_sync_processed("done"); } @@ -133,44 +225,424 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { } } -/// Mirror-clone a repo from a remote URL into a local bare repo. 
-async fn clone_repo(remote_url: &str, local_path: &Path) -> anyhow::Result<()> { +/// Query the origin's anonymous `withheld-paths` endpoint. Returns the withheld +/// glob list on a 2xx, or `None` on any non-success / network / parse error +/// (treated as "unknown" by `classify_mirror`). +async fn fetch_withheld( + client: &reqwest::Client, + origin_url: &str, + owner: &str, + repo: &str, +) -> Option> { + let url = format!("{origin_url}/api/v1/repos/{owner}/{repo}/withheld-paths"); + let resp = client.get(&url).send().await.ok()?; + if !resp.status().is_success() { + return None; + } + let body: serde_json::Value = resp.json().await.ok()?; + let globs = body + .get("withheld")? + .as_array()? + .iter() + .filter_map(|v| v.as_str().map(str::to_string)) + .collect(); + Some(globs) +} + +/// Replicate the origin's encrypted withheld blobs onto this mirror (Option B2). +/// +/// After the git objects are mirrored, fetch the origin's replication listing, +/// then for each blob the mirror does not already hold (or whose CID changed, +/// i.e. the origin re-sealed) pull the ciphertext envelope over IPFS, pin it +/// locally, and record the `encrypted_blobs` row keyed by this mirror's local +/// `repo_id`. The mirror stores no recipient identities. +/// +/// Best-effort and idempotent: any per-blob failure is logged and skipped, to be +/// retried on the next sync. Confidentiality is never at risk; the mirror only +/// ever handles ciphertext and never decrypts. Cleanly a no-op when IPFS is +/// unconfigured, the origin reports no encrypted blobs, or the replicate endpoint +/// is absent (older peer) or unreachable. 
+async fn replicate_encrypted_blobs( + client: &reqwest::Client, + origin_url: &str, + owner: &str, + repo: &str, + db: &Db, + repo_id: &str, + ipfs_api: &str, +) { + if ipfs_api.is_empty() { + return; + } + + let url = format!("{origin_url}/api/v1/repos/{owner}/{repo}/encrypted-blobs/replicate"); + let resp = match client.get(&url).send().await { + Ok(r) if r.status().is_success() => r, + _ => return, + }; + let parsed: ReplicateResponse = match resp.json().await { + Ok(p) => p, + Err(e) => { + warn!(repo = %repo, err = %e, "failed to parse encrypted-blobs/replicate response"); + return; + } + }; + if parsed.blobs.is_empty() { + return; + } + + let have: HashMap = match db.list_all_encrypted_blobs(repo_id).await { + Ok(rows) => rows.into_iter().collect(), + Err(e) => { + warn!(repo = %repo, err = %e, "failed to list local encrypted blobs for replication"); + return; + } + }; + + for blob in blobs_needing_replication(&parsed.blobs, &have) { + let envelope = match crate::ipfs_pin::cat(ipfs_api, &blob.cid).await { + Ok(bytes) => bytes, + Err(e) => { + warn!(oid = %blob.oid, cid = %blob.cid, err = %e, "failed to fetch encrypted envelope over IPFS; will retry next sync"); + continue; + } + }; + match crate::ipfs_pin::pin_git_object(ipfs_api, &blob.oid, &envelope).await { + Ok(cid) if !cid.is_empty() => { + if cid != blob.cid { + warn!(oid = %blob.oid, expected = %blob.cid, got = %cid, "replicated envelope CID mismatch; skipping record"); + continue; + } + if let Err(e) = db.record_encrypted_blob(repo_id, &blob.oid, &cid, "").await { + warn!(oid = %blob.oid, err = %e, "failed to record replicated encrypted blob"); + } + } + _ => { + warn!(oid = %blob.oid, "failed to pin replicated encrypted envelope; will retry next sync"); + } + } + } +} + +/// Run a git subprocess, returning an error with stderr on non-zero exit. 
+async fn git_run(args: &[&str]) -> anyhow::Result<()> { let out = tokio::process::Command::new("git") - .args([ - "clone", - "--mirror", - remote_url, - local_path.to_str().unwrap_or("."), - ]) + .args(args) .output() .await - .map_err(|e| anyhow::anyhow!("git clone failed to spawn: {e}"))?; - + .map_err(|e| anyhow::anyhow!("git failed to spawn: {e}"))?; if !out.status.success() { let stderr = String::from_utf8_lossy(&out.stderr); - return Err(anyhow::anyhow!("git clone --mirror failed: {stderr}")); + return Err(anyhow::anyhow!("git {args:?} failed: {stderr}")); } Ok(()) } -/// Fetch all refs from the remote into an existing mirror repo. -async fn fetch_repo(local_path: &Path, remote_url: &str) -> anyhow::Result<()> { +/// Run a git subprocess, ignoring a non-zero exit. Used for idempotent +/// `config --unset`, which exits non-zero when the key is already absent. +async fn git_run_lenient(args: &[&str]) { + let _ = tokio::process::Command::new("git") + .args(args) + .output() + .await; +} + +/// Read a single git config value; `None` if unset or on error. +async fn git_config_get(repo: &str, key: &str) -> Option { let out = tokio::process::Command::new("git") - .args([ - "-C", - local_path.to_str().unwrap_or("."), - "fetch", - "--prune", - remote_url, - "+refs/*:refs/*", - ]) + .args(["-C", repo, "config", "--get", key]) .output() .await - .map_err(|e| anyhow::anyhow!("git fetch failed to spawn: {e}"))?; + .ok()?; + if !out.status.success() { + return None; + } + let value = String::from_utf8_lossy(&out.stdout).trim().to_string(); + (!value.is_empty()).then_some(value) +} + +/// Mirror-clone a repo from a remote URL into a local bare repo. +/// `Promisor` mode adds `--filter=blob:limit=10g`, which marks the repo a git +/// promisor (so a pack with origin-omitted withheld blobs is accepted) while +/// the huge size limit means every blob the origin *does* send is kept. 
+async fn clone_repo(remote_url: &str, local_path: &Path, mode: MirrorMode) -> anyhow::Result<()> { + let local_str = local_path.to_str().unwrap_or("."); + let mut args = vec!["clone", "--mirror"]; + if mode == MirrorMode::Promisor { + args.push("--filter=blob:limit=10g"); + } + args.push(remote_url); + args.push(local_str); + + let out = tokio::process::Command::new("git") + .args(&args) + .output() + .await + .map_err(|e| anyhow::anyhow!("git clone failed to spawn: {e}"))?; if !out.status.success() { let stderr = String::from_utf8_lossy(&out.stderr); - return Err(anyhow::anyhow!("git fetch failed: {stderr}")); + return Err(anyhow::anyhow!("git clone --mirror failed: {stderr}")); } Ok(()) } + +/// Fetch all refs from the remote into an existing mirror repo. Refreshes the +/// stored `origin` URL (the peer's URL may have changed) and fetches via the +/// `origin` remote so any stored promisor settings are honored. +/// +/// `Promisor` applies the promisor config first (covers a repo that became +/// mode-B after a plain initial mirror). `Plain` on a mirror that was previously +/// a promisor (the repo went private -> public) clears the partial-clone config +/// and `--refetch`es, so the once-withheld, now-public blobs are backfilled +/// rather than left permanently missing. 
+async fn fetch_repo(local_path: &Path, remote_url: &str, mode: MirrorMode) -> anyhow::Result<()> { + let local_str = local_path.to_str().unwrap_or("."); + + git_run(&["-C", local_str, "remote", "set-url", "origin", remote_url]).await?; + + match mode { + MirrorMode::Promisor => { + git_run(&["-C", local_str, "config", "remote.origin.promisor", "true"]).await?; + git_run(&[ + "-C", + local_str, + "config", + "remote.origin.partialclonefilter", + "blob:limit=10g", + ]) + .await?; + git_run(&["-C", local_str, "fetch", "--prune", "origin"]).await + } + MirrorMode::Plain => { + let was_promisor = git_config_get(local_str, "remote.origin.promisor") + .await + .as_deref() + == Some("true"); + if was_promisor { + git_run_lenient(&[ + "-C", + local_str, + "config", + "--unset", + "remote.origin.promisor", + ]) + .await; + git_run_lenient(&[ + "-C", + local_str, + "config", + "--unset", + "remote.origin.partialclonefilter", + ]) + .await; + git_run(&["-C", local_str, "fetch", "--refetch", "--prune", "origin"]).await + } else { + git_run(&["-C", local_str, "fetch", "--prune", "origin"]).await + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::process::Command; + use tempfile::TempDir; + + #[test] + fn classify_promisor_when_withheld_nonempty() { + let mode = classify_mirror(Some(vec!["/secret/**".to_string()])); + assert!(matches!(mode, MirrorMode::Promisor)); + } + + #[test] + fn classify_plain_when_withheld_empty() { + let mode = classify_mirror(Some(vec![])); + assert!(matches!(mode, MirrorMode::Plain)); + } + + #[test] + fn classify_plain_when_lookup_failed() { + // None == 404 / network error / parse failure: attempt a plain mirror + // and let the git read endpoint fail-close a mode-A repo. 
+ let mode = classify_mirror(None); + assert!(matches!(mode, MirrorMode::Plain)); + } + + fn rb(oid: &str, cid: &str) -> ReplicaBlob { + ReplicaBlob { + oid: oid.to_string(), + cid: cid.to_string(), + } + } + + #[test] + fn replicate_stores_new_blob() { + let remote = vec![rb("oid1", "cidA")]; + let have = HashMap::new(); + assert_eq!(blobs_needing_replication(&remote, &have), remote); + } + + #[test] + fn replicate_skips_already_present_same_cid() { + let remote = vec![rb("oid1", "cidA")]; + let mut have = HashMap::new(); + have.insert("oid1".to_string(), "cidA".to_string()); + assert!(blobs_needing_replication(&remote, &have).is_empty()); + } + + #[test] + fn replicate_restores_on_cid_change() { + // The origin re-sealed: same oid, new envelope, new cid. + let remote = vec![rb("oid1", "cidB")]; + let mut have = HashMap::new(); + have.insert("oid1".to_string(), "cidA".to_string()); + assert_eq!(blobs_needing_replication(&remote, &have), remote); + } + + #[test] + fn replicate_empty_remote_is_noop() { + assert!(blobs_needing_replication(&[], &HashMap::new()).is_empty()); + } + + #[test] + fn replicate_response_parses() { + // An older origin may still send a recipients field; it must be ignored. + let json = r#"{"blobs":[{"oid":"o1","cid":"c1","recipients":["did:key:zA"]}]}"#; + let parsed: ReplicateResponse = serde_json::from_str(json).unwrap(); + assert_eq!(parsed.blobs.len(), 1); + assert_eq!(parsed.blobs[0].oid, "o1"); + assert_eq!(parsed.blobs[0].cid, "c1"); + } + + #[test] + fn replicate_response_empty_blobs_parses() { + let parsed: ReplicateResponse = serde_json::from_str(r#"{"blobs":[]}"#).unwrap(); + assert!(parsed.blobs.is_empty()); + } + + fn g(args: &[&str], dir: &Path) { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + } + + /// Build a bare remote containing `files`, committed on one branch. + /// Returns (tempdir, file:// url). file:// makes git honor --filter. 
+ fn bare_remote(files: &[(&str, &[u8])]) -> (TempDir, String) { + let td = TempDir::new().unwrap(); + let origin = td.path().join("origin"); + let bare = td.path().join("bare.git"); + for (path, contents) in files { + let full = origin.join(path); + std::fs::create_dir_all(full.parent().unwrap()).unwrap(); + std::fs::write(full, contents).unwrap(); + } + g(&["init", "-q"], &origin); + g(&["config", "user.email", "t@t"], &origin); + g(&["config", "user.name", "t"], &origin); + g(&["add", "."], &origin); + g(&["commit", "-qm", "init"], &origin); + g( + &[ + "clone", + "-q", + "--bare", + origin.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + let url = format!("file://{}", bare.display()); + (td, url) + } + + fn git_config(repo: &Path, key: &str) -> String { + let out = Command::new("git") + .args(["-C", repo.to_str().unwrap(), "config", "--get", key]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + } + + fn object_count(repo: &Path) -> usize { + let out = Command::new("git") + .args([ + "-C", + repo.to_str().unwrap(), + "cat-file", + "--batch-all-objects", + "--batch-check=%(objectname)", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout) + .lines() + .filter(|l| !l.trim().is_empty()) + .count() + } + + #[tokio::test] + async fn promisor_clone_marks_promisor_and_keeps_objects() { + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("mirror.git"); + clone_repo(&url, &dest, MirrorMode::Promisor).await.unwrap(); + + assert_eq!(git_config(&dest, "remote.origin.promisor"), "true"); + assert_eq!(git_config(&dest, "remote.origin.mirror"), "true"); + // No withholding on a plain bare origin, so every object is present: + // 1 commit + 1 root tree + 2 subtrees + 2 blobs = 6. 
+ assert_eq!(object_count(&dest), 6); + } + + #[tokio::test] + async fn plain_clone_is_not_promisor() { + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n")]); + let dest = td.path().join("mirror.git"); + clone_repo(&url, &dest, MirrorMode::Plain).await.unwrap(); + + assert_eq!(git_config(&dest, "remote.origin.promisor"), ""); + assert_eq!(git_config(&dest, "remote.origin.mirror"), "true"); + } + + #[tokio::test] + async fn promisor_fetch_updates_existing_mirror() { + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n")]); + let dest = td.path().join("mirror.git"); + clone_repo(&url, &dest, MirrorMode::Promisor).await.unwrap(); + let before = object_count(&dest); + + // Add a second commit to the origin working tree and push to the bare + // (the working repo has no named remote, so push via the file:// URL). + let origin = td.path().join("origin"); + std::fs::write(origin.join("public/c.txt"), b"more\n").unwrap(); + g(&["add", "."], &origin); + g(&["commit", "-qm", "second"], &origin); + g(&["push", "-q", &url, "HEAD"], &origin); + + fetch_repo(&dest, &url, MirrorMode::Promisor).await.unwrap(); + + assert_eq!(git_config(&dest, "remote.origin.promisor"), "true"); + assert!(object_count(&dest) > before, "fetch pulled the new commit"); + } + + #[tokio::test] + async fn plain_fetch_clears_promisor_config_on_transition() { + // Repo started mode-B (promisor mirror), then went fully public, so the + // next sync classifies Plain. fetch_repo must drop the partial-clone + // config and refetch instead of leaving the mirror a promisor forever. 
+ let (td, url) = bare_remote(&[("public/a.txt", b"pub\n")]); + let dest = td.path().join("mirror.git"); + clone_repo(&url, &dest, MirrorMode::Promisor).await.unwrap(); + assert_eq!(git_config(&dest, "remote.origin.promisor"), "true"); + + fetch_repo(&dest, &url, MirrorMode::Plain).await.unwrap(); + + assert_eq!(git_config(&dest, "remote.origin.promisor"), ""); + assert_eq!(git_config(&dest, "remote.origin.partialclonefilter"), ""); + } +} diff --git a/crates/gitlawb-node/src/visibility.rs b/crates/gitlawb-node/src/visibility.rs index b246dbf..afe6a7c 100644 --- a/crates/gitlawb-node/src/visibility.rs +++ b/crates/gitlawb-node/src/visibility.rs @@ -96,6 +96,78 @@ pub fn visibility_check( } } +/// The subtree path globs that `caller` (None = anonymous) may NOT read, given +/// the repo's rules. Whole-repo ("/") rules are excluded: a denied whole-repo +/// read is handled by the 404 gate before a clone ever starts. Each remaining +/// rule is reported when `visibility_check` denies the caller at the glob's +/// representative path. Used by the clean-clone client to sparse-exclude the +/// private paths from checkout. +pub fn withheld_globs( + rules: &[VisibilityRule], + is_public: bool, + owner_did: &str, + caller: Option<&str>, +) -> Vec { + rules + .iter() + .filter(|r| r.path_glob != "/") + .filter(|r| { + let probe = glob_prefix(&r.path_glob); + visibility_check(rules, is_public, owner_did, caller, probe) == Decision::Deny + }) + .map(|r| r.path_glob.clone()) + .collect() +} + +/// The allowed globs that sit strictly underneath a denied glob. A clean-clone +/// client sparse-excludes everything in `withheld_globs`, which would also hide +/// these nested allowed paths; re-including them restores the caller's access. +/// Example: with `/secret/**` denied and `/secret/public/**` allowed for the +/// same caller, `/secret/public/**` is returned here so the client re-includes +/// it after excluding `/secret/`. 
+pub fn reincluded_globs( + rules: &[VisibilityRule], + is_public: bool, + owner_did: &str, + caller: Option<&str>, +) -> Vec { + let denied: Vec<&str> = rules + .iter() + .filter(|r| r.path_glob != "/") + .filter(|r| { + visibility_check( + rules, + is_public, + owner_did, + caller, + glob_prefix(&r.path_glob), + ) == Decision::Deny + }) + .map(|r| glob_prefix(&r.path_glob)) + .collect(); + + rules + .iter() + .filter(|r| r.path_glob != "/") + .filter(|r| { + visibility_check( + rules, + is_public, + owner_did, + caller, + glob_prefix(&r.path_glob), + ) == Decision::Allow + }) + .filter(|r| { + let p = glob_prefix(&r.path_glob); + denied + .iter() + .any(|d| *d != p && p.starts_with(&format!("{d}/"))) + }) + .map(|r| r.path_glob.clone()) + .collect() +} + #[cfg(test)] mod tests { use super::*; @@ -116,6 +188,43 @@ mod tests { const OWNER: &str = "did:key:z6MkOwner"; + #[test] + fn withheld_globs_lists_only_denied_subtrees() { + let rules = [ + rule("/secret/**", VisibilityMode::B, &["did:key:z6MkFriend"]), + rule("/docs/**", VisibilityMode::B, &["did:key:z6MkStranger"]), + ]; + // Stranger is denied /secret but allowed /docs. + let mut got = withheld_globs(&rules, true, OWNER, Some("did:key:z6MkStranger")); + got.sort(); + assert_eq!(got, vec!["/secret/**".to_string()]); + // Owner is denied nothing. + assert!(withheld_globs(&rules, true, OWNER, Some(OWNER)).is_empty()); + // Anonymous is denied both. + let mut anon = withheld_globs(&rules, true, OWNER, None); + anon.sort(); + assert_eq!(anon, vec!["/docs/**".to_string(), "/secret/**".to_string()]); + } + + #[test] + fn reincluded_globs_restores_allowed_nested_path() { + let rules = [ + rule("/secret/**", VisibilityMode::B, &["did:key:z6MkFriend"]), + rule( + "/secret/public/**", + VisibilityMode::B, + &["did:key:z6MkFriend", "did:key:z6MkStranger"], + ), + ]; + // Stranger is denied /secret/** but allowed the nested /secret/public/**. 
+ let withheld = withheld_globs(&rules, true, OWNER, Some("did:key:z6MkStranger")); + assert_eq!(withheld, vec!["/secret/**".to_string()]); + let reinc = reincluded_globs(&rules, true, OWNER, Some("did:key:z6MkStranger")); + assert_eq!(reinc, vec!["/secret/public/**".to_string()]); + // Owner is denied nothing, so there is nothing to re-include. + assert!(reincluded_globs(&rules, true, OWNER, Some(OWNER)).is_empty()); + } + #[test] fn no_rules_public_allows_anonymous() { assert_eq!( @@ -242,4 +351,24 @@ mod tests { Decision::Allow ); } + + // Mirrors the gossip-announce gate in git_receive_pack: announce iff an + // anonymous caller can read "/". + #[test] + fn announce_gate_matches_public_readability() { + let announce = |rules: &[VisibilityRule], is_public: bool| { + visibility_check(rules, is_public, OWNER, None, "/") == Decision::Allow + }; + // Public repo, no rules → announce. + assert!(announce(&[], true)); + // Legacy private repo (is_public false, no rules) → silent. + assert!(!announce(&[], false)); + // Mode A whole-repo rule with no public readers → silent. + assert!(!announce(&[rule("/", VisibilityMode::A, &[])], true)); + // Mode B public repo with a private subtree → still announce. + assert!(announce( + &[rule("/secret/**", VisibilityMode::B, &[])], + true + )); + } } diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs new file mode 100644 index 0000000..93e998d --- /dev/null +++ b/crates/gl/src/clone.rs @@ -0,0 +1,1114 @@ +//! `gl clone`: clean partial clone of a gitlawb repo with private subtrees. +//! +//! A repo may withhold blob content under some path globs from the caller +//! (Phase 3). The resulting pack is not closed under reachability, so a stock +//! `git clone` is refused at fetch. This command clones as a promisor +//! (`--filter=blob:none`) and sparse-excludes the caller's withheld globs, +//! producing a clean checkout: public files present, private paths absent. 
+ +use anyhow::{bail, Context, Result}; +use clap::Args; +use serde::Deserialize; +use std::path::Path; +use std::process::Command; + +use crate::http::NodeClient; +use crate::identity::load_keypair_from_dir; + +#[derive(Args)] +pub struct CloneArgs { + /// Repo to clone: gitlawb:/// or /. + pub repo: String, + + /// Destination directory (default: the repo name). + pub dir: Option, + + /// Branch to check out (default: the remote's default branch). + #[arg(long)] + pub branch: Option, + + #[arg(long, default_value = "https://node.gitlawb.com", env = "GITLAWB_NODE")] + pub node: String, + + /// Arweave gateway for B3 manifest discovery/fetch when a node cannot supply + /// the encrypted-blob mapping. + #[arg( + long, + default_value = "https://arweave.net", + env = "GITLAWB_ARWEAVE_GATEWAY" + )] + pub arweave_gateway: String, + + /// Public IPFS gateway for fetching encrypted envelopes during B3 recovery. + #[arg( + long, + default_value = "https://dweb.link", + env = "GITLAWB_IPFS_GATEWAY" + )] + pub ipfs_gateway: String, +} + +/// Run a git command inside `dir`, erroring with stderr on failure. +fn git(dir: &Path, args: &[&str]) -> Result<()> { + let out = Command::new("git") + .args(args) + .current_dir(dir) + .output() + .with_context(|| format!("running git {args:?}"))?; + if !out.status.success() { + bail!( + "git {args:?} failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + } + Ok(()) +} + +/// Run a git command not tied to a working tree (e.g. `clone`). +fn git_global(args: &[&str]) -> Result<()> { + let out = Command::new("git") + .args(args) + .output() + .with_context(|| format!("running git {args:?}"))?; + if !out.status.success() { + bail!( + "git {args:?} failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + } + Ok(()) +} + +/// Sparse-checkout pattern(s) for a visibility glob. A subtree glob +/// (`/secret/**`) maps to the directory `/secret/`. 
A wildcard-free glob +/// (`/docs/private`) matches both the exact path and a subtree at that path +/// (mirroring the node's `glob_matches`), so it maps to both `/docs/private` +/// and `/docs/private/`. Callers prefix these with `!` to exclude. +fn sparse_patterns(glob: &str) -> Vec { + match glob.strip_suffix("/**") { + Some(base) => vec![format!("{base}/")], + None => vec![glob.to_string(), format!("{glob}/")], + } +} + +/// Clone `remote_url` into `dest`, excluding `withheld_globs` from checkout. +/// `dest` must not already exist. With nothing withheld this is a plain full +/// clone. With globs withheld it clones as a promisor (`--filter=blob:none`, +/// marking the repo a promisor so the node's non-closed pack is accepted) +/// without checkout, sparse-excludes each glob, then checks out so the absent +/// blobs are never materialized. `--no-cone` is required for negated excludes. +pub fn setup_partial_clone( + dest: &Path, + remote_url: &str, + withheld_globs: &[String], + reinclude_globs: &[String], + branch: Option<&str>, +) -> Result<()> { + let dest_str = dest + .to_str() + .context("destination path is not valid UTF-8")?; + + if withheld_globs.is_empty() { + match branch { + Some(b) => git_global(&["clone", "-q", "--branch", b, remote_url, dest_str])?, + None => git_global(&["clone", "-q", remote_url, dest_str])?, + } + return Ok(()); + } + + git_global(&[ + "clone", + "-q", + "--filter=blob:none", + "--no-checkout", + remote_url, + dest_str, + ])?; + git(dest, &["sparse-checkout", "init", "--no-cone"])?; + // Non-cone sparse-checkout, gitignore-style: include everything, exclude the + // withheld globs, then re-include any allowed globs nested under an excluded + // one. 
Emitting all excludes before the re-includes is safe even for deeper + // re-denials (deny /secret, allow /secret/public, deny /secret/public/admin): + // git does not re-traverse an explicitly excluded directory, so a broader + // parent re-include never resurrects a more specific excluded subtree. + let mut spec = String::from("/*\n"); + for g in withheld_globs { + for pat in sparse_patterns(g) { + spec.push('!'); + spec.push_str(&pat); + spec.push('\n'); + } + } + for g in reinclude_globs { + for pat in sparse_patterns(g) { + spec.push_str(&pat); + spec.push('\n'); + } + } + std::fs::write(dest.join(".git/info/sparse-checkout"), spec) + .context("writing sparse-checkout spec")?; + + match branch { + Some(b) => git(dest, &["checkout", "-q", b])?, + None => { + // Read the default branch from the local `origin/HEAD` symref that + // clone just set, instead of parsing `git remote show origin`, whose + // "HEAD branch:" line is localized and needs a network round-trip. + let out = Command::new("git") + .args(["symbolic-ref", "--short", "refs/remotes/origin/HEAD"]) + .current_dir(dest) + .output()?; + if !out.status.success() { + bail!( + "could not determine default branch: {}", + String::from_utf8_lossy(&out.stderr) + ); + } + let symref = String::from_utf8_lossy(&out.stdout); + let head = symref + .trim() + .strip_prefix("origin/") + .context("unexpected origin/HEAD format")?; + git(dest, &["checkout", "-q", head])?; + } + } + Ok(()) +} + +/// Parse `repo` into (gitlawb_url, owner, name). Accepts a full +/// `gitlawb:///` URL or a bare `/`. The owner DID may +/// itself contain colons but no slash, so split on the first slash. 
+fn parse_repo(repo: &str) -> Result<(String, String, String)> { + let stripped = repo.strip_prefix("gitlawb://").unwrap_or(repo); + let (owner, name) = stripped + .trim_end_matches('/') + .split_once('/') + .context("repo must be / or gitlawb:///")?; + if owner.is_empty() || name.is_empty() || name.contains('/') { + bail!("repo must be / or gitlawb:///"); + } + Ok(( + format!("gitlawb://{owner}/{name}"), + owner.to_string(), + name.to_string(), + )) +} + +/// Ask the node which globs are withheld for this caller and which allowed globs +/// nested under them must be re-included. Returns `(withheld, reinclude)`. A +/// node that does not implement the endpoint (404/501) yields empties so public +/// repos on older nodes still clone normally. Other failures (network, auth, +/// 5xx, malformed JSON) are propagated: failing open here would silently fall +/// back to a stock clone, which the node refuses once blobs are withheld, hiding +/// the real cause behind a confusing fetch error. +async fn fetch_withheld(node: &str, owner: &str, name: &str) -> Result<(Vec, Vec)> { + let kp = load_keypair_from_dir(None).ok(); + let signed = kp.is_some(); + let client = NodeClient::new(node, kp); + let path = format!("/api/v1/repos/{owner}/{name}/withheld-paths"); + let resp = if signed { + client.get_signed(&path).await + } else { + client.get(&path).await + }; + let resp = match resp { + Ok(r) if r.status().is_success() => r, + Ok(r) if matches!(r.status().as_u16(), 404 | 501) => return Ok((Vec::new(), Vec::new())), + Ok(r) => bail!("withheld-paths lookup failed: {}", r.status()), + Err(err) => return Err(err).context("fetching withheld paths"), + }; + let body: WithheldPathsResponse = resp + .json() + .await + .context("parsing withheld-paths response")?; + Ok((body.withheld, body.reinclude)) +} + +/// The node's `/withheld-paths` 200 body. 
Both fields are always emitted as JSON +/// arrays; deserializing into this struct (rather than poking at a `Value`) makes +/// a missing or mistyped field a hard error instead of silently becoming `[]`, +/// which would mask a server regression behind a confusing later clone failure. +#[derive(Deserialize)] +struct WithheldPathsResponse { + withheld: Vec, + reinclude: Vec, +} + +/// After the base clone, recover encrypted blobs the caller is authorized for +/// that are missing locally: fetch the envelope, decrypt with the caller's key, +/// install as a loose object. Returns the repo-relative paths recovered. +/// Best-effort; logs and continues on any per-blob failure. +async fn recover_encrypted_blobs( + node: &str, + owner: &str, + name: &str, + dest: &Path, + keypair: &gitlawb_core::identity::Keypair, +) -> Result> { + use gitlawb_core::encrypt::open_blob; + use std::collections::HashMap; + use std::io::Write; + + let dest_str = dest.to_str().context("dest path not utf-8")?; + let client = NodeClient::new(node, Some(keypair.clone())); + + let resp = match client + .get_signed(&format!("/api/v1/repos/{owner}/{name}/encrypted-blobs")) + .await + { + Ok(r) if r.status().is_success() => r, + _ => return Ok(vec![]), + }; + let body: serde_json::Value = resp.json().await.context("parsing encrypted-blobs")?; + let blobs = body + .get("blobs") + .and_then(|b| b.as_array()) + .cloned() + .unwrap_or_default(); + if blobs.is_empty() { + return Ok(vec![]); + } + + // Map oid -> repo-relative path from the cloned tree. 
+ let ls = Command::new("git") + .args(["-C", dest_str, "ls-tree", "-r", "HEAD"]) + .output()?; + let mut oid_to_path: HashMap = HashMap::new(); + for line in String::from_utf8_lossy(&ls.stdout).lines() { + if let Some((meta, path)) = line.split_once('\t') { + if let Some(oid) = meta.split_whitespace().nth(2) { + oid_to_path.insert(oid.to_string(), path.to_string()); + } + } + } + + let mut recovered = Vec::new(); + for entry in blobs { + let Some(oid) = entry.get("oid").and_then(|o| o.as_str()) else { + continue; + }; + // Skip if already present locally. + let present = Command::new("git") + .args(["-C", dest_str, "cat-file", "-e", oid]) + .status() + .map(|s| s.success()) + .unwrap_or(false); + if present { + continue; + } + let env_resp = match client + .get_signed(&format!( + "/api/v1/repos/{owner}/{name}/encrypted-blob/{oid}" + )) + .await + { + Ok(r) if r.status().is_success() => r, + _ => continue, + }; + let Ok(envelope) = env_resp.bytes().await else { + continue; + }; + let plaintext = match open_blob(&envelope, keypair) { + Ok(p) => p, + Err(e) => { + eprintln!("warning: could not decrypt {oid}: {e}"); + continue; + } + }; + // Install as a loose object; verify the OID matches. + let mut child = Command::new("git") + .args(["-C", dest_str, "hash-object", "-w", "-t", "blob", "--stdin"]) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .spawn()?; + child.stdin.take().unwrap().write_all(&plaintext)?; + let out = child.wait_with_output()?; + let written = String::from_utf8_lossy(&out.stdout).trim().to_string(); + if written == oid { + if let Some(p) = oid_to_path.get(oid) { + recovered.push(p.clone()); + } + } else { + eprintln!("warning: recovered blob {oid} hashed to {written}; discarding"); + } + } + Ok(recovered) +} + +/// One blob entry in an Arweave-anchored encrypted manifest. 
The manifest also +/// carries a `recipients` field per blob, but `gl` does not need it: authorization +/// is enforced by whether `open_blob` can decrypt with the caller's key. Unknown +/// JSON fields are ignored by serde, so `recipients` is simply not declared here. +#[derive(Deserialize)] +struct ManifestBlob { + oid: String, + cid: String, +} + +/// An Arweave-anchored per-push encrypted manifest (Option B3). +#[derive(Deserialize)] +struct Manifest { + #[serde(default)] + timestamp: String, + #[serde(default)] + blobs: Vec, +} + +/// Extract transaction ids from an Arweave GraphQL `transactions` response. +fn parse_tx_ids(v: &serde_json::Value) -> Vec { + v.get("data") + .and_then(|d| d.get("transactions")) + .and_then(|t| t.get("edges")) + .and_then(|e| e.as_array()) + .map(|edges| { + edges + .iter() + .filter_map(|edge| { + edge.get("node") + .and_then(|n| n.get("id")) + .and_then(|i| i.as_str()) + .map(String::from) + }) + .collect() + }) + .unwrap_or_default() +} + +/// Merge per-push manifests into a single `oid -> cid` map, latest-wins by the +/// manifest `timestamp` (RFC3339, compared lexicographically; a later push that +/// re-sealed a blob overrides the earlier entry). +fn merge_manifests(manifests: Vec) -> std::collections::HashMap { + let mut best: std::collections::HashMap = + std::collections::HashMap::new(); // oid -> (cid, timestamp) + for m in manifests { + for b in m.blobs { + match best.get(&b.oid) { + Some((_, ts)) if ts.as_str() >= m.timestamp.as_str() => {} + _ => { + best.insert(b.oid, (b.cid, m.timestamp.clone())); + } + } + } + } + best.into_iter().map(|(oid, (cid, _))| (oid, cid)).collect() +} + +/// Option B3 fallback recovery, with no dependency on a gitlawb node API. Query +/// the Arweave gateway for this repo's encrypted manifests, merge them, and for +/// each blob still missing locally that the caller can decrypt, pull the envelope +/// from a public IPFS gateway, decrypt, and install it as a loose object. 
Returns +/// the repo-relative paths recovered. Best-effort; silent when gateways are +/// unreachable, leaving the clone exactly as node-based recovery left it. +async fn recover_from_arweave( + arweave_gateway: &str, + ipfs_gateway: &str, + owner: &str, + name: &str, + dest: &Path, + keypair: &gitlawb_core::identity::Keypair, +) -> Result> { + use gitlawb_core::encrypt::open_blob; + use std::collections::HashMap; + use std::io::Write; + + let dest_str = dest.to_str().context("dest path not utf-8")?; + let owner_short = owner.split(':').next_back().unwrap_or(owner); + let slug = format!("{owner_short}/{name}"); + let ag = arweave_gateway.trim_end_matches('/'); + let ig = ipfs_gateway.trim_end_matches('/'); + // Bound every gateway request: this runs on every clone, so a slow or hung + // public gateway must not stall it. Best-effort recovery, so a timeout just + // skips the affected blob. + let client = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(30)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()); + + // 1. Discover manifest transaction ids via Arweave GraphQL. + let query = r#"query($repo:String!){transactions(tags:[{name:"App-Name",values:["gitlawb"]},{name:"Schema",values:["gitlawb/encrypted-manifest/v1"]},{name:"Repo",values:[$repo]}],first:100){edges{node{id}}}}"#; + let gql_body = serde_json::json!({ "query": query, "variables": { "repo": slug } }); + let resp = match client + .post(format!("{ag}/graphql")) + .json(&gql_body) + .send() + .await + { + Ok(r) if r.status().is_success() => r, + _ => return Ok(vec![]), + }; + let gql: serde_json::Value = match resp.json().await { + Ok(v) => v, + Err(_) => return Ok(vec![]), + }; + let tx_ids = parse_tx_ids(&gql); + if tx_ids.is_empty() { + return Ok(vec![]); + } + + // 2. Fetch and parse each manifest body, then merge latest-wins per oid. 
+ let mut manifests = Vec::new(); + for tx in tx_ids { + let m = match client.get(format!("{ag}/{tx}")).send().await { + Ok(r) if r.status().is_success() => r, + _ => continue, + }; + if let Ok(parsed) = m.json::().await { + manifests.push(parsed); + } + } + let oid_cid = merge_manifests(manifests); + if oid_cid.is_empty() { + return Ok(vec![]); + } + + // Map oid -> repo-relative path from the cloned tree. + let ls = Command::new("git") + .args(["-C", dest_str, "ls-tree", "-r", "HEAD"]) + .output()?; + let mut oid_to_path: HashMap = HashMap::new(); + for line in String::from_utf8_lossy(&ls.stdout).lines() { + if let Some((meta, path)) = line.split_once('\t') { + if let Some(oid) = meta.split_whitespace().nth(2) { + oid_to_path.insert(oid.to_string(), path.to_string()); + } + } + } + + // 3. Recover each missing blob the caller can decrypt. + let mut recovered = Vec::new(); + for (oid, cid) in oid_cid { + // Local presence check. GIT_NO_LAZY_FETCH stops git from making a wasted + // promisor fetch attempt (we are recovering precisely because the promisor + // cannot supply the blob), and `.output()` captures git's "missing object" + // stderr so that expected case does not leak a confusing error to the user. + let present = Command::new("git") + .args(["-C", dest_str, "cat-file", "-e", &oid]) + .env("GIT_NO_LAZY_FETCH", "1") + .output() + .map(|o| o.status.success()) + .unwrap_or(false); + if present { + continue; + } + let env_resp = match client.get(format!("{ig}/ipfs/{cid}")).send().await { + Ok(r) if r.status().is_success() => r, + _ => continue, + }; + let Ok(envelope) = env_resp.bytes().await else { + continue; + }; + // open_blob succeeds only if this caller is a recipient: this is the + // authorization gate (no node, no DID check needed). 
+ let plaintext = match open_blob(&envelope, keypair) { + Ok(p) => p, + Err(_) => continue, + }; + let mut child = Command::new("git") + .args(["-C", dest_str, "hash-object", "-w", "-t", "blob", "--stdin"]) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .spawn()?; + child.stdin.take().unwrap().write_all(&plaintext)?; + let out = child.wait_with_output()?; + let written = String::from_utf8_lossy(&out.stdout).trim().to_string(); + if written == oid { + if let Some(p) = oid_to_path.get(&oid) { + recovered.push(p.clone()); + } + } else { + eprintln!("warning: recovered blob {oid} hashed to {written}; discarding"); + } + } + Ok(recovered) +} + +pub async fn run(args: CloneArgs) -> Result<()> { + let (url, owner, name) = parse_repo(&args.repo)?; + let dest_name = args.dir.unwrap_or_else(|| name.clone()); + let dest = std::path::PathBuf::from(&dest_name); + if dest.exists() { + bail!("destination '{dest_name}' already exists"); + } + + let (withheld, reinclude) = fetch_withheld(&args.node, &owner, &name).await?; + if withheld.is_empty() { + println!("Cloning {url} into {dest_name}"); + } else { + println!( + "Cloning {url} into {dest_name} ({} private path(s) excluded)", + withheld.len() + ); + } + + setup_partial_clone(&dest, &url, &withheld, &reinclude, args.branch.as_deref())?; + + if let Ok(keypair) = load_keypair_from_dir(None) { + // Node-based recovery first (B1/B2), then the B3 Arweave/IPFS gateway + // fallback for any authorized blobs the node could not supply. + let mut paths = recover_encrypted_blobs(&args.node, &owner, &name, &dest, &keypair) + .await + .unwrap_or_default(); + let from_arweave = recover_from_arweave( + &args.arweave_gateway, + &args.ipfs_gateway, + &owner, + &name, + &dest, + &keypair, + ) + .await + .unwrap_or_default(); + paths.extend(from_arweave); + + if !paths.is_empty() { + // Re-include recovered paths if this was a sparse clone, then + // materialize them in the working tree. 
+ let spec = dest.join(".git/info/sparse-checkout"); + if spec.exists() { + match std::fs::read_to_string(&spec) { + Ok(mut s) => { + for p in &paths { + s.push_str(&format!("/{p}\n")); + } + if let Err(e) = std::fs::write(&spec, &s) { + eprintln!( + "warning: failed to update sparse-checkout, recovered files may not appear: {e}" + ); + } + } + Err(e) => { + eprintln!( + "warning: failed to read sparse-checkout, recovered files may not appear: {e}" + ); + } + } + } + if let Err(e) = git(&dest, &["checkout", "--", "."]) { + eprintln!( + "warning: checkout after recovery failed, recovered files may not appear: {e}" + ); + } + println!( + "Recovered {} private file(s) you are authorized to read", + paths.len() + ); + } + } + + println!("Done. Cloned into {dest_name}"); + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::process::Command; + use tempfile::TempDir; + + fn g(args: &[&str], dir: &Path) { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + } + + #[test] + fn setup_partial_clone_excludes_withheld_path() { + let td = TempDir::new().unwrap(); + let origin = td.path().join("origin"); + let bare = td.path().join("bare.git"); + std::fs::create_dir_all(origin.join("secret")).unwrap(); + std::fs::create_dir_all(origin.join("public")).unwrap(); + std::fs::write(origin.join("public/a.txt"), b"pub\n").unwrap(); + std::fs::write(origin.join("secret/b.txt"), b"SECRET\n").unwrap(); + g(&["init", "-q"], &origin); + g(&["config", "user.email", "t@t"], &origin); + g(&["config", "user.name", "t"], &origin); + g(&["add", "."], &origin); + g(&["commit", "-qm", "init"], &origin); + g( + &[ + "clone", + "-q", + "--bare", + origin.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + // file:// so --filter is honored (local-path clones ignore it). 
+ let dest = td.path().join("dest"); + let url = format!("file://{}", bare.display()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + + assert!(dest.join("public/a.txt").exists(), "public file present"); + assert!( + !dest.join("secret/b.txt").exists(), + "withheld path must be excluded from checkout" + ); + } + + /// Build a bare remote with the given files (relative path -> contents), + /// committed on one branch. Returns (tempdir, file:// url). + fn bare_remote(files: &[(&str, &[u8])]) -> (TempDir, String) { + let td = TempDir::new().unwrap(); + let origin = td.path().join("origin"); + let bare = td.path().join("bare.git"); + for (path, contents) in files { + let full = origin.join(path); + std::fs::create_dir_all(full.parent().unwrap()).unwrap(); + std::fs::write(full, contents).unwrap(); + } + g(&["init", "-q"], &origin); + g(&["config", "user.email", "t@t"], &origin); + g(&["config", "user.name", "t"], &origin); + g(&["add", "."], &origin); + g(&["commit", "-qm", "init"], &origin); + g( + &[ + "clone", + "-q", + "--bare", + origin.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + let url = format!("file://{}", bare.display()); + (td, url) + } + + #[test] + fn reinclude_restores_allowed_nested_path() { + let (td, url) = bare_remote(&[ + ("public/a.txt", b"pub\n"), + ("secret/private/p.txt", b"PRIV\n"), + ("secret/public/s.txt", b"SHARED\n"), + ]); + let dest = td.path().join("dest"); + setup_partial_clone( + &dest, + &url, + &["/secret/**".to_string()], + &["/secret/public/**".to_string()], + None, + ) + .unwrap(); + + assert!(dest.join("public/a.txt").exists(), "public present"); + assert!( + dest.join("secret/public/s.txt").exists(), + "allowed nested path must be re-included" + ); + assert!( + !dest.join("secret/private/p.txt").exists(), + "denied nested path must stay excluded" + ); + } + + #[test] + fn three_level_alternating_nesting_respects_specificity() { + // deny /secret, allow 
/secret/public, deny /secret/public/admin. + // The deepest deny must win even though a shallower allow re-includes + // its parent: order patterns by depth, not all-excludes-then-includes. + let (td, url) = bare_remote(&[ + ("public/a.txt", b"pub\n"), + ("secret/private/p.txt", b"PRIV\n"), + ("secret/public/s.txt", b"SHARED\n"), + ("secret/public/admin/k.txt", b"ADMIN\n"), + ]); + let dest = td.path().join("dest"); + setup_partial_clone( + &dest, + &url, + &[ + "/secret/**".to_string(), + "/secret/public/admin/**".to_string(), + ], + &["/secret/public/**".to_string()], + None, + ) + .unwrap(); + + assert!(dest.join("public/a.txt").exists(), "public present"); + assert!( + dest.join("secret/public/s.txt").exists(), + "allowed middle path must be re-included" + ); + assert!( + !dest.join("secret/private/p.txt").exists(), + "denied sibling must stay excluded" + ); + assert!( + !dest.join("secret/public/admin/k.txt").exists(), + "deepest denied path must stay excluded despite the shallower re-include" + ); + } + + #[test] + fn exact_path_glob_is_excluded() { + // A wildcard-free glob must exclude the exact file, not just a subtree. 
+ let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("docs/private", b"SECRET\n")]); + let dest = td.path().join("dest"); + setup_partial_clone(&dest, &url, &["/docs/private".to_string()], &[], None).unwrap(); + + assert!(dest.join("public/a.txt").exists(), "public present"); + assert!( + !dest.join("docs/private").exists(), + "exact-path withheld file must be excluded" + ); + } + + #[test] + fn sparse_patterns_subtree_and_exact() { + assert_eq!(sparse_patterns("/secret/**"), vec!["/secret/".to_string()]); + assert_eq!( + sparse_patterns("/docs/private"), + vec!["/docs/private".to_string(), "/docs/private/".to_string()] + ); + } + + #[test] + fn withheld_response_requires_both_fields() { + let ok: WithheldPathsResponse = + serde_json::from_str(r#"{"withheld":["/secret/**"],"reinclude":[]}"#).unwrap(); + assert_eq!(ok.withheld, vec!["/secret/**".to_string()]); + assert!(ok.reinclude.is_empty()); + + // A missing field is a schema mismatch: it must error, not default to []. + assert!(serde_json::from_str::(r#"{"withheld":[]}"#).is_err()); + // A wrong-typed field must error too. 
+ assert!(serde_json::from_str::( + r#"{"withheld":"nope","reinclude":[]}"# + ) + .is_err()); + } + + #[test] + fn parse_tx_ids_extracts_node_ids() { + let v: serde_json::Value = serde_json::from_str( + r#"{"data":{"transactions":{"edges":[{"node":{"id":"TX1"}},{"node":{"id":"TX2"}}]}}}"#, + ) + .unwrap(); + assert_eq!(parse_tx_ids(&v), vec!["TX1".to_string(), "TX2".to_string()]); + } + + #[test] + fn parse_tx_ids_empty_on_no_edges() { + let v: serde_json::Value = + serde_json::from_str(r#"{"data":{"transactions":{"edges":[]}}}"#).unwrap(); + assert!(parse_tx_ids(&v).is_empty()); + } + + #[test] + fn manifest_parses_and_ignores_recipients() { + let m: Manifest = serde_json::from_str( + r#"{"timestamp":"2026-06-11T00:00:00Z","blobs":[{"oid":"o1","cid":"c1","recipients":["did:key:zA"]}]}"#, + ) + .unwrap(); + assert_eq!(m.timestamp, "2026-06-11T00:00:00Z"); + assert_eq!(m.blobs.len(), 1); + assert_eq!(m.blobs[0].oid, "o1"); + assert_eq!(m.blobs[0].cid, "c1"); + } + + #[test] + fn merge_manifests_latest_wins_per_oid() { + let older = Manifest { + timestamp: "2026-06-10T00:00:00Z".to_string(), + blobs: vec![ManifestBlob { + oid: "o1".to_string(), + cid: "cidOLD".to_string(), + }], + }; + let newer = Manifest { + timestamp: "2026-06-11T00:00:00Z".to_string(), + blobs: vec![ + ManifestBlob { + oid: "o1".to_string(), + cid: "cidNEW".to_string(), + }, + ManifestBlob { + oid: "o2".to_string(), + cid: "cid2".to_string(), + }, + ], + }; + let merged = merge_manifests(vec![older, newer]); + assert_eq!(merged.get("o1").map(String::as_str), Some("cidNEW")); + assert_eq!(merged.get("o2").map(String::as_str), Some("cid2")); + } + + #[test] + fn merge_manifests_is_order_independent() { + let older = Manifest { + timestamp: "2026-06-10T00:00:00Z".to_string(), + blobs: vec![ManifestBlob { + oid: "o1".to_string(), + cid: "cidOLD".to_string(), + }], + }; + let newer = Manifest { + timestamp: "2026-06-11T00:00:00Z".to_string(), + blobs: vec![ManifestBlob { + oid: "o1".to_string(), + 
cid: "cidNEW".to_string(), + }], + }; + // Newer first, older second: newer must still win. + let merged = merge_manifests(vec![newer, older]); + assert_eq!(merged.get("o1").map(String::as_str), Some("cidNEW")); + } + + /// Read-path end-to-end over a mocked Arweave + IPFS gateway: discover the + /// manifest via GraphQL, fetch it, fetch the envelope, decrypt with the + /// caller's key, and install the previously-withheld blob. + #[tokio::test] + async fn recover_from_arweave_installs_authorized_blob() { + use gitlawb_core::encrypt::seal_blob; + use gitlawb_core::identity::Keypair; + + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("dest"); + // Make the bare honor `--filter=blob:none` over file:// so the withheld + // blob is genuinely omitted from the local store, not just unchecked-out. + let bare = url.strip_prefix("file://").unwrap(); + assert!(Command::new("git") + .args(["-C", bare, "config", "uploadpack.allowFilter", "true"]) + .status() + .unwrap() + .success()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + assert!( + !dest.join("secret/b.txt").exists(), + "secret starts withheld" + ); + + let oid = { + let out = Command::new("git") + .args([ + "-C", + dest.to_str().unwrap(), + "rev-parse", + "HEAD:secret/b.txt", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + + // Simulate origin death: drop the promisor remote so `cat-file -e` cannot + // lazily fetch the withheld blob. This is exactly the B3 premise (the node + // can no longer serve it), and forces recovery to go through Arweave/IPFS. 
+ std::fs::remove_dir_all(url.strip_prefix("file://").unwrap()).unwrap(); + + let reader = Keypair::generate(); + let envelope = seal_blob(b"SECRET\n", &[reader.verifying_key()]).unwrap(); + + let cid = "testcid123"; + let mut server = mockito::Server::new_async().await; + let _gql = server + .mock("POST", "/graphql") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(r#"{"data":{"transactions":{"edges":[{"node":{"id":"TX1"}}]}}}"#) + .create_async() + .await; + let manifest_body = serde_json::json!({ + "timestamp": "2026-06-11T00:00:00Z", + "blobs": [{ "oid": oid, "cid": cid, "recipients": [] }], + }) + .to_string(); + let _tx = server + .mock("GET", "/TX1") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(manifest_body) + .create_async() + .await; + let _blob = server + .mock("GET", format!("/ipfs/{cid}").as_str()) + .with_status(200) + .with_body(envelope) + .create_async() + .await; + + let paths = recover_from_arweave( + &server.url(), + &server.url(), + "alice", + "myrepo", + &dest, + &reader, + ) + .await + .unwrap(); + assert_eq!(paths, vec!["secret/b.txt".to_string()]); + + let present = Command::new("git") + .args(["-C", dest.to_str().unwrap(), "cat-file", "-e", &oid]) + .env("GIT_NO_LAZY_FETCH", "1") + .output() + .unwrap() + .status + .success(); + assert!( + present, + "authorized reader's blob must be installed locally" + ); + } + + /// A caller who is not a recipient cannot decrypt the envelope, so nothing is + /// recovered even though the manifest and envelope are reachable. 
+ #[tokio::test] + async fn recover_from_arweave_skips_unauthorized() { + use gitlawb_core::encrypt::seal_blob; + use gitlawb_core::identity::Keypair; + + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("dest"); + let bare = url.strip_prefix("file://").unwrap(); + assert!(Command::new("git") + .args(["-C", bare, "config", "uploadpack.allowFilter", "true"]) + .status() + .unwrap() + .success()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + + let oid = { + let out = Command::new("git") + .args([ + "-C", + dest.to_str().unwrap(), + "rev-parse", + "HEAD:secret/b.txt", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + + // Simulate origin death (see the authorized test) so the withheld blob + // cannot be lazily fetched from the promisor remote. + std::fs::remove_dir_all(url.strip_prefix("file://").unwrap()).unwrap(); + + // Sealed to a different reader; the caller below is not a recipient. 
+ let authorized = Keypair::generate(); + let envelope = seal_blob(b"SECRET\n", &[authorized.verifying_key()]).unwrap(); + let intruder = Keypair::generate(); + + let cid = "testcid123"; + let mut server = mockito::Server::new_async().await; + let _gql = server + .mock("POST", "/graphql") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(r#"{"data":{"transactions":{"edges":[{"node":{"id":"TX1"}}]}}}"#) + .create_async() + .await; + let manifest_body = serde_json::json!({ + "timestamp": "2026-06-11T00:00:00Z", + "blobs": [{ "oid": oid, "cid": cid, "recipients": [] }], + }) + .to_string(); + let _tx = server + .mock("GET", "/TX1") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(manifest_body) + .create_async() + .await; + let _blob = server + .mock("GET", format!("/ipfs/{cid}").as_str()) + .with_status(200) + .with_body(envelope) + .create_async() + .await; + + let paths = recover_from_arweave( + &server.url(), + &server.url(), + "alice", + "myrepo", + &dest, + &intruder, + ) + .await + .unwrap(); + assert!(paths.is_empty(), "non-recipient must recover nothing"); + + let present = Command::new("git") + .args(["-C", dest.to_str().unwrap(), "cat-file", "-e", &oid]) + .env("GIT_NO_LAZY_FETCH", "1") + .output() + .unwrap() + .status + .success(); + assert!(!present, "non-recipient must not install the blob"); + } + + #[test] + fn parse_repo_accepts_url_and_bare() { + let (url, o, n) = parse_repo("gitlawb://did:key:zAbc/myrepo").unwrap(); + assert_eq!(url, "gitlawb://did:key:zAbc/myrepo"); + assert_eq!((o.as_str(), n.as_str()), ("did:key:zAbc", "myrepo")); + + let (url2, o2, n2) = parse_repo("did:key:zAbc/myrepo").unwrap(); + assert_eq!(url2, "gitlawb://did:key:zAbc/myrepo"); + assert_eq!((o2.as_str(), n2.as_str()), ("did:key:zAbc", "myrepo")); + } + + #[test] + fn parse_repo_rejects_malformed() { + assert!(parse_repo("noslash").is_err()); + assert!(parse_repo("gitlawb://owner/").is_err()); + 
assert!(parse_repo("/name").is_err()); + // An extra slash would otherwise smuggle a path segment into the name. + assert!(parse_repo("owner/name/extra").is_err()); + } + + #[test] + fn recovered_blob_installs_with_matching_oid() { + use gitlawb_core::encrypt::{open_blob, seal_blob}; + use gitlawb_core::identity::Keypair; + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("dest"); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + let oid = { + let out = std::process::Command::new("git") + .args([ + "-C", + dest.to_str().unwrap(), + "rev-parse", + "HEAD:secret/b.txt", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let reader = Keypair::generate(); + let env = seal_blob(b"SECRET\n", &[reader.verifying_key()]).unwrap(); + let plaintext = open_blob(&env, &reader).unwrap(); + let mut child = std::process::Command::new("git") + .args([ + "-C", + dest.to_str().unwrap(), + "hash-object", + "-w", + "-t", + "blob", + "--stdin", + ]) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .spawn() + .unwrap(); + use std::io::Write; + child.stdin.take().unwrap().write_all(&plaintext).unwrap(); + let out = child.wait_with_output().unwrap(); + assert_eq!(String::from_utf8_lossy(&out.stdout).trim(), oid); + } +} diff --git a/crates/gl/src/main.rs b/crates/gl/src/main.rs index 0af7398..1c1a50d 100644 --- a/crates/gl/src/main.rs +++ b/crates/gl/src/main.rs @@ -7,6 +7,7 @@ mod agent; mod bounty; mod cert; mod changelog; +mod clone; mod doctor; mod http; mod identity; @@ -57,6 +58,9 @@ enum Commands { /// Register this agent with a gitlawb node Register(register::RegisterArgs), + /// Clone a gitlawb repo, handling private subtrees cleanly + Clone(clone::CloneArgs), + /// Manage repositories Repo(repo::RepoArgs), @@ -150,6 +154,7 @@ async fn main() -> Result<()> { match cli.command { Commands::Identity { cmd 
} => identity::run(cmd).await, Commands::Register(args) => register::run(args).await, + Commands::Clone(args) => clone::run(args).await, Commands::Repo(args) => repo::run(args).await, Commands::Issue(args) => issue::run(args).await, Commands::Pr(args) => pr::run(args).await,