48 changes: 48 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,48 @@
name: CI

on:
push:
branches: ["**"]
pull_request:
branches: ["**"]

jobs:
lint:
name: Lint (ruff)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.12"

- name: Install ruff
run: pip install ruff

- name: Check formatting
run: ruff format --check .

- name: Check linting
run: ruff check .

test:
name: Unit Tests
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Install package and test dependencies
run: pip install -e ".[dev]"

- name: Run tests
run: pytest tests/ -v
85 changes: 84 additions & 1 deletion README.md
@@ -1,2 +1,85 @@
# python-chebi-utils
Common processing functionality for the ChEBI ontology (e.g. extraction of molecules, classes and relations).

Common processing functionality for the ChEBI ontology — download data files, extract classes and relations, extract molecules, and generate stratified train/val/test splits.

## Installation

```bash
pip install chebi-utils
```

For development (includes `pytest` and `ruff`):

```bash
pip install -e ".[dev]"
```

## Features

### Download ChEBI data files

```python
from chebi_utils import download_chebi_obo, download_chebi_sdf

obo_path = download_chebi_obo(version=245, dest_dir="data/")  # downloads chebi.obo
sdf_path = download_chebi_sdf(version=245, dest_dir="data/")  # downloads chebi.sdf.gz
```

Files are fetched from the [EBI FTP server](https://ftp.ebi.ac.uk/pub/databases/chebi/).

### Build the ontology graph

```python
from chebi_utils import build_chebi_graph

graph = build_chebi_graph("chebi.obo")
# networkx DiGraph: nodes are CHEBI IDs (e.g. "15377") with name, smiles
# and subset attributes; edges carry a relation attribute
# ("is_a" from child to parent, "has_part" from whole to part)
```

### Extract molecules

```python
from chebi_utils import extract_molecules

molecules = extract_molecules("chebi.sdf.gz")
# DataFrame: chebi_id, name, smiles, inchi, inchikey, formula, charge, mass, …
```

Both plain `.sdf` and gzip-compressed `.sdf.gz` files are supported.
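Transparent handling of both formats can be as simple as switching on the file suffix. A minimal sketch of the idea (the `open_sdf` helper is hypothetical, not part of the chebi-utils API):

```python
import gzip
from pathlib import Path


def open_sdf(path):
    """Open an SDF file in text mode, transparently handling gzip.

    Hypothetical helper illustrating how .sdf / .sdf.gz support might be
    implemented; the internals of extract_molecules may differ.
    """
    path = Path(path)
    if path.suffix == ".gz":
        # gzip.open in "rt" mode decompresses and decodes on the fly
        return gzip.open(path, "rt", encoding="utf-8")
    return open(path, encoding="utf-8")
```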

### Generate train/val/test splits

```python
from chebi_utils import create_splits

splits = create_splits(molecules, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1)
train_df = splits["train"]
val_df = splits["val"]
test_df = splits["test"]
```

Pass `stratify_col` to preserve class proportions across splits:

```python
splits = create_splits(classes, stratify_col="is_obsolete", seed=42)
```
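Conceptually, stratified splitting shuffles within each class and slices every class by the split ratios, so each split keeps roughly the original class proportions. A self-contained sketch of the idea (plain Python, illustrative only; the actual `create_splits` implementation may differ):

```python
import random
from collections import defaultdict


def stratified_split(rows, key, ratios=(0.8, 0.1, 0.1), seed=42):
    """Split rows into train/val/test, stratified on rows[key].

    Illustrative sketch, not the chebi-utils implementation.
    """
    rng = random.Random(seed)
    by_class = defaultdict(list)
    for row in rows:
        by_class[row[key]].append(row)

    splits = {"train": [], "val": [], "test": []}
    for members in by_class.values():
        # Shuffle within the class, then slice by the ratios so the
        # class is represented proportionally in every split.
        rng.shuffle(members)
        n = len(members)
        n_train = int(n * ratios[0])
        n_val = int(n * ratios[1])
        splits["train"] += members[:n_train]
        splits["val"] += members[n_train:n_train + n_val]
        splits["test"] += members[n_train + n_val:]
    return splits
```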

## Running Tests

```bash
pytest tests/ -v
```

## Linting

```bash
ruff check .
ruff format --check .
```

## CI/CD

A GitHub Actions workflow (`.github/workflows/ci.yml`) runs on every push and pull request: ruff format and lint checks on Python 3.12, and the full test suite across Python 3.10, 3.11, and 3.12.
12 changes: 12 additions & 0 deletions chebi_utils/__init__.py
@@ -0,0 +1,12 @@
from chebi_utils.downloader import download_chebi_obo, download_chebi_sdf
from chebi_utils.obo_extractor import build_chebi_graph
from chebi_utils.sdf_extractor import extract_molecules
from chebi_utils.splitter import create_splits

__all__ = [
"download_chebi_obo",
"download_chebi_sdf",
"build_chebi_graph",
"extract_molecules",
"create_splits",
]
78 changes: 78 additions & 0 deletions chebi_utils/downloader.py
@@ -0,0 +1,78 @@
"""Download ChEBI data files from the EBI FTP server."""

from __future__ import annotations

import urllib.request
from pathlib import Path

_CHEBI_LEGACY_VERSION_THRESHOLD = 245


def _chebi_obo_url(version: int) -> str:
if version < _CHEBI_LEGACY_VERSION_THRESHOLD:
return f"https://ftp.ebi.ac.uk/pub/databases/chebi/archive/chebi_legacy/archive/rel{version}/ontology/chebi.obo"
return f"https://ftp.ebi.ac.uk/pub/databases/chebi/archive/rel{version}/ontology/chebi.obo"


def _chebi_sdf_url(version: int) -> str:
if version < _CHEBI_LEGACY_VERSION_THRESHOLD:
return f"https://ftp.ebi.ac.uk/pub/databases/chebi/archive/chebi_legacy/archive/rel{version}/SDF/chebi.sdf.gz"
return f"https://ftp.ebi.ac.uk/pub/databases/chebi/archive/rel{version}/SDF/chebi.sdf.gz"


def download_chebi_obo(
version: int,
dest_dir: str | Path = ".",
filename: str = "chebi.obo",
) -> Path:
"""Download a versioned ChEBI OBO ontology file from the EBI FTP server.

Parameters
----------
version : int
ChEBI release version number (e.g. 230, 245, 250).
Versions below 245 are fetched from the legacy archive path.
dest_dir : str or Path
Directory where the file will be saved (created if it doesn't exist).
filename : str
Name for the downloaded file.

Returns
-------
Path
Path to the downloaded file.
"""
dest_dir = Path(dest_dir)
dest_dir.mkdir(parents=True, exist_ok=True)
dest_path = dest_dir / filename
urllib.request.urlretrieve(_chebi_obo_url(version), dest_path)
return dest_path


def download_chebi_sdf(
version: int,
dest_dir: str | Path = ".",
filename: str = "chebi.sdf.gz",
) -> Path:
"""Download a versioned ChEBI SDF file from the EBI FTP server.

Parameters
----------
version : int
ChEBI release version number (e.g. 230, 245, 250).
Versions below 245 are fetched from the legacy archive path.
dest_dir : str or Path
Directory where the file will be saved (created if it doesn't exist).
filename : str
Name for the downloaded file.

Returns
-------
Path
Path to the downloaded file.
"""
dest_dir = Path(dest_dir)
dest_dir.mkdir(parents=True, exist_ok=True)
dest_path = dest_dir / filename
urllib.request.urlretrieve(_chebi_sdf_url(version), dest_path)
return dest_path
113 changes: 113 additions & 0 deletions chebi_utils/obo_extractor.py
@@ -0,0 +1,113 @@
"""Extract ChEBI ontology data using fastobo and build a networkx graph."""

from __future__ import annotations

from pathlib import Path

import fastobo
import networkx as nx


def _chebi_id_to_str(chebi_id: str) -> str:
"""Convert 'CHEBI:123' to '123' (string)."""
return chebi_id.split(":")[1]


def _term_data(doc: "fastobo.term.TermFrame") -> dict | None:
"""Extract data from a single fastobo TermFrame.

Returns
-------
dict or None
Parsed term data, or ``None`` if the term is marked as obsolete.
"""
parents: list[str] = []
has_part: set[str] = set()
name: str | None = None
smiles: str | None = None
subset: str | None = None

for clause in doc:
if isinstance(clause, fastobo.term.IsObsoleteClause):
if clause.obsolete:
return None
elif isinstance(clause, fastobo.term.PropertyValueClause):
pv = clause.property_value
if str(pv.relation) in (
"chemrof:smiles_string",
"http://purl.obolibrary.org/obo/chebi/smiles",
):
smiles = pv.value
elif isinstance(clause, fastobo.term.SynonymClause):
if "SMILES" in clause.raw_value() and smiles is None:
smiles = clause.raw_value().split('"')[1]
elif isinstance(clause, fastobo.term.RelationshipClause):
if str(clause.typedef) == "has_part":
has_part.add(_chebi_id_to_str(str(clause.term)))
elif isinstance(clause, fastobo.term.IsAClause):
parents.append(_chebi_id_to_str(str(clause.term)))
elif isinstance(clause, fastobo.term.NameClause):
name = str(clause.name)
elif isinstance(clause, fastobo.term.SubsetClause):
subset = str(clause.subset)

return {
"id": _chebi_id_to_str(str(doc.id)),
"parents": parents,
"has_part": has_part,
"name": name,
"smiles": smiles,
"subset": subset,
}


def build_chebi_graph(filepath: str | Path) -> nx.DiGraph:
"""Parse a ChEBI OBO file and build a directed graph of ontology terms.

``xref:`` lines are stripped before parsing as they can cause fastobo
errors on some ChEBI releases. Only non-obsolete CHEBI-prefixed terms
are included.

**Nodes** are string CHEBI IDs (e.g. ``"1"`` for ``CHEBI:1``) with
attributes ``name``, ``smiles``, and ``subset``.

**Edges** carry a ``relation`` attribute and represent:

- ``is_a`` — directed from child to parent
- ``has_part`` — directed from whole to part

Parameters
----------
filepath : str or Path
Path to the ChEBI OBO file.

Returns
-------
nx.DiGraph
Directed graph of ChEBI ontology terms and their relationships.
"""
with open(filepath, encoding="utf-8") as f:
# Lines from file iteration keep their trailing newline, so join
# with "" rather than "\n" to avoid double-spacing the OBO content.
content = "".join(line for line in f if not line.startswith("xref:"))

graph: nx.DiGraph = nx.DiGraph()

for frame in fastobo.loads(content):
if not (
frame and isinstance(frame.id, fastobo.id.PrefixedIdent) and frame.id.prefix == "CHEBI"
):
continue

term = _term_data(frame)
if term is None:
continue

node_id = term["id"]
graph.add_node(node_id, name=term["name"], smiles=term["smiles"], subset=term["subset"])

for parent_id in term["parents"]:
graph.add_edge(node_id, parent_id, relation="is_a")

for part_id in term["has_part"]:
graph.add_edge(node_id, part_id, relation="has_part")

return graph
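A quick sketch of querying the resulting graph. The nodes and attributes below are built by hand for illustration rather than parsed from a real OBO file:

```python
import networkx as nx

# Hand-built stand-in for a graph produced by build_chebi_graph.
g = nx.DiGraph()
g.add_node("15377", name="water", smiles="O", subset=None)
g.add_node("33579", name="main group molecular entity", smiles=None, subset=None)
g.add_edge("15377", "33579", relation="is_a")

# is_a edges run child -> parent, so a node's parents are its successors.
parents = [p for p in g.successors("15377")
           if g.edges["15377", p]["relation"] == "is_a"]
```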