From b7cec6a3e9f8e25e8bf8afa0a379d986214ecb40 Mon Sep 17 00:00:00 2001 From: trxvorr Date: Sat, 13 Dec 2025 01:44:15 +0530 Subject: [PATCH 1/3] feat: implement NoSQL Parser --- pyproject.toml | 7 ++ src/intugle/nosql/__init__.py | 0 src/intugle/nosql/inference.py | 75 ++++++++++++++++++ src/intugle/nosql/parser.py | 105 +++++++++++++++++++++++++ src/intugle/nosql/writer.py | 28 +++++++ tests/nosql/test_inference.py | 56 +++++++++++++ tests/nosql/test_parser.py | 139 +++++++++++++++++++++++++++++++++ tests/nosql/test_writer.py | 51 ++++++++++++ uv.lock | 12 ++- 9 files changed, 472 insertions(+), 1 deletion(-) create mode 100644 src/intugle/nosql/__init__.py create mode 100644 src/intugle/nosql/inference.py create mode 100644 src/intugle/nosql/parser.py create mode 100644 src/intugle/nosql/writer.py create mode 100644 tests/nosql/test_inference.py create mode 100644 tests/nosql/test_parser.py create mode 100644 tests/nosql/test_writer.py diff --git a/pyproject.toml b/pyproject.toml index 4df6392..bdc1d2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,11 @@ mysql = [ "sqlglot>=27.20.0", ] +nosql = [ + "pandas>=2.0.0", + "pyarrow>=14.0.0", +] + streamlit = [ "streamlit==1.50.0", "pyngrok==7.4.0", @@ -113,6 +118,8 @@ dev = [ "ipykernel>=6.30.1", "langfuse==2.60.4", "mssql-python>=0.13.1", + "pandas>=2.0.0", + "pyarrow>=14.0.0", "pysonar>=1.2.0.2419", "pyspark>=3.5.0,<4.0.0", "pytest>=8.4.1", diff --git a/src/intugle/nosql/__init__.py b/src/intugle/nosql/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/intugle/nosql/inference.py b/src/intugle/nosql/inference.py new file mode 100644 index 0000000..d27d646 --- /dev/null +++ b/src/intugle/nosql/inference.py @@ -0,0 +1,75 @@ +from typing import Any, Dict, List, Set + + +def infer_schema(data: List[Dict[str, Any]], sample_size: int = 100) -> Dict[str, str]: + """ + Infers the schema of the NoSQL data by sampling records. + Handles type conflicts (e.g., int vs str -> str). + Identifies potential primary keys. + """ + if not data: + return {} + + schema: Dict[str, Set[str]] = {} + + # Analyze sample data + for row in data[:sample_size]: + for key, value in row.items(): + if key not in schema: + schema[key] = set() + + if value is None: + continue + + # Determine type + if isinstance(value, bool): + type_name = "bool" + elif isinstance(value, int): + type_name = "int" + elif isinstance(value, float): + type_name = "float" + elif isinstance(value, str): + type_name = "string" + elif isinstance(value, dict): + type_name = "object" + elif isinstance(value, list): + type_name = "list" + else: + type_name = "unknown" + + schema[key].add(type_name) + + final_schema: Dict[str, str] = {} + + # Resolve conflicts + for key, types in schema.items(): + if not types: + final_schema[key] = "string" # Default for all-null + elif len(types) == 1: + final_schema[key] = list(types)[0] + else: + # Conflict resolution + if "string" in types: + final_schema[key] = "string" + elif "float" in types and "int" in types: + final_schema[key] = "float" + elif "list" in types: + # If list mixes with something else, it's tricky. + # For now, if list is involved, we might mark it as complex/string or keep as mixed. + # Let's fallback to string/object representation for now if mixed. + final_schema[key] = "string" # simplified fallback + else: + final_schema[key] = "string" + + return final_schema + + +def identify_primary_key(schema: Dict[str, str]) -> str | None: + """ + Identifies a potential primary key from the schema keys. + """ + candidates = ["_id", "id", "uuid", "guid"] + for candidate in candidates: + if candidate in schema: + return candidate + return None diff --git a/src/intugle/nosql/parser.py b/src/intugle/nosql/parser.py new file mode 100644 index 0000000..cb96e67 --- /dev/null +++ b/src/intugle/nosql/parser.py @@ -0,0 +1,105 @@ +import uuid + +from typing import Any, Dict, List, Optional + +import pandas as pd + + +class NoSQLParser: + """ + Parser for converting NoSQL-style data (list of dictionaries) into a set of relational tables. + """ + + def __init__(self, config: Optional[Dict[str, Any]] = None): + self.tables: Dict[str, List[Dict[str, Any]]] = {} + self.config = config or {} + + def parse(self, data: List[Dict[str, Any]], root_table_name: str = "root") -> Dict[str, pd.DataFrame]: + """ + Parses a list of dictionaries into a dictionary of pandas DataFrames. + Nested lists are stripped out into separate tables and linked via foreign keys. + + Args: + data: A list of dictionaries representing the NoSQL data. + root_table_name: The name of the root table (default: "root"). + + Returns: + A dictionary where keys are table names and values are pandas DataFrames. + """ + self.tables = {} + + if not data: + return {} + + # Check for root table rename + root_table_name = self.config.get("table_renames", {}).get(root_table_name, root_table_name) + + for row in data: + self._process_node(row, root_table_name) + + # Convert lists of dicts to DataFrames + result = {} + for table_name, rows in self.tables.items(): + if rows: + result[table_name] = pd.json_normalize(rows) + else: + result[table_name] = pd.DataFrame() + + return result + + def _process_node( + self, + row_data: Dict[str, Any], + table_name: str, + parent_info: Optional[Dict[str, Any]] = None + ) -> None: + """ + Recursively processes a node (row) of data. + """ + # Determine Primary Key + pk_column = self.config.get("pk_overrides", {}).get(table_name) + + # If ID not present, we need to establish it either via override or generation + # If override is set, we expect that column to exist. + if pk_column and pk_column in row_data: + current_id = row_data[pk_column] + # Ensure internal _id tracks this for consistency in processing, + # though usually we might want to keep the original column as the PK. + # For this parser, we use _id as the internal tracker. + row_data["_id"] = current_id + elif "_id" not in row_data: + row_data["_id"] = str(uuid.uuid4()) + current_id = row_data["_id"] + else: + current_id = row_data["_id"] + + # If there is a parent, link back to it + if parent_info: + parent_id_col = f"{parent_info['table_name']}_id" + row_data[parent_id_col] = parent_info['id'] + + # Prepare storage for the current row, separating scalars from lists + scalar_data = {} + + for key, value in row_data.items(): + if isinstance(value, list) and value and isinstance(value[0], dict): + # Handle list of dicts -> Child Table + default_child_name = f"{table_name}_{key}" + child_table_name = self.config.get("table_renames", {}).get(default_child_name, default_child_name) + + for item in value: + self._process_node( + item, + child_table_name, + parent_info={"table_name": table_name, "id": current_id} + ) + elif isinstance(value, list) and not value: + # Empty list, ignore + scalar_data[key] = value + else: + # Scalar or simple object + scalar_data[key] = value + + if table_name not in self.tables: + self.tables[table_name] = [] + self.tables[table_name].append(scalar_data) diff --git a/src/intugle/nosql/writer.py b/src/intugle/nosql/writer.py new file mode 100644 index 0000000..efa2522 --- /dev/null +++ b/src/intugle/nosql/writer.py @@ -0,0 +1,28 @@ +import os + +from typing import Dict + +import pandas as pd + + +class ParquetTarget: + """ + Writes data to Parquet files. + """ + def __init__(self, output_dir: str): + self.output_dir = output_dir + + def write(self, tables: Dict[str, pd.DataFrame]) -> None: + """ + Writes a dictionary of DataFrames to Parquet files in the output directory. + + Args: + tables: A dictionary where keys are table names and values are DataFrames. + """ + if not os.path.exists(self.output_dir): + os.makedirs(self.output_dir) + + for table_name, df in tables.items(): + if not df.empty: + file_path = os.path.join(self.output_dir, f"{table_name}.parquet") + df.to_parquet(file_path, index=False) diff --git a/tests/nosql/test_inference.py b/tests/nosql/test_inference.py new file mode 100644 index 0000000..510d331 --- /dev/null +++ b/tests/nosql/test_inference.py @@ -0,0 +1,56 @@ +from intugle.nosql.inference import identify_primary_key, infer_schema + + +class TestSchemaInference: + def test_basic_types(self): + data = [{"a": 1, "b": "text", "c": True}] + schema = infer_schema(data) + assert schema["a"] == "int" + assert schema["b"] == "string" + assert schema["c"] == "bool" + + def test_mixed_types(self): + """Input mixed int/string. Assert schema result says "string".""" + data = [{"a": 1}, {"a": "2"}] + schema = infer_schema(data) + assert schema["a"] == "string" + + def test_numeric_types(self): + """Mixed int and float should become float.""" + data = [{"a": 1}, {"a": 2.5}] + schema = infer_schema(data) + assert schema["a"] == "float" + + def test_missing_fields(self): + """Input [{"a": 1}, {"b": 2}]. Assert schema contains both columns a and b.""" + data = [{"a": 1}, {"b": 2}] + schema = infer_schema(data) + assert "a" in schema + assert "b" in schema + assert schema["a"] == "int" + assert schema["b"] == "int" + + def test_null_values(self): + """Nulls should be ignored for type determination.""" + data = [{"a": 1}, {"a": None}] + schema = infer_schema(data) + assert schema["a"] == "int" + + # All nulls -> string default + data_null = [{"b": None}] + schema_null = infer_schema(data_null) + assert schema_null["b"] == "string" + + def test_pk_identification(self): + """Test primary key identification.""" + schema = {"_id": "string", "name": "string"} + pk = identify_primary_key(schema) + assert pk == "_id" + + schema2 = {"id": "int", "val": "string"} + pk2 = identify_primary_key(schema2) + assert pk2 == "id" + + schema3 = {"val": "string"} + pk3 = identify_primary_key(schema3) + assert pk3 is None diff --git a/tests/nosql/test_parser.py b/tests/nosql/test_parser.py new file mode 100644 index 0000000..85d3594 --- /dev/null +++ b/tests/nosql/test_parser.py @@ -0,0 +1,139 @@ +import pandas as pd + +from intugle.nosql.parser import NoSQLParser + + +class TestNoSQLParser: + def test_parser_init(self): + """Test that the parser can be initialized.""" + parser = NoSQLParser() + assert isinstance(parser, NoSQLParser) + + def test_flat_json(self): + """Test parsing of a list of flat JSON objects.""" + parser = NoSQLParser() + data = [{"a": 1}, {"a": 2}] + dfs = parser.parse(data, root_table_name="root") + + assert isinstance(dfs, dict) + assert "root" in dfs + df = dfs["root"] + assert isinstance(df, pd.DataFrame) + assert len(df) == 2 + assert "a" in df.columns + assert df["a"].tolist() == [1, 2] + + def test_nested_object(self): + """Test parsing of nested JSON objects (should be flattened).""" + parser = NoSQLParser() + data = [{"user": {"name": "Trevor", "age": 30}}] + dfs = parser.parse(data, root_table_name="root") + + df = dfs["root"] + assert isinstance(df, pd.DataFrame) + assert len(df) == 1 + # json_normalize flattens by default with dot notation + assert "user.name" in df.columns + assert "user.age" in df.columns + assert df["user.name"].iloc[0] == "Trevor" + assert df["user.age"].iloc[0] == 30 + + def test_mixed_flat_and_nested(self): + """Test mixed flat and nested structures.""" + parser = NoSQLParser() + data = [ + {"id": 1, "info": {"status": "active"}}, + {"id": 2, "info": {"status": "inactive"}} + ] + dfs = parser.parse(data, root_table_name="root") + + df = dfs["root"] + assert len(df) == 2 + assert "id" in df.columns + assert "info.status" in df.columns + assert df["info.status"].tolist() == ["active", "inactive"] + + def test_empty_input(self): + """Test parsing empty input.""" + parser = NoSQLParser() + dfs = parser.parse([]) + assert dfs == {} + + # Phase 2 Tests + + def test_single_list_split(self): + """Input {"id": 1, "items": [{"id": A}]}. Assert two tables exist: root and root_items.""" + parser = NoSQLParser() + data = [{"id": 1, "items": [{"id": "A"}]}] + dfs = parser.parse(data, root_table_name="root") + + assert "root" in dfs + assert "root_items" in dfs + + root_df = dfs["root"] + items_df = dfs["root_items"] + + assert len(root_df) == 1 + assert len(items_df) == 1 + assert items_df["id"].iloc[0] == "A" + # Since 'items' was split out, it should NOT be in root (or should be removed) + assert "items" not in root_df.columns + + def test_foreign_key_link(self): + """Assert the row in root_items has a root_id column matching the parent.""" + parser = NoSQLParser() + data = [{"_id": "parent_1", "items": [{"id": "child_1"}]}] + dfs = parser.parse(data, root_table_name="root") + + items_df = dfs["root_items"] + assert "root_id" in items_df.columns + assert items_df["root_id"].iloc[0] == "parent_1" + + def test_deep_nesting(self): + """Input {"a": [{"b": [{"c": 1}]}]}. Assert 3 tables are created.""" + parser = NoSQLParser() + data = [{"val": "top", "a": [{"val": "mid", "b": [{"c": 1}]}]}] + dfs = parser.parse(data, root_table_name="root") + + assert "root" in dfs + assert "root_a" in dfs + assert "root_a_b" in dfs + + assert len(dfs["root"]) == 1 + assert len(dfs["root_a"]) == 1 + assert len(dfs["root_a_b"]) == 1 + + assert dfs["root_a_b"]["c"].iloc[0] == 1 + + # Phase 5 Tests (Configuration) + + def test_table_renaming(self): + """Assert output table name matches config.""" + config = { + "table_renames": { + "root_items": "order_lines" + } + } + parser = NoSQLParser(config=config) + data = [{"id": 1, "items": [{"id": "A"}]}] + dfs = parser.parse(data, root_table_name="root") + + assert "root" in dfs + assert "order_lines" in dfs + assert "root_items" not in dfs + assert len(dfs["order_lines"]) == 1 + + def test_custom_pk(self): + """Assert foreign keys use the user-defined PK column.""" + config = { + "pk_overrides": { + "root": "email" + } + } + parser = NoSQLParser(config=config) + data = [{"email": "user@example.com", "items": [{"item_id": 1}]}] + dfs = parser.parse(data, root_table_name="root") + + items_df = dfs["root_items"] + assert "root_id" in items_df.columns + assert items_df["root_id"].iloc[0] == "user@example.com" diff --git a/tests/nosql/test_writer.py b/tests/nosql/test_writer.py new file mode 100644 index 0000000..da6760a --- /dev/null +++ b/tests/nosql/test_writer.py @@ -0,0 +1,51 @@ +import os + +import pandas as pd +import pytest + +from intugle.nosql.writer import ParquetTarget + + +@pytest.fixture +def output_dir(tmp_path): + d = tmp_path / "parquet_output" + d.mkdir() + return str(d) + + +class TestParquetWriter: + def test_write_parquet(self, output_dir): + """Run the parser logic (simulate) and check if file exists.""" + writer = ParquetTarget(output_dir) + df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}) + tables = {"root": df} + + writer.write(tables) + + expected_file = os.path.join(output_dir, "root.parquet") + assert os.path.exists(expected_file) + + def test_read_back(self, output_dir): + """Read the parquet file with pandas and verify data matches.""" + writer = ParquetTarget(output_dir) + df_original = pd.DataFrame({"id": [1, 2], "val": [10, 20]}) + tables = {"data": df_original} + + writer.write(tables) + + file_path = os.path.join(output_dir, "data.parquet") + df_read = pd.read_parquet(file_path) + + pd.testing.assert_frame_equal(df_original, df_read) + + def test_empty_dataframe(self, output_dir): + """Empty DataFrames should probably NOT ideally be written, or handle gracefully.""" + writer = ParquetTarget(output_dir) + df_empty = pd.DataFrame() + tables = {"empty": df_empty} + + writer.write(tables) + + # Current implementation skips writing empty dataframes + expected_file = os.path.join(output_dir, "empty.parquet") + assert not os.path.exists(expected_file) diff --git a/uv.lock b/uv.lock index fee0f82..9d15176 100644 --- a/uv.lock +++ b/uv.lock @@ -2043,6 +2043,10 @@ mysql = [ { name = "pymysql" }, { name = "sqlglot" }, ] +nosql = [ + { name = "pandas" }, + { name = "pyarrow" }, +] postgres = [ { name = "asyncpg" }, { name = "sqlglot" }, @@ -2071,6 +2075,8 @@ dev = [ { name = "ipykernel" }, { name = "langfuse" }, { name = "mssql-python" }, + { name = "pandas" }, + { name = "pyarrow" }, { name = "pysonar" }, { name = "pyspark" }, { name = "pytest" }, @@ -2115,8 +2121,10 @@ requires-dist = [ { name = "nltk", specifier = ">=3.9.1" }, { name = "numpy", specifier = "<=2.3.0" }, { name = "pandas", specifier = ">=2.2.2" }, + { name = "pandas", marker = "extra == 'nosql'", specifier = ">=2.0.0" }, { name = "plotly", marker = "extra == 'streamlit'" }, { name = "pyaml", specifier = ">=25.7.0" }, + { name = "pyarrow", marker = "extra == 'nosql'", specifier = ">=14.0.0" }, { name = "pydantic", specifier = ">=2.11.7" }, { name = "pydantic-settings", specifier = ">=2.10.1" }, { name = "pyfunctional", specifier = ">=1.5.0" }, @@ -2142,7 +2150,7 @@ requires-dist = [ { name = "xgboost", specifier = ">=3.0.4" }, { name = "xlsxwriter", marker = "extra == 'streamlit'", specifier = "==3.2.9" }, ] -provides-extras = ["snowflake", "databricks", "postgres", "sqlserver", "mysql", "streamlit"] +provides-extras = ["snowflake", "databricks", "postgres", "sqlserver", "mysql", "nosql", "streamlit"] [package.metadata.requires-dev] dev = [ @@ -2151,6 +2159,8 @@ dev = [ { name = "ipykernel", specifier = ">=6.30.1" }, { name = "langfuse", specifier = "==2.60.4" }, { name = "mssql-python", specifier = ">=0.13.1" }, + { name = "pandas", specifier = ">=2.0.0" }, + { name = "pyarrow", specifier = ">=14.0.0" }, { name = "pysonar", specifier = ">=1.2.0.2419" }, { name = "pyspark", specifier = ">=3.5.0,<4.0.0" }, { name = "pytest", specifier = ">=8.4.1" }, From 24ee1121641c06e82a509e49a38faf04f254223c Mon Sep 17 00:00:00 2001 From: trxvorr Date: Fri, 19 Dec 2025 14:21:35 +0530 Subject: [PATCH 2/3] feat: implement CLI, MongoSource, and High-Level API --- pyproject.toml | 2 + src/intugle/__init__.py | 9 ++++ src/intugle/cli.py | 104 ++++++++++++++++++++++++++++++++++-- src/intugle/nosql/api.py | 63 ++++++++++++++++++++++ src/intugle/nosql/source.py | 39 ++++++++++++++ tests/nosql/__init__.py | 1 + tests/nosql/test_api.py | 87 ++++++++++++++++++++++++++++++ tests/nosql/test_source.py | 90 +++++++++++++++++++++++++++++++ uv.lock | 73 +++++++++++++++++++++++++ 9 files changed, 464 insertions(+), 4 deletions(-) create mode 100644 src/intugle/nosql/api.py create mode 100644 src/intugle/nosql/source.py create mode 100644 tests/nosql/__init__.py create mode 100644 tests/nosql/test_api.py create mode 100644 tests/nosql/test_source.py diff --git a/pyproject.toml b/pyproject.toml index cc13ebe..b38f612 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,6 +87,7 @@ oracle = [ nosql = [ "pandas>=2.0.0", "pyarrow>=14.0.0", + "pymongo>=4.0.0", ] streamlit = [ @@ -104,6 +105,7 @@ streamlit = [ "Bug Tracker" = "https://github.com/Intugle/data-tools/issues" [project.scripts] +intugle = "intugle.cli:main" intugle-mcp = "intugle.mcp.server:main" intugle-streamlit = "intugle.cli:run_streamlit_app" diff --git a/src/intugle/__init__.py b/src/intugle/__init__.py index 272058c..bcb1317 100644 --- a/src/intugle/__init__.py +++ b/src/intugle/__init__.py @@ -1,3 +1,12 @@ from intugle.analysis.models import DataSet as DataSet from intugle.data_product import DataProduct as DataProduct from intugle.semantic_model import SemanticModel as SemanticModel + +# Expose NoSQL components if dependencies are available +try: + from intugle.nosql.api import NoSQLToRelationalParser + from intugle.nosql.source import MongoSource + from intugle.nosql.writer import ParquetTarget +except ImportError: + # Dependencies (pandas/pyarrow/pymongo) might not be installed + pass diff --git a/src/intugle/cli.py b/src/intugle/cli.py index ab126a9..d1f4ddd 100644 --- a/src/intugle/cli.py +++ b/src/intugle/cli.py @@ -1,6 +1,15 @@ +import argparse import importlib.util +import logging import os import subprocess +import sys + +# Setup basic logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) def run_streamlit_app(): @@ -28,9 +37,9 @@ def run_streamlit_app(): return # Get the absolute path to the main.py of the Streamlit app - app_dir = os.path.join(os.path.dirname(__file__), 'streamlit_app') - app_path = os.path.join(app_dir, 'main.py') - + app_dir = os.path.join(os.path.dirname(__file__), "streamlit_app") + app_path = os.path.join(app_dir, "main.py") + # Ensure the app_path exists if not os.path.exists(app_path): print(f"Error: Streamlit app not found at {app_path}") @@ -41,5 +50,92 @@ def run_streamlit_app(): subprocess.run(["streamlit", "run", app_path], cwd=app_dir) +def run_nosql_to_relational(args): + """Execute the NoSQL to Relational conversion command.""" + # Import here to avoid issues when nosql dependencies aren't installed + try: + from intugle.nosql.api import NoSQLToRelationalParser + from intugle.nosql.source import MongoSource + from intugle.nosql.writer import ParquetTarget + except ImportError as e: + logger.error( + "NoSQL dependencies not installed. Install with: pip install 'intugle[nosql]'" + ) + logger.error(f"Missing: {e}") + sys.exit(1) + + try: + logger.info(f"Connecting to MongoDB: {args.db}.{args.collection}") + source = MongoSource( + uri=args.uri, + database=args.db, + collection=args.collection, + sample_size=args.sample, + ) + + logger.info("Initializing parser...") + # Initialize orchestrator + orchestrator = NoSQLToRelationalParser(source) + + # Run parsing + logger.info("Starting extraction and parsing...") + orchestrator.run() + + # Write output + logger.info(f"Writing results to: {args.output}") + target = ParquetTarget(args.output) + orchestrator.write(target) + + logger.info("✅ Job completed successfully!") + + except Exception as e: + logger.error(f"Job failed: {str(e)}") + sys.exit(1) + + +def main(): + """Main entry point for the intugle CLI.""" + parser = argparse.ArgumentParser( + description="Intugle - GenAI-powered semantic layer toolkit" + ) + subparsers = parser.add_subparsers(dest="command", help="Available commands") + + # Define the 'streamlit' command + subparsers.add_parser("streamlit", help="Launch the Streamlit web application") + + # Define the 'nosql-to-relational' command + nosql_parser = subparsers.add_parser( + "nosql-to-relational", help="Convert NoSQL collection to Parquet" + ) + + # Source Arguments + nosql_parser.add_argument("--uri", required=True, help="MongoDB connection URI") + nosql_parser.add_argument("--db", required=True, help="Database name") + nosql_parser.add_argument("--collection", required=True, help="Collection name") + + # Output Arguments + nosql_parser.add_argument( + "--output", required=True, help="Output directory for Parquet files" + ) + + # Optional Arguments + nosql_parser.add_argument( + "--sample", + type=int, + default=0, + help="Number of documents to sample (0 = all)", + ) + + args = parser.parse_args() + + if args.command == "streamlit": + run_streamlit_app() + elif args.command == "nosql-to-relational": + run_nosql_to_relational(args) + else: + parser.print_help() + + if __name__ == "__main__": - run_streamlit_app() + main() + diff --git a/src/intugle/nosql/api.py b/src/intugle/nosql/api.py new file mode 100644 index 0000000..ef612d2 --- /dev/null +++ b/src/intugle/nosql/api.py @@ -0,0 +1,63 @@ +from typing import Dict, Any, Optional +from intugle.nosql.source import NoSQLSource +from intugle.nosql.parser import NoSQLParser +from intugle.nosql.inference import infer_schema + + +class NoSQLToRelationalParser: + """ + High-level orchestrator for converting NoSQL data to relational formats. + """ + + def __init__(self, source: NoSQLSource, config: Optional[Dict[str, Any]] = None): + """ + Args: + source: A configured NoSQLSource (e.g., MongoSource) + config: Configuration dict for table renaming, PK overrides, etc. + """ + self.source = source + self.config = config or {} + self._parsed_tables: Optional[Dict[str, Any]] = None + + def infer_model(self) -> Dict[str, Any]: + """ + Peeks at the source data to infer the schema and relationships. + + Returns: + Dict containing the inferred schema structure. + """ + # Fetch a sample from the source (iterator) + # We assume the source handles sampling limits internally if configured + sample_data = list(self.source.get_data()) + + # Use our existing inference logic + schema = infer_schema(sample_data) + return schema + + def run(self) -> None: + """ + Executes the full parsing pipeline: fetch -> parse. + Stores results in memory (self._parsed_tables). + """ + # Note: For V1, we load source data into memory. + # Future optimization: Implement chunked streaming here. + all_data = list(self.source.get_data()) + + # Initialize the core parser with our config + # (Assuming NoSQLParser accepts config in __init__ from Phase 5) + parser = NoSQLParser(config=self.config) + + self._parsed_tables = parser.parse(all_data) + + def write(self, target: Any) -> None: + """ + Writes the parsed tables to the specified target. + + Args: + target: An instance of a Target (e.g., ParquetTarget) + """ + if self._parsed_tables is None: + # Auto-run if not already run + self.run() + + target.write(self._parsed_tables) diff --git a/src/intugle/nosql/source.py b/src/intugle/nosql/source.py new file mode 100644 index 0000000..1d7eb11 --- /dev/null +++ b/src/intugle/nosql/source.py @@ -0,0 +1,39 @@ +from typing import Iterator, Dict, Any +import pymongo + + +class NoSQLSource: + """Base interface for NoSQL data sources.""" + + def get_data(self) -> Iterator[Dict[str, Any]]: + raise NotImplementedError + + +class MongoSource(NoSQLSource): + """Reads data from a MongoDB collection.""" + + def __init__(self, uri: str, database: str, collection: str, sample_size: int = 0): + self.uri = uri + self.database = database + self.collection = collection + self.sample_size = sample_size + self._client = None + + def _connect(self): + if not self._client: + self._client = pymongo.MongoClient(self.uri) + + def get_data(self) -> Iterator[Dict[str, Any]]: + self._connect() + db = self._client[self.database] + coll = db[self.collection] + + cursor = coll.find() + if self.sample_size > 0: + cursor = cursor.limit(self.sample_size) + + for doc in cursor: + # Convert ObjectId to string to avoid serialization issues later + if '_id' in doc: + doc['_id'] = str(doc['_id']) + yield doc diff --git a/tests/nosql/__init__.py b/tests/nosql/__init__.py new file mode 100644 index 0000000..5d21017 --- /dev/null +++ b/tests/nosql/__init__.py @@ -0,0 +1 @@ +# NoSQL module tests diff --git a/tests/nosql/test_api.py b/tests/nosql/test_api.py new file mode 100644 index 0000000..b3437d2 --- /dev/null +++ b/tests/nosql/test_api.py @@ -0,0 +1,87 @@ +from unittest.mock import MagicMock +from intugle.nosql.api import NoSQLToRelationalParser + + +def test_api_initialization(): + """Test NoSQLToRelationalParser initializes correctly.""" + mock_source = MagicMock() + orchestrator = NoSQLToRelationalParser(mock_source) + + assert orchestrator.source == mock_source + assert orchestrator.config == {} + assert orchestrator._parsed_tables is None + + +def test_api_initialization_with_config(): + """Test NoSQLToRelationalParser accepts config.""" + mock_source = MagicMock() + config = {"rename_tables": {"root": "orders"}} + orchestrator = NoSQLToRelationalParser(mock_source, config=config) + + assert orchestrator.config == config + + +def test_api_infer_model(): + """Test infer_model calls source and returns schema.""" + mock_source = MagicMock() + mock_source.get_data.return_value = iter([{"id": 1, "name": "test"}]) + + orchestrator = NoSQLToRelationalParser(mock_source) + model = orchestrator.infer_model() + + assert "id" in model + assert "name" in model + mock_source.get_data.assert_called_once() + + +def test_api_run_flow(): + """Test run() executes parsing pipeline.""" + mock_source = MagicMock() + mock_source.get_data.return_value = iter([{"id": 1, "val": "a"}]) + + orchestrator = NoSQLToRelationalParser(mock_source) + orchestrator.run() + + assert orchestrator._parsed_tables is not None + assert "root" in orchestrator._parsed_tables + + +def test_api_run_with_nested_data(): + """Test run() handles nested arrays correctly.""" + mock_source = MagicMock() + mock_source.get_data.return_value = iter( + [{"id": 1, "items": [{"name": "a"}, {"name": "b"}]}] + ) + + orchestrator = NoSQLToRelationalParser(mock_source) + orchestrator.run() + + assert "root" in orchestrator._parsed_tables + assert "root_items" in orchestrator._parsed_tables + + +def test_api_write_delegation(): + """Test write() delegates to target.""" + mock_source = MagicMock() + mock_source.get_data.return_value = iter([]) + mock_target = MagicMock() + + orchestrator = NoSQLToRelationalParser(mock_source) + orchestrator.write(mock_target) + + mock_target.write.assert_called_once() + + +def test_api_write_auto_runs_if_not_run(): + """Test write() auto-runs if run() wasn't called.""" + mock_source = MagicMock() + mock_source.get_data.return_value = iter([{"id": 1}]) + mock_target = MagicMock() + + orchestrator = NoSQLToRelationalParser(mock_source) + assert orchestrator._parsed_tables is None + + orchestrator.write(mock_target) + + assert orchestrator._parsed_tables is not None + mock_target.write.assert_called_once() diff --git a/tests/nosql/test_source.py b/tests/nosql/test_source.py new file mode 100644 index 0000000..3897299 --- /dev/null +++ b/tests/nosql/test_source.py @@ -0,0 +1,90 @@ +from unittest.mock import MagicMock, patch +from intugle.nosql.source import MongoSource, NoSQLSource + + +def test_nosql_source_interface(): + """Test that NoSQLSource raises NotImplementedError.""" + source = NoSQLSource() + try: + source.get_data() + assert False, "Should have raised NotImplementedError" + except NotImplementedError: + pass + + +def test_mongo_source_initialization(): + """Test MongoSource stores initialization parameters correctly.""" + source = MongoSource("mongodb://test", "db", "coll") + assert source.uri == "mongodb://test" + assert source.database == "db" + assert source.collection == "coll" + assert source.sample_size == 0 + assert source._client is None + + +def test_mongo_source_initialization_with_sample(): + """Test MongoSource with sample_size parameter.""" + source = MongoSource("mongodb://test", "db", "coll", sample_size=100) + assert source.sample_size == 100 + + +@patch("intugle.nosql.source.pymongo.MongoClient") +def test_mongo_source_get_data(mock_client): + """Test MongoSource fetches and yields documents correctly.""" + # Setup mock + mock_db = MagicMock() + mock_coll = MagicMock() + mock_cursor = [{"_id": "1", "data": "test"}] + + mock_client.return_value.__getitem__.return_value = mock_db + mock_db.__getitem__.return_value = mock_coll + mock_coll.find.return_value = mock_cursor + + # Run + source = MongoSource("mongodb://test", "db", "coll") + data = list(source.get_data()) + + # Verify + assert len(data) == 1 + assert data[0]["data"] == "test" + mock_coll.find.assert_called_once() + + +@patch("intugle.nosql.source.pymongo.MongoClient") +def test_mongo_source_converts_objectid_to_string(mock_client): + """Test that ObjectId is converted to string.""" + from bson import ObjectId + + mock_db = MagicMock() + mock_coll = MagicMock() + oid = ObjectId() + mock_cursor = [{"_id": oid, "name": "test"}] + + mock_client.return_value.__getitem__.return_value = mock_db + mock_db.__getitem__.return_value = mock_coll + mock_coll.find.return_value = mock_cursor + + source = MongoSource("mongodb://test", "db", "coll") + data = list(source.get_data()) + + assert isinstance(data[0]["_id"], str) + assert data[0]["_id"] == str(oid) + + +@patch("intugle.nosql.source.pymongo.MongoClient") +def test_mongo_source_sample_limit(mock_client): + """Test that sample_size limits the cursor.""" + mock_db = MagicMock() + mock_coll = MagicMock() + mock_cursor = MagicMock() + mock_cursor.__iter__ = MagicMock(return_value=iter([])) + + mock_client.return_value.__getitem__.return_value = mock_db + mock_db.__getitem__.return_value = mock_coll + mock_coll.find.return_value = mock_cursor + mock_cursor.limit.return_value = mock_cursor + + source = MongoSource("mongodb://test", "db", "coll", sample_size=50) + list(source.get_data()) + + mock_cursor.limit.assert_called_once_with(50) diff --git a/uv.lock b/uv.lock index 163150a..130640a 100644 --- a/uv.lock +++ b/uv.lock @@ -2046,6 +2046,7 @@ mysql = [ nosql = [ { name = "pandas" }, { name = "pyarrow" }, + { name = "pymongo" }, ] oracle = [ { name = "oracledb" }, @@ -2133,6 +2134,7 @@ requires-dist = [ { name = "pydantic", specifier = ">=2.11.7" }, { name = "pydantic-settings", specifier = ">=2.10.1" }, { name = "pyfunctional", specifier = ">=1.5.0" }, + { name = "pymongo", marker = "extra == 'nosql'", specifier = ">=4.0.0" }, { name = "pymysql", marker = "extra == 'mysql'", specifier = ">=1.1.0" }, { name = "pyngrok", marker = "extra == 'streamlit'", specifier = "==7.4.0" }, { name = "pyspark", marker = "extra == 'databricks'", specifier = ">=3.5.0,<4.0.0" }, @@ -4612,6 +4614,77 @@ crypto = [ { name = "cryptography" }, ] +[[package]] +name = "pymongo" +version = "4.15.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dnspython" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/24/a0/5c324fe6735b2bc189779ff46e981a59d495a74594f45542159125d77256/pymongo-4.15.5.tar.gz", hash = "sha256:3a8d6bf2610abe0c97c567cf98bf5bba3e90ccc93cc03c9dde75fa11e4267b42", size = 2471889, upload-time = "2025-12-02T18:44:30.992Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/e4/d80061be4e53125597dd2916171c87986043b190e50c1834fff455e71d42/pymongo-4.15.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a01a2054d50b50c121c720739a2216d855c48726b0002894de9b991cdd68a2a5", size = 811318, upload-time = "2025-12-02T18:42:12.09Z" }, + { url = "https://files.pythonhosted.org/packages/fb/b3/c499fe0814e4d3a84fa3ff5df5133bf847529d8b5a051e6108b5a25b75c7/pymongo-4.15.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5e57968139d81367117ed7b75d921445a575d4d7e61536f5e860475df92ac0a9", size = 811676, upload-time = "2025-12-02T18:42:14.396Z" }, + { url = "https://files.pythonhosted.org/packages/62/71/8e21a8a680546b3a90afbb878a16fe2a7cb0f7d9652aa675c172e57856a1/pymongo-4.15.5-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:266aa37e3673e5dcfdd359a81d27131fc133e49cf8e5d9f9f27a5845fac2cd1f", size = 1185485, upload-time = "2025-12-02T18:42:16.147Z" }, + { url = "https://files.pythonhosted.org/packages/03/56/bdc292a7b01aa2aba806883dbcacc3be837d65425453aa2bc27954ba5a55/pymongo-4.15.5-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2883da6bd0545cc2f12672f6a609b33d48e099a220872ca2bf9bf29fe96a32c3", size = 1203866, upload-time = "2025-12-02T18:42:18.018Z" }, + { url = "https://files.pythonhosted.org/packages/8b/e2/12bebc7e93a81c2f804ffcc94997f61f0e2cd2c11bf0f01da8e0e1425e5c/pymongo-4.15.5-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2fc32b354a608ec748d89bbe236b74b967890667eea1af54e92dfd8fbf26df52", size = 1242550, upload-time = "2025-12-02T18:42:19.898Z" }, + { url = "https://files.pythonhosted.org/packages/0d/ac/c48f6f59a660ec44052ee448dea1c71da85cfaa4a0c17c726d4ee2db7716/pymongo-4.15.5-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3c006cbaa4b40d296dd2bb8828976866c876ead4c39032b761dcf26f1ba56fde", size = 1232844, upload-time = "2025-12-02T18:42:21.709Z" }, + { url = "https://files.pythonhosted.org/packages/89/cc/6368befca7a2f3b51460755a373f78b72003aeee95e8e138cbd479c307f4/pymongo-4.15.5-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ce21e3dc5939b83d03f871090d83ac29fef055bd057f8d3074b6cad10f86b04c", size = 1200192, upload-time = "2025-12-02T18:42:23.605Z" }, + { url = "https://files.pythonhosted.org/packages/9d/97/bc810a017ebb20e6e301fa8c5b21c5e53691fdde2cfd39bd9c450e957b14/pymongo-4.15.5-cp310-cp310-win32.whl", hash = "sha256:1b545dcf66a9f06e9b501bfb0438e1eb9af67336e8a5cf36c4bc0a5d3fbe7a37", size = 798338, upload-time = "2025-12-02T18:42:25.438Z" }, + { url = "https://files.pythonhosted.org/packages/46/17/3be0b476a6bfb3a51bf1750323b5eddf883dddb6482ccb8dbcab2c6c48ad/pymongo-4.15.5-cp310-cp310-win_amd64.whl", hash = "sha256:1ecc544f515f828f05d3c56cd98063ba3ef8b75f534c63de43306d59f1e93fcd", size = 808153, upload-time = "2025-12-02T18:42:26.889Z" }, + { url = "https://files.pythonhosted.org/packages/bf/0a/39f9daf16d695abd58987bb5e2c164b5a64e42b8d53d3c43bc06e4aa7dfc/pymongo-4.15.5-cp310-cp310-win_arm64.whl", hash = "sha256:1151968ab90db146f0591b6c7db27ce4f73c7ffa0bbddc1d7fb7cb14c9f0b967", size = 800943, upload-time = "2025-12-02T18:42:28.668Z" }, + { url = "https://files.pythonhosted.org/packages/0c/ea/e43387c2ed78a60ad917c45f4d4de4f6992929d63fe15af4c2e624f093a9/pymongo-4.15.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:57157a4b936e28e2fbe7017b2f6a751da5e284675cab371f2c596d4e0e4f58f3", size = 865894, upload-time = "2025-12-02T18:42:30.496Z" }, + { url = "https://files.pythonhosted.org/packages/5e/8c/f2c9c55adb9709a4b2244d8d8d9ec05e4abb274e03fe8388b58a34ae08b0/pymongo-4.15.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2a34a7391f4cc54fc584e49db6f7c3929221a9da08b3af2d2689884a5943843", size = 866235, upload-time = "2025-12-02T18:42:31.862Z" }, + { url = "https://files.pythonhosted.org/packages/5e/aa/bdf3553d7309b0ebc0c6edc23f43829b1758431f2f2f7385d2427b20563b/pymongo-4.15.5-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:be040c8cdaf9c2d5ae9ab60a67ecab453ec19d9ccd457a678053fdceab5ee4c8", size = 1429787, upload-time = "2025-12-02T18:42:33.829Z" }, + { url = "https://files.pythonhosted.org/packages/b3/55/80a8eefc88f578fde56489e5278ba5caa5ee9b6f285959ed2b98b44e2133/pymongo-4.15.5-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:defe93944526b1774265c16acf014689cb1b0b18eb84a7b370083b214f9e18cd", size = 1456747, upload-time = "2025-12-02T18:42:35.805Z" }, + { url = "https://files.pythonhosted.org/packages/1d/54/6a7ec290c7ab22aab117ab60e7375882ec5af7433eaf077f86e187a3a9e8/pymongo-4.15.5-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:816e66116f0ef868eff0463a8b28774af8b547466dbad30c8e82bf0325041848", size = 1514670, upload-time = "2025-12-02T18:42:37.737Z" }, + { url = "https://files.pythonhosted.org/packages/65/8a/5822aa20b274ee8a8821bf0284f131e7fc555b0758c3f2a82c51ae73a3c6/pymongo-4.15.5-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:66c7b332532e0f021d784d04488dbf7ed39b7e7d6d5505e282ec8e9cf1025791", size = 1500711, upload-time = "2025-12-02T18:42:39.61Z" }, + { url = "https://files.pythonhosted.org/packages/32/ca/63984e32b4d745a25445c9da1159dfe4568a03375f32bb1a9e009dccb023/pymongo-4.15.5-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:acc46a9e47efad8c5229e644a3774169013a46ee28ac72d1fa4edd67c0b7ee9b", size = 1452021, upload-time = "2025-12-02T18:42:41.323Z" }, + { url = "https://files.pythonhosted.org/packages/f1/23/0d6988f3fdfcacae2ac8d7b76eb24f80ebee9eb607c53bcebfad75b7fd85/pymongo-4.15.5-cp311-cp311-win32.whl", hash = "sha256:b9836c28ba350d8182a51f32ef9bb29f0c40e82ba1dfb9e4371cd4d94338a55d", size = 844483, upload-time = "2025-12-02T18:42:42.814Z" }, + { url = "https://files.pythonhosted.org/packages/8e/04/dedff8a5a9539e5b6128d8d2458b9c0c83ebd38b43389620a0d97223f114/pymongo-4.15.5-cp311-cp311-win_amd64.whl", hash = "sha256:3a45876c5c2ab44e2a249fb542eba2a026f60d6ab04c7ef3924eae338d9de790", size = 859194, upload-time = "2025-12-02T18:42:45.025Z" }, + { url = "https://files.pythonhosted.org/packages/67/e5/fb6f49bceffe183e66831c2eebd2ea14bd65e2816aeaf8e2fc018fd8c344/pymongo-4.15.5-cp311-cp311-win_arm64.whl", hash = "sha256:e4a48fc5c712b3db85c9987cfa7fde0366b7930018de262919afd9e52cfbc375", size = 848377, upload-time = "2025-12-02T18:42:47.19Z" }, + { url = "https://files.pythonhosted.org/packages/3c/4e/8f9fcb2dc9eab1fb0ed02da31e7f4847831d9c0ef08854a296588b97e8ed/pymongo-4.15.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c33477af1a50d1b4d86555e098fc2cf5992d839ad538dea0c00a8682162b7a75", size = 920955, upload-time = "2025-12-02T18:42:48.812Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b4/c0808bed1f82b3008909b9562615461e59c3b66f8977e502ea87c88b08a4/pymongo-4.15.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e6b30defa4a52d3698cd84d608963a8932f7e9b6ec5130087e7082552ac685e5", size = 920690, upload-time = "2025-12-02T18:42:50.832Z" }, + { url = "https://files.pythonhosted.org/packages/12/f3/feea83150c6a0cd3b44d5f705b1c74bff298a36f82d665f597bf89d42b3f/pymongo-4.15.5-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:45fec063f5672e6173bcb09b492431e3641cc74399c2b996fcb995881c2cac61", size = 1690351, upload-time = "2025-12-02T18:42:53.402Z" }, + { url = "https://files.pythonhosted.org/packages/d7/4e/15924d33d8d429e4c41666090017c6ac5e7ccc4ce5e435a2df09e45220a8/pymongo-4.15.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b8c6813110c0d9fde18674b7262f47a2270ae46c0ddd05711e6770caa3c9a3fb", size = 1726089, upload-time = "2025-12-02T18:42:56.187Z" }, + { url = "https://files.pythonhosted.org/packages/a5/49/650ff29dc5f9cf090dfbd6fb248c56d8a10d268b6f46b10fb02fbda3c762/pymongo-4.15.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8ec48d1db9f44c737b13be4299a1782d5fde3e75423acbbbe927cb37ebbe87d", size = 1800637, upload-time = "2025-12-02T18:42:57.913Z" }, + { url = "https://files.pythonhosted.org/packages/7d/18/f34661ade670ee42331543f4aa229569ac7ef45907ecda41b777137b9f40/pymongo-4.15.5-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1f410694fdd76631ead7df6544cdeadaf2407179196c3642fced8e48bb21d0a6", size = 1785480, upload-time = "2025-12-02T18:43:00.626Z" }, + { url = "https://files.pythonhosted.org/packages/10/b6/378bb26937f6b366754484145826aca2d2361ac05b0bacd45a35876abcef/pymongo-4.15.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8c46765d6ac5727a899190aacdeec7a57f8c93346124ddd7e12633b573e2e65", size = 1718548, upload-time = "2025-12-02T18:43:02.32Z" }, + { url = "https://files.pythonhosted.org/packages/58/79/31b8afba36f794a049633e105e45c30afaa0e1c0bab48332d999e87d4860/pymongo-4.15.5-cp312-cp312-win32.whl", hash = "sha256:647118a58dca7d3547714fc0b383aebf81f5852f4173dfd77dd34e80eea9d29b", size = 891319, upload-time = "2025-12-02T18:43:04.699Z" }, + { url = "https://files.pythonhosted.org/packages/c8/31/a7e6d8c5657d922872ac75ab1c0a1335bfb533d2b4dad082d5d04089abbb/pymongo-4.15.5-cp312-cp312-win_amd64.whl", hash = "sha256:099d3e2dddfc75760c6a8fadfb99c1e88824a99c2c204a829601241dff9da049", size = 910919, upload-time = "2025-12-02T18:43:06.555Z" }, + { url = "https://files.pythonhosted.org/packages/1c/b4/286c12fa955ae0597cd4c763d87c986e7ade681d4b11a81766f62f079c79/pymongo-4.15.5-cp312-cp312-win_arm64.whl", hash = "sha256:649cb906882c4058f467f334fb277083998ba5672ffec6a95d6700db577fd31a", size = 896357, upload-time = "2025-12-02T18:43:08.801Z" }, + { url = "https://files.pythonhosted.org/packages/9b/92/e70db1a53bc0bb5defe755dee66b5dfbe5e514882183ffb696d6e1d38aa2/pymongo-4.15.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2b736226f9001bbbd02f822acb9b9b6d28319f362f057672dfae2851f7da6125", size = 975324, upload-time = "2025-12-02T18:43:11.074Z" }, + { url = "https://files.pythonhosted.org/packages/a4/90/dd78c059a031b942fa36d71796e94a0739ea9fb4251fcd971e9579192611/pymongo-4.15.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:60ea9f07fbbcc7c88f922082eb27436dce6756730fdef76a3a9b4c972d0a57a3", size = 975129, upload-time = "2025-12-02T18:43:13.345Z" }, + { url = "https://files.pythonhosted.org/packages/40/72/87cf1bb75ef296456912eb7c6d51ebe7a36dbbe9bee0b8a9cd02a62a8a6e/pymongo-4.15.5-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:20af63218ae42870eaee31fb8cc4ce9e3af7f04ea02fc98ad751fb7a9c8d7be3", size = 1950973, upload-time = "2025-12-02T18:43:15.225Z" }, + { url = "https://files.pythonhosted.org/packages/8c/68/dfa507c8e5cebee4e305825b436c34f5b9ba34488a224b7e112a03dbc01e/pymongo-4.15.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:20d9c11625392f1f8dec7688de5ce344e110ca695344efa313ae4839f13bd017", size = 1995259, upload-time = "2025-12-02T18:43:16.869Z" }, + { url = "https://files.pythonhosted.org/packages/85/9d/832578e5ed7f682a09441bbc0881ffd506b843396ef4b34ec53bd38b2fb2/pymongo-4.15.5-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1202b3e5357b161acb7b7cc98e730288a5c15544e5ef7254b33931cb9a27c36e", size = 2086591, upload-time = "2025-12-02T18:43:19.559Z" }, + { url = "https://files.pythonhosted.org/packages/0a/99/ca8342a0cefd2bb1392187ef8fe01432855e3b5cd1e640495246bcd65542/pymongo-4.15.5-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:63af710e9700dbf91abccf119c5f5533b9830286d29edb073803d3b252862c0d", size = 2070200, upload-time = "2025-12-02T18:43:21.214Z" }, + { url = "https://files.pythonhosted.org/packages/3f/7d/f4a9c1fceaaf71524ff9ff964cece0315dcc93df4999a49f064564875bff/pymongo-4.15.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f22eeb86861cf7b8ee6886361d52abb88e3cd96c6f6d102e45e2604fc6e9e316", size = 1985263, upload-time = "2025-12-02T18:43:23.415Z" }, + { url = "https://files.pythonhosted.org/packages/d8/15/f942535bcc6e22d3c26c7e730daf296ffe69d8ce474c430ea7e551f8cf33/pymongo-4.15.5-cp313-cp313-win32.whl", hash = "sha256:aad6efe82b085bf77cec2a047ded2c810e93eced3ccf1a8e3faec3317df3cd52", size = 938143, upload-time = "2025-12-02T18:43:26.081Z" }, + { url = "https://files.pythonhosted.org/packages/02/2a/c92a6927d676dd376d1ae05c680139c5cad068b22e5f0c8cb61014448894/pymongo-4.15.5-cp313-cp313-win_amd64.whl", hash = "sha256:ccc801f6d71ebee2ec2fb3acc64b218fa7cdb7f57933b2f8eee15396b662a0a0", size = 962603, upload-time = "2025-12-02T18:43:27.816Z" }, + { url = "https://files.pythonhosted.org/packages/3a/f0/cdf78e9ed9c26fb36b8d75561ebf3c7fe206ff1c3de2e1b609fccdf3a55b/pymongo-4.15.5-cp313-cp313-win_arm64.whl", hash = "sha256:f043abdf20845bf29a554e95e4fe18d7d7a463095d6a1547699a12f80da91e02", size = 944308, upload-time = "2025-12-02T18:43:29.371Z" }, + { url = "https://files.pythonhosted.org/packages/03/0c/49713e0f8f41110e8b2bcce7c88570b158cf43dd53a0d01d4e1c772c7ede/pymongo-4.15.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:ba0e75a390334221744e2666fd2d4c82419b580c9bc8d6e0d2d61459d263f3af", size = 1029996, upload-time = "2025-12-02T18:43:31.58Z" }, + { url = "https://files.pythonhosted.org/packages/23/de/1df5d7b49647e9e4511054f750c1109cb8e160763b286b96879917170618/pymongo-4.15.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:853ec7da97642eabaf94d3de4453a86365729327d920af167bf14b2e87b24dce", size = 1029612, upload-time = "2025-12-02T18:43:33.69Z" }, + { url = "https://files.pythonhosted.org/packages/8b/19/3a051228e5beb0b421d725bb2ab5207a260c718d9b5be5b85cfe963733e3/pymongo-4.15.5-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7631304106487480ebbd8acbe44ff1e69d1fdc27e83d9753dc1fd227cea10761", size = 2211814, upload-time = "2025-12-02T18:43:35.769Z" }, + { url = "https://files.pythonhosted.org/packages/bf/b3/989531a056c4388ef18245d1a6d6b3ec5c538666b000764286119efbf194/pymongo-4.15.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:50505181365eba5d4d35c462870b3614c8eddd0b2407c89377c1a59380640dd9", size = 2264629, upload-time = "2025-12-02T18:43:37.479Z" }, + { url = "https://files.pythonhosted.org/packages/ea/5f/8b3339fec44d0ba6d9388a19340fb1534c85ab6aa9fd8fb9c1af146bb72a/pymongo-4.15.5-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3b75ec7006471299a571d6db1c5609ea4aa9c847a701e9b2953a8ede705d82db", size = 2371823, upload-time = "2025-12-02T18:43:39.866Z" }, + { url = "https://files.pythonhosted.org/packages/d4/7f/706bf45cf12990b6cb73e6290b048944a51592de7a597052a761eea90b8d/pymongo-4.15.5-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c3fc24cb1f4ec60ed83162d4bba0c26abc6c9ae78c928805583673f3b3ea6984", size = 2351860, upload-time = "2025-12-02T18:43:42.002Z" }, + { url = "https://files.pythonhosted.org/packages/f3/c5/fdcc81c20c67a61ba1073122c9ab42c937dd6f914004747e9ceefa4cead3/pymongo-4.15.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21d17bb2934b0640863361c08dd06991f128a97f9bee19425a499227be9ae6b4", size = 2251349, upload-time = "2025-12-02T18:43:43.924Z" }, + { url = "https://files.pythonhosted.org/packages/0c/1c/e540ccac0685b234a23574dce3c8e077cd59bcb73ab19bcab1915894d3a6/pymongo-4.15.5-cp314-cp314-win32.whl", hash = "sha256:5a3974236cb842b4ef50a5a6bfad9c7d83a713af68ea3592ba240bbcb863305a", size = 992901, upload-time = "2025-12-02T18:43:45.732Z" }, + { url = "https://files.pythonhosted.org/packages/89/31/eb72c53bc897cb50b57000d71ce9bdcfc9c84ba4c7f6d55348df47b241d8/pymongo-4.15.5-cp314-cp314-win_amd64.whl", hash = "sha256:73fa8a7eee44fd95ba7d5cf537340ff3ff34efeb1f7d6790532d0a6ed4dee575", size = 1021205, upload-time = "2025-12-02T18:43:47.756Z" }, + { url = "https://files.pythonhosted.org/packages/ea/4a/74a7cc350d60953d27b5636906b43b232b501cee07f70f6513ac603097e8/pymongo-4.15.5-cp314-cp314-win_arm64.whl", hash = "sha256:d41288ca2a3eb9ac7c8cad4ea86ef8d63b69dc46c9b65c2bbd35331ec2a0fc57", size = 1000616, upload-time = "2025-12-02T18:43:49.677Z" }, + { url = "https://files.pythonhosted.org/packages/1a/22/1e557868b9b207d7dbf7706412251b28a82d4b958e007b6f2569d59ada3d/pymongo-4.15.5-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:552670f0c8bff103656d4e4b1f2c018f789c9de03f7615ed5e547d5b1b83cda0", size = 1086723, upload-time = "2025-12-02T18:43:51.432Z" }, + { url = "https://files.pythonhosted.org/packages/aa/9c/2e24c2da289e1d3b9bc4e0850136a364473bddfbe8b19b33d2bb5d30ee0d/pymongo-4.15.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:41891b45f6ff1e23cfd1b7fbe40286664ad4507e2d2aa61c6d8c40eb6e11dded", size = 1086653, upload-time = "2025-12-02T18:43:53.131Z" }, + { url = "https://files.pythonhosted.org/packages/c6/be/4c2460c9ec91a891c754b91914ce700cc46009dae40183a85e26793dfae9/pymongo-4.15.5-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:524a8a593ae2eb1ec6db761daf0c03f98824e9882ab7df3d458d0c76c7ade255", size = 2531627, upload-time = "2025-12-02T18:43:55.141Z" }, + { url = "https://files.pythonhosted.org/packages/a0/48/cea56d04eb6bbd8b8943ff73d7cf26b94f715fccb23cf7ef9a4f853725a0/pymongo-4.15.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e7ceb35c41b86711a1b284c604e2b944a2d46cb1b8dd3f8b430a9155491378f2", size = 2603767, upload-time = "2025-12-02T18:43:57.188Z" }, + { url = "https://files.pythonhosted.org/packages/d9/ff/6743e351f8e0d5c3f388deb15f0cdbb77d2439eb3fba7ebcdf7878719517/pymongo-4.15.5-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3be2336715924be3a861b5e40c634376fd6bfe6dd1892d391566aa5a88a31307", size = 2725216, upload-time = "2025-12-02T18:43:59.463Z" }, + { url = "https://files.pythonhosted.org/packages/d4/90/fa532b6320b3ba61872110ff6f674bd54b54a592c0c64719e4f46852d0b6/pymongo-4.15.5-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d65df9c015e33f74ea9d1abf474971abca21e347a660384f8227dbdab75a33ca", size = 2704804, upload-time = "2025-12-02T18:44:01.415Z" }, + { url = "https://files.pythonhosted.org/packages/e1/84/1905c269aced043973b9528d94678e62e2eba249e70490c3c32dc70e2501/pymongo-4.15.5-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:83c05bea05e151754357f8e6bbb80d5accead5110dc58f64e283173c71ec9de2", size = 2582274, upload-time = "2025-12-02T18:44:03.427Z" }, + { url = "https://files.pythonhosted.org/packages/7e/af/78c13179961e418396ec6ef53c0f1c855f1e9f1176d10909e8345d65366a/pymongo-4.15.5-cp314-cp314t-win32.whl", hash = "sha256:7c285614a3e8570b03174a25db642e449b0e7f77a6c9e487b73b05c9bf228ee6", size = 1044015, upload-time = "2025-12-02T18:44:05.318Z" }, + { url = "https://files.pythonhosted.org/packages/b0/d5/49012f03418dce976124da339f3a6afbe6959cb0468ca6302596fe272926/pymongo-4.15.5-cp314-cp314t-win_amd64.whl", hash = "sha256:aae7d96f7b2b1a2753349130797543e61e93ee2ace8faa7fbe0565e2eb5d815f", size = 1078481, upload-time = "2025-12-02T18:44:07.215Z" }, + { url = "https://files.pythonhosted.org/packages/5e/fc/f352a070d8ff6f388ce344c5ddb82348a38e0d1c99346fa6bfdef07134fe/pymongo-4.15.5-cp314-cp314t-win_arm64.whl", hash = "sha256:576a7d4b99465d38112c72f7f3d345f9d16aeeff0f923a3b298c13e15ab4f0ad", size = 1051166, upload-time = "2025-12-02T18:44:09.048Z" }, +] + [[package]] name = "pymysql" version = "1.1.2" From 61142f23d9765d86cd191e24df5b0f45c0e338cf Mon Sep 17 00:00:00 2001 From: trxvorr Date: Fri, 19 Dec 2025 15:22:41 +0530 Subject: [PATCH 3/3] style: fix ruff linting issues --- src/intugle/__init__.py | 10 +++++++--- src/intugle/nosql/api.py | 7 ++++--- src/intugle/nosql/source.py | 3 ++- src/intugle/semantic_model.py | 4 ++-- tests/adapters/test_utils.py | 4 +++- tests/nosql/test_api.py | 1 + tests/nosql/test_source.py | 1 + 7 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/intugle/__init__.py b/src/intugle/__init__.py index bcb1317..016ac93 100644 --- a/src/intugle/__init__.py +++ b/src/intugle/__init__.py @@ -2,11 +2,15 @@ from intugle.data_product import DataProduct as DataProduct from intugle.semantic_model import SemanticModel as SemanticModel +__all__ = ["DataSet", "DataProduct", "SemanticModel"] + # Expose NoSQL components if dependencies are available try: - from intugle.nosql.api import NoSQLToRelationalParser - from intugle.nosql.source import MongoSource - from intugle.nosql.writer import ParquetTarget + from intugle.nosql.api import NoSQLToRelationalParser # noqa: F401 + from intugle.nosql.source import MongoSource # noqa: F401 + from intugle.nosql.writer import ParquetTarget # noqa: F401 + + __all__.extend(["NoSQLToRelationalParser", "MongoSource", "ParquetTarget"]) except ImportError: # Dependencies (pandas/pyarrow/pymongo) might not be installed pass diff --git a/src/intugle/nosql/api.py b/src/intugle/nosql/api.py index ef612d2..25beca3 100644 --- a/src/intugle/nosql/api.py +++ b/src/intugle/nosql/api.py @@ -1,7 +1,8 @@ -from typing import Dict, Any, Optional -from intugle.nosql.source import NoSQLSource -from intugle.nosql.parser import NoSQLParser +from typing import Any, Dict, Optional + from intugle.nosql.inference import infer_schema +from intugle.nosql.parser import NoSQLParser +from intugle.nosql.source import NoSQLSource class NoSQLToRelationalParser: diff --git a/src/intugle/nosql/source.py b/src/intugle/nosql/source.py index 1d7eb11..856feda 100644 --- a/src/intugle/nosql/source.py +++ b/src/intugle/nosql/source.py @@ -1,4 +1,5 @@ -from typing import Iterator, Dict, Any +from typing import Any, Dict, Iterator + import pymongo diff --git a/src/intugle/semantic_model.py b/src/intugle/semantic_model.py index 802edf6..e9bfe87 100644 --- a/src/intugle/semantic_model.py +++ b/src/intugle/semantic_model.py @@ -1,5 +1,6 @@ import logging +from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List import pandas as pd @@ -11,7 +12,6 @@ from intugle.link_predictor.predictor import LinkPredictor from intugle.semantic_search import SemanticSearch from intugle.utils.files import update_relationship_file_mtime -from pathlib import Path if TYPE_CHECKING: from intugle.adapters.adapter import Adapter @@ -83,6 +83,7 @@ def _initialize_from_list(self, data_list: List[DataSet]): "DataSet objects provided in a list must have a 'name' attribute." ) self.datasets[dataset.name] = dataset + def _initialize_from_folder(self, folder_path: str): """ Initialize datasets by scanning a folder (recursively) for supported data files. @@ -133,7 +134,6 @@ def _initialize_from_folder(self, folder_path: str): f"No supported data files (.csv, .parquet, .xlsx) found in directory: {folder_path}" ) - def profile(self, force_recreate: bool = False): """ Runs the data profiling pipeline for all contained datasets. diff --git a/tests/adapters/test_utils.py b/tests/adapters/test_utils.py index 567a96a..e19a27b 100644 --- a/tests/adapters/test_utils.py +++ b/tests/adapters/test_utils.py @@ -1,7 +1,9 @@ import numpy as np -from intugle.adapters.utils import convert_to_native import pytest +from intugle.adapters.utils import convert_to_native + + def test_numpy_scalar_int(): value = np.int64(10) result = convert_to_native(value) diff --git a/tests/nosql/test_api.py b/tests/nosql/test_api.py index b3437d2..1ccdf2f 100644 --- a/tests/nosql/test_api.py +++ b/tests/nosql/test_api.py @@ -1,4 +1,5 @@ from unittest.mock import MagicMock + from intugle.nosql.api import NoSQLToRelationalParser diff --git a/tests/nosql/test_source.py b/tests/nosql/test_source.py index 3897299..7df0ef7 100644 --- a/tests/nosql/test_source.py +++ b/tests/nosql/test_source.py @@ -1,4 +1,5 @@ from unittest.mock import MagicMock, patch + from intugle.nosql.source import MongoSource, NoSQLSource