diff --git a/.gitignore b/.gitignore index b6d5248d..f00eefe0 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ examples/research/**/*.json # OSM cache and large outputs cache/ +nomad/tests/tmp_osm_cache/ examples/output/ examples/research/virtual_philadelphia/*.gpkg @@ -42,6 +43,7 @@ pip tutorials/IC2S2-2025/tutorial-sample/* # Development environment configurations (personal/local only) +AGENTS.md .vscode/ .idea/ *.code-workspace diff --git a/README.md b/README.md index a48cebc2..a88bcf42 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,9 @@ The examples/ folder contains notebooks and small sample datasets that demonstra ## Contribute For development clone the repository and ensure unit tests, located in `nomad\tests\' are passed before submitting a pull request. + +Mapping tests use a local OSM cache by default. Create or refresh it with `python -m nomad.data.generate_osm_test_cache`, then run `pytest nomad/tests/test_maps.py`. To run against live OSM instead, set `NOMAD_OSM_TEST_CACHE=0` first (PowerShell: `$env:NOMAD_OSM_TEST_CACHE = "0"`). This mode uses pytest's built-in `monkeypatch` fixture; do not install a separate monkeypatch package. If cached files are missing or stale, rerun the cache script. Keep generated geospatial artifacts out of git unless their size has been reviewed, since OSM extracts can bloat repository history quickly. + ## License MIT © University of Pennsylvania 2025. diff --git a/examples/hdbscan_demo.ipynb b/examples/hdbscan_demo.ipynb index ea22abcd..1f26b212 100644 --- a/examples/hdbscan_demo.ipynb +++ b/examples/hdbscan_demo.ipynb @@ -33,6 +33,7 @@ "from shapely.geometry import box\n", "from nomad.stop_detection.viz import plot_stops_barcode, plot_time_barcode, plot_stops, plot_pings\n", "import nomad.stop_detection.hdbscan as HDBSCAN\n", + "import time\n", "\n", "# Load data\n", "import nomad.data as data_folder\n", @@ -92,7 +93,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -106,7 +107,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/examples/hdbscan_random_trajectories.ipynb b/examples/hdbscan_random_trajectories.ipynb new file mode 100644 index 00000000..86b74e68 --- /dev/null +++ b/examples/hdbscan_random_trajectories.ipynb @@ -0,0 +1,831 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "title", + "metadata": {}, + "source": [ + "# HDBSCAN Performance on Agent-Generated Trajectories\n", + "\n", + "Evaluate `st_hdbscan` across realistic synthetic trajectories produced by\n", + "the `Agent` simulation (EPR mobility model on the Garden City map). 
Ground truth comes\n", + "directly from `agent.diary`.\n", + "\n", + "**Sweep dimensions**\n", + "\n", + "| Dimension | Values | What it controls |\n", + "|---|---|---|\n", + "| `seed` | 0–9 | Agent home/workplace and movement |\n", + "| `beta_start` | 100, 250, 400 | Burst inter-arrival time → trajectory sparsity |\n", + "| `time_thresh` | 20, 60 | HDBSCAN temporal neighborhood (min) |\n", + "| `min_pts` | 2, 3 | HDBSCAN core-point threshold |\n", + "\n", + "**Metrics** are time-based (seconds of overlap between detected and ground-truth stop intervals):\n", + "- **Precision** — detected stop time that overlaps a true stop / total detected stop time \n", + "- **Recall** — true stop time covered by any detected stop / total true stop time \n", + "- **F1** — harmonic mean" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "imports", + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import time\n", + "from itertools import product\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from nomad.city_gen import City\n", + "from nomad.traj_gen import Agent\n", + "import nomad.stop_detection.hdbscan as HDBSCAN\n", + "import nomad.data as data_folder" + ] + }, + { + "cell_type": "markdown", + "id": "city-header", + "metadata": {}, + "source": [ + "## 1. Load\n", + "\n", + "Building hub network, gravity matrix, and shortest paths are computed once and reused for all agents." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "load-city", + "metadata": {}, + "outputs": [], + "source": [ + "data_dir = Path(data_folder.__file__).parent\n", + "city_path = data_dir / 'garden-city.gpkg'\n", + "\n", + "t0 = time.perf_counter()\n", + "city = City.from_geopackage(str(city_path))\n", + "city._build_hub_network(hub_size=16)\n", + "city.compute_gravity(exponent=2.0)\n", + "city.compute_shortest_paths(callable_only=True)\n", + "\n", + "# Pre-sample a pool of home/workplace pairs for the agents\n", + "rng_pool = np.random.default_rng(0)\n", + "homes = city.buildings_gdf[city.buildings_gdf['building_type'] == 'home']['id'].to_numpy()\n", + "workplaces = city.buildings_gdf[city.buildings_gdf['building_type'] == 'workplace']['id'].to_numpy()\n", + "N_AGENTS = 10\n", + "home_pool = rng_pool.choice(homes, size=N_AGENTS, replace=True)\n", + "work_pool = rng_pool.choice(workplaces, size=N_AGENTS, replace=True)\n", + "\n", + "TC = {'timestamp': 'timestamp', 'x': 'x', 'y': 'y'}" + ] + }, + { + "cell_type": "markdown", + "id": "metrics-header", + "metadata": {}, + "source": [ + "## 2. 
Metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "metrics", + "metadata": {}, + "outputs": [], + "source": [ + "# compare with contact_estimation, compute metrics and overlapping_visits\n", + "# see how results differ\n", + "\n", + "def _overlap(a_s, a_e, b_s, b_e):\n", + " return max(0, min(a_e, b_e) - max(a_s, b_s))\n", + "\n", + "\n", + "def temporal_metrics(stops_df, gt, tc):\n", + " \"\"\"\n", + " Time-based precision / recall / F1.\n", + "\n", + " Parameters\n", + " ----------\n", + " stops_df : output of st_hdbscan with complete_output=True\n", + " gt : DataFrame with columns start_ts (int, seconds) and end_ts (int, seconds)\n", + " tc : traj_cols dict (must include 'timestamp')\n", + " \"\"\"\n", + " if stops_df is None or stops_df.empty:\n", + " return dict(precision=0.0, recall=0.0, f1=0.0, n_detected=0)\n", + "\n", + " detected = [\n", + " (int(r[tc['timestamp']]), int(r['end_timestamp']))\n", + " for _, r in stops_df.iterrows()\n", + " ]\n", + " truth = [\n", + " (int(r['start_ts']), int(r['end_ts']))\n", + " for _, r in gt.iterrows()\n", + " ]\n", + "\n", + " total_det = sum(e - s for s, e in detected)\n", + " total_gt = sum(e - s for s, e in truth)\n", + "\n", + " tp = sum(_overlap(ds, de, gs, ge) for ds, de in detected for gs, ge in truth)\n", + "\n", + " precision = tp / total_det if total_det > 0 else 0.0\n", + " recall = tp / total_gt if total_gt > 0 else 0.0\n", + " f1 = (\n", + " 2 * precision * recall / (precision + recall)\n", + " if (precision + recall) > 0 else 0.0\n", + " )\n", + " return dict(\n", + " precision=round(precision, 4),\n", + " recall=round(recall, 4),\n", + " f1=round(f1, 4),\n", + " n_detected=len(stops_df),\n", + " )\n", + "\n", + "\n", + "def diary_to_gt(diary):\n", + " \"\"\"\n", + " Convert agent.diary to a ground-truth stop interval DataFrame.\n", + " Skips transit entries (location is None) and consecutive duplicate\n", + " locations are merged so back-to-back entries at the same building\n", + " count as one stop.\n", + " \"\"\"\n", + " df = diary[diary['location'].notna()].copy()\n", + " df['end_ts'] = df['timestamp'] + (df['duration'] * 60).astype(int)\n", + " df = df.rename(columns={'timestamp': 'start_ts'})\n", + "\n", + " # Merge consecutive entries at the same building\n", + " rows = []\n", + " for _, r in df.iterrows():\n", + " if rows and rows[-1]['location'] == r['location'] and r['start_ts'] <= rows[-1]['end_ts']:\n", + " rows[-1]['end_ts'] = r['end_ts']\n", + " else:\n", + " rows.append({'start_ts': r['start_ts'], 'end_ts': r['end_ts'],\n", + " 'location': r['location']})\n", + "\n", + " gt = pd.DataFrame(rows)\n", + " gt['duration_min'] = (gt['end_ts'] - gt['start_ts']) / 60\n", + " return gt" + ] + }, + { + "cell_type": "markdown", + "id": "battery-header", + "metadata": {}, + "source": [ + "## 3. 
Battery Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "battery-config", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Agents: 10 | Sparsity configs: 3 | HDBSCAN configs: 4\n", + "Total HDBSCAN runs: 120\n" + ] + } + ], + "source": [ + "# Trajectory generation window\n", + "START_DT = pd.Timestamp('2024-01-01T07:00-04:00')\n", + "END_DT = pd.Timestamp('2024-01-02T07:00-04:00') # 1-day trajectories\n", + "\n", + "# Sampling sparsity: higher beta_start → more time between bursts → sparser\n", + "BETA_START_VALS = [100, 250, 400] # burst inter-arrival time (min)\n", + "BETA_PING = 5 # ping inter-arrival within burst (min)\n", + "\n", + "# HDBSCAN configurations\n", + "HDBSCAN_CONFIGS = [\n", + " dict(time_thresh=20, min_pts=2, min_cluster_size=2, dur_min=5, label='tt=20 mp=2 mcs=2'),\n", + " dict(time_thresh=60, min_pts=2, min_cluster_size=2, dur_min=5, label='tt=60 mp=2 mcs=2'),\n", + " dict(time_thresh=60, min_pts=3, min_cluster_size=2, dur_min=5, label='tt=60 mp=3 mcs=2'),\n", + " dict(time_thresh=60, min_pts=2, min_cluster_size=1, dur_min=5, label='tt=60 mp=2 mcs=1'),\n", + "]\n", + "\n", + "total_runs = N_AGENTS * len(BETA_START_VALS) * len(HDBSCAN_CONFIGS)\n", + "print(f'Agents: {N_AGENTS} | Sparsity configs: {len(BETA_START_VALS)} | HDBSCAN configs: {len(HDBSCAN_CONFIGS)}')\n", + "print(f'Total HDBSCAN runs: {total_runs}')" + ] + }, + { + "cell_type": "markdown", + "id": "gen-header", + "metadata": {}, + "source": [ + "## 4. Generate Trajectories" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "generate-trajs", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated 10 dense trajectories\n", + "\n", + "Agent 0 diary (17 entries, 1441 ticks):\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " datetime timestamp duration location user_id\n", + "0 2024-01-01 07:01:00-04:00 1704106860 60 h-x7-y13 agent_000\n", + "1 2024-01-01 08:01:00-04:00 1704110460 4 None agent_000\n", + "2 2024-01-01 08:05:00-04:00 1704110700 41 p-x12-y11 agent_000\n", + "3 2024-01-01 08:46:00-04:00 1704113160 5 None agent_000\n", + "4 2024-01-01 08:51:00-04:00 1704113460 190 w-x11-y16 agent_000\n", + "5 2024-01-01 12:01:00-04:00 1704124860 8 None agent_000\n", + "6 2024-01-01 12:09:00-04:00 1704125340 37 r-x3-y1 agent_000\n", + "7 2024-01-01 12:46:00-04:00 1704127560 6 None agent_000" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Generate dense ground-truth trajectories once per agent/seed\n", + "agents = {}\n", + "for seed in range(N_AGENTS):\n", + " agent = Agent(\n", + " identifier=f'agent_{seed:03d}',\n", + " city=city,\n", + " home=home_pool[seed],\n", + " workplace=work_pool[seed],\n", + " datetime=START_DT,\n", + " seed=seed,\n", + " )\n", + " agent.generate_trajectory(end_time=END_DT, seed=seed)\n", + " agents[seed] = agent\n", + "\n", + "print(f'Generated {len(agents)} dense trajectories')\n", + "# Peek at one diary\n", + "ex = agents[0]\n", + "print(f'\\nAgent 0 diary ({len(ex.diary)} entries, {len(ex.trajectory)} ticks):')\n", + "ex.diary.head(8)" + ] + }, + { + "cell_type": "markdown", + "id": "run-header", + "metadata": {}, + "source": [ + "## 5. Run" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "run-battery", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done. 120 rows.\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " seed beta_start q n_pings n_true_stops hdbscan_cfg \\\n", + "0 0 100 0.042 60 9 tt=20 mp=2 mcs=2 \n", + "1 0 100 0.042 60 9 tt=60 mp=2 mcs=2 \n", + "2 0 100 0.042 60 9 tt=60 mp=3 mcs=2 \n", + "3 0 100 0.042 60 9 tt=60 mp=2 mcs=1 \n", + "4 0 250 0.123 177 9 tt=20 mp=2 mcs=2 \n", + "5 0 250 0.123 177 9 tt=60 mp=2 mcs=2 \n", + "6 0 250 0.123 177 9 tt=60 mp=3 mcs=2 \n", + "7 0 250 0.123 177 9 tt=60 mp=2 mcs=1 \n", + "\n", + " runtime_s precision recall f1 n_detected \n", + "0 0.1332 0.9843 0.1546 0.2673 5 \n", + "1 0.1283 0.9847 0.1935 0.3234 5 \n", + "2 0.1374 0.9847 0.1935 0.3234 5 \n", + "3 0.1360 0.9847 0.1935 0.3234 5 \n", + "4 0.4364 0.9899 0.6286 0.7689 12 \n", + "5 0.4495 0.9902 0.7203 0.8340 8 \n", + "6 0.4609 0.9902 0.7203 0.8340 8 \n", + "7 0.4474 0.9902 0.7203 0.8340 8 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "records = []\n", + "\n", + "for seed, agent in agents.items():\n", + " gt = diary_to_gt(agent.diary)\n", + "\n", + " for beta_start in BETA_START_VALS:\n", + " agent.sample_trajectory(\n", + " beta_ping=BETA_PING,\n", + " beta_start=beta_start,\n", + " beta_durations=beta_start,\n", + " replace_sparse_traj=True,\n", + " seed=seed,\n", + " )\n", + " # Drop datetime column to avoid timestamp/datetime ambiguity in st_hdbscan\n", + " sparse = agent.sparse_traj[['x', 'y', 'timestamp']].copy()\n", + " n_pings = len(sparse)\n", + " q = n_pings / len(agent.trajectory)\n", + "\n", + " for cfg in HDBSCAN_CONFIGS:\n", + " params = {k: v for k, v in cfg.items() if k != 'label'}\n", + "\n", + " t0 = time.perf_counter()\n", + " try:\n", + " stops = HDBSCAN.st_hdbscan(\n", + " sparse.copy(),\n", + " traj_cols=TC,\n", + " complete_output=True,\n", + " **params,\n", + " )\n", + " except Exception as exc:\n", + " print(f' seed={seed} beta_start={beta_start} {cfg[\"label\"]}: {exc}')\n", + " stops = None\n", + " elapsed = time.perf_counter() - t0\n", + "\n", + " m = temporal_metrics(stops, gt, TC)\n", + " records.append({\n", + " 'seed': seed,\n", + " 'beta_start': beta_start,\n", + " 'q': round(q, 3),\n", + " 'n_pings': n_pings,\n", + " 'n_true_stops': len(gt),\n", + " 'hdbscan_cfg': cfg['label'],\n", + " 'runtime_s': round(elapsed, 4),\n", + " **m,\n", + " })\n", + "\n", + "raw_df = pd.DataFrame(records)\n", + "print(f'Done. {len(raw_df)} rows.')\n", + "raw_df.head(8)" + ] + }, + { + "cell_type": "markdown", + "id": "tables-header", + "metadata": {}, + "source": [ + "## 6. Summary Tables\n", + "\n", + "### Mean ± std per HDBSCAN config (across agents and sparsities)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "summary-table", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " hdbscan_cfg precision_mean precision_std recall_mean recall_std \\\n", + "0 tt=20 mp=2 mcs=2 0.923 0.251 0.388 0.199 \n", + "1 tt=60 mp=2 mcs=1 0.923 0.251 0.459 0.223 \n", + "2 tt=60 mp=2 mcs=2 0.923 0.251 0.459 0.223 \n", + "3 tt=60 mp=3 mcs=2 0.923 0.251 0.458 0.226 \n", + "\n", + " f1_mean f1_std runtime_s_mean runtime_s_std \n", + "0 0.527 0.221 0.288 0.165 \n", + "1 0.592 0.234 0.277 0.160 \n", + "2 0.592 0.234 0.282 0.162 \n", + "3 0.590 0.235 0.301 0.172 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summary = (\n", + " raw_df\n", + " .groupby('hdbscan_cfg')[['precision', 'recall', 'f1', 'runtime_s']]\n", + " .agg(['mean', 'std'])\n", + " .round(3)\n", + ")\n", + "summary.columns = [f'{col}_{stat}' for col, stat in summary.columns]\n", + "summary.reset_index()" + ] + }, + { + "cell_type": "markdown", + "id": "viz-header", + "metadata": {}, + "source": [ + "## 7. Visualizations" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "pr-bars", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABKAAAAGNCAYAAAAmUwnzAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8ekN5oAAAACXBIWXMAAA9hAAAPYQGoP6dpAABimElEQVR4nO3dCbxM9f/48fd1L9dSSnayhcoWIhFabSWlvkpURKRFiVYlSwut4ldEScv3S2jftSjfUkoI+fYl+5WsKddyF/fe83+8P9//GTP3zuVzlzP3npnX8/GYzJx7ZubMez5n3s17Pkuc4ziOAAAAAAAAAB4p4dUDAwAAAAAAAIoCFAAAAAAAADxFAQoAAAAAAACeogAFAAAAAAAAT1GAAgAAAAAAgKcoQAEAAAAAAMBTFKAAAAAAAADgKQpQAAAAAAAA8BQFKAAAAAAAAHiKAhQAAD5xww03SN26dYv6MBBDzj//fHOJRnou6TkVS+dtcToWAEDsoQAFACiQV199VeLi4sxl0aJFOf7uOI7UqlXL/P3SSy8tkmMsjg4dOiRjx46VhQsXFvWhwGNTp0415wmKt19//dWck5s3by7qQwEAICpRgAIAFIrSpUvL7Nmzc2z/97//Lb///rskJiYWyXEV5wLUuHHjKEDFAApQxdPatWvlpZdeCilA6TlJAQoAAG9QgAIAFIpLLrlE3nzzTcnIyAjZrkWpVq1aSbVq1cRvtDikPbf4Qhrq4MGDRX0IQIHbphbFS5Ys6enxAACAIyhAAQAKRZ8+feTPP/+UL774IrAtPT1d3nrrLenbt2/Y+2RlZcmkSZOkSZMmpgdV1apVZciQIfLXX3+F7Pf+++9L9+7dpUaNGuZLY/369eWRRx6RzMzMkP10rpqmTZuangwXXHCBlC1bVmrWrClPPvmkRNrSpUula9euUqlSJSlTpozUq1dPBg4caP6mBa3KlSub69rjwh3CqMN/XO+99555LRoX/ffdd9+1fm7beKkff/zRFA8rVKgg5cqVkzPOOEMmT54cMmfMcccdJxs2bDD7HX/88XLttdcGvuzfddddZoilPs9pp50mTz/9tBl2GUzbRIcOHeTEE080j6X7PfDAAyH7PPfcc6Yd6Humx9K6deuwPeqCafsaPXq0KXCecMIJ5vg7duwoX3/9dY59tW1ef/31Ur58eXMc/fv3l5UrV5q4Z++dtGbNGunVq5ecdNJJJv56LB988EHYoaffffedjBgxwryf+vxXXHGF7N69O7Cfzrfzn//8x/QEdN9nd06lw4cPm/e/YcOG5nkqVqxo4hR8DoWzd+9eufvuu6VZs2YmnvqaLr74YvN6whVQ582bJ4899picfPLJ5nkuuugiWb9+fY7HffHFF01b0fbapk0b+fbbb8XWsd5j91jmzp1rtmtBWuN12WWXydatW0MeS5/3qquuktq1a5t2pe1r+PDhkpKSErLf0drmunXr5B//+Id5Hn3N+tqvueYa2bdvX9g5oPT91OdU+tnhvld63NpW9DzW9yu7Ll26mNeaVzaffTpk+ZRTTgl7/3bt2pl2Gexf//qXORf0/dO2q683e2zDmTNnjrmfxk/bkrar4M8AAAAKS0KhPRIAIKbplzn9UvTGG2+YL8Pq008/NV/49IvQ//3f/+W4j37h0i9+AwYMkDvuuEM2bdokzz//vPz888/mi73bO0H30S+a+kVf//3qq69M4SE5OVmeeuqpkMfUL3DdunWTK6+8Uq6++mpTALvvvvvMlyr3uHKjxxr8JdP9sqqPqc/r0i+Mwbez27Vrl/liqkWJ+++/33wp16LTO++8Y/6u21944QW55ZZbTMFCj1Vp8Ud9/vnn5stz48aNZcKECaZ4ojHSL9E2bOOlRQP9klu9enUZNmyY+bL+3//+Vz766CNz26W92rSYpgUGLTBpkUiLTFo80GLPjTfeKC1atJDPPvtM7rnnHtm2bZs8++yz5r5afNHn0Nf28MMPm4KCFj/0/XXpMCh9/7Xoo8+bmpoqq1atMsWx3IqXSl/PjBkzTPFz8ODBsn//fnn55ZfNsS5ZssQck/tlv0ePHmabxvz00083RTotLGSnx9u+fXtTuNT3ToskWsDp2bOnvP322+b9Cnb77bebgtmYMWPMe6xFhaFDh5pCi9Lbuo++Dw8++KDZpsUGpQVHfX8HDRpkCj76erRwuXz5cuncuXOur3vjxo2mQKkFEy1s7ty5U6ZPny7nnXeeKb5q4THY448/LiVKlDBFK23TWpDVQo3G16Vx0/PxnHPOkTvvvNM8h76/WsjQAtDR2LzHLi2EaWFHz0k9TzQ+nTp1khUrVpjCidKelDpEVd8rLcrp+6YFSh3Kq38LFq5tamFSt6WlpZnYa7vWNqnt+u+//zbFyuzOPfdc0wb1c0oLZI0aNTLb9
...[remainder of base64 PNG data elided; figure: three bar panels (Precision, Recall, F1) per HDBSCAN config, suptitle 'Mean ± std across agents and sparsity levels']",
      "text/plain": [
       "<Figure size 1200x400 with 3 Axes>
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "configs = [cfg['label'] for cfg in HDBSCAN_CONFIGS]\n", + "palette = dict(zip(configs, sns.color_palette('tab10', n_colors=len(configs))))\n", + "\n", + "fig, axes = plt.subplots(1, 3, figsize=(12, 4), sharey=True)\n", + "for ax, metric in zip(axes, ['precision', 'recall', 'f1']):\n", + " means = raw_df.groupby('hdbscan_cfg')[metric].mean().reindex(configs)\n", + " stds = raw_df.groupby('hdbscan_cfg')[metric].std().reindex(configs)\n", + " ax.bar(range(len(configs)), means.values, yerr=stds.values,\n", + " color=[palette[c] for c in configs], capsize=4, alpha=0.85)\n", + " ax.set_xticks(range(len(configs)))\n", + " ax.set_xticklabels(configs, rotation=25, ha='right', fontsize=8)\n", + " ax.set_ylim(0, 1.15)\n", + " ax.set_title(metric.capitalize())\n", + " ax.grid(axis='y', alpha=0.3)\n", + "\n", + "fig.suptitle('Mean ± std across agents and sparsity levels')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/research/hdbscan_validation_paper.ipynb b/examples/research/hdbscan_validation_paper.ipynb index 0e01db9a..aa26e8c0 100644 --- a/examples/research/hdbscan_validation_paper.ipynb +++ b/examples/research/hdbscan_validation_paper.ipynb @@ -4,45 +4,44 @@ "cell_type": "code", "execution_count": null, "id": "3d2fb0ef-73cc-4592-8ce5-9ad5f3027e45", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "import pandas as pd\n", - "# pd.set_option('display.max_rows', 500)\n", - "# pd.set_option('display.max_colwidth', 500)\n", "import warnings\n", - "import time\n", "warnings.filterwarnings(\"ignore\", category=FutureWarning)\n", "\n", "import geopandas as gpd\n", - "import shapely\n", "import numpy as np\n", "from functools import partial\n", + "from pathlib import Path\n", + "from shapely.geometry import Point\n", "from tqdm import tqdm\n", - "import matplotlib.pyplot as plt\n", - "import matplotlib.colors as mcolors\n", - "import matplotlib.patches as mpatches\n", - "from matplotlib.ticker import MaxNLocator\n", - "\n", - "import seaborn as sns\n", - "from statsmodels.nonparametric.smoothers_lowess import lowess\n", "\n", + "import nomad.data as data_folder\n", "import nomad.io.base as loader\n", "import nomad.stop_detection.utils as utils\n", + "import nomad.stop_detection.dbscan as TADBSCAN\n", "import nomad.stop_detection.hdbscan as HDBSCAN\n", + "import nomad.stop_detection.grid_based as GRID_BASED\n", "import nomad.stop_detection.lachesis as LACHESIS\n", - "#import nomad.stop_detection.ta_dbscan as TADBSCAN\n", - "import nomad.stop_detection.dbscan as TADBSCAN\n", - "import nomad.stop_detection.grid_based as GRID_BASED # for oracle visits\n", - "import nomad.stop_detection.postprocessing as pp\n", + "import nomad.stop_detection.density_based as SEQSCAN\n", + "from nomad.stop_detection.validation import (\n", + " AlgorithmRegistry,\n", + " bootstrap_metric_summary,\n", + " compute_stop_detection_metrics,\n", + " plot_metric_boxplots,\n", + " plot_metric_intervals,\n", + ")\n", + "from joblib import Parallel, delayed\n", + "import 
time\n", + "from nomad.traj_gen import Agent, Population\n", "\n", "import nomad.visit_attribution.visit_attribution as visits\n", - "import nomad.filters as filters\n", - "import nomad.city_gen as cg\n", "from nomad.city_gen import City\n", - "\n", - "from nomad.contact_estimation import overlapping_visits, compute_visitation_errors, precision_recall_f1_from_minutes\n", - "import pdb" + "from nomad.map_utils import blocks_to_mercator_gdf" ] }, { @@ -52,72 +51,9 @@ "metadata": {}, "outputs": [], "source": [ - "import nomad.city_gen as cg\n", - "# city = cg.load('../garden-city.pkl')\n", - "\n", - "city = City.from_geopackage('/Users/andresmondragon/nomad/nomad/data/garden-city.gpkg')\n", - "\n", - "def plot_metrics_boxplots(df, metrics,\n", - " algo_order=None, colors=None,\n", - " figsize=(24, 5.5), save_path=None):\n", - " # --- normalise inputs -------------------------------------------------\n", - " if algo_order is None:\n", - " # preserve appearance order in the DataFrame\n", - " algo_order = df.algorithm.drop_duplicates().tolist()\n", - "\n", - " if colors is None:\n", - " cmap = plt.colormaps.get_cmap('tab10')\n", - " colors = {a: cmap(i % cmap.N) for i, a in enumerate(algo_order)}\n", - " else:\n", - " # fill in any missing algorithm colour with the next Tab10 entry\n", - " cmap = plt.colormaps.get_cmap('tab10')\n", - " for i, a in enumerate(algo_order):\n", - " colors.setdefault(a, cmap(i % cmap.N))\n", - "\n", - " # --- figure -----------------------------------------------------------\n", - " fig, axes = plt.subplots(1, len(metrics), figsize=figsize, sharey=False)\n", "\n", - " if len(metrics) == 1: # when only one metric is passed\n", - " axes = [axes]\n", - "\n", - " for ax, metric in zip(axes, metrics):\n", - " ax.set_facecolor('#EAEAF2')\n", - "\n", - " # list of series, one per algorithm\n", - " data = [df.loc[df.algorithm == a, metric].dropna() for a in algo_order]\n", - "\n", - " bp = ax.boxplot(data,\n", - " positions=range(len(algo_order)),\n", - " patch_artist=True,\n", - " widths=0.4,\n", - " whis=(5, 95),\n", - " showfliers=False,\n", - " medianprops={'color': 'black', 'linewidth': 0.6})\n", - "\n", - " for box, alg in zip(bp['boxes'], algo_order):\n", - " box.set_facecolor(colors[alg])\n", - "\n", - " ax.grid(axis='y', color='darkgray', linestyle='--', linewidth=0.8)\n", - " ax.yaxis.set_major_locator(MaxNLocator(nbins=5))\n", - " ax.set_title(metric.replace('_', ' ').title(), fontsize=16)\n", - " ax.set_xticks([])\n", - "\n", - " # legend\n", - " handles = [plt.matplotlib.patches.Patch(color=colors[a], label=a)\n", - " for a in algo_order]\n", - " fig.legend(handles, algo_order,\n", - " loc='lower center',\n", - " ncol=len(algo_order),\n", - " bbox_to_anchor=(0.5, -0.015),\n", - " fontsize=15)\n", - "\n", - " plt.subplots_adjust(bottom=0.1, top=0.95)\n", - "\n", - " if save_path:\n", - " fig.savefig(f'{save_path}.png', dpi=300)\n", - " fig.savefig(f'{save_path}.svg')\n", - "\n", - " plt.show()\n", + "data_dir = Path(data_folder.__file__).parent\n", + "city = City.from_geopackage(data_dir / \"garden-city.gpkg\")\n", "\n", "def classify_building_size_from_id(building_id):\n", " building = city.buildings_df.loc[building_id]\n", @@ -145,352 +81,253 @@ { "cell_type": "code", "execution_count": null, - "id": "139c3d06-7bc1-4669-a2bf-f04d8b3bd6e5", + "id": "549b44aa", "metadata": {}, "outputs": [], "source": [ - "traj_cols = {'user_id':'user_id', 'x':'x', 'y':'y', 'timestamp':'timestamp'}\n", - "poi_table = 
gpd.read_file('/Users/andresmondragon/nomad/nomad/data/garden-city.gpkg')\n", + "_GEN_N = 250\n", + "_GEN_SEED = 2025\n", + "_GEN_START = pd.Timestamp(\"2024-06-01T00:00:00-04:00\")\n", + "_GEN_END = pd.Timestamp(\"2024-06-08T00:00:00-04:00\")\n", + "_DATA_DIR = Path(data_folder.__file__).parent\n", + "\n", + "_sparse_path = Path(\"robustness-of-algorithms/sparse_traj_2\")\n", + "_diaries_path = Path(\"robustness-of-algorithms/diaries_2\")\n", + "_homes_path = Path(\"robustness-of-algorithms/homes_2\")\n", + "\n", + "_rng = np.random.default_rng(_GEN_SEED)\n", + "HA_LOWER_BOUND = 8/15 # blocks; matches nomad.traj_gen._sample_horizontal_noise lower bound for pareto_prior\n", + "_PARAMS = {\n", + " \"beta_ping\": _rng.uniform(3, 12, _GEN_N).tolist(),\n", + " \"beta_start\": _rng.uniform(50, 500, _GEN_N).tolist(),\n", + " \"beta_durations\": _rng.uniform(30, 300, _GEN_N).tolist(),\n", + " \"ha\": _rng.uniform(HA_LOWER_BOUND + 1e-9, 5, _GEN_N).tolist(),\n", + "}\n", + "\n", + "def generate_agent_trajectory(args):\n", + " \"\"\"Worker function for parallel generation.\"\"\"\n", + " identifier, home, work, seed, beta_ping, beta_start, beta_durations, ha = args\n", + "\n", + " city = City.from_geopackage(_DATA_DIR / \"garden-city.gpkg\")\n", + " city._build_hub_network(hub_size=16)\n", + " city.compute_gravity(exponent=2.0)\n", + " city.compute_shortest_paths(callable_only=True)\n", + "\n", + " agent = Agent(identifier=identifier, city=city, home=home, workplace=work)\n", + " agent.generate_trajectory(datetime=_GEN_START, end_time=_GEN_END, seed=seed)\n", + " agent.sample_trajectory(\n", + " beta_ping=beta_ping, beta_start=beta_start, beta_durations=beta_durations,\n", + " ha=ha, seed=seed, replace_sparse_traj=True,\n", + " )\n", "\n", - "poi_table = poi_table.rename({'type':'building_type'}, axis=1)\n", - "poi_table['building_size'] = poi_table['id'].apply(classify_building_size_from_id)\n", + " sparse_df = agent.sparse_traj.copy()\n", + " sparse_df['user_id'] = identifier\n", + " sparse_df['home'] = home\n", + " sparse_df['workplace'] = work\n", + "\n", + " diary_df = agent.diary.copy()\n", + " diary_df['user_id'] = identifier\n", + "\n", + " return sparse_df, diary_df\n", + "\n", + "if _sparse_path.exists() and _diaries_path.exists():\n", + " print(\"Data already exists — skipping generation.\")\n", + "else:\n", + " _city = City.from_geopackage(_DATA_DIR / \"garden-city.gpkg\")\n", + " homes = _city.buildings_gdf[_city.buildings_gdf['building_type'] == 'home']['id'].to_numpy()\n", + " workplaces = _city.buildings_gdf[_city.buildings_gdf['building_type'] == 'workplace']['id'].to_numpy()\n", + "\n", + " agent_params = [\n", + " (f'agent_{i:04d}',\n", + " _rng.choice(homes),\n", + " _rng.choice(workplaces),\n", + " i,\n", + " _PARAMS['beta_ping'][i],\n", + " _PARAMS['beta_start'][i],\n", + " _PARAMS['beta_durations'][i],\n", + " _PARAMS['ha'][i])\n", + " for i in range(_GEN_N)\n", + " ]\n", "\n", - "diaries_df = loader.from_file(\"robustness-of-algorithms/diaries_2\", format=\"parquet\", **traj_cols)\n", - "diaries_df = diaries_df.rename({'location':'id'}, axis=1)\n", - "diaries_df = diaries_df.merge(poi_table[['id', 'building_size', 'building_type']], on='id', how='left')\n", + " print(f\"Generating {_GEN_N} agents in parallel...\")\n", + " start_time = time.time()\n", "\n", - "diaries_df.loc[~diaries_df.id.isna(),'dwell_length'] = diaries_df.loc[~diaries_df.id.isna(),'duration'].apply(classify_dwell)\n", - "diaries_df['id'] = diaries_df['id']\n", - "sparse_df = 
loader.from_file(\"robustness-of-algorithms/sparse_traj_2\", format=\"parquet\", **traj_cols)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "54031b65", - "metadata": {}, - "outputs": [], - "source": [ - "poi_table" - ] - }, - { - "cell_type": "markdown", - "id": "eac0bbd9-4ed4-4d61-8b38-9c5bb59d443b", - "metadata": {}, - "source": [ - "## Analyze completeness" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22f51807", - "metadata": {}, - "outputs": [], - "source": [ - "completeness_df = filters.completeness(sparse_df, traj_cols=traj_cols)" + " results = Parallel(n_jobs=-1, verbose=10)(\n", + " delayed(generate_agent_trajectory)(params) for params in agent_params\n", + " )\n", + "\n", + " generation_time = time.time() - start_time\n", + " print(f\"Generated {_GEN_N} agents in {generation_time:.2f}s ({generation_time / _GEN_N:.2f}s per agent)\")\n", + "\n", + " population = Population(_city)\n", + " for (sparse_df, diary_df), params in zip(results, agent_params):\n", + " identifier, home, work = params[0], params[1], params[2]\n", + " agent = Agent(identifier=identifier, city=_city, home=home, workplace=work)\n", + " agent.sparse_traj = sparse_df.drop(columns=['home', 'workplace'])\n", + " agent.diary = diary_df\n", + " population.add_agent(agent, verbose=False)\n", + "\n", + " poi_data = pd.DataFrame({\n", + " 'building_id': _city.buildings_gdf['id'].values,\n", + " 'x': (_city.buildings_gdf['door_cell_x'].astype(float) + 0.5).values,\n", + " 'y': (_city.buildings_gdf['door_cell_y'].astype(float) + 0.5).values,\n", + " })\n", + " population.reproject_to_mercator(sparse_traj=True, diaries=True, poi_data=poi_data)\n", + "\n", + " population.save_pop(\n", + " sparse_path=str(_sparse_path),\n", + " diaries_path=str(_diaries_path),\n", + " homes_path=str(_homes_path),\n", + " beta_ping=_PARAMS['beta_ping'],\n", + " beta_start=_PARAMS['beta_start'],\n", + " beta_durations=_PARAMS['beta_durations'],\n", + " ha=_PARAMS['ha'],\n", + " )\n", + " del _city, population, results\n", + " print(f\"Generated {_GEN_N} agents in {generation_time:.2f}s -> {_diaries_path} / {_sparse_path}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "ea573424", + "id": "040956b6", "metadata": {}, "outputs": [], "source": [ - "plt.figure(figsize=(4,3))\n", - "ax = sns.kdeplot(\n", - " data=pd.DataFrame(completeness_df),\n", - " x=\"0\",\n", - " fill=True,\n", - " linewidth=1.5,\n", - " bw_adjust=0.25\n", + "# ── DATA LOADING ──────────────────────────────────────────────────────────────\n", + "poi_table = gpd.read_file(data_dir / \"garden-city.gpkg\", layer='buildings')\n", + "poi_table = poi_table.rename({'type': 'building_type'}, axis=1)\n", + "# Project from local grid units → EPSG:3857 meters to match the saved sparse trajectories\n", + "poi_table = blocks_to_mercator_gdf(\n", + " poi_table,\n", + " block_size=city.block_side_length,\n", + " false_easting=city.web_mercator_origin_x,\n", + " false_northing=city.web_mercator_origin_y,\n", + " drop_garden_cols=False,\n", ")\n", + "poi_table['building_size'] = poi_table['id'].apply(classify_building_size_from_id)\n", "\n", - "# cosmetics\n", - "ax.set_xlabel(\"q (Trajectory Completeness)\")\n", - "ax.set_ylabel(\"Density\")\n", - "ax.grid(True, alpha=0.3)\n", - "sns.despine(top=True, right=True)\n", - "\n", - "# annotation (top-right corner in axes coords)\n", - "ax.text(\n", - " 0.99, 0.95,\n", - " f\"N = {len(completeness_df)}\",\n", - " transform=ax.transAxes,\n", - " ha=\"right\",\n", - " va=\"top\",\n", 
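+    "# Diary 'location' values are building ids: renaming to 'id' lets the merge below attach building_size and building_type from poi_table\n",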
- " fontsize=9\n", + "diaries_df = loader.from_file(\"robustness-of-algorithms/diaries_2\", format=\"parquet\")\n", + "diaries_df = diaries_df.rename({'location': 'id'}, axis=1)\n", + "diaries_df = diaries_df.merge(poi_table[['id', 'building_size', 'building_type']], on='id', how='left')\n", + "diaries_df.loc[~diaries_df.id.isna(), 'dwell_length'] = (\n", + " diaries_df.loc[~diaries_df.id.isna(), 'duration'].apply(classify_dwell)\n", ")\n", "\n", - "plt.tight_layout()\n", - "plt.savefig(\"q_stat_density.svg\", format=\"svg\", bbox_inches=\"tight\")\n", - "plt.savefig(\"q_stat_density.png\", format=\"svg\", bbox_inches=\"tight\")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "77ae7252-a50e-430f-aaee-f47425e1a74a", - "metadata": {}, - "source": [ - "## Execution for all users" + "sparse_df = loader.from_file(\"robustness-of-algorithms/sparse_traj_2\", format=\"parquet\")" ] }, { "cell_type": "code", "execution_count": null, "id": "d4f65fbe-25bc-4dd0-9cfc-78c9da6802ff", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 1 + }, "outputs": [], "source": [ - "## TRANSFORMATION FUNCTIONS\n", - "# Use this for steps that should pass a generic DataFrame through\n", - "def no_op_df(data, **kwargs):\n", - " return data\n", + "# ── PREPROCESSING / POST-PROCESSING FUNCTIONS ─────────────────────────────────\n", "\n", - "# Use this for steps that should pass the 'stops' DataFrame through\n", - "def no_op_stops(stops, **kwargs):\n", - " return stops\n", + "def no_op(value, *_args, **_kwargs):\n", + " return value\n", "\n", - "\n", - "# Pre-processing snippets\n", - "def prejoin_oracle_map(data, diary, **kwargs):\n", + "def prejoin_oracle_map(data, diary):\n", " location = visits.oracle_map(data, diary, timestamp='timestamp', location_id='id')\n", " return data.join(location)\n", "\n", - "# Stop summarization snippets\n", - "summarize_stops_with_loc = partial(utils.summarize_stop, x='x', y='y', keep_col_names=False, passthrough_cols=['id'], complete_output=True)\n", - "\n", - "# def postjoin_poly_map(data, **kwargs):\n", - "# # This is a post-labeling step, before stop creation\n", - "# location = visits.point_in_polygon(data=data, poi_table=poi_table, method='majority', data_crs='EPSG:3857',\n", - "# max_distance=12, cluster_label='cluster', location_id='building_id', x='x', y='y')\n", - "# return data.join(location)\n", + "summarize_stops_with_loc = partial(\n", + " utils.summarize_stop, x='x', y='y',\n", + " keep_col_names=True, passthrough_cols=['id'], complete_output=True,\n", + " timestamp='timestamp',\n", + ")\n", "\n", - "def postjoin_poly_map(data, **kwargs):\n", - " # Convert data to GeoDataFrame if it isn't already\n", + "def postjoin_poly_map(data):\n", " if not isinstance(data, gpd.GeoDataFrame):\n", - " # Create point geometries from x, y coordinates\n", - " from shapely.geometry import Point\n", " geometry = [Point(xy) for xy in zip(data['x'], data['y'])]\n", " data_gdf = gpd.GeoDataFrame(data, geometry=geometry, crs='EPSG:3857')\n", " else:\n", " data_gdf = data\n", " if data_gdf.crs is None:\n", " data_gdf = data_gdf.set_crs('EPSG:3857', allow_override=True)\n", - " \n", - " # Now call point_in_polygon with the GeoDataFrame and explicit x/y columns\n", " location = visits.point_in_polygon(\n", - " data=data_gdf, \n", - " poi_table=poi_table, \n", - " method='majority',\n", - " max_distance=12, \n", - " cluster_label='cluster', \n", - " location_id='id',\n", - " x='x',\n", - " y='y',\n", - " data_crs='EPSG:3857' # Add this line\n", + " data=data_gdf, 
poi_table=poi_table, method='majority',\n", + " max_distance=12, cluster_label='cluster', location_id='id',\n", + " x='x', y='y', data_crs='EPSG:3857',\n", " )\n", " return data.join(location)\n", "\n", - "# Special fix/adjustment snippets\n", - "# overlap-removal special fix removed\n", - "\n", - "\n", - "def pad_oracle_stops(stops, **kwargs):\n", - " return utils.pad_short_stops(stops, pad=5, dur_min=0, start_timestamp='start_timestamp')\n", - "\n", - "def pad_oracle_stops_long(stops, **kwargs):\n", - " return utils.pad_short_stops(stops, pad=15, dur_min=4, start_timestamp='start_timestamp')\n", - "\n", - "# Each row defines the unique workflow for one algorithm\n", - "algo_params = pd.DataFrame([\n", - " {\n", - " 'algo': 'ta-hdbscan',\n", - " 'func': HDBSCAN.hdbscan_labels,\n", - " 'params': {'time_thresh': 240, 'min_pts': 3, 'min_cluster_size': 1, 'include_border_points': True},\n", - " 'pre_process_func': no_op_df, # Correct: Passes 'data'\n", - " 'post_labels_func': postjoin_poly_map,\n", - " 'special_fix_func': no_op_stops,\n", - " },\n", - " {\n", - " 'algo': 'oracle',\n", - " 'func': GRID_BASED.grid_based_labels,\n", - " 'params': {'time_thresh': 600, 'min_pts': 0, 'location_id': 'id'},\n", - " 'pre_process_func': prejoin_oracle_map,\n", - " 'post_labels_func': no_op_df, # Correct: Passes 'data'\n", - " 'special_fix_func': pad_oracle_stops_long,\n", - " },\n", - " {\n", - " 'algo': 'tadbscan_coarse',\n", - " 'func': TADBSCAN.ta_dbscan_labels,\n", - " 'params': {'time_thresh': 240, 'min_pts': 2, 'dist_thresh': 30},\n", - " 'pre_process_func': no_op_df, # Correct\n", - " 'post_labels_func': postjoin_poly_map,\n", - " 'special_fix_func': no_op_stops, # Correct\n", - " },\n", - " {\n", - " 'algo': 'tadbscan_fine',\n", - " 'func': TADBSCAN.ta_dbscan_labels,\n", - " 'params': {'time_thresh': 120, 'min_pts': 3, 'dist_thresh': 20},\n", - " 'pre_process_func': no_op_df, # Correct\n", - " 'post_labels_func': postjoin_poly_map,\n", - " 'special_fix_func': no_op_stops, # Correct\n", - " },\n", - " {\n", - " 'algo': 'lachesis_coarse',\n", - " 'func': LACHESIS.lachesis_labels,\n", - " 'params': {'dt_max': 240, 'delta_roam': 40},\n", - " 'pre_process_func': no_op_df, # Correct: Passes 'data'\n", - " 'post_labels_func': postjoin_poly_map,\n", - " 'special_fix_func': no_op_stops, # Correct: Passes 'stops'\n", - " },\n", - " {\n", - " 'algo': 'lachesis_fine',\n", - " 'func': LACHESIS.lachesis_labels,\n", - " 'params': {'dt_max': 120, 'delta_roam': 25},\n", - " 'pre_process_func': no_op_df, # Correct\n", - " 'post_labels_func': postjoin_poly_map,\n", - " 'special_fix_func': no_op_stops, # Correct\n", - " }\n", - "])\n", - "\n", - "## METRICS FUNCTIONS\n", - "def compute_all_metrics(stops, truth, poi_table, user, algo):\n", - " \"\"\"\n", - " Calculate all metrics. 
One expensive overlap on *raw* stops for\n", - " error rates, a second (gap-filled) overlap for precision/recall.\n", - " \"\"\"\n", - " \n", - " # Convert truth timestamps to datetime if they aren't already\n", - " truth = truth.copy()\n", - " if not pd.api.types.is_datetime64_any_dtype(truth['timestamp']):\n", - " truth['timestamp'] = pd.to_datetime(truth['timestamp'], unit='s')\n", - " \n", - " # Add end_timestamp column to truth (critical!)\n", - " if 'end_timestamp' not in truth.columns:\n", - " truth['end_timestamp'] = truth['timestamp'] + pd.to_timedelta(truth['duration'], unit='m')\n", - " \n", - " # Ensure stops has end_datetime (should already exist from summarize_stop)\n", - " stops = stops.copy()\n", - " if 'end_datetime' not in stops.columns and 'start_datetime' in stops.columns:\n", - " stops['end_datetime'] = stops['start_datetime'] + pd.to_timedelta(stops['duration'], unit='m')\n", - " \n", - " # Rename columns to simpler names for overlapping_visits\n", - " # This ensures the _left/_right suffixes work correctly\n", - " stops = stops.rename(columns={\n", - " 'start_datetime': 'datetime',\n", - " 'end_datetime': 'end_datetime'\n", - " })\n", - " \n", - " truth = truth.rename(columns={\n", - " 'timestamp': 'datetime',\n", - " 'end_timestamp': 'end_datetime'\n", - " })\n", - " \n", - " # Now use consistent traj_cols\n", - " common_traj_cols = {\n", - " 'user_id': 'user_id',\n", - " 'location_id': 'id',\n", - " 'start_timestamp': 'datetime',\n", - " 'end_timestamp': 'end_datetime',\n", - " 'duration': 'duration'\n", - " }\n", - " \n", - " stops_err = stops.fillna({'id': 'Street'}).reset_index(drop=True)\n", - " truth_err = truth.dropna().reset_index(drop=True)\n", - " \n", - " overlaps_err = overlapping_visits(\n", - " left=stops_err,\n", - " right=truth_err,\n", - " match_location=False,\n", - " traj_cols=common_traj_cols\n", - " )\n", + "def pad_oracle_stops_long(stops):\n", + " return utils.pad_short_stops(stops, pad=15, dur_min=4, timestamp='timestamp')\n", "\n", - " # 2) PRF overlaps on raw stops (no gap fill at all)\n", - " stops_prf = stops.fillna({'id': 'Street'}).reset_index(drop=True)\n", - " truth_prf = truth.copy().fillna({'id': 'Street'}).reset_index(drop=True)\n", - " \n", - " overlaps_prf = overlapping_visits(\n", - " left=stops_prf,\n", - " right=truth_prf,\n", - " match_location=False,\n", - " traj_cols=common_traj_cols\n", - " )\n", "\n", - " # subset to *matching* POI rows\n", - " prf_overlaps = overlaps_prf[\n", - " overlaps_prf['id_left'] == overlaps_prf['id_right']\n", - " ]\n", + "# ── PRE/POST PIPELINE — keyed by family name ──────────────────────────────────\n", "\n", - " # 3) global PRF from minute‐totals\n", - " total_pred = stops_prf['duration'].sum()\n", - " total_truth = truth_prf['duration'].sum()\n", - " tp = prf_overlaps['duration'].sum()\n", - " prf1 = precision_recall_f1_from_minutes(total_pred, total_truth, tp)\n", - "\n", - " # 4) error metrics\n", - " errors = compute_visitation_errors(\n", - " overlaps=overlaps_err,\n", - " true_visits=truth_err,\n", - " traj_cols=common_traj_cols\n", - " )\n", + "pipeline = {\n", + " 'ta-hdbscan': {'pre': no_op, 'post': postjoin_poly_map, 'fix': no_op},\n", + " 'oracle': {'pre': prejoin_oracle_map, 'post': no_op, 'fix': pad_oracle_stops_long},\n", + " 'tadbscan_coarse': {'pre': no_op, 'post': postjoin_poly_map, 'fix': no_op},\n", + " 'tadbscan_fine': {'pre': no_op, 'post': postjoin_poly_map, 'fix': no_op},\n", + " 'lachesis_coarse': {'pre': no_op, 'post': postjoin_poly_map, 'fix': no_op},\n", + " 
'lachesis_fine': {'pre': no_op, 'post': postjoin_poly_map, 'fix': no_op},\n", + " 'seqscan': {'pre': no_op, 'post': postjoin_poly_map, 'fix': no_op},\n", + "}\n", "\n", - " results = [\n", - " {'metric_category': 'general',\n", - " 'category_value': 'all',\n", - " 'total_pred':total_pred,\n", - " **prf1, **errors}\n", - " ]\n", "\n", - " # 5) per-category PRF + error\n", - " # Keep original column name for filtering\n", - " truth_original = truth.rename(columns={'datetime': 'timestamp'})\n", - " \n", - " for category in ['building_size', 'building_type', 'dwell_length']:\n", - " for val in truth_original[category].dropna().unique():\n", - " truth_sub = truth_original[(truth_original[category] == val)&(truth_original['id']!='Street')]\n", - " # restrict overlaps to any prediction *overlapping* these truth stops\n", - " if category == 'dwell_length':\n", - " # Use datetime_right which now matches\n", - " mask_err = overlaps_err['datetime_right'].isin(truth_sub['timestamp'])\n", - " mask_prf = overlaps_prf['datetime_right'].isin(truth_sub['timestamp'])\n", - " else:\n", - " mask_err = overlaps_err['id_right'].isin(truth_sub['id'])\n", - " mask_prf = overlaps_prf['id_right'].isin(truth_sub['id'])\n", - "\n", - " ov_err = overlaps_err[mask_err]\n", - " ov_prf = overlaps_prf[mask_prf]\n", - " prf_ov = ov_prf[ov_prf['id_left'] == ov_prf['id_right']]\n", - "\n", - " # per-category PRF from minutes in these slices\n", - " tp_cat = prf_ov['duration'].sum()\n", - " pred_cat = ov_err['duration'].sum()\n", - " truth_cat = truth_sub['duration'].sum()\n", - " prf1_cat = precision_recall_f1_from_minutes(pred_cat, truth_cat, tp_cat)\n", - "\n", - " # For error metrics, align truth_sub columns\n", - " truth_sub_aligned = truth_sub.rename(columns={\n", - " 'timestamp': 'datetime'\n", - " })\n", - " \n", - " err_cat = compute_visitation_errors(\n", - " overlaps=ov_err,\n", - " true_visits=truth_sub_aligned,\n", - " traj_cols=common_traj_cols\n", - " )\n", + "registry = AlgorithmRegistry()\n", "\n", - " results.append({\n", - " 'metric_category': category,\n", - " 'category_value': val,\n", - " **prf1_cat, **err_cat\n", - " })\n", + "registry.add_algorithm(SEQSCAN.seqscan_labels, family='seqscan', dist_thresh=30, time_thresh=120)\n", "\n", - " # tag with user & algo\n", - " for rec in results:\n", - " rec.update({'user': user, 'algorithm': algo})\n", + "registry.add_algorithm(HDBSCAN.hdbscan_labels, family='ta-hdbscan',\n", + " time_thresh=240, min_pts=3, min_cluster_size=1, include_border_points=True)\n", + "registry.add_algorithm(GRID_BASED.grid_based_labels, family='oracle',\n", + " time_thresh=600, min_pts=0, location_id='id')\n", + "registry.add_algorithm(TADBSCAN.ta_dbscan_labels, family='tadbscan_coarse',\n", + " time_thresh=240, min_pts=2, dist_thresh=30)\n", + "registry.add_algorithm(TADBSCAN.ta_dbscan_labels, family='tadbscan_fine',\n", + " time_thresh=120, min_pts=3, dist_thresh=20)\n", + "registry.add_algorithm(LACHESIS.lachesis_labels, family='lachesis_coarse',\n", + " dt_max=240, delta_roam=40)\n", + "registry.add_algorithm(LACHESIS.lachesis_labels, family='lachesis_fine',\n", + " dt_max=120, delta_roam=25)\n", + "\n", + "print(f\"Registry: {len(registry)} algorithm configurations\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94d3f559", + "metadata": {}, + "outputs": [], + "source": [ + "# ── METRICS FUNCTION ──────────────────────────────────────────────────────────\n", + "def compute_all_metrics(stops, truth, user, algo):\n", + " gen = 
compute_stop_detection_metrics(\n", + " stops,\n", + " truth,\n", + " algorithm=algo,\n", + " prf_only=False,\n", + " location_id='id',\n", + " timestamp='timestamp',\n", + " )\n", + " gen.update({'user': user, 'metric_category': 'general', 'category_value': 'all'})\n", + " gen.pop('user_id', None)\n", + " results = [gen]\n", + "\n", + " for category in ['building_size', 'building_type', 'dwell_length']:\n", + " for val in truth[category].dropna().unique():\n", + " truth_sub = truth[truth[category] == val]\n", + " cat = compute_stop_detection_metrics(\n", + " stops,\n", + " truth_sub,\n", + " algorithm=algo,\n", + " prf_only=False,\n", + " location_id='id',\n", + " timestamp='timestamp',\n", + " )\n", + " cat.update({'user': user, 'metric_category': category, 'category_value': val})\n", + " cat.pop('user_id', None)\n", + " results.append(cat)\n", "\n", " return results" ] @@ -510,52 +347,44 @@ "metadata": {}, "outputs": [], "source": [ - "results_list = []\n", - "\n", - "for user in tqdm(diaries_df.user_id.unique(), desc='Processing users'):\n", - " for _, row in algo_params.iloc.iterrows():\n", - " \n", - " # PRE\n", - " sparse = sparse_df[sparse_df['user_id'] == user].copy()\n", - " truth = diaries_df[diaries_df['user_id'] == user].copy()\n", - " processed_sparse = row.pre_process_func(data=sparse, diary=truth)\n", - " \n", - " # ALGORITHM\n", - " start_time = time.time()\n", - " labels = row.func(processed_sparse, **row.params, traj_cols=traj_cols)\n", - " execution_time = time.time() - start_time\n", - " \n", - " # POST\n", - " data_with_clusters = processed_sparse.join(labels)\n", - " data_with_locations = row.post_labels_func(data=data_with_clusters)\n", - " stops = data_with_locations[data_with_locations.cluster != -1].groupby('cluster', as_index=False).apply(summarize_stops_with_loc, include_groups=False)\n", - " stops = row.special_fix_func(stops=stops, data_with_clusters=data_with_locations, params=row.params)\n", - "\n", - " # METRICS\n", - " metrics = compute_all_metrics(stops, truth, poi_table, user, row.algo)\n", - "\n", - " # Add execution time to the first metric entry (the 'general' one)\n", - " if metrics:\n", - " metrics[0]['execution_time'] = execution_time\n", - " \n", - " results_list.extend(metrics)\n", - "\n", - "results_df = pd.DataFrame(results_list)\n", + "results_rows = []\n", + "\n", + "for user in tqdm(diaries_df.user_id.unique()[:10], desc='Processing users'):\n", + " user_sparse = sparse_df[sparse_df['user_id'] == user].copy()\n", + " user_truth = diaries_df[diaries_df['user_id'] == user].copy()\n", + "\n", + " for algo in registry:\n", + " algorithm = algo[\"family\"]\n", + " steps = pipeline[algorithm]\n", + "\n", + " sparse_for_algo = steps[\"pre\"](user_sparse, user_truth)\n", + " labels = registry.time_call(algo, sparse_for_algo, timestamp='timestamp')\n", + "\n", + " clustered = sparse_for_algo.join(labels)\n", + " located = steps[\"post\"](clustered)\n", + " stops = located.loc[located['cluster'] != -1].groupby(\n", + " 'cluster', as_index=False\n", + " ).apply(summarize_stops_with_loc, include_groups=False)\n", + " stops = steps[\"fix\"](stops)\n", + "\n", + " metric_rows = compute_all_metrics(stops, user_truth, user, algorithm)\n", + " elapsed_s = registry._timings[-1]['elapsed_s']\n", + " for row in metric_rows:\n", + " row['execution_time'] = elapsed_s\n", + " results_rows.extend(metric_rows)\n", "\n", + "results_df = pd.DataFrame(results_rows)\n", "print(\"Processing Complete!\")" ] }, { "cell_type": "code", "execution_count": null, - "id": 
"134f4013-90b1-4309-9365-51f51c9d16f8", + "id": "cab01799", "metadata": {}, "outputs": [], "source": [ - "try:\n", - " results_df = pd.read_csv(\"results.csv\")\n", - "except:\n", - " results_df.to_csv('results.csv', index=False)" + "results_df[results_df['category_value'] == 'all']" ] }, { @@ -575,28 +404,16 @@ "metadata": {}, "outputs": [], "source": [ - "# BOOTSTRAPPING GROUPBY\n", - "agg_keys = ['missed_fraction','merged_fraction','split_fraction','precision','recall','f1']\n", - "ua = general_metrics_df.groupby(['user','algorithm'], as_index=False)[agg_keys].median()\n", - "users = ua['user'].unique()\n", - "output = []\n", - "for _ in range(2000):\n", - " draw = np.random.choice(users, size=len(users), replace=True)\n", - " bs = ua[ua.user.isin(draw)]\n", - " output.append(bs.groupby('algorithm', as_index=False)[agg_keys].median())\n", - "metrics_bootstrap_df = pd.concat(output, ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c78cd7a5-b9e9-4dea-b7ff-b770ab7095d8", - "metadata": {}, - "outputs": [], - "source": [ - "# # For debugging!!!\n", - "# metrics = ['missed_fraction', 'merged_fraction', 'split_fraction', 'precision', 'recall', 'f1']\n", - "# plot_metrics_boxplots(metrics_bootstrap_df, metrics, algo_order=None, colors=None, save_path=None)" + "metrics = ['missed_fraction', 'merged_fraction', 'split_fraction', 'precision', 'recall', 'f1']\n", + "general_plot_df = bootstrap_metric_summary(\n", + " general_metrics_df,\n", + " metrics=metrics,\n", + " unit_col='user',\n", + " group_col='algorithm',\n", + " n_boot=2000,\n", + " interval=(0.05, 0.95),\n", + " random_state=2025,\n", + ")" ] }, { @@ -604,7 +421,7 @@ "id": "517d841a-d369-4c8f-bda2-af61fb74f492", "metadata": {}, "source": [ - "### Plot boostrapped per-stop metrics for general trajectory" + "### Plot per-user metric distributions for general trajectory" ] }, { @@ -614,14 +431,8 @@ "metadata": {}, "outputs": [], "source": [ - "# base colors for coarse versions\n", - "shade = lambda c,f=0.6: mcolors.to_hex(tuple(f*x for x in mcolors.to_rgb(c)))\n", - "raw = {'oracle':'royalblue','ta-hdbscan':'darkorange','lachesis_coarse':'palevioletred','tadbscan_coarse':'limegreen'}\n", - "\n", - "_base = {k:mcolors.to_hex(mcolors.to_rgb(v)) for k,v in raw.items()}\n", - "\n", "algo_order = ['oracle','ta-hdbscan','lachesis_coarse','tadbscan_coarse','lachesis_fine', 'tadbscan_fine']\n", - "colors = {a:(_base[a] if a in _base else shade(_base[a.split('_')[0]+'_coarse'])) for a in algo_order}" + "algorithm_groups = {algo['family']: algo['algorithm'] for algo in registry}" ] }, { @@ -631,101 +442,37 @@ "metadata": {}, "outputs": [], "source": [ - "metrics = ['missed_fraction', 'merged_fraction', 'split_fraction', 'precision', 'recall', 'f1']\n", - "plot_metrics_boxplots(metrics_bootstrap_df, metrics, algo_order=algo_order, colors=colors, save_path='errors_per_stop')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92c56847-d60a-499e-91ca-6a878b11aa7d", - "metadata": {}, - "outputs": [], - "source": [ - "# ylabels = {m:f\"% {m.split('_')[0]}\" for m in ['precision','recall','f1']}\n", - "# metrics = ['precision', 'recall', 'f1']\n", - "# plot_metrics_boxplots( metrics_bootstrap_df, metrics, algo_order=None, colors=None, save_path='errors_per_stop')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "37cd2ced-eba8-4632-b1f2-144dfe6a149e", - "metadata": {}, - "outputs": [], - "source": [ - "### Plot pr, rec, f1 per each group in 4 subplots. 
" + "plot_metric_boxplots(\n", + " general_metrics_df,\n", + " metrics,\n", + " group_order=algo_order,\n", + " group_families=algorithm_groups,\n", + " save_path=Path('examples/research/errors_per_stop_boxplots'),\n", + ")" ] }, { "cell_type": "markdown", - "id": "f56f4c41-cdb3-43ba-a272-db82113f2de6", + "id": "3109e8b3", "metadata": {}, "source": [ - "### Plot of F1 scores for each q" + "### Plot bootstrapped median metrics for general trajectory" ] }, { "cell_type": "code", "execution_count": null, - "id": "1f43bcf7-035f-4de8-9cf4-60e22912650c", + "id": "4d67aa45", "metadata": {}, "outputs": [], "source": [ - "def plot_f1_vs_q(general_df, completeness_df, save_path=None):\n", - " \"\"\"Three-panel plot: oracle vs hdbscan / tadbscan_coarse / lachesis_coarse.\"\"\"\n", - "\n", - " # --- palette & comparison pairs --------------------------------------\n", - " pairs = [('oracle', 'ta-hdbscan'),\n", - " ('oracle', 'tadbscan_coarse'),\n", - " ('oracle', 'lachesis_coarse')]\n", - "\n", - " colors = {'oracle': 'royalblue',\n", - " 'ta-hdbscan': 'darkorange',\n", - " 'tadbscan_coarse': 'limegreen',\n", - " 'lachesis_coarse': 'palevioletred'}\n", - "\n", - " # --- merge: just rename q_stat -> q ----------------------------------\n", - " comp = completeness_df.rename(columns={'q_stat': 'q'})[['user_id', 'q']]\n", - " df = general_df.merge(comp, left_on='user', right_on='user_id', how='left')\n", - "\n", - " # --- figure ----------------------------------------------------------\n", - " fig, axes = plt.subplots(1, 3, figsize=(13, 2.75), sharey=True)\n", - "\n", - " for ax, (a1, a2) in zip(axes, pairs):\n", - " ax.set_facecolor('#EAEAF2') # seaborn-like bg\n", - "\n", - " for alg in (a1, a2):\n", - " sub = df[df.algorithm == alg].dropna(subset=['q']).sort_values('q')\n", - " if sub.empty:\n", - " continue\n", - " sm = lowess(sub['f1'], sub['q'], frac=0.6)\n", - " ax.plot(sm[:, 0], sm[:, 1], color=colors[alg], lw=2)\n", - " ax.scatter(sub['q'], sub['f1'], color=colors[alg], s=6, alpha=0.25)\n", - "\n", - " ax.grid(axis='y', color='darkgray', linestyle='--', lw=0.8)\n", - " ax.yaxis.set_major_locator(MaxNLocator(nbins=5))\n", - " ax.set_xlabel(\"q (trajectory completeness)\")\n", - " ax.set_xticks(np.linspace(0, 1, 6))\n", - " ax.tick_params(axis='both', labelsize=10)\n", - "\n", - " axes[0].set_ylabel(\"F1 score\")\n", - "\n", - " # --- single legend ---------------------------------------------------\n", - " handles = [plt.matplotlib.patches.Patch(color=colors[a], label=a) for a in colors]\n", - " fig.legend(handles, colors, loc='lower center',\n", - " ncol=len(colors), bbox_to_anchor=(0.5, -0.12))\n", - "\n", - " plt.tight_layout()\n", - "\n", - " if save_path:\n", - " fig.savefig(f\"{save_path}.png\", dpi=300, bbox_inches='tight')\n", - " fig.savefig(f\"{save_path}.svg\", bbox_inches='tight')\n", - "\n", - " plt.show()\n", - "\n", - "plot_f1_vs_q(general_metrics_df, completeness_df,\n", - " save_path=\"f1_vs_q\")" + "plot_metric_intervals(\n", + " general_plot_df,\n", + " metrics,\n", + " group_order=algo_order,\n", + " group_families=algorithm_groups,\n", + " save_path=Path('examples/research/errors_per_stop_medians'),\n", + ")" ] }, { @@ -764,85 +511,6 @@ "table_results.to_csv('results_by_category.csv', index=True)\n", "table_results" ] - }, - { - "cell_type": "markdown", - "id": "f8f8087a", - "metadata": {}, - "source": [ - "# Updated Plots (Post-Netmob)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "237d4cf3", - "metadata": {}, - "outputs": [], - "source": [ - 
"print(stops.columns.tolist())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "93d68df0", - "metadata": {}, - "outputs": [], - "source": [ - "poi_table = poi_table.set_crs('EPSG:3857', allow_override=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "228f60d3-4a16-4ed8-a1d9-9efbe2b881c4", - "metadata": {}, - "outputs": [], - "source": [ - "results_list = []\n", - "\n", - "for user in tqdm(diaries_df.user_id.unique(), desc='Processing users'):\n", - " for _, row in algo_params.iterrows():\n", - " \n", - " # PRE\n", - " sparse = sparse_df[sparse_df['user_id'] == user].copy()\n", - " truth = diaries_df[diaries_df['user_id'] == user].copy()\n", - " processed_sparse = row.pre_process_func(data=sparse, diary=truth)\n", - " \n", - " # ALGORITHM\n", - " start_time = time.time()\n", - " labels = row.func(processed_sparse, **row.params, traj_cols=traj_cols)\n", - " execution_time = time.time() - start_time\n", - " \n", - " # POST\n", - " data_with_clusters = processed_sparse.join(labels)\n", - " data_with_locations = row.post_labels_func(data=data_with_clusters)\n", - " stops = data_with_locations[data_with_locations.cluster != -1].groupby('cluster', as_index=False).apply(summarize_stops_with_loc, include_groups=False)\n", - " # Uncomment if you want special fixes:\n", - " # stops = row.special_fix_func(stops=stops, data_with_clusters=data_with_locations, params=row.params)\n", - "\n", - " # METRICS\n", - " metrics = compute_all_metrics(stops, truth, poi_table, user, row.algo)\n", - "\n", - " # Add execution time to the first metric entry (the 'general' one)\n", - " if metrics:\n", - " metrics[0]['execution_time'] = execution_time\n", - " \n", - " results_list.extend(metrics)\n", - "\n", - "results_df = pd.DataFrame(results_list)\n", - "\n", - "print(\"Processing Complete!\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "84848085", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -850,7 +518,7 @@ "formats": "ipynb,py:percent" }, "kernelspec": { - "display_name": "nomad_env", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -864,7 +532,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.18" + "version": "3.12.3" } }, "nbformat": 4, diff --git a/examples/research/hdbscan_validation_paper.py b/examples/research/hdbscan_validation_paper.py index a3c5ad94..3178af31 100644 --- a/examples/research/hdbscan_validation_paper.py +++ b/examples/research/hdbscan_validation_paper.py @@ -6,116 +6,56 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.17.1 +# jupytext_version: 1.17.3 # kernelspec: -# display_name: Python 3.10 (daphme) +# display_name: Python 3 # language: python -# name: daphme +# name: python3 # --- # %% import pandas as pd -# pd.set_option('display.max_rows', 500) -# pd.set_option('display.max_colwidth', 500) import warnings -import time warnings.filterwarnings("ignore", category=FutureWarning) import geopandas as gpd -import shapely import numpy as np from functools import partial +from pathlib import Path +from shapely.geometry import Point from tqdm import tqdm -import matplotlib.pyplot as plt -import matplotlib.colors as mcolors -import matplotlib.patches as mpatches -from matplotlib.ticker import MaxNLocator - -import seaborn as sns -from statsmodels.nonparametric.smoothers_lowess import lowess +import nomad.data as data_folder import nomad.io.base as loader import 
nomad.stop_detection.utils as utils +import nomad.stop_detection.dbscan as TADBSCAN import nomad.stop_detection.hdbscan as HDBSCAN +import nomad.stop_detection.grid_based as GRID_BASED import nomad.stop_detection.lachesis as LACHESIS -import nomad.stop_detection.dbscan as TADBSCAN -import nomad.stop_detection.grid_based as GRID_BASED # for oracle visits +import nomad.stop_detection.density_based as SEQSCAN +from nomad.stop_detection.validation import ( + AlgorithmRegistry, + bootstrap_metric_summary, + compute_stop_detection_metrics, + plot_metric_boxplots, + plot_metric_intervals, +) +from joblib import Parallel, delayed +import time +from nomad.traj_gen import Agent, Population -import nomad.visit_attribution as visits -import nomad.filters as filters -import nomad.city_gen as cg +import nomad.visit_attribution.visit_attribution as visits +from nomad.city_gen import City +from nomad.map_utils import blocks_to_mercator_gdf -from nomad.contact_estimation import overlapping_visits, compute_visitation_errors, precision_recall_f1_from_minutes -import pdb # %% -import nomad.city_gen as cg -city = cg.load('../garden-city.pkl') - -def plot_metrics_boxplots(df, metrics, - algo_order=None, colors=None, - figsize=(24, 5.5), save_path=None): - # --- normalise inputs ------------------------------------------------- - if algo_order is None: - # preserve appearance order in the DataFrame - algo_order = df.algorithm.drop_duplicates().tolist() - - if colors is None: - cmap = plt.colormaps.get_cmap('tab10') - colors = {a: cmap(i % cmap.N) for i, a in enumerate(algo_order)} - else: - # fill in any missing algorithm colour with the next Tab10 entry - cmap = plt.colormaps.get_cmap('tab10') - for i, a in enumerate(algo_order): - colors.setdefault(a, cmap(i % cmap.N)) - - # --- figure ----------------------------------------------------------- - fig, axes = plt.subplots(1, len(metrics), figsize=figsize, sharey=False) - - if len(metrics) == 1: # when only one metric is passed - axes = [axes] - - for ax, metric in zip(axes, metrics): - ax.set_facecolor('#EAEAF2') - # list of series, one per algorithm - data = [df.loc[df.algorithm == a, metric].dropna() for a in algo_order] - - bp = ax.boxplot(data, - positions=range(len(algo_order)), - patch_artist=True, - widths=0.4, - whis=(5, 95), - showfliers=False, - medianprops={'color': 'black', 'linewidth': 0.6}) - - for box, alg in zip(bp['boxes'], algo_order): - box.set_facecolor(colors[alg]) - - ax.grid(axis='y', color='darkgray', linestyle='--', linewidth=0.8) - ax.yaxis.set_major_locator(MaxNLocator(nbins=5)) - ax.set_title(metric.replace('_', ' ').title(), fontsize=16) - ax.set_xticks([]) - - # legend - handles = [plt.matplotlib.patches.Patch(color=colors[a], label=a) - for a in algo_order] - fig.legend(handles, algo_order, - loc='lower center', - ncol=len(algo_order), - bbox_to_anchor=(0.5, -0.015), - fontsize=15) - - plt.subplots_adjust(bottom=0.1, top=0.95) - - if save_path: - fig.savefig(f'{save_path}.png', dpi=300) - fig.savefig(f'{save_path}.svg') - - plt.show() +data_dir = Path(data_folder.__file__).parent +city = City.from_geopackage(data_dir / "garden-city.gpkg") def classify_building_size_from_id(building_id): - building = city.buildings.get(building_id) + building = city.buildings_df.loc[building_id] n_blocks = len(building.blocks) if n_blocks == 1: return 'small' @@ -125,7 +65,7 @@ def classify_building_size_from_id(building_id): return 'big' def classify_building_type_from_id(building_id): - building = city.buildings.get(building_id) + building = 
city.buildings_df.loc[building_id] return building.building_type def classify_dwell(duration): @@ -138,388 +78,309 @@ def classify_dwell(duration): # %% -traj_cols = {'user_id':'uid', 'x':'x', 'y':'y', 'timestamp':'timestamp'} -poi_table = gpd.read_file('../garden_city.gpkg') - -poi_table = poi_table.rename({'type':'building_type'}, axis=1) -poi_table['building_size'] = poi_table['building_id'].apply(classify_building_size_from_id) +_GEN_N = 250 +_GEN_SEED = 2025 +_GEN_START = pd.Timestamp("2024-06-01T00:00:00-04:00") +_GEN_END = pd.Timestamp("2024-06-08T00:00:00-04:00") +_DATA_DIR = Path(data_folder.__file__).parent + +_sparse_path = Path("robustness-of-algorithms/sparse_traj_2") +_diaries_path = Path("robustness-of-algorithms/diaries_2") +_homes_path = Path("robustness-of-algorithms/homes_2") + +_rng = np.random.default_rng(_GEN_SEED) +HA_LOWER_BOUND = 8/15 # blocks; matches nomad.traj_gen._sample_horizontal_noise lower bound for pareto_prior +_PARAMS = { + "beta_ping": _rng.uniform(3, 12, _GEN_N).tolist(), + "beta_start": _rng.uniform(50, 500, _GEN_N).tolist(), + "beta_durations": _rng.uniform(30, 300, _GEN_N).tolist(), + "ha": _rng.uniform(HA_LOWER_BOUND + 1e-9, 5, _GEN_N).tolist(), +} + +def generate_agent_trajectory(args): + """Worker function for parallel generation.""" + identifier, home, work, seed, beta_ping, beta_start, beta_durations, ha = args + + city = City.from_geopackage(_DATA_DIR / "garden-city.gpkg") + city._build_hub_network(hub_size=16) + city.compute_gravity(exponent=2.0) + city.compute_shortest_paths(callable_only=True) + + agent = Agent(identifier=identifier, city=city, home=home, workplace=work) + agent.generate_trajectory(datetime=_GEN_START, end_time=_GEN_END, seed=seed) + agent.sample_trajectory( + beta_ping=beta_ping, beta_start=beta_start, beta_durations=beta_durations, + ha=ha, seed=seed, replace_sparse_traj=True, + ) -diaries_df = loader.from_file("../../nomad/data/diaries", format="parquet", **traj_cols) -diaries_df = diaries_df.rename({'location':'building_id'}, axis=1) -diaries_df = diaries_df.merge(poi_table[['building_id', 'building_size', 'building_type']], on='building_id', how='left') + sparse_df = agent.sparse_traj.copy() + sparse_df['user_id'] = identifier + sparse_df['home'] = home + sparse_df['workplace'] = work + + diary_df = agent.diary.copy() + diary_df['user_id'] = identifier + + return sparse_df, diary_df + +if _sparse_path.exists() and _diaries_path.exists(): + print("Data already exists — skipping generation.") +else: + _city = City.from_geopackage(_DATA_DIR / "garden-city.gpkg") + homes = _city.buildings_gdf[_city.buildings_gdf['building_type'] == 'home']['id'].to_numpy() + workplaces = _city.buildings_gdf[_city.buildings_gdf['building_type'] == 'workplace']['id'].to_numpy() + + agent_params = [ + (f'agent_{i:04d}', + _rng.choice(homes), + _rng.choice(workplaces), + i, + _PARAMS['beta_ping'][i], + _PARAMS['beta_start'][i], + _PARAMS['beta_durations'][i], + _PARAMS['ha'][i]) + for i in range(_GEN_N) + ] -diaries_df.loc[~diaries_df.building_id.isna(),'dwell_length'] = diaries_df.loc[~diaries_df.building_id.isna(),'duration'].apply(classify_dwell) -diaries_df['building_id'] = diaries_df['building_id'] -sparse_df = loader.from_file("../../nomad/data/sparse_traj/", format="parquet", **traj_cols) + print(f"Generating {_GEN_N} agents in parallel...") + start_time = time.time() -# %% [markdown] -# ## Analyze completeness + results = Parallel(n_jobs=-1, verbose=10)( + delayed(generate_agent_trajectory)(params) for params in agent_params + ) -# %% 
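+    # joblib's Parallel returns results in input order, so results[i] matches
+    # agent_params[i]; the zip(results, agent_params) pairing below relies on this.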
-completeness_df = filters.q_stats(sparse_df, traj_cols=traj_cols) + generation_time = time.time() - start_time + print(f"Generated {_GEN_N} agents in {generation_time:.2f}s ({generation_time / _GEN_N:.2f}s per agent)") + + population = Population(_city) + for (sparse_df, diary_df), params in zip(results, agent_params): + identifier, home, work = params[0], params[1], params[2] + agent = Agent(identifier=identifier, city=_city, home=home, workplace=work) + agent.sparse_traj = sparse_df.drop(columns=['home', 'workplace']) + agent.diary = diary_df + population.add_agent(agent, verbose=False) + + poi_data = pd.DataFrame({ + 'building_id': _city.buildings_gdf['id'].values, + 'x': (_city.buildings_gdf['door_cell_x'].astype(float) + 0.5).values, + 'y': (_city.buildings_gdf['door_cell_y'].astype(float) + 0.5).values, + }) + population.reproject_to_mercator(sparse_traj=True, diaries=True, poi_data=poi_data) + + population.save_pop( + sparse_path=str(_sparse_path), + diaries_path=str(_diaries_path), + homes_path=str(_homes_path), + beta_ping=_PARAMS['beta_ping'], + beta_start=_PARAMS['beta_start'], + beta_durations=_PARAMS['beta_durations'], + ha=_PARAMS['ha'], + ) + del _city, population, results + print(f"Generated {_GEN_N} agents in {generation_time:.2f}s -> {_diaries_path} / {_sparse_path}") # %% -plt.figure(figsize=(4,3)) -ax = sns.kdeplot( - data=completeness_df, - x="q_stat", - fill=True, - linewidth=1.5, - bw_adjust=0.25 +# ── DATA LOADING ────────────────────────────────────────────────────────────── +poi_table = gpd.read_file(data_dir / "garden-city.gpkg", layer='buildings') +poi_table = poi_table.rename({'type': 'building_type'}, axis=1) +# Project from local grid units → EPSG:3857 meters to match the saved sparse trajectories +poi_table = blocks_to_mercator_gdf( + poi_table, + block_size=city.block_side_length, + false_easting=city.web_mercator_origin_x, + false_northing=city.web_mercator_origin_y, + drop_garden_cols=False, ) +poi_table['building_size'] = poi_table['id'].apply(classify_building_size_from_id) -# cosmetics -ax.set_xlabel("q (Trajectory Completeness)") -ax.set_ylabel("Density") -ax.grid(True, alpha=0.3) -sns.despine(top=True, right=True) - -# annotation (top-right corner in axes coords) -ax.text( - 0.99, 0.95, - f"N = {len(completeness_df)}", - transform=ax.transAxes, - ha="right", - va="top", - fontsize=9 +diaries_df = loader.from_file("robustness-of-algorithms/diaries_2", format="parquet") +diaries_df = diaries_df.rename({'location': 'id'}, axis=1) +diaries_df = diaries_df.merge(poi_table[['id', 'building_size', 'building_type']], on='id', how='left') +diaries_df.loc[~diaries_df.id.isna(), 'dwell_length'] = ( + diaries_df.loc[~diaries_df.id.isna(), 'duration'].apply(classify_dwell) ) -plt.tight_layout() -plt.savefig("q_stat_density.svg", format="svg", bbox_inches="tight") -plt.savefig("q_stat_density.png", format="svg", bbox_inches="tight") -plt.show() - +sparse_df = loader.from_file("robustness-of-algorithms/sparse_traj_2", format="parquet") -# %% [markdown] -# ## Execution for all users # %% -## TRANSFORMATION FUNCTIONS -# Use this for steps that should pass a generic DataFrame through -def no_op_df(data, **kwargs): - return data - -# Use this for steps that should pass the 'stops' DataFrame through -def no_op_stops(stops, **kwargs): - return stops - -# Pre-processing snippets -def prejoin_oracle_map(data, diary, **kwargs): - location = visits.oracle_map(data, diary, timestamp='timestamp', location_id='building_id') - return data.join(location) +# ── PREPROCESSING / 
POST-PROCESSING FUNCTIONS ───────────────────────────────── -# Stop summarization snippets -summarize_stops_with_loc = partial(utils.summarize_stop, x='x', y='y', keep_col_names=False, passthrough_cols=['building_id'], complete_output=True) +def no_op(value, *_args, **_kwargs): + return value -def postjoin_poly_map(data, **kwargs): - # This is a post-labeling step, before stop creation - location = visits.point_in_polygon(data=data, poi_table=poi_table, method='majority', data_crs='EPSG:3857', - max_distance=12, cluster_label='cluster', location_id='building_id', x='x', y='y') +def prejoin_oracle_map(data, diary): + location = visits.oracle_map(data, diary, timestamp='timestamp', location_id='id') return data.join(location) -# Special fix/adjustment snippets -def pad_oracle_stops(stops, **kwargs): - return utils.pad_short_stops(stops, pad=5, dur_min=0, start_timestamp='start_timestamp') - -def pad_oracle_stops_long(stops, **kwargs): - return utils.pad_short_stops(stops, pad=15, dur_min=4, start_timestamp='start_timestamp') - -# Each row defines the unique workflow for one algorithm -algo_params = pd.DataFrame([ - { - 'algo': 'ta-hdbscan', - 'func': HDBSCAN.hdbscan_labels, - 'params': {'time_thresh': 240, 'min_pts': 3, 'min_cluster_size': 1, 'include_border_points': True}, - 'pre_process_func': no_op_df, # Correct: Passes 'data' - 'post_labels_func': postjoin_poly_map, - 'special_fix_func': no_op_stops, - }, - { - 'algo': 'oracle', - 'func': GRID_BASED.grid_based_labels, - 'params': {'time_thresh': 600, 'min_pts': 0, 'location_id': 'building_id'}, - 'pre_process_func': prejoin_oracle_map, - 'post_labels_func': no_op_df, # Correct: Passes 'data' - 'special_fix_func': pad_oracle_stops_long, - }, - { - 'algo': 'tadbscan_coarse', - 'func': TADBSCAN._temporal_dbscan_labels, - 'params': {'time_thresh': 240, 'min_pts': 2, 'dist_thresh': 30}, - 'pre_process_func': no_op_df, # Correct - 'post_labels_func': postjoin_poly_map, - 'special_fix_func': no_op_stops, # Correct - }, - { - 'algo': 'tadbscan_fine', - 'func': TADBSCAN._temporal_dbscan_labels, - 'params': {'time_thresh': 120, 'min_pts': 3, 'dist_thresh': 20}, - 'pre_process_func': no_op_df, # Correct - 'post_labels_func': postjoin_poly_map, - 'special_fix_func': no_op_stops, # Correct - }, - { - 'algo': 'lachesis_coarse', - 'func': LACHESIS._lachesis_labels, - 'params': {'dt_max': 240, 'delta_roam': 40}, - 'pre_process_func': no_op_df, # Correct: Passes 'data' - 'post_labels_func': postjoin_poly_map, - 'special_fix_func': no_op_stops, # Correct: Passes 'stops' - }, - { - 'algo': 'lachesis_fine', - 'func': LACHESIS._lachesis_labels, - 'params': {'dt_max': 120, 'delta_roam': 25}, - 'pre_process_func': no_op_df, # Correct - 'post_labels_func': postjoin_poly_map, - 'special_fix_func': no_op_stops, # Correct - } -]) - -## METRICS FUNCTIONS -def compute_all_metrics(stops, truth, poi_table, user, algo): - """ - Calculate all metrics. One expensive overlap on *raw* stops for - error rates, a second (gap-filled) overlap for precision/recall. 
- """ - - stops_err = stops.fillna({'building_id': 'Street'}) - - overlaps_err = overlapping_visits( - left=stops_err, - right=truth.dropna(), - match_location=False, - location_id='building_id' - ) +summarize_stops_with_loc = partial( + utils.summarize_stop, x='x', y='y', + keep_col_names=True, passthrough_cols=['id'], complete_output=True, + timestamp='timestamp', +) - # 2) PRF overlaps on raw stops (no gap fill at all) - # truth we still label Street so they appear in the join step - stops_prf = stops.fillna({'building_id': 'Street'}) - truth_prf = truth.copy().fillna({'building_id': 'Street'}) - - overlaps_prf = overlapping_visits( - left=stops_prf, - right=truth_prf, - match_location=False, - location_id='building_id' +def postjoin_poly_map(data): + if not isinstance(data, gpd.GeoDataFrame): + geometry = [Point(xy) for xy in zip(data['x'], data['y'])] + data_gdf = gpd.GeoDataFrame(data, geometry=geometry, crs='EPSG:3857') + else: + data_gdf = data + if data_gdf.crs is None: + data_gdf = data_gdf.set_crs('EPSG:3857', allow_override=True) + location = visits.point_in_polygon( + data=data_gdf, poi_table=poi_table, method='majority', + max_distance=12, cluster_label='cluster', location_id='id', + x='x', y='y', data_crs='EPSG:3857', ) + return data.join(location) - # subset to *matching* POI rows - prf_overlaps = overlaps_prf[ - overlaps_prf['building_id_left'] == overlaps_prf['building_id_right'] - ] +def pad_oracle_stops_long(stops): + return utils.pad_short_stops(stops, pad=15, dur_min=4, timestamp='timestamp') - # 3) global PRF from minute‐totals - total_pred = stops_prf['duration'].sum() - total_truth = truth_prf['duration'].sum() - tp = prf_overlaps['duration'].sum() - prf1 = precision_recall_f1_from_minutes(total_pred, total_truth, tp) - - # 4) error metrics - errors = compute_visitation_errors( - overlaps=overlaps_err, - true_visits=truth.dropna(), - location_id='building_id' - ) - results = [ - {'metric_category': 'general', - 'category_value': 'all', - 'total_pred':total_pred, - **prf1, **errors} - ] +# ── PRE/POST PIPELINE — keyed by family name ────────────────────────────────── + +pipeline = { + 'ta-hdbscan': {'pre': no_op, 'post': postjoin_poly_map, 'fix': no_op}, + 'oracle': {'pre': prejoin_oracle_map, 'post': no_op, 'fix': pad_oracle_stops_long}, + 'tadbscan_coarse': {'pre': no_op, 'post': postjoin_poly_map, 'fix': no_op}, + 'tadbscan_fine': {'pre': no_op, 'post': postjoin_poly_map, 'fix': no_op}, + 'lachesis_coarse': {'pre': no_op, 'post': postjoin_poly_map, 'fix': no_op}, + 'lachesis_fine': {'pre': no_op, 'post': postjoin_poly_map, 'fix': no_op}, + 'seqscan': {'pre': no_op, 'post': postjoin_poly_map, 'fix': no_op}, +} + + +registry = AlgorithmRegistry() + +registry.add_algorithm(SEQSCAN.seqscan_labels, family='seqscan', dist_thresh=30, time_thresh=120) + +registry.add_algorithm(HDBSCAN.hdbscan_labels, family='ta-hdbscan', + time_thresh=240, min_pts=3, min_cluster_size=1, include_border_points=True) +registry.add_algorithm(GRID_BASED.grid_based_labels, family='oracle', + time_thresh=600, min_pts=0, location_id='id') +registry.add_algorithm(TADBSCAN.ta_dbscan_labels, family='tadbscan_coarse', + time_thresh=240, min_pts=2, dist_thresh=30) +registry.add_algorithm(TADBSCAN.ta_dbscan_labels, family='tadbscan_fine', + time_thresh=120, min_pts=3, dist_thresh=20) +registry.add_algorithm(LACHESIS.lachesis_labels, family='lachesis_coarse', + dt_max=240, delta_roam=40) +registry.add_algorithm(LACHESIS.lachesis_labels, family='lachesis_fine', + dt_max=120, delta_roam=25) + 
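+# Contract assumed by the evaluation loop below: iterating `registry` yields one
+# dict per configuration (keys include 'family' and 'algorithm'), every 'family'
+# needs a matching entry in `pipeline`, and `registry.time_call(...)` runs the
+# labeler while appending a timing readable as `registry._timings[-1]['elapsed_s']`.
+# A minimal sketch of that pattern:
+#
+#     for algo in registry:
+#         steps = pipeline[algo['family']]
+#         labels = registry.time_call(algo, some_pings_df, timestamp='timestamp')
+#
+# (`some_pings_df` is a placeholder name, not defined in this script.)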
+print(f"Registry: {len(registry)} algorithm configurations") + +# %% +# ── METRICS FUNCTION ────────────────────────────────────────────────────────── +def compute_all_metrics(stops, truth, user, algo): + gen = compute_stop_detection_metrics( + stops, + truth, + algorithm=algo, + prf_only=False, + location_id='id', + timestamp='timestamp', + ) + gen.update({'user': user, 'metric_category': 'general', 'category_value': 'all'}) + gen.pop('user_id', None) + results = [gen] - # 5) per-category PRF + error for category in ['building_size', 'building_type', 'dwell_length']: for val in truth[category].dropna().unique(): - truth_sub = truth[(truth[category] == val)&(truth['building_id']!='Street')] - # restrict overlaps to any prediction *overlapping* these truth stops - if category == 'dwell_length': # slice by stop itself - mask_err = overlaps_err['timestamp_right'].isin(truth_sub['timestamp']) - mask_prf = overlaps_prf['timestamp_right'].isin(truth_sub['timestamp']) - else: # slice by building - mask_err = overlaps_err['building_id_right'].isin(truth_sub['building_id']) - mask_prf = overlaps_prf['building_id_right'].isin(truth_sub['building_id']) - - - ov_err = overlaps_err[mask_err] - ov_prf = overlaps_prf[mask_prf] - prf_ov = ov_prf[ov_prf['building_id_left'] == ov_prf['building_id_right']] - - # per-category PRF from minutes in these slices - tp_cat = prf_ov['duration'].sum() - pred_cat = ov_err['duration'].sum() - truth_cat = truth_sub['duration'].sum() - prf1_cat = precision_recall_f1_from_minutes(pred_cat, truth_cat, tp_cat) - - # per-category error on the same slice of truth - - err_cat = compute_visitation_errors( - overlaps=ov_err, - true_visits=truth_sub, - location_id='building_id' + truth_sub = truth[truth[category] == val] + cat = compute_stop_detection_metrics( + stops, + truth_sub, + algorithm=algo, + prf_only=False, + location_id='id', + timestamp='timestamp', ) - - results.append({ - 'metric_category': category, - 'category_value': val, - **prf1_cat, **err_cat - }) - - # tag with user & algo - for rec in results: - rec.update({'user': user, 'algorithm': algo}) + cat.update({'user': user, 'metric_category': category, 'category_value': val}) + cat.pop('user_id', None) + results.append(cat) return results + # %% [markdown] # ### OPTIMIZED LOOP # %% -results_list = [] - -for user in tqdm(diaries_df.uid.unique(), desc='Processing users'): - for _, row in algo_params.iloc.iterrows(): - - # PRE - sparse = sparse_df[sparse_df['uid'] == user].copy() - truth = diaries_df[diaries_df['uid'] == user].copy() - processed_sparse = row.pre_process_func(data=sparse, diary=truth) - - # ALGORITHM - start_time = time.time() - labels = row.func(processed_sparse, **row.params, traj_cols=traj_cols) - execution_time = time.time() - start_time - - # POST - data_with_clusters = processed_sparse.join(labels) - data_with_locations = row.post_labels_func(data=data_with_clusters) - stops = data_with_locations[data_with_locations.cluster != -1].groupby('cluster', as_index=False).apply(summarize_stops_with_loc, include_groups=False) - stops = row.special_fix_func(stops=stops, data_with_clusters=data_with_locations, params=row.params) - - # METRICS - metrics = compute_all_metrics(stops, truth, poi_table, user, row.algo) - - # Add execution time to the first metric entry (the 'general' one) - if metrics: - metrics[0]['execution_time'] = execution_time - - results_list.extend(metrics) - -results_df = pd.DataFrame(results_list) +results_rows = [] -print("Processing Complete!") +for user in 
tqdm(diaries_df.user_id.unique()[:10], desc='Processing users'): + user_sparse = sparse_df[sparse_df['user_id'] == user].copy() + user_truth = diaries_df[diaries_df['user_id'] == user].copy() -# %% -try: - results_df = pd.read_csv("results.csv") -except: - results_df.to_csv('results.csv', index=False) + for algo in registry: + algorithm = algo["family"] + steps = pipeline[algorithm] -# %% -general_metrics_df = results_df[results_df['metric_category'] == 'general'].copy() + sparse_for_algo = steps["pre"](user_sparse, user_truth) + labels = registry.time_call(algo, sparse_for_algo, timestamp='timestamp') -# %% -# BOOTSTRAPPING GROUPBY -agg_keys = ['missed_fraction','merged_fraction','split_fraction','precision','recall','f1'] -ua = general_metrics_df.groupby(['user','algorithm'], as_index=False)[agg_keys].median() -users = ua['user'].unique() -output = [] -for _ in range(2000): - draw = np.random.choice(users, size=len(users), replace=True) - bs = ua[ua.user.isin(draw)] - output.append(bs.groupby('algorithm', as_index=False)[agg_keys].median()) -metrics_bootstrap_df = pd.concat(output, ignore_index=True) + clustered = sparse_for_algo.join(labels) + located = steps["post"](clustered) + stops = located.loc[located['cluster'] != -1].groupby( + 'cluster', as_index=False + ).apply(summarize_stops_with_loc, include_groups=False) + stops = steps["fix"](stops) -# %% -# # For debugging!!! -# metrics = ['missed_fraction', 'merged_fraction', 'split_fraction', 'precision', 'recall', 'f1'] -# plot_metrics_boxplots(metrics_bootstrap_df, metrics, algo_order=None, colors=None, save_path=None) + metric_rows = compute_all_metrics(stops, user_truth, user, algorithm) + elapsed_s = registry._timings[-1]['elapsed_s'] + for row in metric_rows: + row['execution_time'] = elapsed_s + results_rows.extend(metric_rows) -# %% [markdown] -# ### Plot boostrapped per-stop metrics for general trajectory +results_df = pd.DataFrame(results_rows) +print("Processing Complete!") # %% -# base colors for coarse versions -shade = lambda c,f=0.6: mcolors.to_hex(tuple(f*x for x in mcolors.to_rgb(c))) -raw = {'oracle':'royalblue','ta-hdbscan':'darkorange','lachesis_coarse':'palevioletred','tadbscan_coarse':'limegreen'} +results_df[results_df['category_value'] == 'all'] -_base = {k:mcolors.to_hex(mcolors.to_rgb(v)) for k,v in raw.items()} - -algo_order = ['oracle','ta-hdbscan','lachesis_coarse','tadbscan_coarse','lachesis_fine', 'tadbscan_fine'] -colors = {a:(_base[a] if a in _base else shade(_base[a.split('_')[0]+'_coarse'])) for a in algo_order} +# %% +general_metrics_df = results_df[results_df['metric_category'] == 'general'].copy() # %% metrics = ['missed_fraction', 'merged_fraction', 'split_fraction', 'precision', 'recall', 'f1'] -plot_metrics_boxplots(metrics_bootstrap_df, metrics, algo_order=algo_order, colors=colors, save_path='errors_per_stop') +general_plot_df = bootstrap_metric_summary( + general_metrics_df, + metrics=metrics, + unit_col='user', + group_col='algorithm', + n_boot=2000, + interval=(0.05, 0.95), + random_state=2025, +) +# %% [markdown] +# ### Plot per-user metric distributions for general trajectory # %% -# ylabels = {m:f"% {m.split('_')[0]}" for m in ['precision','recall','f1']} -# metrics = ['precision', 'recall', 'f1'] -# plot_metrics_boxplots( metrics_bootstrap_df, metrics, algo_order=None, colors=None, save_path='errors_per_stop') +algo_order = ['oracle','ta-hdbscan','lachesis_coarse','tadbscan_coarse','lachesis_fine', 'tadbscan_fine'] +algorithm_groups = {algo['family']: algo['algorithm'] for algo in 
registry} # %% -### Plot pr, rec, f1 per each group in 4 subplots. +plot_metric_boxplots( + general_metrics_df, + metrics, + group_order=algo_order, + group_families=algorithm_groups, + save_path=Path('examples/research/errors_per_stop_boxplots'), +) # %% [markdown] -# ### Plot of F1 scores for each q +# ### Plot bootstrapped median metrics for general trajectory # %% -def plot_f1_vs_q(general_df, completeness_df, save_path=None): - """Three-panel plot: oracle vs hdbscan / tadbscan_coarse / lachesis_coarse.""" - - # --- palette & comparison pairs -------------------------------------- - pairs = [('oracle', 'ta-hdbscan'), - ('oracle', 'tadbscan_coarse'), - ('oracle', 'lachesis_coarse')] - - colors = {'oracle': 'royalblue', - 'ta-hdbscan': 'darkorange', - 'tadbscan_coarse': 'limegreen', - 'lachesis_coarse': 'palevioletred'} - - # --- merge: just rename q_stat -> q ---------------------------------- - comp = completeness_df.rename(columns={'q_stat': 'q'})[['uid', 'q']] - df = general_df.merge(comp, left_on='user', right_on='uid', how='left') - - # --- figure ---------------------------------------------------------- - fig, axes = plt.subplots(1, 3, figsize=(13, 2.75), sharey=True) - - for ax, (a1, a2) in zip(axes, pairs): - ax.set_facecolor('#EAEAF2') # seaborn-like bg - - for alg in (a1, a2): - sub = df[df.algorithm == alg].dropna(subset=['q']).sort_values('q') - if sub.empty: - continue - sm = lowess(sub['f1'], sub['q'], frac=0.6) - ax.plot(sm[:, 0], sm[:, 1], color=colors[alg], lw=2) - ax.scatter(sub['q'], sub['f1'], color=colors[alg], s=6, alpha=0.25) - - ax.grid(axis='y', color='darkgray', linestyle='--', lw=0.8) - ax.yaxis.set_major_locator(MaxNLocator(nbins=5)) - ax.set_xlabel("q (trajectory completeness)") - ax.set_xticks(np.linspace(0, 1, 6)) - ax.tick_params(axis='both', labelsize=10) - - axes[0].set_ylabel("F1 score") - - # --- single legend --------------------------------------------------- - handles = [plt.matplotlib.patches.Patch(color=colors[a], label=a) for a in colors] - fig.legend(handles, colors, loc='lower center', - ncol=len(colors), bbox_to_anchor=(0.5, -0.12)) - - plt.tight_layout() - - if save_path: - fig.savefig(f"{save_path}.png", dpi=300, bbox_inches='tight') - fig.savefig(f"{save_path}.svg", bbox_inches='tight') - - plt.show() - -plot_f1_vs_q(general_metrics_df, completeness_df, - save_path="f1_vs_q") +plot_metric_intervals( + general_plot_df, + metrics, + group_order=algo_order, + group_families=algorithm_groups, + save_path=Path('examples/research/errors_per_stop_medians'), +) # %% [markdown] # ### Metrics for each category value @@ -538,5 +399,3 @@ def plot_f1_vs_q(general_df, completeness_df, save_path=None): table_results = table_results.groupby(['metric_category', 'category_value', 'algorithm'], as_index=True)[['f1', 'f1_as_pct_orac']].median().round(2) table_results.to_csv('results_by_category.csv', index=True) table_results - -# %% diff --git a/examples/research/robustness-of-algorithms/diaries_2/part-0.parquet b/examples/research/robustness-of-algorithms/diaries_2/part-0.parquet new file mode 100644 index 00000000..1196934f Binary files /dev/null and b/examples/research/robustness-of-algorithms/diaries_2/part-0.parquet differ diff --git a/examples/research/robustness-of-algorithms/exp_1_merge_vs_beta_ping.ipynb b/examples/research/robustness-of-algorithms/exp_1_merge_vs_beta_ping.ipynb index 5c65c90e..f3d87fbd 100644 --- a/examples/research/robustness-of-algorithms/exp_1_merge_vs_beta_ping.ipynb +++ 
b/examples/research/robustness-of-algorithms/exp_1_merge_vs_beta_ping.ipynb @@ -33,7 +33,7 @@ "import nomad.city_gen as cg\n", "\n", "from nomad.map_utils import blocks_to_mercator_gdf\n", - "from nomad.contact_estimation import compute_stop_detection_metrics" + "from nomad.stop_detection.validation import compute_stop_detection_metrics" ] }, { diff --git a/examples/research/robustness-of-algorithms/exp_1_merge_vs_beta_ping.py b/examples/research/robustness-of-algorithms/exp_1_merge_vs_beta_ping.py index fbbf45de..b918c50b 100644 --- a/examples/research/robustness-of-algorithms/exp_1_merge_vs_beta_ping.py +++ b/examples/research/robustness-of-algorithms/exp_1_merge_vs_beta_ping.py @@ -40,7 +40,7 @@ import nomad.city_gen as cg from nomad.map_utils import blocks_to_mercator_gdf -from nomad.contact_estimation import compute_stop_detection_metrics +from nomad.stop_detection.validation import compute_stop_detection_metrics # %% diff --git a/examples/research/robustness-of-algorithms/generate_configs.py b/examples/research/robustness-of-algorithms/generate_configs.py index c4372b20..c55d9c5c 100644 --- a/examples/research/robustness-of-algorithms/generate_configs.py +++ b/examples/research/robustness-of-algorithms/generate_configs.py @@ -159,6 +159,37 @@ def main(): }, } + ###################################################################### + # Config family: random general trajectories + # Agents receive random home/workplace; destinations are generated by + # the EPR gravity model (no fixed diary). Intended as the main + # multi-stop, multi-user dataset for validation experiments. + ###################################################################### + N_random = 250 + beta_ping_random = np.linspace(4, 9, N_random).tolist() + + registry["random_general"] = { + "N": N_random, + "name_count": 3, + "name_seed": 2025, + "city_file": str(data_dir / "garden-city.gpkg"), + "start_time": "2024-06-01T00:00:00-04:00", + "end_time": "2024-06-08T00:00:00-04:00", + "epr_time_res": 15, + "dt": 1, + "output_files": { + "sparse_path": str(output_dir / "sparse_traj_2"), + "diaries_path": str(output_dir / "diaries_2"), + "homes_path": str(output_dir / "homes_2"), + }, + "agent_params": { + "seed_trajectory": list(range(N_random)), + "seed_sparsity": list(range(N_random)), + "beta_ping": beta_ping_random, + "ha": 3 / 4, + }, + } + ###################################################################### # Config family: two-stop # Small comparisons and prototypes. diff --git a/examples/research/robustness-of-algorithms/homes_2/part-0.parquet b/examples/research/robustness-of-algorithms/homes_2/part-0.parquet new file mode 100644 index 00000000..bf907785 Binary files /dev/null and b/examples/research/robustness-of-algorithms/homes_2/part-0.parquet differ diff --git a/examples/research/robustness-of-algorithms/random_general_trajectories.py b/examples/research/robustness-of-algorithms/random_general_trajectories.py new file mode 100644 index 00000000..53b13c1e --- /dev/null +++ b/examples/research/robustness-of-algorithms/random_general_trajectories.py @@ -0,0 +1,111 @@ +""" +Generate random general trajectories for robustness-of-algorithms experiments. + +Differences from four_stop_trajectories_exp1: + - Each agent gets a randomly-assigned home and workplace (no fixed diary). + - Destinations are produced by the EPR gravity model, giving each agent a + unique multi-day itinerary across all building types. 
+ - beta_ping is swept linearly across agents to produce a spread of sparsity + levels (q values) within the same population. + +Reads: config_random_general.json (produced by generate_configs.py) +Writes: sparse_traj_2/ — sampled sparse trajectories (Parquet) + diaries_2/ — ground-truth visit diaries (Parquet) + homes_2/ — per-agent home/workplace table (Parquet) + +Usage: + python generate_configs.py random_general # write config + python random_general_trajectories.py # generate data +""" + +import json +from pathlib import Path + +import numpy as np +import pandas as pd +from tqdm import tqdm + +from nomad.city_gen import City +from nomad.traj_gen import Population + + +def main(): + config_file = Path(__file__).resolve().parent / "config_random_general.json" + with open(config_file, "r", encoding="utf-8") as f: + config = json.load(f) + + par = config["agent_params"] + out = config["output_files"] + + # ── City setup ───────────────────────────────────────────────────────── + # EPR destination generation requires city.grav; _build_hub_network and + # compute_gravity are called explicitly in case the saved geopackage does + # not already contain pre-computed gravity data. + city = City.from_geopackage(config["city_file"]) + city._build_hub_network(hub_size=16) + if city.grav is None: + city.compute_gravity(exponent=2.0, use_proxy_hub_distance=True) + city.compute_shortest_paths(callable_only=True) + + # POI coordinates used when reprojecting diaries to Mercator. + poi_data = pd.DataFrame({ + "building_id": city.buildings_gdf["id"].values, + "x": city.buildings_gdf["door_point"].apply(lambda p: p[0]).values, + "y": city.buildings_gdf["door_point"].apply(lambda p: p[1]).values, + }) + + start_time = pd.Timestamp(config["start_time"]) + end_time = pd.Timestamp(config["end_time"]) + + # ── Population ───────────────────────────────────────────────────────── + # home=None and workplace=None (defaults) → each Agent randomly samples + # its own home and workplace from the city's building catalogue, seeded + # by the agent's individual seed so results are reproducible. + population = Population(city) + population.generate_agents( + N=config["N"], + seed=config["name_seed"], + name_count=config["name_count"], + ) + + beta_ping = par["beta_ping"] # list of N values (linspace 4→9) + + for i, agent in enumerate(tqdm(population.roster.values(), desc="Generating trajectories")): + # Full trajectory via EPR model. Passing datetime as a kwarg + # sets the agent's initial position timestamp to start_time. 
+ agent.generate_trajectory( + datetime=start_time, + end_time=end_time, + epr_time_res=config["epr_time_res"], + dt=config["dt"], + seed=par["seed_trajectory"][i], + ) + + agent.sample_trajectory( + beta_ping=beta_ping[i] if isinstance(beta_ping, list) else beta_ping, + ha=par["ha"], + seed=par["seed_sparsity"][i], + replace_sparse_traj=True, + ) + + # ── Reproject & save ─────────────────────────────────────────────────── + population.reproject_to_mercator( + sparse_traj=True, + full_traj=False, + diaries=True, + poi_data=poi_data, + ) + population.save_pop( + sparse_path=out["sparse_path"], + diaries_path=out["diaries_path"], + homes_path=out["homes_path"], + beta_ping=beta_ping, + ha=par["ha"], + ) + print(f"Saved {config['N']} agents") + print(f" sparse → {out['sparse_path']}") + print(f" diaries → {out['diaries_path']}") + + +if __name__ == "__main__": + main() diff --git a/examples/research/robustness-of-algorithms/sparse_traj_2/part-0.parquet b/examples/research/robustness-of-algorithms/sparse_traj_2/part-0.parquet new file mode 100644 index 00000000..5d04ced5 Binary files /dev/null and b/examples/research/robustness-of-algorithms/sparse_traj_2/part-0.parquet differ diff --git a/examples/research/robustness-of-algos-paper.ipynb b/examples/research/robustness-of-algos-paper.ipynb index 6435f4f9..f2499ef9 100644 --- a/examples/research/robustness-of-algos-paper.ipynb +++ b/examples/research/robustness-of-algos-paper.ipynb @@ -40,7 +40,8 @@ "from nomad.traj_gen import Agent, Population\n", "from nomad.generation.sparsity import gen_params_target_q\n", "import nomad.generation.viz as viz\n", - "from nomad.contact_estimation import overlapping_visits, compute_visitation_errors, compute_precision_recall_f1" + "from nomad.contact_estimation import overlapping_visits, compute_precision_recall_f1\n", + "from nomad.stop_detection.validation import compute_visitation_errors" ] }, { diff --git a/examples/research/robustness-of-algos-paper.py b/examples/research/robustness-of-algos-paper.py index 2bd45e0e..c6a4312c 100644 --- a/examples/research/robustness-of-algos-paper.py +++ b/examples/research/robustness-of-algos-paper.py @@ -45,7 +45,8 @@ from nomad.traj_gen import Agent, Population from nomad.generation.sparsity import gen_params_target_q import nomad.generation.viz as viz -from nomad.contact_estimation import overlapping_visits, compute_visitation_errors, compute_precision_recall_f1 +from nomad.contact_estimation import overlapping_visits, compute_precision_recall_f1 +from nomad.stop_detection.validation import compute_visitation_errors # %% # At some point, we should fix the warnings in the codebase, but for now we can ignore them. 
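For orientation, a minimal sketch of loading the regenerated datasets before running the registry loop above. Assumptions: the partitioned Parquet directories written by random_general_trajectories.py can be read with pandas' pyarrow-backed read_parquet, and the user_id column matches the one used in the analysis loop.

    import pandas as pd

    base = "examples/research/robustness-of-algorithms"
    sparse_df = pd.read_parquet(f"{base}/sparse_traj_2")   # sampled sparse pings
    diaries_df = pd.read_parquet(f"{base}/diaries_2")      # ground-truth visit diaries
    homes_df = pd.read_parquet(f"{base}/homes_2")          # per-agent home/workplace table
    print(sparse_df["user_id"].nunique(), "agents loaded")
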
diff --git a/nomad/contact_estimation.py b/nomad/contact_estimation.py index 5c1bb3dd..d65d758a 100644 --- a/nomad/contact_estimation.py +++ b/nomad/contact_estimation.py @@ -2,155 +2,180 @@ # Authors: Thomas Li and Francisco Barreras +import nomad.filters as filters import nomad.io.base as loader -import nomad.stop_detection.hdbscan as HDBSCAN -import nomad.stop_detection.utils as utils -import pandas as pd import numpy as np -import pdb +import pandas as pd + -def overlapping_visits(left, right, match_location=False, traj_cols=None, **kwargs): +def overlapping_visits(left, right, match_location=False, traj_cols=None, right_traj_cols=None, **kwargs): if len(left) == 0 or len(right) == 0: return pd.DataFrame() - + left = left.copy() right = right.copy() - _ = loader._parse_traj_cols(right.columns, traj_cols, kwargs) # for warning + input_traj_cols = traj_cols traj_cols = loader._parse_traj_cols(left.columns, traj_cols, kwargs) + if right_traj_cols is None: + right_schema_input = input_traj_cols + right_kwargs = kwargs + else: + right_schema_input = right_traj_cols + right_kwargs = {} + temp_traj_cols = loader._parse_traj_cols(right.columns, right_schema_input, right_kwargs) + right_schema_hint = "" + if right_traj_cols is None: + right_schema_hint = ( + " The shared traj_cols/kwargs mapping appears not to fit the right table. " + "Pass right_traj_cols for the right table, or make both tables use the same relevant " + "column names for time, duration, and location." + ) - # Has required columns - uid_key = traj_cols['user_id'] - loc_key = traj_cols['location_id'] - e_t_key = traj_cols['end_timestamp'] - - #check for keys - for df_name, df in {'left':left, 'right':right}.items(): - loader._has_time_cols(df.columns, traj_cols) - - end_col_present = loader._has_end_cols(df.columns, traj_cols) - duration_col_present = loader._has_duration_cols(df.columns, traj_cols) - if not (end_col_present or duration_col_present): - print(f"Missing required (end or duration) temporal columns for {df_name} visits dataframe in columns {df.columns}.") - - if uid_key in df.columns and not (df[uid_key].nunique() == 1): - raise ValueError("Each visits dataframe must have at most one unique user_id") - - if traj_cols['location_id'] not in df: + left_uid_key = traj_cols["user_id"] + right_uid_key = temp_traj_cols["user_id"] + left_loc_key = traj_cols["location_id"] + right_loc_key = temp_traj_cols["location_id"] + + loader._has_time_cols(left.columns, traj_cols) + if right_traj_cols is None: + time_keys = ["datetime", "start_datetime", "timestamp", "start_timestamp"] + if "timestamp" in kwargs or "start_timestamp" in kwargs: + time_keys = ["timestamp", "start_timestamp", "datetime", "start_datetime"] + if "datetime" in kwargs or "start_datetime" in kwargs: + time_keys = ["datetime", "start_datetime", "timestamp", "start_timestamp"] + + right_has_time = any( + temp_traj_cols[key] in right.columns + for key in time_keys + ) + if not right_has_time: raise ValueError( - "Could not find required location column in {}. The dataset must contain or map to " - "a location column'.".format(list(df.columns))) + "Could not find required temporal columns in {}. 
The dataset must contain or map to " + "at least one of 'datetime', 'timestamp', 'start_datetime', or 'start_timestamp'.".format( + list(right.columns) + ) + + right_schema_hint + ) + else: + loader._has_time_cols(right.columns, temp_traj_cols) - keep_uid = (uid_key in left.columns and uid_key in right.columns ) - if keep_uid: - same_id = (left[uid_key].iloc[0] == right[uid_key].iloc[0]) - uid = left[uid_key].iloc[0] - # Possibly different start time keys are valid - t_keys = [] - for df in [left, right]: - if traj_cols['timestamp'] in df.columns: - t_key = traj_cols['timestamp'] - elif traj_cols['start_timestamp'] in df.columns: - t_key = traj_cols['start_timestamp'] - else: - # TO DO: implement to timestamp conversion of datetime - raise ValueError('Overlap of visits with datetime64 objects not yet implemented') - - t_keys += [t_key] - if e_t_key not in df.columns: - df[e_t_key] = df[t_key] + df[traj_cols['duration']]*60 # will fail if t_key is datetime!! - - cols = [t_key, e_t_key, loc_key] - if keep_uid and not same_id: - cols = [uid_key, t_key, e_t_key, loc_key] - df.drop([col for col in df.columns if col not in cols], axis=1, inplace=True) - - # rename timekeys for now to avoid conflict - left.rename({t_keys[0]:t_keys[1]}, axis=1, inplace=True) - t_key = t_keys[1] # keep the t_key in right + if left_loc_key not in left.columns: + raise ValueError( + "Could not find required location column in {}. The dataset must contain or map to " + "a location column.".format(list(left.columns)) + ) + if right_loc_key not in right.columns: + raise ValueError( + "Could not find required location column in {}. The dataset must contain or map to " + "a location column.{}".format(list(right.columns), right_schema_hint if right_traj_cols is None else "") + ) - if match_location: - merged = left.merge(right, on=loc_key, suffixes=('_left','_right')) - else: - merged = left.merge(right, how='cross', suffixes=('_left','_right')) - - t_key_l = t_key+'_left' - t_key_r = t_key+'_right' - - cond = ((merged[t_key_l] < merged[e_t_key+'_right']) & - (merged[t_key_r] < merged[e_t_key+'_left'])) - merged = merged.loc[cond] + if left_uid_key in left.columns and not (left[left_uid_key].nunique() == 1): + raise ValueError("Each visits dataframe must have at most one unique user_id") + if right_uid_key in right.columns and not (right[right_uid_key].nunique() == 1): + raise ValueError("Each visits dataframe must have at most one unique user_id") - start_max = merged[[t_key_l, t_key_r]].max(axis=1) - end_min = merged[[e_t_key+'_left',e_t_key+'_right']].min(axis=1) + keep_uid = (left_uid_key in left.columns and right_uid_key in right.columns) + if keep_uid: + same_id = (left[left_uid_key].iloc[0] == right[right_uid_key].iloc[0]) + uid = left[left_uid_key].iloc[0] + else: + same_id = False - merged.drop([e_t_key+'_left', e_t_key+'_right'], axis=1) - merged[traj_cols['duration']] = ((end_min - start_max) // 60).astype(int) #output names use traj_cols - - if keep_uid and same_id: - merged[uid_key] = uid + left_t_name, left_use_datetime = loader._fallback_time_cols_dt(left.columns, input_traj_cols, kwargs) + left_t_key = traj_cols[left_t_name] + left_e_t_key = traj_cols["end_datetime" if left_use_datetime else "end_timestamp"] - return merged.reset_index(drop=True) + right_t_name, right_use_datetime = loader._fallback_time_cols_dt(right.columns, right_schema_input, right_kwargs) + right_t_key = temp_traj_cols[right_t_name] + right_e_t_key = temp_traj_cols["end_datetime" if right_use_datetime else "end_timestamp"] -def 
compute_visitation_errors(overlaps, true_visits, traj_cols=None, **kwargs): - """ - Return missed / merged / split fractions for a set of ground-truth stops - and an overlap table produced by `overlapping_visits`. - - The start-timestamp column in `true_visits` *must* be unique; if not, - a ValueError is raised. No gap filling is performed here: the caller - must supply already-filled tables where appropriate. - """ - - # 0 – make sure ground truth rows are valid - true_visits = true_visits.dropna() - - # 1 – decide the canonical start-time column once - t_key, _ = loader._fallback_time_cols_dt(true_visits.columns, traj_cols, kwargs) - - # 2 – check uniqueness of that column in truth - if true_visits[t_key].duplicated().any(): - dup_ts = true_visits.loc[true_visits[t_key].duplicated(), t_key].unique() + if not (loader._has_end_cols(left.columns, traj_cols) or loader._has_duration_cols(left.columns, traj_cols)): raise ValueError( - "Ground-truth stops share the same start time(s), which violates the " - "per-stop key assumption. Duplicated timestamps: " + repr(dup_ts) + "Missing required (end or duration) temporal columns for left visits dataframe in columns {}.".format( + list(left.columns) + ) + ) + if not (loader._has_end_cols(right.columns, temp_traj_cols) or loader._has_duration_cols(right.columns, temp_traj_cols)): + raise ValueError( + "Missing required (end or duration) temporal columns for right visits dataframe in columns {}.{}".format( + list(right.columns), + right_schema_hint if right_traj_cols is None else "", + ) ) - n_truth = len(true_visits) - # 3 – determine the canonical location column - loc_key = loader._parse_traj_cols(true_visits.columns, traj_cols, kwargs)['location_id'] + if left_e_t_key not in left.columns: + if left_use_datetime: + left[left_e_t_key] = left[left_t_key] + pd.to_timedelta(left[traj_cols["duration"]], unit="m") + else: + left[left_e_t_key] = left[left_t_key] + left[traj_cols["duration"]] * 60 + if right_e_t_key not in right.columns: + if right_use_datetime: + right[right_e_t_key] = right[right_t_key] + pd.to_timedelta(right[temp_traj_cols["duration"]], unit="m") + else: + right[right_e_t_key] = right[right_t_key] + right[temp_traj_cols["duration"]] * 60 - # 4 – required column names in the overlap table - t_left = f"{t_key}_left" - t_right = f"{t_key}_right" - loc_left = f"{loc_key}_left" - loc_right = f"{loc_key}_right" + if left_use_datetime: + left["temp_t_key"] = filters.to_timestamp(left[left_t_key]) + left["temp_e_t_key"] = filters.to_timestamp(left[left_e_t_key]) + else: + left["temp_t_key"] = left[left_t_key] + left["temp_e_t_key"] = left[left_e_t_key] - for col in (t_left, t_right, loc_left, loc_right): - if col not in overlaps.columns: - raise ValueError( - f"compute_visitation_errors: expected column '{col}' in overlaps but not found." 
- ) + if right_use_datetime: + right["temp_t_key"] = filters.to_timestamp(right[right_t_key]) + right["temp_e_t_key"] = filters.to_timestamp(right[right_e_t_key]) + else: + right["temp_t_key"] = right[right_t_key] + right["temp_e_t_key"] = right[right_e_t_key] - overlaps = overlaps.fillna({loc_left: 'Street'}) # So they count on merge and missing + left_cols = [left_t_key, left_e_t_key, left_loc_key, "temp_t_key", "temp_e_t_key"] + right_cols = [right_t_key, right_e_t_key, right_loc_key, "temp_t_key", "temp_e_t_key"] + if keep_uid and not same_id: + left_cols = [left_uid_key] + left_cols + right_cols = [right_uid_key] + right_cols + left.drop([col for col in left.columns if col not in left_cols], axis=1, inplace=True) + right.drop([col for col in right.columns if col not in right_cols], axis=1, inplace=True) - # 5 – error if the overlap table references a start time not in ground truth - bad_ts = set(overlaps[t_right]) - set(true_visits[t_key]) - if bad_ts: - raise ValueError( - "compute_visitation_errors: overlap rows reference start times that " - "do not exist in ground truth: " + repr(sorted(bad_ts)[:10]) - ) + if match_location: + if left_loc_key == right_loc_key: + merged = left.merge(right, on=left_loc_key, suffixes=("_left", "_right")) + else: + merged = left.merge(right, left_on=left_loc_key, right_on=right_loc_key, suffixes=("_left", "_right")) + else: + merged = left.merge(right, how="cross", suffixes=("_left", "_right")) - diff_loc = overlaps[loc_left] != overlaps[loc_right] #because gt "street" segments are always under a minute, then predicted "street" can't ever be correct - same_loc = ~diff_loc + cond = ( + (merged["temp_t_key_left"] < merged["temp_e_t_key_right"]) + & (merged["temp_t_key_right"] < merged["temp_e_t_key_left"]) + ) + merged = merged.loc[cond] - num_overlapped = overlaps[t_right].nunique() - missed_fraction = 1 - num_overlapped / n_truth + start_max = merged[["temp_t_key_left", "temp_t_key_right"]].max(axis=1) + end_min = merged[["temp_e_t_key_left", "temp_e_t_key_right"]].min(axis=1) + merged[traj_cols["duration"]] = ((end_min - start_max) // 60).astype(int) - merged_fraction = diff_loc.groupby(overlaps[t_right]).any().mean() - split_fraction = overlaps[same_loc].groupby(t_right)[t_left].nunique().gt(1).mean() + if keep_uid and same_id: + merged[left_uid_key] = uid + + rename_cols = {} + if left_t_key != right_t_key: + rename_cols[left_t_key] = f"{left_t_key}_left" + rename_cols[right_t_key] = f"{right_t_key}_right" + if left_e_t_key != right_e_t_key: + rename_cols[left_e_t_key] = f"{left_e_t_key}_left" + rename_cols[right_e_t_key] = f"{right_e_t_key}_right" + if left_loc_key != right_loc_key: + rename_cols[left_loc_key] = f"{left_loc_key}_left" + rename_cols[right_loc_key] = f"{right_loc_key}_right" + if keep_uid and not same_id and left_uid_key != right_uid_key: + rename_cols[left_uid_key] = f"{left_uid_key}_left" + rename_cols[right_uid_key] = f"{right_uid_key}_right" + merged.rename(rename_cols, axis=1, inplace=True) + + merged.drop(["temp_t_key_left", "temp_e_t_key_left", "temp_t_key_right", "temp_e_t_key_right"], axis=1, inplace=True) - return {'missed_fraction': missed_fraction, 'merged_fraction': merged_fraction, 'split_fraction': split_fraction} + return merged.reset_index(drop=True) def precision_recall_f1_from_minutes(total_pred, total_truth, tp): """Compute P/R/F1 from minute totals.""" @@ -163,17 +188,3 @@ def precision_recall_f1_from_minutes(total_pred, total_truth, tp): else: f1 = 2 * precision * recall / (precision + recall) return 
{'precision': precision, 'recall': recall, 'f1': f1} - -def compute_stop_detection_metrics(stops, truth, user_id=None, algorithm=None, prf_only=True, traj_cols=None, **kwargs): - """Backward-compatible forwarder to nomad.stop_detection.validation.""" - from nomad.stop_detection.validation import compute_stop_detection_metrics as _compute - - return _compute( - stops=stops, - truth=truth, - user_id=user_id, - algorithm=algorithm, - prf_only=prf_only, - traj_cols=traj_cols, - **kwargs, - ) diff --git a/nomad/data/generate_osm_test_cache.py b/nomad/data/generate_osm_test_cache.py new file mode 100644 index 00000000..75b57767 --- /dev/null +++ b/nomad/data/generate_osm_test_cache.py @@ -0,0 +1,128 @@ +"""Cache raw OSMnx calls used by map tests. + +Usage: + python -m nomad.data.generate_osm_test_cache +""" + +from pathlib import Path + +import nomad.data as data_folder +import osmnx as ox + +from nomad.constants import ( + PARK_TAGS, + STREET_EXCLUDE_COVERED, + STREET_EXCLUDE_TUNNELS, + STREET_EXCLUDED_SERVICE_TYPES, + STREET_EXCLUDED_SURFACES, + STREET_HIGHWAY_TYPES, +) + + +DATA_DIR = Path(data_folder.__file__).parent +OUTPUT_DIR = DATA_DIR.parent / "tests" / "tmp_osm_cache" +SMALL_TOWN = "Riverton, Burlington County, New Jersey, USA" + +TAG_CASES = { + "buildings": {"building": True}, + "parking": {"amenity": ["parking"]}, + "parks": { + key: values + for key, values in PARK_TAGS.items() + if key not in ["landuse", "amenity"] + }, +} + +BBOX_CASES = { + "philly_small": (-75.1590, 39.9470, -75.1530, 39.9500), + "philly_large": (-75.1662060, 39.9411582, -75.1456557, 39.9557201), + "schuylkill_water": ( + -75.18832868207252, + 39.94457101861711, + -75.17632868207252, + 39.95657101861711, + ), + "washington_square_park": ( + -75.1561894880407, + 39.9432501797551, + -75.1501894880407, + 39.9492501797551, + ), + "parking_lot": ( + -75.1555204236118, + 39.94369232826087, + -75.1545204236118, + 39.94469232826087, + ), + "speculative_classification": ( + -75.15849966125, + 39.946618968625, + -75.15336208875, + 39.950259458375, + ), +} + + +def street_filter(): + highway_types = "|".join(STREET_HIGHWAY_TYPES) + parts = [f'["highway"~"^({highway_types})$"]'] + if STREET_EXCLUDED_SERVICE_TYPES: + excluded_services = "|".join(STREET_EXCLUDED_SERVICE_TYPES) + parts.append(f'["service"!~"{excluded_services}"]') + if STREET_EXCLUDE_TUNNELS: + parts.append('["tunnel"!="yes"]') + if STREET_EXCLUDE_COVERED: + parts.append('["covered"!="yes"]') + if STREET_EXCLUDED_SURFACES: + excluded_surfaces = "|".join(STREET_EXCLUDED_SURFACES) + parts.append(f'["surface"!~"{excluded_surfaces}"]') + return "".join(parts) + + +def save_gdf(gdf, path): + if len(gdf) == 0: + raise AssertionError(f"OSM cache fixture is empty: {path}") + path.parent.mkdir(parents=True, exist_ok=True) + gdf.to_parquet(path) + print(f"written: {path}") + + +def save_graph(graph, path): + if graph.number_of_nodes() == 0: + raise AssertionError(f"OSM cache fixture is empty: {path}") + path.parent.mkdir(parents=True, exist_ok=True) + ox.save_graphml(graph, filepath=str(path)) + print(f"written: {path}") + + +def main(): + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + custom_filter = street_filter() + + for case, bbox in BBOX_CASES.items(): + for tag_case, tags in TAG_CASES.items(): + path = OUTPUT_DIR / "features_from_bbox" / f"{case}__{tag_case}.parquet" + save_gdf(ox.features_from_bbox(bbox=bbox, tags=tags), path) + + town_gdf = ox.geocode_to_gdf(SMALL_TOWN) + save_gdf(town_gdf, OUTPUT_DIR / "geocode_to_gdf" / "small_town.parquet") + + 
town_polygon = town_gdf.geometry.iloc[0].simplify(tolerance=0.001) + for tag_case, tags in TAG_CASES.items(): + path = OUTPUT_DIR / "features_from_polygon" / f"small_town__{tag_case}.parquet" + save_gdf(ox.features_from_polygon(town_polygon, tags), path) + + graph = ox.graph_from_bbox( + bbox=BBOX_CASES["philly_small"], + custom_filter=custom_filter, + truncate_by_edge=True, + simplify=True, + ) + save_graph(graph, OUTPUT_DIR / "graph_from_bbox" / "philly_small.graphml") + + graph = ox.graph_from_polygon(town_polygon, custom_filter=custom_filter, simplify=True) + save_graph(graph, OUTPUT_DIR / "graph_from_polygon" / "small_town.graphml") + + +if __name__ == "__main__": + main() diff --git a/nomad/stop_detection/validation.py b/nomad/stop_detection/validation.py index f3ed0997..bd773c6d 100644 --- a/nomad/stop_detection/validation.py +++ b/nomad/stop_detection/validation.py @@ -1,11 +1,14 @@ from functools import partial import itertools +from pathlib import Path import time import matplotlib.pyplot as plt import numpy as np import pandas as pd import nomad.contact_estimation as contact +import nomad.filters as filters +import nomad.io.base as loader class AlgorithmRegistry: @@ -126,7 +129,98 @@ def family_timing_summary(self): ) -def compute_stop_detection_metrics(stops, truth, user_id=None, algorithm=None, prf_only=True, traj_cols=None, **kwargs): +def compute_visitation_errors(overlaps, true_visits, traj_cols=None, right_traj_cols=None, **kwargs): + if right_traj_cols is None: + right_schema_input = traj_cols + right_kwargs = kwargs + else: + right_schema_input = right_traj_cols + right_kwargs = {} + + temp_traj_cols = loader._parse_traj_cols(true_visits.columns, right_schema_input, right_kwargs, warn=False) + true_visits = true_visits.dropna() + + t_name, _ = loader._fallback_time_cols_dt(true_visits.columns, right_schema_input, right_kwargs) + t_key = temp_traj_cols[t_name] + + if true_visits[t_key].duplicated().any(): + dup_ts = true_visits.loc[true_visits[t_key].duplicated(), t_key].unique() + raise ValueError( + "Ground-truth stops share the same start time(s), which violates the " + "per-stop key assumption. 
Duplicated timestamps: " + repr(dup_ts) + ) + n_truth = len(true_visits) + + temp_cols = loader._parse_traj_cols([], traj_cols, kwargs, warn=False) + loc_left = f"{temp_cols['location_id']}_left" + loc_right = f"{temp_traj_cols['location_id']}_right" + + time_keys = ["datetime", "start_datetime", "timestamp", "start_timestamp"] + if "timestamp" in kwargs or "start_timestamp" in kwargs: + time_keys = ["timestamp", "start_timestamp", "datetime", "start_datetime"] + if "datetime" in kwargs or "start_datetime" in kwargs: + time_keys = ["datetime", "start_datetime", "timestamp", "start_timestamp"] + + t_left = None + for key in time_keys: + col = f"{temp_cols[key]}_left" + if col in overlaps.columns: + t_left = col + break + + time_keys = ["datetime", "start_datetime", "timestamp", "start_timestamp"] + if "timestamp" in right_kwargs or "start_timestamp" in right_kwargs: + time_keys = ["timestamp", "start_timestamp", "datetime", "start_datetime"] + if "datetime" in right_kwargs or "start_datetime" in right_kwargs: + time_keys = ["datetime", "start_datetime", "timestamp", "start_timestamp"] + + t_right = None + for key in time_keys: + col = f"{temp_traj_cols[key]}_right" + if col in overlaps.columns: + t_right = col + break + + if t_left is None or t_right is None: + raise ValueError("compute_visitation_errors: could not resolve the overlap start-time columns.") + for col in (loc_left, loc_right): + if col not in overlaps.columns: + raise ValueError(f"compute_visitation_errors: expected column '{col}' in overlaps but not found.") + + overlaps = overlaps.fillna({loc_left: "Street"}) + + bad_ts = set(overlaps[t_right]) - set(true_visits[t_key]) + if bad_ts: + raise ValueError( + "compute_visitation_errors: overlap rows reference start times that " + "do not exist in ground truth: " + repr(sorted(bad_ts)[:10]) + ) + + diff_loc = overlaps[loc_left] != overlaps[loc_right] + same_loc = ~diff_loc + + num_overlapped = overlaps[t_right].nunique() + missed_fraction = 1 - num_overlapped / n_truth + merged_fraction = diff_loc.groupby(overlaps[t_right]).any().mean() + split_fraction = overlaps[same_loc].groupby(t_right)[t_left].nunique().gt(1).mean() + + return { + "missed_fraction": missed_fraction, + "merged_fraction": merged_fraction, + "split_fraction": split_fraction, + } + + +def compute_stop_detection_metrics( + stops, + truth, + user_id=None, + algorithm=None, + prf_only=True, + traj_cols=None, + right_traj_cols=None, + **kwargs, +): if len(stops) == 0: return { "precision": 0.0, @@ -139,21 +233,99 @@ def compute_stop_detection_metrics(stops, truth, user_id=None, algorithm=None, p "algorithm": algorithm, } - stops_clean = stops.fillna({"location": "Street"}) - truth_clean = truth.fillna({"location": "Street"}) - truth_buildings = truth.dropna() + input_traj_cols = traj_cols + left_kwargs = dict(kwargs) + if right_traj_cols is None: + right_schema_input = traj_cols + right_kwargs = left_kwargs + else: + right_schema_input = right_traj_cols + right_kwargs = {} + right_schema_hint = "" + if right_traj_cols is None: + right_schema_hint = ( + " The shared traj_cols/kwargs mapping appears not to fit the truth table. " + "Pass right_traj_cols for the truth table, or make both tables use the same relevant " + "column names for time, duration, and location." 
+ ) + + traj_cols = loader._parse_traj_cols(stops.columns, traj_cols, left_kwargs, warn=False) + temp_traj_cols = loader._parse_traj_cols(truth.columns, right_schema_input, right_kwargs, warn=False) + + left_loc = traj_cols["location_id"] + right_loc = temp_traj_cols["location_id"] + if left_loc not in stops.columns: + raise ValueError( + "Could not find the mapped location column in predicted stops. " + f"Expected '{left_loc}' in columns {list(stops.columns)}." + ) + if right_loc not in truth.columns: + raise ValueError( + "Could not find the mapped location column in ground-truth stops. " + f"Expected '{right_loc}' in columns {list(truth.columns)}." + + right_schema_hint + ) + + stops_clean = stops.copy() + truth_clean = truth.copy() + stops_clean[left_loc] = stops_clean[left_loc].fillna("Street") + truth_clean[right_loc] = truth_clean[right_loc].fillna("Street") + truth_buildings = truth[truth[right_loc].notna()].copy() overlaps = contact.overlapping_visits( left=stops_clean, right=truth_clean, match_location=True, - traj_cols=traj_cols, + traj_cols=input_traj_cols, + right_traj_cols=right_traj_cols, **kwargs, ) - total_pred = stops_clean["duration"].sum() - total_truth = truth_clean["duration"].sum() - tp = overlaps["duration"].sum() + left_t_name, left_use_datetime = loader._fallback_time_cols_dt(stops_clean.columns, input_traj_cols, left_kwargs) + left_t_key = traj_cols[left_t_name] + left_e_t_key = traj_cols["end_datetime" if left_use_datetime else "end_timestamp"] + left_end_col_present = loader._has_end_cols(stops_clean.columns, traj_cols) + left_duration_col_present = loader._has_duration_cols(stops_clean.columns, traj_cols) + if not (left_end_col_present or left_duration_col_present): + raise ValueError("Predicted stops must provide either an end time or a duration.") + if not left_end_col_present: + if left_use_datetime: + stops_clean[left_e_t_key] = stops_clean[left_t_key] + pd.to_timedelta(stops_clean[traj_cols["duration"]], unit="m") + else: + stops_clean[left_e_t_key] = stops_clean[left_t_key] + stops_clean[traj_cols["duration"]] * 60 + + right_t_name, right_use_datetime = loader._fallback_time_cols_dt(truth_clean.columns, right_schema_input, right_kwargs) + right_t_key = temp_traj_cols[right_t_name] + right_e_t_key = temp_traj_cols["end_datetime" if right_use_datetime else "end_timestamp"] + right_end_col_present = loader._has_end_cols(truth_clean.columns, temp_traj_cols) + right_duration_col_present = loader._has_duration_cols(truth_clean.columns, temp_traj_cols) + if not (right_end_col_present or right_duration_col_present): + raise ValueError("Ground-truth stops must provide either an end time or a duration." 
+ right_schema_hint) + if not right_end_col_present: + if right_use_datetime: + truth_clean[right_e_t_key] = truth_clean[right_t_key] + pd.to_timedelta(truth_clean[temp_traj_cols["duration"]], unit="m") + else: + truth_clean[right_e_t_key] = truth_clean[right_t_key] + truth_clean[temp_traj_cols["duration"]] * 60 + + left_duration = traj_cols["duration"] + if left_duration in stops_clean.columns: + total_pred = stops_clean[left_duration].sum() + else: + if left_use_datetime: + total_pred = (filters.to_timestamp(stops_clean[left_e_t_key]) - filters.to_timestamp(stops_clean[left_t_key])).floordiv(60).sum() + else: + total_pred = ((stops_clean[left_e_t_key] - stops_clean[left_t_key]) // 60).sum() + + right_duration = temp_traj_cols["duration"] + if right_duration in truth_clean.columns: + total_truth = truth_clean[right_duration].sum() + else: + if right_use_datetime: + total_truth = (filters.to_timestamp(truth_clean[right_e_t_key]) - filters.to_timestamp(truth_clean[right_t_key])).floordiv(60).sum() + else: + total_truth = ((truth_clean[right_e_t_key] - truth_clean[right_t_key]) // 60).sum() + + tp = overlaps[left_duration].sum() prf_metrics = contact.precision_recall_f1_from_minutes(total_pred, total_truth, tp) if prf_only: @@ -164,10 +336,17 @@ def compute_stop_detection_metrics(stops, truth, user_id=None, algorithm=None, p left=stops_clean, right=truth_buildings, match_location=False, - traj_cols=traj_cols, + traj_cols=input_traj_cols, + right_traj_cols=right_traj_cols, + **kwargs, + ) + error_metrics = compute_visitation_errors( + overlaps_err, + truth_buildings, + traj_cols=input_traj_cols, + right_traj_cols=right_traj_cols, **kwargs, ) - error_metrics = contact.compute_visitation_errors(overlaps_err, truth_buildings, traj_cols, **kwargs) else: error_metrics = {"missed_fraction": 0.0, "merged_fraction": 0.0, "split_fraction": 0.0} @@ -180,6 +359,97 @@ def _desaturate_toward_white(color, amount=0.4): return tuple(rgb + (white - rgb) * amount) +def _blend_toward_white(color, amount=0.3): + rgb = np.array(color[:3]) + white = np.array([1.0, 1.0, 1.0]) + return tuple(rgb + (white - rgb) * amount) + + +def _group_palette(group_order, group_families=None, cmap="tab10"): + cmap_obj = plt.colormaps.get_cmap(cmap) + + if group_families is None: + return { + group: cmap_obj(i / max(len(group_order) - 1, 1)) + for i, group in enumerate(group_order) + }, None + + family_order = list(dict.fromkeys(group_families[group] for group in group_order)) + base_colors = { + family: cmap_obj(i / max(len(family_order) - 1, 1)) + for i, family in enumerate(family_order) + } + + colors = {} + for family in family_order: + family_groups = [group for group in group_order if group_families[group] == family] + shade_levels = np.linspace(0.0, 0.35, len(family_groups)) + for group, shade in zip(family_groups, shade_levels): + colors[group] = _blend_toward_white(base_colors[family], shade) + + return colors, base_colors + + +def bootstrap_metric_summary( + data, + metrics, + group_col="algorithm", + unit_col="user", + n_boot=2000, + interval=(0.05, 0.95), + random_state=2025, +): + """Bootstrap per-unit medians and summarize them as point estimates plus intervals.""" + columns = [group_col, "metric", "estimate", "lower", "upper"] + if data.empty: + return pd.DataFrame(columns=columns) + + per_unit = ( + data.groupby([unit_col, group_col], as_index=False)[metrics] + .median() + ) + point_estimates = ( + per_unit.groupby(group_col, as_index=False)[metrics] + .median() + .melt( + id_vars=group_col, + value_vars=metrics, + 
var_name="metric", + value_name="estimate", + ) + ) + + unit_ids = per_unit[unit_col].drop_duplicates().to_numpy() + per_unit = per_unit.set_index(unit_col) + rng = np.random.default_rng(random_state) + bootstrap_rows = [] + + for bootstrap_id in range(n_boot): + sampled_units = rng.choice(unit_ids, size=len(unit_ids), replace=True) + sampled = per_unit.loc[sampled_units].reset_index() + summary = sampled.groupby(group_col, as_index=False)[metrics].median() + summary["bootstrap_id"] = bootstrap_id + bootstrap_rows.append(summary) + + bootstrap_df = pd.concat(bootstrap_rows, ignore_index=True) + lower_q, upper_q = interval + intervals = ( + bootstrap_df.melt( + id_vars=[group_col, "bootstrap_id"], + value_vars=metrics, + var_name="metric", + value_name="value", + ) + .groupby([group_col, "metric"])["value"] + .quantile([lower_q, upper_q]) + .unstack() + .reset_index() + .rename(columns={lower_q: "lower", upper_q: "upper"}) + ) + + return point_estimates.merge(intervals, on=[group_col, "metric"], how="left") + + def plot_metric_vs_param( ax, data, @@ -276,9 +546,199 @@ def plot_family_timing(summary_df, ax=None, title="Mean Time Per Parameterizatio return ax +def plot_metric_boxplots( + data, + metrics, + group_col="algorithm", + group_order=None, + group_families=None, + colors=None, + cmap="tab10", + figsize=None, + save_path=None, + metric_titles=None, + legend_title="Base Algorithm", + x_tick_rotation=35, + whis=(5, 95), + showfliers=False, +): + """Plot per-group metric distributions as boxplots.""" + if group_order is None: + group_order = data[group_col].drop_duplicates().tolist() + + if colors is None: + colors, legend_colors = _group_palette(group_order, group_families=group_families, cmap=cmap) + else: + legend_colors = None + + if figsize is None: + figsize = (max(4.2 * len(metrics), 10), 6.2) + + fig, axes = plt.subplots(1, len(metrics), figsize=figsize, sharey=False) + if len(metrics) == 1: + axes = [axes] + + for ax, metric in zip(axes, metrics): + metric_data = [data.loc[data[group_col] == group, metric].dropna() for group in group_order] + bp = ax.boxplot( + metric_data, + positions=np.arange(len(group_order)), + patch_artist=True, + widths=0.5, + whis=whis, + showfliers=showfliers, + medianprops={"color": "black", "linewidth": 0.9}, + boxprops={"linewidth": 0.8}, + whiskerprops={"linewidth": 0.8, "color": "black"}, + capprops={"linewidth": 0.8, "color": "black"}, + ) + for box, group in zip(bp["boxes"], group_order): + box.set_facecolor(colors[group]) + box.set_edgecolor("black") + + ax.set_facecolor("#EAEAF2") + ax.grid(axis="y", color="darkgray", linestyle="--", linewidth=0.8, alpha=0.75) + ax.set_xticks(np.arange(len(group_order))) + ax.set_xticklabels(group_order, rotation=x_tick_rotation, ha="right") + + if metric_titles is None: + title = metric.replace("_", " ").title() + else: + title = metric_titles.get(metric, metric.replace("_", " ").title()) + ax.set_title(title, fontsize=16) + + if legend_colors is not None: + family_order = list(legend_colors.keys()) + handles = [plt.matplotlib.patches.Patch(color=legend_colors[family], label=family) for family in family_order] + fig.legend( + handles, + family_order, + loc="lower center", + ncol=len(family_order), + bbox_to_anchor=(0.5, -0.08), + fontsize=12, + title=legend_title, + title_fontsize=13, + frameon=True, + ) + plt.subplots_adjust(bottom=0.34, top=0.92) + else: + plt.subplots_adjust(bottom=0.22, top=0.92) + + if save_path is not None: + save_path = Path(save_path) + save_path.parent.mkdir(parents=True, 
exist_ok=True) + fig.savefig(save_path.with_suffix(".png"), dpi=300, bbox_inches="tight") + fig.savefig(save_path.with_suffix(".svg"), bbox_inches="tight") + + return fig, axes + + +def plot_metric_intervals( + summary_df, + metrics, + group_col="algorithm", + group_order=None, + group_families=None, + colors=None, + cmap="tab10", + figsize=None, + save_path=None, + metric_titles=None, + legend_title="Base Algorithm", + x_tick_rotation=35, +): + """Plot precomputed point estimates with interval whiskers for each metric.""" + if group_order is None: + group_order = summary_df[group_col].drop_duplicates().tolist() + + if colors is None: + colors, legend_colors = _group_palette(group_order, group_families=group_families, cmap=cmap) + else: + legend_colors = None + + if figsize is None: + figsize = (max(4.2 * len(metrics), 10), 6.2) + + fig, axes = plt.subplots(1, len(metrics), figsize=figsize, sharey=False) + if len(metrics) == 1: + axes = [axes] + + for ax, metric in zip(axes, metrics): + metric_summary = ( + summary_df.loc[summary_df["metric"] == metric] + .set_index(group_col) + .reindex(group_order) + .reset_index() + ) + x = np.arange(len(group_order)) + y = metric_summary["estimate"].to_numpy() + yerr = np.vstack([ + y - metric_summary["lower"].to_numpy(), + metric_summary["upper"].to_numpy() - y, + ]) + + ax.set_facecolor("#EAEAF2") + for idx, group in enumerate(group_order): + ax.errorbar( + x[idx], + y[idx], + yerr=yerr[:, idx:idx + 1], + fmt="o", + color="black", + ecolor="black", + elinewidth=1.0, + capsize=4, + markersize=7, + markerfacecolor=colors[group], + markeredgecolor="black", + markeredgewidth=0.8, + zorder=4, + ) + ax.grid(axis="y", color="darkgray", linestyle="--", linewidth=0.8, alpha=0.75) + ax.set_xticks(x) + ax.set_xticklabels(group_order, rotation=x_tick_rotation, ha="right") + + if metric_titles is None: + title = metric.replace("_", " ").title() + else: + title = metric_titles.get(metric, metric.replace("_", " ").title()) + ax.set_title(title, fontsize=16) + + if legend_colors is not None: + family_order = list(legend_colors.keys()) + handles = [plt.matplotlib.patches.Patch(color=legend_colors[family], label=family) for family in family_order] + fig.legend( + handles, + family_order, + loc="lower center", + ncol=len(family_order), + bbox_to_anchor=(0.5, -0.08), + fontsize=12, + title=legend_title, + title_fontsize=13, + frameon=True, + ) + plt.subplots_adjust(bottom=0.34, top=0.92) + else: + plt.subplots_adjust(bottom=0.22, top=0.92) + + if save_path is not None: + save_path = Path(save_path) + save_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(save_path.with_suffix(".png"), dpi=300, bbox_inches="tight") + fig.savefig(save_path.with_suffix(".svg"), bbox_inches="tight") + + return fig, axes + + __all__ = [ "AlgorithmRegistry", + "bootstrap_metric_summary", + "compute_visitation_errors", "compute_stop_detection_metrics", + "plot_metric_boxplots", + "plot_metric_intervals", "plot_metric_vs_param", "plot_family_timing", ] diff --git a/nomad/tests/test_contact_estimation.py b/nomad/tests/test_contact_estimation.py new file mode 100644 index 00000000..f1115733 --- /dev/null +++ b/nomad/tests/test_contact_estimation.py @@ -0,0 +1,165 @@ +import pandas as pd +import pytest + +import nomad.contact_estimation as contact +from nomad.stop_detection.validation import compute_stop_detection_metrics, compute_visitation_errors + + +@pytest.fixture +def distinct_visit_tables(): + left = pd.DataFrame( + { + "pred_start": [ + pd.Timestamp("2024-01-01 00:00:00", 
tz="UTC"), + pd.Timestamp("2024-01-01 00:05:00", tz="UTC"), + pd.Timestamp("2024-01-01 00:20:00", tz="UTC"), + ], + "pred_end": [ + pd.Timestamp("2024-01-01 00:05:00", tz="UTC"), + pd.Timestamp("2024-01-01 00:10:00", tz="UTC"), + pd.Timestamp("2024-01-01 00:30:00", tz="UTC"), + ], + "pred_loc": ["home", "home", "cafe"], + "pred_user": ["u1", "u1", "u1"], + } + ) + right = pd.DataFrame( + { + "truth_start": [ + int(pd.Timestamp("2024-01-01 00:00:00", tz="UTC").timestamp()), + int(pd.Timestamp("2024-01-01 00:20:00", tz="UTC").timestamp()), + ], + "truth_end": [ + int(pd.Timestamp("2024-01-01 00:10:00", tz="UTC").timestamp()), + int(pd.Timestamp("2024-01-01 00:30:00", tz="UTC").timestamp()), + ], + "truth_loc": ["home", "work"], + "truth_user": ["u1", "u1"], + } + ) + left_kwargs = { + "start_datetime": "pred_start", + "end_datetime": "pred_end", + "location_id": "pred_loc", + "user_id": "pred_user", + } + right_traj_cols = { + "start_timestamp": "truth_start", + "end_timestamp": "truth_end", + "location_id": "truth_loc", + "user_id": "truth_user", + } + return left, right, left_kwargs, right_traj_cols + + +def test_overlapping_visits_supports_distinct_schemas(distinct_visit_tables): + left, right, left_kwargs, right_traj_cols = distinct_visit_tables + + overlaps = contact.overlapping_visits( + left, + right, + match_location=False, + right_traj_cols=right_traj_cols, + **left_kwargs, + ) + + assert overlaps["duration"].tolist() == [5, 5, 10] + + +def test_compute_visitation_errors_supports_distinct_schemas(distinct_visit_tables): + left, right, left_kwargs, right_traj_cols = distinct_visit_tables + + overlaps = contact.overlapping_visits( + left, + right, + match_location=False, + right_traj_cols=right_traj_cols, + **left_kwargs, + ) + errors = compute_visitation_errors( + overlaps, + right, + right_traj_cols=right_traj_cols, + **left_kwargs, + ) + + assert errors["missed_fraction"] == 0.0 + assert errors["merged_fraction"] == pytest.approx(0.5) + assert errors["split_fraction"] == pytest.approx(1.0) + + +def test_compute_stop_detection_metrics_supports_distinct_schemas(): + stops = pd.DataFrame( + { + "pred_start": [ + pd.Timestamp("2024-01-01 00:00:00", tz="UTC"), + pd.Timestamp("2024-01-01 00:20:00", tz="UTC"), + ], + "pred_end": [ + pd.Timestamp("2024-01-01 00:10:00", tz="UTC"), + pd.Timestamp("2024-01-01 00:30:00", tz="UTC"), + ], + "pred_loc": ["home", "work"], + "pred_user": ["u1", "u1"], + "pred_minutes": [10, 10], + } + ) + truth = pd.DataFrame( + { + "truth_start": [ + int(pd.Timestamp("2024-01-01 00:00:00", tz="UTC").timestamp()), + int(pd.Timestamp("2024-01-01 00:20:00", tz="UTC").timestamp()), + ], + "truth_end": [ + int(pd.Timestamp("2024-01-01 00:10:00", tz="UTC").timestamp()), + int(pd.Timestamp("2024-01-01 00:30:00", tz="UTC").timestamp()), + ], + "truth_loc": ["home", "work"], + "truth_user": ["u1", "u1"], + "truth_minutes": [10, 10], + "truth_datetime": [ + pd.Timestamp("2024-01-01 00:00:00", tz="UTC"), + pd.Timestamp("2024-01-01 00:20:00", tz="UTC"), + ], + } + ) + + metrics = compute_stop_detection_metrics( + stops, + truth, + prf_only=False, + start_datetime="pred_start", + end_datetime="pred_end", + location_id="pred_loc", + user_id="pred_user", + duration="pred_minutes", + right_traj_cols={ + "start_timestamp": "truth_start", + "end_timestamp": "truth_end", + "location_id": "truth_loc", + "user_id": "truth_user", + "duration": "truth_minutes", + }, + ) + + assert metrics["precision"] == pytest.approx(1.0) + assert metrics["recall"] == pytest.approx(1.0) + assert 
metrics["f1"] == pytest.approx(1.0) + assert metrics["missed_fraction"] == pytest.approx(0.0) + assert metrics["merged_fraction"] == pytest.approx(0.0) + assert metrics["split_fraction"] == pytest.approx(0.0) + + +def test_overlapping_visits_shared_schema_failure_mentions_right_traj_cols(): + left = pd.DataFrame({"pred_start": [0], "pred_end": [600], "pred_loc": ["home"]}) + right = pd.DataFrame({"truth_start": [300], "truth_end": [900], "truth_loc": ["home"]}) + + with pytest.warns(UserWarning): + with pytest.raises(ValueError, match="right_traj_cols"): + contact.overlapping_visits( + left, + right, + start_timestamp="pred_start", + end_timestamp="pred_end", + location_id="pred_loc", + ) diff --git a/nomad/tests/test_maps.py b/nomad/tests/test_maps.py index 6632d28a..7f26ce86 100644 --- a/nomad/tests/test_maps.py +++ b/nomad/tests/test_maps.py @@ -7,52 +7,144 @@ 3. Output includes OSM type, subtype, and category 4. Geometries are proper polygons 5. Rotation and explosion of geometries works correctly + +By default, tests use cached OSM data. Generate or refresh that cache with: + + python -m nomad.data.generate_osm_test_cache + +To run the same tests against live OSMnx APIs instead, set: + + NOMAD_OSM_TEST_CACHE=0 """ -import pytest +import os + import geopandas as gpd -from shapely.geometry import Polygon, LineString, MultiPolygon, MultiLineString import numpy as np -import pandas as pd -import os -import tempfile import osmnx as ox - -@pytest.fixture(scope="module", autouse=True) -def osmnx_tmp_cache(): - """Use a temporary OSMnx cache folder for this module and clean it up at teardown.""" - prev_use_cache = getattr(ox.settings, 'use_cache', None) - prev_cache_folder = getattr(ox.settings, 'cache_folder', None) - prev_env_dir = os.environ.get('NOMAD_OSMNX_CACHE_DIR') - tmpdir_obj = tempfile.TemporaryDirectory(prefix="nomad-osmnx-cache-") - try: - os.environ['NOMAD_OSMNX_CACHE_DIR'] = tmpdir_obj.name - ox.settings.use_cache = True - ox.settings.cache_folder = tmpdir_obj.name - yield - finally: - if prev_env_dir is None: - os.environ.pop('NOMAD_OSMNX_CACHE_DIR', None) - else: - os.environ['NOMAD_OSMNX_CACHE_DIR'] = prev_env_dir - if prev_use_cache is not None: - ox.settings.use_cache = prev_use_cache - if prev_cache_folder is not None: - ox.settings.cache_folder = prev_cache_folder - tmpdir_obj.cleanup() - +import pandas as pd +import pytest +from osmnx._errors import InsufficientResponseError +from shapely.geometry import LineString, MultiLineString, MultiPolygon, Point, Polygon + +import nomad.map_utils as map_utils +from nomad.data.generate_osm_test_cache import ( + BBOX_CASES, + OUTPUT_DIR, + SMALL_TOWN, + TAG_CASES, +) from nomad.map_utils import ( + _classify_building, download_osm_buildings, download_osm_streets, - rotate, - remove_overlaps, - _classify_building, get_city_boundary_osm, - save_geodata, load_geodata, + remove_overlaps, + rotate, + save_geodata, ) +OSM_TEST_CACHE_ENV = "NOMAD_OSM_TEST_CACHE" +CACHED_OSM_MISSING_MESSAGE = ( + "Run python -m nomad.data.generate_osm_test_cache first, " + "or rerun it to refresh stale cached data. " + "Set NOMAD_OSM_TEST_CACHE=0 to run against live OSMnx APIs instead." 
+
+EMPTY_OSM_BBOXES = [
+    (-66.20, 37.82, -65.79, 38.03),
+    (-66.19799553667396, 37.81476271794678, -65.78506182161694, 38.027365740492634),
+]
+
+EMPTY_OSM_POLYGONS = [
+    Polygon([
+        (-66.20, 37.82),
+        (-65.79, 37.82),
+        (-65.79, 38.03),
+        (-66.20, 38.03),
+        (-66.20, 37.82),
+    ])
+]
+
+
+def _require_cached_osm_file(path):
+    if not path.exists():
+        pytest.fail(
+            f"Missing cached OSM test data: {path}. {CACHED_OSM_MISSING_MESSAGE}",
+            pytrace=False,
+        )
+    return path
+
+
+def _cached_tag_case(tags):
+    for case, cached_tags in TAG_CASES.items():
+        if tags == cached_tags:
+            return case
+    pytest.fail(f"No cached OSM tag fixture for {tags}", pytrace=False)
+
+
+def _cached_bbox_case(bbox):
+    for case, cached_bbox in BBOX_CASES.items():
+        if np.allclose(bbox, cached_bbox):
+            return case
+    pytest.fail(f"No cached OSM bbox fixture for {bbox}", pytrace=False)
+
+
+def _is_empty_osm_bbox(bbox):
+    return any(np.allclose(bbox, empty_bbox) for empty_bbox in EMPTY_OSM_BBOXES)
+
+
+def _is_empty_osm_polygon(polygon):
+    return any(polygon.equals(empty_polygon) for empty_polygon in EMPTY_OSM_POLYGONS)
+
+
+@pytest.fixture(autouse=True)
+def osm_cached_data_mode(monkeypatch):
+    """Run map_utils normally while serving OSMnx calls from local cache."""
+    if os.environ.get(OSM_TEST_CACHE_ENV, "1") == "0":
+        return
+
+    def features_from_bbox(bbox, tags):
+        if _is_empty_osm_bbox(bbox):
+            raise InsufficientResponseError("No cached OSM data for empty test area")
+        path = OUTPUT_DIR / "features_from_bbox"
+        path = path / f"{_cached_bbox_case(bbox)}__{_cached_tag_case(tags)}.parquet"
+        return gpd.read_parquet(_require_cached_osm_file(path))
+
+    def features_from_polygon(polygon, tags):
+        if _is_empty_osm_polygon(polygon):
+            raise InsufficientResponseError("No cached OSM data for empty test polygon")
+        path = OUTPUT_DIR / "features_from_polygon"
+        path = path / f"small_town__{_cached_tag_case(tags)}.parquet"
+        return gpd.read_parquet(_require_cached_osm_file(path))
+
+    def geocode_to_gdf(query):
+        if query != SMALL_TOWN:
+            pytest.fail(f"No cached OSM geocode fixture for {query}", pytrace=False)
+        path = OUTPUT_DIR / "geocode_to_gdf" / "small_town.parquet"
+        return gpd.read_parquet(_require_cached_osm_file(path))
+
+    def graph_from_bbox(bbox, custom_filter=None, truncate_by_edge=True, simplify=True):
+        if _is_empty_osm_bbox(bbox):
+            raise InsufficientResponseError("No cached OSM graph for empty test area")
+        path = OUTPUT_DIR / "graph_from_bbox" / f"{_cached_bbox_case(bbox)}.graphml"
+        return ox.load_graphml(_require_cached_osm_file(path))
+
+    def graph_from_polygon(polygon, custom_filter=None, simplify=True):
+        if _is_empty_osm_polygon(polygon):
+            raise InsufficientResponseError("No cached OSM graph for empty test polygon")
+        path = OUTPUT_DIR / "graph_from_polygon" / "small_town.graphml"
+        return ox.load_graphml(_require_cached_osm_file(path))
+
+    monkeypatch.setattr(map_utils.ox, "features_from_bbox", features_from_bbox)
+    monkeypatch.setattr(map_utils.ox, "features_from_polygon", features_from_polygon)
+    monkeypatch.setattr(map_utils.ox, "geocode_to_gdf", geocode_to_gdf)
+    monkeypatch.setattr(map_utils.ox, "graph_from_bbox", graph_from_bbox)
+    monkeypatch.setattr(map_utils.ox, "graph_from_polygon", graph_from_polygon)
+
+
 @pytest.fixture
 def philly_bbox():
     """Bounding box for a section of Philadelphia."""
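The graph side of the cache follows the same pattern as the parquet sketch above; a minimal illustration of producing one `.graphml` file that the fixture's `graph_from_bbox` stand-in would load (the `philly` case name and save path are assumptions):

# Assumed counterpart for the .graphml fixtures loaded by the stub above.
import osmnx as ox

bbox = (-75.1662060, 39.9411582, -75.1456557, 39.9557201)  # Philadelphia bbox used later in this file
G = ox.graph_from_bbox(bbox, truncate_by_edge=True)        # live call, mirrors the stubbed signature
ox.save_graphml(G, "tmp_osm_cache/graph_from_bbox/philly.graphml")  # hypothetical cache path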
"""Test with geolife_plus category schema.""" + # Large OSM output is not needed here; a tiny bbox with known categories is equally informative. buildings = download_osm_buildings(philly_bbox, schema='geolife_plus') # Should return a GeoDataFrame @@ -215,8 +308,6 @@ def test_priority_order(): def test_save_load_geodata(tmp_path): - import geopandas as gpd - from shapely.geometry import Point, LineString gdf = gpd.GeoDataFrame({'id':[1,2],'name':['a','b']}, geometry=[Point(0,0), Point(1,1)], crs='EPSG:4326') # GeoJSON geojson_path = tmp_path / 'points.geojson' @@ -239,6 +330,7 @@ def test_save_load_geodata(tmp_path): @pytest.mark.skip(reason="Trimmed suite: CRS covered elsewhere") def test_crs_handling(philly_bbox): """Test that coordinate reference systems are handled correctly.""" + # Large OSM output is not needed here; CRS behavior only needs one small non-empty feature set. # Default should be EPSG:4326 (WGS84 lat/lon) buildings_default = download_osm_buildings(philly_bbox) assert buildings_default.crs.to_string() == "EPSG:4326" @@ -458,6 +550,7 @@ def test_remove_overlaps_exclude_categories(): def test_empty_bounding_box(): """Test behavior when no features exist in the bounding box.""" + # Empty-response behavior should be represented as an expected empty OSMnx call, not cached as empty data files. # User-provided empty box (NW/SEx converted to west,south,east,north): # NW (lat, lon): 38.027365740492634, -66.19799553667396 # SE (lat, lon): 37.81476271794678, -65.78506182161694 @@ -483,6 +576,7 @@ def test_empty_bounding_box(): def test_empty_polygon(): """Test behavior when an empty-area polygon is provided directly.""" + # Empty-response behavior should be represented as an expected empty OSMnx call, not cached as empty data files. # Simplified explicit coordinates near the same empty area poly = Polygon([ (-66.20, 37.82), (-65.79, 37.82), (-65.79, 38.03), (-66.20, 38.03), (-66.20, 37.82) @@ -500,6 +594,7 @@ def test_empty_polygon(): @pytest.mark.skip(reason="Trimmed suite: empty-area CRS covered by other tests") def test_empty_bounding_box_custom_crs(): """Empty-area downloads should work in other CRS as well (e.g., EPSG:3857).""" + # Large or non-empty OSM output is not needed here; this is only an empty-response plus CRS check. ocean_bbox = ( -66.19799553667396, # west 37.81476271794678, # south @@ -517,6 +612,7 @@ def test_empty_bounding_box_custom_crs(): @pytest.mark.skip(reason="Trimmed suite: CRS behavior already covered") def test_different_coordinate_systems(): """Test that functions work correctly with different CRS.""" + # Large OSM output is not needed here; a tiny bbox with one feature would exercise CRS conversion. # Philadelphia bounding box philly_bbox = (-75.1662060, 39.9411582, -75.1456557, 39.9557201) @@ -536,6 +632,7 @@ def test_different_coordinate_systems(): def test_water_feature_exclusion(): """Test that water features are properly excluded from buildings and parks.""" + # A small bbox around one water feature is enough; this should not require a large city-scale fixture. # Use a bounding box that includes water features (like Schuylkill River) water_bbox = (-75.18232868207252 - 0.006, 39.95057101861711 - 0.006, -75.18232868207252 + 0.006, 39.95057101861711 + 0.006) @@ -554,6 +651,7 @@ def test_water_feature_exclusion(): def test_park_categorization(): """Test that parks are correctly categorized as 'park', not 'other'.""" + # A tiny bbox around one known park is enough; large OSM output adds no extra signal. 
@@ -554,6 +651,7 @@ def test_water_feature_exclusion():
 
 def test_park_categorization():
     """Test that parks are correctly categorized as 'park', not 'other'."""
+    # A tiny bbox around one known park is enough; large OSM output adds no extra signal.
     # Use a bounding box known to have parks (like Washington Square Park)
     park_bbox = (-75.1531894880407 - 0.003, 39.9462501797551 - 0.003,
                  -75.1531894880407 + 0.003, 39.9462501797551 + 0.003)
@@ -577,6 +675,7 @@ def test_park_categorization():
 
 def test_parking_lot_classification():
     """Test that parking lots are classified as buildings, not parks."""
+    # A tiny bbox around one known parking lot is enough; large OSM output adds no extra signal.
     # Use a bounding box with parking lots
     parking_bbox = (-75.1550204236118 - 0.0005, 39.94419232826087 - 0.0005,
                     -75.1550204236118 + 0.0005, 39.94419232826087 + 0.0005)
@@ -592,6 +691,7 @@ def test_parking_lot_classification():
 
 def test_speculative_classification():
     """Test the infer_building_types parameter works correctly."""
+    # A small bbox with at least one inferable building is enough; large OSM output is not needed.
     # Use a small bounding box with mixed building types
     # Smaller bbox to keep this network call fast
     test_bbox = (-75.15849966125, 39.946618968625, -75.15336208875, 39.950259458375)
@@ -681,6 +781,7 @@ def test_overlap_removal_containment():
 
 def test_schema_switching():
     """Test that different schemas work correctly."""
+    # Large OSM output is only useful here to find category variety; a smaller curated bbox/fixture is preferable.
     test_bbox = (-75.1662060, 39.9411582, -75.1456557, 39.9557201)
 
     # Test garden_city schema
@@ -702,9 +803,9 @@ def test_schema_switching():
     assert gc_categories != gl_categories, "Different schemas should have different categories"
 
 
-def test_city_boundary_by_name_salem():
+def test_city_boundary_by_name_small_town():
     """Boundary by city name should return (geometry, center tuple, population)."""
-    boundary, center, population = get_city_boundary_osm('Salem, New Jersey')
+    boundary, center, population = get_city_boundary_osm(SMALL_TOWN)
     assert boundary is not None, "Boundary geometry should not be None"
     assert hasattr(boundary, 'geom_type'), "Boundary must be a shapely geometry"
     assert isinstance(center, tuple) and len(center) == 2, "Center must be (lon, lat) tuple"
@@ -712,13 +813,13 @@ def test_city_boundary_by_name_salem():
     assert isinstance(population, int), "Population must be int or None"
 
 
-def test_download_osm_buildings_city_name_salem():
+def test_download_osm_buildings_city_name_small_town():
     """Download buildings by small city name; keep processing minimal."""
-    buildings = download_osm_buildings('Salem, New Jersey', explode=False)
+    buildings = download_osm_buildings(SMALL_TOWN, explode=False)
     assert isinstance(buildings, gpd.GeoDataFrame)
 
 
-def test_download_osm_streets_city_name_salem():
+def test_download_osm_streets_city_name_small_town():
     """Download streets by small city name; keep processing minimal."""
-    streets = download_osm_streets('Salem, New Jersey', explode=False)
+    streets = download_osm_streets(SMALL_TOWN, explode=False)
     assert isinstance(streets, gpd.GeoDataFrame)
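Before moving to the next file, it is worth spelling out the return contract the three small-town tests pin down; a usage sketch, with the old string literal standing in for `SMALL_TOWN`:

# Usage sketch of get_city_boundary_osm's contract as asserted above.
from nomad.map_utils import get_city_boundary_osm

boundary, center, population = get_city_boundary_osm("Salem, New Jersey")
lon, lat = center            # center is a (lon, lat) tuple
print(boundary.geom_type)    # a shapely geometry, e.g. Polygon or MultiPolygon
print(population)            # int (the assert message also allows for None)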
diff --git a/nomad/tests/test_stop_detection.py b/nomad/tests/test_stop_detection.py
index d44ac085..a377eb6f 100644
--- a/nomad/tests/test_stop_detection.py
+++ b/nomad/tests/test_stop_detection.py
@@ -1248,3 +1248,356 @@ def test_empty_dataframe_xy_output(empty_traj_xy, empty_xy_case_registry, algo_n
 
     assert result.empty
     assert set(result.columns) == case["expected_cols"]
+
+def test_empty_dataframe_consistency():
+    """Test that all algorithms return consistent column structures for empty data."""
+
+    empty_data = pd.DataFrame(columns=['timestamp', 'longitude', 'latitude', 'location_id'])
+    traj_cols = {
+        'timestamp': 'timestamp',
+        'longitude': 'longitude',
+        'latitude': 'latitude',
+        'location_id': 'location_id'
+    }
+
+    # Test with complete_output=False for all algorithms
+    grid_result = GRID_BASED.grid_based(empty_data, traj_cols=traj_cols, complete_output=False)
+    hdbscan_result = HDBSCAN.st_hdbscan(empty_data, time_thresh=60, traj_cols=traj_cols, complete_output=False)
+    lachesis_result = LACHESIS.lachesis(empty_data, delta_roam=100, dt_max=60, traj_cols=traj_cols, complete_output=False)
+
+    # All should be empty
+    assert grid_result.empty
+    assert hdbscan_result.empty
+    assert lachesis_result.empty
+
+    # Grid-based should have location_id, others should have spatial coordinates
+    assert 'location_id' in grid_result.columns
+    assert 'longitude' in hdbscan_result.columns
+    assert 'latitude' in hdbscan_result.columns
+    assert 'longitude' in lachesis_result.columns
+    assert 'latitude' in lachesis_result.columns
+
+##########################################
+####           HDBSCAN TESTS          ####
+##########################################
+
+@pytest.fixture
+def hdbscan_traj():
+    """Trajectory with two clear stops separated by a long gap."""
+    np.random.seed(0)
+    base = 1609459200  # 2021-01-01 00:00:00 UTC
+    rows = []
+    # Stop 1: 20 points at (0, 0) over ~10 minutes (30s intervals)
+    for i in range(20):
+        rows.append({'x': np.random.normal(0, 2), 'y': np.random.normal(0, 2),
+                     'timestamp': base + i * 30})
+    # Gap of 30 minutes
+    t = base + 20 * 30 + 1800
+    # Stop 2: 20 points at (1000, 1000) over ~10 minutes
+    for i in range(20):
+        rows.append({'x': np.random.normal(1000, 2), 'y': np.random.normal(1000, 2),
+                     'timestamp': t + i * 30})
+    return pd.DataFrame(rows)
+
+
+def test_hdbscan_labels_single_stop(hdbscan_traj):
+    """hdbscan_labels detects at least one cluster on clear stop data."""
+    labels = HDBSCAN.hdbscan_labels(
+        hdbscan_traj, time_thresh=5, min_pts=3, min_cluster_size=3, dur_min=5,
+        traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y'}
+    )
+    assert (labels >= 0).any(), "Expected at least one cluster"
+
+
+def test_hdbscan_labels_no_cluster_when_sparse(hdbscan_traj):
+    """No cluster forms when min_pts exceeds the number of temporal neighbors."""
+    labels = HDBSCAN.hdbscan_labels(
+        hdbscan_traj, time_thresh=5, min_pts=100, min_cluster_size=3, dur_min=5,
+        traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y'}
+    )
+    assert (labels == -1).all(), "Expected all noise when min_pts is too high"
+
+
+def test_hdbscan_labels_no_cluster_insufficient_duration(hdbscan_traj):
+    """No cluster forms when dur_min exceeds actual stop duration."""
+    labels = HDBSCAN.hdbscan_labels(
+        hdbscan_traj, time_thresh=5, min_pts=3, min_cluster_size=3, dur_min=60,
+        traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y'}
+    )
+    assert (labels == -1).all(), "Expected all noise when dur_min is too high"
+
+
+def test_hdbscan_labels_two_stops(hdbscan_traj):
+    """hdbscan_labels finds both stops in a two-stop trajectory."""
+    labels = HDBSCAN.hdbscan_labels(
+        hdbscan_traj, time_thresh=5, min_pts=3, min_cluster_size=3, dur_min=5,
+        traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y'}
+    )
+    n_clusters = labels[labels >= 0].nunique()
+    assert n_clusters == 2, f"Expected 2 clusters, got {n_clusters}"
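A quick check of the fixture's timing explains the `dur_min` expectations above (thresholds are in minutes, as these calls imply):

# Each synthetic stop: 20 pings at 30 s spacing.
n_pings, spacing_s = 20, 30
stop_span_min = (n_pings - 1) * spacing_s / 60
print(stop_span_min)  # 9.5 -> passes dur_min=5, fails dur_min=60 (all labels stay -1)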
+
+
+def test_hdbscan_number_labels_matches_stop_table(hdbscan_traj):
+    """Number of unique non-noise labels equals number of rows in st_hdbscan output."""
+    traj_cols = {'timestamp': 'timestamp', 'x': 'x', 'y': 'y'}
+    labels = HDBSCAN.hdbscan_labels(
+        hdbscan_traj, time_thresh=5, min_pts=3, min_cluster_size=3, dur_min=5,
+        traj_cols=traj_cols
+    )
+    stops = HDBSCAN.st_hdbscan(
+        hdbscan_traj, time_thresh=5, min_pts=3, min_cluster_size=3, dur_min=5,
+        traj_cols=traj_cols
+    )
+    assert labels[labels >= 0].nunique() == len(stops)
+
+
+def test_st_hdbscan_output_is_valid_stop_df(base_df):
+    """st_hdbscan concise output conforms to the stop DataFrame standard."""
+    traj_cols = {'user_id': 'uid', 'timestamp': 'timestamp', 'x': 'x', 'y': 'y'}
+    df = loader.from_df(base_df, traj_cols=traj_cols, parse_dates=True, mixed_timezone_behavior='utc')
+    first_user = df[traj_cols['user_id']].iloc[0]
+    single = df[df[traj_cols['user_id']] == first_user].copy()
+
+    stops = HDBSCAN.st_hdbscan(
+        single, time_thresh=10, min_pts=2, min_cluster_size=2, dur_min=5,
+        traj_cols=traj_cols, complete_output=False
+    )
+    del traj_cols['user_id']
+    assert loader._is_stop_df(stops, traj_cols=traj_cols, parse_dates=False)
+
+
+def test_st_hdbscan_ground_truth(agent_traj_ground_truth):
+    """st_hdbscan detects the expected number of stops on ground-truth data."""
+    traj_cols = {'user_id': 'identifier', 'x': 'x', 'y': 'y', 'timestamp': 'unix_timestamp'}
+    labels = HDBSCAN.hdbscan_labels(
+        agent_traj_ground_truth, time_thresh=10, min_pts=2, min_cluster_size=2, dur_min=3,
+        traj_cols=traj_cols
+    )
+    n_clusters = labels[labels >= 0].nunique()
+    assert 2 <= n_clusters <= 5, f"Expected 2-5 stops on ground truth, got {n_clusters}"
+
+
+def test_st_hdbscan_multiuser_raises(base_df):
+    """st_hdbscan raises ValueError when passed multi-user data."""
+    traj_cols = {'user_id': 'uid', 'timestamp': 'timestamp', 'x': 'x', 'y': 'y'}
+    df = loader.from_df(base_df, traj_cols=traj_cols, parse_dates=True, mixed_timezone_behavior='utc')
+    with pytest.raises(ValueError, match="Multi-user"):
+        HDBSCAN.st_hdbscan(df, time_thresh=10, traj_cols=traj_cols)
+
+
+def test_st_hdbscan_per_user_basic(base_df):
+    """st_hdbscan_per_user runs on multi-user data and returns stops for multiple users."""
+    traj_cols = {'user_id': 'uid', 'timestamp': 'timestamp', 'x': 'x', 'y': 'y'}
+    df = loader.from_df(base_df, traj_cols=traj_cols, parse_dates=True, mixed_timezone_behavior='utc')
+    stops = HDBSCAN.st_hdbscan_per_user(
+        df, time_thresh=10, min_pts=2, min_cluster_size=2, dur_min=5,
+        traj_cols=traj_cols
+    )
+    assert not stops.empty
+    assert 'uid' in stops.columns
+    assert stops['uid'].nunique() > 1
+
+
+def test_st_hdbscan_delta_roam(hdbscan_traj):
+    """delta_roam (epsilon cut) path runs without error and returns a DataFrame."""
+    traj_cols = {'timestamp': 'timestamp', 'x': 'x', 'y': 'y'}
+    stops = HDBSCAN.st_hdbscan(
+        hdbscan_traj, time_thresh=5, min_pts=3, min_cluster_size=3, dur_min=5,
+        delta_roam=50, traj_cols=traj_cols
+    )
+    assert isinstance(stops, pd.DataFrame)
+
+
+##########################################
+####       LOCATION CLUSTERING        ####
+####          (SLIDING.PY)            ####
+##########################################
+
+@pytest.fixture
+def position_fixes_simple():
+    """Simple position fixes for testing sliding window algorithm."""
+    times = pd.date_range("2025-01-01 08:00", periods=10, freq="1min").tolist()
+
+    # Create position fixes:
+    # Points 0-5: stationary (should form staypoint if time_threshold <= 5 min)
+    # Points 6-9: moved away
+    coords = [(0.0, 0.0)] * 6 + [(0.002, 0.002)] * 4
+
+    df = gpd.GeoDataFrame({
+        "user_id": "user1",
+        "tracked_at": times,
+        "geometry": [Point(lon, lat) for lon, lat in coords]
+    }, crs="EPSG:4326")
+
+    return df
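The tests below target `SLIDING.generate_staypoints`; for readers new to the approach, here is a compact sketch of the classic sliding-window staypoint detector (Li et al. 2008 style) that these parameters come from. It is illustrative only: `include_last`, `gap_threshold`, and geographic (haversine) distances are not modeled.

# Illustrative sliding-window staypoint detection, not SLIDING's code.
from math import hypot

def staypoints_sketch(points, dist_threshold, time_threshold):
    """points: list of (x_m, y_m, t_min) tuples in a projected CRS."""
    out, i, n = [], 0, len(points)
    while i < n:
        j = i + 1
        # Grow the window while points stay within dist_threshold of the anchor.
        while j < n and hypot(points[j][0] - points[i][0],
                              points[j][1] - points[i][1]) <= dist_threshold:
            j += 1
        # Keep the window as a staypoint if it lasted long enough.
        if points[j - 1][2] - points[i][2] >= time_threshold:
            xs = [p[0] for p in points[i:j]]
            ys = [p[1] for p in points[i:j]]
            out.append((sum(xs) / len(xs), sum(ys) / len(ys),
                        points[i][2], points[j - 1][2]))
        i = j  # restart at the first point that broke the radius
    return out

# Mirrors position_fixes_simple: six minutes at one spot, four at another ~314 m away.
pts = [(0.0, 0.0, t) for t in range(6)] + [(222.0, 222.0, t) for t in range(6, 10)]
print(staypoints_sketch(pts, dist_threshold=100, time_threshold=3.0))
# [(0.0, 0.0, 0, 5), (222.0, 222.0, 6, 9)] -- both windows last >= 3 min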
pd.date_range("2025-01-01 08:00", periods=5, freq="1min").tolist() + # Add a large gap + times += [times[-1] + pd.Timedelta(minutes=20)] + times += pd.date_range(times[-1] + pd.Timedelta(minutes=1), periods=4, freq="1min").tolist() + + # All points at same location + coords = [(0.0, 0.0)] * len(times) + + df = gpd.GeoDataFrame({ + "user_id": "user1", + "tracked_at": times, + "geometry": [Point(lon, lat) for lon, lat in coords] + }, crs="EPSG:4326") + + return df + + +@pytest.fixture +def position_fixes_multi_user(): + """Position fixes for multiple users.""" + times = pd.date_range("2025-01-01 08:00", periods=8, freq="1min").tolist() + + user1_coords = [(0.0, 0.0)] * 4 + [(0.002, 0.002)] * 4 + user2_coords = [(0.001, 0.001)] * 4 + [(0.003, 0.003)] * 4 + + df = gpd.GeoDataFrame({ + "user_id": ["user1"] * 8 + ["user2"] * 8, + "tracked_at": times * 2, + "geometry": [Point(lon, lat) for lon, lat in user1_coords + user2_coords] + }, crs="EPSG:4326") + + return df + + +def test_sliding_basic_staypoint_detection(position_fixes_simple): + pfs, sp = SLIDING.generate_staypoints( + position_fixes_simple, + method="sliding", + dist_threshold=100, + time_threshold=3.0, + gap_threshold=15.0, + include_last=False, + exclude_duplicate_pfs=True, + n_jobs=1 + ) + + # Should detect at least one staypoint from first 6 points + assert len(sp) >= 1 + assert 'staypoint_id' in pfs.columns + assert isinstance(sp, gpd.GeoDataFrame) + + # Check required columns in staypoints + assert 'user_id' in sp.columns + assert 'started_at' in sp.columns + assert 'finished_at' in sp.columns + assert sp.geometry.name in sp.columns + + +def test_sliding_time_threshold(position_fixes_simple): + """Test that time_threshold is respected.""" + # With time_threshold=10, the first 6 points (5 min duration) should not form a staypoint + pfs, sp = SLIDING.generate_staypoints( + position_fixes_simple, + dist_threshold=100, + time_threshold=10.0, + gap_threshold=15.0, + include_last=False, + n_jobs=1 + ) + + # Should not detect any staypoints since max duration < 10 min + assert len(sp) == 0 + + +def test_sliding_distance_threshold(position_fixes_simple): + """Test that distance_threshold is respected.""" + # With very small distance threshold, points should not cluster together + # The test data has 6 points at (0,0) and 4 points at (0.002, 0.002) + # With dist_threshold=1m, these should form separate staypoints + pfs, sp = SLIDING.generate_staypoints( + position_fixes_simple, + dist_threshold=1, # 1 meter - very tight + time_threshold=3.0, + gap_threshold=15.0, + include_last=False, + n_jobs=1 + ) + + # With tight distance constraint, should detect 2 separate staypoints + # (one at each distinct location) + assert len(sp) == 2 + + +def test_sliding_gap_threshold(position_fixes_with_gap): + """Test that gap_threshold prevents clustering across large temporal gaps.""" + # With gap_threshold=15, the 20-minute gap should prevent clustering + pfs, sp = SLIDING.generate_staypoints( + position_fixes_with_gap, + dist_threshold=100, + time_threshold=3.0, + gap_threshold=15.0, + include_last=False, + n_jobs=1 + ) + + # Should detect at most 2 separate staypoints (before and after gap) + assert len(sp) <= 2 + + +def test_sliding_include_last(position_fixes_simple): + """Test include_last parameter.""" + # Without include_last + pfs1, sp1 = SLIDING.generate_staypoints( + position_fixes_simple, + dist_threshold=100, + time_threshold=3.0, + include_last=False, + n_jobs=1 + ) + + # With include_last + pfs2, sp2 = SLIDING.generate_staypoints( + 
+
+
+def test_sliding_include_last(position_fixes_simple):
+    """Test include_last parameter."""
+    # Without include_last
+    pfs1, sp1 = SLIDING.generate_staypoints(
+        position_fixes_simple,
+        dist_threshold=100,
+        time_threshold=3.0,
+        include_last=False,
+        n_jobs=1
+    )
+
+    # With include_last
+    pfs2, sp2 = SLIDING.generate_staypoints(
+        position_fixes_simple,
+        dist_threshold=100,
+        time_threshold=3.0,
+        include_last=True,
+        n_jobs=1
+    )
+
+    # include_last=True should potentially detect more staypoints
+    assert len(sp2) >= len(sp1)
+
+
+def test_sliding_multi_user(position_fixes_multi_user):
+    """Test sliding window with multiple users."""
+    pfs, sp = SLIDING.generate_staypoints(
+        position_fixes_multi_user,
+        dist_threshold=100,
+        time_threshold=2.0,
+        gap_threshold=15.0,
+        n_jobs=1
+    )
+
+    # Should detect staypoints for both users
+    assert 'user_id' in sp.columns
+    unique_users = sp['user_id'].unique()
+    assert len(unique_users) >= 1  # At least one user should have staypoints
+
+def test_sliding_empty_dataframe():
+    """Test sliding window with empty dataframe."""
+    empty_pfs = gpd.GeoDataFrame({
+        "user_id": [],
+        "tracked_at": [],
+        "geometry": []
+    }, crs="EPSG:4326")
+
+    with pytest.warns(UserWarning, match="No staypoints can be generated"):
+        pfs, sp = SLIDING.generate_staypoints(
+            empty_pfs,
+            dist_threshold=100,
+            time_threshold=5.0,
+            n_jobs=1
+        )
+
+    assert len(sp) == 0
+    assert 'staypoint_id' in pfs.columns
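Finally, the empty-input test fixes a small contract worth making explicit. A hedged sketch of the expected behavior, inferred from the assertions rather than taken from `SLIDING`'s actual code:

# Assumed contract: empty input warns, returns the pfs with a staypoint_id
# column attached plus an empty staypoints table.
import warnings

import geopandas as gpd
import pandas as pd

def generate_staypoints_sketch(pfs):
    if pfs.empty:
        warnings.warn("No staypoints can be generated, returning empty")  # UserWarning by default
        pfs = pfs.assign(staypoint_id=pd.Series(dtype="Int64"))
        sp = gpd.GeoDataFrame(geometry=[], crs=pfs.crs)
        return pfs, sp
    raise NotImplementedError("non-empty path omitted from this sketch")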